commit: 1b28da13cd7150f66fae58043d3de661105a513a
Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
AuthorDate: Sat Sep 27 13:37:37 2014 +0000
Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
CommitDate: Sat Sep 27 13:37:37 2014 +0000
URL: http://sources.gentoo.org/gitweb/?p=proj/linux-patches.git;a=commit;h=1b28da13

Move mptcp patch to experimental

---
 0000_README | 9 +-
 5010_multipath-tcp-v3.16-872d7f6c6f4e.patch | 19230 ++++++++++++++++++++++++++
 2 files changed, 19235 insertions(+), 4 deletions(-)

diff --git a/0000_README b/0000_README
index d92e6b7..3cc9441 100644
--- a/0000_README
+++ b/0000_README
@@ -58,10 +58,6 @@ Patch: 2400_kcopy-patch-for-infiniband-driver.patch
 From: Alexey Shvetsov <alexxy@g.o>
 Desc: Zero copy for infiniband psm userspace driver
 
-Patch: 2500_multipath-tcp-v3.16-872d7f6c6f4e.patch
-From: http://multipath-tcp.org/
-Desc: Patch for simultaneous use of several IP-addresses/interfaces in TCP for better resource utilization, better throughput and smoother reaction to failures.
-
 Patch: 2700_ThinkPad-30-brightness-control-fix.patch
 From: Seth Forshee <seth.forshee@×××××××××.com>
 Desc: ACPI: Disable Windows 8 compatibility for some Lenovo ThinkPads
@@ -101,3 +97,8 @@ Desc: BFQ v7r5 patch 2 for 3.16: BFQ Scheduler
 Patch: 5003_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r5-for-3.16.0.patch
 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
 Desc: BFQ v7r5 patch 3 for 3.16: Early Queue Merge (EQM)
+
+Patch: 5010_multipath-tcp-v3.16-872d7f6c6f4e.patch
+From: http://multipath-tcp.org/
+Desc: Patch for simultaneous use of several IP-addresses/interfaces in TCP for better resource utilization, better throughput and smoother reaction to failures.
+

diff --git a/5010_multipath-tcp-v3.16-872d7f6c6f4e.patch b/5010_multipath-tcp-v3.16-872d7f6c6f4e.patch
new file mode 100644
index 0000000..3000da3
--- /dev/null
+++ b/5010_multipath-tcp-v3.16-872d7f6c6f4e.patch
@@ -0,0 +1,19230 @@
+diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
+index 768a0fb67dd6..5a46d91a8df9 100644
+--- a/drivers/infiniband/hw/cxgb4/cm.c
++++ b/drivers/infiniband/hw/cxgb4/cm.c
+@@ -3432,7 +3432,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
+ */
+ memset(&tmp_opt, 0, sizeof(tmp_opt));
+ tcp_clear_options(&tmp_opt);
+- tcp_parse_options(skb, &tmp_opt, 0, NULL);
++ tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL);
+
+ req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req));
+ memset(req, 0, sizeof(*req));
+diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
+index 2faef339d8f2..d86c853ffaad 100644
+--- a/include/linux/ipv6.h
++++ b/include/linux/ipv6.h
+@@ -256,16 +256,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
+ return inet_sk(__sk)->pinet6;
+ }
+
+-static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops)
+-{
+- struct request_sock *req = reqsk_alloc(ops);
+-
+- if (req)
+- inet_rsk(req)->pktopts = NULL;
+-
+- return req;
+-}
+-
+ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
+ {
+ return (struct raw6_sock *)sk;
+@@ -309,12 +299,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
+ return NULL;
+ }
+
+-static inline struct inet6_request_sock *
+- inet6_rsk(const struct request_sock *rsk)
+-{
+- return NULL;
+-}
+-
+ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
+ {
+ return NULL;
+diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
+index ec89301ada41..99ea4b0e3693 100644
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -2784,8 +2784,10 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
+ bool zero_okay,
+ __sum16 check)
+ {
+- if (skb_csum_unnecessary(skb) || (zero_okay && !check)) {
+- skb->csum_valid = 1;
++ if (skb_csum_unnecessary(skb)) {
++ return false;
++ } else if (zero_okay && !check) {
++ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ return false;
+ }
+
+diff --git a/include/linux/tcp.h b/include/linux/tcp.h
+index a0513210798f..7bc2e078d6ca 100644
+--- a/include/linux/tcp.h
++++ b/include/linux/tcp.h
+@@ -53,7 +53,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
+ /* TCP Fast Open */
+ #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */
+ #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */
+-#define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */
++#define TCP_FASTOPEN_COOKIE_SIZE 4 /* the size employed by this impl. */
+
+ /* TCP Fast Open Cookie as stored in memory */
+ struct tcp_fastopen_cookie {
+@@ -72,6 +72,51 @@ struct tcp_sack_block {
+ u32 end_seq;
+ };
+
++struct tcp_out_options {
++ u16 options; /* bit field of OPTION_* */
++ u8 ws; /* window scale, 0 to disable */
++ u8 num_sack_blocks;/* number of SACK blocks to include */
++ u8 hash_size; /* bytes in hash_location */
++ u16 mss; /* 0 to disable */
++ __u8 *hash_location; /* temporary pointer, overloaded */
++ __u32 tsval, tsecr; /* need to include OPTION_TS */
++ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
++#ifdef CONFIG_MPTCP
++ u16 mptcp_options; /* bit field of MPTCP related OPTION_* */
++ u8 dss_csum:1,
++ add_addr_v4:1,
++ add_addr_v6:1; /* dss-checksum required? */
++
++ union {
++ struct {
++ __u64 sender_key; /* sender's key for mptcp */
++ __u64 receiver_key; /* receiver's key for mptcp */
++ } mp_capable;
++
++ struct {
++ __u64 sender_truncated_mac;
++ __u32 sender_nonce;
++ /* random number of the sender */
++ __u32 token; /* token for mptcp */
++ u8 low_prio:1;
++ } mp_join_syns;
++ };
++
++ struct {
++ struct in_addr addr;
++ u8 addr_id;
++ } add_addr4;
++
++ struct {
++ struct in6_addr addr;
++ u8 addr_id;
++ } add_addr6;
++
++ u16 remove_addrs; /* list of address id */
++ u8 addr_id; /* address id (mp_join or add_address) */
++#endif /* CONFIG_MPTCP */
++};
++
+ /*These are used to set the sack_ok field in struct tcp_options_received */
+ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */
+ #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/
+@@ -95,6 +140,9 @@ struct tcp_options_received {
+ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
+ };
+
++struct mptcp_cb;
++struct mptcp_tcp_sock;
++
+ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
+ {
+ rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
+@@ -111,10 +159,7 @@ struct tcp_request_sock_ops;
+
+ struct tcp_request_sock {
+ struct inet_request_sock req;
+-#ifdef CONFIG_TCP_MD5SIG
+- /* Only used by TCP MD5 Signature so far. */
+ const struct tcp_request_sock_ops *af_specific;
+-#endif
+ struct sock *listener; /* needed for TFO */
+ u32 rcv_isn;
+ u32 snt_isn;
+@@ -130,6 +175,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
+ return (struct tcp_request_sock *)req;
+ }
+
++struct tcp_md5sig_key;
++
+ struct tcp_sock {
+ /* inet_connection_sock has to be the first member of tcp_sock */
+ struct inet_connection_sock inet_conn;
+@@ -326,6 +373,37 @@ struct tcp_sock {
+ * socket. Used to retransmit SYNACKs etc.
+ */
+ struct request_sock *fastopen_rsk;
++
++ /* MPTCP/TCP-specific callbacks */
++ const struct tcp_sock_ops *ops;
++
++ struct mptcp_cb *mpcb;
++ struct sock *meta_sk;
++ /* We keep these flags even if CONFIG_MPTCP is not checked, because
++ * it allows checking MPTCP capability just by checking the mpc flag,
++ * rather than adding ifdefs everywhere.
++ */
++ u16 mpc:1, /* Other end is multipath capable */
++ inside_tk_table:1, /* Is the tcp_sock inside the token-table? */
++ send_mp_fclose:1,
++ request_mptcp:1, /* Did we send out an MP_CAPABLE?
++ * (this speeds up mptcp_doit() in tcp_recvmsg)
++ */
++ mptcp_enabled:1, /* Is MPTCP enabled from the application ? */
++ pf:1, /* Potentially Failed state: when this flag is set, we
++ * stop using the subflow
++ */
++ mp_killed:1, /* Killed with a tcp_done in mptcp? */
++ was_meta_sk:1, /* This was a meta sk (in case of reuse) */
++ is_master_sk,
++ close_it:1, /* Must close socket in mptcp_data_ready? */
++ closing:1;
++ struct mptcp_tcp_sock *mptcp;
++#ifdef CONFIG_MPTCP
++ struct hlist_nulls_node tk_table;
++ u32 mptcp_loc_token;
++ u64 mptcp_loc_key;
++#endif /* CONFIG_MPTCP */
+ };
+
+ enum tsq_flags {
+@@ -337,6 +415,8 @@ enum tsq_flags {
+ TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
+ * tcp_v{4|6}_mtu_reduced()
+ */
++ MPTCP_PATH_MANAGER, /* MPTCP deferred creation of new subflows */
++ MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */
+ };
+
+ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
+@@ -355,6 +435,7 @@ struct tcp_timewait_sock {
+ #ifdef CONFIG_TCP_MD5SIG
+ struct tcp_md5sig_key *tw_md5_key;
+ #endif
++ struct mptcp_tw *mptcp_tw;
+ };
+
+ static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
+diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
+index 74af137304be..83f63033897a 100644
+--- a/include/net/inet6_connection_sock.h
++++ b/include/net/inet6_connection_sock.h
+@@ -27,6 +27,8 @@ int inet6_csk_bind_conflict(const struct sock *sk,
+
+ struct dst_entry *inet6_csk_route_req(struct sock *sk, struct flowi6 *fl6,
+ const struct request_sock *req);
++u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
++ const u32 rnd, const u32 synq_hsize);
+
+ struct request_sock *inet6_csk_search_req(const struct sock *sk,
+ struct request_sock ***prevp,
+diff --git a/include/net/inet_common.h b/include/net/inet_common.h
+index fe7994c48b75..780f229f46a8 100644
+--- a/include/net/inet_common.h
++++ b/include/net/inet_common.h
+@@ -1,6 +1,8 @@
+ #ifndef _INET_COMMON_H
+ #define _INET_COMMON_H
+
++#include <net/sock.h>
++
+ extern const struct proto_ops inet_stream_ops;
+ extern const struct proto_ops inet_dgram_ops;
+
+@@ -13,6 +15,8 @@ struct sock;
+ struct sockaddr;
+ struct socket;
+
++int inet_create(struct net *net, struct socket *sock, int protocol, int kern);
++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern);
+ int inet_release(struct socket *sock);
+ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags);
+diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
+index 7a4313887568..f62159e39839 100644
+--- a/include/net/inet_connection_sock.h
++++ b/include/net/inet_connection_sock.h
+@@ -30,6 +30,7 @@
+
+ struct inet_bind_bucket;
+ struct tcp_congestion_ops;
++struct tcp_options_received;
+
+ /*
+ * Pointers to address related TCP functions
+@@ -243,6 +244,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
+
+ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
+
++u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
++ const u32 synq_hsize);
++
+ struct request_sock *inet_csk_search_req(const struct sock *sk,
+ struct request_sock ***prevp,
+ const __be16 rport,
+diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
+index b1edf17bec01..6a32d8d6b85e 100644
+--- a/include/net/inet_sock.h
++++ b/include/net/inet_sock.h
+@@ -86,10 +86,14 @@ struct inet_request_sock {
+ wscale_ok : 1,
+ ecn_ok : 1,
+ acked : 1,
+- no_srccheck: 1;
++ no_srccheck: 1,
++ mptcp_rqsk : 1,
++ saw_mpc : 1;
+ kmemcheck_bitfield_end(flags);
+- struct ip_options_rcu *opt;
+- struct sk_buff *pktopts;
++ union {
++ struct ip_options_rcu *opt;
++ struct sk_buff *pktopts;
++ };
+ u32 ir_mark;
+ };
+
+diff --git a/include/net/mptcp.h b/include/net/mptcp.h
+new file mode 100644
+index 000000000000..712780fc39e4
+--- /dev/null
++++ b/include/net/mptcp.h
+@@ -0,0 +1,1439 @@
++/*
++ * MPTCP implementation
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@×××××××××.be>
++ *
++ * Current Maintainer & Author:
++ * Christoph Paasch <christoph.paasch@×××××××××.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
++ * Gregory Detal <gregory.detal@×××××××××.be>
++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
++ * Lavkesh Lahngir <lavkesh51@×××××.com>
++ * Andreas Ripke <ripke@××××××.eu>
++ * Vlad Dogaru <vlad.dogaru@×××××.com>
++ * Octavian Purdila <octavian.purdila@×××××.com>
++ * John Ronan <jronan@××××.org>
++ * Catalin Nicutar <catalin.nicutar@×××××.com>
++ * Brandon Heller <brandonh@××××××××.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#ifndef _MPTCP_H
++#define _MPTCP_H
++
++#include <linux/inetdevice.h>
++#include <linux/ipv6.h>
++#include <linux/list.h>
++#include <linux/net.h>
++#include <linux/netpoll.h>
++#include <linux/skbuff.h>
++#include <linux/socket.h>
++#include <linux/tcp.h>
++#include <linux/kernel.h>
++
++#include <asm/byteorder.h>
++#include <asm/unaligned.h>
++#include <crypto/hash.h>
++#include <net/tcp.h>
++
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ #define ntohll(x) be64_to_cpu(x)
++ #define htonll(x) cpu_to_be64(x)
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ #define ntohll(x) (x)
++ #define htonll(x) (x)
++#endif
++
++struct mptcp_loc4 {
++ u8 loc4_id;
++ u8 low_prio:1;
++ struct in_addr addr;
++};
++
++struct mptcp_rem4 {
++ u8 rem4_id;
++ __be16 port;
++ struct in_addr addr;
++};
++
++struct mptcp_loc6 {
++ u8 loc6_id;
++ u8 low_prio:1;
++ struct in6_addr addr;
++};
++
++struct mptcp_rem6 {
++ u8 rem6_id;
++ __be16 port;
++ struct in6_addr addr;
++};
++
++struct mptcp_request_sock {
++ struct tcp_request_sock req;
++ /* hlist-nulls entry to the hash-table. Depending on whether this is a
++ * a new MPTCP connection or an additional subflow, the request-socket
++ * is either in the mptcp_reqsk_tk_htb or mptcp_reqsk_htb.
++ */
++ struct hlist_nulls_node hash_entry;
++
++ union {
++ struct {
++ /* Only on initial subflows */
++ u64 mptcp_loc_key;
++ u64 mptcp_rem_key;
++ u32 mptcp_loc_token;
++ };
++
++ struct {
++ /* Only on additional subflows */
++ struct mptcp_cb *mptcp_mpcb;
++ u32 mptcp_rem_nonce;
++ u32 mptcp_loc_nonce;
++ u64 mptcp_hash_tmac;
++ };
++ };
++
++ u8 loc_id;
++ u8 rem_id; /* Address-id in the MP_JOIN */
++ u8 dss_csum:1,
++ is_sub:1, /* Is this a new subflow? */
++ low_prio:1, /* Interface set to low-prio? */
++ rcv_low_prio:1;
++};
++
++struct mptcp_options_received {
++ u16 saw_mpc:1,
++ dss_csum:1,
++ drop_me:1,
++
++ is_mp_join:1,
++ join_ack:1,
++
++ saw_low_prio:2, /* 0x1 - low-prio set for this subflow
++ * 0x2 - low-prio set for another subflow
++ */
++ low_prio:1,
++
++ saw_add_addr:2, /* Saw at least one add_addr option:
++ * 0x1: IPv4 - 0x2: IPv6
++ */
++ more_add_addr:1, /* Saw one more add-addr. */
++
++ saw_rem_addr:1, /* Saw at least one rem_addr option */
++ more_rem_addr:1, /* Saw one more rem-addr. */
++
++ mp_fail:1,
++ mp_fclose:1;
++ u8 rem_id; /* Address-id in the MP_JOIN */
++ u8 prio_addr_id; /* Address-id in the MP_PRIO */
++
++ const unsigned char *add_addr_ptr; /* Pointer to add-address option */
++ const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */
++
++ u32 data_ack;
++ u32 data_seq;
++ u16 data_len;
++
++ u32 mptcp_rem_token;/* Remote token */
++
++ /* Key inside the option (from mp_capable or fast_close) */
++ u64 mptcp_key;
++
++ u32 mptcp_recv_nonce;
++ u64 mptcp_recv_tmac;
++ u8 mptcp_recv_mac[20];
++};
++
++struct mptcp_tcp_sock {
++ struct tcp_sock *next; /* Next subflow socket */
++ struct hlist_node cb_list;
++ struct mptcp_options_received rx_opt;
++
++ /* Those three fields record the current mapping */
++ u64 map_data_seq;
++ u32 map_subseq;
++ u16 map_data_len;
++ u16 slave_sk:1,
++ fully_established:1,
++ establish_increased:1,
++ second_packet:1,
++ attached:1,
++ send_mp_fail:1,
++ include_mpc:1,
++ mapping_present:1,
++ map_data_fin:1,
++ low_prio:1, /* use this socket as backup */
++ rcv_low_prio:1, /* Peer sent low-prio option to us */
++ send_mp_prio:1, /* Trigger to send mp_prio on this socket */
++ pre_established:1; /* State between sending 3rd ACK and
++ * receiving the fourth ack of new subflows.
++ */
++
++ /* isn: needed to translate abs to relative subflow seqnums */
++ u32 snt_isn;
++ u32 rcv_isn;
++ u8 path_index;
++ u8 loc_id;
++ u8 rem_id;
++
++#define MPTCP_SCHED_SIZE 4
++ u8 mptcp_sched[MPTCP_SCHED_SIZE] __aligned(8);
++
++ struct sk_buff *shortcut_ofoqueue; /* Shortcut to the current modified
++ * skb in the ofo-queue.
++ */
++
++ int init_rcv_wnd;
++ u32 infinite_cutoff_seq;
++ struct delayed_work work;
++ u32 mptcp_loc_nonce;
++ struct tcp_sock *tp; /* Where is my daddy? */
++ u32 last_end_data_seq;
++
++ /* MP_JOIN subflow: timer for retransmitting the 3rd ack */
++ struct timer_list mptcp_ack_timer;
++
++ /* HMAC of the third ack */
++ char sender_mac[20];
++};
++
++struct mptcp_tw {
++ struct list_head list;
++ u64 loc_key;
++ u64 rcv_nxt;
++ struct mptcp_cb __rcu *mpcb;
++ u8 meta_tw:1,
++ in_list:1;
++};
++
++#define MPTCP_PM_NAME_MAX 16
++struct mptcp_pm_ops {
++ struct list_head list;
++
++ /* Signal the creation of a new MPTCP-session. */
++ void (*new_session)(const struct sock *meta_sk);
++ void (*release_sock)(struct sock *meta_sk);
++ void (*fully_established)(struct sock *meta_sk);
++ void (*new_remote_address)(struct sock *meta_sk);
++ int (*get_local_id)(sa_family_t family, union inet_addr *addr,
++ struct net *net, bool *low_prio);
++ void (*addr_signal)(struct sock *sk, unsigned *size,
++ struct tcp_out_options *opts, struct sk_buff *skb);
++ void (*add_raddr)(struct mptcp_cb *mpcb, const union inet_addr *addr,
++ sa_family_t family, __be16 port, u8 id);
++ void (*rem_raddr)(struct mptcp_cb *mpcb, u8 rem_id);
++ void (*init_subsocket_v4)(struct sock *sk, struct in_addr addr);
++ void (*init_subsocket_v6)(struct sock *sk, struct in6_addr addr);
++
++ char name[MPTCP_PM_NAME_MAX];
++ struct module *owner;
++};
++
++#define MPTCP_SCHED_NAME_MAX 16
++struct mptcp_sched_ops {
++ struct list_head list;
++
++ struct sock * (*get_subflow)(struct sock *meta_sk,
++ struct sk_buff *skb,
++ bool zero_wnd_test);
++ struct sk_buff * (*next_segment)(struct sock *meta_sk,
++ int *reinject,
++ struct sock **subsk,
++ unsigned int *limit);
++ void (*init)(struct sock *sk);
++
++ char name[MPTCP_SCHED_NAME_MAX];
++ struct module *owner;
++};
++
++struct mptcp_cb {
++ /* list of sockets in this multipath connection */
++ struct tcp_sock *connection_list;
++ /* list of sockets that need a call to release_cb */
++ struct hlist_head callback_list;
++
++ /* High-order bits of 64-bit sequence numbers */
++ u32 snd_high_order[2];
++ u32 rcv_high_order[2];
++
++ u16 send_infinite_mapping:1,
++ in_time_wait:1,
++ list_rcvd:1, /* XXX TO REMOVE */
++ addr_signal:1, /* Path-manager wants us to call addr_signal */
++ dss_csum:1,
++ server_side:1,
++ infinite_mapping_rcv:1,
++ infinite_mapping_snd:1,
++ dfin_combined:1, /* Was the DFIN combined with subflow-fin? */
++ passive_close:1,
++ snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */
++ rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */
++
++ /* socket count in this connection */
++ u8 cnt_subflows;
++ u8 cnt_established;
++
++ struct mptcp_sched_ops *sched_ops;
++
++ struct sk_buff_head reinject_queue;
++ /* First cache-line boundary is here minus 8 bytes. But from the
++ * reinject-queue only the next and prev pointers are regularly
++ * accessed. Thus, the whole data-path is on a single cache-line.
++ */
++
++ u64 csum_cutoff_seq;
++
++ /***** Start of fields, used for connection closure */
++ spinlock_t tw_lock;
++ unsigned char mptw_state;
++ u8 dfin_path_index;
++
++ struct list_head tw_list;
++
++ /***** Start of fields, used for subflow establishment and closure */
++ atomic_t mpcb_refcnt;
++
++ /* Mutex needed, because otherwise mptcp_close will complain that the
++ * socket is owned by the user.
++ * E.g., mptcp_sub_close_wq is taking the meta-lock.
++ */
++ struct mutex mpcb_mutex;
++
++ /***** Start of fields, used for subflow establishment */
++ struct sock *meta_sk;
++
++ /* Master socket, also part of the connection_list, this
++ * socket is the one that the application sees.
++ */
++ struct sock *master_sk;
++
++ __u64 mptcp_loc_key;
++ __u64 mptcp_rem_key;
++ __u32 mptcp_loc_token;
++ __u32 mptcp_rem_token;
++
++#define MPTCP_PM_SIZE 608
++ u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8);
++ struct mptcp_pm_ops *pm_ops;
++
++ u32 path_index_bits;
++ /* Next pi to pick up in case a new path becomes available */
++ u8 next_path_index;
++
++ /* Original snd/rcvbuf of the initial subflow.
++ * Used for the new subflows on the server-side to allow correct
++ * autotuning
++ */
++ int orig_sk_rcvbuf;
++ int orig_sk_sndbuf;
++ u32 orig_window_clamp;
++
++ /* Timer for retransmitting SYN/ACK+MP_JOIN */
++ struct timer_list synack_timer;
++};
++
++#define MPTCP_SUB_CAPABLE 0
++#define MPTCP_SUB_LEN_CAPABLE_SYN 12
++#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12
++#define MPTCP_SUB_LEN_CAPABLE_ACK 20
++#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20
++
++#define MPTCP_SUB_JOIN 1
++#define MPTCP_SUB_LEN_JOIN_SYN 12
++#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12
++#define MPTCP_SUB_LEN_JOIN_SYNACK 16
++#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16
++#define MPTCP_SUB_LEN_JOIN_ACK 24
++#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24
++
++#define MPTCP_SUB_DSS 2
++#define MPTCP_SUB_LEN_DSS 4
++#define MPTCP_SUB_LEN_DSS_ALIGN 4
++
++/* Lengths for seq and ack are the ones without the generic MPTCP-option header,
++ * as they are part of the DSS-option.
++ * To get the total length, just add the different options together.
++ */
++#define MPTCP_SUB_LEN_SEQ 10
++#define MPTCP_SUB_LEN_SEQ_CSUM 12
++#define MPTCP_SUB_LEN_SEQ_ALIGN 12
++
++#define MPTCP_SUB_LEN_SEQ_64 14
++#define MPTCP_SUB_LEN_SEQ_CSUM_64 16
++#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16
++
++#define MPTCP_SUB_LEN_ACK 4
++#define MPTCP_SUB_LEN_ACK_ALIGN 4
++
++#define MPTCP_SUB_LEN_ACK_64 8
++#define MPTCP_SUB_LEN_ACK_64_ALIGN 8
++
++/* This is the "default" option-length we will send out most often.
++ * MPTCP DSS-header
++ * 32-bit data sequence number
++ * 32-bit data ack
++ *
++ * It is necessary to calculate the effective MSS we will be using when
++ * sending data.
++ */
++#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \
++ MPTCP_SUB_LEN_SEQ_ALIGN + \
++ MPTCP_SUB_LEN_ACK_ALIGN)
++
++#define MPTCP_SUB_ADD_ADDR 3
++#define MPTCP_SUB_LEN_ADD_ADDR4 8
++#define MPTCP_SUB_LEN_ADD_ADDR6 20
++#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8
++#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20
++
++#define MPTCP_SUB_REMOVE_ADDR 4
++#define MPTCP_SUB_LEN_REMOVE_ADDR 4
++
++#define MPTCP_SUB_PRIO 5
++#define MPTCP_SUB_LEN_PRIO 3
++#define MPTCP_SUB_LEN_PRIO_ADDR 4
++#define MPTCP_SUB_LEN_PRIO_ALIGN 4
++
++#define MPTCP_SUB_FAIL 6
++#define MPTCP_SUB_LEN_FAIL 12
++#define MPTCP_SUB_LEN_FAIL_ALIGN 12
++
++#define MPTCP_SUB_FCLOSE 7
++#define MPTCP_SUB_LEN_FCLOSE 12
++#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12
++
++
++#define OPTION_MPTCP (1 << 5)
++
++#ifdef CONFIG_MPTCP
++
++/* Used for checking if the mptcp initialization has been successful */
++extern bool mptcp_init_failed;
++
++/* MPTCP options */
++#define OPTION_TYPE_SYN (1 << 0)
++#define OPTION_TYPE_SYNACK (1 << 1)
++#define OPTION_TYPE_ACK (1 << 2)
++#define OPTION_MP_CAPABLE (1 << 3)
++#define OPTION_DATA_ACK (1 << 4)
++#define OPTION_ADD_ADDR (1 << 5)
++#define OPTION_MP_JOIN (1 << 6)
++#define OPTION_MP_FAIL (1 << 7)
++#define OPTION_MP_FCLOSE (1 << 8)
++#define OPTION_REMOVE_ADDR (1 << 9)
++#define OPTION_MP_PRIO (1 << 10)
++
++/* MPTCP flags: both TX and RX */
++#define MPTCPHDR_SEQ 0x01 /* DSS.M option is present */
++#define MPTCPHDR_FIN 0x02 /* DSS.F option is present */
++#define MPTCPHDR_SEQ64_INDEX 0x04 /* index of seq in mpcb->snd_high_order */
++/* MPTCP flags: RX only */
++#define MPTCPHDR_ACK 0x08
++#define MPTCPHDR_SEQ64_SET 0x10 /* Did we received a 64-bit seq number? */
++#define MPTCPHDR_SEQ64_OFO 0x20 /* Is it not in our circular array? */
++#define MPTCPHDR_DSS_CSUM 0x40
++#define MPTCPHDR_JOIN 0x80
++/* MPTCP flags: TX only */
++#define MPTCPHDR_INF 0x08
++
++struct mptcp_option {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u8 ver:4,
++ sub:4;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u8 sub:4,
++ ver:4;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++};
++
++struct mp_capable {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u8 ver:4,
++ sub:4;
++ __u8 h:1,
++ rsv:5,
++ b:1,
++ a:1;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u8 sub:4,
++ ver:4;
++ __u8 a:1,
++ b:1,
++ rsv:5,
++ h:1;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ __u64 sender_key;
++ __u64 receiver_key;
++} __attribute__((__packed__));
++
++struct mp_join {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u8 b:1,
++ rsv:3,
++ sub:4;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u8 sub:4,
++ rsv:3,
++ b:1;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ __u8 addr_id;
++ union {
++ struct {
++ u32 token;
++ u32 nonce;
++ } syn;
++ struct {
++ __u64 mac;
++ u32 nonce;
++ } synack;
++ struct {
++ __u8 mac[20];
++ } ack;
++ } u;
++} __attribute__((__packed__));
++
++struct mp_dss {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u16 rsv1:4,
++ sub:4,
++ A:1,
++ a:1,
++ M:1,
++ m:1,
++ F:1,
++ rsv2:3;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u16 sub:4,
++ rsv1:4,
++ rsv2:3,
++ F:1,
++ m:1,
++ M:1,
++ a:1,
++ A:1;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++};
++
++struct mp_add_addr {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u8 ipver:4,
++ sub:4;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u8 sub:4,
++ ipver:4;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ __u8 addr_id;
++ union {
++ struct {
++ struct in_addr addr;
++ __be16 port;
++ } v4;
++ struct {
++ struct in6_addr addr;
++ __be16 port;
++ } v6;
++ } u;
++} __attribute__((__packed__));
++
++struct mp_remove_addr {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u8 rsv:4,
++ sub:4;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u8 sub:4,
++ rsv:4;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ /* list of addr_id */
++ __u8 addrs_id;
++};
++
++struct mp_fail {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u16 rsv1:4,
++ sub:4,
++ rsv2:8;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u16 sub:4,
++ rsv1:4,
++ rsv2:8;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ __be64 data_seq;
++} __attribute__((__packed__));
++
++struct mp_fclose {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u16 rsv1:4,
++ sub:4,
++ rsv2:8;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u16 sub:4,
++ rsv1:4,
++ rsv2:8;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ __u64 key;
++} __attribute__((__packed__));
++
++struct mp_prio {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u8 b:1,
++ rsv:3,
++ sub:4;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u8 sub:4,
++ rsv:3,
++ b:1;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ __u8 addr_id;
++} __attribute__((__packed__));
++
++static inline int mptcp_sub_len_dss(const struct mp_dss *m, const int csum)
++{
++ return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2);
++}
++
++#define MPTCP_APP 2
++
++extern int sysctl_mptcp_enabled;
++extern int sysctl_mptcp_checksum;
++extern int sysctl_mptcp_debug;
++extern int sysctl_mptcp_syn_retries;
++
++extern struct workqueue_struct *mptcp_wq;
++
++#define mptcp_debug(fmt, args...) \
++ do { \
++ if (unlikely(sysctl_mptcp_debug)) \
++ pr_err(__FILE__ ": " fmt, ##args); \
++ } while (0)
++
++/* Iterates over all subflows */
++#define mptcp_for_each_tp(mpcb, tp) \
++ for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next)
++
++#define mptcp_for_each_sk(mpcb, sk) \
++ for ((sk) = (struct sock *)(mpcb)->connection_list; \
++ sk; \
++ sk = (struct sock *)tcp_sk(sk)->mptcp->next)
++
++#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) \
++ for (__sk = (struct sock *)(__mpcb)->connection_list, \
++ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \
++ __sk; \
++ __sk = __temp, \
++ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL)
++
++/* Iterates over all bit set to 1 in a bitset */
++#define mptcp_for_each_bit_set(b, i) \
++ for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1)
++
++#define mptcp_for_each_bit_unset(b, i) \
++ mptcp_for_each_bit_set(~b, i)
++
++extern struct lock_class_key meta_key;
++extern struct lock_class_key meta_slock_key;
++extern u32 mptcp_secret[MD5_MESSAGE_BYTES / 4];
++
++/* This is needed to ensure that two subsequent key/nonce-generation result in
++ * different keys/nonces if the IPs and ports are the same.
++ */
++extern u32 mptcp_seed;
++
++#define MPTCP_HASH_SIZE 1024
++
++extern struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
++
++/* This second hashtable is needed to retrieve request socks
++ * created as a result of a join request. While the SYN contains
++ * the token, the final ack does not, so we need a separate hashtable
++ * to retrieve the mpcb.
++ */
++extern struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
++extern spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
++
++/* Lock, protecting the two hash-tables that hold the token. Namely,
++ * mptcp_reqsk_tk_htb and tk_hashtable
++ */
++extern spinlock_t mptcp_tk_hashlock; /* hashtable protection */
++
++/* Request-sockets can be hashed in the tk_htb for collision-detection or in
++ * the regular htb for join-connections. We need to define different NULLS
++ * values so that we can correctly detect a request-socket that has been
++ * recycled. See also c25eb3bfb9729.
++ */
++#define MPTCP_REQSK_NULLS_BASE (1U << 29)
++
++
++void mptcp_data_ready(struct sock *sk);
++void mptcp_write_space(struct sock *sk);
++
++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
++ struct sock *sk);
++void mptcp_ofo_queue(struct sock *meta_sk);
++void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp);
++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied);
++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
++ gfp_t flags);
++void mptcp_del_sock(struct sock *sk);
++void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk);
++void mptcp_reinject_data(struct sock *orig_sk, int clone_it);
++void mptcp_update_sndbuf(const struct tcp_sock *tp);
++void mptcp_send_fin(struct sock *meta_sk);
++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority);
++bool mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
++ int push_one, gfp_t gfp);
++void tcp_parse_mptcp_options(const struct sk_buff *skb,
++ struct mptcp_options_received *mopt);
++void mptcp_parse_options(const uint8_t *ptr, int opsize,
++ struct mptcp_options_received *mopt,
++ const struct sk_buff *skb);
++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
++ unsigned *remaining);
++void mptcp_synack_options(struct request_sock *req,
++ struct tcp_out_options *opts,
++ unsigned *remaining);
++void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
++ struct tcp_out_options *opts, unsigned *size);
++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
++ const struct tcp_out_options *opts,
++ struct sk_buff *skb);
++void mptcp_close(struct sock *meta_sk, long timeout);
++int mptcp_doit(struct sock *sk);
++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window);
++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req);
++int mptcp_check_req_master(struct sock *sk, struct sock *child,
++ struct request_sock *req,
++ struct request_sock **prev);
++struct sock *mptcp_check_req_child(struct sock *sk, struct sock *child,
++ struct request_sock *req,
++ struct request_sock **prev,
++ const struct mptcp_options_received *mopt);
++u32 __mptcp_select_window(struct sock *sk);
++void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
++ __u32 *window_clamp, int wscale_ok,
++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
++ const struct sock *sk);
++unsigned int mptcp_current_mss(struct sock *meta_sk);
++int mptcp_select_size(const struct sock *meta_sk, bool sg);
++void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn);
++void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
++ u32 *hash_out);
++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk);
++void mptcp_fin(struct sock *meta_sk);
++void mptcp_retransmit_timer(struct sock *meta_sk);
++int mptcp_write_wakeup(struct sock *meta_sk);
++void mptcp_sub_close_wq(struct work_struct *work);
++void mptcp_sub_close(struct sock *sk, unsigned long delay);
++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk);
++void mptcp_fallback_meta_sk(struct sock *meta_sk);
++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb);
++void mptcp_ack_handler(unsigned long);
++int mptcp_check_rtt(const struct tcp_sock *tp, int time);
++int mptcp_check_snd_buf(const struct tcp_sock *tp);
++int mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
++ const struct sk_buff *skb);
++void __init mptcp_init(void);
++int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len);
++void mptcp_destroy_sock(struct sock *sk);
++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
++ const struct sk_buff *skb,
++ const struct mptcp_options_received *mopt);
++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
++ int large_allowed);
++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw);
++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw);
++void mptcp_time_wait(struct sock *sk, int state, int timeo);
++void mptcp_disconnect(struct sock *sk);
++bool mptcp_should_expand_sndbuf(const struct sock *sk);
++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb);
++void mptcp_tsq_flags(struct sock *sk);
++void mptcp_tsq_sub_deferred(struct sock *meta_sk);
++struct mp_join *mptcp_find_join(const struct sk_buff *skb);
++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp);
++void mptcp_hash_remove(struct tcp_sock *meta_tp);
++struct sock *mptcp_hash_find(const struct net *net, const u32 token);
++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw);
++int mptcp_do_join_short(struct sk_buff *skb,
++ const struct mptcp_options_received *mopt,
++ struct net *net);
++void mptcp_reqsk_destructor(struct request_sock *req);
++void mptcp_reqsk_new_mptcp(struct request_sock *req,
++ const struct mptcp_options_received *mopt,
++ const struct sk_buff *skb);
++int mptcp_check_req(struct sk_buff *skb, struct net *net);
++void mptcp_connect_init(struct sock *sk);
++void mptcp_sub_force_close(struct sock *sk);
++int mptcp_sub_len_remove_addr_align(u16 bitfield);
++void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
++ const struct sk_buff *skb);
++void mptcp_init_buffer_space(struct sock *sk);
++void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req,
++ struct sk_buff *skb);
++void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb);
++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb);
++void mptcp_init_congestion_control(struct sock *sk);
++
++/* MPTCP-path-manager registration/initialization functions */
++int mptcp_register_path_manager(struct mptcp_pm_ops *pm);
++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm);
++void mptcp_init_path_manager(struct mptcp_cb *mpcb);
++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb);
++void mptcp_fallback_default(struct mptcp_cb *mpcb);
++void mptcp_get_default_path_manager(char *name);
++int mptcp_set_default_path_manager(const char *name);
++extern struct mptcp_pm_ops mptcp_pm_default;
++
++/* MPTCP-scheduler registration/initialization functions */
++int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
++void mptcp_init_scheduler(struct mptcp_cb *mpcb);
++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb);
++void mptcp_get_default_scheduler(char *name);
++int mptcp_set_default_scheduler(const char *name);
++extern struct mptcp_sched_ops mptcp_sched_default;
++
++static inline void mptcp_reset_synack_timer(struct sock *meta_sk,
++ unsigned long len)
++{
++ sk_reset_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer,
++ jiffies + len);
++}
++
++static inline void mptcp_delete_synack_timer(struct sock *meta_sk)
++{
++ sk_stop_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer);
++}
++
++static inline bool is_mptcp_enabled(const struct sock *sk)
++{
++ if (!sysctl_mptcp_enabled || mptcp_init_failed)
++ return false;
++
++ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
++ return false;
++
++ return true;
++}
++
++static inline int mptcp_pi_to_flag(int pi)
++{
++ return 1 << (pi - 1);
++}
++
++static inline
++struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req)
++{
++ return (struct mptcp_request_sock *)req;
++}
++
++static inline
++struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req)
++{
++ return (struct request_sock *)req;
++}
++
++static inline bool mptcp_can_sendpage(struct sock *sk)
++{
++ struct sock *sk_it;
++
++ if (tcp_sk(sk)->mpcb->dss_csum)
++ return false;
++
++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) {
++ if (!(sk_it->sk_route_caps & NETIF_F_SG) ||
++ !(sk_it->sk_route_caps & NETIF_F_ALL_CSUM))
++ return false;
++ }
++
++ return true;
++}
++
++static inline void mptcp_push_pending_frames(struct sock *meta_sk)
++{
++ /* We check packets out and send-head here. TCP only checks the
++ * send-head. But, MPTCP also checks packets_out, as this is an
++ * indication that we might want to do opportunistic reinjection.
++ */
++ if (tcp_sk(meta_sk)->packets_out || tcp_send_head(meta_sk)) {
++ struct tcp_sock *tp = tcp_sk(meta_sk);
++
++ /* We don't care about the MSS, because it will be set in
++ * mptcp_write_xmit.
++ */
++ __tcp_push_pending_frames(meta_sk, 0, tp->nonagle);
++ }
++}
++
++static inline void mptcp_send_reset(struct sock *sk)
++{
++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
++ mptcp_sub_force_close(sk);
++}
++
++static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
++{
++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ;
++}
++
++static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
++{
++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN;
++}
++
++/* Is it a data-fin while in infinite mapping mode?
++ * In infinite mode, a subflow-fin is in fact a data-fin.
++ */
++static inline bool mptcp_is_data_fin2(const struct sk_buff *skb,
++ const struct tcp_sock *tp)
++{
++ return mptcp_is_data_fin(skb) ||
++ (tp->mpcb->infinite_mapping_rcv && tcp_hdr(skb)->fin);
++}
++
++static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
++{
++ u64 data_seq_high = (u32)(data_seq >> 32);
++
++ if (mpcb->rcv_high_order[0] == data_seq_high)
++ return 0;
++ else if (mpcb->rcv_high_order[1] == data_seq_high)
++ return MPTCPHDR_SEQ64_INDEX;
++ else
++ return MPTCPHDR_SEQ64_OFO;
++}
++
++/* Sets the data_seq and returns pointer to the in-skb field of the data_seq.
++ * If the packet has a 64-bit dseq, the pointer points to the last 32 bits.
++ */
++static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb,
++ u32 *data_seq,
++ struct mptcp_cb *mpcb)
++{
++ __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
++
++ if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
++ u64 data_seq64 = get_unaligned_be64(ptr);
++
++ if (mpcb)
++ TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);
++
++ *data_seq = (u32)data_seq64;
++ ptr++;
++ } else {
++ *data_seq = get_unaligned_be32(ptr);
++ }
++
++ return ptr;
++}
++
++static inline struct sock *mptcp_meta_sk(const struct sock *sk)
++{
++ return tcp_sk(sk)->meta_sk;
++}
++
++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
++{
++ return tcp_sk(tp->meta_sk);
++}
++
++static inline int is_meta_tp(const struct tcp_sock *tp)
++{
++ return tp->mpcb && mptcp_meta_tp(tp) == tp;
++}
++
++static inline int is_meta_sk(const struct sock *sk)
++{
++ return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP &&
++ mptcp(tcp_sk(sk)) && mptcp_meta_sk(sk) == sk;
++}
++
++static inline int is_master_tp(const struct tcp_sock *tp)
++{
++ return !mptcp(tp) || (!tp->mptcp->slave_sk && !is_meta_tp(tp));
++}
++
++static inline void mptcp_hash_request_remove(struct request_sock *req)
++{
++ int in_softirq = 0;
++
++ if (hlist_nulls_unhashed(&mptcp_rsk(req)->hash_entry))
++ return;
++
++ if (in_softirq()) {
++ spin_lock(&mptcp_reqsk_hlock);
++ in_softirq = 1;
++ } else {
++ spin_lock_bh(&mptcp_reqsk_hlock);
++ }
++
++ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry);
++
++ if (in_softirq)
++ spin_unlock(&mptcp_reqsk_hlock);
++ else
++ spin_unlock_bh(&mptcp_reqsk_hlock);
++}
++
++static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt)
++{
++ mopt->saw_mpc = 0;
++ mopt->dss_csum = 0;
++ mopt->drop_me = 0;
++
++ mopt->is_mp_join = 0;
++ mopt->join_ack = 0;
++
++ mopt->saw_low_prio = 0;
++ mopt->low_prio = 0;
++
++ mopt->saw_add_addr = 0;
++ mopt->more_add_addr = 0;
++
++ mopt->saw_rem_addr = 0;
++ mopt->more_rem_addr = 0;
++
++ mopt->mp_fail = 0;
++ mopt->mp_fclose = 0;
++}
++
++static inline void mptcp_reset_mopt(struct tcp_sock *tp)
++{
++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
++
++ mopt->saw_low_prio = 0;
++ mopt->saw_add_addr = 0;
++ mopt->more_add_addr = 0;
++ mopt->saw_rem_addr = 0;
++ mopt->more_rem_addr = 0;
++ mopt->join_ack = 0;
++ mopt->mp_fail = 0;
++ mopt->mp_fclose = 0;
++}
++
++static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb,
++ const struct mptcp_cb *mpcb)
++{
++ return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
++ MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
++}
++
++static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index,
++ u32 data_seq_32)
++{
++ return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
++}
++
++static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
++{
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
++ meta_tp->rcv_nxt);
++}
++
++static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
++{
++ if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
++ mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
++ }
++}
++
++static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp,
++ u32 old_rcv_nxt)
++{
++ if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
++ mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
++ }
++}
++
++static inline int mptcp_sk_can_send(const struct sock *sk)
++{
++ return tcp_passive_fastopen(sk) ||
++ ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
++ !tcp_sk(sk)->mptcp->pre_established);
++}
++
++static inline int mptcp_sk_can_recv(const struct sock *sk)
++{
++ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2);
++}
++
++static inline int mptcp_sk_can_send_ack(const struct sock *sk)
++{
++ return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV |
++ TCPF_CLOSE | TCPF_LISTEN)) &&
++ !tcp_sk(sk)->mptcp->pre_established;
++}
++
++/* Only support GSO if all subflows supports it */
++static inline bool mptcp_sk_can_gso(const struct sock *meta_sk)
++{
++ struct sock *sk;
++
++ if (tcp_sk(meta_sk)->mpcb->dss_csum)
++ return false;
++
++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
++ if (!mptcp_sk_can_send(sk))
++ continue;
++ if (!sk_can_gso(sk))
++ return false;
++ }
++ return true;
++}
++
++static inline bool mptcp_can_sg(const struct sock *meta_sk)
++{
++ struct sock *sk;
++
++ if (tcp_sk(meta_sk)->mpcb->dss_csum)
++ return false;
++
++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
++ if (!mptcp_sk_can_send(sk))
++ continue;
++ if (!(sk->sk_route_caps & NETIF_F_SG))
++ return false;
++ }
++ return true;
++}
++
++static inline void mptcp_set_rto(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct sock *sk_it;
++ struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk));
++ __u32 max_rto = 0;
++
++ /* We are in recovery-phase on the MPTCP-level. Do not update the
++ * RTO, because this would kill exponential backoff.
++ */
++ if (micsk->icsk_retransmits)
++ return;
++
++ mptcp_for_each_sk(tp->mpcb, sk_it) {
++ if (mptcp_sk_can_send(sk_it) &&
++ inet_csk(sk_it)->icsk_rto > max_rto)
++ max_rto = inet_csk(sk_it)->icsk_rto;
++ }
++ if (max_rto) {
++ micsk->icsk_rto = max_rto << 1;
++
++ /* A successfull rto-measurement - reset backoff counter */
++ micsk->icsk_backoff = 0;
++ }
++}
++
++static inline int mptcp_sysctl_syn_retries(void)
++{
++ return sysctl_mptcp_syn_retries;
++}
++
++static inline void mptcp_sub_close_passive(struct sock *sk)
++{
++ struct sock *meta_sk = mptcp_meta_sk(sk);
++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk);
++
++ /* Only close, if the app did a send-shutdown (passive close), and we
++ * received the data-ack of the data-fin.
++ */
++ if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq)
++ mptcp_sub_close(sk, 0);
++}
++
++static inline bool mptcp_fallback_infinite(struct sock *sk, int flag)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ /* If data has been acknowleged on the meta-level, fully_established
++ * will have been set before and thus we will not fall back to infinite
++ * mapping.
++ */
++ if (likely(tp->mptcp->fully_established))
++ return false;
++
++ if (!(flag & MPTCP_FLAG_DATA_ACKED))
++ return false;
++
++ /* Don't fallback twice ;) */
++ if (tp->mpcb->infinite_mapping_snd)
++ return false;
++
++ pr_err("%s %#x will fallback - pi %d, src %pI4 dst %pI4 from %pS\n",
++ __func__, tp->mpcb->mptcp_loc_token, tp->mptcp->path_index,
++ &inet_sk(sk)->inet_saddr, &inet_sk(sk)->inet_daddr,
++ __builtin_return_address(0));
++ if (!is_master_tp(tp))
++ return true;
++
++ tp->mpcb->infinite_mapping_snd = 1;
++ tp->mpcb->infinite_mapping_rcv = 1;
++ tp->mptcp->fully_established = 1;
++
++ return false;
++}
++
++/* Find the first index whose bit in the bit-field == 0 */
++static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb)
++{
++ u8 base = mpcb->next_path_index;
++ int i;
++
++ /* Start at 1, because 0 is reserved for the meta-sk */
++ mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) {
++ if (i + base < 1)
++ continue;
++ if (i + base >= sizeof(mpcb->path_index_bits) * 8)
++ break;
++ i += base;
++ mpcb->path_index_bits |= (1 << i);
++ mpcb->next_path_index = i + 1;
++ return i;
++ }
++ mptcp_for_each_bit_unset(mpcb->path_index_bits, i) {
++ if (i >= sizeof(mpcb->path_index_bits) * 8)
1586 |
++ break; |
1587 |
++ if (i < 1) |
1588 |
++ continue; |
1589 |
++ mpcb->path_index_bits |= (1 << i); |
1590 |
++ mpcb->next_path_index = i + 1; |
1591 |
++ return i; |
1592 |
++ } |
1593 |
++ |
1594 |
++ return 0; |
1595 |
++} |
1596 |
++ |
1597 |
++static inline bool mptcp_v6_is_v4_mapped(const struct sock *sk) |
1598 |
++{ |
1599 |
++ return sk->sk_family == AF_INET6 && |
1600 |
++ ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED; |
1601 |
++} |
1602 |
++ |
1603 |
++/* TCP and MPTCP mpc flag-depending functions */ |
1604 |
++u16 mptcp_select_window(struct sock *sk); |
1605 |
++void mptcp_init_buffer_space(struct sock *sk); |
1606 |
++void mptcp_tcp_set_rto(struct sock *sk); |
1607 |
++ |
1608 |
++/* TCP and MPTCP flag-depending functions */ |
1609 |
++bool mptcp_prune_ofo_queue(struct sock *sk); |
1610 |
++ |
1611 |
++#else /* CONFIG_MPTCP */ |
1612 |
++#define mptcp_debug(fmt, args...) \ |
1613 |
++ do { \ |
1614 |
++ } while (0) |
1615 |
++ |
1616 |
++/* Without MPTCP, we just do one iteration |
1617 |
++ * over the only socket available. This assumes that |
1618 |
++ * the sk/tp arg is the socket in that case. |
1619 |
++ */ |
1620 |
++#define mptcp_for_each_sk(mpcb, sk) |
1621 |
++#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) |
1622 |
++ |
1623 |
++static inline bool mptcp_is_data_fin(const struct sk_buff *skb) |
1624 |
++{ |
1625 |
++ return false; |
1626 |
++} |
1627 |
++static inline bool mptcp_is_data_seq(const struct sk_buff *skb) |
1628 |
++{ |
1629 |
++ return false; |
1630 |
++} |
1631 |
++static inline struct sock *mptcp_meta_sk(const struct sock *sk) |
1632 |
++{ |
1633 |
++ return NULL; |
1634 |
++} |
1635 |
++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp) |
1636 |
++{ |
1637 |
++ return NULL; |
1638 |
++} |
1639 |
++static inline int is_meta_sk(const struct sock *sk) |
1640 |
++{ |
1641 |
++ return 0; |
1642 |
++} |
1643 |
++static inline int is_master_tp(const struct tcp_sock *tp) |
1644 |
++{ |
1645 |
++ return 0; |
1646 |
++} |
1647 |
++static inline void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) {} |
1648 |
++static inline void mptcp_del_sock(const struct sock *sk) {} |
1649 |
++static inline void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk) {} |
1650 |
++static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {} |
1651 |
++static inline void mptcp_update_sndbuf(const struct tcp_sock *tp) {} |
1652 |
++static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb, |
1653 |
++ const struct sock *sk) {} |
1654 |
++static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {} |
1655 |
++static inline void mptcp_set_rto(const struct sock *sk) {} |
1656 |
++static inline void mptcp_send_fin(const struct sock *meta_sk) {} |
1657 |
++static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize, |
1658 |
++ const struct mptcp_options_received *mopt, |
1659 |
++ const struct sk_buff *skb) {} |
1660 |
++static inline void mptcp_syn_options(const struct sock *sk, |
1661 |
++ struct tcp_out_options *opts, |
1662 |
++ unsigned *remaining) {} |
1663 |
++static inline void mptcp_synack_options(struct request_sock *req, |
1664 |
++ struct tcp_out_options *opts, |
1665 |
++ unsigned *remaining) {} |
1666 |
++ |
1667 |
++static inline void mptcp_established_options(struct sock *sk, |
1668 |
++ struct sk_buff *skb, |
1669 |
++ struct tcp_out_options *opts, |
1670 |
++ unsigned *size) {} |
1671 |
++static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, |
1672 |
++ const struct tcp_out_options *opts, |
1673 |
++ struct sk_buff *skb) {} |
1674 |
++static inline void mptcp_close(struct sock *meta_sk, long timeout) {} |
1675 |
++static inline int mptcp_doit(struct sock *sk) |
1676 |
++{ |
1677 |
++ return 0; |
1678 |
++} |
1679 |
++static inline int mptcp_check_req_fastopen(struct sock *child, |
1680 |
++ struct request_sock *req) |
1681 |
++{ |
1682 |
++ return 1; |
1683 |
++} |
1684 |
++static inline int mptcp_check_req_master(const struct sock *sk, |
1685 |
++ const struct sock *child, |
1686 |
++ struct request_sock *req, |
1687 |
++ struct request_sock **prev) |
1688 |
++{ |
1689 |
++ return 1; |
1690 |
++} |
1691 |
++static inline struct sock *mptcp_check_req_child(struct sock *sk, |
1692 |
++ struct sock *child, |
1693 |
++ struct request_sock *req, |
1694 |
++ struct request_sock **prev, |
1695 |
++ const struct mptcp_options_received *mopt) |
1696 |
++{ |
1697 |
++ return NULL; |
1698 |
++} |
1699 |
++static inline unsigned int mptcp_current_mss(struct sock *meta_sk) |
1700 |
++{ |
1701 |
++ return 0; |
1702 |
++} |
1703 |
++static inline int mptcp_select_size(const struct sock *meta_sk, bool sg) |
1704 |
++{ |
1705 |
++ return 0; |
1706 |
++} |
1707 |
++static inline void mptcp_sub_close_passive(struct sock *sk) {} |
1708 |
++static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag) |
1709 |
++{ |
1710 |
++ return false; |
1711 |
++} |
1712 |
++static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {} |
1713 |
++static inline int mptcp_check_rtt(const struct tcp_sock *tp, int time) |
1714 |
++{ |
1715 |
++ return 0; |
1716 |
++} |
1717 |
++static inline int mptcp_check_snd_buf(const struct tcp_sock *tp) |
1718 |
++{ |
1719 |
++ return 0; |
1720 |
++} |
1721 |
++static inline int mptcp_sysctl_syn_retries(void) |
1722 |
++{ |
1723 |
++ return 0; |
1724 |
++} |
1725 |
++static inline void mptcp_send_reset(const struct sock *sk) {} |
1726 |
++static inline int mptcp_handle_options(struct sock *sk, |
1727 |
++ const struct tcphdr *th, |
1728 |
++ struct sk_buff *skb) |
1729 |
++{ |
1730 |
++ return 0; |
1731 |
++} |
1732 |
++static inline void mptcp_reset_mopt(struct tcp_sock *tp) {} |
1733 |
++static inline void __init mptcp_init(void) {} |
1734 |
++static inline int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) |
1735 |
++{ |
1736 |
++ return 0; |
1737 |
++} |
1738 |
++static inline bool mptcp_sk_can_gso(const struct sock *sk) |
1739 |
++{ |
1740 |
++ return false; |
1741 |
++} |
1742 |
++static inline bool mptcp_can_sg(const struct sock *meta_sk) |
1743 |
++{ |
1744 |
++ return false; |
1745 |
++} |
1746 |
++static inline unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, |
1747 |
++ u32 mss_now, int large_allowed) |
1748 |
++{ |
1749 |
++ return 0; |
1750 |
++} |
1751 |
++static inline void mptcp_destroy_sock(struct sock *sk) {} |
1752 |
++static inline int mptcp_rcv_synsent_state_process(struct sock *sk, |
1753 |
++ struct sock **skptr, |
1754 |
++ struct sk_buff *skb, |
1755 |
++ const struct mptcp_options_received *mopt) |
1756 |
++{ |
1757 |
++ return 0; |
1758 |
++} |
1759 |
++static inline bool mptcp_can_sendpage(struct sock *sk) |
1760 |
++{ |
1761 |
++ return false; |
1762 |
++} |
1763 |
++static inline int mptcp_init_tw_sock(struct sock *sk, |
1764 |
++ struct tcp_timewait_sock *tw) |
1765 |
++{ |
1766 |
++ return 0; |
1767 |
++} |
1768 |
++static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {} |
1769 |
++static inline void mptcp_disconnect(struct sock *sk) {} |
1770 |
++static inline void mptcp_tsq_flags(struct sock *sk) {} |
1771 |
++static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {} |
1772 |
++static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {} |
1773 |
++static inline void mptcp_hash_remove(struct tcp_sock *meta_tp) {} |
1774 |
++static inline void mptcp_reqsk_new_mptcp(struct request_sock *req, |
1775 |
++ const struct tcp_options_received *rx_opt, |
1776 |
++ const struct mptcp_options_received *mopt, |
1777 |
++ const struct sk_buff *skb) {} |
1778 |
++static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb, |
1779 |
++ const struct sk_buff *skb) {} |
1780 |
++static inline void mptcp_delete_synack_timer(struct sock *meta_sk) {} |
1781 |
++#endif /* CONFIG_MPTCP */ |
1782 |
++ |
1783 |
++#endif /* _MPTCP_H */ |
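
The sequence-number helpers above extend the 32-bit data sequence numbers carried on the wire to the full 64-bit MPTCP data-level space: each direction keeps two alternating high-order words, and the current word is bumped by 2 whenever the low 32 bits wrap. A minimal stand-alone C sketch of that bookkeeping, with illustrative initial epoch values (hypothetical code, not part of the patch):

	/* Hypothetical stand-alone model of mptcp_get_data_seq_64() and
	 * mptcp_check_rcvseq_wrap(); not part of the patch. */
	#include <stdint.h>
	#include <stdio.h>

	struct seq_state {
		uint32_t high_order[2];	/* the two high-order "epochs" */
		int hiseq_index;	/* epoch that rcv_nxt currently lives in */
	};

	static uint64_t get_seq64(const struct seq_state *s, int idx, uint32_t seq32)
	{
		/* mirrors mptcp_get_data_seq_64() */
		return ((uint64_t)s->high_order[idx] << 32) | seq32;
	}

	static void check_rcvseq_wrap(struct seq_state *s, uint32_t old_seq,
				      uint32_t new_seq)
	{
		/* mirrors mptcp_check_rcvseq_wrap(): on a 32-bit wrap, advance
		 * the stale epoch by 2 and switch to the other slot */
		if (old_seq > new_seq) {
			s->high_order[s->hiseq_index] += 2;
			s->hiseq_index = s->hiseq_index ? 0 : 1;
		}
	}

	int main(void)
	{
		struct seq_state s = { { 0, 1 }, 0 };	/* illustrative initial epochs */
		uint32_t old = 0xfffffff0u, cur = 0x10u;	/* rcv_nxt wrapped */

		check_rcvseq_wrap(&s, old, cur);
		printf("64-bit data seq: %llu\n",
		       (unsigned long long)get_seq64(&s, s.hiseq_index, cur));
		return 0;
	}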
+diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h
+new file mode 100644
+index 000000000000..93ad97c77c5a
+--- /dev/null
++++ b/include/net/mptcp_v4.h
+@@ -0,0 +1,67 @@
++/*
++ * MPTCP implementation
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@×××××××××.be>
++ *
++ * Current Maintainer & Author:
++ * Christoph Paasch <christoph.paasch@×××××××××.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
++ * Gregory Detal <gregory.detal@×××××××××.be>
++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
++ * Lavkesh Lahngir <lavkesh51@×××××.com>
++ * Andreas Ripke <ripke@××××××.eu>
++ * Vlad Dogaru <vlad.dogaru@×××××.com>
++ * Octavian Purdila <octavian.purdila@×××××.com>
++ * John Ronan <jronan@××××.org>
++ * Catalin Nicutar <catalin.nicutar@×××××.com>
++ * Brandon Heller <brandonh@××××××××.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#ifndef MPTCP_V4_H_
++#define MPTCP_V4_H_
++
++
++#include <linux/in.h>
++#include <linux/skbuff.h>
++#include <net/mptcp.h>
++#include <net/request_sock.h>
++#include <net/sock.h>
++
++extern struct request_sock_ops mptcp_request_sock_ops;
++extern const struct inet_connection_sock_af_ops mptcp_v4_specific;
++extern struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
++
++#ifdef CONFIG_MPTCP
++
++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
++				 const __be32 laddr, const struct net *net);
++int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
++			   struct mptcp_rem4 *rem);
++int mptcp_pm_v4_init(void);
++void mptcp_pm_v4_undo(void);
++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
++
++#else
++
++static inline int mptcp_v4_do_rcv(const struct sock *meta_sk,
++				  const struct sk_buff *skb)
++{
++	return 0;
++}
++
++#endif /* CONFIG_MPTCP */
++
++#endif /* MPTCP_V4_H_ */
+diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h
+new file mode 100644
+index 000000000000..49a4f30ccd4d
+--- /dev/null
++++ b/include/net/mptcp_v6.h
+@@ -0,0 +1,69 @@
++/*
++ * MPTCP implementation
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@×××××××××.be>
++ *
++ * Current Maintainer & Author:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
++ * Gregory Detal <gregory.detal@×××××××××.be>
++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
++ * Lavkesh Lahngir <lavkesh51@×××××.com>
++ * Andreas Ripke <ripke@××××××.eu>
++ * Vlad Dogaru <vlad.dogaru@×××××.com>
++ * Octavian Purdila <octavian.purdila@×××××.com>
++ * John Ronan <jronan@××××.org>
++ * Catalin Nicutar <catalin.nicutar@×××××.com>
++ * Brandon Heller <brandonh@××××××××.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#ifndef _MPTCP_V6_H
++#define _MPTCP_V6_H
++
++#include <linux/in6.h>
++#include <net/if_inet6.h>
++
++#include <net/mptcp.h>
++
++
++#ifdef CONFIG_MPTCP
++extern const struct inet_connection_sock_af_ops mptcp_v6_mapped;
++extern const struct inet_connection_sock_af_ops mptcp_v6_specific;
++extern struct request_sock_ops mptcp6_request_sock_ops;
++extern struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
++
++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
++				 const struct in6_addr *laddr, const struct net *net);
++int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
++			   struct mptcp_rem6 *rem);
++int mptcp_pm_v6_init(void);
++void mptcp_pm_v6_undo(void);
++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
++			 __be16 sport, __be16 dport);
++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
++		     __be16 sport, __be16 dport);
++
++#else /* CONFIG_MPTCP */
++
++#define mptcp_v6_mapped ipv6_mapped
++
++static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
++{
++	return 0;
++}
++
++#endif /* CONFIG_MPTCP */
++
++#endif /* _MPTCP_V6_H */
+diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
+index 361d26077196..bae95a11c531 100644
+--- a/include/net/net_namespace.h
++++ b/include/net/net_namespace.h
+@@ -16,6 +16,7 @@
+ #include <net/netns/packet.h>
+ #include <net/netns/ipv4.h>
+ #include <net/netns/ipv6.h>
++#include <net/netns/mptcp.h>
+ #include <net/netns/ieee802154_6lowpan.h>
+ #include <net/netns/sctp.h>
+ #include <net/netns/dccp.h>
+@@ -92,6 +93,9 @@ struct net {
+ #if IS_ENABLED(CONFIG_IPV6)
+ 	struct netns_ipv6	ipv6;
+ #endif
++#if IS_ENABLED(CONFIG_MPTCP)
++	struct netns_mptcp	mptcp;
++#endif
+ #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
+ 	struct netns_ieee802154_lowpan	ieee802154_lowpan;
+ #endif
+diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h
+new file mode 100644
+index 000000000000..bad418b04cc8
+--- /dev/null
++++ b/include/net/netns/mptcp.h
+@@ -0,0 +1,44 @@
++/*
++ * MPTCP implementation - MPTCP namespace
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@×××××××××.be>
++ *
++ * Current Maintainer:
++ * Christoph Paasch <christoph.paasch@×××××××××.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
++ * Gregory Detal <gregory.detal@×××××××××.be>
++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
++ * Lavkesh Lahngir <lavkesh51@×××××.com>
++ * Andreas Ripke <ripke@××××××.eu>
++ * Vlad Dogaru <vlad.dogaru@×××××.com>
++ * Octavian Purdila <octavian.purdila@×××××.com>
++ * John Ronan <jronan@××××.org>
++ * Catalin Nicutar <catalin.nicutar@×××××.com>
++ * Brandon Heller <brandonh@××××××××.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#ifndef __NETNS_MPTCP_H__
++#define __NETNS_MPTCP_H__
++
++#include <linux/compiler.h>
++
++enum {
++	MPTCP_PM_FULLMESH = 0,
++	MPTCP_PM_MAX
++};
++
++struct netns_mptcp {
++	void *path_managers[MPTCP_PM_MAX];
++};
++
++#endif /* __NETNS_MPTCP_H__ */
+diff --git a/include/net/request_sock.h b/include/net/request_sock.h
+index 7f830ff67f08..e79e87a8e1a6 100644
+--- a/include/net/request_sock.h
++++ b/include/net/request_sock.h
+@@ -164,7 +164,7 @@ struct request_sock_queue {
+ };
+
+ int reqsk_queue_alloc(struct request_sock_queue *queue,
+-		      unsigned int nr_table_entries);
++		      unsigned int nr_table_entries, gfp_t flags);
+
+ void __reqsk_queue_destroy(struct request_sock_queue *queue);
+ void reqsk_queue_destroy(struct request_sock_queue *queue);
+diff --git a/include/net/sock.h b/include/net/sock.h
+index 156350745700..0e23cae8861f 100644
+--- a/include/net/sock.h
++++ b/include/net/sock.h
+@@ -901,6 +901,16 @@ void sk_clear_memalloc(struct sock *sk);
+
+ int sk_wait_data(struct sock *sk, long *timeo);
+
++/* START - needed for MPTCP */
++struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family);
++void sock_lock_init(struct sock *sk);
++
++extern struct lock_class_key af_callback_keys[AF_MAX];
++extern char *const af_family_clock_key_strings[AF_MAX+1];
++
++#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
++/* END - needed for MPTCP */
++
+ struct request_sock_ops;
+ struct timewait_sock_ops;
+ struct inet_hashinfo;
+diff --git a/include/net/tcp.h b/include/net/tcp.h
+index 7286db80e8b8..ff92e74cd684 100644
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -177,6 +177,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
+ #define TCPOPT_SACK		5	/* SACK Block */
+ #define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
+ #define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
++#define TCPOPT_MPTCP		30
+ #define TCPOPT_EXP		254	/* Experimental */
+ /* Magic number to be after the option value for sharing TCP
+  * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
+@@ -229,6 +230,27 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
+ #define	TFO_SERVER_WO_SOCKOPT1	0x400
+ #define	TFO_SERVER_WO_SOCKOPT2	0x800
+
++/* Flags from tcp_input.c for tcp_ack */
++#define FLAG_DATA		0x01	/* Incoming frame contained data. */
++#define FLAG_WIN_UPDATE		0x02	/* Incoming ACK was a window update. */
++#define FLAG_DATA_ACKED		0x04	/* This ACK acknowledged new data. */
++#define FLAG_RETRANS_DATA_ACKED	0x08	/* "" "" some of which was retransmitted. */
++#define FLAG_SYN_ACKED		0x10	/* This ACK acknowledged SYN. */
++#define FLAG_DATA_SACKED	0x20	/* New SACK. */
++#define FLAG_ECE		0x40	/* ECE in this ACK */
++#define FLAG_SLOWPATH		0x100	/* Do not skip RFC checks for window update.*/
++#define FLAG_ORIG_SACK_ACKED	0x200	/* Never retransmitted data are (s)acked */
++#define FLAG_SND_UNA_ADVANCED	0x400	/* Snd_una was changed (!= FLAG_DATA_ACKED) */
++#define FLAG_DSACKING_ACK	0x800	/* SACK blocks contained D-SACK info */
++#define FLAG_SACK_RENEGING	0x2000	/* snd_una advanced to a sacked seq */
++#define FLAG_UPDATE_TS_RECENT	0x4000	/* tcp_replace_ts_recent() */
++#define MPTCP_FLAG_DATA_ACKED	0x8000
++
++#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
++#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
++#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
++#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
++
+ extern struct inet_timewait_death_row tcp_death_row;
+
+ /* sysctl variables for tcp */
+@@ -344,6 +366,107 @@ extern struct proto tcp_prot;
+ #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
+ #define TCP_ADD_STATS(net, field, val)	SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
+
++/**** START - Exports needed for MPTCP ****/
++extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
++extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
++
++struct mptcp_options_received;
++
++void tcp_enter_quickack_mode(struct sock *sk);
++int tcp_close_state(struct sock *sk);
++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
++			 const struct sk_buff *skb);
++int tcp_xmit_probe_skb(struct sock *sk, int urgent);
++void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb);
++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
++		     gfp_t gfp_mask);
++unsigned int tcp_mss_split_point(const struct sock *sk,
++				 const struct sk_buff *skb,
++				 unsigned int mss_now,
++				 unsigned int max_segs,
++				 int nonagle);
++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
++		    unsigned int cur_mss, int nonagle);
++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
++		      unsigned int cur_mss);
++unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb);
++int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
++		      unsigned int mss_now);
++void __pskb_trim_head(struct sk_buff *skb, int len);
++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
++void tcp_reset(struct sock *sk);
++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
++			   const u32 ack_seq, const u32 nwin);
++bool tcp_urg_mode(const struct tcp_sock *tp);
++void tcp_ack_probe(struct sock *sk);
++void tcp_rearm_rto(struct sock *sk);
++int tcp_write_timeout(struct sock *sk);
++bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
++			   unsigned int timeout, bool syn_set);
++void tcp_write_err(struct sock *sk);
++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr);
++void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
++			  unsigned int mss_now);
++
++int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req);
++void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
++			   struct request_sock *req);
++__u32 tcp_v4_init_sequence(const struct sk_buff *skb);
++int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
++		       struct flowi *fl,
++		       struct request_sock *req,
++		       u16 queue_mapping,
++		       struct tcp_fastopen_cookie *foc);
++void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb);
++struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb);
++struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb);
++void tcp_v4_reqsk_destructor(struct request_sock *req);
++
++int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req);
++void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
++			   struct request_sock *req);
++__u32 tcp_v6_init_sequence(const struct sk_buff *skb);
++int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
++		       struct flowi *fl, struct request_sock *req,
++		       u16 queue_mapping, struct tcp_fastopen_cookie *foc);
++void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
++void tcp_v6_destroy_sock(struct sock *sk);
++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
++void tcp_v6_hash(struct sock *sk);
++struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb);
++struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
++				  struct request_sock *req,
++				  struct dst_entry *dst);
++void tcp_v6_reqsk_destructor(struct request_sock *req);
++
++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
++				int large_allowed);
++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb);
++
++void skb_clone_fraglist(struct sk_buff *skb);
++void copy_skb_header(struct sk_buff *new, const struct sk_buff *old);
++
++void inet_twsk_free(struct inet_timewait_sock *tw);
++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb);
++/* These states need RST on ABORT according to RFC793 */
++static inline bool tcp_need_reset(int state)
++{
++	return (1 << state) &
++	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
++		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
++}
++
++bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
++			    int hlen);
++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
++			       bool *fragstolen);
++bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to,
++		      struct sk_buff *from, bool *fragstolen);
++/**** END - Exports needed for MPTCP ****/
++
+ void tcp_tasklet_init(void);
+
+ void tcp_v4_err(struct sk_buff *skb, u32);
+@@ -440,6 +563,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ 		size_t len, int nonblock, int flags, int *addr_len);
+ void tcp_parse_options(const struct sk_buff *skb,
+ 		       struct tcp_options_received *opt_rx,
++		       struct mptcp_options_received *mopt_rx,
+ 		       int estab, struct tcp_fastopen_cookie *foc);
+ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
+
+@@ -493,14 +617,8 @@ static inline u32 tcp_cookie_time(void)
+
+ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
+ 			      u16 *mssp);
+-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss);
+-#else
+-static inline __u32 cookie_v4_init_sequence(struct sock *sk,
+-					    struct sk_buff *skb,
+-					    __u16 *mss)
+-{
+-	return 0;
+-}
++__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
++			      __u16 *mss);
+ #endif
+
+ __u32 cookie_init_timestamp(struct request_sock *req);
+@@ -516,13 +634,6 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
+ 			      const struct tcphdr *th, u16 *mssp);
+ __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb,
+ 			      __u16 *mss);
+-#else
+-static inline __u32 cookie_v6_init_sequence(struct sock *sk,
+-					    struct sk_buff *skb,
+-					    __u16 *mss)
+-{
+-	return 0;
+-}
+ #endif
+ /* tcp_output.c */
+
+@@ -551,10 +662,17 @@ void tcp_send_delayed_ack(struct sock *sk);
+ void tcp_send_loss_probe(struct sock *sk);
+ bool tcp_schedule_loss_probe(struct sock *sk);
+
++u16 tcp_select_window(struct sock *sk);
++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
++		    int push_one, gfp_t gfp);
++
+ /* tcp_input.c */
+ void tcp_resume_early_retransmit(struct sock *sk);
+ void tcp_rearm_rto(struct sock *sk);
+ void tcp_reset(struct sock *sk);
++void tcp_set_rto(struct sock *sk);
++bool tcp_should_expand_sndbuf(const struct sock *sk);
++bool tcp_prune_ofo_queue(struct sock *sk);
+
+ /* tcp_timer.c */
+ void tcp_init_xmit_timers(struct sock *);
+@@ -703,14 +821,27 @@ void tcp_send_window_probe(struct sock *sk);
+  */
+ struct tcp_skb_cb {
+ 	union {
+-		struct inet_skb_parm	h4;
++		union {
++			struct inet_skb_parm	h4;
+ #if IS_ENABLED(CONFIG_IPV6)
+-		struct inet6_skb_parm	h6;
++			struct inet6_skb_parm	h6;
+ #endif
+-	} header;	/* For incoming frames		*/
++		} header;	/* For incoming frames		*/
++#ifdef CONFIG_MPTCP
++		union {			/* For MPTCP outgoing frames */
++			__u32 path_mask; /* paths that tried to send this skb */
++			__u32 dss[6];	/* DSS options */
++		};
++#endif
++	};
+ 	__u32		seq;		/* Starting sequence number	*/
+ 	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
+ 	__u32		when;		/* used to compute rtt's	*/
++#ifdef CONFIG_MPTCP
++	__u8		mptcp_flags;	/* flags for the MPTCP layer	*/
++	__u8		dss_off;	/* Number of 4-byte words until
++					 * seq-number */
++#endif
+ 	__u8		tcp_flags;	/* TCP header flags. (tcp[13])	*/
+
+ 	__u8		sacked;		/* State flags for SACK/FACK.	*/
+@@ -1075,7 +1206,8 @@ u32 tcp_default_init_rwnd(u32 mss);
+ /* Determine a window scaling and initial window to offer. */
+ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
+ 			       __u32 *window_clamp, int wscale_ok,
+-			       __u8 *rcv_wscale, __u32 init_rcv_wnd);
++			       __u8 *rcv_wscale, __u32 init_rcv_wnd,
++			       const struct sock *sk);
+
+ static inline int tcp_win_from_space(int space)
+ {
+@@ -1084,15 +1216,34 @@ static inline int tcp_win_from_space(int space)
+ 		space - (space>>sysctl_tcp_adv_win_scale);
+ }
+
++#ifdef CONFIG_MPTCP
++extern struct static_key mptcp_static_key;
++static inline bool mptcp(const struct tcp_sock *tp)
++{
++	return static_key_false(&mptcp_static_key) && tp->mpc;
++}
++#else
++static inline bool mptcp(const struct tcp_sock *tp)
++{
++	return 0;
++}
++#endif
++
+ /* Note: caller must be prepared to deal with negative returns */
+ static inline int tcp_space(const struct sock *sk)
+ {
++	if (mptcp(tcp_sk(sk)))
++		sk = tcp_sk(sk)->meta_sk;
++
+ 	return tcp_win_from_space(sk->sk_rcvbuf -
+ 				  atomic_read(&sk->sk_rmem_alloc));
+ }
+
+ static inline int tcp_full_space(const struct sock *sk)
+ {
++	if (mptcp(tcp_sk(sk)))
++		sk = tcp_sk(sk)->meta_sk;
++
+ 	return tcp_win_from_space(sk->sk_rcvbuf);
+ }
+
+@@ -1115,6 +1266,8 @@ static inline void tcp_openreq_init(struct request_sock *req,
+ 	ireq->wscale_ok = rx_opt->wscale_ok;
+ 	ireq->acked = 0;
+ 	ireq->ecn_ok = 0;
++	ireq->mptcp_rqsk = 0;
++	ireq->saw_mpc = 0;
+ 	ireq->ir_rmt_port = tcp_hdr(skb)->source;
+ 	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+ }
+@@ -1585,6 +1738,11 @@ int tcp4_proc_init(void);
+ void tcp4_proc_exit(void);
+ #endif
+
++int tcp_rtx_synack(struct sock *sk, struct request_sock *req);
++int tcp_conn_request(struct request_sock_ops *rsk_ops,
++		     const struct tcp_request_sock_ops *af_ops,
++		     struct sock *sk, struct sk_buff *skb);
++
+ /* TCP af-specific functions */
+ struct tcp_sock_af_ops {
+ #ifdef CONFIG_TCP_MD5SIG
+@@ -1601,7 +1759,32 @@ struct tcp_sock_af_ops {
+ #endif
+ };
+
++/* TCP/MPTCP-specific functions */
++struct tcp_sock_ops {
++	u32 (*__select_window)(struct sock *sk);
++	u16 (*select_window)(struct sock *sk);
++	void (*select_initial_window)(int __space, __u32 mss, __u32 *rcv_wnd,
++				      __u32 *window_clamp, int wscale_ok,
++				      __u8 *rcv_wscale, __u32 init_rcv_wnd,
++				      const struct sock *sk);
++	void (*init_buffer_space)(struct sock *sk);
++	void (*set_rto)(struct sock *sk);
++	bool (*should_expand_sndbuf)(const struct sock *sk);
++	void (*send_fin)(struct sock *sk);
++	bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle,
++			   int push_one, gfp_t gfp);
++	void (*send_active_reset)(struct sock *sk, gfp_t priority);
++	int (*write_wakeup)(struct sock *sk);
++	bool (*prune_ofo_queue)(struct sock *sk);
++	void (*retransmit_timer)(struct sock *sk);
++	void (*time_wait)(struct sock *sk, int state, int timeo);
++	void (*cleanup_rbuf)(struct sock *sk, int copied);
++	void (*init_congestion_control)(struct sock *sk);
++};
++extern const struct tcp_sock_ops tcp_specific;
++
+ struct tcp_request_sock_ops {
++	u16 mss_clamp;
+ #ifdef CONFIG_TCP_MD5SIG
+ 	struct tcp_md5sig_key *(*md5_lookup) (struct sock *sk,
+ 					      struct request_sock *req);
+@@ -1611,8 +1794,39 @@ struct tcp_request_sock_ops {
+ 					  const struct request_sock *req,
+ 					  const struct sk_buff *skb);
+ #endif
++	int (*init_req)(struct request_sock *req, struct sock *sk,
++			struct sk_buff *skb);
++#ifdef CONFIG_SYN_COOKIES
++	__u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb,
++				 __u16 *mss);
++#endif
++	struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl,
++				       const struct request_sock *req,
++				       bool *strict);
++	__u32 (*init_seq)(const struct sk_buff *skb);
++	int (*send_synack)(struct sock *sk, struct dst_entry *dst,
++			   struct flowi *fl, struct request_sock *req,
++			   u16 queue_mapping, struct tcp_fastopen_cookie *foc);
++	void (*queue_hash_add)(struct sock *sk, struct request_sock *req,
++			       const unsigned long timeout);
+ };
+
++#ifdef CONFIG_SYN_COOKIES
++static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
++					 struct sock *sk, struct sk_buff *skb,
++					 __u16 *mss)
++{
++	return ops->cookie_init_seq(sk, skb, mss);
++}
++#else
++static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
++					 struct sock *sk, struct sk_buff *skb,
++					 __u16 *mss)
++{
++	return 0;
++}
++#endif
++
+ int tcpv4_offload_init(void);
+
+ void tcp_v4_init(void);
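
The new struct tcp_sock_ops is the hook that lets MPTCP substitute its own window selection, RTO handling and friends without sprinkling mptcp() checks through the fast path: callers go through tp->ops instead of calling the TCP routines directly. A stand-alone sketch of the indirection, with invented demo names (not the kernel's types):

	/* Hypothetical sketch of the tcp_sock_ops indirection; names are
	 * invented for illustration, not the kernel's. */
	#include <stdio.h>

	struct demo_sock;	/* stands in for struct sock */

	struct demo_sock_ops {
		unsigned int (*select_window)(struct demo_sock *sk);
	};

	static unsigned int demo_tcp_select_window(struct demo_sock *sk)
	{
		(void)sk;
		return 1024;	/* plain TCP: per-connection window */
	}

	static unsigned int demo_mptcp_select_window(struct demo_sock *sk)
	{
		(void)sk;
		return 4096;	/* MPTCP: window of the shared meta-socket */
	}

	static const struct demo_sock_ops demo_tcp_specific = {
		.select_window = demo_tcp_select_window,
	};

	static const struct demo_sock_ops demo_mptcp_specific = {
		.select_window = demo_mptcp_select_window,
	};

	int main(void)
	{
		/* tcp_init_sock() points tp->ops at tcp_specific; the MPTCP
		 * code swaps in its own table for multipath connections. */
		const struct demo_sock_ops *ops = &demo_tcp_specific;

		printf("tcp window:   %u\n", ops->select_window(NULL));
		ops = &demo_mptcp_specific;
		printf("mptcp window: %u\n", ops->select_window(NULL));
		return 0;
	}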
+diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
+index 9cf2394f0bcf..c2634b6ed854 100644
+--- a/include/uapi/linux/if.h
++++ b/include/uapi/linux/if.h
+@@ -109,6 +109,9 @@ enum net_device_flags {
+ #define IFF_DORMANT			IFF_DORMANT
+ #define IFF_ECHO			IFF_ECHO
+
++#define IFF_NOMULTIPATH	0x80000		/* Disable for MPTCP		*/
++#define IFF_MPBACKUP	0x100000	/* Use as backup path for MPTCP */
++
+ #define IFF_VOLATILE	(IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
+ 		IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
+
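IFF_NOMULTIPATH and IFF_MPBACKUP sit above bit 15, so they cannot travel through SIOCSIFFLAGS, whose struct ifreq only carries a 16-bit flags word; the full 32-bit dev->flags is reachable via sysfs or rtnetlink. A hypothetical user-space sketch via sysfs, assuming a kernel built with this patch and an interface named eth0:

	/* Hypothetical sketch: setting IFF_NOMULTIPATH (0x80000, from the
	 * hunk above) on eth0 through /sys/class/net/eth0/flags. Not part
	 * of the patch; interface name and sysfs route are assumptions. */
	#include <stdio.h>

	#define IFF_NOMULTIPATH 0x80000	/* from the patched uapi header */

	int main(void)
	{
		unsigned long flags;
		FILE *f = fopen("/sys/class/net/eth0/flags", "r+");

		if (!f || fscanf(f, "%lx", &flags) != 1)
			return 1;
		rewind(f);
		/* needs CAP_NET_ADMIN; excludes eth0 from new subflows */
		fprintf(f, "%#lx\n", flags | IFF_NOMULTIPATH);
		return fclose(f) ? 1 : 0;
	}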
+diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
+index 3b9718328d8b..487475681d84 100644
+--- a/include/uapi/linux/tcp.h
++++ b/include/uapi/linux/tcp.h
+@@ -112,6 +112,7 @@ enum {
+ #define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */
+ #define TCP_TIMESTAMP		24
+ #define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
++#define MPTCP_ENABLED		26
+
+ struct tcp_repair_opt {
+ 	__u32	opt_code;
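
MPTCP_ENABLED extends the TCP socket-option numbering, presumably so applications can opt individual sockets in or out of MPTCP before connecting; the option's actual handling lives elsewhere in the patch. A hedged usage sketch, assuming the patched kernel accepts an integer boolean here:

	/* Hypothetical sketch of the new per-socket switch; that the value
	 * is an integer boolean is an assumption based on the name. */
	#include <netinet/in.h>
	#include <stdio.h>
	#include <sys/socket.h>
	#include <unistd.h>

	#ifndef MPTCP_ENABLED
	#define MPTCP_ENABLED 26	/* from the patched uapi header */
	#endif

	int main(void)
	{
		int fd = socket(AF_INET, SOCK_STREAM, 0);
		int off = 0;	/* opt this socket out of MPTCP */

		if (fd < 0)
			return 1;
		if (setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &off, sizeof(off)))
			perror("setsockopt(MPTCP_ENABLED)"); /* ENOPROTOOPT on stock kernels */
		close(fd);
		return 0;
	}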
+diff --git a/net/Kconfig b/net/Kconfig
+index d92afe4204d9..96b58593ad5e 100644
+--- a/net/Kconfig
++++ b/net/Kconfig
+@@ -79,6 +79,7 @@ if INET
+ source "net/ipv4/Kconfig"
+ source "net/ipv6/Kconfig"
+ source "net/netlabel/Kconfig"
++source "net/mptcp/Kconfig"
+
+ endif # if INET
+
+diff --git a/net/Makefile b/net/Makefile
+index cbbbe6d657ca..244bac1435b1 100644
+--- a/net/Makefile
++++ b/net/Makefile
+@@ -20,6 +20,7 @@ obj-$(CONFIG_INET)		+= ipv4/
+ obj-$(CONFIG_XFRM)		+= xfrm/
+ obj-$(CONFIG_UNIX)		+= unix/
+ obj-$(CONFIG_NET)		+= ipv6/
++obj-$(CONFIG_MPTCP)		+= mptcp/
+ obj-$(CONFIG_PACKET)		+= packet/
+ obj-$(CONFIG_NET_KEY)		+= key/
+ obj-$(CONFIG_BRIDGE)		+= bridge/
+diff --git a/net/core/dev.c b/net/core/dev.c
+index 367a586d0c8a..215d2757fbf6 100644
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -5420,7 +5420,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
+
+ 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
+ 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
+-			       IFF_AUTOMEDIA)) |
++			       IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) |
+ 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
+ 				    IFF_ALLMULTI));
+
+diff --git a/net/core/request_sock.c b/net/core/request_sock.c
+index 467f326126e0..909dfa13f499 100644
+--- a/net/core/request_sock.c
++++ b/net/core/request_sock.c
+@@ -38,7 +38,8 @@ int sysctl_max_syn_backlog = 256;
+ EXPORT_SYMBOL(sysctl_max_syn_backlog);
+
+ int reqsk_queue_alloc(struct request_sock_queue *queue,
+-		      unsigned int nr_table_entries)
++		      unsigned int nr_table_entries,
++		      gfp_t flags)
+ {
+ 	size_t lopt_size = sizeof(struct listen_sock);
+ 	struct listen_sock *lopt;
+@@ -48,9 +49,11 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
+ 	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
+ 	lopt_size += nr_table_entries * sizeof(struct request_sock *);
+ 	if (lopt_size > PAGE_SIZE)
+-		lopt = vzalloc(lopt_size);
++		lopt = __vmalloc(lopt_size,
++				 flags | __GFP_HIGHMEM | __GFP_ZERO,
++				 PAGE_KERNEL);
+ 	else
+-		lopt = kzalloc(lopt_size, GFP_KERNEL);
++		lopt = kzalloc(lopt_size, flags);
+ 	if (lopt == NULL)
+ 		return -ENOMEM;
+
+diff --git a/net/core/skbuff.c b/net/core/skbuff.c
+index c1a33033cbe2..8abc5d60fbe3 100644
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -472,7 +472,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb)
+ 	skb_drop_list(&skb_shinfo(skb)->frag_list);
+ }
+
+-static void skb_clone_fraglist(struct sk_buff *skb)
++void skb_clone_fraglist(struct sk_buff *skb)
+ {
+ 	struct sk_buff *list;
+
+@@ -897,7 +897,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
+ 	skb->inner_mac_header += off;
+ }
+
+-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
++void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+ {
+ 	__copy_skb_header(new, old);
+
+diff --git a/net/core/sock.c b/net/core/sock.c
+index 026e01f70274..359295523177 100644
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -136,6 +136,11 @@
+
+ #include <trace/events/sock.h>
+
++#ifdef CONFIG_MPTCP
++#include <net/mptcp.h>
++#include <net/inet_common.h>
++#endif
++
+ #ifdef CONFIG_INET
+ #include <net/tcp.h>
+ #endif
+@@ -280,7 +285,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
+   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
+   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
+ };
+-static const char *const af_family_clock_key_strings[AF_MAX+1] = {
++char *const af_family_clock_key_strings[AF_MAX+1] = {
+   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
+   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
+   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
+@@ -301,7 +306,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
+  * sk_callback_lock locking rules are per-address-family,
+  * so split the lock classes by using a per-AF key:
+  */
+-static struct lock_class_key af_callback_keys[AF_MAX];
++struct lock_class_key af_callback_keys[AF_MAX];
+
+ /* Take into consideration the size of the struct sk_buff overhead in the
+  * determination of these values, since that is non-constant across
+@@ -422,8 +427,6 @@ static void sock_warn_obsolete_bsdism(const char *name)
+ 	}
+ }
+
+-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+-
+ static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
+ {
+ 	if (sk->sk_flags & flags) {
+@@ -1253,8 +1256,25 @@ lenout:
+  *
+  * (We also register the sk_lock with the lock validator.)
+  */
+-static inline void sock_lock_init(struct sock *sk)
+-{
++void sock_lock_init(struct sock *sk)
++{
++#ifdef CONFIG_MPTCP
++	/* Reclassify the lock-class for subflows */
++	if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
++		if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->is_master_sk) {
++			sock_lock_init_class_and_name(sk, "slock-AF_INET-MPTCP",
++						      &meta_slock_key,
++						      "sk_lock-AF_INET-MPTCP",
++						      &meta_key);
++
++			/* We don't yet have the mptcp-point.
++			 * Thus we still need inet_sock_destruct
++			 */
++			sk->sk_destruct = inet_sock_destruct;
++			return;
++		}
++#endif
++
+ 	sock_lock_init_class_and_name(sk,
+ 			af_family_slock_key_strings[sk->sk_family],
+ 			af_family_slock_keys + sk->sk_family,
+@@ -1301,7 +1321,7 @@ void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
+ }
+ EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
+
+-static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
++struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
+ 		int family)
+ {
+ 	struct sock *sk;
+diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
+index 4db3c2a1679c..04cb17d4b0ce 100644
+--- a/net/dccp/ipv6.c
++++ b/net/dccp/ipv6.c
+@@ -386,7 +386,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+ 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ 		goto drop;
+
+-	req = inet6_reqsk_alloc(&dccp6_request_sock_ops);
++	req = inet_reqsk_alloc(&dccp6_request_sock_ops);
+ 	if (req == NULL)
+ 		goto drop;
+
+diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
+index 05c57f0fcabe..630434db0085 100644
+--- a/net/ipv4/Kconfig
++++ b/net/ipv4/Kconfig
+@@ -556,6 +556,30 @@ config TCP_CONG_ILLINOIS
+ 	For further details see:
+ 	  http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+
++config TCP_CONG_COUPLED
++	tristate "MPTCP COUPLED CONGESTION CONTROL"
++	depends on MPTCP
++	default n
++	---help---
++	  MultiPath TCP Coupled Congestion Control
++	  To enable it, just put 'coupled' in tcp_congestion_control
++
++config TCP_CONG_OLIA
++	tristate "MPTCP Opportunistic Linked Increase"
++	depends on MPTCP
++	default n
++	---help---
++	  MultiPath TCP Opportunistic Linked Increase Congestion Control
++	  To enable it, just put 'olia' in tcp_congestion_control
++
++config TCP_CONG_WVEGAS
++	tristate "MPTCP WVEGAS CONGESTION CONTROL"
++	depends on MPTCP
++	default n
++	---help---
++	  wVegas congestion control for MPTCP
++	  To enable it, just put 'wvegas' in tcp_congestion_control
++
+ choice
+ 	prompt "Default TCP congestion control"
+ 	default DEFAULT_CUBIC
+@@ -584,6 +608,15 @@ choice
+ 	config DEFAULT_WESTWOOD
+ 		bool "Westwood" if TCP_CONG_WESTWOOD=y
+
++	config DEFAULT_COUPLED
++		bool "Coupled" if TCP_CONG_COUPLED=y
++
++	config DEFAULT_OLIA
++		bool "Olia" if TCP_CONG_OLIA=y
++
++	config DEFAULT_WVEGAS
++		bool "Wvegas" if TCP_CONG_WVEGAS=y
++
+ 	config DEFAULT_RENO
+ 		bool "Reno"
+
+@@ -605,6 +638,8 @@ config DEFAULT_TCP_CONG
+ 	default "vegas" if DEFAULT_VEGAS
+ 	default "westwood" if DEFAULT_WESTWOOD
+ 	default "veno" if DEFAULT_VENO
++	default "coupled" if DEFAULT_COUPLED
++	default "wvegas" if DEFAULT_WVEGAS
+ 	default "reno" if DEFAULT_RENO
+ 	default "cubic"
+
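With the modules built, 'coupled', 'olia' or 'wvegas' can be selected system-wide through the tcp_congestion_control sysctl, or per socket with the standard TCP_CONGESTION option, as the help texts above suggest. A small sketch of the per-socket route (assuming the chosen module is loaded):

	/* Sketch: selecting the OLIA coupled congestion control per socket.
	 * TCP_CONGESTION is standard; "olia" only resolves if the module
	 * from CONFIG_TCP_CONG_OLIA above is available. */
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = socket(AF_INET, SOCK_STREAM, 0);
		const char *name = "olia";

		if (fd < 0)
			return 1;
		if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name)))
			perror("setsockopt(TCP_CONGESTION)");
		close(fd);
		return 0;
	}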
2679 |
+diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c |
2680 |
+index d156b3c5f363..4afd6d8d9028 100644 |
2681 |
+--- a/net/ipv4/af_inet.c |
2682 |
++++ b/net/ipv4/af_inet.c |
2683 |
+@@ -104,6 +104,7 @@ |
2684 |
+ #include <net/ip_fib.h> |
2685 |
+ #include <net/inet_connection_sock.h> |
2686 |
+ #include <net/tcp.h> |
2687 |
++#include <net/mptcp.h> |
2688 |
+ #include <net/udp.h> |
2689 |
+ #include <net/udplite.h> |
2690 |
+ #include <net/ping.h> |
2691 |
+@@ -246,8 +247,7 @@ EXPORT_SYMBOL(inet_listen); |
2692 |
+ * Create an inet socket. |
2693 |
+ */ |
2694 |
+ |
2695 |
+-static int inet_create(struct net *net, struct socket *sock, int protocol, |
2696 |
+- int kern) |
2697 |
++int inet_create(struct net *net, struct socket *sock, int protocol, int kern) |
2698 |
+ { |
2699 |
+ struct sock *sk; |
2700 |
+ struct inet_protosw *answer; |
2701 |
+@@ -676,6 +676,23 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) |
2702 |
+ lock_sock(sk2); |
2703 |
+ |
2704 |
+ sock_rps_record_flow(sk2); |
2705 |
++ |
2706 |
++ if (sk2->sk_protocol == IPPROTO_TCP && mptcp(tcp_sk(sk2))) { |
2707 |
++ struct sock *sk_it = sk2; |
2708 |
++ |
2709 |
++ mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it) |
2710 |
++ sock_rps_record_flow(sk_it); |
2711 |
++ |
2712 |
++ if (tcp_sk(sk2)->mpcb->master_sk) { |
2713 |
++ sk_it = tcp_sk(sk2)->mpcb->master_sk; |
2714 |
++ |
2715 |
++ write_lock_bh(&sk_it->sk_callback_lock); |
2716 |
++ sk_it->sk_wq = newsock->wq; |
2717 |
++ sk_it->sk_socket = newsock; |
2718 |
++ write_unlock_bh(&sk_it->sk_callback_lock); |
2719 |
++ } |
2720 |
++ } |
2721 |
++ |
2722 |
+ WARN_ON(!((1 << sk2->sk_state) & |
2723 |
+ (TCPF_ESTABLISHED | TCPF_SYN_RECV | |
2724 |
+ TCPF_CLOSE_WAIT | TCPF_CLOSE))); |
2725 |
+@@ -1763,6 +1780,9 @@ static int __init inet_init(void) |
2726 |
+ |
2727 |
+ ip_init(); |
2728 |
+ |
2729 |
++ /* We must initialize MPTCP before TCP. */ |
2730 |
++ mptcp_init(); |
2731 |
++ |
2732 |
+ tcp_v4_init(); |
2733 |
+ |
2734 |
+ /* Setup TCP slab cache for open requests. */ |
2735 |
+diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c |
2736 |
+index 14d02ea905b6..7d734d8af19b 100644 |
2737 |
+--- a/net/ipv4/inet_connection_sock.c |
2738 |
++++ b/net/ipv4/inet_connection_sock.c |
2739 |
+@@ -23,6 +23,7 @@ |
2740 |
+ #include <net/route.h> |
2741 |
+ #include <net/tcp_states.h> |
2742 |
+ #include <net/xfrm.h> |
2743 |
++#include <net/mptcp.h> |
2744 |
+ |
2745 |
+ #ifdef INET_CSK_DEBUG |
2746 |
+ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; |
2747 |
+@@ -465,8 +466,8 @@ no_route: |
2748 |
+ } |
2749 |
+ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); |
2750 |
+ |
2751 |
+-static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, |
2752 |
+- const u32 rnd, const u32 synq_hsize) |
2753 |
++u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd, |
2754 |
++ const u32 synq_hsize) |
2755 |
+ { |
2756 |
+ return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); |
2757 |
+ } |
2758 |
+@@ -647,7 +648,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, |
2759 |
+ |
2760 |
+ lopt->clock_hand = i; |
2761 |
+ |
2762 |
+- if (lopt->qlen) |
2763 |
++ if (lopt->qlen && !is_meta_sk(parent)) |
2764 |
+ inet_csk_reset_keepalive_timer(parent, interval); |
2765 |
+ } |
2766 |
+ EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); |
2767 |
+@@ -664,7 +665,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, |
2768 |
+ const struct request_sock *req, |
2769 |
+ const gfp_t priority) |
2770 |
+ { |
2771 |
+- struct sock *newsk = sk_clone_lock(sk, priority); |
2772 |
++ struct sock *newsk; |
2773 |
++ |
2774 |
++ newsk = sk_clone_lock(sk, priority); |
2775 |
+ |
2776 |
+ if (newsk != NULL) { |
2777 |
+ struct inet_connection_sock *newicsk = inet_csk(newsk); |
2778 |
+@@ -743,7 +746,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) |
2779 |
+ { |
2780 |
+ struct inet_sock *inet = inet_sk(sk); |
2781 |
+ struct inet_connection_sock *icsk = inet_csk(sk); |
2782 |
+- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); |
2783 |
++ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries, |
2784 |
++ GFP_KERNEL); |
2785 |
+ |
2786 |
+ if (rc != 0) |
2787 |
+ return rc; |
2788 |
+@@ -801,9 +805,14 @@ void inet_csk_listen_stop(struct sock *sk) |
2789 |
+ |
2790 |
+ while ((req = acc_req) != NULL) { |
2791 |
+ struct sock *child = req->sk; |
2792 |
++ bool mutex_taken = false; |
2793 |
+ |
2794 |
+ acc_req = req->dl_next; |
2795 |
+ |
2796 |
++ if (is_meta_sk(child)) { |
2797 |
++ mutex_lock(&tcp_sk(child)->mpcb->mpcb_mutex); |
2798 |
++ mutex_taken = true; |
2799 |
++ } |
2800 |
+ local_bh_disable(); |
2801 |
+ bh_lock_sock(child); |
2802 |
+ WARN_ON(sock_owned_by_user(child)); |
2803 |
+@@ -832,6 +841,8 @@ void inet_csk_listen_stop(struct sock *sk) |
2804 |
+ |
2805 |
+ bh_unlock_sock(child); |
2806 |
+ local_bh_enable(); |
2807 |
++ if (mutex_taken) |
2808 |
++ mutex_unlock(&tcp_sk(child)->mpcb->mpcb_mutex); |
2809 |
+ sock_put(child); |
2810 |
+ |
2811 |
+ sk_acceptq_removed(sk); |
2812 |
+diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
+index c86624b36a62..0ff3fe004d62 100644
+--- a/net/ipv4/syncookies.c
++++ b/net/ipv4/syncookies.c
+@@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
+ }
+ EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
+
+-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
++__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
++ __u16 *mssp)
+ {
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+@@ -284,7 +285,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
+
+ /* check for timestamp cookie support */
+ memset(&tcp_opt, 0, sizeof(tcp_opt));
+- tcp_parse_options(skb, &tcp_opt, 0, NULL);
++ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
+
+ if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
+ goto out;
+@@ -355,10 +356,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
+ /* Try to redo what tcp_v4_send_synack did. */
+ req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+
+- tcp_select_initial_window(tcp_full_space(sk), req->mss,
+- &req->rcv_wnd, &req->window_clamp,
+- ireq->wscale_ok, &rcv_wscale,
+- dst_metric(&rt->dst, RTAX_INITRWND));
++ tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
++ &req->rcv_wnd, &req->window_clamp,
++ ireq->wscale_ok, &rcv_wscale,
++ dst_metric(&rt->dst, RTAX_INITRWND), sk);
+
+ ireq->rcv_wscale = rcv_wscale;
+
+diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+index 9d2118e5fbc7..2cb89f886d45 100644
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -271,6 +271,7 @@
+
+ #include <net/icmp.h>
+ #include <net/inet_common.h>
++#include <net/mptcp.h>
+ #include <net/tcp.h>
+ #include <net/xfrm.h>
+ #include <net/ip.h>
+@@ -371,6 +372,24 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
+ return period;
+ }
+
++const struct tcp_sock_ops tcp_specific = {
++ .__select_window = __tcp_select_window,
++ .select_window = tcp_select_window,
++ .select_initial_window = tcp_select_initial_window,
++ .init_buffer_space = tcp_init_buffer_space,
++ .set_rto = tcp_set_rto,
++ .should_expand_sndbuf = tcp_should_expand_sndbuf,
++ .init_congestion_control = tcp_init_congestion_control,
++ .send_fin = tcp_send_fin,
++ .write_xmit = tcp_write_xmit,
++ .send_active_reset = tcp_send_active_reset,
++ .write_wakeup = tcp_write_wakeup,
++ .prune_ofo_queue = tcp_prune_ofo_queue,
++ .retransmit_timer = tcp_retransmit_timer,
++ .time_wait = tcp_time_wait,
++ .cleanup_rbuf = tcp_cleanup_rbuf,
++};
++
+ /* Address-family independent initialization for a tcp_sock.
+ *
+ * NOTE: A lot of things set to zero explicitly by call to
+@@ -419,6 +438,8 @@ void tcp_init_sock(struct sock *sk)
+ sk->sk_sndbuf = sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+
++ tp->ops = &tcp_specific;
++
+ local_bh_disable();
+ sock_update_memcg(sk);
+ sk_sockets_allocated_inc(sk);
+@@ -726,6 +747,14 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
+ int ret;
+
+ sock_rps_record_flow(sk);
++
++#ifdef CONFIG_MPTCP
++ if (mptcp(tcp_sk(sk))) {
++ struct sock *sk_it;
++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it)
++ sock_rps_record_flow(sk_it);
++ }
++#endif
+ /*
+ * We can't seek on a socket input
+ */
+@@ -821,8 +850,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
+ return NULL;
+ }
+
+-static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
+- int large_allowed)
++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 xmit_size_goal, old_size_goal;
+@@ -872,8 +900,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+ {
+ int mss_now;
+
+- mss_now = tcp_current_mss(sk);
+- *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
++ if (mptcp(tcp_sk(sk))) {
++ mss_now = mptcp_current_mss(sk);
++ *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
++ } else {
++ mss_now = tcp_current_mss(sk);
++ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
++ }
+
+ return mss_now;
+ }
+@@ -892,11 +925,32 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+ * is fully established.
+ */
+ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+- !tcp_passive_fastopen(sk)) {
++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
++ tp->mpcb->master_sk : sk)) {
+ if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto out_err;
+ }
+
++ if (mptcp(tp)) {
++ struct sock *sk_it = sk;
++
++ /* We must check this with the socket lock held because we iterate
++ * over the subflows.
++ */
++ if (!mptcp_can_sendpage(sk)) {
++ ssize_t ret;
++
++ release_sock(sk);
++ ret = sock_no_sendpage(sk->sk_socket, page, offset,
++ size, flags);
++ lock_sock(sk);
++ return ret;
++ }
++
++ mptcp_for_each_sk(tp->mpcb, sk_it)
++ sock_rps_record_flow(sk_it);
++ }
++
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+@@ -1001,8 +1055,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+ {
+ ssize_t res;
+
+- if (!(sk->sk_route_caps & NETIF_F_SG) ||
+- !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
++ /* If MPTCP is enabled, we check it later after establishment */
++ if (!mptcp(tcp_sk(sk)) && (!(sk->sk_route_caps & NETIF_F_SG) ||
++ !(sk->sk_route_caps & NETIF_F_ALL_CSUM)))
+ return sock_no_sendpage(sk->sk_socket, page, offset, size,
+ flags);
+
+@@ -1018,6 +1073,9 @@ static inline int select_size(const struct sock *sk, bool sg)
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int tmp = tp->mss_cache;
+
++ if (mptcp(tp))
++ return mptcp_select_size(sk, sg);
++
+ if (sg) {
+ if (sk_can_gso(sk)) {
+ /* Small frames wont use a full page:
+@@ -1100,11 +1158,18 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ * is fully established.
+ */
+ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+- !tcp_passive_fastopen(sk)) {
++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
++ tp->mpcb->master_sk : sk)) {
+ if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto do_error;
+ }
+
++ if (mptcp(tp)) {
++ struct sock *sk_it = sk;
++ mptcp_for_each_sk(tp->mpcb, sk_it)
++ sock_rps_record_flow(sk_it);
++ }
++
+ if (unlikely(tp->repair)) {
+ if (tp->repair_queue == TCP_RECV_QUEUE) {
+ copied = tcp_send_rcvq(sk, msg, size);
+@@ -1132,7 +1197,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto out_err;
+
+- sg = !!(sk->sk_route_caps & NETIF_F_SG);
++ if (mptcp(tp))
++ sg = mptcp_can_sg(sk);
++ else
++ sg = !!(sk->sk_route_caps & NETIF_F_SG);
+
+ while (--iovlen >= 0) {
+ size_t seglen = iov->iov_len;
+@@ -1183,8 +1251,15 @@ new_segment:
+
+ /*
+ * Check whether we can use HW checksum.
++ *
++ * If dss-csum is enabled, we do not do hw-csum.
++ * In case of non-mptcp we check the
++ * device-capabilities.
++ * In case of mptcp, hw-csum's will be handled
++ * later in mptcp_write_xmit.
+ */
+- if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
++ if (((mptcp(tp) && !tp->mpcb->dss_csum) || !mptcp(tp)) &&
++ (mptcp(tp) || sk->sk_route_caps & NETIF_F_ALL_CSUM))
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_entail(sk, skb);
+@@ -1422,7 +1497,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
+
+ /* Optimize, __tcp_select_window() is not cheap. */
+ if (2*rcv_window_now <= tp->window_clamp) {
+- __u32 new_window = __tcp_select_window(sk);
++ __u32 new_window = tp->ops->__select_window(sk);
+
+ /* Send ACK now, if this read freed lots of space
+ * in our buffer. Certainly, new_window is new window.
+@@ -1587,7 +1662,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ /* Clean up data we have read: This will do ACK frames. */
+ if (copied > 0) {
+ tcp_recv_skb(sk, seq, &offset);
+- tcp_cleanup_rbuf(sk, copied);
++ tp->ops->cleanup_rbuf(sk, copied);
+ }
+ return copied;
+ }
+@@ -1623,6 +1698,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+
+ lock_sock(sk);
+
++#ifdef CONFIG_MPTCP
++ if (mptcp(tp)) {
++ struct sock *sk_it;
++ mptcp_for_each_sk(tp->mpcb, sk_it)
++ sock_rps_record_flow(sk_it);
++ }
++#endif
++
+ err = -ENOTCONN;
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+@@ -1761,7 +1844,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ }
+ }
+
+- tcp_cleanup_rbuf(sk, copied);
++ tp->ops->cleanup_rbuf(sk, copied);
+
+ if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
+ /* Install new reader */
+@@ -1813,7 +1896,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ if (tp->rcv_wnd == 0 &&
+ !skb_queue_empty(&sk->sk_async_wait_queue)) {
+ tcp_service_net_dma(sk, true);
+- tcp_cleanup_rbuf(sk, copied);
++ tp->ops->cleanup_rbuf(sk, copied);
+ } else
+ dma_async_issue_pending(tp->ucopy.dma_chan);
+ }
+@@ -1993,7 +2076,7 @@ skip_copy:
+ */
+
+ /* Clean up data we have read: This will do ACK frames. */
+- tcp_cleanup_rbuf(sk, copied);
++ tp->ops->cleanup_rbuf(sk, copied);
+
+ release_sock(sk);
+ return copied;
+@@ -2070,7 +2153,7 @@ static const unsigned char new_state[16] = {
+ /* TCP_CLOSING */ TCP_CLOSING,
+ };
+
+-static int tcp_close_state(struct sock *sk)
++int tcp_close_state(struct sock *sk)
+ {
+ int next = (int)new_state[sk->sk_state];
+ int ns = next & TCP_STATE_MASK;
+@@ -2100,7 +2183,7 @@ void tcp_shutdown(struct sock *sk, int how)
+ TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
+ /* Clear out any half completed packets. FIN if needed. */
+ if (tcp_close_state(sk))
+- tcp_send_fin(sk);
++ tcp_sk(sk)->ops->send_fin(sk);
+ }
+ }
+ EXPORT_SYMBOL(tcp_shutdown);
+@@ -2125,6 +2208,11 @@ void tcp_close(struct sock *sk, long timeout)
+ int data_was_unread = 0;
+ int state;
+
++ if (is_meta_sk(sk)) {
++ mptcp_close(sk, timeout);
++ return;
++ }
++
+ lock_sock(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+@@ -2167,7 +2255,7 @@ void tcp_close(struct sock *sk, long timeout)
+ /* Unread data was tossed, zap the connection. */
+ NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
+ tcp_set_state(sk, TCP_CLOSE);
+- tcp_send_active_reset(sk, sk->sk_allocation);
++ tcp_sk(sk)->ops->send_active_reset(sk, sk->sk_allocation);
+ } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+ /* Check zero linger _after_ checking for unread data. */
+ sk->sk_prot->disconnect(sk, 0);
+@@ -2247,7 +2335,7 @@ adjudge_to_death:
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (tp->linger2 < 0) {
+ tcp_set_state(sk, TCP_CLOSE);
+- tcp_send_active_reset(sk, GFP_ATOMIC);
++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPABORTONLINGER);
+ } else {
+@@ -2257,7 +2345,8 @@ adjudge_to_death:
+ inet_csk_reset_keepalive_timer(sk,
+ tmo - TCP_TIMEWAIT_LEN);
+ } else {
+- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
++ tcp_sk(sk)->ops->time_wait(sk, TCP_FIN_WAIT2,
++ tmo);
+ goto out;
+ }
+ }
+@@ -2266,7 +2355,7 @@ adjudge_to_death:
+ sk_mem_reclaim(sk);
+ if (tcp_check_oom(sk, 0)) {
+ tcp_set_state(sk, TCP_CLOSE);
+- tcp_send_active_reset(sk, GFP_ATOMIC);
++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPABORTONMEMORY);
+ }
+@@ -2291,15 +2380,6 @@ out:
+ }
+ EXPORT_SYMBOL(tcp_close);
+
+-/* These states need RST on ABORT according to RFC793 */
+-
+-static inline bool tcp_need_reset(int state)
+-{
+- return (1 << state) &
+- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
+- TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
+-}
+-
+ int tcp_disconnect(struct sock *sk, int flags)
+ {
+ struct inet_sock *inet = inet_sk(sk);
+@@ -2322,7 +2402,7 @@ int tcp_disconnect(struct sock *sk, int flags)
+ /* The last check adjusts for discrepancy of Linux wrt. RFC
+ * states
+ */
+- tcp_send_active_reset(sk, gfp_any());
++ tp->ops->send_active_reset(sk, gfp_any());
+ sk->sk_err = ECONNRESET;
+ } else if (old_state == TCP_SYN_SENT)
+ sk->sk_err = ECONNRESET;
+@@ -2340,6 +2420,13 @@ int tcp_disconnect(struct sock *sk, int flags)
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ inet_reset_saddr(sk);
+
++ if (is_meta_sk(sk)) {
++ mptcp_disconnect(sk);
++ } else {
++ if (tp->inside_tk_table)
++ mptcp_hash_remove_bh(tp);
++ }
++
+ sk->sk_shutdown = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+ tp->srtt_us = 0;
+@@ -2632,6 +2719,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
+ break;
+
+ case TCP_DEFER_ACCEPT:
++ /* An established MPTCP-connection (mptcp(tp) only returns true
++ * if the socket is established) should not use DEFER on new
++ * subflows.
++ */
++ if (mptcp(tp))
++ break;
+ /* Translate value in seconds to number of retransmits */
+ icsk->icsk_accept_queue.rskq_defer_accept =
+ secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+@@ -2659,7 +2752,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+ inet_csk_ack_scheduled(sk)) {
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
+- tcp_cleanup_rbuf(sk, 1);
++ tp->ops->cleanup_rbuf(sk, 1);
+ if (!(val & 1))
+ icsk->icsk_ack.pingpong = 1;
+ }
+@@ -2699,6 +2792,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
+ tp->notsent_lowat = val;
+ sk->sk_write_space(sk);
+ break;
++#ifdef CONFIG_MPTCP
++ case MPTCP_ENABLED:
++ if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_LISTEN) {
++ if (val)
++ tp->mptcp_enabled = 1;
++ else
++ tp->mptcp_enabled = 0;
++ } else {
++ err = -EPERM;
++ }
++ break;
++#endif
+ default:
+ err = -ENOPROTOOPT;
+ break;
+@@ -2931,6 +3036,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
+ case TCP_NOTSENT_LOWAT:
+ val = tp->notsent_lowat;
+ break;
++#ifdef CONFIG_MPTCP
++ case MPTCP_ENABLED:
++ val = tp->mptcp_enabled;
++ break;
++#endif
+ default:
+ return -ENOPROTOOPT;
+ }
+@@ -3120,8 +3230,11 @@ void tcp_done(struct sock *sk)
+ if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+
++ WARN_ON(sk->sk_state == TCP_CLOSE);
+ tcp_set_state(sk, TCP_CLOSE);
++
+ tcp_clear_xmit_timers(sk);
++
+ if (req != NULL)
+ reqsk_fastopen_remove(sk, req, false);
+
+diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
+index 9771563ab564..5c230d96c4c1 100644
+--- a/net/ipv4/tcp_fastopen.c
++++ b/net/ipv4/tcp_fastopen.c
+@@ -7,6 +7,7 @@
+ #include <linux/rculist.h>
+ #include <net/inetpeer.h>
+ #include <net/tcp.h>
++#include <net/mptcp.h>
+
+ int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
+
+@@ -133,7 +134,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
+ {
+ struct tcp_sock *tp;
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+- struct sock *child;
++ struct sock *child, *meta_sk;
+
+ req->num_retrans = 0;
+ req->num_timeout = 0;
+@@ -176,13 +177,6 @@ static bool tcp_fastopen_create_child(struct sock *sk,
+ /* Add the child socket directly into the accept queue */
+ inet_csk_reqsk_queue_add(sk, req, child);
+
+- /* Now finish processing the fastopen child socket. */
+- inet_csk(child)->icsk_af_ops->rebuild_header(child);
+- tcp_init_congestion_control(child);
+- tcp_mtup_init(child);
+- tcp_init_metrics(child);
+- tcp_init_buffer_space(child);
+-
+ /* Queue the data carried in the SYN packet. We need to first
+ * bump skb's refcnt because the caller will attempt to free it.
+ *
+@@ -199,8 +193,24 @@ static bool tcp_fastopen_create_child(struct sock *sk,
+ tp->syn_data_acked = 1;
+ }
+ tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
++
++ meta_sk = child;
++ if (!mptcp_check_req_fastopen(meta_sk, req)) {
++ child = tcp_sk(meta_sk)->mpcb->master_sk;
++ tp = tcp_sk(child);
++ }
++
++ /* Now finish processing the fastopen child socket. */
++ inet_csk(child)->icsk_af_ops->rebuild_header(child);
++ tp->ops->init_congestion_control(child);
++ tcp_mtup_init(child);
++ tcp_init_metrics(child);
++ tp->ops->init_buffer_space(child);
++
+ sk->sk_data_ready(sk);
+- bh_unlock_sock(child);
++ if (mptcp(tcp_sk(child)))
++ bh_unlock_sock(child);
++ bh_unlock_sock(meta_sk);
+ sock_put(child);
+ WARN_ON(req->sk == NULL);
+ return true;
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 40639c288dc2..3273bb69f387 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -74,6 +74,9 @@
+ #include <linux/ipsec.h>
+ #include <asm/unaligned.h>
+ #include <net/netdma.h>
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
++#include <net/mptcp_v6.h>
+
+ int sysctl_tcp_timestamps __read_mostly = 1;
+ int sysctl_tcp_window_scaling __read_mostly = 1;
+@@ -99,25 +102,6 @@ int sysctl_tcp_thin_dupack __read_mostly;
+ int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
+ int sysctl_tcp_early_retrans __read_mostly = 3;
+
+-#define FLAG_DATA 0x01 /* Incoming frame contained data. */
+-#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
+-#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
+-#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
+-#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
+-#define FLAG_DATA_SACKED 0x20 /* New SACK. */
+-#define FLAG_ECE 0x40 /* ECE in this ACK */
+-#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
+-#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
+-#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
+-#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
+-#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
+-#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
+-
+-#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
+-#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
+-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
+-#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
+-
+ #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
+ #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
+
+@@ -181,7 +165,7 @@ static void tcp_incr_quickack(struct sock *sk)
+ icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+ }
+
+-static void tcp_enter_quickack_mode(struct sock *sk)
++void tcp_enter_quickack_mode(struct sock *sk)
+ {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_incr_quickack(sk);
+@@ -283,8 +267,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
+ per_mss = roundup_pow_of_two(per_mss) +
+ SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+- nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+- nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
++ if (mptcp(tp)) {
++ nr_segs = mptcp_check_snd_buf(tp);
++ } else {
++ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
++ nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
++ }
+
+ /* Fast Recovery (RFC 5681 3.2) :
+ * Cubic needs 1.7 factor, rounded to 2 to include
+@@ -292,8 +280,16 @@ static void tcp_sndbuf_expand(struct sock *sk)
+ */
+ sndmem = 2 * nr_segs * per_mss;
+
+- if (sk->sk_sndbuf < sndmem)
++ /* MPTCP: after this sndmem is the new contribution of the
++ * current subflow to the aggregated sndbuf */
++ if (sk->sk_sndbuf < sndmem) {
++ int old_sndbuf = sk->sk_sndbuf;
+ sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
++ /* MPTCP: ok, the subflow sndbuf has grown, reflect
++ * this in the aggregate buffer.*/
++ if (mptcp(tp) && old_sndbuf != sk->sk_sndbuf)
++ mptcp_update_sndbuf(tp);
++ }
+ }
+
+ /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
+@@ -342,10 +338,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
+ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+
+ /* Check #1 */
+- if (tp->rcv_ssthresh < tp->window_clamp &&
+- (int)tp->rcv_ssthresh < tcp_space(sk) &&
++ if (meta_tp->rcv_ssthresh < meta_tp->window_clamp &&
++ (int)meta_tp->rcv_ssthresh < tcp_space(sk) &&
+ !sk_under_memory_pressure(sk)) {
+ int incr;
+
+@@ -353,14 +351,14 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
+ * will fit to rcvbuf in future.
+ */
+ if (tcp_win_from_space(skb->truesize) <= skb->len)
+- incr = 2 * tp->advmss;
++ incr = 2 * meta_tp->advmss;
+ else
+- incr = __tcp_grow_window(sk, skb);
++ incr = __tcp_grow_window(meta_sk, skb);
+
+ if (incr) {
+ incr = max_t(int, incr, 2 * skb->len);
+- tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
+- tp->window_clamp);
++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr,
++ meta_tp->window_clamp);
+ inet_csk(sk)->icsk_ack.quick |= 1;
+ }
+ }
+@@ -543,7 +541,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
+ int copied;
+
+ time = tcp_time_stamp - tp->rcvq_space.time;
+- if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
++ if (mptcp(tp)) {
++ if (mptcp_check_rtt(tp, time))
++ return;
++ } else if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
+ return;
+
+ /* Number of bytes copied to user in last RTT */
+@@ -761,7 +762,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
+ /* Calculate rto without backoff. This is the second half of Van Jacobson's
+ * routine referred to above.
+ */
+-static void tcp_set_rto(struct sock *sk)
++void tcp_set_rto(struct sock *sk)
+ {
+ const struct tcp_sock *tp = tcp_sk(sk);
+ /* Old crap is replaced with new one. 8)
+@@ -1376,7 +1377,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+ int len;
+ int in_sack;
+
+- if (!sk_can_gso(sk))
++ /* For MPTCP we cannot shift skb-data and remove one skb from the
++ * send-queue, because this will make us lose the DSS-option (which
++ * is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing.
++ */
++ if (!sk_can_gso(sk) || mptcp(tp))
+ goto fallback;
+
+ /* Normally R but no L won't result in plain S */
+@@ -2915,7 +2920,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+ return false;
+
+ tcp_rtt_estimator(sk, seq_rtt_us);
+- tcp_set_rto(sk);
++ tp->ops->set_rto(sk);
+
+ /* RFC6298: only reset backoff on valid RTT measurement. */
+ inet_csk(sk)->icsk_backoff = 0;
+@@ -3000,7 +3005,7 @@ void tcp_resume_early_retransmit(struct sock *sk)
+ }
+
+ /* If we get here, the whole TSO packet has not been acked. */
+-static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 packets_acked;
+@@ -3095,6 +3100,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
+ */
+ if (!(scb->tcp_flags & TCPHDR_SYN)) {
+ flag |= FLAG_DATA_ACKED;
++ if (mptcp(tp) && mptcp_is_data_seq(skb))
++ flag |= MPTCP_FLAG_DATA_ACKED;
+ } else {
+ flag |= FLAG_SYN_ACKED;
+ tp->retrans_stamp = 0;
+@@ -3189,7 +3196,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
+ return flag;
+ }
+
+-static void tcp_ack_probe(struct sock *sk)
++void tcp_ack_probe(struct sock *sk)
+ {
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+@@ -3236,9 +3243,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+ /* Check that window update is acceptable.
+ * The function assumes that snd_una<=ack<=snd_next.
+ */
+-static inline bool tcp_may_update_window(const struct tcp_sock *tp,
+- const u32 ack, const u32 ack_seq,
+- const u32 nwin)
++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
++ const u32 ack_seq, const u32 nwin)
+ {
+ return after(ack, tp->snd_una) ||
+ after(ack_seq, tp->snd_wl1) ||
+@@ -3357,7 +3363,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
+ }
+
+ /* This routine deals with incoming acks, but not outgoing ones. */
+-static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
++static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
+ {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+@@ -3449,6 +3455,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+ sack_rtt_us);
+ acked -= tp->packets_out;
+
++ if (mptcp(tp)) {
++ if (mptcp_fallback_infinite(sk, flag)) {
++ pr_err("%s resetting flow\n", __func__);
++ mptcp_send_reset(sk);
++ goto invalid_ack;
++ }
++
++ mptcp_clean_rtx_infinite(skb, sk);
++ }
++
+ /* Advance cwnd if state allows */
+ if (tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, acked);
+@@ -3512,8 +3528,9 @@ old_ack:
+ * the fast version below fails.
+ */
+ void tcp_parse_options(const struct sk_buff *skb,
+- struct tcp_options_received *opt_rx, int estab,
+- struct tcp_fastopen_cookie *foc)
++ struct tcp_options_received *opt_rx,
++ struct mptcp_options_received *mopt,
++ int estab, struct tcp_fastopen_cookie *foc)
+ {
+ const unsigned char *ptr;
+ const struct tcphdr *th = tcp_hdr(skb);
+@@ -3596,6 +3613,9 @@ void tcp_parse_options(const struct sk_buff *skb,
+ */
+ break;
+ #endif
++ case TCPOPT_MPTCP:
++ mptcp_parse_options(ptr - 2, opsize, mopt, skb);
++ break;
+ case TCPOPT_EXP:
+ /* Fast Open option shares code 254 using a
+ * 16 bits magic number. It's valid only in
+@@ -3657,8 +3677,8 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
+ if (tcp_parse_aligned_timestamp(tp, th))
+ return true;
+ }
+-
+- tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
++ tcp_parse_options(skb, &tp->rx_opt, mptcp(tp) ? &tp->mptcp->rx_opt : NULL,
++ 1, NULL);
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
+@@ -3831,6 +3851,8 @@ static void tcp_fin(struct sock *sk)
+ dst = __sk_dst_get(sk);
+ if (!dst || !dst_metric(dst, RTAX_QUICKACK))
+ inet_csk(sk)->icsk_ack.pingpong = 1;
++ if (mptcp(tp))
++ mptcp_sub_close_passive(sk);
+ break;
+
+ case TCP_CLOSE_WAIT:
+@@ -3852,9 +3874,16 @@ static void tcp_fin(struct sock *sk)
+ tcp_set_state(sk, TCP_CLOSING);
+ break;
+ case TCP_FIN_WAIT2:
++ if (mptcp(tp)) {
++ /* The socket will get closed by mptcp_data_ready.
++ * We first have to process all data-sequences.
++ */
++ tp->close_it = 1;
++ break;
++ }
+ /* Received a FIN -- send ACK and enter TIME_WAIT. */
+ tcp_send_ack(sk);
+- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
+ break;
+ default:
+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
+@@ -3876,6 +3905,10 @@ static void tcp_fin(struct sock *sk)
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+
++ /* Don't wake up MPTCP-subflows */
++ if (mptcp(tp))
++ return;
++
+ /* Do not send POLL_HUP for half duplex close. */
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ sk->sk_state == TCP_CLOSE)
+@@ -4073,7 +4106,11 @@ static void tcp_ofo_queue(struct sock *sk)
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
+ }
+
+- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
++ /* In case of MPTCP, the segment may be empty if it's a
++ * non-data DATA_FIN. (see beginning of tcp_data_queue)
++ */
++ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) &&
++ !(mptcp(tp) && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)) {
+ SOCK_DEBUG(sk, "ofo packet was already received\n");
+ __skb_unlink(skb, &tp->out_of_order_queue);
+ __kfree_skb(skb);
+@@ -4091,12 +4128,14 @@ static void tcp_ofo_queue(struct sock *sk)
+ }
+ }
+
+-static bool tcp_prune_ofo_queue(struct sock *sk);
+ static int tcp_prune_queue(struct sock *sk);
+
+ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+ unsigned int size)
+ {
++ if (mptcp(tcp_sk(sk)))
++ sk = mptcp_meta_sk(sk);
++
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ !sk_rmem_schedule(sk, skb, size)) {
+
+@@ -4104,7 +4143,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+ return -1;
+
+ if (!sk_rmem_schedule(sk, skb, size)) {
+- if (!tcp_prune_ofo_queue(sk))
++ if (!tcp_sk(sk)->ops->prune_ofo_queue(sk))
+ return -1;
+
+ if (!sk_rmem_schedule(sk, skb, size))
+@@ -4127,15 +4166,16 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+-static bool tcp_try_coalesce(struct sock *sk,
+- struct sk_buff *to,
+- struct sk_buff *from,
+- bool *fragstolen)
++bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from,
++ bool *fragstolen)
+ {
+ int delta;
+
+ *fragstolen = false;
+
++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
++ return false;
++
+ if (tcp_hdr(from)->fin)
+ return false;
+
+@@ -4225,7 +4265,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+
+ /* Do skb overlap to previous one? */
+ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
++ /* MPTCP allows non-data data-fin to be in the ofo-queue */
++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) &&
++ !(mptcp(tp) && end_seq == seq)) {
+ /* All the bits are present. Drop. */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb);
+@@ -4263,6 +4305,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ end_seq);
+ break;
+ }
++ /* MPTCP allows non-data data-fin to be in the ofo-queue */
++ if (mptcp(tp) && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq)
++ continue;
+ __skb_unlink(skb1, &tp->out_of_order_queue);
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+@@ -4280,8 +4325,8 @@ end:
+ }
+ }
+
+-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
+- bool *fragstolen)
++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
++ bool *fragstolen)
+ {
+ int eaten;
+ struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
+@@ -4343,7 +4388,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+ int eaten = -1;
+ bool fragstolen = false;
+
+- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
++ /* If no data is present, but a data_fin is in the options, we still
++ * have to call mptcp_queue_skb later on. */
++ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
++ !(mptcp(tp) && mptcp_is_data_fin(skb)))
+ goto drop;
+
+ skb_dst_drop(skb);
+@@ -4389,7 +4437,7 @@ queue_and_out:
+ eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
+ }
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+- if (skb->len)
++ if (skb->len || mptcp_is_data_fin(skb))
+ tcp_event_data_recv(sk, skb);
+ if (th->fin)
+ tcp_fin(sk);
+@@ -4411,7 +4459,11 @@ queue_and_out:
+
+ if (eaten > 0)
+ kfree_skb_partial(skb, fragstolen);
+- if (!sock_flag(sk, SOCK_DEAD))
++ if (!sock_flag(sk, SOCK_DEAD) || mptcp(tp))
++ /* MPTCP: we always have to call data_ready, because
++ * we may be about to receive a data-fin, which still
++ * must get queued.
++ */
+ sk->sk_data_ready(sk);
+ return;
+ }
+@@ -4463,6 +4515,8 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
+ next = skb_queue_next(list, skb);
+
+ __skb_unlink(skb, list);
++ if (mptcp(tcp_sk(sk)))
++ mptcp_remove_shortcuts(tcp_sk(sk)->mpcb, skb);
+ __kfree_skb(skb);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
+
+@@ -4630,7 +4684,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+ * Purge the out-of-order queue.
+ * Return true if queue was pruned.
+ */
+-static bool tcp_prune_ofo_queue(struct sock *sk)
++bool tcp_prune_ofo_queue(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool res = false;
+@@ -4686,7 +4740,7 @@ static int tcp_prune_queue(struct sock *sk)
+ /* Collapsing did not help, destructive actions follow.
+ * This must not ever occur. */
+
+- tcp_prune_ofo_queue(sk);
++ tp->ops->prune_ofo_queue(sk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ return 0;
+@@ -4702,7 +4756,29 @@ static int tcp_prune_queue(struct sock *sk)
+ return -1;
+ }
+
+-static bool tcp_should_expand_sndbuf(const struct sock *sk)
++/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
++ * As additional protections, we do not touch cwnd in retransmission phases,
++ * and if application hit its sndbuf limit recently.
++ */
++void tcp_cwnd_application_limited(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
++ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
++ /* Limited by application or receiver window. */
++ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
++ u32 win_used = max(tp->snd_cwnd_used, init_win);
++ if (win_used < tp->snd_cwnd) {
++ tp->snd_ssthresh = tcp_current_ssthresh(sk);
++ tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
++ }
++ tp->snd_cwnd_used = 0;
++ }
++ tp->snd_cwnd_stamp = tcp_time_stamp;
++}
++
++bool tcp_should_expand_sndbuf(const struct sock *sk)
+ {
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+@@ -4737,7 +4813,7 @@ static void tcp_new_space(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+- if (tcp_should_expand_sndbuf(sk)) {
++ if (tp->ops->should_expand_sndbuf(sk)) {
+ tcp_sndbuf_expand(sk);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ }
+@@ -4749,8 +4825,9 @@ static void tcp_check_space(struct sock *sk)
+ {
+ if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
+ sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
+- if (sk->sk_socket &&
+- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
++ if (mptcp(tcp_sk(sk)) ||
++ (sk->sk_socket &&
++ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)))
+ tcp_new_space(sk);
+ }
+ }
+@@ -4773,7 +4850,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+ /* ... and right edge of window advances far enough.
+ * (tcp_recvmsg() will send ACK otherwise). Or...
+ */
+- __tcp_select_window(sk) >= tp->rcv_wnd) ||
++ tp->ops->__select_window(sk) >= tp->rcv_wnd) ||
+ /* We ACK each frame or... */
+ tcp_in_quickack_mode(sk) ||
+ /* We have out of order data. */
+@@ -4875,6 +4952,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+
++ /* MPTCP urgent data is not yet supported */
++ if (mptcp(tp))
++ return;
++
+ /* Check if we get a new urgent pointer - normally not. */
+ if (th->urg)
+ tcp_check_urg(sk, th);
+@@ -4942,8 +5023,7 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
+ }
+
+ #ifdef CONFIG_NET_DMA
+-static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
+- int hlen)
++bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ int chunk = skb->len - hlen;
+@@ -5052,9 +5132,15 @@ syn_challenge:
+ goto discard;
+ }
+
++ /* If valid: post process the received MPTCP options. */
++ if (mptcp(tp) && mptcp_handle_options(sk, th, skb))
++ goto discard;
++
+ return true;
+
+ discard:
++ if (mptcp(tp))
++ mptcp_reset_mopt(tp);
+ __kfree_skb(skb);
+ return false;
+ }
+@@ -5106,6 +5192,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+
+ tp->rx_opt.saw_tstamp = 0;
+
++ /* MPTCP: force slowpath. */
++ if (mptcp(tp))
++ goto slow_path;
++
+ /* pred_flags is 0xS?10 << 16 + snd_wnd
+ * if header_prediction is to be made
+ * 'S' will always be tp->tcp_header_len >> 2
+@@ -5205,7 +5295,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+ }
+ if (copied_early)
+- tcp_cleanup_rbuf(sk, skb->len);
++ tp->ops->cleanup_rbuf(sk, skb->len);
+ }
+ if (!eaten) {
+ if (tcp_checksum_complete_user(sk, skb))
+@@ -5313,14 +5403,14 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
+
+ tcp_init_metrics(sk);
+
+- tcp_init_congestion_control(sk);
++ tp->ops->init_congestion_control(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data
+ * packet.
+ */
+ tp->lsndtime = tcp_time_stamp;
+
+- tcp_init_buffer_space(sk);
++ tp->ops->init_buffer_space(sk);
+
+ if (sock_flag(sk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+@@ -5350,7 +5440,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+ /* Get original SYNACK MSS value if user MSS sets mss_clamp */
+ tcp_clear_options(&opt);
+ opt.user_mss = opt.mss_clamp = 0;
+- tcp_parse_options(synack, &opt, 0, NULL);
++ tcp_parse_options(synack, &opt, NULL, 0, NULL);
+ mss = opt.mss_clamp;
+ }
+
+@@ -5365,7 +5455,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+
+ tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
+
+- if (data) { /* Retransmit unacked data in SYN */
++ /* In mptcp case, we do not rely on "retransmit", but instead on
++ * "transmit", because if fastopen data is not acked, the retransmission
++ * becomes the first MPTCP data (see mptcp_rcv_synsent_fastopen).
++ */
++ if (data && !mptcp(tp)) { /* Retransmit unacked data in SYN */
+ tcp_for_write_queue_from(data, sk) {
+ if (data == tcp_send_head(sk) ||
+ __tcp_retransmit_skb(sk, data))
+@@ -5388,8 +5482,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ int saved_clamp = tp->rx_opt.mss_clamp;
++ struct mptcp_options_received mopt;
++ mptcp_init_mp_opt(&mopt);
+
+- tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
++ tcp_parse_options(skb, &tp->rx_opt,
++ mptcp(tp) ? &tp->mptcp->rx_opt : &mopt, 0, &foc);
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
+@@ -5448,6 +5545,30 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+ tcp_ack(sk, skb, FLAG_SLOWPATH);
+
++ if (tp->request_mptcp || mptcp(tp)) {
++ int ret;
++ ret = mptcp_rcv_synsent_state_process(sk, &sk,
++ skb, &mopt);
++
++ /* May have changed if we support MPTCP */
++ tp = tcp_sk(sk);
++ icsk = inet_csk(sk);
++
++ if (ret == 1)
++ goto reset_and_undo;
++ if (ret == 2)
++ goto discard;
++ }
++
++ if (mptcp(tp) && !is_master_tp(tp)) {
++ /* Timer for repeating the ACK until an answer
++ * arrives. Used only when establishing an additional
++ * subflow inside of an MPTCP connection.
++ */
++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
++ jiffies + icsk->icsk_rto);
++ }
++
+ /* Ok.. it's good. Set up sequence numbers and
+ * move to established.
+ */
+@@ -5474,6 +5595,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ }
+
++ if (mptcp(tp)) {
++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
++ }
++
+ if (tcp_is_sack(tp) && sysctl_tcp_fack)
+ tcp_enable_fack(tp);
+
+@@ -5494,9 +5620,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ tcp_rcv_fastopen_synack(sk, skb, &foc))
+ return -1;
+
+- if (sk->sk_write_pending ||
++ /* With MPTCP we cannot send data on the third ack due to the
++ * lack of option-space to combine with an MP_CAPABLE.
++ */
++ if (!mptcp(tp) && (sk->sk_write_pending ||
+ icsk->icsk_accept_queue.rskq_defer_accept ||
+- icsk->icsk_ack.pingpong) {
++ icsk->icsk_ack.pingpong)) {
+ /* Save one ACK. Data will be ready after
+ * several ticks, if write_pending is set.
+ *
+@@ -5536,6 +5665,7 @@ discard:
+ tcp_paws_reject(&tp->rx_opt, 0))
+ goto discard_and_undo;
+
++ /* TODO - check this here for MPTCP */
+ if (th->syn) {
+ /* We see SYN without ACK. It is attempt of
+ * simultaneous connect with crossed SYNs.
+@@ -5552,6 +5682,11 @@ discard:
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ }
+
++ if (mptcp(tp)) {
++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
++ }
++
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+@@ -5610,6 +5745,7 @@ reset_and_undo:
+
+ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, unsigned int len)
++ __releases(&sk->sk_lock.slock)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+@@ -5661,6 +5797,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+
+ case TCP_SYN_SENT:
+ queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
++ if (is_meta_sk(sk)) {
++ sk = tcp_sk(sk)->mpcb->master_sk;
++ tp = tcp_sk(sk);
++
++ /* Need to call it here, because it will announce new
++ * addresses, which can only be done after the third ack
++ * of the 3-way handshake.
++ */
++ mptcp_update_metasocket(sk, tp->meta_sk);
++ }
+ if (queued >= 0)
+ return queued;
+
+@@ -5668,6 +5814,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ tcp_urg(sk, skb, th);
+ __kfree_skb(skb);
+ tcp_data_snd_check(sk);
++ if (mptcp(tp) && is_master_tp(tp))
++ bh_unlock_sock(sk);
+ return 0;
+ }
+
+@@ -5706,11 +5854,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ synack_stamp = tp->lsndtime;
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+- tcp_init_congestion_control(sk);
++ tp->ops->init_congestion_control(sk);
+
+ tcp_mtup_init(sk);
+ tp->copied_seq = tp->rcv_nxt;
+- tcp_init_buffer_space(sk);
++ tp->ops->init_buffer_space(sk);
+ }
+ smp_mb();
+ tcp_set_state(sk, TCP_ESTABLISHED);
+@@ -5730,6 +5878,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+
+ if (tp->rx_opt.tstamp_ok)
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
++ if (mptcp(tp))
++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
+
+ if (req) {
+ /* Re-arm the timer because data may have been sent out.
+@@ -5751,6 +5901,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+
+ tcp_initialize_rcv_mss(sk);
+ tcp_fast_path_on(tp);
++ /* Send an ACK when establishing a new
++ * MPTCP subflow, i.e. using an MP_JOIN
++ * subtype.
++ */
++ if (mptcp(tp) && !is_master_tp(tp))
++ tcp_send_ack(sk);
+ break;
+
+ case TCP_FIN_WAIT1: {
+@@ -5802,7 +5958,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ tmo = tcp_fin_time(sk);
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+- } else if (th->fin || sock_owned_by_user(sk)) {
++ } else if (th->fin || mptcp_is_data_fin(skb) ||
++ sock_owned_by_user(sk)) {
+ /* Bad case. We could lose such FIN otherwise.
+ * It is not a big problem, but it looks confusing
+ * and not so rare event. We still can lose it now,
+@@ -5811,7 +5968,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ */
+ inet_csk_reset_keepalive_timer(sk, tmo);
+ } else {
+- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto discard;
+ }
+ break;
+ }
+
+ case TCP_CLOSING:
+ if (tp->snd_una == tp->write_seq) {
+- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
+ goto discard;
+ }
+ break;
+@@ -5831,6 +5988,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ goto discard;
+ }
+ break;
++ case TCP_CLOSE:
++ if (tp->mp_killed)
++ goto discard;
+ }
+
+ /* step 6: check the URG bit */
+@@ -5851,7 +6011,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ */
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
++ !mptcp(tp)) {
++ /* In case of mptcp, the reset is handled by
++ * mptcp_rcv_state_process
++ */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ tcp_reset(sk);
+ return 1;
+@@ -5877,3 +6041,154 @@ discard:
+ return 0;
+ }
+ EXPORT_SYMBOL(tcp_rcv_state_process);
++
++static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
++{
++ struct inet_request_sock *ireq = inet_rsk(req);
++
++ if (family == AF_INET)
++ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
++ &ireq->ir_rmt_addr, port);
++#if IS_ENABLED(CONFIG_IPV6)
++ else if (family == AF_INET6)
++ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
++ &ireq->ir_v6_rmt_addr, port);
++#endif
++}
++
++int tcp_conn_request(struct request_sock_ops *rsk_ops,
++ const struct tcp_request_sock_ops *af_ops,
++ struct sock *sk, struct sk_buff *skb)
++{
++ struct tcp_options_received tmp_opt;
++ struct request_sock *req;
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct dst_entry *dst = NULL;
++ __u32 isn = TCP_SKB_CB(skb)->when;
++ bool want_cookie = false, fastopen;
++ struct flowi fl;
++ struct tcp_fastopen_cookie foc = { .len = -1 };
++ int err;
++
++
++ /* TW buckets are converted to open requests without
++ * limitations, they conserve resources and peer is
++ * evidently real one.
++ */
++ if ((sysctl_tcp_syncookies == 2 ||
++ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
++ want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
++ if (!want_cookie)
++ goto drop;
++ }
++
++
++ /* Accept backlog is full. If we have already queued enough
++ * of warm entries in syn queue, drop request. It is better than
++ * clogging syn queue with openreqs with exponentially increasing
++ * timeout.
++ */
++ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
++ goto drop;
++ }
++
++ req = inet_reqsk_alloc(rsk_ops);
++ if (!req)
++ goto drop;
++
++ tcp_rsk(req)->af_specific = af_ops;
++
++ tcp_clear_options(&tmp_opt);
++ tmp_opt.mss_clamp = af_ops->mss_clamp;
++ tmp_opt.user_mss = tp->rx_opt.user_mss;
++ tcp_parse_options(skb, &tmp_opt, NULL, 0, want_cookie ? NULL : &foc);
++
++ if (want_cookie && !tmp_opt.saw_tstamp)
++ tcp_clear_options(&tmp_opt);
++
++ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
++ tcp_openreq_init(req, &tmp_opt, skb);
++
++ if (af_ops->init_req(req, sk, skb))
++ goto drop_and_free;
++
++ if (security_inet_conn_request(sk, skb, req))
++ goto drop_and_free;
++
++ if (!want_cookie || tmp_opt.tstamp_ok)
++ TCP_ECN_create_request(req, skb, sock_net(sk));
++
++ if (want_cookie) {
++ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
++ req->cookie_ts = tmp_opt.tstamp_ok;
++ } else if (!isn) {
++ /* VJ's idea. We save last timestamp seen
++ * from the destination in peer table, when entering
++ * state TIME-WAIT, and check against it before
++ * accepting new connection request.
++ *
++ * If "isn" is not zero, this request hit alive
++ * timewait bucket, so that all the necessary checks
++ * are made in the function processing timewait state.
++ */
++ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
++ bool strict;
++
++ dst = af_ops->route_req(sk, &fl, req, &strict);
++ if (dst && strict &&
++ !tcp_peer_is_proven(req, dst, true)) {
++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
++ goto drop_and_release;
++ }
++ }
++ /* Kill the following clause, if you dislike this way. */
++ else if (!sysctl_tcp_syncookies &&
++ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
++ (sysctl_max_syn_backlog >> 2)) &&
++ !tcp_peer_is_proven(req, dst, false)) {
++ /* Without syncookies last quarter of
++ * backlog is filled with destinations,
++ * proven to be alive.
++ * It means that we continue to communicate
++ * to destinations, already remembered
++ * to the moment of synflood.
++ */
++ pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
++ rsk_ops->family);
++ goto drop_and_release;
++ }
++
++ isn = af_ops->init_seq(skb);
++ }
++ if (!dst) {
++ dst = af_ops->route_req(sk, &fl, req, NULL);
++ if (!dst)
++ goto drop_and_free;
++ }
++
++ tcp_rsk(req)->snt_isn = isn;
++ tcp_openreq_init_rwin(req, sk, dst);
++ fastopen = !want_cookie &&
++ tcp_try_fastopen(sk, skb, req, &foc, dst);
++ err = af_ops->send_synack(sk, dst, &fl, req,
++ skb_get_queue_mapping(skb), &foc);
++ if (!fastopen) {
++ if (err || want_cookie)
++ goto drop_and_free;
++
++ tcp_rsk(req)->listener = NULL;
++ af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
++ }
++
++ return 0;
4295 |
++ |
4296 |
++drop_and_release: |
4297 |
++ dst_release(dst); |
4298 |
++drop_and_free: |
4299 |
++ reqsk_free(req); |
4300 |
++drop: |
4301 |
++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); |
4302 |
++ return 0; |
4303 |
++} |
4304 |
++EXPORT_SYMBOL(tcp_conn_request); |
4305 |
+diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
+index 77cccda1ad0c..c77017f600f1 100644
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -67,6 +67,8 @@
+ #include <net/icmp.h>
+ #include <net/inet_hashtables.h>
+ #include <net/tcp.h>
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
+ #include <net/transp_v6.h>
+ #include <net/ipv6.h>
+ #include <net/inet_common.h>
+@@ -99,7 +101,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ struct inet_hashinfo tcp_hashinfo;
+ EXPORT_SYMBOL(tcp_hashinfo);
+
+-static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
++__u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+ {
+ return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr,
+@@ -334,7 +336,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ struct inet_sock *inet;
+ const int type = icmp_hdr(icmp_skb)->type;
+ const int code = icmp_hdr(icmp_skb)->code;
+- struct sock *sk;
++ struct sock *sk, *meta_sk;
+ struct sk_buff *skb;
+ struct request_sock *fastopen;
+ __u32 seq, snd_una;
+@@ -358,13 +360,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ return;
+ }
+
+- bh_lock_sock(sk);
++ tp = tcp_sk(sk);
++ if (mptcp(tp))
++ meta_sk = mptcp_meta_sk(sk);
++ else
++ meta_sk = sk;
++
++ bh_lock_sock(meta_sk);
+ /* If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ * We do take care of PMTU discovery (RFC1191) special case :
+ * we can receive locally generated ICMP messages while socket is held.
+ */
+- if (sock_owned_by_user(sk)) {
++ if (sock_owned_by_user(meta_sk)) {
+ if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ }
+@@ -377,7 +385,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ }
+
+ icsk = inet_csk(sk);
+- tp = tcp_sk(sk);
+ seq = ntohl(th->seq);
+ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
+ fastopen = tp->fastopen_rsk;
+@@ -411,11 +418,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ goto out;
+
+ tp->mtu_info = info;
+- if (!sock_owned_by_user(sk)) {
++ if (!sock_owned_by_user(meta_sk)) {
+ tcp_v4_mtu_reduced(sk);
+ } else {
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+ sock_hold(sk);
++ if (mptcp(tp))
++ mptcp_tsq_flags(sk);
+ }
+ goto out;
+ }
+@@ -429,7 +438,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ !icsk->icsk_backoff || fastopen)
+ break;
+
+- if (sock_owned_by_user(sk))
++ if (sock_owned_by_user(meta_sk))
+ break;
+
+ icsk->icsk_backoff--;
+@@ -463,7 +472,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ switch (sk->sk_state) {
+ struct request_sock *req, **prev;
+ case TCP_LISTEN:
+- if (sock_owned_by_user(sk))
++ if (sock_owned_by_user(meta_sk))
+ goto out;
+
+ req = inet_csk_search_req(sk, &prev, th->dest,
+@@ -499,7 +508,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ if (fastopen && fastopen->sk == NULL)
+ break;
+
+- if (!sock_owned_by_user(sk)) {
++ if (!sock_owned_by_user(meta_sk)) {
+ sk->sk_err = err;
+
+ sk->sk_error_report(sk);
+@@ -528,7 +537,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ */
+
+ inet = inet_sk(sk);
+- if (!sock_owned_by_user(sk) && inet->recverr) {
++ if (!sock_owned_by_user(meta_sk) && inet->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else { /* Only an error on timeout */
+@@ -536,7 +545,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ }
+
+ out:
+- bh_unlock_sock(sk);
++ bh_unlock_sock(meta_sk);
+ sock_put(sk);
+ }
+
+@@ -578,7 +587,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
+ * Exception: precedence violation. We do not implement it in any case.
+ */
+
+-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
++void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
+ {
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct {
+@@ -702,10 +711,10 @@ release_sk1:
+ outside socket context is ugly, certainly. What can I do?
+ */
+
+-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
++static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
+ u32 win, u32 tsval, u32 tsecr, int oif,
+ struct tcp_md5sig_key *key,
+- int reply_flags, u8 tos)
++ int reply_flags, u8 tos, int mptcp)
+ {
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct {
+@@ -714,6 +723,10 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+ #ifdef CONFIG_TCP_MD5SIG
+ + (TCPOLEN_MD5SIG_ALIGNED >> 2)
+ #endif
++#ifdef CONFIG_MPTCP
++ + ((MPTCP_SUB_LEN_DSS >> 2) +
++ (MPTCP_SUB_LEN_ACK >> 2))
++#endif
+ ];
+ } rep;
+ struct ip_reply_arg arg;
+@@ -758,6 +771,21 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+ ip_hdr(skb)->daddr, &rep.th);
+ }
+ #endif
++#ifdef CONFIG_MPTCP
++ if (mptcp) {
++ int offset = (tsecr) ? 3 : 0;
++ /* Construction of 32-bit data_ack */
++ rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
++ (0x20 << 8) |
++ (0x01));
++ rep.opt[offset] = htonl(data_ack);
++
++ arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
++ rep.th.doff = arg.iov[0].iov_len / 4;
++ }
++#endif /* CONFIG_MPTCP */
++
+ arg.flags = reply_flags;
+ arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr, /* XXX */
+@@ -776,36 +804,44 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+ {
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
++ u32 data_ack = 0;
++ int mptcp = 0;
++
++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
++ mptcp = 1;
++ }
+
+ tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
++ data_ack,
+ tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+ tcp_time_stamp + tcptw->tw_ts_offset,
+ tcptw->tw_ts_recent,
+ tw->tw_bound_dev_if,
+ tcp_twsk_md5_key(tcptw),
+ tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
+- tw->tw_tos
++ tw->tw_tos, mptcp
+ );
+
+ inet_twsk_put(tw);
+ }
+
+-static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+- struct request_sock *req)
++void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
++ struct request_sock *req)
+ {
+ /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+ * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+ */
+ tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
+ tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+- tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
++ tcp_rsk(req)->rcv_nxt, 0, req->rcv_wnd,
+ tcp_time_stamp,
+ req->ts_recent,
+ 0,
+ tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
+ AF_INET),
+ inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
+- ip_hdr(skb)->tos);
++ ip_hdr(skb)->tos, 0);
+ }
+
+ /*
+@@ -813,10 +849,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ * This still operates on a request_sock only, not on a big
+ * socket.
+ */
+-static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+- struct request_sock *req,
+- u16 queue_mapping,
+- struct tcp_fastopen_cookie *foc)
++int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
++ struct flowi *fl,
++ struct request_sock *req,
++ u16 queue_mapping,
++ struct tcp_fastopen_cookie *foc)
+ {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct flowi4 fl4;
+@@ -844,21 +881,10 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+ return err;
+ }
+
+-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
+-{
+- int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
+-
+- if (!res) {
+- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+- }
+- return res;
+-}
+-
+ /*
+ * IPv4 request_sock destructor.
+ */
+-static void tcp_v4_reqsk_destructor(struct request_sock *req)
++void tcp_v4_reqsk_destructor(struct request_sock *req)
+ {
+ kfree(inet_rsk(req)->opt);
+ }
+@@ -896,7 +922,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
+ /*
+ * Save and compile IPv4 options into the request_sock if needed.
+ */
+-static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
++struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
+ {
+ const struct ip_options *opt = &(IPCB(skb)->opt);
+ struct ip_options_rcu *dopt = NULL;
+@@ -1237,161 +1263,71 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
+
+ #endif
+
++static int tcp_v4_init_req(struct request_sock *req, struct sock *sk,
++ struct sk_buff *skb)
++{
++ struct inet_request_sock *ireq = inet_rsk(req);
++
++ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
++ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
++ ireq->no_srccheck = inet_sk(sk)->transparent;
++ ireq->opt = tcp_v4_save_options(skb);
++ ireq->ir_mark = inet_request_mark(sk, skb);
++
++ return 0;
++}
++
++static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
++ const struct request_sock *req,
++ bool *strict)
++{
++ struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
++
++ if (strict) {
++ if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
++ *strict = true;
++ else
++ *strict = false;
++ }
++
++ return dst;
++}
++
+ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
+ .family = PF_INET,
+ .obj_size = sizeof(struct tcp_request_sock),
+- .rtx_syn_ack = tcp_v4_rtx_synack,
++ .rtx_syn_ack = tcp_rtx_synack,
+ .send_ack = tcp_v4_reqsk_send_ack,
+ .destructor = tcp_v4_reqsk_destructor,
+ .send_reset = tcp_v4_send_reset,
+ .syn_ack_timeout = tcp_syn_ack_timeout,
+ };
+
++const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
++ .mss_clamp = TCP_MSS_DEFAULT,
+ #ifdef CONFIG_TCP_MD5SIG
+-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+ .md5_lookup = tcp_v4_reqsk_md5_lookup,
+ .calc_md5_hash = tcp_v4_md5_hash_skb,
+-};
+ #endif
++ .init_req = tcp_v4_init_req,
++#ifdef CONFIG_SYN_COOKIES
++ .cookie_init_seq = cookie_v4_init_sequence,
++#endif
++ .route_req = tcp_v4_route_req,
++ .init_seq = tcp_v4_init_sequence,
++ .send_synack = tcp_v4_send_synack,
++ .queue_hash_add = inet_csk_reqsk_queue_hash_add,
++};
+
+ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+ {
+- struct tcp_options_received tmp_opt;
+- struct request_sock *req;
+- struct inet_request_sock *ireq;
+- struct tcp_sock *tp = tcp_sk(sk);
+- struct dst_entry *dst = NULL;
+- __be32 saddr = ip_hdr(skb)->saddr;
+- __be32 daddr = ip_hdr(skb)->daddr;
+- __u32 isn = TCP_SKB_CB(skb)->when;
+- bool want_cookie = false, fastopen;
+- struct flowi4 fl4;
+- struct tcp_fastopen_cookie foc = { .len = -1 };
+- int err;
+-
+ /* Never answer to SYNs send to broadcast or multicast */
+ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto drop;
+
+- /* TW buckets are converted to open requests without
+- * limitations, they conserve resources and peer is
+- * evidently real one.
+- */
+- if ((sysctl_tcp_syncookies == 2 ||
+- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+- want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
+- if (!want_cookie)
+- goto drop;
+- }
+-
+- /* Accept backlog is full. If we have already queued enough
+- * of warm entries in syn queue, drop request. It is better than
+- * clogging syn queue with openreqs with exponentially increasing
+- * timeout.
+- */
+- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+- goto drop;
+- }
+-
+- req = inet_reqsk_alloc(&tcp_request_sock_ops);
+- if (!req)
+- goto drop;
+-
+-#ifdef CONFIG_TCP_MD5SIG
+- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
+-#endif
+-
+- tcp_clear_options(&tmp_opt);
+- tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
+- tmp_opt.user_mss = tp->rx_opt.user_mss;
+- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
+-
+- if (want_cookie && !tmp_opt.saw_tstamp)
+- tcp_clear_options(&tmp_opt);
+-
+- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+- tcp_openreq_init(req, &tmp_opt, skb);
++ return tcp_conn_request(&tcp_request_sock_ops,
++ &tcp_request_sock_ipv4_ops, sk, skb);
+
+- ireq = inet_rsk(req);
+- ireq->ir_loc_addr = daddr;
+- ireq->ir_rmt_addr = saddr;
+- ireq->no_srccheck = inet_sk(sk)->transparent;
+- ireq->opt = tcp_v4_save_options(skb);
+- ireq->ir_mark = inet_request_mark(sk, skb);
+-
+- if (security_inet_conn_request(sk, skb, req))
+- goto drop_and_free;
+-
+- if (!want_cookie || tmp_opt.tstamp_ok)
+- TCP_ECN_create_request(req, skb, sock_net(sk));
+-
+- if (want_cookie) {
+- isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+- req->cookie_ts = tmp_opt.tstamp_ok;
+- } else if (!isn) {
+- /* VJ's idea. We save last timestamp seen
+- * from the destination in peer table, when entering
+- * state TIME-WAIT, and check against it before
+- * accepting new connection request.
+- *
+- * If "isn" is not zero, this request hit alive
+- * timewait bucket, so that all the necessary checks
+- * are made in the function processing timewait state.
+- */
+- if (tmp_opt.saw_tstamp &&
+- tcp_death_row.sysctl_tw_recycle &&
+- (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
+- fl4.daddr == saddr) {
+- if (!tcp_peer_is_proven(req, dst, true)) {
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+- goto drop_and_release;
+- }
+- }
+- /* Kill the following clause, if you dislike this way. */
+- else if (!sysctl_tcp_syncookies &&
+- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+- (sysctl_max_syn_backlog >> 2)) &&
+- !tcp_peer_is_proven(req, dst, false)) {
+- /* Without syncookies last quarter of
+- * backlog is filled with destinations,
+- * proven to be alive.
+- * It means that we continue to communicate
+- * to destinations, already remembered
+- * to the moment of synflood.
+- */
+- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
+- &saddr, ntohs(tcp_hdr(skb)->source));
+- goto drop_and_release;
+- }
+-
+- isn = tcp_v4_init_sequence(skb);
+- }
+- if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+- goto drop_and_free;
+-
+- tcp_rsk(req)->snt_isn = isn;
+- tcp_rsk(req)->snt_synack = tcp_time_stamp;
+- tcp_openreq_init_rwin(req, sk, dst);
+- fastopen = !want_cookie &&
+- tcp_try_fastopen(sk, skb, req, &foc, dst);
+- err = tcp_v4_send_synack(sk, dst, req,
+- skb_get_queue_mapping(skb), &foc);
+- if (!fastopen) {
+- if (err || want_cookie)
+- goto drop_and_free;
+-
+- tcp_rsk(req)->snt_synack = tcp_time_stamp;
+- tcp_rsk(req)->listener = NULL;
+- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+- }
+-
+- return 0;
+-
+-drop_and_release:
+- dst_release(dst);
+-drop_and_free:
+- reqsk_free(req);
+ drop:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return 0;
+@@ -1497,7 +1433,7 @@ put_and_exit:
+ }
+ EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
+
+-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
++struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+ {
+ struct tcphdr *th = tcp_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
+@@ -1514,8 +1450,15 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+
+ if (nsk) {
+ if (nsk->sk_state != TCP_TIME_WAIT) {
++ /* Don't lock again the meta-sk. It has been locked
++ * before mptcp_v4_do_rcv.
++ */
++ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk))
++ bh_lock_sock(mptcp_meta_sk(nsk));
+ bh_lock_sock(nsk);
++
+ return nsk;
++
+ }
+ inet_twsk_put(inet_twsk(nsk));
+ return NULL;
+@@ -1550,6 +1493,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+ goto discard;
+ #endif
+
++ if (is_meta_sk(sk))
++ return mptcp_v4_do_rcv(sk, skb);
++
+ if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+ struct dst_entry *dst = sk->sk_rx_dst;
+
+@@ -1681,7 +1627,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
+ } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
+ wake_up_interruptible_sync_poll(sk_sleep(sk),
+ POLLIN | POLLRDNORM | POLLRDBAND);
+- if (!inet_csk_ack_scheduled(sk))
++ if (!inet_csk_ack_scheduled(sk) && !mptcp(tp))
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ (3 * tcp_rto_min(sk)) / 4,
+ TCP_RTO_MAX);
+@@ -1698,7 +1644,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
+ {
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+- struct sock *sk;
++ struct sock *sk, *meta_sk = NULL;
+ int ret;
+ struct net *net = dev_net(skb->dev);
+
+@@ -1732,18 +1678,42 @@ int tcp_v4_rcv(struct sk_buff *skb)
+ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+ skb->len - th->doff * 4);
+ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
++#ifdef CONFIG_MPTCP
++ TCP_SKB_CB(skb)->mptcp_flags = 0;
++ TCP_SKB_CB(skb)->dss_off = 0;
++#endif
+ TCP_SKB_CB(skb)->when = 0;
+ TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
+ TCP_SKB_CB(skb)->sacked = 0;
+
+ sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
+- if (!sk)
+- goto no_tcp_socket;
+
+ process:
+- if (sk->sk_state == TCP_TIME_WAIT)
++ if (sk && sk->sk_state == TCP_TIME_WAIT)
+ goto do_time_wait;
+
++#ifdef CONFIG_MPTCP
++ if (!sk && th->syn && !th->ack) {
++ int ret = mptcp_lookup_join(skb, NULL);
++
++ if (ret < 0) {
++ tcp_v4_send_reset(NULL, skb);
++ goto discard_it;
++ } else if (ret > 0) {
++ return 0;
++ }
++ }
++
++ /* Is there a pending request sock for this segment ? */
++ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
++ if (sk)
++ sock_put(sk);
++ return 0;
++ }
++#endif
++ if (!sk)
++ goto no_tcp_socket;
++
+ if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto discard_and_relse;
+@@ -1759,11 +1729,21 @@ process:
+ sk_mark_napi_id(sk, skb);
+ skb->dev = NULL;
+
+- bh_lock_sock_nested(sk);
++ if (mptcp(tcp_sk(sk))) {
++ meta_sk = mptcp_meta_sk(sk);
++
++ bh_lock_sock_nested(meta_sk);
++ if (sock_owned_by_user(meta_sk))
++ skb->sk = sk;
++ } else {
++ meta_sk = sk;
++ bh_lock_sock_nested(sk);
++ }
++
+ ret = 0;
+- if (!sock_owned_by_user(sk)) {
++ if (!sock_owned_by_user(meta_sk)) {
+ #ifdef CONFIG_NET_DMA
+- struct tcp_sock *tp = tcp_sk(sk);
++ struct tcp_sock *tp = tcp_sk(meta_sk);
+ if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+ tp->ucopy.dma_chan = net_dma_find_channel();
+ if (tp->ucopy.dma_chan)
+@@ -1771,16 +1751,16 @@ process:
+ else
+ #endif
+ {
+- if (!tcp_prequeue(sk, skb))
++ if (!tcp_prequeue(meta_sk, skb))
+ ret = tcp_v4_do_rcv(sk, skb);
+ }
+- } else if (unlikely(sk_add_backlog(sk, skb,
+- sk->sk_rcvbuf + sk->sk_sndbuf))) {
+- bh_unlock_sock(sk);
++ } else if (unlikely(sk_add_backlog(meta_sk, skb,
++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
++ bh_unlock_sock(meta_sk);
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+ goto discard_and_relse;
+ }
+- bh_unlock_sock(sk);
++ bh_unlock_sock(meta_sk);
+
+ sock_put(sk);
+
+@@ -1835,6 +1815,18 @@ do_time_wait:
+ sk = sk2;
+ goto process;
+ }
++#ifdef CONFIG_MPTCP
++ if (th->syn && !th->ack) {
++ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
++
++ if (ret < 0) {
++ tcp_v4_send_reset(NULL, skb);
++ goto discard_it;
++ } else if (ret > 0) {
++ return 0;
++ }
++ }
++#endif
+ /* Fall through to ACK */
+ }
+ case TCP_TW_ACK:
+@@ -1900,7 +1892,12 @@ static int tcp_v4_init_sock(struct sock *sk)
+
+ tcp_init_sock(sk);
+
+- icsk->icsk_af_ops = &ipv4_specific;
++#ifdef CONFIG_MPTCP
++ if (is_mptcp_enabled(sk))
++ icsk->icsk_af_ops = &mptcp_v4_specific;
++ else
++#endif
++ icsk->icsk_af_ops = &ipv4_specific;
+
+ #ifdef CONFIG_TCP_MD5SIG
+ tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
+@@ -1917,6 +1914,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
+
+ tcp_cleanup_congestion_control(sk);
+
++ if (mptcp(tp))
++ mptcp_destroy_sock(sk);
++ if (tp->inside_tk_table)
++ mptcp_hash_remove(tp);
++
+ /* Cleanup up the write buffer. */
+ tcp_write_queue_purge(sk);
+
+@@ -2481,6 +2483,19 @@ void tcp4_proc_exit(void)
+ }
+ #endif /* CONFIG_PROC_FS */
+
++#ifdef CONFIG_MPTCP
++static void tcp_v4_clear_sk(struct sock *sk, int size)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ /* we do not want to clear tk_table field, because of RCU lookups */
++ sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table));
++
++ size -= offsetof(struct tcp_sock, tk_table) + sizeof(tp->tk_table);
++ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size);
++}
++#endif
++
+ struct proto tcp_prot = {
+ .name = "TCP",
+ .owner = THIS_MODULE,
+@@ -2528,6 +2543,9 @@ struct proto tcp_prot = {
+ .destroy_cgroup = tcp_destroy_cgroup,
+ .proto_cgroup = tcp_proto_cgroup,
+ #endif
++#ifdef CONFIG_MPTCP
++ .clear_sk = tcp_v4_clear_sk,
++#endif
+ };
+ EXPORT_SYMBOL(tcp_prot);
+
+diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
+index e68e0d4af6c9..ae6946857dff 100644
+--- a/net/ipv4/tcp_minisocks.c
++++ b/net/ipv4/tcp_minisocks.c
+@@ -18,11 +18,13 @@
+ * Jorge Cwik, <jorge@×××××××××××××.net>
+ */
+
++#include <linux/kconfig.h>
+ #include <linux/mm.h>
+ #include <linux/module.h>
+ #include <linux/slab.h>
+ #include <linux/sysctl.h>
+ #include <linux/workqueue.h>
++#include <net/mptcp.h>
+ #include <net/tcp.h>
+ #include <net/inet_common.h>
+ #include <net/xfrm.h>
+@@ -95,10 +97,13 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+ struct tcp_options_received tmp_opt;
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+ bool paws_reject = false;
++ struct mptcp_options_received mopt;
+
+ tmp_opt.saw_tstamp = 0;
+ if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
+- tcp_parse_options(skb, &tmp_opt, 0, NULL);
++ mptcp_init_mp_opt(&mopt);
++
++ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
+
+ if (tmp_opt.saw_tstamp) {
+ tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
+@@ -106,6 +111,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+ tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
+ }
++
++ if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) {
++ if (mopt.mptcp_key == tcptw->mptcp_tw->loc_key)
++ goto kill_with_rst;
++ }
+ }
+
+ if (tw->tw_substate == TCP_FIN_WAIT2) {
+@@ -128,6 +138,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+ if (!th->ack ||
+ !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
+ TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
++ /* If mptcp_is_data_fin() returns true, we are sure that
++ * mopt has been initialized - otherwise it would not
++ * be a DATA_FIN.
++ */
++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw &&
++ mptcp_is_data_fin(skb) &&
++ TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
++ mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt)
++ return TCP_TW_ACK;
++
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+@@ -290,6 +310,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
+ tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+ tcptw->tw_ts_offset = tp->tsoffset;
+
++ if (mptcp(tp)) {
++ if (mptcp_init_tw_sock(sk, tcptw)) {
++ inet_twsk_free(tw);
++ goto exit;
++ }
++ } else {
++ tcptw->mptcp_tw = NULL;
++ }
++
+ #if IS_ENABLED(CONFIG_IPV6)
+ if (tw->tw_family == PF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+@@ -347,15 +376,18 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
+ }
+
++exit:
+ tcp_update_metrics(sk);
+ tcp_done(sk);
+ }
+
+ void tcp_twsk_destructor(struct sock *sk)
+ {
+-#ifdef CONFIG_TCP_MD5SIG
+ struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+
++ if (twsk->mptcp_tw)
++ mptcp_twsk_destructor(twsk);
++#ifdef CONFIG_TCP_MD5SIG
+ if (twsk->tw_md5_key)
+ kfree_rcu(twsk->tw_md5_key, rcu);
+ #endif
+@@ -382,13 +414,14 @@ void tcp_openreq_init_rwin(struct request_sock *req,
+ req->window_clamp = tcp_full_space(sk);
+
+ /* tcp_full_space because it is guaranteed to be the first packet */
+- tcp_select_initial_window(tcp_full_space(sk),
+- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
++ tp->ops->select_initial_window(tcp_full_space(sk),
++ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) -
++ (ireq->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0),
+ &req->rcv_wnd,
+ &req->window_clamp,
+ ireq->wscale_ok,
+ &rcv_wscale,
+- dst_metric(dst, RTAX_INITRWND));
++ dst_metric(dst, RTAX_INITRWND), sk);
+ ireq->rcv_wscale = rcv_wscale;
+ }
+ EXPORT_SYMBOL(tcp_openreq_init_rwin);
+@@ -499,6 +532,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
+ newtp->rx_opt.ts_recent_stamp = 0;
+ newtp->tcp_header_len = sizeof(struct tcphdr);
+ }
++ if (ireq->saw_mpc)
++ newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
+ newtp->tsoffset = 0;
+ #ifdef CONFIG_TCP_MD5SIG
+ newtp->md5sig_info = NULL; /*XXX*/
+@@ -535,16 +570,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
+ bool fastopen)
+ {
+ struct tcp_options_received tmp_opt;
++ struct mptcp_options_received mopt;
+ struct sock *child;
+ const struct tcphdr *th = tcp_hdr(skb);
+ __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+ bool paws_reject = false;
+
+- BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
++ BUG_ON(!mptcp(tcp_sk(sk)) && fastopen == (sk->sk_state == TCP_LISTEN));
+
+ tmp_opt.saw_tstamp = 0;
++
++ mptcp_init_mp_opt(&mopt);
++
+ if (th->doff > (sizeof(struct tcphdr)>>2)) {
+- tcp_parse_options(skb, &tmp_opt, 0, NULL);
++ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
+
+ if (tmp_opt.saw_tstamp) {
+ tmp_opt.ts_recent = req->ts_recent;
+@@ -583,7 +622,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
+ *
+ * Reset timer after retransmitting SYNACK, similar to
+ * the idea of fast retransmit in recovery.
++ *
++ * Fall back to TCP if MP_CAPABLE is not set.
+ */
++
++ if (inet_rsk(req)->saw_mpc && !mopt.saw_mpc)
++ inet_rsk(req)->saw_mpc = false;
++
++
+ if (!inet_rtx_syn_ack(sk, req))
+ req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
+ TCP_RTO_MAX) + jiffies;
+@@ -718,9 +764,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
+ * socket is created, wait for troubles.
+ */
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
++
+ if (child == NULL)
+ goto listen_overflow;
+
++ if (!is_meta_sk(sk)) {
++ int ret = mptcp_check_req_master(sk, child, req, prev);
++ if (ret < 0)
++ goto listen_overflow;
++
++ /* MPTCP-supported */
++ if (!ret)
++ return tcp_sk(child)->mpcb->master_sk;
++ } else {
++ return mptcp_check_req_child(sk, child, req, prev, &mopt);
++ }
+ inet_csk_reqsk_queue_unlink(sk, req, prev);
+ inet_csk_reqsk_queue_removed(sk, req);
+
+@@ -746,7 +804,17 @@ embryonic_reset:
+ tcp_reset(sk);
+ }
+ if (!fastopen) {
+- inet_csk_reqsk_queue_drop(sk, req, prev);
++ if (is_meta_sk(sk)) {
++ /* We want to avoid stoping the keepalive-timer and so
++ * avoid ending up in inet_csk_reqsk_queue_removed ...
++ */
++ inet_csk_reqsk_queue_unlink(sk, req, prev);
++ if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0)
++ mptcp_delete_synack_timer(sk);
++ reqsk_free(req);
++ } else {
++ inet_csk_reqsk_queue_drop(sk, req, prev);
++ }
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+ }
+ return NULL;
+@@ -770,8 +838,9 @@ int tcp_child_process(struct sock *parent, struct sock *child,
+ {
+ int ret = 0;
+ int state = child->sk_state;
++ struct sock *meta_sk = mptcp(tcp_sk(child)) ? mptcp_meta_sk(child) : child;
+
+- if (!sock_owned_by_user(child)) {
++ if (!sock_owned_by_user(meta_sk)) {
+ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
+ skb->len);
+ /* Wakeup parent, send SIGIO */
+@@ -782,10 +851,14 @@ int tcp_child_process(struct sock *parent, struct sock *child,
+ * in main socket hash table and lock on listening
+ * socket does not protect us more.
+ */
+- __sk_add_backlog(child, skb);
++ if (mptcp(tcp_sk(child)))
++ skb->sk = child;
++ __sk_add_backlog(meta_sk, skb);
+ }
+
+- bh_unlock_sock(child);
++ if (mptcp(tcp_sk(child)))
++ bh_unlock_sock(child);
++ bh_unlock_sock(meta_sk);
+ sock_put(child);
+ return ret;
+ }
+diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c |
5239 |
+index 179b51e6bda3..efd31b6c5784 100644 |
5240 |
+--- a/net/ipv4/tcp_output.c |
5241 |
++++ b/net/ipv4/tcp_output.c |
5242 |
+@@ -36,6 +36,12 @@ |
5243 |
+ |
5244 |
+ #define pr_fmt(fmt) "TCP: " fmt |
5245 |
+ |
5246 |
++#include <net/mptcp.h> |
5247 |
++#include <net/mptcp_v4.h> |
5248 |
++#if IS_ENABLED(CONFIG_IPV6) |
5249 |
++#include <net/mptcp_v6.h> |
5250 |
++#endif |
5251 |
++#include <net/ipv6.h> |
5252 |
+ #include <net/tcp.h> |
5253 |
+ |
5254 |
+ #include <linux/compiler.h> |
5255 |
+@@ -68,11 +74,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; |
5256 |
+ unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; |
5257 |
+ EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); |
5258 |
+ |
5259 |
+-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, |
5260 |
+- int push_one, gfp_t gfp); |
5261 |
+- |
5262 |
+ /* Account for new data that has been sent to the network. */ |
5263 |
+-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) |
5264 |
++void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) |
5265 |
+ { |
5266 |
+ struct inet_connection_sock *icsk = inet_csk(sk); |
5267 |
+ struct tcp_sock *tp = tcp_sk(sk); |
5268 |
+@@ -214,7 +217,7 @@ u32 tcp_default_init_rwnd(u32 mss) |
5269 |
+ void tcp_select_initial_window(int __space, __u32 mss, |
5270 |
+ __u32 *rcv_wnd, __u32 *window_clamp, |
5271 |
+ int wscale_ok, __u8 *rcv_wscale, |
5272 |
+- __u32 init_rcv_wnd) |
5273 |
++ __u32 init_rcv_wnd, const struct sock *sk) |
5274 |
+ { |
5275 |
+ unsigned int space = (__space < 0 ? 0 : __space); |
5276 |
+ |
5277 |
+@@ -269,12 +272,16 @@ EXPORT_SYMBOL(tcp_select_initial_window); |
5278 |
+ * value can be stuffed directly into th->window for an outgoing |
5279 |
+ * frame. |
5280 |
+ */ |
5281 |
+-static u16 tcp_select_window(struct sock *sk) |
5282 |
++u16 tcp_select_window(struct sock *sk) |
5283 |
+ { |
5284 |
+ struct tcp_sock *tp = tcp_sk(sk); |
5285 |
+ u32 old_win = tp->rcv_wnd; |
5286 |
+- u32 cur_win = tcp_receive_window(tp); |
5287 |
+- u32 new_win = __tcp_select_window(sk); |
5288 |
++ /* The window must never shrink at the meta-level. At the subflow we |
5289 |
++ * have to allow this. Otherwise we may announce a window too large |
5290 |
++ * for the current meta-level sk_rcvbuf. |
5291 |
++ */ |
5292 |
++ u32 cur_win = tcp_receive_window(mptcp(tp) ? tcp_sk(mptcp_meta_sk(sk)) : tp); |
5293 |
++ u32 new_win = tp->ops->__select_window(sk); |
5294 |
+ |
5295 |
+ /* Never shrink the offered window */ |
5296 |
+ if (new_win < cur_win) { |
5297 |
+@@ -290,6 +297,7 @@ static u16 tcp_select_window(struct sock *sk) |
5298 |
+ LINUX_MIB_TCPWANTZEROWINDOWADV); |
5299 |
+ new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); |
5300 |
+ } |
5301 |
++ |
5302 |
+ tp->rcv_wnd = new_win; |
5303 |
+ tp->rcv_wup = tp->rcv_nxt; |
5304 |
+ |
5305 |
+@@ -374,7 +382,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, |
5306 |
+ /* Constructs common control bits of non-data skb. If SYN/FIN is present, |
5307 |
+ * auto increment end seqno. |
5308 |
+ */ |
5309 |
+-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) |
5310 |
++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) |
5311 |
+ { |
5312 |
+ struct skb_shared_info *shinfo = skb_shinfo(skb); |
5313 |
+ |
5314 |
+@@ -394,7 +402,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) |
5315 |
+ TCP_SKB_CB(skb)->end_seq = seq; |
5316 |
+ } |
5317 |
+ |
5318 |
+-static inline bool tcp_urg_mode(const struct tcp_sock *tp) |
5319 |
++bool tcp_urg_mode(const struct tcp_sock *tp) |
5320 |
+ { |
5321 |
+ return tp->snd_una != tp->snd_up; |
5322 |
+ } |
5323 |
+@@ -404,17 +412,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) |
5324 |
+ #define OPTION_MD5 (1 << 2) |
5325 |
+ #define OPTION_WSCALE (1 << 3) |
5326 |
+ #define OPTION_FAST_OPEN_COOKIE (1 << 8) |
5327 |
+- |
5328 |
+-struct tcp_out_options { |
5329 |
+- u16 options; /* bit field of OPTION_* */ |
5330 |
+- u16 mss; /* 0 to disable */ |
5331 |
+- u8 ws; /* window scale, 0 to disable */ |
5332 |
+- u8 num_sack_blocks; /* number of SACK blocks to include */ |
5333 |
+- u8 hash_size; /* bytes in hash_location */ |
5334 |
+- __u8 *hash_location; /* temporary pointer, overloaded */ |
5335 |
+- __u32 tsval, tsecr; /* need to include OPTION_TS */ |
5336 |
+- struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ |
5337 |
+-}; |
5338 |
++/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */ |
5339 |
+ |
5340 |
+ /* Write previously computed TCP options to the packet. |
5341 |
+ * |
5342 |
+@@ -430,7 +428,7 @@ struct tcp_out_options { |
5343 |
+ * (but it may well be that other scenarios fail similarly). |
5344 |
+ */ |
5345 |
+ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, |
5346 |
+- struct tcp_out_options *opts) |
5347 |
++ struct tcp_out_options *opts, struct sk_buff *skb) |
5348 |
+ { |
5349 |
+ u16 options = opts->options; /* mungable copy */ |
5350 |
+ |
5351 |
+@@ -513,6 +511,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, |
5352 |
+ } |
5353 |
+ ptr += (foc->len + 3) >> 2; |
5354 |
+ } |
5355 |
++ |
5356 |
++ if (unlikely(OPTION_MPTCP & opts->options)) |
5357 |
++ mptcp_options_write(ptr, tp, opts, skb); |
5358 |
+ } |
5359 |
+ |
5360 |
+ /* Compute TCP options for SYN packets. This is not the final |
5361 |
+@@ -564,6 +565,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, |
5362 |
+ if (unlikely(!(OPTION_TS & opts->options))) |
5363 |
+ remaining -= TCPOLEN_SACKPERM_ALIGNED; |
5364 |
+ } |
5365 |
++ if (tp->request_mptcp || mptcp(tp)) |
5366 |
++ mptcp_syn_options(sk, opts, &remaining); |
5367 |
+ |
5368 |
+ if (fastopen && fastopen->cookie.len >= 0) { |
5369 |
+ u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; |
5370 |
+@@ -637,6 +640,9 @@ static unsigned int tcp_synack_options(struct sock *sk, |
5371 |
+ } |
5372 |
+ } |
5373 |
+ |
5374 |
++ if (ireq->saw_mpc) |
5375 |
++ mptcp_synack_options(req, opts, &remaining); |
5376 |
++ |
5377 |
+ return MAX_TCP_OPTION_SPACE - remaining; |
5378 |
+ } |
5379 |
+ |
5380 |
+@@ -670,16 +676,22 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb |
5381 |
+ opts->tsecr = tp->rx_opt.ts_recent; |
5382 |
+ size += TCPOLEN_TSTAMP_ALIGNED; |
5383 |
+ } |
5384 |
++ if (mptcp(tp)) |
5385 |
++ mptcp_established_options(sk, skb, opts, &size); |
5386 |
+ |
5387 |
+ eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; |
5388 |
+ if (unlikely(eff_sacks)) { |
5389 |
+- const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; |
5390 |
+- opts->num_sack_blocks = |
5391 |
+- min_t(unsigned int, eff_sacks, |
5392 |
+- (remaining - TCPOLEN_SACK_BASE_ALIGNED) / |
5393 |
+- TCPOLEN_SACK_PERBLOCK); |
5394 |
+- size += TCPOLEN_SACK_BASE_ALIGNED + |
5395 |
+- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; |
5396 |
++ const unsigned remaining = MAX_TCP_OPTION_SPACE - size; |
5397 |
++ if (remaining < TCPOLEN_SACK_BASE_ALIGNED) |
5398 |
++ opts->num_sack_blocks = 0; |
5399 |
++ else |
5400 |
++ opts->num_sack_blocks = |
5401 |
++ min_t(unsigned int, eff_sacks, |
5402 |
++ (remaining - TCPOLEN_SACK_BASE_ALIGNED) / |
5403 |
++ TCPOLEN_SACK_PERBLOCK); |
5404 |
++ if (opts->num_sack_blocks) |
5405 |
++ size += TCPOLEN_SACK_BASE_ALIGNED + |
5406 |
++ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; |
5407 |
+ } |
5408 |
+ |
5409 |
+ return size; |
5410 |
+@@ -711,8 +723,8 @@ static void tcp_tsq_handler(struct sock *sk) |
5411 |
+ if ((1 << sk->sk_state) & |
5412 |
+ (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | |
5413 |
+ TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) |
5414 |
+- tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, |
5415 |
+- 0, GFP_ATOMIC); |
5416 |
++ tcp_sk(sk)->ops->write_xmit(sk, tcp_current_mss(sk), |
5417 |
++ tcp_sk(sk)->nonagle, 0, GFP_ATOMIC); |
5418 |
+ } |
5419 |
+ /* |
5420 |
+ * One tasklet per cpu tries to send more skbs. |
5421 |
+@@ -727,7 +739,7 @@ static void tcp_tasklet_func(unsigned long data) |
5422 |
+ unsigned long flags; |
5423 |
+ struct list_head *q, *n; |
5424 |
+ struct tcp_sock *tp; |
5425 |
+- struct sock *sk; |
5426 |
++ struct sock *sk, *meta_sk; |
5427 |
+ |
5428 |
+ local_irq_save(flags); |
5429 |
+ list_splice_init(&tsq->head, &list); |
5430 |
+@@ -738,15 +750,25 @@ static void tcp_tasklet_func(unsigned long data) |
5431 |
+ list_del(&tp->tsq_node); |
5432 |
+ |
5433 |
+ sk = (struct sock *)tp; |
5434 |
+- bh_lock_sock(sk); |
5435 |
++ meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; |
5436 |
++ bh_lock_sock(meta_sk); |
5437 |
+ |
5438 |
+- if (!sock_owned_by_user(sk)) { |
5439 |
++ if (!sock_owned_by_user(meta_sk)) { |
5440 |
+ tcp_tsq_handler(sk); |
5441 |
++ if (mptcp(tp)) |
5442 |
++ tcp_tsq_handler(meta_sk); |
5443 |
+ } else { |
5444 |
++ if (mptcp(tp) && sk->sk_state == TCP_CLOSE) |
5445 |
++ goto exit; |
5446 |
++ |
5447 |
+ /* defer the work to tcp_release_cb() */ |
5448 |
+ set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); |
5449 |
++ |
5450 |
++ if (mptcp(tp)) |
5451 |
++ mptcp_tsq_flags(sk); |
5452 |
+ } |
5453 |
+- bh_unlock_sock(sk); |
5454 |
++exit: |
5455 |
++ bh_unlock_sock(meta_sk); |
5456 |
+ |
5457 |
+ clear_bit(TSQ_QUEUED, &tp->tsq_flags); |
5458 |
+ sk_free(sk); |
5459 |
+@@ -756,7 +778,10 @@ static void tcp_tasklet_func(unsigned long data) |
5460 |
+ #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ |
5461 |
+ (1UL << TCP_WRITE_TIMER_DEFERRED) | \ |
5462 |
+ (1UL << TCP_DELACK_TIMER_DEFERRED) | \ |
5463 |
+- (1UL << TCP_MTU_REDUCED_DEFERRED)) |
5464 |
++ (1UL << TCP_MTU_REDUCED_DEFERRED) | \ |
5465 |
++ (1UL << MPTCP_PATH_MANAGER) | \ |
5466 |
++ (1UL << MPTCP_SUB_DEFERRED)) |
5467 |
++ |
5468 |
+ /** |
5469 |
+ * tcp_release_cb - tcp release_sock() callback |
5470 |
+ * @sk: socket |
5471 |
+@@ -803,6 +828,13 @@ void tcp_release_cb(struct sock *sk) |
5472 |
+ sk->sk_prot->mtu_reduced(sk); |
5473 |
+ __sock_put(sk); |
5474 |
+ } |
5475 |
++ if (flags & (1UL << MPTCP_PATH_MANAGER)) { |
5476 |
++ if (tcp_sk(sk)->mpcb->pm_ops->release_sock) |
5477 |
++ tcp_sk(sk)->mpcb->pm_ops->release_sock(sk); |
5478 |
++ __sock_put(sk); |
5479 |
++ } |
5480 |
++ if (flags & (1UL << MPTCP_SUB_DEFERRED)) |
5481 |
++ mptcp_tsq_sub_deferred(sk); |
5482 |
+ } |
5483 |
+ EXPORT_SYMBOL(tcp_release_cb); |
5484 |
+ |
5485 |
+@@ -862,8 +894,8 @@ void tcp_wfree(struct sk_buff *skb) |
5486 |
+ * We are working here with either a clone of the original |
5487 |
+ * SKB, or a fresh unique copy made by the retransmit engine. |
5488 |
+ */ |
5489 |
+-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, |
5490 |
+- gfp_t gfp_mask) |
5491 |
++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, |
5492 |
++ gfp_t gfp_mask) |
5493 |
+ { |
5494 |
+ const struct inet_connection_sock *icsk = inet_csk(sk); |
5495 |
+ struct inet_sock *inet; |
5496 |
+@@ -933,7 +965,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, |
5497 |
+ */ |
5498 |
+ th->window = htons(min(tp->rcv_wnd, 65535U)); |
5499 |
+ } else { |
5500 |
+- th->window = htons(tcp_select_window(sk)); |
5501 |
++ th->window = htons(tp->ops->select_window(sk)); |
5502 |
+ } |
5503 |
+ th->check = 0; |
5504 |
+ th->urg_ptr = 0; |
5505 |
+@@ -949,7 +981,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, |
5506 |
+ } |
5507 |
+ } |
5508 |
+ |
5509 |
+- tcp_options_write((__be32 *)(th + 1), tp, &opts); |
5510 |
++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb); |
5511 |
+ if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) |
5512 |
+ TCP_ECN_send(sk, skb, tcp_header_size); |
5513 |
+ |
5514 |
+@@ -988,7 +1020,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, |
5515 |
+ * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, |
5516 |
+ * otherwise socket can stall. |
5517 |
+ */ |
5518 |
+-static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) |
5519 |
++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) |
5520 |
+ { |
5521 |
+ struct tcp_sock *tp = tcp_sk(sk); |
5522 |
+ |
5523 |
+@@ -1001,15 +1033,16 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) |
5524 |
+ } |
5525 |
+ |
5526 |
+ /* Initialize TSO segments for a packet. */ |
5527 |
+-static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, |
5528 |
+- unsigned int mss_now) |
5529 |
++void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, |
5530 |
++ unsigned int mss_now) |
5531 |
+ { |
5532 |
+ struct skb_shared_info *shinfo = skb_shinfo(skb); |
5533 |
+ |
5534 |
+ /* Make sure we own this skb before messing gso_size/gso_segs */ |
5535 |
+ WARN_ON_ONCE(skb_cloned(skb)); |
5536 |
+ |
5537 |
+- if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { |
5538 |
++ if (skb->len <= mss_now || (is_meta_sk(sk) && !mptcp_sk_can_gso(sk)) || |
5539 |
++ (!is_meta_sk(sk) && !sk_can_gso(sk)) || skb->ip_summed == CHECKSUM_NONE) { |
5540 |
+ /* Avoid the costly divide in the normal |
5541 |
+ * non-TSO case. |
5542 |
+ */ |
5543 |
+@@ -1041,7 +1074,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, |
5544 |
+ /* Pcount in the middle of the write queue got changed, we need to do various |
5545 |
+ * tweaks to fix counters |
5546 |
+ */ |
5547 |
+-static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) |
5548 |
++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) |
5549 |
+ { |
5550 |
+ struct tcp_sock *tp = tcp_sk(sk); |
5551 |
+ |
5552 |
+@@ -1164,7 +1197,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, |
5553 |
+ * eventually). The difference is that pulled data not copied, but |
5554 |
+ * immediately discarded. |
5555 |
+ */ |
5556 |
+-static void __pskb_trim_head(struct sk_buff *skb, int len) |
5557 |
++void __pskb_trim_head(struct sk_buff *skb, int len) |
5558 |
+ { |
5559 |
+ struct skb_shared_info *shinfo; |
5560 |
+ int i, k, eat; |
5561 |
+@@ -1205,6 +1238,9 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) |
5562 |
+ /* Remove acked data from a packet in the transmit queue. */ |
5563 |
+ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) |
5564 |
+ { |
5565 |
++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk) && mptcp_is_data_seq(skb)) |
5566 |
++ return mptcp_trim_head(sk, skb, len); |
5567 |
++ |
5568 |
+ if (skb_unclone(skb, GFP_ATOMIC)) |
5569 |
+ return -ENOMEM; |
5570 |
+ |
5571 |
+@@ -1222,6 +1258,15 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) |
5572 |
+ if (tcp_skb_pcount(skb) > 1) |
5573 |
+ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); |
5574 |
+ |
5575 |
++#ifdef CONFIG_MPTCP |
5576 |
++ /* Some data got acked - we assume that the seq-number reached the dest. |
5577 |
++ * Anyway, our MPTCP-option has been trimmed above - we lost it here. |
5578 |
++ * Only remove the SEQ if the call does not come from a meta retransmit. |
5579 |
++ */ |
5580 |
++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk)) |
5581 |
++ TCP_SKB_CB(skb)->mptcp_flags &= ~MPTCPHDR_SEQ; |
5582 |
++#endif |
5583 |
++ |
5584 |
+ return 0; |
5585 |
+ } |
5586 |
+ |
5587 |
+@@ -1379,6 +1424,7 @@ unsigned int tcp_current_mss(struct sock *sk) |
5588 |
+ |
5589 |
+ return mss_now; |
5590 |
+ } |
5591 |
++EXPORT_SYMBOL(tcp_current_mss); |
5592 |
+ |
5593 |
+ /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. |
5594 |
+ * As additional protections, we do not touch cwnd in retransmission phases, |
5595 |
+@@ -1446,8 +1492,8 @@ static bool tcp_minshall_check(const struct tcp_sock *tp) |
5596 |
+ * But we can avoid doing the divide again given we already have |
5597 |
+ * skb_pcount = skb->len / mss_now |
5598 |
+ */ |
5599 |
+-static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, |
5600 |
+- const struct sk_buff *skb) |
5601 |
++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, |
5602 |
++ const struct sk_buff *skb) |
5603 |
+ { |
5604 |
+ if (skb->len < tcp_skb_pcount(skb) * mss_now) |
5605 |
+ tp->snd_sml = TCP_SKB_CB(skb)->end_seq; |
5606 |
+@@ -1468,11 +1514,11 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, |
5607 |
+ (!nonagle && tp->packets_out && tcp_minshall_check(tp))); |
5608 |
+ } |
5609 |
+ /* Returns the portion of skb which can be sent right away */ |
5610 |
+-static unsigned int tcp_mss_split_point(const struct sock *sk, |
5611 |
+- const struct sk_buff *skb, |
5612 |
+- unsigned int mss_now, |
5613 |
+- unsigned int max_segs, |
5614 |
+- int nonagle) |
5615 |
++unsigned int tcp_mss_split_point(const struct sock *sk, |
5616 |
++ const struct sk_buff *skb, |
5617 |
++ unsigned int mss_now, |
5618 |
++ unsigned int max_segs, |
5619 |
++ int nonagle) |
5620 |
+ { |
5621 |
+ const struct tcp_sock *tp = tcp_sk(sk); |
5622 |
+ u32 partial, needed, window, max_len; |
5623 |
+@@ -1502,13 +1548,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, |
5624 |
+ /* Can at least one segment of SKB be sent right now, according to the |
5625 |
+ * congestion window rules? If so, return how many segments are allowed. |
5626 |
+ */ |
5627 |
+-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, |
5628 |
+- const struct sk_buff *skb) |
5629 |
++unsigned int tcp_cwnd_test(const struct tcp_sock *tp, |
5630 |
++ const struct sk_buff *skb) |
5631 |
+ { |
5632 |
+ u32 in_flight, cwnd; |
5633 |
+ |
5634 |
+ /* Don't be strict about the congestion window for the final FIN. */ |
5635 |
+- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && |
5636 |
++ if (skb && |
5637 |
++ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && |
5638 |
+ tcp_skb_pcount(skb) == 1) |
5639 |
+ return 1; |
5640 |
+ |
5641 |
+@@ -1524,8 +1571,8 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, |
5642 |
+ * This must be invoked the first time we consider transmitting |
5643 |
+ * SKB onto the wire. |
5644 |
+ */ |
5645 |
+-static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, |
5646 |
+- unsigned int mss_now) |
5647 |
++int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, |
5648 |
++ unsigned int mss_now) |
5649 |
+ { |
5650 |
+ int tso_segs = tcp_skb_pcount(skb); |
5651 |
+ |
5652 |
+@@ -1540,8 +1587,8 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, |
5653 |
+ /* Return true if the Nagle test allows this packet to be |
5654 |
+ * sent now. |
5655 |
+ */ |
5656 |
+-static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, |
5657 |
+- unsigned int cur_mss, int nonagle) |
5658 |
++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, |
5659 |
++ unsigned int cur_mss, int nonagle) |
5660 |
+ { |
5661 |
+ /* Nagle rule does not apply to frames, which sit in the middle of the |
5662 |
+ * write_queue (they have no chance to get new data). |
5663 |
+@@ -1553,7 +1600,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf |
5664 |
+ return true; |
5665 |
+ |
5666 |
+ /* Don't use the nagle rule for urgent data (or for the final FIN). */ |
5667 |
+- if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) |
5668 |
++ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || |
5669 |
++ mptcp_is_data_fin(skb)) |
5670 |
+ return true; |
5671 |
+ |
5672 |
+ if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) |
5673 |
+@@ -1563,9 +1611,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf |
5674 |
+ } |
5675 |
+ |
5676 |
+ /* Does at least the first segment of SKB fit into the send window? */ |
5677 |
+-static bool tcp_snd_wnd_test(const struct tcp_sock *tp, |
5678 |
+- const struct sk_buff *skb, |
5679 |
+- unsigned int cur_mss) |
5680 |
++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, |
5681 |
++ unsigned int cur_mss) |
5682 |
+ { |
5683 |
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
5684 |
+ |
5685 |
+@@ -1676,7 +1723,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, |
5686 |
+ u32 send_win, cong_win, limit, in_flight; |
5687 |
+ int win_divisor; |
5688 |
+ |
5689 |
+- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
5690 |
++ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb)) |
5691 |
+ goto send_now; |
5692 |
+ |
5693 |
+ if (icsk->icsk_ca_state != TCP_CA_Open) |
5694 |
+@@ -1888,7 +1935,7 @@ static int tcp_mtu_probe(struct sock *sk) |
5695 |
+ * Returns true, if no segments are in flight and we have queued segments, |
5696 |
+ * but cannot send anything now because of SWS or another problem. |
5697 |
+ */ |
5698 |
+-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, |
5699 |
++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, |
5700 |
+ int push_one, gfp_t gfp) |
5701 |
+ { |
5702 |
+ struct tcp_sock *tp = tcp_sk(sk); |
5703 |
+@@ -1900,7 +1947,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, |
5704 |
+ |
5705 |
+ sent_pkts = 0; |
5706 |
+ |
5707 |
+- if (!push_one) { |
5708 |
++ /* pmtu probing is not yet supported with MPTCP. It should be |
5709 |
++ * possible by exiting the loop inside tcp_mtu_probe early, making |
5710 |
++ * sure that only a single DSS-mapping gets probed. |
5711 |
++ */ |
5712 |
++ if (!push_one && !mptcp(tp)) { |
5713 |
+ /* Do MTU probing. */ |
5714 |
+ result = tcp_mtu_probe(sk); |
5715 |
+ if (!result) { |
5716 |
+@@ -2099,7 +2150,8 @@ void tcp_send_loss_probe(struct sock *sk) |
5717 |
+ int err = -1; |
5718 |
+ |
5719 |
+ if (tcp_send_head(sk) != NULL) { |
5720 |
+- err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); |
5721 |
++ err = tp->ops->write_xmit(sk, mss, TCP_NAGLE_OFF, 2, |
5722 |
++ GFP_ATOMIC); |
5723 |
+ goto rearm_timer; |
5724 |
+ } |
5725 |
+ |
5726 |
+@@ -2159,8 +2211,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, |
5727 |
+ if (unlikely(sk->sk_state == TCP_CLOSE)) |
5728 |
+ return; |
5729 |
+ |
5730 |
+- if (tcp_write_xmit(sk, cur_mss, nonagle, 0, |
5731 |
+- sk_gfp_atomic(sk, GFP_ATOMIC))) |
5732 |
++ if (tcp_sk(sk)->ops->write_xmit(sk, cur_mss, nonagle, 0, |
5733 |
++ sk_gfp_atomic(sk, GFP_ATOMIC))) |
5734 |
+ tcp_check_probe_timer(sk); |
5735 |
+ } |
5736 |
+ |
5737 |
+@@ -2173,7 +2225,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) |
5738 |
+ |
5739 |
+ BUG_ON(!skb || skb->len < mss_now); |
5740 |
+ |
5741 |
+- tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); |
5742 |
++ tcp_sk(sk)->ops->write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, |
5743 |
++ sk->sk_allocation); |
5744 |
+ } |
5745 |
+ |
5746 |
+ /* This function returns the amount that we can raise the |
5747 |
+@@ -2386,6 +2439,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, |
5748 |
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) |
5749 |
+ return; |
5750 |
+ |
5751 |
++ /* Currently not supported for MPTCP - but it should be possible */ |
5752 |
++ if (mptcp(tp)) |
5753 |
++ return; |
5754 |
++ |
5755 |
+ tcp_for_write_queue_from_safe(skb, tmp, sk) { |
5756 |
+ if (!tcp_can_collapse(sk, skb)) |
5757 |
+ break; |
5758 |
+@@ -2843,7 +2900,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, |
5759 |
+ |
5760 |
+ /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ |
5761 |
+ th->window = htons(min(req->rcv_wnd, 65535U)); |
5762 |
+- tcp_options_write((__be32 *)(th + 1), tp, &opts); |
5763 |
++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb); |
5764 |
+ th->doff = (tcp_header_size >> 2); |
5765 |
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); |
5766 |
+ |
5767 |
+@@ -2897,13 +2954,13 @@ static void tcp_connect_init(struct sock *sk) |
5768 |
+ (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) |
5769 |
+ tp->window_clamp = tcp_full_space(sk); |
5770 |
+ |
5771 |
+- tcp_select_initial_window(tcp_full_space(sk), |
5772 |
+- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), |
5773 |
+- &tp->rcv_wnd, |
5774 |
+- &tp->window_clamp, |
5775 |
+- sysctl_tcp_window_scaling, |
5776 |
+- &rcv_wscale, |
5777 |
+- dst_metric(dst, RTAX_INITRWND)); |
5778 |
++ tp->ops->select_initial_window(tcp_full_space(sk), |
5779 |
++ tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), |
5780 |
++ &tp->rcv_wnd, |
5781 |
++ &tp->window_clamp, |
5782 |
++ sysctl_tcp_window_scaling, |
5783 |
++ &rcv_wscale, |
5784 |
++ dst_metric(dst, RTAX_INITRWND), sk); |
5785 |
+ |
5786 |
+ tp->rx_opt.rcv_wscale = rcv_wscale; |
5787 |
+ tp->rcv_ssthresh = tp->rcv_wnd; |
5788 |
+@@ -2927,6 +2984,36 @@ static void tcp_connect_init(struct sock *sk) |
5789 |
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; |
5790 |
+ inet_csk(sk)->icsk_retransmits = 0; |
5791 |
+ tcp_clear_retrans(tp); |
5792 |
++ |
5793 |
++#ifdef CONFIG_MPTCP |
5794 |
++ if (sysctl_mptcp_enabled && mptcp_doit(sk)) { |
5795 |
++ if (is_master_tp(tp)) { |
5796 |
++ tp->request_mptcp = 1; |
5797 |
++ mptcp_connect_init(sk); |
5798 |
++ } else if (tp->mptcp) { |
5799 |
++ struct inet_sock *inet = inet_sk(sk); |
5800 |
++ |
5801 |
++ tp->mptcp->snt_isn = tp->write_seq; |
5802 |
++ tp->mptcp->init_rcv_wnd = tp->rcv_wnd; |
5803 |
++ |
5804 |
++ /* Set nonce for new subflows */ |
5805 |
++ if (sk->sk_family == AF_INET) |
5806 |
++ tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce( |
5807 |
++ inet->inet_saddr, |
5808 |
++ inet->inet_daddr, |
5809 |
++ inet->inet_sport, |
5810 |
++ inet->inet_dport); |
5811 |
++#if IS_ENABLED(CONFIG_IPV6) |
5812 |
++ else |
5813 |
++ tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce( |
5814 |
++ inet6_sk(sk)->saddr.s6_addr32, |
5815 |
++ sk->sk_v6_daddr.s6_addr32, |
5816 |
++ inet->inet_sport, |
5817 |
++ inet->inet_dport); |
5818 |
++#endif |
5819 |
++ } |
5820 |
++ } |
5821 |
++#endif |
5822 |
+ } |
5823 |
+ |
5824 |
+ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) |
5825 |
+@@ -3176,6 +3263,7 @@ void tcp_send_ack(struct sock *sk) |
5826 |
+ TCP_SKB_CB(buff)->when = tcp_time_stamp; |
5827 |
+ tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); |
5828 |
+ } |
5829 |
++EXPORT_SYMBOL(tcp_send_ack); |
5830 |
+ |
5831 |
+ /* This routine sends a packet with an out of date sequence |
5832 |
+ * number. It assumes the other end will try to ack it. |
5833 |
+@@ -3188,7 +3276,7 @@ void tcp_send_ack(struct sock *sk) |
5834 |
+ * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is |
5835 |
+ * out-of-date with SND.UNA-1 to probe window. |
5836 |
+ */ |
5837 |
+-static int tcp_xmit_probe_skb(struct sock *sk, int urgent) |
5838 |
++int tcp_xmit_probe_skb(struct sock *sk, int urgent) |
5839 |
+ { |
5840 |
+ struct tcp_sock *tp = tcp_sk(sk); |
5841 |
+ struct sk_buff *skb; |
5842 |
+@@ -3270,7 +3358,7 @@ void tcp_send_probe0(struct sock *sk) |
5843 |
+ struct tcp_sock *tp = tcp_sk(sk); |
5844 |
+ int err; |
5845 |
+ |
5846 |
+- err = tcp_write_wakeup(sk); |
5847 |
++ err = tp->ops->write_wakeup(sk); |
5848 |
+ |
5849 |
+ if (tp->packets_out || !tcp_send_head(sk)) { |
5850 |
+ /* Cancel probe timer, if it is not required. */ |
5851 |
+@@ -3301,3 +3389,18 @@ void tcp_send_probe0(struct sock *sk) |
5852 |
+ TCP_RTO_MAX); |
5853 |
+ } |
5854 |
+ } |
5855 |
++ |
5856 |
++int tcp_rtx_synack(struct sock *sk, struct request_sock *req) |
5857 |
++{ |
5858 |
++ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; |
5859 |
++ struct flowi fl; |
5860 |
++ int res; |
5861 |
++ |
5862 |
++ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); |
5863 |
++ if (!res) { |
5864 |
++ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); |
5865 |
++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); |
5866 |
++ } |
5867 |
++ return res; |
5868 |
++} |
5869 |
++EXPORT_SYMBOL(tcp_rtx_synack); |
5870 |
+diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c |
5871 |
+index 286227abed10..966b873cbf3e 100644 |
5872 |
+--- a/net/ipv4/tcp_timer.c |
5873 |
++++ b/net/ipv4/tcp_timer.c |
5874 |
+@@ -20,6 +20,7 @@ |
5875 |
+ |
5876 |
+ #include <linux/module.h> |
5877 |
+ #include <linux/gfp.h> |
5878 |
++#include <net/mptcp.h> |
5879 |
+ #include <net/tcp.h> |
5880 |
+ |
5881 |
+ int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; |
5882 |
+@@ -32,7 +33,7 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; |
5883 |
+ int sysctl_tcp_orphan_retries __read_mostly; |
5884 |
+ int sysctl_tcp_thin_linear_timeouts __read_mostly; |
5885 |
+ |
5886 |
+-static void tcp_write_err(struct sock *sk) |
5887 |
++void tcp_write_err(struct sock *sk) |
5888 |
+ { |
5889 |
+ sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; |
5890 |
+ sk->sk_error_report(sk); |
5891 |
+@@ -74,7 +75,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) |
5892 |
+ (!tp->snd_wnd && !tp->packets_out)) |
5893 |
+ do_reset = 1; |
5894 |
+ if (do_reset) |
5895 |
+- tcp_send_active_reset(sk, GFP_ATOMIC); |
5896 |
++ tp->ops->send_active_reset(sk, GFP_ATOMIC); |
5897 |
+ tcp_done(sk); |
5898 |
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); |
5899 |
+ return 1; |
5900 |
+@@ -124,10 +125,8 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) |
5901 |
+ * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if |
5902 |
+ * syn_set flag is set. |
5903 |
+ */ |
5904 |
+-static bool retransmits_timed_out(struct sock *sk, |
5905 |
+- unsigned int boundary, |
5906 |
+- unsigned int timeout, |
5907 |
+- bool syn_set) |
5908 |
++bool retransmits_timed_out(struct sock *sk, unsigned int boundary, |
5909 |
++ unsigned int timeout, bool syn_set) |
5910 |
+ { |
5911 |
+ unsigned int linear_backoff_thresh, start_ts; |
5912 |
+ unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; |
5913 |
+@@ -153,7 +152,7 @@ static bool retransmits_timed_out(struct sock *sk, |
5914 |
+ } |
5915 |
+ |
5916 |
+ /* A write timeout has occurred. Process the after effects. */ |
5917 |
+-static int tcp_write_timeout(struct sock *sk) |
5918 |
++int tcp_write_timeout(struct sock *sk) |
5919 |
+ { |
5920 |
+ struct inet_connection_sock *icsk = inet_csk(sk); |
5921 |
+ struct tcp_sock *tp = tcp_sk(sk); |
5922 |
+@@ -171,6 +170,10 @@ static int tcp_write_timeout(struct sock *sk) |
5923 |
+ } |
5924 |
+ retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
5925 |
+ syn_set = true; |
5926 |
++ /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */ |
5927 |
++ if (tcp_sk(sk)->request_mptcp && |
5928 |
++ icsk->icsk_retransmits >= mptcp_sysctl_syn_retries()) |
5929 |
++ tcp_sk(sk)->request_mptcp = 0; |
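++ /* Clearing request_mptcp here is what gives connection setup its |
++ * fallback to regular TCP: once mptcp_sysctl_syn_retries() SYNs |
++ * carrying MP_CAPABLE have gone unanswered (e.g. because a |
++ * middlebox strips unknown TCP options), further SYN retransmits |
++ * are sent as plain TCP. |
++ */ |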
5930 |
+ } else { |
5931 |
+ if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { |
5932 |
+ /* Black hole detection */ |
5933 |
+@@ -251,18 +254,22 @@ out: |
5934 |
+ static void tcp_delack_timer(unsigned long data) |
5935 |
+ { |
5936 |
+ struct sock *sk = (struct sock *)data; |
5937 |
++ struct tcp_sock *tp = tcp_sk(sk); |
5938 |
++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; |
5939 |
+ |
5940 |
+- bh_lock_sock(sk); |
5941 |
+- if (!sock_owned_by_user(sk)) { |
5942 |
++ bh_lock_sock(meta_sk); |
5943 |
++ if (!sock_owned_by_user(meta_sk)) { |
5944 |
+ tcp_delack_timer_handler(sk); |
5945 |
+ } else { |
5946 |
+ inet_csk(sk)->icsk_ack.blocked = 1; |
5947 |
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); |
5948 |
++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED); |
5949 |
+ /* delegate our work to tcp_release_cb() */ |
5950 |
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) |
5951 |
+ sock_hold(sk); |
5952 |
++ if (mptcp(tp)) |
5953 |
++ mptcp_tsq_flags(sk); |
5954 |
+ } |
5955 |
+- bh_unlock_sock(sk); |
5956 |
++ bh_unlock_sock(meta_sk); |
5957 |
+ sock_put(sk); |
5958 |
+ } |
5959 |
+ |
5960 |
+@@ -479,6 +486,10 @@ out_reset_timer: |
5961 |
+ __sk_dst_reset(sk); |
5962 |
+ |
5963 |
+ out:; |
5964 |
++ if (mptcp(tp)) { |
5965 |
++ mptcp_reinject_data(sk, 1); |
5966 |
++ mptcp_set_rto(sk); |
5967 |
++ } |
5968 |
+ } |
5969 |
+ |
5970 |
+ void tcp_write_timer_handler(struct sock *sk) |
5971 |
+@@ -505,7 +516,7 @@ void tcp_write_timer_handler(struct sock *sk) |
5972 |
+ break; |
5973 |
+ case ICSK_TIME_RETRANS: |
5974 |
+ icsk->icsk_pending = 0; |
5975 |
+- tcp_retransmit_timer(sk); |
5976 |
++ tcp_sk(sk)->ops->retransmit_timer(sk); |
5977 |
+ break; |
5978 |
+ case ICSK_TIME_PROBE0: |
5979 |
+ icsk->icsk_pending = 0; |
5980 |
+@@ -520,16 +531,19 @@ out: |
5981 |
+ static void tcp_write_timer(unsigned long data) |
5982 |
+ { |
5983 |
+ struct sock *sk = (struct sock *)data; |
5984 |
++ struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk; |
5985 |
+ |
5986 |
+- bh_lock_sock(sk); |
5987 |
+- if (!sock_owned_by_user(sk)) { |
5988 |
++ bh_lock_sock(meta_sk); |
5989 |
++ if (!sock_owned_by_user(meta_sk)) { |
5990 |
+ tcp_write_timer_handler(sk); |
5991 |
+ } else { |
5992 |
+ /* delegate our work to tcp_release_cb() */ |
5993 |
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) |
5994 |
+ sock_hold(sk); |
5995 |
++ if (mptcp(tcp_sk(sk))) |
5996 |
++ mptcp_tsq_flags(sk); |
5997 |
+ } |
5998 |
+- bh_unlock_sock(sk); |
5999 |
++ bh_unlock_sock(meta_sk); |
6000 |
+ sock_put(sk); |
6001 |
+ } |
6002 |
+ |
6003 |
+@@ -566,11 +580,12 @@ static void tcp_keepalive_timer (unsigned long data) |
6004 |
+ struct sock *sk = (struct sock *) data; |
6005 |
+ struct inet_connection_sock *icsk = inet_csk(sk); |
6006 |
+ struct tcp_sock *tp = tcp_sk(sk); |
6007 |
++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; |
6008 |
+ u32 elapsed; |
6009 |
+ |
6010 |
+ /* Only process if socket is not in use. */ |
6011 |
+- bh_lock_sock(sk); |
6012 |
+- if (sock_owned_by_user(sk)) { |
6013 |
++ bh_lock_sock(meta_sk); |
6014 |
++ if (sock_owned_by_user(meta_sk)) { |
6015 |
+ /* Try again later. */ |
6016 |
+ inet_csk_reset_keepalive_timer (sk, HZ/20); |
6017 |
+ goto out; |
6018 |
+@@ -581,16 +596,38 @@ static void tcp_keepalive_timer (unsigned long data) |
6019 |
+ goto out; |
6020 |
+ } |
6021 |
+ |
6022 |
++ if (tp->send_mp_fclose) { |
6023 |
++ /* MUST do this before tcp_write_timeout, because retrans_stamp |
6024 |
++ * may have been reset to 0 elsewhere while we are |
6025 |
++ * retransmitting MP_FASTCLOSE. We would then crash, because |
6026 |
++ * retransmits_timed_out accesses the meta-write-queue. |
6027 |
++ * |
6028 |
++ * We make sure that the timestamp is != 0. |
6029 |
++ */ |
6030 |
++ if (!tp->retrans_stamp) |
6031 |
++ tp->retrans_stamp = tcp_time_stamp ? : 1; |
6032 |
++ |
6033 |
++ if (tcp_write_timeout(sk)) |
6034 |
++ goto out; |
6035 |
++ |
6036 |
++ tcp_send_ack(sk); |
6037 |
++ icsk->icsk_retransmits++; |
6038 |
++ |
6039 |
++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); |
6040 |
++ elapsed = icsk->icsk_rto; |
6041 |
++ goto resched; |
6042 |
++ } |
6043 |
++ |
6044 |
+ if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { |
6045 |
+ if (tp->linger2 >= 0) { |
6046 |
+ const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; |
6047 |
+ |
6048 |
+ if (tmo > 0) { |
6049 |
+- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); |
6050 |
++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo); |
6051 |
+ goto out; |
6052 |
+ } |
6053 |
+ } |
6054 |
+- tcp_send_active_reset(sk, GFP_ATOMIC); |
6055 |
++ tp->ops->send_active_reset(sk, GFP_ATOMIC); |
6056 |
+ goto death; |
6057 |
+ } |
6058 |
+ |
6059 |
+@@ -614,11 +651,11 @@ static void tcp_keepalive_timer (unsigned long data) |
6060 |
+ icsk->icsk_probes_out > 0) || |
6061 |
+ (icsk->icsk_user_timeout == 0 && |
6062 |
+ icsk->icsk_probes_out >= keepalive_probes(tp))) { |
6063 |
+- tcp_send_active_reset(sk, GFP_ATOMIC); |
6064 |
++ tp->ops->send_active_reset(sk, GFP_ATOMIC); |
6065 |
+ tcp_write_err(sk); |
6066 |
+ goto out; |
6067 |
+ } |
6068 |
+- if (tcp_write_wakeup(sk) <= 0) { |
6069 |
++ if (tp->ops->write_wakeup(sk) <= 0) { |
6070 |
+ icsk->icsk_probes_out++; |
6071 |
+ elapsed = keepalive_intvl_when(tp); |
6072 |
+ } else { |
6073 |
+@@ -642,7 +679,7 @@ death: |
6074 |
+ tcp_done(sk); |
6075 |
+ |
6076 |
+ out: |
6077 |
+- bh_unlock_sock(sk); |
6078 |
++ bh_unlock_sock(meta_sk); |
6079 |
+ sock_put(sk); |
6080 |
+ } |
6081 |
+ |
6082 |
+diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c |
6083 |
+index 5667b3003af9..7139c2973fd2 100644 |
6084 |
+--- a/net/ipv6/addrconf.c |
6085 |
++++ b/net/ipv6/addrconf.c |
6086 |
+@@ -760,6 +760,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) |
6087 |
+ |
6088 |
+ kfree_rcu(ifp, rcu); |
6089 |
+ } |
6090 |
++EXPORT_SYMBOL(inet6_ifa_finish_destroy); |
6091 |
+ |
6092 |
+ static void |
6093 |
+ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) |
6094 |
+diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c |
6095 |
+index 7cb4392690dd..7057afbca4df 100644 |
6096 |
+--- a/net/ipv6/af_inet6.c |
6097 |
++++ b/net/ipv6/af_inet6.c |
6098 |
+@@ -97,8 +97,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) |
6099 |
+ return (struct ipv6_pinfo *)(((u8 *)sk) + offset); |
6100 |
+ } |
6101 |
+ |
6102 |
+-static int inet6_create(struct net *net, struct socket *sock, int protocol, |
6103 |
+- int kern) |
6104 |
++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern) |
6105 |
+ { |
6106 |
+ struct inet_sock *inet; |
6107 |
+ struct ipv6_pinfo *np; |
6108 |
+diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c |
6109 |
+index a245e5ddffbd..99c892b8992d 100644 |
6110 |
+--- a/net/ipv6/inet6_connection_sock.c |
6111 |
++++ b/net/ipv6/inet6_connection_sock.c |
6112 |
+@@ -96,8 +96,8 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk, |
6113 |
+ /* |
6114 |
+ * request_sock (formerly open request) hash tables. |
6115 |
+ */ |
6116 |
+-static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, |
6117 |
+- const u32 rnd, const u32 synq_hsize) |
6118 |
++u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, |
6119 |
++ const u32 rnd, const u32 synq_hsize) |
6120 |
+ { |
6121 |
+ u32 c; |
6122 |
+ |
6123 |
+diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c |
6124 |
+index edb58aff4ae7..ea4d9fda0927 100644 |
6125 |
+--- a/net/ipv6/ipv6_sockglue.c |
6126 |
++++ b/net/ipv6/ipv6_sockglue.c |
6127 |
+@@ -48,6 +48,8 @@ |
6128 |
+ #include <net/addrconf.h> |
6129 |
+ #include <net/inet_common.h> |
6130 |
+ #include <net/tcp.h> |
6131 |
++#include <net/mptcp.h> |
6132 |
++#include <net/mptcp_v4.h> |
6133 |
+ #include <net/udp.h> |
6134 |
+ #include <net/udplite.h> |
6135 |
+ #include <net/xfrm.h> |
6136 |
+@@ -196,7 +198,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, |
6137 |
+ sock_prot_inuse_add(net, &tcp_prot, 1); |
6138 |
+ local_bh_enable(); |
6139 |
+ sk->sk_prot = &tcp_prot; |
6140 |
+- icsk->icsk_af_ops = &ipv4_specific; |
6141 |
++#ifdef CONFIG_MPTCP |
6142 |
++ if (is_mptcp_enabled(sk)) |
6143 |
++ icsk->icsk_af_ops = &mptcp_v4_specific; |
6144 |
++ else |
6145 |
++#endif |
6146 |
++ icsk->icsk_af_ops = &ipv4_specific; |
6147 |
+ sk->sk_socket->ops = &inet_stream_ops; |
6148 |
+ sk->sk_family = PF_INET; |
6149 |
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
6150 |
+diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c |
6151 |
+index a822b880689b..b2b38869d795 100644 |
6152 |
+--- a/net/ipv6/syncookies.c |
6153 |
++++ b/net/ipv6/syncookies.c |
6154 |
+@@ -181,13 +181,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) |
6155 |
+ |
6156 |
+ /* check for timestamp cookie support */ |
6157 |
+ memset(&tcp_opt, 0, sizeof(tcp_opt)); |
6158 |
+- tcp_parse_options(skb, &tcp_opt, 0, NULL); |
6159 |
++ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL); |
6160 |
+ |
6161 |
+ if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) |
6162 |
+ goto out; |
6163 |
+ |
6164 |
+ ret = NULL; |
6165 |
+- req = inet6_reqsk_alloc(&tcp6_request_sock_ops); |
6166 |
++ req = inet_reqsk_alloc(&tcp6_request_sock_ops); |
6167 |
+ if (!req) |
6168 |
+ goto out; |
6169 |
+ |
6170 |
+@@ -255,10 +255,10 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) |
6171 |
+ } |
6172 |
+ |
6173 |
+ req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); |
6174 |
+- tcp_select_initial_window(tcp_full_space(sk), req->mss, |
6175 |
+- &req->rcv_wnd, &req->window_clamp, |
6176 |
+- ireq->wscale_ok, &rcv_wscale, |
6177 |
+- dst_metric(dst, RTAX_INITRWND)); |
6178 |
++ tp->ops->select_initial_window(tcp_full_space(sk), req->mss, |
6179 |
++ &req->rcv_wnd, &req->window_clamp, |
6180 |
++ ireq->wscale_ok, &rcv_wscale, |
6181 |
++ dst_metric(dst, RTAX_INITRWND), sk); |
6182 |
+ |
6183 |
+ ireq->rcv_wscale = rcv_wscale; |
6184 |
+ |
6185 |
+diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c |
6186 |
+index 229239ad96b1..fda94d71666e 100644 |
6187 |
+--- a/net/ipv6/tcp_ipv6.c |
6188 |
++++ b/net/ipv6/tcp_ipv6.c |
6189 |
+@@ -63,6 +63,8 @@ |
6190 |
+ #include <net/inet_common.h> |
6191 |
+ #include <net/secure_seq.h> |
6192 |
+ #include <net/tcp_memcontrol.h> |
6193 |
++#include <net/mptcp.h> |
6194 |
++#include <net/mptcp_v6.h> |
6195 |
+ #include <net/busy_poll.h> |
6196 |
+ |
6197 |
+ #include <linux/proc_fs.h> |
6198 |
+@@ -71,12 +73,6 @@ |
6199 |
+ #include <linux/crypto.h> |
6200 |
+ #include <linux/scatterlist.h> |
6201 |
+ |
6202 |
+-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); |
6203 |
+-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, |
6204 |
+- struct request_sock *req); |
6205 |
+- |
6206 |
+-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); |
6207 |
+- |
6208 |
+ static const struct inet_connection_sock_af_ops ipv6_mapped; |
6209 |
+ static const struct inet_connection_sock_af_ops ipv6_specific; |
6210 |
+ #ifdef CONFIG_TCP_MD5SIG |
6211 |
+@@ -90,7 +86,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, |
6212 |
+ } |
6213 |
+ #endif |
6214 |
+ |
6215 |
+-static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) |
6216 |
++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) |
6217 |
+ { |
6218 |
+ struct dst_entry *dst = skb_dst(skb); |
6219 |
+ const struct rt6_info *rt = (const struct rt6_info *)dst; |
6220 |
+@@ -102,10 +98,11 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) |
6221 |
+ inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; |
6222 |
+ } |
6223 |
+ |
6224 |
+-static void tcp_v6_hash(struct sock *sk) |
6225 |
++void tcp_v6_hash(struct sock *sk) |
6226 |
+ { |
6227 |
+ if (sk->sk_state != TCP_CLOSE) { |
6228 |
+- if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { |
6229 |
++ if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped || |
6230 |
++ inet_csk(sk)->icsk_af_ops == &mptcp_v6_mapped) { |
6231 |
+ tcp_prot.hash(sk); |
6232 |
+ return; |
6233 |
+ } |
6234 |
+@@ -115,7 +112,7 @@ static void tcp_v6_hash(struct sock *sk) |
6235 |
+ } |
6236 |
+ } |
6237 |
+ |
6238 |
+-static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) |
6239 |
++__u32 tcp_v6_init_sequence(const struct sk_buff *skb) |
6240 |
+ { |
6241 |
+ return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, |
6242 |
+ ipv6_hdr(skb)->saddr.s6_addr32, |
6243 |
+@@ -123,7 +120,7 @@ static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) |
6244 |
+ tcp_hdr(skb)->source); |
6245 |
+ } |
6246 |
+ |
6247 |
+-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, |
6248 |
++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, |
6249 |
+ int addr_len) |
6250 |
+ { |
6251 |
+ struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; |
6252 |
+@@ -215,7 +212,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, |
6253 |
+ sin.sin_port = usin->sin6_port; |
6254 |
+ sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; |
6255 |
+ |
6256 |
+- icsk->icsk_af_ops = &ipv6_mapped; |
6257 |
++#ifdef CONFIG_MPTCP |
6258 |
++ if (is_mptcp_enabled(sk)) |
6259 |
++ icsk->icsk_af_ops = &mptcp_v6_mapped; |
6260 |
++ else |
6261 |
++#endif |
6262 |
++ icsk->icsk_af_ops = &ipv6_mapped; |
6263 |
+ sk->sk_backlog_rcv = tcp_v4_do_rcv; |
6264 |
+ #ifdef CONFIG_TCP_MD5SIG |
6265 |
+ tp->af_specific = &tcp_sock_ipv6_mapped_specific; |
6266 |
+@@ -225,7 +227,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, |
6267 |
+ |
6268 |
+ if (err) { |
6269 |
+ icsk->icsk_ext_hdr_len = exthdrlen; |
6270 |
+- icsk->icsk_af_ops = &ipv6_specific; |
6271 |
++#ifdef CONFIG_MPTCP |
6272 |
++ if (is_mptcp_enabled(sk)) |
6273 |
++ icsk->icsk_af_ops = &mptcp_v6_specific; |
6274 |
++ else |
6275 |
++#endif |
6276 |
++ icsk->icsk_af_ops = &ipv6_specific; |
6277 |
+ sk->sk_backlog_rcv = tcp_v6_do_rcv; |
6278 |
+ #ifdef CONFIG_TCP_MD5SIG |
6279 |
+ tp->af_specific = &tcp_sock_ipv6_specific; |
6280 |
+@@ -337,7 +344,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, |
6281 |
+ const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; |
6282 |
+ const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); |
6283 |
+ struct ipv6_pinfo *np; |
6284 |
+- struct sock *sk; |
6285 |
++ struct sock *sk, *meta_sk; |
6286 |
+ int err; |
6287 |
+ struct tcp_sock *tp; |
6288 |
+ struct request_sock *fastopen; |
6289 |
+@@ -358,8 +365,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, |
6290 |
+ return; |
6291 |
+ } |
6292 |
+ |
6293 |
+- bh_lock_sock(sk); |
6294 |
+- if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) |
6295 |
++ tp = tcp_sk(sk); |
6296 |
++ if (mptcp(tp)) |
6297 |
++ meta_sk = mptcp_meta_sk(sk); |
6298 |
++ else |
6299 |
++ meta_sk = sk; |
6300 |
++ |
6301 |
++ bh_lock_sock(meta_sk); |
6302 |
++ if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG) |
6303 |
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); |
6304 |
+ |
6305 |
+ if (sk->sk_state == TCP_CLOSE) |
6306 |
+@@ -370,7 +383,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, |
6307 |
+ goto out; |
6308 |
+ } |
6309 |
+ |
6310 |
+- tp = tcp_sk(sk); |
6311 |
+ seq = ntohl(th->seq); |
6312 |
+ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ |
6313 |
+ fastopen = tp->fastopen_rsk; |
6314 |
+@@ -403,11 +415,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, |
6315 |
+ goto out; |
6316 |
+ |
6317 |
+ tp->mtu_info = ntohl(info); |
6318 |
+- if (!sock_owned_by_user(sk)) |
6319 |
++ if (!sock_owned_by_user(meta_sk)) |
6320 |
+ tcp_v6_mtu_reduced(sk); |
6321 |
+- else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, |
6322 |
++ else { |
6323 |
++ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, |
6324 |
+ &tp->tsq_flags)) |
6325 |
+- sock_hold(sk); |
6326 |
++ sock_hold(sk); |
6327 |
++ if (mptcp(tp)) |
6328 |
++ mptcp_tsq_flags(sk); |
6329 |
++ } |
6330 |
+ goto out; |
6331 |
+ } |
6332 |
+ |
6333 |
+@@ -417,7 +433,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, |
6334 |
+ switch (sk->sk_state) { |
6335 |
+ struct request_sock *req, **prev; |
6336 |
+ case TCP_LISTEN: |
6337 |
+- if (sock_owned_by_user(sk)) |
6338 |
++ if (sock_owned_by_user(meta_sk)) |
6339 |
+ goto out; |
6340 |
+ |
6341 |
+ req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, |
6342 |
+@@ -447,7 +463,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, |
6343 |
+ if (fastopen && fastopen->sk == NULL) |
6344 |
+ break; |
6345 |
+ |
6346 |
+- if (!sock_owned_by_user(sk)) { |
6347 |
++ if (!sock_owned_by_user(meta_sk)) { |
6348 |
+ sk->sk_err = err; |
6349 |
+ sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ |
6350 |
+ |
6351 |
+@@ -457,26 +473,27 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, |
6352 |
+ goto out; |
6353 |
+ } |
6354 |
+ |
6355 |
+- if (!sock_owned_by_user(sk) && np->recverr) { |
6356 |
++ if (!sock_owned_by_user(meta_sk) && np->recverr) { |
6357 |
+ sk->sk_err = err; |
6358 |
+ sk->sk_error_report(sk); |
6359 |
+ } else |
6360 |
+ sk->sk_err_soft = err; |
6361 |
+ |
6362 |
+ out: |
6363 |
+- bh_unlock_sock(sk); |
6364 |
++ bh_unlock_sock(meta_sk); |
6365 |
+ sock_put(sk); |
6366 |
+ } |
6367 |
+ |
6368 |
+ |
6369 |
+-static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, |
6370 |
+- struct flowi6 *fl6, |
6371 |
+- struct request_sock *req, |
6372 |
+- u16 queue_mapping, |
6373 |
+- struct tcp_fastopen_cookie *foc) |
6374 |
++int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, |
6375 |
++ struct flowi *fl, |
6376 |
++ struct request_sock *req, |
6377 |
++ u16 queue_mapping, |
6378 |
++ struct tcp_fastopen_cookie *foc) |
6379 |
+ { |
6380 |
+ struct inet_request_sock *ireq = inet_rsk(req); |
6381 |
+ struct ipv6_pinfo *np = inet6_sk(sk); |
6382 |
++ struct flowi6 *fl6 = &fl->u.ip6; |
6383 |
+ struct sk_buff *skb; |
6384 |
+ int err = -ENOMEM; |
6385 |
+ |
6386 |
+@@ -497,18 +514,21 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, |
6387 |
+ skb_set_queue_mapping(skb, queue_mapping); |
6388 |
+ err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); |
6389 |
+ err = net_xmit_eval(err); |
6390 |
++ if (!tcp_rsk(req)->snt_synack && !err) |
6391 |
++ tcp_rsk(req)->snt_synack = tcp_time_stamp; |
6392 |
+ } |
6393 |
+ |
6394 |
+ done: |
6395 |
+ return err; |
6396 |
+ } |
6397 |
+ |
6398 |
+-static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) |
6399 |
++int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) |
6400 |
+ { |
6401 |
+- struct flowi6 fl6; |
6402 |
++ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; |
6403 |
++ struct flowi fl; |
6404 |
+ int res; |
6405 |
+ |
6406 |
+- res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL); |
6407 |
++ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); |
6408 |
+ if (!res) { |
6409 |
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); |
6410 |
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); |
6411 |
+@@ -516,7 +536,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) |
6412 |
+ return res; |
6413 |
+ } |
6414 |
+ |
6415 |
+-static void tcp_v6_reqsk_destructor(struct request_sock *req) |
6416 |
++void tcp_v6_reqsk_destructor(struct request_sock *req) |
6417 |
+ { |
6418 |
+ kfree_skb(inet_rsk(req)->pktopts); |
6419 |
+ } |
6420 |
+@@ -718,27 +738,74 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) |
6421 |
+ } |
6422 |
+ #endif |
6423 |
+ |
6424 |
++static int tcp_v6_init_req(struct request_sock *req, struct sock *sk, |
6425 |
++ struct sk_buff *skb) |
6426 |
++{ |
6427 |
++ struct inet_request_sock *ireq = inet_rsk(req); |
6428 |
++ struct ipv6_pinfo *np = inet6_sk(sk); |
6429 |
++ |
6430 |
++ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; |
6431 |
++ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; |
6432 |
++ |
6433 |
++ ireq->ir_iif = sk->sk_bound_dev_if; |
6434 |
++ ireq->ir_mark = inet_request_mark(sk, skb); |
6435 |
++ |
6436 |
++ /* So that link locals have meaning */ |
6437 |
++ if (!sk->sk_bound_dev_if && |
6438 |
++ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) |
6439 |
++ ireq->ir_iif = inet6_iif(skb); |
6440 |
++ |
6441 |
++ if (!TCP_SKB_CB(skb)->when && |
6442 |
++ (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || |
6443 |
++ np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || |
6444 |
++ np->rxopt.bits.rxohlim || np->repflow)) { |
6445 |
++ atomic_inc(&skb->users); |
6446 |
++ ireq->pktopts = skb; |
6447 |
++ } |
6448 |
++ |
6449 |
++ return 0; |
6450 |
++} |
6451 |
++ |
6452 |
++static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl, |
6453 |
++ const struct request_sock *req, |
6454 |
++ bool *strict) |
6455 |
++{ |
6456 |
++ if (strict) |
6457 |
++ *strict = true; |
6458 |
++ return inet6_csk_route_req(sk, &fl->u.ip6, req); |
6459 |
++} |
6460 |
++ |
6461 |
+ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { |
6462 |
+ .family = AF_INET6, |
6463 |
+ .obj_size = sizeof(struct tcp6_request_sock), |
6464 |
+- .rtx_syn_ack = tcp_v6_rtx_synack, |
6465 |
++ .rtx_syn_ack = tcp_rtx_synack, |
6466 |
+ .send_ack = tcp_v6_reqsk_send_ack, |
6467 |
+ .destructor = tcp_v6_reqsk_destructor, |
6468 |
+ .send_reset = tcp_v6_send_reset, |
6469 |
+ .syn_ack_timeout = tcp_syn_ack_timeout, |
6470 |
+ }; |
6471 |
+ |
6472 |
++const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { |
6473 |
++ .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - |
6474 |
++ sizeof(struct ipv6hdr), |
6475 |
+ #ifdef CONFIG_TCP_MD5SIG |
6476 |
+-static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { |
6477 |
+ .md5_lookup = tcp_v6_reqsk_md5_lookup, |
6478 |
+ .calc_md5_hash = tcp_v6_md5_hash_skb, |
6479 |
+-}; |
6480 |
+ #endif |
6481 |
++ .init_req = tcp_v6_init_req, |
6482 |
++#ifdef CONFIG_SYN_COOKIES |
6483 |
++ .cookie_init_seq = cookie_v6_init_sequence, |
6484 |
++#endif |
6485 |
++ .route_req = tcp_v6_route_req, |
6486 |
++ .init_seq = tcp_v6_init_sequence, |
6487 |
++ .send_synack = tcp_v6_send_synack, |
6488 |
++ .queue_hash_add = inet6_csk_reqsk_queue_hash_add, |
6489 |
++}; |
6490 |
+ |
6491 |
+-static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, |
6492 |
+- u32 tsval, u32 tsecr, int oif, |
6493 |
+- struct tcp_md5sig_key *key, int rst, u8 tclass, |
6494 |
+- u32 label) |
6495 |
++static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, |
6496 |
++ u32 data_ack, u32 win, u32 tsval, u32 tsecr, |
6497 |
++ int oif, struct tcp_md5sig_key *key, int rst, |
6498 |
++ u8 tclass, u32 label, int mptcp) |
6499 |
+ { |
6500 |
+ const struct tcphdr *th = tcp_hdr(skb); |
6501 |
+ struct tcphdr *t1; |
6502 |
+@@ -756,7 +823,10 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, |
6503 |
+ if (key) |
6504 |
+ tot_len += TCPOLEN_MD5SIG_ALIGNED; |
6505 |
+ #endif |
6506 |
+- |
6507 |
++#ifdef CONFIG_MPTCP |
6508 |
++ if (mptcp) |
6509 |
++ tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK; |
6510 |
++#endif |
6511 |
+ buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, |
6512 |
+ GFP_ATOMIC); |
6513 |
+ if (buff == NULL) |
6514 |
+@@ -794,6 +864,17 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, |
6515 |
+ tcp_v6_md5_hash_hdr((__u8 *)topt, key, |
6516 |
+ &ipv6_hdr(skb)->saddr, |
6517 |
+ &ipv6_hdr(skb)->daddr, t1); |
6518 |
++ topt += 4; |
6519 |
++ } |
6520 |
++#endif |
6521 |
++#ifdef CONFIG_MPTCP |
6522 |
++ if (mptcp) { |
6523 |
++ /* Construction of 32-bit data_ack */ |
6524 |
++ *topt++ = htonl((TCPOPT_MPTCP << 24) | |
6525 |
++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) | |
6526 |
++ (0x20 << 8) | |
6527 |
++ (0x01)); |
6528 |
++ *topt++ = htonl(data_ack); |
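++ /* For reference, the word built above follows the RFC 6824 DSS |
++ * option layout: byte 0 is the option kind (TCPOPT_MPTCP = 30), |
++ * byte 1 the option length, byte 2 carries the DSS subtype (0x2) |
++ * in its high nibble, and byte 3 is the flags field with only the |
++ * 'A' bit (0x01) set, announcing the 4-byte data_ack written in |
++ * the next word. |
++ */ |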
6529 |
+ } |
6530 |
+ #endif |
6531 |
+ |
6532 |
+@@ -834,7 +915,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, |
6533 |
+ kfree_skb(buff); |
6534 |
+ } |
6535 |
+ |
6536 |
+-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) |
6537 |
++void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) |
6538 |
+ { |
6539 |
+ const struct tcphdr *th = tcp_hdr(skb); |
6540 |
+ u32 seq = 0, ack_seq = 0; |
6541 |
+@@ -891,7 +972,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) |
6542 |
+ (th->doff << 2); |
6543 |
+ |
6544 |
+ oif = sk ? sk->sk_bound_dev_if : 0; |
6545 |
+- tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); |
6546 |
++ tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, 0, oif, key, 1, 0, 0, 0); |
6547 |
+ |
6548 |
+ #ifdef CONFIG_TCP_MD5SIG |
6549 |
+ release_sk1: |
6550 |
+@@ -902,45 +983,52 @@ release_sk1: |
6551 |
+ #endif |
6552 |
+ } |
6553 |
+ |
6554 |
+-static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, |
6555 |
++static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack, |
6556 |
+ u32 win, u32 tsval, u32 tsecr, int oif, |
6557 |
+ struct tcp_md5sig_key *key, u8 tclass, |
6558 |
+- u32 label) |
6559 |
++ u32 label, int mptcp) |
6560 |
+ { |
6561 |
+- tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, oif, key, 0, tclass, |
6562 |
+- label); |
6563 |
++ tcp_v6_send_response(skb, seq, ack, data_ack, win, tsval, tsecr, oif, |
6564 |
++ key, 0, tclass, label, mptcp); |
6565 |
+ } |
6566 |
+ |
6567 |
+ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) |
6568 |
+ { |
6569 |
+ struct inet_timewait_sock *tw = inet_twsk(sk); |
6570 |
+ struct tcp_timewait_sock *tcptw = tcp_twsk(sk); |
6571 |
++ u32 data_ack = 0; |
6572 |
++ int mptcp = 0; |
6573 |
+ |
6574 |
++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) { |
6575 |
++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt; |
6576 |
++ mptcp = 1; |
6577 |
++ } |
6578 |
+ tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, |
6579 |
++ data_ack, |
6580 |
+ tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, |
6581 |
+ tcp_time_stamp + tcptw->tw_ts_offset, |
6582 |
+ tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), |
6583 |
+- tw->tw_tclass, (tw->tw_flowlabel << 12)); |
6584 |
++ tw->tw_tclass, (tw->tw_flowlabel << 12), mptcp); |
6585 |
+ |
6586 |
+ inet_twsk_put(tw); |
6587 |
+ } |
6588 |
+ |
6589 |
+-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, |
6590 |
+- struct request_sock *req) |
6591 |
++void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, |
6592 |
++ struct request_sock *req) |
6593 |
+ { |
6594 |
+ /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV |
6595 |
+ * sk->sk_state == TCP_SYN_RECV -> for Fast Open. |
6596 |
+ */ |
6597 |
+ tcp_v6_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? |
6598 |
+ tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, |
6599 |
+- tcp_rsk(req)->rcv_nxt, |
6600 |
++ tcp_rsk(req)->rcv_nxt, 0, |
6601 |
+ req->rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, |
6602 |
+ tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), |
6603 |
+- 0, 0); |
6604 |
++ 0, 0, 0); |
6605 |
+ } |
6606 |
+ |
6607 |
+ |
6608 |
+-static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) |
6609 |
++struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) |
6610 |
+ { |
6611 |
+ struct request_sock *req, **prev; |
6612 |
+ const struct tcphdr *th = tcp_hdr(skb); |
6613 |
+@@ -959,7 +1047,13 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) |
6614 |
+ |
6615 |
+ if (nsk) { |
6616 |
+ if (nsk->sk_state != TCP_TIME_WAIT) { |
6617 |
++ /* Don't lock again the meta-sk. It has been locked |
6618 |
++ * before mptcp_v6_do_rcv. |
6619 |
++ */ |
6620 |
++ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk)) |
6621 |
++ bh_lock_sock(mptcp_meta_sk(nsk)); |
6622 |
+ bh_lock_sock(nsk); |
6623 |
++ |
6624 |
+ return nsk; |
6625 |
+ } |
6626 |
+ inet_twsk_put(inet_twsk(nsk)); |
6627 |
+@@ -973,161 +1067,25 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) |
6628 |
+ return sk; |
6629 |
+ } |
6630 |
+ |
6631 |
+-/* FIXME: this is substantially similar to the ipv4 code. |
6632 |
+- * Can some kind of merge be done? -- erics |
6633 |
+- */ |
6634 |
+-static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) |
6635 |
++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) |
6636 |
+ { |
6637 |
+- struct tcp_options_received tmp_opt; |
6638 |
+- struct request_sock *req; |
6639 |
+- struct inet_request_sock *ireq; |
6640 |
+- struct ipv6_pinfo *np = inet6_sk(sk); |
6641 |
+- struct tcp_sock *tp = tcp_sk(sk); |
6642 |
+- __u32 isn = TCP_SKB_CB(skb)->when; |
6643 |
+- struct dst_entry *dst = NULL; |
6644 |
+- struct tcp_fastopen_cookie foc = { .len = -1 }; |
6645 |
+- bool want_cookie = false, fastopen; |
6646 |
+- struct flowi6 fl6; |
6647 |
+- int err; |
6648 |
+- |
6649 |
+ if (skb->protocol == htons(ETH_P_IP)) |
6650 |
+ return tcp_v4_conn_request(sk, skb); |
6651 |
+ |
6652 |
+ if (!ipv6_unicast_destination(skb)) |
6653 |
+ goto drop; |
6654 |
+ |
6655 |
+- if ((sysctl_tcp_syncookies == 2 || |
6656 |
+- inet_csk_reqsk_queue_is_full(sk)) && !isn) { |
6657 |
+- want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6"); |
6658 |
+- if (!want_cookie) |
6659 |
+- goto drop; |
6660 |
+- } |
6661 |
+- |
6662 |
+- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { |
6663 |
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); |
6664 |
+- goto drop; |
6665 |
+- } |
6666 |
+- |
6667 |
+- req = inet6_reqsk_alloc(&tcp6_request_sock_ops); |
6668 |
+- if (req == NULL) |
6669 |
+- goto drop; |
6670 |
+- |
6671 |
+-#ifdef CONFIG_TCP_MD5SIG |
6672 |
+- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; |
6673 |
+-#endif |
6674 |
+- |
6675 |
+- tcp_clear_options(&tmp_opt); |
6676 |
+- tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); |
6677 |
+- tmp_opt.user_mss = tp->rx_opt.user_mss; |
6678 |
+- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); |
6679 |
+- |
6680 |
+- if (want_cookie && !tmp_opt.saw_tstamp) |
6681 |
+- tcp_clear_options(&tmp_opt); |
6682 |
++ return tcp_conn_request(&tcp6_request_sock_ops, |
6683 |
++ &tcp_request_sock_ipv6_ops, sk, skb); |
6684 |
+ |
6685 |
+- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; |
6686 |
+- tcp_openreq_init(req, &tmp_opt, skb); |
6687 |
+- |
6688 |
+- ireq = inet_rsk(req); |
6689 |
+- ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; |
6690 |
+- ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; |
6691 |
+- if (!want_cookie || tmp_opt.tstamp_ok) |
6692 |
+- TCP_ECN_create_request(req, skb, sock_net(sk)); |
6693 |
+- |
6694 |
+- ireq->ir_iif = sk->sk_bound_dev_if; |
6695 |
+- ireq->ir_mark = inet_request_mark(sk, skb); |
6696 |
+- |
6697 |
+- /* So that link locals have meaning */ |
6698 |
+- if (!sk->sk_bound_dev_if && |
6699 |
+- ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) |
6700 |
+- ireq->ir_iif = inet6_iif(skb); |
6701 |
+- |
6702 |
+- if (!isn) { |
6703 |
+- if (ipv6_opt_accepted(sk, skb) || |
6704 |
+- np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || |
6705 |
+- np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || |
6706 |
+- np->repflow) { |
6707 |
+- atomic_inc(&skb->users); |
6708 |
+- ireq->pktopts = skb; |
6709 |
+- } |
6710 |
+- |
6711 |
+- if (want_cookie) { |
6712 |
+- isn = cookie_v6_init_sequence(sk, skb, &req->mss); |
6713 |
+- req->cookie_ts = tmp_opt.tstamp_ok; |
6714 |
+- goto have_isn; |
6715 |
+- } |
6716 |
+- |
6717 |
+- /* VJ's idea. We save last timestamp seen |
6718 |
+- * from the destination in peer table, when entering |
6719 |
+- * state TIME-WAIT, and check against it before |
6720 |
+- * accepting new connection request. |
6721 |
+- * |
6722 |
+- * If "isn" is not zero, this request hit alive |
6723 |
+- * timewait bucket, so that all the necessary checks |
6724 |
+- * are made in the function processing timewait state. |
6725 |
+- */ |
6726 |
+- if (tmp_opt.saw_tstamp && |
6727 |
+- tcp_death_row.sysctl_tw_recycle && |
6728 |
+- (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) { |
6729 |
+- if (!tcp_peer_is_proven(req, dst, true)) { |
6730 |
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); |
6731 |
+- goto drop_and_release; |
6732 |
+- } |
6733 |
+- } |
6734 |
+- /* Kill the following clause, if you dislike this way. */ |
6735 |
+- else if (!sysctl_tcp_syncookies && |
6736 |
+- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
6737 |
+- (sysctl_max_syn_backlog >> 2)) && |
6738 |
+- !tcp_peer_is_proven(req, dst, false)) { |
6739 |
+- /* Without syncookies last quarter of |
6740 |
+- * backlog is filled with destinations, |
6741 |
+- * proven to be alive. |
6742 |
+- * It means that we continue to communicate |
6743 |
+- * to destinations, already remembered |
6744 |
+- * to the moment of synflood. |
6745 |
+- */ |
6746 |
+- LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n", |
6747 |
+- &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source)); |
6748 |
+- goto drop_and_release; |
6749 |
+- } |
6750 |
+- |
6751 |
+- isn = tcp_v6_init_sequence(skb); |
6752 |
+- } |
6753 |
+-have_isn: |
6754 |
+- |
6755 |
+- if (security_inet_conn_request(sk, skb, req)) |
6756 |
+- goto drop_and_release; |
6757 |
+- |
6758 |
+- if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL) |
6759 |
+- goto drop_and_free; |
6760 |
+- |
6761 |
+- tcp_rsk(req)->snt_isn = isn; |
6762 |
+- tcp_rsk(req)->snt_synack = tcp_time_stamp; |
6763 |
+- tcp_openreq_init_rwin(req, sk, dst); |
6764 |
+- fastopen = !want_cookie && |
6765 |
+- tcp_try_fastopen(sk, skb, req, &foc, dst); |
6766 |
+- err = tcp_v6_send_synack(sk, dst, &fl6, req, |
6767 |
+- skb_get_queue_mapping(skb), &foc); |
6768 |
+- if (!fastopen) { |
6769 |
+- if (err || want_cookie) |
6770 |
+- goto drop_and_free; |
6771 |
+- |
6772 |
+- tcp_rsk(req)->listener = NULL; |
6773 |
+- inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); |
6774 |
+- } |
6775 |
+- return 0; |
6776 |
+- |
6777 |
+-drop_and_release: |
6778 |
+- dst_release(dst); |
6779 |
+-drop_and_free: |
6780 |
+- reqsk_free(req); |
6781 |
+ drop: |
6782 |
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); |
6783 |
+ return 0; /* don't send reset */ |
6784 |
+ } |
6785 |
+ |
6786 |
+-static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, |
6787 |
+- struct request_sock *req, |
6788 |
+- struct dst_entry *dst) |
6789 |
++struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, |
6790 |
++ struct request_sock *req, |
6791 |
++ struct dst_entry *dst) |
6792 |
+ { |
6793 |
+ struct inet_request_sock *ireq; |
6794 |
+ struct ipv6_pinfo *newnp, *np = inet6_sk(sk); |
6795 |
+@@ -1165,7 +1123,12 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, |
6796 |
+ |
6797 |
+ newsk->sk_v6_rcv_saddr = newnp->saddr; |
6798 |
+ |
6799 |
+- inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; |
6800 |
++#ifdef CONFIG_MPTCP |
6801 |
++ if (is_mptcp_enabled(newsk)) |
6802 |
++ inet_csk(newsk)->icsk_af_ops = &mptcp_v6_mapped; |
6803 |
++ else |
6804 |
++#endif |
6805 |
++ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; |
6806 |
+ newsk->sk_backlog_rcv = tcp_v4_do_rcv; |
6807 |
+ #ifdef CONFIG_TCP_MD5SIG |
6808 |
+ newtp->af_specific = &tcp_sock_ipv6_mapped_specific; |
6809 |
+@@ -1329,7 +1292,7 @@ out: |
6810 |
+ * This is because we cannot sleep with the original spinlock |
6811 |
+ * held. |
6812 |
+ */ |
6813 |
+-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) |
6814 |
++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) |
6815 |
+ { |
6816 |
+ struct ipv6_pinfo *np = inet6_sk(sk); |
6817 |
+ struct tcp_sock *tp; |
6818 |
+@@ -1351,6 +1314,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) |
6819 |
+ goto discard; |
6820 |
+ #endif |
6821 |
+ |
6822 |
++ if (is_meta_sk(sk)) |
6823 |
++ return mptcp_v6_do_rcv(sk, skb); |
6824 |
++ |
6825 |
+ if (sk_filter(sk, skb)) |
6826 |
+ goto discard; |
6827 |
+ |
6828 |
+@@ -1472,7 +1438,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) |
6829 |
+ { |
6830 |
+ const struct tcphdr *th; |
6831 |
+ const struct ipv6hdr *hdr; |
6832 |
+- struct sock *sk; |
6833 |
++ struct sock *sk, *meta_sk = NULL; |
6834 |
+ int ret; |
6835 |
+ struct net *net = dev_net(skb->dev); |
6836 |
+ |
6837 |
+@@ -1503,18 +1469,43 @@ static int tcp_v6_rcv(struct sk_buff *skb) |
6838 |
+ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + |
6839 |
+ skb->len - th->doff*4); |
6840 |
+ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); |
6841 |
++#ifdef CONFIG_MPTCP |
6842 |
++ TCP_SKB_CB(skb)->mptcp_flags = 0; |
6843 |
++ TCP_SKB_CB(skb)->dss_off = 0; |
6844 |
++#endif |
6845 |
+ TCP_SKB_CB(skb)->when = 0; |
6846 |
+ TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); |
6847 |
+ TCP_SKB_CB(skb)->sacked = 0; |
6848 |
+ |
6849 |
+ sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); |
6850 |
+- if (!sk) |
6851 |
+- goto no_tcp_socket; |
6852 |
+ |
6853 |
+ process: |
6854 |
+- if (sk->sk_state == TCP_TIME_WAIT) |
6855 |
++ if (sk && sk->sk_state == TCP_TIME_WAIT) |
6856 |
+ goto do_time_wait; |
6857 |
+ |
6858 |
++#ifdef CONFIG_MPTCP |
6859 |
++ if (!sk && th->syn && !th->ack) { |
6860 |
++ int ret = mptcp_lookup_join(skb, NULL); |
6861 |
++ |
6862 |
++ if (ret < 0) { |
6863 |
++ tcp_v6_send_reset(NULL, skb); |
6864 |
++ goto discard_it; |
6865 |
++ } else if (ret > 0) { |
6866 |
++ return 0; |
6867 |
++ } |
6868 |
++ } |
6869 |
++ |
6870 |
++ /* Is there a pending request sock for this segment ? */ |
6871 |
++ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) { |
6872 |
++ if (sk) |
6873 |
++ sock_put(sk); |
6874 |
++ return 0; |
6875 |
++ } |
6876 |
++#endif |
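++ /* Return-value convention of the two helpers above, as used here: |
++ * a negative value means the SYN carried a bad MP_JOIN and is |
++ * answered with a reset, a positive (or true) value means the |
++ * segment was consumed by an MPTCP socket and we are done, and 0 |
++ * (or false) falls through to regular TCP processing. |
++ */ |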
6877 |
++ |
6878 |
++ if (!sk) |
6879 |
++ goto no_tcp_socket; |
6880 |
++ |
6881 |
+ if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { |
6882 |
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); |
6883 |
+ goto discard_and_relse; |
6884 |
+@@ -1529,11 +1520,21 @@ process: |
6885 |
+ sk_mark_napi_id(sk, skb); |
6886 |
+ skb->dev = NULL; |
6887 |
+ |
6888 |
+- bh_lock_sock_nested(sk); |
6889 |
++ if (mptcp(tcp_sk(sk))) { |
6890 |
++ meta_sk = mptcp_meta_sk(sk); |
6891 |
++ |
6892 |
++ bh_lock_sock_nested(meta_sk); |
6893 |
++ if (sock_owned_by_user(meta_sk)) |
6894 |
++ skb->sk = sk; |
6895 |
++ } else { |
6896 |
++ meta_sk = sk; |
6897 |
++ bh_lock_sock_nested(sk); |
6898 |
++ } |
6899 |
++ |
6900 |
+ ret = 0; |
6901 |
+- if (!sock_owned_by_user(sk)) { |
6902 |
++ if (!sock_owned_by_user(meta_sk)) { |
6903 |
+ #ifdef CONFIG_NET_DMA |
6904 |
+- struct tcp_sock *tp = tcp_sk(sk); |
6905 |
++ struct tcp_sock *tp = tcp_sk(meta_sk); |
6906 |
+ if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) |
6907 |
+ tp->ucopy.dma_chan = net_dma_find_channel(); |
6908 |
+ if (tp->ucopy.dma_chan) |
6909 |
+@@ -1541,16 +1542,17 @@ process: |
6910 |
+ else |
6911 |
+ #endif |
6912 |
+ { |
6913 |
+- if (!tcp_prequeue(sk, skb)) |
6914 |
++ if (!tcp_prequeue(meta_sk, skb)) |
6915 |
+ ret = tcp_v6_do_rcv(sk, skb); |
6916 |
+ } |
6917 |
+- } else if (unlikely(sk_add_backlog(sk, skb, |
6918 |
+- sk->sk_rcvbuf + sk->sk_sndbuf))) { |
6919 |
+- bh_unlock_sock(sk); |
6920 |
++ } else if (unlikely(sk_add_backlog(meta_sk, skb, |
6921 |
++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { |
6922 |
++ bh_unlock_sock(meta_sk); |
6923 |
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); |
6924 |
+ goto discard_and_relse; |
6925 |
+ } |
6926 |
+- bh_unlock_sock(sk); |
6927 |
++ |
6928 |
++ bh_unlock_sock(meta_sk); |
6929 |
+ |
6930 |
+ sock_put(sk); |
6931 |
+ return ret ? -1 : 0; |
6932 |
+@@ -1607,6 +1609,18 @@ do_time_wait: |
6933 |
+ sk = sk2; |
6934 |
+ goto process; |
6935 |
+ } |
6936 |
++#ifdef CONFIG_MPTCP |
6937 |
++ if (th->syn && !th->ack) { |
6938 |
++ int ret = mptcp_lookup_join(skb, inet_twsk(sk)); |
6939 |
++ |
6940 |
++ if (ret < 0) { |
6941 |
++ tcp_v6_send_reset(NULL, skb); |
6942 |
++ goto discard_it; |
6943 |
++ } else if (ret > 0) { |
6944 |
++ return 0; |
6945 |
++ } |
6946 |
++ } |
6947 |
++#endif |
6948 |
+ /* Fall through to ACK */ |
6949 |
+ } |
6950 |
+ case TCP_TW_ACK: |
6951 |
+@@ -1657,7 +1671,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb) |
6952 |
+ } |
6953 |
+ } |
6954 |
+ |
6955 |
+-static struct timewait_sock_ops tcp6_timewait_sock_ops = { |
6956 |
++struct timewait_sock_ops tcp6_timewait_sock_ops = { |
6957 |
+ .twsk_obj_size = sizeof(struct tcp6_timewait_sock), |
6958 |
+ .twsk_unique = tcp_twsk_unique, |
6959 |
+ .twsk_destructor = tcp_twsk_destructor, |
6960 |
+@@ -1730,7 +1744,12 @@ static int tcp_v6_init_sock(struct sock *sk) |
6961 |
+ |
6962 |
+ tcp_init_sock(sk); |
6963 |
+ |
6964 |
+- icsk->icsk_af_ops = &ipv6_specific; |
6965 |
++#ifdef CONFIG_MPTCP |
6966 |
++ if (is_mptcp_enabled(sk)) |
6967 |
++ icsk->icsk_af_ops = &mptcp_v6_specific; |
6968 |
++ else |
6969 |
++#endif |
6970 |
++ icsk->icsk_af_ops = &ipv6_specific; |
6971 |
+ |
6972 |
+ #ifdef CONFIG_TCP_MD5SIG |
6973 |
+ tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; |
6974 |
+@@ -1739,7 +1758,7 @@ static int tcp_v6_init_sock(struct sock *sk) |
6975 |
+ return 0; |
6976 |
+ } |
6977 |
+ |
6978 |
+-static void tcp_v6_destroy_sock(struct sock *sk) |
6979 |
++void tcp_v6_destroy_sock(struct sock *sk) |
6980 |
+ { |
6981 |
+ tcp_v4_destroy_sock(sk); |
6982 |
+ inet6_destroy_sock(sk); |
6983 |
+@@ -1924,12 +1943,28 @@ void tcp6_proc_exit(struct net *net) |
6984 |
+ static void tcp_v6_clear_sk(struct sock *sk, int size) |
6985 |
+ { |
6986 |
+ struct inet_sock *inet = inet_sk(sk); |
6987 |
++#ifdef CONFIG_MPTCP |
6988 |
++ struct tcp_sock *tp = tcp_sk(sk); |
6989 |
++ /* size_tk_table covers from the end of tk_table to the end of sk */ |
6990 |
++ int size_tk_table = size - offsetof(struct tcp_sock, tk_table) - |
6991 |
++ sizeof(tp->tk_table); |
6992 |
++#endif |
6993 |
+ |
6994 |
+ /* we do not want to clear pinet6 field, because of RCU lookups */ |
6995 |
+ sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6)); |
6996 |
+ |
6997 |
+ size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); |
6998 |
++ |
6999 |
++#ifdef CONFIG_MPTCP |
7000 |
++ /* We zero out only from pinet6 to tk_table */ |
7001 |
++ size -= size_tk_table + sizeof(tp->tk_table); |
7002 |
++#endif |
7003 |
+ memset(&inet->pinet6 + 1, 0, size); |
7004 |
++ |
7005 |
++#ifdef CONFIG_MPTCP |
7006 |
++ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size_tk_table); |
7007 |
++#endif |
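++ /* Resulting layout of the clear: the first memset zeroes from just |
++ * after pinet6 up to tk_table, tk_table itself is kept intact - |
++ * presumably, like pinet6, it may still be reached by concurrent |
++ * RCU lookups - and this second memset zeroes the remainder of the |
++ * socket behind tk_table. |
++ */ |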
7008 |
++ |
7009 |
+ } |
7010 |
+ |
7011 |
+ struct proto tcpv6_prot = { |
7012 |
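The reworked tcp_v6_clear_sk() must preserve two fields across the clear: pinet6 (still dereferenced by RCU lookups) and, under MPTCP, the tk_table nulls-node that keeps the meta-socket reachable through the token hashtable. The layout it relies on is roughly

    ... | pinet6 | <zeroed> | tk_table | <zeroed up to the end of sk> |

so the first memset() clears from just after pinet6 up to tk_table, and the second from just after tk_table to the end of the socket.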
+diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig |
7013 |
+new file mode 100644 |
7014 |
+index 000000000000..cdfc03adabf8 |
7015 |
+--- /dev/null |
7016 |
++++ b/net/mptcp/Kconfig |
7017 |
+@@ -0,0 +1,115 @@ |
7018 |
++# |
7019 |
++# MPTCP configuration |
7020 |
++# |
7021 |
++config MPTCP |
7022 |
++ bool "MPTCP protocol" |
7023 |
++ depends on (IPV6=y || IPV6=n) |
7024 |
++ ---help--- |
7025 |
++ This replaces the normal TCP stack with a Multipath TCP stack, |
7026 |
++ able to use several paths at once. |
7027 |
++ |
7028 |
++menuconfig MPTCP_PM_ADVANCED |
7029 |
++ bool "MPTCP: advanced path-manager control" |
7030 |
++ depends on MPTCP=y |
7031 |
++ ---help--- |
7032 |
++ Support for selection of different path-managers. You should choose 'Y' here, |
7033 |
++ because otherwise you will not actively create new MPTCP-subflows. |
7034 |
++ |
7035 |
++if MPTCP_PM_ADVANCED |
7036 |
++ |
7037 |
++config MPTCP_FULLMESH |
7038 |
++ tristate "MPTCP Full-Mesh Path-Manager" |
7039 |
++ depends on MPTCP=y |
7040 |
++ ---help--- |
7041 |
++ This path-management module will create a full-mesh among all IP-addresses. |
7042 |
++ |
7043 |
++config MPTCP_NDIFFPORTS |
7044 |
++ tristate "MPTCP ndiff-ports" |
7045 |
++ depends on MPTCP=y |
7046 |
++ ---help--- |
7047 |
++ This path-management module will create multiple subflows between the same |
7048 |
++ pair of IP-addresses, modifying the source-port. You can set the number |
7049 |
++ of subflows via the mptcp_ndiffports-sysctl. |
7050 |
++ |
7051 |
++config MPTCP_BINDER |
7052 |
++ tristate "MPTCP Binder" |
7053 |
++ depends on (MPTCP=y) |
7054 |
++ ---help--- |
7055 |
++ This path-management module works like ndiffports, and adds the sysctl |
7056 |
++	  option to set the gateway (and/or the path to take) for each additional subflow
7057 |
++ via Loose Source Routing (IPv4 only). |
7058 |
++ |
7059 |
++choice |
7060 |
++ prompt "Default MPTCP Path-Manager" |
7061 |
++ default DEFAULT |
7062 |
++ help |
7063 |
++ Select the Path-Manager of your choice |
7064 |
++ |
7065 |
++ config DEFAULT_FULLMESH |
7066 |
++ bool "Full mesh" if MPTCP_FULLMESH=y |
7067 |
++ |
7068 |
++ config DEFAULT_NDIFFPORTS |
7069 |
++ bool "ndiff-ports" if MPTCP_NDIFFPORTS=y |
7070 |
++ |
7071 |
++ config DEFAULT_BINDER |
7072 |
++ bool "binder" if MPTCP_BINDER=y |
7073 |
++ |
7074 |
++ config DEFAULT_DUMMY |
7075 |
++ bool "Default" |
7076 |
++ |
7077 |
++endchoice |
7078 |
++ |
7079 |
++endif |
7080 |
++ |
7081 |
++config DEFAULT_MPTCP_PM |
7082 |
++ string |
7083 |
++ default "default" if DEFAULT_DUMMY |
7084 |
++ default "fullmesh" if DEFAULT_FULLMESH |
7085 |
++ default "ndiffports" if DEFAULT_NDIFFPORTS |
7086 |
++ default "binder" if DEFAULT_BINDER |
7087 |
++ default "default" |
7088 |
++ |
7089 |
++menuconfig MPTCP_SCHED_ADVANCED |
7090 |
++ bool "MPTCP: advanced scheduler control" |
7091 |
++ depends on MPTCP=y |
7092 |
++ ---help--- |
7093 |
++ Support for selection of different schedulers. You should choose 'Y' here, |
7094 |
++ if you want to choose a different scheduler than the default one. |
7095 |
++ |
7096 |
++if MPTCP_SCHED_ADVANCED |
7097 |
++ |
7098 |
++config MPTCP_ROUNDROBIN |
7099 |
++ tristate "MPTCP Round-Robin" |
7100 |
++ depends on (MPTCP=y) |
7101 |
++ ---help--- |
7102 |
++	  This is a very simple round-robin scheduler. It probably has bad performance
7103 |
++ but might be interesting for researchers. |
7104 |
++ |
7105 |
++choice |
7106 |
++ prompt "Default MPTCP Scheduler" |
7107 |
++ default DEFAULT |
7108 |
++ help |
7109 |
++ Select the Scheduler of your choice |
7110 |
++ |
7111 |
++ config DEFAULT_SCHEDULER |
7112 |
++ bool "Default" |
7113 |
++ ---help--- |
7114 |
++ This is the default scheduler, sending first on the subflow |
7115 |
++ with the lowest RTT. |
7116 |
++ |
7117 |
++ config DEFAULT_ROUNDROBIN |
7118 |
++ bool "Round-Robin" if MPTCP_ROUNDROBIN=y |
7119 |
++ ---help--- |
7120 |
++	    This is the round-robin scheduler, sending in a round-robin
7121 |
++	    fashion.
7122 |
++ |
7123 |
++endchoice |
7124 |
++endif |
7125 |
++ |
7126 |
++config DEFAULT_MPTCP_SCHED |
7127 |
++ string |
7128 |
++ depends on (MPTCP=y) |
7129 |
++ default "default" if DEFAULT_SCHEDULER |
7130 |
++ default "roundrobin" if DEFAULT_ROUNDROBIN |
7131 |
++ default "default" |
7132 |
++ |
7133 |
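For illustration, a kernel configuration that enables MPTCP with the full-mesh path-manager as the default would contain (symbol values are an assumed example, not part of the patch):

    CONFIG_MPTCP=y
    CONFIG_MPTCP_PM_ADVANCED=y
    CONFIG_MPTCP_FULLMESH=y
    CONFIG_DEFAULT_FULLMESH=y
    CONFIG_DEFAULT_MPTCP_PM="fullmesh"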
+diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile |
7134 |
+new file mode 100644 |
7135 |
+index 000000000000..35561a7012e3 |
7136 |
+--- /dev/null |
7137 |
++++ b/net/mptcp/Makefile |
7138 |
+@@ -0,0 +1,20 @@ |
7139 |
++# |
7140 |
++## Makefile for MultiPath TCP support code. |
7141 |
++# |
7142 |
++# |
7143 |
++ |
7144 |
++obj-$(CONFIG_MPTCP) += mptcp.o |
7145 |
++ |
7146 |
++mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_ofo_queue.o mptcp_pm.o \ |
7147 |
++ mptcp_output.o mptcp_input.o mptcp_sched.o |
7148 |
++ |
7149 |
++obj-$(CONFIG_TCP_CONG_COUPLED) += mptcp_coupled.o |
7150 |
++obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o |
7151 |
++obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o |
7152 |
++obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o |
7153 |
++obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o |
7154 |
++obj-$(CONFIG_MPTCP_BINDER) += mptcp_binder.o |
7155 |
++obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o |
7156 |
++ |
7157 |
++mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o |
7158 |
++ |
7159 |
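The $(subst m,y,$(CONFIG_IPV6)) in the last rule folds a modular IPv6 build (=m) onto y, so mptcp_ipv6.o would still be linked into the built-in mptcp object in that case; with the Kconfig dependency on (IPV6=y || IPV6=n) above, this is mostly defensive.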
+diff --git a/net/mptcp/mptcp_binder.c b/net/mptcp/mptcp_binder.c |
7160 |
+new file mode 100644 |
7161 |
+index 000000000000..95d8da560715 |
7162 |
+--- /dev/null |
7163 |
++++ b/net/mptcp/mptcp_binder.c |
7164 |
+@@ -0,0 +1,487 @@ |
7165 |
++#include <linux/module.h> |
7166 |
++ |
7167 |
++#include <net/mptcp.h> |
7168 |
++#include <net/mptcp_v4.h> |
7169 |
++ |
7170 |
++#include <linux/route.h> |
7171 |
++#include <linux/inet.h> |
7172 |
++#include <linux/mroute.h> |
7173 |
++#include <linux/spinlock_types.h> |
7174 |
++#include <net/inet_ecn.h> |
7175 |
++#include <net/route.h> |
7176 |
++#include <net/xfrm.h> |
7177 |
++#include <net/compat.h> |
7178 |
++#include <linux/slab.h> |
7179 |
++ |
7180 |
++#define MPTCP_GW_MAX_LISTS 10 |
7181 |
++#define MPTCP_GW_LIST_MAX_LEN 6 |
7182 |
++#define MPTCP_GW_SYSCTL_MAX_LEN (15 * MPTCP_GW_LIST_MAX_LEN * \ |
7183 |
++ MPTCP_GW_MAX_LISTS) |
7184 |
++ |
7185 |
++struct mptcp_gw_list { |
7186 |
++ struct in_addr list[MPTCP_GW_MAX_LISTS][MPTCP_GW_LIST_MAX_LEN]; |
7187 |
++ u8 len[MPTCP_GW_MAX_LISTS]; |
7188 |
++}; |
7189 |
++ |
7190 |
++struct binder_priv { |
7191 |
++ /* Worker struct for subflow establishment */ |
7192 |
++ struct work_struct subflow_work; |
7193 |
++ |
7194 |
++ struct mptcp_cb *mpcb; |
7195 |
++ |
7196 |
++ /* Prevent multiple sub-sockets concurrently iterating over sockets */ |
7197 |
++ spinlock_t *flow_lock; |
7198 |
++}; |
7199 |
++ |
7200 |
++static struct mptcp_gw_list *mptcp_gws; |
7201 |
++static rwlock_t mptcp_gws_lock; |
7202 |
++ |
7203 |
++static int mptcp_binder_ndiffports __read_mostly = 1; |
7204 |
++ |
7205 |
++static char sysctl_mptcp_binder_gateways[MPTCP_GW_SYSCTL_MAX_LEN] __read_mostly; |
7206 |
++ |
7207 |
++static int mptcp_get_avail_list_ipv4(struct sock *sk) |
7208 |
++{ |
7209 |
++ int i, j, list_taken, opt_ret, opt_len; |
7210 |
++ unsigned char *opt_ptr, *opt_end_ptr, opt[MAX_IPOPTLEN]; |
7211 |
++ |
7212 |
++ for (i = 0; i < MPTCP_GW_MAX_LISTS; ++i) { |
7213 |
++ if (mptcp_gws->len[i] == 0) |
7214 |
++ goto error; |
7215 |
++ |
7216 |
++ mptcp_debug("mptcp_get_avail_list_ipv4: List %i\n", i); |
7217 |
++ list_taken = 0; |
7218 |
++ |
7219 |
++ /* Loop through all sub-sockets in this connection */ |
7220 |
++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk) { |
7221 |
++ mptcp_debug("mptcp_get_avail_list_ipv4: Next sock\n"); |
7222 |
++ |
7223 |
++ /* Reset length and options buffer, then retrieve |
7224 |
++ * from socket |
7225 |
++ */ |
7226 |
++ opt_len = MAX_IPOPTLEN; |
7227 |
++ memset(opt, 0, MAX_IPOPTLEN); |
7228 |
++ opt_ret = ip_getsockopt(sk, IPPROTO_IP, |
7229 |
++ IP_OPTIONS, opt, &opt_len); |
7230 |
++ if (opt_ret < 0) { |
7231 |
++ mptcp_debug(KERN_ERR "%s: MPTCP subsocket getsockopt() IP_OPTIONS failed, error %d\n", |
7232 |
++ __func__, opt_ret); |
7233 |
++ goto error; |
7234 |
++ } |
7235 |
++ |
7236 |
++ /* If socket has no options, it has no stake in this list */ |
7237 |
++ if (opt_len <= 0) |
7238 |
++ continue; |
7239 |
++ |
7240 |
++ /* Iterate options buffer */ |
7241 |
++ for (opt_ptr = &opt[0]; opt_ptr < &opt[opt_len]; opt_ptr++) { |
7242 |
++ if (*opt_ptr == IPOPT_LSRR) { |
7243 |
++ mptcp_debug("mptcp_get_avail_list_ipv4: LSRR options found\n"); |
7244 |
++ goto sock_lsrr; |
7245 |
++ } |
7246 |
++ } |
7247 |
++ continue; |
7248 |
++ |
7249 |
++sock_lsrr: |
7250 |
++ /* Pointer to the 2nd to last address */ |
7251 |
++ opt_end_ptr = opt_ptr+(*(opt_ptr+1))-4; |
7252 |
++ |
7253 |
++ /* Addresses start 3 bytes after type offset */ |
7254 |
++ opt_ptr += 3; |
7255 |
++ j = 0; |
7256 |
++ |
7257 |
++ /* Different length lists cannot be the same */ |
7258 |
++ if ((opt_end_ptr-opt_ptr)/4 != mptcp_gws->len[i]) |
7259 |
++ continue; |
7260 |
++ |
7261 |
++			/* Iterate while we are still inside the options list
7262 |
++ * and sysctl list |
7263 |
++ */ |
7264 |
++ while (opt_ptr < opt_end_ptr && j < mptcp_gws->len[i]) { |
7265 |
++ /* If there is a different address, this list must |
7266 |
++ * not be set on this socket |
7267 |
++ */ |
7268 |
++ if (memcmp(&mptcp_gws->list[i][j], opt_ptr, 4)) |
7269 |
++ break; |
7270 |
++ |
7271 |
++ /* Jump 4 bytes to next address */ |
7272 |
++ opt_ptr += 4; |
7273 |
++ j++; |
7274 |
++ } |
7275 |
++ |
7276 |
++ /* Reached the end without a differing address, lists |
7277 |
++ * are therefore identical. |
7278 |
++ */ |
7279 |
++ if (j == mptcp_gws->len[i]) { |
7280 |
++ mptcp_debug("mptcp_get_avail_list_ipv4: List already used\n"); |
7281 |
++ list_taken = 1; |
7282 |
++ break; |
7283 |
++ } |
7284 |
++ } |
7285 |
++ |
7286 |
++ /* Free list found if not taken by a socket */ |
7287 |
++ if (!list_taken) { |
7288 |
++ mptcp_debug("mptcp_get_avail_list_ipv4: List free\n"); |
7289 |
++ break; |
7290 |
++ } |
7291 |
++ } |
7292 |
++ |
7293 |
++ if (i >= MPTCP_GW_MAX_LISTS) |
7294 |
++ goto error; |
7295 |
++ |
7296 |
++ return i; |
7297 |
++error: |
7298 |
++ return -1; |
7299 |
++} |
7300 |
++ |
7301 |
++/* The list of addresses is parsed each time a new connection is opened, |
7302 |
++ * to make sure it's up to date. In case of error, all the lists are |
7303 |
++ * marked as unavailable and the subflow's fingerprint is set to 0. |
7304 |
++ */ |
7305 |
++static void mptcp_v4_add_lsrr(struct sock *sk, struct in_addr addr) |
7306 |
++{ |
7307 |
++ int i, j, ret; |
7308 |
++ unsigned char opt[MAX_IPOPTLEN] = {0}; |
7309 |
++ struct tcp_sock *tp = tcp_sk(sk); |
7310 |
++ struct binder_priv *fmp = (struct binder_priv *)&tp->mpcb->mptcp_pm[0]; |
7311 |
++ |
7312 |
++ /* Read lock: multiple sockets can read LSRR addresses at the same |
7313 |
++ * time, but writes are done in mutual exclusion. |
7314 |
++ * Spin lock: must search for free list for one socket at a time, or |
7315 |
++ * multiple sockets could take the same list. |
7316 |
++ */ |
7317 |
++ read_lock(&mptcp_gws_lock); |
7318 |
++ spin_lock(fmp->flow_lock); |
7319 |
++ |
7320 |
++ i = mptcp_get_avail_list_ipv4(sk); |
7321 |
++ |
7322 |
++ /* Execution enters here only if a free path is found. |
7323 |
++ */ |
7324 |
++ if (i >= 0) { |
7325 |
++ opt[0] = IPOPT_NOP; |
7326 |
++ opt[1] = IPOPT_LSRR; |
7327 |
++ opt[2] = sizeof(mptcp_gws->list[i][0].s_addr) * |
7328 |
++ (mptcp_gws->len[i] + 1) + 3; |
7329 |
++ opt[3] = IPOPT_MINOFF; |
7330 |
++ for (j = 0; j < mptcp_gws->len[i]; ++j) |
7331 |
++ memcpy(opt + 4 + |
7332 |
++ (j * sizeof(mptcp_gws->list[i][0].s_addr)), |
7333 |
++ &mptcp_gws->list[i][j].s_addr, |
7334 |
++ sizeof(mptcp_gws->list[i][0].s_addr)); |
7335 |
++ /* Final destination must be part of IP_OPTIONS parameter. */ |
7336 |
++ memcpy(opt + 4 + (j * sizeof(addr.s_addr)), &addr.s_addr, |
7337 |
++ sizeof(addr.s_addr)); |
7338 |
++ |
7339 |
++ /* setsockopt must be inside the lock, otherwise another |
7340 |
++ * subflow could fail to see that we have taken a list. |
7341 |
++ */ |
7342 |
++ ret = ip_setsockopt(sk, IPPROTO_IP, IP_OPTIONS, opt, |
7343 |
++ 4 + sizeof(mptcp_gws->list[i][0].s_addr) |
7344 |
++ * (mptcp_gws->len[i] + 1)); |
7345 |
++ |
7346 |
++ if (ret < 0) { |
7347 |
++ mptcp_debug(KERN_ERR "%s: MPTCP subsock setsockopt() IP_OPTIONS failed, error %d\n", |
7348 |
++ __func__, ret); |
7349 |
++ } |
7350 |
++ } |
7351 |
++ |
7352 |
++ spin_unlock(fmp->flow_lock); |
7353 |
++ read_unlock(&mptcp_gws_lock); |
7354 |
++ |
7355 |
++ return; |
7356 |
++} |
7357 |
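The option buffer assembled above follows the classic LSRR layout: one NOP pad byte, then type, length and pointer, then the gateway hops, with the final destination required as the last entry. A minimal standalone sketch of the same layout (build_lsrr_opt() is hypothetical, not part of the patch):

    #include <linux/ip.h>      /* IPOPT_NOP, IPOPT_LSRR, IPOPT_MINOFF */
    #include <linux/in.h>
    #include <linux/string.h>

    /* Fill "opt" with an LSRR option routing via gw[0..n_gw-1] and ending at
     * dst; returns the total length to hand to ip_setsockopt(IP_OPTIONS).
     */
    static int build_lsrr_opt(unsigned char *opt, const struct in_addr *gw,
                              int n_gw, struct in_addr dst)
    {
            int j;

            opt[0] = IPOPT_NOP;              /* pad so the option aligns */
            opt[1] = IPOPT_LSRR;             /* loose source routing */
            opt[2] = 3 + 4 * (n_gw + 1);     /* type + len + ptr + addresses */
            opt[3] = IPOPT_MINOFF;           /* pointer to the first hop */
            for (j = 0; j < n_gw; j++)
                    memcpy(opt + 4 + 4 * j, &gw[j].s_addr, 4);
            /* the final destination must be the last address in the option */
            memcpy(opt + 4 + 4 * n_gw, &dst.s_addr, 4);
            return 4 + 4 * (n_gw + 1);       /* NOP + LSRR option length */
    }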
++ |
7358 |
++/* Parses gateways string for a list of paths to different |
7359 |
++ * gateways, and stores them for use with the Loose Source Routing (LSRR) |
7360 |
++ * socket option. Each list must have "," separated addresses, and the lists |
7361 |
++ * themselves must be separated by "-". Returns -1 in case one or more of the |
7362 |
++ * addresses is not a valid IPv4 address.
7363 |
++ */ |
7364 |
++static int mptcp_parse_gateway_ipv4(char *gateways) |
7365 |
++{ |
7366 |
++ int i, j, k, ret; |
7367 |
++ char *tmp_string = NULL; |
7368 |
++ struct in_addr tmp_addr; |
7369 |
++ |
7370 |
++ tmp_string = kzalloc(16, GFP_KERNEL); |
7371 |
++ if (tmp_string == NULL) |
7372 |
++ return -ENOMEM; |
7373 |
++ |
7374 |
++ write_lock(&mptcp_gws_lock); |
7375 |
++ |
7376 |
++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list)); |
7377 |
++ |
7378 |
++	/* A temporary string is used since in4_pton needs a null-terminated string
7379 |
++ * but we do not want to modify the sysctl for obvious reasons. |
7380 |
++ * i will iterate over the SYSCTL string, j will iterate over the |
7381 |
++ * temporary string where each IP is copied into, k will iterate over |
7382 |
++ * the IPs in each list. |
7383 |
++ */ |
7384 |
++ for (i = j = k = 0; |
7385 |
++ i < MPTCP_GW_SYSCTL_MAX_LEN && k < MPTCP_GW_MAX_LISTS; |
7386 |
++ ++i) { |
7387 |
++ if (gateways[i] == '-' || gateways[i] == ',' || gateways[i] == '\0') { |
7388 |
++ /* If the temp IP is empty and the current list is |
7389 |
++ * empty, we are done. |
7390 |
++ */ |
7391 |
++ if (j == 0 && mptcp_gws->len[k] == 0) |
7392 |
++ break; |
7393 |
++ |
7394 |
++ /* Terminate the temp IP string, then if it is |
7395 |
++ * non-empty parse the IP and copy it. |
7396 |
++ */ |
7397 |
++ tmp_string[j] = '\0'; |
7398 |
++ if (j > 0) { |
7399 |
++ mptcp_debug("mptcp_parse_gateway_list tmp: %s i: %d\n", tmp_string, i); |
7400 |
++ |
7401 |
++ ret = in4_pton(tmp_string, strlen(tmp_string), |
7402 |
++ (u8 *)&tmp_addr.s_addr, '\0', |
7403 |
++ NULL); |
7404 |
++ |
7405 |
++ if (ret) { |
7406 |
++ mptcp_debug("mptcp_parse_gateway_list ret: %d s_addr: %pI4\n", |
7407 |
++ ret, |
7408 |
++ &tmp_addr.s_addr); |
7409 |
++ memcpy(&mptcp_gws->list[k][mptcp_gws->len[k]].s_addr, |
7410 |
++ &tmp_addr.s_addr, |
7411 |
++ sizeof(tmp_addr.s_addr)); |
7412 |
++ mptcp_gws->len[k]++; |
7413 |
++ j = 0; |
7414 |
++ tmp_string[j] = '\0'; |
7415 |
++ /* Since we can't impose a limit to |
7416 |
++ * what the user can input, make sure |
7417 |
++ * there are not too many IPs in the |
7418 |
++ * SYSCTL string. |
7419 |
++ */ |
7420 |
++ if (mptcp_gws->len[k] > MPTCP_GW_LIST_MAX_LEN) { |
7421 |
++ mptcp_debug("mptcp_parse_gateway_list too many members in list %i: max %i\n", |
7422 |
++ k, |
7423 |
++ MPTCP_GW_LIST_MAX_LEN); |
7424 |
++ goto error; |
7425 |
++ } |
7426 |
++ } else { |
7427 |
++ goto error; |
7428 |
++ } |
7429 |
++ } |
7430 |
++ |
7431 |
++ if (gateways[i] == '-' || gateways[i] == '\0') |
7432 |
++ ++k; |
7433 |
++ } else { |
7434 |
++ tmp_string[j] = gateways[i]; |
7435 |
++ ++j; |
7436 |
++ } |
7437 |
++ } |
7438 |
++ |
7439 |
++ /* Number of flows is number of gateway lists plus master flow */ |
7440 |
++ mptcp_binder_ndiffports = k+1; |
7441 |
++ |
7442 |
++ write_unlock(&mptcp_gws_lock); |
7443 |
++ kfree(tmp_string); |
7444 |
++ |
7445 |
++ return 0; |
7446 |
++ |
7447 |
++error: |
7448 |
++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list)); |
7449 |
++ memset(gateways, 0, sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN); |
7450 |
++ write_unlock(&mptcp_gws_lock); |
7451 |
++ kfree(tmp_string); |
7452 |
++ return -1; |
7453 |
++} |
7454 |
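As an example of the accepted format (addresses chosen for illustration): writing 10.0.0.2,10.0.1.2-10.0.2.2 to the sysctl registered below as net/mptcp/mptcp_binder_gateways defines two lists, so one extra subflow is loosely routed via 10.0.0.2 and then 10.0.1.2, and another via 10.0.2.2; together with the master flow this sets mptcp_binder_ndiffports to 3.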
++ |
7455 |
++/** |
7456 |
++ * Create all new subflows, by doing calls to mptcp_initX_subsockets |
7457 |
++ * |
7458 |
++ * This function uses a goto to next_subflow in order to release the lock
7459 |
++ * between new subflows, giving other processes a chance to do some work on
7460 |
++ * the socket and potentially finish the communication.
7461 |
++ */
7462 |
++static void create_subflow_worker(struct work_struct *work) |
7463 |
++{ |
7464 |
++ const struct binder_priv *pm_priv = container_of(work, |
7465 |
++ struct binder_priv, |
7466 |
++ subflow_work); |
7467 |
++ struct mptcp_cb *mpcb = pm_priv->mpcb; |
7468 |
++ struct sock *meta_sk = mpcb->meta_sk; |
7469 |
++ int iter = 0; |
7470 |
++ |
7471 |
++next_subflow: |
7472 |
++ if (iter) { |
7473 |
++ release_sock(meta_sk); |
7474 |
++ mutex_unlock(&mpcb->mpcb_mutex); |
7475 |
++ |
7476 |
++ cond_resched(); |
7477 |
++ } |
7478 |
++ mutex_lock(&mpcb->mpcb_mutex); |
7479 |
++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); |
7480 |
++ |
7481 |
++ iter++; |
7482 |
++ |
7483 |
++ if (sock_flag(meta_sk, SOCK_DEAD)) |
7484 |
++ goto exit; |
7485 |
++ |
7486 |
++ if (mpcb->master_sk && |
7487 |
++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) |
7488 |
++ goto exit; |
7489 |
++ |
7490 |
++ if (mptcp_binder_ndiffports > iter && |
7491 |
++ mptcp_binder_ndiffports > mpcb->cnt_subflows) { |
7492 |
++ struct mptcp_loc4 loc; |
7493 |
++ struct mptcp_rem4 rem; |
7494 |
++ |
7495 |
++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr; |
7496 |
++ loc.loc4_id = 0; |
7497 |
++ loc.low_prio = 0; |
7498 |
++ |
7499 |
++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr; |
7500 |
++ rem.port = inet_sk(meta_sk)->inet_dport; |
7501 |
++ rem.rem4_id = 0; /* Default 0 */ |
7502 |
++ |
7503 |
++ mptcp_init4_subsockets(meta_sk, &loc, &rem); |
7504 |
++ |
7505 |
++ goto next_subflow; |
7506 |
++ } |
7507 |
++ |
7508 |
++exit: |
7509 |
++ release_sock(meta_sk); |
7510 |
++ mutex_unlock(&mpcb->mpcb_mutex); |
7511 |
++ sock_put(meta_sk); |
7512 |
++} |
7513 |
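The goto-loop above is the usual pattern for doing one unit of work per lock hold from a workqueue. Schematically (a sketch with a hypothetical ctx, more_work_to_do() and do_one_unit(), not patch code):

    static void one_step_worker(struct work_struct *work)
    {
            struct my_ctx *ctx = container_of(work, struct my_ctx, work);

            for (;;) {
                    mutex_lock(&ctx->mutex);
                    lock_sock_nested(ctx->meta_sk, SINGLE_DEPTH_NESTING);
                    if (!more_work_to_do(ctx)) {      /* hypothetical */
                            release_sock(ctx->meta_sk);
                            mutex_unlock(&ctx->mutex);
                            break;
                    }
                    do_one_unit(ctx);   /* e.g. open one new subflow */
                    release_sock(ctx->meta_sk);
                    mutex_unlock(&ctx->mutex);
                    cond_resched();     /* let softirq/user context run */
            }
            sock_put(ctx->meta_sk);     /* reference taken when queued */
    }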
++ |
7514 |
++static void binder_new_session(const struct sock *meta_sk) |
7515 |
++{ |
7516 |
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
7517 |
++ struct binder_priv *fmp = (struct binder_priv *)&mpcb->mptcp_pm[0]; |
7518 |
++ static DEFINE_SPINLOCK(flow_lock); |
7519 |
++ |
7520 |
++#if IS_ENABLED(CONFIG_IPV6) |
7521 |
++ if (meta_sk->sk_family == AF_INET6 && |
7522 |
++ !mptcp_v6_is_v4_mapped(meta_sk)) { |
7523 |
++ mptcp_fallback_default(mpcb); |
7524 |
++ return; |
7525 |
++ } |
7526 |
++#endif |
7527 |
++ |
7528 |
++ /* Initialize workqueue-struct */ |
7529 |
++ INIT_WORK(&fmp->subflow_work, create_subflow_worker); |
7530 |
++ fmp->mpcb = mpcb; |
7531 |
++ |
7532 |
++ fmp->flow_lock = &flow_lock; |
7533 |
++} |
7534 |
++ |
7535 |
++static void binder_create_subflows(struct sock *meta_sk) |
7536 |
++{ |
7537 |
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
7538 |
++ struct binder_priv *pm_priv = (struct binder_priv *)&mpcb->mptcp_pm[0]; |
7539 |
++ |
7540 |
++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv || |
7541 |
++ mpcb->send_infinite_mapping || |
7542 |
++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) |
7543 |
++ return; |
7544 |
++ |
7545 |
++ if (!work_pending(&pm_priv->subflow_work)) { |
7546 |
++ sock_hold(meta_sk); |
7547 |
++ queue_work(mptcp_wq, &pm_priv->subflow_work); |
7548 |
++ } |
7549 |
++} |
7550 |
++ |
7551 |
++static int binder_get_local_id(sa_family_t family, union inet_addr *addr, |
7552 |
++ struct net *net, bool *low_prio) |
7553 |
++{ |
7554 |
++ return 0; |
7555 |
++} |
7556 |
++ |
7557 |
++/* Callback function, executed when the sysctl net.mptcp.mptcp_binder_gateways is updated.
7558 |
++ * Inspired from proc_tcp_congestion_control(). |
7559 |
++ */ |
7560 |
++static int proc_mptcp_gateways(ctl_table *ctl, int write, |
7561 |
++ void __user *buffer, size_t *lenp, |
7562 |
++ loff_t *ppos) |
7563 |
++{ |
7564 |
++ int ret; |
7565 |
++ ctl_table tbl = { |
7566 |
++ .maxlen = MPTCP_GW_SYSCTL_MAX_LEN, |
7567 |
++ }; |
7568 |
++ |
7569 |
++ if (write) { |
7570 |
++ tbl.data = kzalloc(MPTCP_GW_SYSCTL_MAX_LEN, GFP_KERNEL); |
7571 |
++ if (tbl.data == NULL) |
7572 |
++			return -ENOMEM;
7573 |
++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos); |
7574 |
++ if (ret == 0) { |
7575 |
++ ret = mptcp_parse_gateway_ipv4(tbl.data); |
7576 |
++ memcpy(ctl->data, tbl.data, MPTCP_GW_SYSCTL_MAX_LEN); |
7577 |
++ } |
7578 |
++ kfree(tbl.data); |
7579 |
++ } else { |
7580 |
++ ret = proc_dostring(ctl, write, buffer, lenp, ppos); |
7581 |
++ } |
7582 |
++ |
7583 |
++ |
7584 |
++ return ret; |
7585 |
++} |
7586 |
++ |
7587 |
++static struct mptcp_pm_ops binder __read_mostly = { |
7588 |
++ .new_session = binder_new_session, |
7589 |
++ .fully_established = binder_create_subflows, |
7590 |
++ .get_local_id = binder_get_local_id, |
7591 |
++ .init_subsocket_v4 = mptcp_v4_add_lsrr, |
7592 |
++ .name = "binder", |
7593 |
++ .owner = THIS_MODULE, |
7594 |
++}; |
7595 |
++ |
7596 |
++static struct ctl_table binder_table[] = { |
7597 |
++ { |
7598 |
++ .procname = "mptcp_binder_gateways", |
7599 |
++ .data = &sysctl_mptcp_binder_gateways, |
7600 |
++ .maxlen = sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN, |
7601 |
++ .mode = 0644, |
7602 |
++ .proc_handler = &proc_mptcp_gateways |
7603 |
++ }, |
7604 |
++ { } |
7605 |
++}; |
7606 |
++ |
7607 |
++struct ctl_table_header *mptcp_sysctl_binder; |
7608 |
++ |
7609 |
++/* General initialization of MPTCP_PM */ |
7610 |
++static int __init binder_register(void) |
7611 |
++{ |
7612 |
++ mptcp_gws = kzalloc(sizeof(*mptcp_gws), GFP_KERNEL); |
7613 |
++ if (!mptcp_gws) |
7614 |
++ return -ENOMEM; |
7615 |
++ |
7616 |
++ rwlock_init(&mptcp_gws_lock); |
7617 |
++ |
7618 |
++ BUILD_BUG_ON(sizeof(struct binder_priv) > MPTCP_PM_SIZE); |
7619 |
++ |
7620 |
++ mptcp_sysctl_binder = register_net_sysctl(&init_net, "net/mptcp", |
7621 |
++ binder_table); |
7622 |
++ if (!mptcp_sysctl_binder) |
7623 |
++ goto sysctl_fail; |
7624 |
++ |
7625 |
++ if (mptcp_register_path_manager(&binder)) |
7626 |
++ goto pm_failed; |
7627 |
++ |
7628 |
++ return 0; |
7629 |
++ |
7630 |
++pm_failed: |
7631 |
++ unregister_net_sysctl_table(mptcp_sysctl_binder); |
7632 |
++sysctl_fail: |
7633 |
++ kfree(mptcp_gws); |
7634 |
++ |
7635 |
++ return -1; |
7636 |
++} |
7637 |
++ |
7638 |
++static void binder_unregister(void) |
7639 |
++{ |
7640 |
++ mptcp_unregister_path_manager(&binder); |
7641 |
++ unregister_net_sysctl_table(mptcp_sysctl_binder); |
7642 |
++ kfree(mptcp_gws); |
7643 |
++} |
7644 |
++ |
7645 |
++module_init(binder_register); |
7646 |
++module_exit(binder_unregister); |
7647 |
++ |
7648 |
++MODULE_AUTHOR("Luca Boccassi, Duncan Eastoe, Christoph Paasch (ndiffports)"); |
7649 |
++MODULE_LICENSE("GPL"); |
7650 |
++MODULE_DESCRIPTION("BINDER MPTCP"); |
7651 |
++MODULE_VERSION("0.1"); |
7652 |
+diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c |
7653 |
+new file mode 100644 |
7654 |
+index 000000000000..5d761164eb85 |
7655 |
+--- /dev/null |
7656 |
++++ b/net/mptcp/mptcp_coupled.c |
7657 |
+@@ -0,0 +1,270 @@ |
7658 |
++/* |
7659 |
++ * MPTCP implementation - Linked Increase congestion control Algorithm (LIA) |
7660 |
++ * |
7661 |
++ * Initial Design & Implementation: |
7662 |
++ * Sébastien Barré <sebastien.barre@×××××××××.be> |
7663 |
++ * |
7664 |
++ * Current Maintainer & Author: |
7665 |
++ * Christoph Paasch <christoph.paasch@×××××××××.be> |
7666 |
++ * |
7667 |
++ * Additional authors: |
7668 |
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi> |
7669 |
++ * Gregory Detal <gregory.detal@×××××××××.be> |
7670 |
++ * Fabien Duchêne <fabien.duchene@×××××××××.be> |
7671 |
++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de> |
7672 |
++ * Lavkesh Lahngir <lavkesh51@×××××.com> |
7673 |
++ * Andreas Ripke <ripke@××××××.eu> |
7674 |
++ * Vlad Dogaru <vlad.dogaru@×××××.com> |
7675 |
++ * Octavian Purdila <octavian.purdila@×××××.com> |
7676 |
++ * John Ronan <jronan@××××.org> |
7677 |
++ * Catalin Nicutar <catalin.nicutar@×××××.com> |
7678 |
++ * Brandon Heller <brandonh@××××××××.edu> |
7679 |
++ * |
7680 |
++ * |
7681 |
++ * This program is free software; you can redistribute it and/or |
7682 |
++ * modify it under the terms of the GNU General Public License |
7683 |
++ * as published by the Free Software Foundation; either version |
7684 |
++ * 2 of the License, or (at your option) any later version. |
7685 |
++ */ |
7686 |
++#include <net/tcp.h> |
7687 |
++#include <net/mptcp.h> |
7688 |
++ |
7689 |
++#include <linux/module.h> |
7690 |
++ |
7691 |
++/* Scaling is done in the numerator with alpha_scale_num and in the denominator |
7692 |
++ * with alpha_scale_den. |
7693 |
++ * |
7694 |
++ * To downscale, we just need to use alpha_scale. |
7695 |
++ * |
7696 |
++ * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2) |
7697 |
++ */ |
7698 |
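In unscaled form, the fixed-point arithmetic below computes the LIA aggressiveness factor of RFC 6356 (a reconstruction from mptcp_ccc_recalc_alpha(), with cwnd_i and rtt_i the per-subflow congestion window and smoothed RTT):

    \[
      \alpha \;=\; 2^{12}\,
        \frac{\max_i \bigl(\mathrm{cwnd}_i / \mathrm{rtt}_i^{2}\bigr)}
             {\bigl(\sum_i \mathrm{cwnd}_i / \mathrm{rtt}_i\bigr)^{2}},
      \qquad
      2^{12} \;=\; \frac{2^{32}}{\bigl(2^{10}\bigr)^{2}}
      \;=\; \frac{2^{\mathrm{alpha\_scale\_num}}}{\bigl(2^{\mathrm{alpha\_scale\_den}}\bigr)^{2}}
    \]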
++static int alpha_scale_den = 10; |
7699 |
++static int alpha_scale_num = 32; |
7700 |
++static int alpha_scale = 12; |
7701 |
++ |
7702 |
++struct mptcp_ccc { |
7703 |
++ u64 alpha; |
7704 |
++ bool forced_update; |
7705 |
++}; |
7706 |
++ |
7707 |
++static inline int mptcp_ccc_sk_can_send(const struct sock *sk) |
7708 |
++{ |
7709 |
++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us; |
7710 |
++} |
7711 |
++ |
7712 |
++static inline u64 mptcp_get_alpha(const struct sock *meta_sk) |
7713 |
++{ |
7714 |
++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha; |
7715 |
++} |
7716 |
++ |
7717 |
++static inline void mptcp_set_alpha(const struct sock *meta_sk, u64 alpha) |
7718 |
++{ |
7719 |
++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha = alpha; |
7720 |
++} |
7721 |
++ |
7722 |
++static inline u64 mptcp_ccc_scale(u32 val, int scale) |
7723 |
++{ |
7724 |
++ return (u64) val << scale; |
7725 |
++} |
7726 |
++ |
7727 |
++static inline bool mptcp_get_forced(const struct sock *meta_sk) |
7728 |
++{ |
7729 |
++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update; |
7730 |
++} |
7731 |
++ |
7732 |
++static inline void mptcp_set_forced(const struct sock *meta_sk, bool force) |
7733 |
++{ |
7734 |
++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update = force; |
7735 |
++} |
7736 |
++ |
7737 |
++static void mptcp_ccc_recalc_alpha(const struct sock *sk) |
7738 |
++{ |
7739 |
++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; |
7740 |
++ const struct sock *sub_sk; |
7741 |
++ int best_cwnd = 0, best_rtt = 0, can_send = 0; |
7742 |
++ u64 max_numerator = 0, sum_denominator = 0, alpha = 1; |
7743 |
++ |
7744 |
++ if (!mpcb) |
7745 |
++ return; |
7746 |
++ |
7747 |
++ /* Only one subflow left - fall back to normal reno-behavior |
7748 |
++ * (set alpha to 1) |
7749 |
++ */ |
7750 |
++ if (mpcb->cnt_established <= 1) |
7751 |
++ goto exit; |
7752 |
++ |
7753 |
++ /* Do regular alpha-calculation for multiple subflows */ |
7754 |
++ |
7755 |
++ /* Find the max numerator of the alpha-calculation */ |
7756 |
++ mptcp_for_each_sk(mpcb, sub_sk) { |
7757 |
++ struct tcp_sock *sub_tp = tcp_sk(sub_sk); |
7758 |
++ u64 tmp; |
7759 |
++ |
7760 |
++ if (!mptcp_ccc_sk_can_send(sub_sk)) |
7761 |
++ continue; |
7762 |
++ |
7763 |
++ can_send++; |
7764 |
++ |
7765 |
++		/* We need to look for the path that provides the max-value.
7766 |
++ * Integer-overflow is not possible here, because |
7767 |
++ * tmp will be in u64. |
7768 |
++ */ |
7769 |
++ tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd, |
7770 |
++ alpha_scale_num), (u64)sub_tp->srtt_us * sub_tp->srtt_us); |
7771 |
++ |
7772 |
++ if (tmp >= max_numerator) { |
7773 |
++ max_numerator = tmp; |
7774 |
++ best_cwnd = sub_tp->snd_cwnd; |
7775 |
++ best_rtt = sub_tp->srtt_us; |
7776 |
++ } |
7777 |
++ } |
7778 |
++ |
7779 |
++ /* No subflow is able to send - we don't care anymore */ |
7780 |
++ if (unlikely(!can_send)) |
7781 |
++ goto exit; |
7782 |
++ |
7783 |
++ /* Calculate the denominator */ |
7784 |
++ mptcp_for_each_sk(mpcb, sub_sk) { |
7785 |
++ struct tcp_sock *sub_tp = tcp_sk(sub_sk); |
7786 |
++ |
7787 |
++ if (!mptcp_ccc_sk_can_send(sub_sk)) |
7788 |
++ continue; |
7789 |
++ |
7790 |
++ sum_denominator += div_u64( |
7791 |
++ mptcp_ccc_scale(sub_tp->snd_cwnd, |
7792 |
++ alpha_scale_den) * best_rtt, |
7793 |
++ sub_tp->srtt_us); |
7794 |
++ } |
7795 |
++ sum_denominator *= sum_denominator; |
7796 |
++ if (unlikely(!sum_denominator)) { |
7797 |
++ pr_err("%s: sum_denominator == 0, cnt_established:%d\n", |
7798 |
++ __func__, mpcb->cnt_established); |
7799 |
++ mptcp_for_each_sk(mpcb, sub_sk) { |
7800 |
++ struct tcp_sock *sub_tp = tcp_sk(sub_sk); |
7801 |
++			pr_err("%s: pi:%d, state:%d, rtt:%u, cwnd: %u\n",
7802 |
++ __func__, sub_tp->mptcp->path_index, |
7803 |
++ sub_sk->sk_state, sub_tp->srtt_us, |
7804 |
++ sub_tp->snd_cwnd); |
7805 |
++ } |
7806 |
++ } |
7807 |
++ |
7808 |
++ alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator); |
7809 |
++ |
7810 |
++ if (unlikely(!alpha)) |
7811 |
++ alpha = 1; |
7812 |
++ |
7813 |
++exit: |
7814 |
++ mptcp_set_alpha(mptcp_meta_sk(sk), alpha); |
7815 |
++} |
7816 |
++ |
7817 |
++static void mptcp_ccc_init(struct sock *sk) |
7818 |
++{ |
7819 |
++ if (mptcp(tcp_sk(sk))) { |
7820 |
++ mptcp_set_forced(mptcp_meta_sk(sk), 0); |
7821 |
++ mptcp_set_alpha(mptcp_meta_sk(sk), 1); |
7822 |
++ } |
7823 |
++ /* If we do not mptcp, behave like reno: return */ |
7824 |
++} |
7825 |
++ |
7826 |
++static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event) |
7827 |
++{ |
7828 |
++ if (event == CA_EVENT_LOSS) |
7829 |
++ mptcp_ccc_recalc_alpha(sk); |
7830 |
++} |
7831 |
++ |
7832 |
++static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state) |
7833 |
++{ |
7834 |
++ if (!mptcp(tcp_sk(sk))) |
7835 |
++ return; |
7836 |
++ |
7837 |
++ mptcp_set_forced(mptcp_meta_sk(sk), 1); |
7838 |
++} |
7839 |
++ |
7840 |
++static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked) |
7841 |
++{ |
7842 |
++ struct tcp_sock *tp = tcp_sk(sk); |
7843 |
++ const struct mptcp_cb *mpcb = tp->mpcb; |
7844 |
++ int snd_cwnd; |
7845 |
++ |
7846 |
++ if (!mptcp(tp)) { |
7847 |
++ tcp_reno_cong_avoid(sk, ack, acked); |
7848 |
++ return; |
7849 |
++ } |
7850 |
++ |
7851 |
++ if (!tcp_is_cwnd_limited(sk)) |
7852 |
++ return; |
7853 |
++ |
7854 |
++ if (tp->snd_cwnd <= tp->snd_ssthresh) { |
7855 |
++ /* In "safe" area, increase. */ |
7856 |
++ tcp_slow_start(tp, acked); |
7857 |
++ mptcp_ccc_recalc_alpha(sk); |
7858 |
++ return; |
7859 |
++ } |
7860 |
++ |
7861 |
++ if (mptcp_get_forced(mptcp_meta_sk(sk))) { |
7862 |
++ mptcp_ccc_recalc_alpha(sk); |
7863 |
++ mptcp_set_forced(mptcp_meta_sk(sk), 0); |
7864 |
++ } |
7865 |
++ |
7866 |
++ if (mpcb->cnt_established > 1) { |
7867 |
++ u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk)); |
7868 |
++ |
7869 |
++ /* This may happen, if at the initialization, the mpcb |
7870 |
++ * was not yet attached to the sock, and thus |
7871 |
++ * initializing alpha failed. |
7872 |
++ */ |
7873 |
++ if (unlikely(!alpha)) |
7874 |
++ alpha = 1; |
7875 |
++ |
7876 |
++ snd_cwnd = (int) div_u64 ((u64) mptcp_ccc_scale(1, alpha_scale), |
7877 |
++ alpha); |
7878 |
++ |
7879 |
++ /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd) |
7880 |
++ * Thus, we select here the max value. |
7881 |
++ */ |
7882 |
++ if (snd_cwnd < tp->snd_cwnd) |
7883 |
++ snd_cwnd = tp->snd_cwnd; |
7884 |
++ } else { |
7885 |
++ snd_cwnd = tp->snd_cwnd; |
7886 |
++ } |
7887 |
++ |
7888 |
++ if (tp->snd_cwnd_cnt >= snd_cwnd) { |
7889 |
++ if (tp->snd_cwnd < tp->snd_cwnd_clamp) { |
7890 |
++ tp->snd_cwnd++; |
7891 |
++ mptcp_ccc_recalc_alpha(sk); |
7892 |
++ } |
7893 |
++ |
7894 |
++ tp->snd_cwnd_cnt = 0; |
7895 |
++ } else { |
7896 |
++ tp->snd_cwnd_cnt++; |
7897 |
++ } |
7898 |
++} |
7899 |
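A worked case for the code above: with two established subflows of equal window c and equal RTT r, alpha = 2^12 * (c/r^2) / (2c/r)^2 = 2^12 / (4c), so the per-subflow threshold becomes max(2^12/alpha, c) = 4c acknowledged segments per one-segment increase, against c for a regular Reno flow of the same window. This deliberately slower per-subflow growth is what lets the coupled pair take roughly the capacity of one regular TCP flow, the stated fairness goal of LIA (RFC 6356).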
++ |
7900 |
++static struct tcp_congestion_ops mptcp_ccc = { |
7901 |
++ .init = mptcp_ccc_init, |
7902 |
++ .ssthresh = tcp_reno_ssthresh, |
7903 |
++ .cong_avoid = mptcp_ccc_cong_avoid, |
7904 |
++ .cwnd_event = mptcp_ccc_cwnd_event, |
7905 |
++ .set_state = mptcp_ccc_set_state, |
7906 |
++ .owner = THIS_MODULE, |
7907 |
++ .name = "lia", |
7908 |
++}; |
7909 |
++ |
7910 |
++static int __init mptcp_ccc_register(void) |
7911 |
++{ |
7912 |
++ BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE); |
7913 |
++ return tcp_register_congestion_control(&mptcp_ccc); |
7914 |
++} |
7915 |
++ |
7916 |
++static void __exit mptcp_ccc_unregister(void) |
7917 |
++{ |
7918 |
++ tcp_unregister_congestion_control(&mptcp_ccc); |
7919 |
++} |
7920 |
++ |
7921 |
++module_init(mptcp_ccc_register); |
7922 |
++module_exit(mptcp_ccc_unregister); |
7923 |
++ |
7924 |
++MODULE_AUTHOR("Christoph Paasch, Sébastien Barré"); |
7925 |
++MODULE_LICENSE("GPL"); |
7926 |
++MODULE_DESCRIPTION("MPTCP LINKED INCREASE CONGESTION CONTROL ALGORITHM"); |
7927 |
++MODULE_VERSION("0.1"); |
7928 |
+diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c |
7929 |
+new file mode 100644 |
7930 |
+index 000000000000..28dfa0479f5e |
7931 |
+--- /dev/null |
7932 |
++++ b/net/mptcp/mptcp_ctrl.c |
7933 |
+@@ -0,0 +1,2401 @@ |
7934 |
++/* |
7935 |
++ * MPTCP implementation - MPTCP-control |
7936 |
++ * |
7937 |
++ * Initial Design & Implementation: |
7938 |
++ * Sébastien Barré <sebastien.barre@×××××××××.be> |
7939 |
++ * |
7940 |
++ * Current Maintainer & Author: |
7941 |
++ * Christoph Paasch <christoph.paasch@×××××××××.be> |
7942 |
++ * |
7943 |
++ * Additional authors: |
7944 |
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi> |
7945 |
++ * Gregory Detal <gregory.detal@×××××××××.be> |
7946 |
++ * Fabien Duchêne <fabien.duchene@×××××××××.be> |
7947 |
++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de> |
7948 |
++ * Lavkesh Lahngir <lavkesh51@×××××.com> |
7949 |
++ * Andreas Ripke <ripke@××××××.eu> |
7950 |
++ * Vlad Dogaru <vlad.dogaru@×××××.com> |
7951 |
++ * Octavian Purdila <octavian.purdila@×××××.com> |
7952 |
++ * John Ronan <jronan@××××.org> |
7953 |
++ * Catalin Nicutar <catalin.nicutar@×××××.com> |
7954 |
++ * Brandon Heller <brandonh@××××××××.edu> |
7955 |
++ * |
7956 |
++ * |
7957 |
++ * This program is free software; you can redistribute it and/or |
7958 |
++ * modify it under the terms of the GNU General Public License |
7959 |
++ * as published by the Free Software Foundation; either version |
7960 |
++ * 2 of the License, or (at your option) any later version. |
7961 |
++ */ |
7962 |
++ |
7963 |
++#include <net/inet_common.h> |
7964 |
++#include <net/inet6_hashtables.h> |
7965 |
++#include <net/ipv6.h> |
7966 |
++#include <net/ip6_checksum.h> |
7967 |
++#include <net/mptcp.h> |
7968 |
++#include <net/mptcp_v4.h> |
7969 |
++#if IS_ENABLED(CONFIG_IPV6) |
7970 |
++#include <net/ip6_route.h> |
7971 |
++#include <net/mptcp_v6.h> |
7972 |
++#endif |
7973 |
++#include <net/sock.h> |
7974 |
++#include <net/tcp.h> |
7975 |
++#include <net/tcp_states.h> |
7976 |
++#include <net/transp_v6.h> |
7977 |
++#include <net/xfrm.h> |
7978 |
++ |
7979 |
++#include <linux/cryptohash.h> |
7980 |
++#include <linux/kconfig.h> |
7981 |
++#include <linux/module.h> |
7982 |
++#include <linux/netpoll.h> |
7983 |
++#include <linux/list.h> |
7984 |
++#include <linux/jhash.h> |
7985 |
++#include <linux/tcp.h> |
7986 |
++#include <linux/net.h> |
7987 |
++#include <linux/in.h> |
7988 |
++#include <linux/random.h> |
7989 |
++#include <linux/inetdevice.h> |
7990 |
++#include <linux/workqueue.h> |
7991 |
++#include <linux/atomic.h> |
7992 |
++#include <linux/sysctl.h> |
7993 |
++ |
7994 |
++static struct kmem_cache *mptcp_sock_cache __read_mostly; |
7995 |
++static struct kmem_cache *mptcp_cb_cache __read_mostly; |
7996 |
++static struct kmem_cache *mptcp_tw_cache __read_mostly; |
7997 |
++ |
7998 |
++int sysctl_mptcp_enabled __read_mostly = 1; |
7999 |
++int sysctl_mptcp_checksum __read_mostly = 1; |
8000 |
++int sysctl_mptcp_debug __read_mostly; |
8001 |
++EXPORT_SYMBOL(sysctl_mptcp_debug); |
8002 |
++int sysctl_mptcp_syn_retries __read_mostly = 3; |
8003 |
++ |
8004 |
++bool mptcp_init_failed __read_mostly; |
8005 |
++ |
8006 |
++struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE; |
8007 |
++EXPORT_SYMBOL(mptcp_static_key); |
8008 |
++ |
8009 |
++static int proc_mptcp_path_manager(ctl_table *ctl, int write, |
8010 |
++ void __user *buffer, size_t *lenp, |
8011 |
++ loff_t *ppos) |
8012 |
++{ |
8013 |
++ char val[MPTCP_PM_NAME_MAX]; |
8014 |
++ ctl_table tbl = { |
8015 |
++ .data = val, |
8016 |
++ .maxlen = MPTCP_PM_NAME_MAX, |
8017 |
++ }; |
8018 |
++ int ret; |
8019 |
++ |
8020 |
++ mptcp_get_default_path_manager(val); |
8021 |
++ |
8022 |
++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos); |
8023 |
++ if (write && ret == 0) |
8024 |
++ ret = mptcp_set_default_path_manager(val); |
8025 |
++ return ret; |
8026 |
++} |
8027 |
++ |
8028 |
++static int proc_mptcp_scheduler(ctl_table *ctl, int write, |
8029 |
++ void __user *buffer, size_t *lenp, |
8030 |
++ loff_t *ppos) |
8031 |
++{ |
8032 |
++ char val[MPTCP_SCHED_NAME_MAX]; |
8033 |
++ ctl_table tbl = { |
8034 |
++ .data = val, |
8035 |
++ .maxlen = MPTCP_SCHED_NAME_MAX, |
8036 |
++ }; |
8037 |
++ int ret; |
8038 |
++ |
8039 |
++ mptcp_get_default_scheduler(val); |
8040 |
++ |
8041 |
++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos); |
8042 |
++ if (write && ret == 0) |
8043 |
++ ret = mptcp_set_default_scheduler(val); |
8044 |
++ return ret; |
8045 |
++} |
8046 |
++ |
8047 |
++static struct ctl_table mptcp_table[] = { |
8048 |
++ { |
8049 |
++ .procname = "mptcp_enabled", |
8050 |
++ .data = &sysctl_mptcp_enabled, |
8051 |
++ .maxlen = sizeof(int), |
8052 |
++ .mode = 0644, |
8053 |
++ .proc_handler = &proc_dointvec |
8054 |
++ }, |
8055 |
++ { |
8056 |
++ .procname = "mptcp_checksum", |
8057 |
++ .data = &sysctl_mptcp_checksum, |
8058 |
++ .maxlen = sizeof(int), |
8059 |
++ .mode = 0644, |
8060 |
++ .proc_handler = &proc_dointvec |
8061 |
++ }, |
8062 |
++ { |
8063 |
++ .procname = "mptcp_debug", |
8064 |
++ .data = &sysctl_mptcp_debug, |
8065 |
++ .maxlen = sizeof(int), |
8066 |
++ .mode = 0644, |
8067 |
++ .proc_handler = &proc_dointvec |
8068 |
++ }, |
8069 |
++ { |
8070 |
++ .procname = "mptcp_syn_retries", |
8071 |
++ .data = &sysctl_mptcp_syn_retries, |
8072 |
++ .maxlen = sizeof(int), |
8073 |
++ .mode = 0644, |
8074 |
++ .proc_handler = &proc_dointvec |
8075 |
++ }, |
8076 |
++ { |
8077 |
++ .procname = "mptcp_path_manager", |
8078 |
++ .mode = 0644, |
8079 |
++ .maxlen = MPTCP_PM_NAME_MAX, |
8080 |
++ .proc_handler = proc_mptcp_path_manager, |
8081 |
++ }, |
8082 |
++ { |
8083 |
++ .procname = "mptcp_scheduler", |
8084 |
++ .mode = 0644, |
8085 |
++ .maxlen = MPTCP_SCHED_NAME_MAX, |
8086 |
++ .proc_handler = proc_mptcp_scheduler, |
8087 |
++ }, |
8088 |
++ { } |
8089 |
++}; |
8090 |
++ |
8091 |
++static inline u32 mptcp_hash_tk(u32 token) |
8092 |
++{ |
8093 |
++ return token % MPTCP_HASH_SIZE; |
8094 |
++} |
8095 |
++ |
8096 |
++struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE]; |
8097 |
++EXPORT_SYMBOL(tk_hashtable); |
8098 |
++ |
8099 |
++/* This second hashtable is needed to retrieve request socks |
8100 |
++ * created as a result of a join request. While the SYN contains |
8101 |
++ * the token, the final ack does not, so we need a separate hashtable |
8102 |
++ * to retrieve the mpcb. |
8103 |
++ */ |
8104 |
++struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE]; |
8105 |
++spinlock_t mptcp_reqsk_hlock; /* hashtable protection */ |
8106 |
++ |
8107 |
++/* The following hash table is used to avoid collision of token */ |
8108 |
++static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE]; |
8109 |
++spinlock_t mptcp_tk_hashlock; /* hashtable protection */ |
8110 |
++ |
8111 |
++static bool mptcp_reqsk_find_tk(const u32 token) |
8112 |
++{ |
8113 |
++ const u32 hash = mptcp_hash_tk(token); |
8114 |
++ const struct mptcp_request_sock *mtreqsk; |
8115 |
++ const struct hlist_nulls_node *node; |
8116 |
++ |
8117 |
++begin: |
8118 |
++ hlist_nulls_for_each_entry_rcu(mtreqsk, node, |
8119 |
++ &mptcp_reqsk_tk_htb[hash], hash_entry) { |
8120 |
++ if (token == mtreqsk->mptcp_loc_token) |
8121 |
++ return true; |
8122 |
++ } |
8123 |
++ /* A request-socket is destroyed by RCU. So, it might have been recycled |
8124 |
++ * and put into another hash-table list. So, after the lookup we may |
8125 |
++ * end up in a different list. So, we may need to restart. |
8126 |
++ * |
8127 |
++ * See also the comment in __inet_lookup_established. |
8128 |
++ */ |
8129 |
++ if (get_nulls_value(node) != hash) |
8130 |
++ goto begin; |
8131 |
++ return false; |
8132 |
++} |
8133 |
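The restart test above is the standard hlist_nulls / SLAB_DESTROY_BY_RCU idiom: every chain ends in a nulls value that encodes its bucket, and because a request sock can be freed and recycled into another chain while a lockless reader walks the list, finishing on a nulls value of a different bucket means the walk strayed and must be retried.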
++ |
8134 |
++static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, const u32 token) |
8135 |
++{ |
8136 |
++ u32 hash = mptcp_hash_tk(token); |
8137 |
++ |
8138 |
++ hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->hash_entry, |
8139 |
++ &mptcp_reqsk_tk_htb[hash]); |
8140 |
++} |
8141 |
++ |
8142 |
++static void mptcp_reqsk_remove_tk(const struct request_sock *reqsk) |
8143 |
++{ |
8144 |
++ rcu_read_lock(); |
8145 |
++ spin_lock(&mptcp_tk_hashlock); |
8146 |
++ hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->hash_entry); |
8147 |
++ spin_unlock(&mptcp_tk_hashlock); |
8148 |
++ rcu_read_unlock(); |
8149 |
++} |
8150 |
++ |
8151 |
++void mptcp_reqsk_destructor(struct request_sock *req) |
8152 |
++{ |
8153 |
++ if (!mptcp_rsk(req)->is_sub) { |
8154 |
++ if (in_softirq()) { |
8155 |
++ mptcp_reqsk_remove_tk(req); |
8156 |
++ } else { |
8157 |
++ rcu_read_lock_bh(); |
8158 |
++ spin_lock(&mptcp_tk_hashlock); |
8159 |
++ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry); |
8160 |
++ spin_unlock(&mptcp_tk_hashlock); |
8161 |
++ rcu_read_unlock_bh(); |
8162 |
++ } |
8163 |
++ } else { |
8164 |
++ mptcp_hash_request_remove(req); |
8165 |
++ } |
8166 |
++} |
8167 |
++ |
8168 |
++static void __mptcp_hash_insert(struct tcp_sock *meta_tp, const u32 token) |
8169 |
++{ |
8170 |
++ u32 hash = mptcp_hash_tk(token); |
8171 |
++ hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]); |
8172 |
++ meta_tp->inside_tk_table = 1; |
8173 |
++} |
8174 |
++ |
8175 |
++static bool mptcp_find_token(u32 token) |
8176 |
++{ |
8177 |
++ const u32 hash = mptcp_hash_tk(token); |
8178 |
++ const struct tcp_sock *meta_tp; |
8179 |
++ const struct hlist_nulls_node *node; |
8180 |
++ |
8181 |
++begin: |
8182 |
++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) { |
8183 |
++ if (token == meta_tp->mptcp_loc_token) |
8184 |
++ return true; |
8185 |
++ } |
8186 |
++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled |
8187 |
++ * and put into another hash-table list. So, after the lookup we may |
8188 |
++ * end up in a different list. So, we may need to restart. |
8189 |
++ * |
8190 |
++ * See also the comment in __inet_lookup_established. |
8191 |
++ */ |
8192 |
++ if (get_nulls_value(node) != hash) |
8193 |
++ goto begin; |
8194 |
++ return false; |
8195 |
++} |
8196 |
++ |
8197 |
++static void mptcp_set_key_reqsk(struct request_sock *req, |
8198 |
++ const struct sk_buff *skb) |
8199 |
++{ |
8200 |
++ const struct inet_request_sock *ireq = inet_rsk(req); |
8201 |
++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); |
8202 |
++ |
8203 |
++ if (skb->protocol == htons(ETH_P_IP)) { |
8204 |
++ mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr, |
8205 |
++ ip_hdr(skb)->daddr, |
8206 |
++ htons(ireq->ir_num), |
8207 |
++ ireq->ir_rmt_port); |
8208 |
++#if IS_ENABLED(CONFIG_IPV6) |
8209 |
++ } else { |
8210 |
++ mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32, |
8211 |
++ ipv6_hdr(skb)->daddr.s6_addr32, |
8212 |
++ htons(ireq->ir_num), |
8213 |
++ ireq->ir_rmt_port); |
8214 |
++#endif |
8215 |
++ } |
8216 |
++ |
8217 |
++ mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL); |
8218 |
++} |
8219 |
++ |
8220 |
++/* New MPTCP-connection request, prepare a new token for the meta-socket that |
8221 |
++ * will be created in mptcp_check_req_master(), and store the received token. |
8222 |
++ */ |
8223 |
++void mptcp_reqsk_new_mptcp(struct request_sock *req, |
8224 |
++ const struct mptcp_options_received *mopt, |
8225 |
++ const struct sk_buff *skb) |
8226 |
++{ |
8227 |
++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); |
8228 |
++ |
8229 |
++ inet_rsk(req)->saw_mpc = 1; |
8230 |
++ |
8231 |
++ rcu_read_lock(); |
8232 |
++ spin_lock(&mptcp_tk_hashlock); |
8233 |
++ do { |
8234 |
++ mptcp_set_key_reqsk(req, skb); |
8235 |
++ } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) || |
8236 |
++ mptcp_find_token(mtreq->mptcp_loc_token)); |
8237 |
++ |
8238 |
++ mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token); |
8239 |
++ spin_unlock(&mptcp_tk_hashlock); |
8240 |
++ rcu_read_unlock(); |
8241 |
++ mtreq->mptcp_rem_key = mopt->mptcp_key; |
8242 |
++} |
8243 |
++ |
8244 |
++static void mptcp_set_key_sk(const struct sock *sk) |
8245 |
++{ |
8246 |
++ struct tcp_sock *tp = tcp_sk(sk); |
8247 |
++ const struct inet_sock *isk = inet_sk(sk); |
8248 |
++ |
8249 |
++ if (sk->sk_family == AF_INET) |
8250 |
++ tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr, |
8251 |
++ isk->inet_daddr, |
8252 |
++ isk->inet_sport, |
8253 |
++ isk->inet_dport); |
8254 |
++#if IS_ENABLED(CONFIG_IPV6) |
8255 |
++ else |
8256 |
++ tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32, |
8257 |
++ sk->sk_v6_daddr.s6_addr32, |
8258 |
++ isk->inet_sport, |
8259 |
++ isk->inet_dport); |
8260 |
++#endif |
8261 |
++ |
8262 |
++ mptcp_key_sha1(tp->mptcp_loc_key, |
8263 |
++ &tp->mptcp_loc_token, NULL); |
8264 |
++} |
8265 |
++ |
8266 |
++void mptcp_connect_init(struct sock *sk) |
8267 |
++{ |
8268 |
++ struct tcp_sock *tp = tcp_sk(sk); |
8269 |
++ |
8270 |
++ rcu_read_lock_bh(); |
8271 |
++ spin_lock(&mptcp_tk_hashlock); |
8272 |
++ do { |
8273 |
++ mptcp_set_key_sk(sk); |
8274 |
++ } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) || |
8275 |
++ mptcp_find_token(tp->mptcp_loc_token)); |
8276 |
++ |
8277 |
++ __mptcp_hash_insert(tp, tp->mptcp_loc_token); |
8278 |
++ spin_unlock(&mptcp_tk_hashlock); |
8279 |
++ rcu_read_unlock_bh(); |
8280 |
++} |
8281 |
++ |
8282 |
++/** |
8283 |
++ * This function increments the refcount of the mpcb struct. |
8284 |
++ * It is the responsibility of the caller to decrement when releasing |
8285 |
++ * the structure. |
8286 |
++ */ |
8287 |
++struct sock *mptcp_hash_find(const struct net *net, const u32 token) |
8288 |
++{ |
8289 |
++ const u32 hash = mptcp_hash_tk(token); |
8290 |
++ const struct tcp_sock *meta_tp; |
8291 |
++ struct sock *meta_sk = NULL; |
8292 |
++ const struct hlist_nulls_node *node; |
8293 |
++ |
8294 |
++ rcu_read_lock(); |
8295 |
++begin: |
8296 |
++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], |
8297 |
++ tk_table) { |
8298 |
++ meta_sk = (struct sock *)meta_tp; |
8299 |
++ if (token == meta_tp->mptcp_loc_token && |
8300 |
++ net_eq(net, sock_net(meta_sk))) { |
8301 |
++ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt))) |
8302 |
++ goto out; |
8303 |
++ if (unlikely(token != meta_tp->mptcp_loc_token || |
8304 |
++ !net_eq(net, sock_net(meta_sk)))) { |
8305 |
++ sock_gen_put(meta_sk); |
8306 |
++ goto begin; |
8307 |
++ } |
8308 |
++ goto found; |
8309 |
++ } |
8310 |
++ } |
8311 |
++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled |
8312 |
++ * and put into another hash-table list. So, after the lookup we may |
8313 |
++ * end up in a different list. So, we may need to restart. |
8314 |
++ * |
8315 |
++ * See also the comment in __inet_lookup_established. |
8316 |
++ */ |
8317 |
++ if (get_nulls_value(node) != hash) |
8318 |
++ goto begin; |
8319 |
++out: |
8320 |
++ meta_sk = NULL; |
8321 |
++found: |
8322 |
++ rcu_read_unlock(); |
8323 |
++ return meta_sk; |
8324 |
++} |
8325 |
++ |
8326 |
++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) |
8327 |
++{ |
8328 |
++ /* remove from the token hashtable */ |
8329 |
++ rcu_read_lock_bh(); |
8330 |
++ spin_lock(&mptcp_tk_hashlock); |
8331 |
++ hlist_nulls_del_init_rcu(&meta_tp->tk_table); |
8332 |
++ meta_tp->inside_tk_table = 0; |
8333 |
++ spin_unlock(&mptcp_tk_hashlock); |
8334 |
++ rcu_read_unlock_bh(); |
8335 |
++} |
8336 |
++ |
8337 |
++void mptcp_hash_remove(struct tcp_sock *meta_tp) |
8338 |
++{ |
8339 |
++ rcu_read_lock(); |
8340 |
++ spin_lock(&mptcp_tk_hashlock); |
8341 |
++ hlist_nulls_del_init_rcu(&meta_tp->tk_table); |
8342 |
++ meta_tp->inside_tk_table = 0; |
8343 |
++ spin_unlock(&mptcp_tk_hashlock); |
8344 |
++ rcu_read_unlock(); |
8345 |
++} |
8346 |
++ |
8347 |
++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk) |
8348 |
++{ |
8349 |
++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
8350 |
++ struct sock *sk, *rttsk = NULL, *lastsk = NULL; |
8351 |
++ u32 min_time = 0, last_active = 0; |
8352 |
++ |
8353 |
++ mptcp_for_each_sk(meta_tp->mpcb, sk) { |
8354 |
++ struct tcp_sock *tp = tcp_sk(sk); |
8355 |
++ u32 elapsed; |
8356 |
++ |
8357 |
++ if (!mptcp_sk_can_send_ack(sk) || tp->pf) |
8358 |
++ continue; |
8359 |
++ |
8360 |
++ elapsed = keepalive_time_elapsed(tp); |
8361 |
++ |
8362 |
++ /* We take the one with the lowest RTT within a reasonable |
8363 |
++ * (meta-RTO)-timeframe |
8364 |
++ */ |
8365 |
++ if (elapsed < inet_csk(meta_sk)->icsk_rto) { |
8366 |
++ if (!min_time || tp->srtt_us < min_time) { |
8367 |
++ min_time = tp->srtt_us; |
8368 |
++ rttsk = sk; |
8369 |
++ } |
8370 |
++ continue; |
8371 |
++ } |
8372 |
++ |
8373 |
++ /* Otherwise, we just take the most recent active */ |
8374 |
++ if (!rttsk && (!last_active || elapsed < last_active)) { |
8375 |
++ last_active = elapsed; |
8376 |
++ lastsk = sk; |
8377 |
++ } |
8378 |
++ } |
8379 |
++ |
8380 |
++ if (rttsk) |
8381 |
++ return rttsk; |
8382 |
++ |
8383 |
++ return lastsk; |
8384 |
++} |
8385 |
++EXPORT_SYMBOL(mptcp_select_ack_sock); |
8386 |
++ |
8387 |
++static void mptcp_sock_def_error_report(struct sock *sk) |
8388 |
++{ |
8389 |
++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; |
8390 |
++ |
8391 |
++ if (!sock_flag(sk, SOCK_DEAD)) |
8392 |
++ mptcp_sub_close(sk, 0); |
8393 |
++ |
8394 |
++ if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd || |
8395 |
++ mpcb->send_infinite_mapping) { |
8396 |
++ struct sock *meta_sk = mptcp_meta_sk(sk); |
8397 |
++ |
8398 |
++ meta_sk->sk_err = sk->sk_err; |
8399 |
++ meta_sk->sk_err_soft = sk->sk_err_soft; |
8400 |
++ |
8401 |
++ if (!sock_flag(meta_sk, SOCK_DEAD)) |
8402 |
++ meta_sk->sk_error_report(meta_sk); |
8403 |
++ |
8404 |
++ tcp_done(meta_sk); |
8405 |
++ } |
8406 |
++ |
8407 |
++ sk->sk_err = 0; |
8408 |
++ return; |
8409 |
++} |
8410 |
++ |
8411 |
++static void mptcp_mpcb_put(struct mptcp_cb *mpcb) |
8412 |
++{ |
8413 |
++ if (atomic_dec_and_test(&mpcb->mpcb_refcnt)) { |
8414 |
++ mptcp_cleanup_path_manager(mpcb); |
8415 |
++ mptcp_cleanup_scheduler(mpcb); |
8416 |
++ kmem_cache_free(mptcp_cb_cache, mpcb); |
8417 |
++ } |
8418 |
++} |
8419 |
++ |
8420 |
++static void mptcp_sock_destruct(struct sock *sk) |
8421 |
++{ |
8422 |
++ struct tcp_sock *tp = tcp_sk(sk); |
8423 |
++ |
8424 |
++ inet_sock_destruct(sk); |
8425 |
++ |
8426 |
++ if (!is_meta_sk(sk) && !tp->was_meta_sk) { |
8427 |
++ BUG_ON(!hlist_unhashed(&tp->mptcp->cb_list)); |
8428 |
++ |
8429 |
++ kmem_cache_free(mptcp_sock_cache, tp->mptcp); |
8430 |
++ tp->mptcp = NULL; |
8431 |
++ |
8432 |
++ /* Taken when mpcb pointer was set */ |
8433 |
++ sock_put(mptcp_meta_sk(sk)); |
8434 |
++ mptcp_mpcb_put(tp->mpcb); |
8435 |
++ } else { |
8436 |
++ struct mptcp_cb *mpcb = tp->mpcb; |
8437 |
++ struct mptcp_tw *mptw; |
8438 |
++ |
8439 |
++ /* The mpcb is disappearing - we can make the final |
8440 |
++ * update to the rcv_nxt of the time-wait-sock and remove |
8441 |
++ * its reference to the mpcb. |
8442 |
++ */ |
8443 |
++ spin_lock_bh(&mpcb->tw_lock); |
8444 |
++ list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) { |
8445 |
++ list_del_rcu(&mptw->list); |
8446 |
++ mptw->in_list = 0; |
8447 |
++ mptcp_mpcb_put(mpcb); |
8448 |
++ rcu_assign_pointer(mptw->mpcb, NULL); |
8449 |
++ } |
8450 |
++ spin_unlock_bh(&mpcb->tw_lock); |
8451 |
++ |
8452 |
++ mptcp_mpcb_put(mpcb); |
8453 |
++ |
8454 |
++ mptcp_debug("%s destroying meta-sk\n", __func__); |
8455 |
++ } |
8456 |
++ |
8457 |
++ WARN_ON(!static_key_false(&mptcp_static_key)); |
8458 |
++ /* Must be the last call, because is_meta_sk() above still needs the |
8459 |
++ * static key |
8460 |
++ */ |
8461 |
++ static_key_slow_dec(&mptcp_static_key); |
8462 |
++} |
8463 |
++ |
8464 |
++void mptcp_destroy_sock(struct sock *sk) |
8465 |
++{ |
8466 |
++ if (is_meta_sk(sk)) { |
8467 |
++ struct sock *sk_it, *tmpsk; |
8468 |
++ |
8469 |
++ __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue); |
8470 |
++ mptcp_purge_ofo_queue(tcp_sk(sk)); |
8471 |
++ |
8472 |
++ /* We have to close all remaining subflows. Normally, they |
8473 |
++ * should all be about to get closed. But, if the kernel is |
8474 |
++ * forcing a closure (e.g., tcp_write_err), the subflows might |
8475 |
++ * not have been closed properly (as we are waiting for the |
8476 |
++ * DATA_ACK of the DATA_FIN). |
8477 |
++ */ |
8478 |
++ mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) { |
8479 |
++ /* Already did call tcp_close - waiting for graceful |
8480 |
++ * closure, or if we are retransmitting fast-close on |
8481 |
++ * the subflow. The reset (or timeout) will kill the |
8482 |
++			 * subflow.
8483 |
++ */ |
8484 |
++ if (tcp_sk(sk_it)->closing || |
8485 |
++ tcp_sk(sk_it)->send_mp_fclose) |
8486 |
++ continue; |
8487 |
++ |
8488 |
++ /* Allow the delayed work first to prevent time-wait state */ |
8489 |
++ if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work)) |
8490 |
++ continue; |
8491 |
++ |
8492 |
++ mptcp_sub_close(sk_it, 0); |
8493 |
++ } |
8494 |
++ |
8495 |
++ mptcp_delete_synack_timer(sk); |
8496 |
++ } else { |
8497 |
++ mptcp_del_sock(sk); |
8498 |
++ } |
8499 |
++} |
8500 |
++ |
8501 |
++static void mptcp_set_state(struct sock *sk) |
8502 |
++{ |
8503 |
++ struct sock *meta_sk = mptcp_meta_sk(sk); |
8504 |
++ |
8505 |
++ /* Meta is not yet established - wake up the application */ |
8506 |
++ if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) && |
8507 |
++ sk->sk_state == TCP_ESTABLISHED) { |
8508 |
++ tcp_set_state(meta_sk, TCP_ESTABLISHED); |
8509 |
++ |
8510 |
++ if (!sock_flag(meta_sk, SOCK_DEAD)) { |
8511 |
++ meta_sk->sk_state_change(meta_sk); |
8512 |
++ sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT); |
8513 |
++ } |
8514 |
++ } |
8515 |
++ |
8516 |
++ if (sk->sk_state == TCP_ESTABLISHED) { |
8517 |
++ tcp_sk(sk)->mptcp->establish_increased = 1; |
8518 |
++ tcp_sk(sk)->mpcb->cnt_established++; |
8519 |
++ } |
8520 |
++} |
8521 |
++ |
8522 |
++void mptcp_init_congestion_control(struct sock *sk) |
8523 |
++{ |
8524 |
++ struct inet_connection_sock *icsk = inet_csk(sk); |
8525 |
++ struct inet_connection_sock *meta_icsk = inet_csk(mptcp_meta_sk(sk)); |
8526 |
++ const struct tcp_congestion_ops *ca = meta_icsk->icsk_ca_ops; |
8527 |
++ |
8528 |
++	/* The application didn't set the congestion control to use;
8529 |
++	 * fall back to the default one.
8530 |
++ */ |
8531 |
++ if (ca == &tcp_init_congestion_ops) |
8532 |
++ goto use_default; |
8533 |
++ |
8534 |
++ /* Use the same congestion control as set by the user. If the |
8535 |
++ * module is not available fallback to the default one. |
8536 |
++ */ |
8537 |
++ if (!try_module_get(ca->owner)) { |
8538 |
++ pr_warn("%s: fallback to the system default CC\n", __func__); |
8539 |
++ goto use_default; |
8540 |
++ } |
8541 |
++ |
8542 |
++ icsk->icsk_ca_ops = ca; |
8543 |
++ if (icsk->icsk_ca_ops->init) |
8544 |
++ icsk->icsk_ca_ops->init(sk); |
8545 |
++ |
8546 |
++ return; |
8547 |
++ |
8548 |
++use_default: |
8549 |
++ icsk->icsk_ca_ops = &tcp_init_congestion_ops; |
8550 |
++ tcp_init_congestion_control(sk); |
8551 |
++} |
8552 |
++ |
8553 |
++u32 mptcp_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; |
8554 |
++u32 mptcp_seed = 0; |
8555 |
++ |
8556 |
++void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn) |
8557 |
++{ |
8558 |
++ u32 workspace[SHA_WORKSPACE_WORDS]; |
8559 |
++ u32 mptcp_hashed_key[SHA_DIGEST_WORDS]; |
8560 |
++ u8 input[64]; |
8561 |
++ int i; |
8562 |
++ |
8563 |
++ memset(workspace, 0, sizeof(workspace)); |
8564 |
++ |
8565 |
++ /* Initialize input with appropriate padding */ |
8566 |
++ memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte |
8567 |
++ * is explicitly set too |
8568 |
++ */ |
8569 |
++ memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */ |
8570 |
++ input[8] = 0x80; /* Padding: First bit after message = 1 */ |
8571 |
++ input[63] = 0x40; /* Padding: Length of the message = 64 bits */ |
8572 |
++ |
8573 |
++ sha_init(mptcp_hashed_key); |
8574 |
++ sha_transform(mptcp_hashed_key, input, workspace); |
8575 |
++ |
8576 |
++ for (i = 0; i < 5; i++) |
8577 |
++ mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]); |
8578 |
++ |
8579 |
++ if (token) |
8580 |
++ *token = mptcp_hashed_key[0]; |
8581 |
++ if (idsn) |
8582 |
++ *idsn = *((u64 *)&mptcp_hashed_key[3]); |
8583 |
++} |
8584 |
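For reference: per RFC 6824 the connection token is the most-significant 32 bits |
of SHA-1(key) and the IDSN the least-significant 64 bits - exactly what the |
function above extracts from words 0 and 3-4 of the digest. A minimal user-space |
sketch of the same derivation, assuming a hypothetical sha1() helper (any SHA-1 |
library would do): |
 |
#include <stddef.h> |
#include <stdint.h> |
#include <string.h> |
 |
/* Hypothetical helper: writes the 20-byte SHA-1 digest of buf[0..len) |
 * into out, e.g. a thin wrapper around a crypto library. |
 */ |
extern void sha1(const void *buf, size_t len, uint8_t out[20]); |
 |
static void key_to_token_idsn(uint64_t key, uint32_t *token, uint64_t *idsn) |
{ |
	uint8_t digest[20]; |
 |
	sha1(&key, sizeof(key), digest); |
	memcpy(token, &digest[0], sizeof(*token)); /* first 32 bits */ |
	memcpy(idsn, &digest[12], sizeof(*idsn));  /* last 64 bits */ |
} |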
++ |
8585 |
++void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2, |
8586 |
++ u32 *hash_out) |
8587 |
++{ |
8588 |
++ u32 workspace[SHA_WORKSPACE_WORDS]; |
8589 |
++ u8 input[128]; /* 2 512-bit blocks */ |
8590 |
++ int i; |
8591 |
++ |
8592 |
++ memset(workspace, 0, sizeof(workspace)); |
8593 |
++ |
8594 |
++ /* Generate key xored with ipad */ |
8595 |
++ memset(input, 0x36, 64); |
8596 |
++ for (i = 0; i < 8; i++) |
8597 |
++ input[i] ^= key_1[i]; |
8598 |
++ for (i = 0; i < 8; i++) |
8599 |
++ input[i + 8] ^= key_2[i]; |
8600 |
++ |
8601 |
++ memcpy(&input[64], rand_1, 4); |
8602 |
++ memcpy(&input[68], rand_2, 4); |
8603 |
++ input[72] = 0x80; /* Padding: First bit after message = 1 */ |
8604 |
++ memset(&input[73], 0, 53); |
8605 |
++ |
8606 |
++ /* Padding: Length of the message = 512 + 64 bits */ |
8607 |
++ input[126] = 0x02; |
8608 |
++ input[127] = 0x40; |
8609 |
++ |
8610 |
++ sha_init(hash_out); |
8611 |
++ sha_transform(hash_out, input, workspace); |
8612 |
++ memset(workspace, 0, sizeof(workspace)); |
8613 |
++ |
8614 |
++ sha_transform(hash_out, &input[64], workspace); |
8615 |
++ memset(workspace, 0, sizeof(workspace)); |
8616 |
++ |
8617 |
++ for (i = 0; i < 5; i++) |
8618 |
++ hash_out[i] = cpu_to_be32(hash_out[i]); |
8619 |
++ |
8620 |
++ /* Prepare second part of hmac */ |
8621 |
++ memset(input, 0x5C, 64); |
8622 |
++ for (i = 0; i < 8; i++) |
8623 |
++ input[i] ^= key_1[i]; |
8624 |
++ for (i = 0; i < 8; i++) |
8625 |
++ input[i + 8] ^= key_2[i]; |
8626 |
++ |
8627 |
++ memcpy(&input[64], hash_out, 20); |
8628 |
++ input[84] = 0x80; |
8629 |
++ memset(&input[85], 0, 41); |
8630 |
++ |
8631 |
++ /* Padding: Length of the message = 512 + 160 bits */ |
8632 |
++ input[126] = 0x02; |
8633 |
++ input[127] = 0xA0; |
8634 |
++ |
8635 |
++ sha_init(hash_out); |
8636 |
++ sha_transform(hash_out, input, workspace); |
8637 |
++ memset(workspace, 0, sizeof(workspace)); |
8638 |
++ |
8639 |
++ sha_transform(hash_out, &input[64], workspace); |
8640 |
++ |
8641 |
++ for (i = 0; i < 5; i++) |
8642 |
++ hash_out[i] = cpu_to_be32(hash_out[i]); |
8643 |
++} |
8644 |
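The two sha_transform passes above hand-roll HMAC-SHA1 (RFC 2104) for fixed-size |
inputs: with K the 16-byte key_1 || key_2 and ipad/opad the usual 0x36/0x5C pads, |
 |
	inner = SHA-1((K ^ ipad) || rand_1 || rand_2) |
	hmac  = SHA-1((K ^ opad) || inner) |
 |
The length fields are filled in by hand (512+64 and 512+160 bits) because each |
input occupies exactly two 512-bit blocks, so the generic padding code can be |
skipped. |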
++ |
8645 |
++static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk) |
8646 |
++{ |
8647 |
++ /* Socket-options handled by sk_clone_lock while creating the meta-sk. |
8648 |
++ * ====== |
8649 |
++ * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT, |
8650 |
++ * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER, |
8651 |
++ * TCP_NODELAY, TCP_CORK |
8652 |
++ * |
8653 |
++ * Socket-options handled in this function here |
8654 |
++ * ====== |
8655 |
++ * TCP_DEFER_ACCEPT |
8656 |
++ * SO_KEEPALIVE |
8657 |
++ * |
8658 |
++ * Socket-options on the todo-list |
8659 |
++ * ====== |
8660 |
++ * SO_BINDTODEVICE - should probably prevent creation of new subsocks |
8661 |
++ * across other devices. - what about the api-draft? |
8662 |
++ * SO_DEBUG |
8663 |
++ * SO_REUSEADDR - probably we don't care about this |
8664 |
++ * SO_DONTROUTE, SO_BROADCAST |
8665 |
++ * SO_OOBINLINE |
8666 |
++ * SO_LINGER |
8667 |
++ * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM |
8668 |
++ * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM |
8669 |
++ * SO_RXQ_OVFL |
8670 |
++ * TCP_COOKIE_TRANSACTIONS |
8671 |
++ * TCP_MAXSEG |
8672 |
++ * TCP_THIN_* - Handled by sk_clone_lock, but we need to support this |
8673 |
++ * in mptcp_retransmit_timer. AND we need to check what is |
8674 |
++ * about the subsockets. |
8675 |
++ * TCP_LINGER2 |
8676 |
++ * TCP_WINDOW_CLAMP |
8677 |
++ * TCP_USER_TIMEOUT |
8678 |
++ * TCP_MD5SIG |
8679 |
++ * |
8680 |
++ * Socket-options of no concern for the meta-socket (but for the subsocket) |
8681 |
++ * ====== |
8682 |
++ * SO_PRIORITY |
8683 |
++ * SO_MARK |
8684 |
++ * TCP_CONGESTION |
8685 |
++ * TCP_SYNCNT |
8686 |
++ * TCP_QUICKACK |
8687 |
++ */ |
8688 |
++ |
8689 |
++ /* DEFER_ACCEPT should not be set on the meta, as we want to accept new subflows directly */ |
8690 |
++ inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0; |
8691 |
++ |
8692 |
++ /* Keepalives are handled entirely at the MPTCP-layer */ |
8693 |
++ if (sock_flag(meta_sk, SOCK_KEEPOPEN)) { |
8694 |
++ inet_csk_reset_keepalive_timer(meta_sk, |
8695 |
++ keepalive_time_when(tcp_sk(meta_sk))); |
8696 |
++ sock_reset_flag(master_sk, SOCK_KEEPOPEN); |
8697 |
++ inet_csk_delete_keepalive_timer(master_sk); |
8698 |
++ } |
8699 |
++ |
8700 |
++ /* Do not propagate subflow-errors up to the MPTCP-layer */ |
8701 |
++ inet_sk(master_sk)->recverr = 0; |
8702 |
++} |
8703 |
++ |
8704 |
++static void mptcp_sub_inherit_sockopts(const struct sock *meta_sk, struct sock *sub_sk) |
8705 |
++{ |
8706 |
++ /* IP_TOS also goes to the subflow. */ |
8707 |
++ if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) { |
8708 |
++ inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos; |
8709 |
++ sub_sk->sk_priority = meta_sk->sk_priority; |
8710 |
++ sk_dst_reset(sub_sk); |
8711 |
++ } |
8712 |
++ |
8713 |
++ /* Inherit SO_REUSEADDR */ |
8714 |
++ sub_sk->sk_reuse = meta_sk->sk_reuse; |
8715 |
++ |
8716 |
++ /* Inherit snd/rcv-buffer locks */ |
8717 |
++ sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; |
8718 |
++ |
8719 |
++ /* Nagle/Cork is forced off on the subflows. It is handled at the meta-layer */ |
8720 |
++ tcp_sk(sub_sk)->nonagle = TCP_NAGLE_OFF|TCP_NAGLE_PUSH; |
8721 |
++ |
8722 |
++ /* Keepalives are handled entirely at the MPTCP-layer */ |
8723 |
++ if (sock_flag(sub_sk, SOCK_KEEPOPEN)) { |
8724 |
++ sock_reset_flag(sub_sk, SOCK_KEEPOPEN); |
8725 |
++ inet_csk_delete_keepalive_timer(sub_sk); |
8726 |
++ } |
8727 |
++ |
8728 |
++ /* Do not propagate subflow-errors up to the MPTCP-layer */ |
8729 |
++ inet_sk(sub_sk)->recverr = 0; |
8730 |
++} |
8731 |
++ |
8732 |
++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb) |
8733 |
++{ |
8734 |
++ /* skb->sk may be NULL if we receive a packet immediately after the |
8735 |
++ * SYN/ACK + MP_CAPABLE. |
8736 |
++ */ |
8737 |
++ struct sock *sk = skb->sk ? skb->sk : meta_sk; |
8738 |
++ int ret = 0; |
8739 |
++ |
8740 |
++ skb->sk = NULL; |
8741 |
++ |
8742 |
++ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { |
8743 |
++ kfree_skb(skb); |
8744 |
++ return 0; |
8745 |
++ } |
8746 |
++ |
8747 |
++ if (sk->sk_family == AF_INET) |
8748 |
++ ret = tcp_v4_do_rcv(sk, skb); |
8749 |
++#if IS_ENABLED(CONFIG_IPV6) |
8750 |
++ else |
8751 |
++ ret = tcp_v6_do_rcv(sk, skb); |
8752 |
++#endif |
8753 |
++ |
8754 |
++ sock_put(sk); |
8755 |
++ return ret; |
8756 |
++} |
8757 |
++ |
8758 |
++struct lock_class_key meta_key; |
8759 |
++struct lock_class_key meta_slock_key; |
8760 |
++ |
8761 |
++static void mptcp_synack_timer_handler(unsigned long data) |
8762 |
++{ |
8763 |
++ struct sock *meta_sk = (struct sock *) data; |
8764 |
++ struct listen_sock *lopt = inet_csk(meta_sk)->icsk_accept_queue.listen_opt; |
8765 |
++ |
8766 |
++ /* Only process if socket is not in use. */ |
8767 |
++ bh_lock_sock(meta_sk); |
8768 |
++ |
8769 |
++ if (sock_owned_by_user(meta_sk)) { |
8770 |
++ /* Try again later. */ |
8771 |
++ mptcp_reset_synack_timer(meta_sk, HZ/20); |
8772 |
++ goto out; |
8773 |
++ } |
8774 |
++ |
8775 |
++ /* May happen if the queue got destructed in mptcp_close */ |
8776 |
++ if (!lopt) |
8777 |
++ goto out; |
8778 |
++ |
8779 |
++ inet_csk_reqsk_queue_prune(meta_sk, TCP_SYNQ_INTERVAL, |
8780 |
++ TCP_TIMEOUT_INIT, TCP_RTO_MAX); |
8781 |
++ |
8782 |
++ if (lopt->qlen) |
8783 |
++ mptcp_reset_synack_timer(meta_sk, TCP_SYNQ_INTERVAL); |
8784 |
++ |
8785 |
++out: |
8786 |
++ bh_unlock_sock(meta_sk); |
8787 |
++ sock_put(meta_sk); |
8788 |
++} |
8789 |
++ |
8790 |
++static const struct tcp_sock_ops mptcp_meta_specific = { |
8791 |
++ .__select_window = __mptcp_select_window, |
8792 |
++ .select_window = mptcp_select_window, |
8793 |
++ .select_initial_window = mptcp_select_initial_window, |
8794 |
++ .init_buffer_space = mptcp_init_buffer_space, |
8795 |
++ .set_rto = mptcp_tcp_set_rto, |
8796 |
++ .should_expand_sndbuf = mptcp_should_expand_sndbuf, |
8797 |
++ .init_congestion_control = mptcp_init_congestion_control, |
8798 |
++ .send_fin = mptcp_send_fin, |
8799 |
++ .write_xmit = mptcp_write_xmit, |
8800 |
++ .send_active_reset = mptcp_send_active_reset, |
8801 |
++ .write_wakeup = mptcp_write_wakeup, |
8802 |
++ .prune_ofo_queue = mptcp_prune_ofo_queue, |
8803 |
++ .retransmit_timer = mptcp_retransmit_timer, |
8804 |
++ .time_wait = mptcp_time_wait, |
8805 |
++ .cleanup_rbuf = mptcp_cleanup_rbuf, |
8806 |
++}; |
8807 |
++ |
8808 |
++static const struct tcp_sock_ops mptcp_sub_specific = { |
8809 |
++ .__select_window = __mptcp_select_window, |
8810 |
++ .select_window = mptcp_select_window, |
8811 |
++ .select_initial_window = mptcp_select_initial_window, |
8812 |
++ .init_buffer_space = mptcp_init_buffer_space, |
8813 |
++ .set_rto = mptcp_tcp_set_rto, |
8814 |
++ .should_expand_sndbuf = mptcp_should_expand_sndbuf, |
8815 |
++ .init_congestion_control = mptcp_init_congestion_control, |
8816 |
++ .send_fin = tcp_send_fin, |
8817 |
++ .write_xmit = tcp_write_xmit, |
8818 |
++ .send_active_reset = tcp_send_active_reset, |
8819 |
++ .write_wakeup = tcp_write_wakeup, |
8820 |
++ .prune_ofo_queue = tcp_prune_ofo_queue, |
8821 |
++ .retransmit_timer = tcp_retransmit_timer, |
8822 |
++ .time_wait = tcp_time_wait, |
8823 |
++ .cleanup_rbuf = tcp_cleanup_rbuf, |
8824 |
++}; |
8825 |
++ |
8826 |
++static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, u32 window) |
8827 |
++{ |
8828 |
++ struct mptcp_cb *mpcb; |
8829 |
++ struct sock *master_sk; |
8830 |
++ struct inet_connection_sock *master_icsk, *meta_icsk = inet_csk(meta_sk); |
8831 |
++ struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk); |
8832 |
++ u64 idsn; |
8833 |
++ |
8834 |
++ dst_release(meta_sk->sk_rx_dst); |
8835 |
++ meta_sk->sk_rx_dst = NULL; |
8836 |
++ /* This flag is set to announce sock_lock_init to |
8837 |
++ * reclassify the lock-class of the master socket. |
8838 |
++ */ |
8839 |
++ meta_tp->is_master_sk = 1; |
8840 |
++ master_sk = sk_clone_lock(meta_sk, GFP_ATOMIC | __GFP_ZERO); |
8841 |
++ meta_tp->is_master_sk = 0; |
8842 |
++ if (!master_sk) |
8843 |
++ return -ENOBUFS; |
8844 |
++ |
8845 |
++ master_tp = tcp_sk(master_sk); |
8846 |
++ master_icsk = inet_csk(master_sk); |
8847 |
++ |
8848 |
++ mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC); |
8849 |
++ if (!mpcb) { |
8850 |
++ /* sk_free (and __sk_free) requires wmem_alloc to be 1. |
8851 |
++ * All the rest is set to 0 thanks to __GFP_ZERO above. |
8852 |
++ */ |
8853 |
++ atomic_set(&master_sk->sk_wmem_alloc, 1); |
8854 |
++ sk_free(master_sk); |
8855 |
++ return -ENOBUFS; |
8856 |
++ } |
8857 |
++ |
8858 |
++#if IS_ENABLED(CONFIG_IPV6) |
8859 |
++ if (meta_icsk->icsk_af_ops == &mptcp_v6_mapped) { |
8860 |
++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk); |
8861 |
++ |
8862 |
++ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6; |
8863 |
++ |
8864 |
++ newnp = inet6_sk(master_sk); |
8865 |
++ memcpy(newnp, np, sizeof(struct ipv6_pinfo)); |
8866 |
++ |
8867 |
++ newnp->ipv6_mc_list = NULL; |
8868 |
++ newnp->ipv6_ac_list = NULL; |
8869 |
++ newnp->ipv6_fl_list = NULL; |
8870 |
++ newnp->opt = NULL; |
8871 |
++ newnp->pktoptions = NULL; |
8872 |
++ (void)xchg(&newnp->rxpmtu, NULL); |
8873 |
++ } else if (meta_sk->sk_family == AF_INET6) { |
8874 |
++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk); |
8875 |
++ |
8876 |
++ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6; |
8877 |
++ |
8878 |
++ newnp = inet6_sk(master_sk); |
8879 |
++ memcpy(newnp, np, sizeof(struct ipv6_pinfo)); |
8880 |
++ |
8881 |
++ newnp->hop_limit = -1; |
8882 |
++ newnp->mcast_hops = IPV6_DEFAULT_MCASTHOPS; |
8883 |
++ newnp->mc_loop = 1; |
8884 |
++ newnp->pmtudisc = IPV6_PMTUDISC_WANT; |
8885 |
++ newnp->ipv6only = sock_net(master_sk)->ipv6.sysctl.bindv6only; |
8886 |
++ } |
8887 |
++#endif |
8888 |
++ |
8889 |
++ meta_tp->mptcp = NULL; |
8890 |
++ |
8891 |
++ /* Store the keys and generate the peer's token */ |
8892 |
++ mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key; |
8893 |
++ mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token; |
8894 |
++ |
8895 |
++ /* Generate Initial data-sequence-numbers */ |
8896 |
++ mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn); |
8897 |
++ idsn = ntohll(idsn) + 1; |
8898 |
++ mpcb->snd_high_order[0] = idsn >> 32; |
8899 |
++ mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1; |
8900 |
++ |
8901 |
++ meta_tp->write_seq = (u32)idsn; |
8902 |
++ meta_tp->snd_sml = meta_tp->write_seq; |
8903 |
++ meta_tp->snd_una = meta_tp->write_seq; |
8904 |
++ meta_tp->snd_nxt = meta_tp->write_seq; |
8905 |
++ meta_tp->pushed_seq = meta_tp->write_seq; |
8906 |
++ meta_tp->snd_up = meta_tp->write_seq; |
8907 |
++ |
8908 |
++ mpcb->mptcp_rem_key = remote_key; |
8909 |
++ mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn); |
8910 |
++ idsn = ntohll(idsn) + 1; |
8911 |
++ mpcb->rcv_high_order[0] = idsn >> 32; |
8912 |
++ mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1; |
8913 |
++ meta_tp->copied_seq = (u32) idsn; |
8914 |
++ meta_tp->rcv_nxt = (u32) idsn; |
8915 |
++ meta_tp->rcv_wup = (u32) idsn; |
8916 |
++ |
8917 |
++ meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1; |
8918 |
++ meta_tp->snd_wnd = window; |
8919 |
++ meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */ |
8920 |
++ |
8921 |
++ meta_tp->packets_out = 0; |
8922 |
++ meta_icsk->icsk_probes_out = 0; |
8923 |
++ |
8924 |
++ /* Set mptcp-pointers */ |
8925 |
++ master_tp->mpcb = mpcb; |
8926 |
++ master_tp->meta_sk = meta_sk; |
8927 |
++ meta_tp->mpcb = mpcb; |
8928 |
++ meta_tp->meta_sk = meta_sk; |
8929 |
++ mpcb->meta_sk = meta_sk; |
8930 |
++ mpcb->master_sk = master_sk; |
8931 |
++ |
8932 |
++ meta_tp->was_meta_sk = 0; |
8933 |
++ |
8934 |
++ /* Initialize the queues */ |
8935 |
++ skb_queue_head_init(&mpcb->reinject_queue); |
8936 |
++ skb_queue_head_init(&master_tp->out_of_order_queue); |
8937 |
++ tcp_prequeue_init(master_tp); |
8938 |
++ INIT_LIST_HEAD(&master_tp->tsq_node); |
8939 |
++ |
8940 |
++ master_tp->tsq_flags = 0; |
8941 |
++ |
8942 |
++ mutex_init(&mpcb->mpcb_mutex); |
8943 |
++ |
8944 |
++ /* Init the accept_queue structure, we support a queue of 32 pending |
8945 |
++ * connections, it does not need to be huge, since we only store here |
8946 |
++ * pending subflow creations. |
8947 |
++ */ |
8948 |
++ if (reqsk_queue_alloc(&meta_icsk->icsk_accept_queue, 32, GFP_ATOMIC)) { |
8949 |
++ inet_put_port(master_sk); |
8950 |
++ kmem_cache_free(mptcp_cb_cache, mpcb); |
8951 |
++ sk_free(master_sk); |
8952 |
++ return -ENOMEM; |
8953 |
++ } |
8954 |
++ |
8955 |
++ /* Redefine function-pointers as the meta-sk is now fully ready */ |
8956 |
++ static_key_slow_inc(&mptcp_static_key); |
8957 |
++ meta_tp->mpc = 1; |
8958 |
++ meta_tp->ops = &mptcp_meta_specific; |
8959 |
++ |
8960 |
++ meta_sk->sk_backlog_rcv = mptcp_backlog_rcv; |
8961 |
++ meta_sk->sk_destruct = mptcp_sock_destruct; |
8962 |
++ |
8963 |
++ /* Meta-level retransmit timer */ |
8964 |
++ meta_icsk->icsk_rto *= 2; /* Double of initial - rto */ |
8965 |
++ |
8966 |
++ tcp_init_xmit_timers(master_sk); |
8967 |
++ /* Has been set for sending out the SYN */ |
8968 |
++ inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS); |
8969 |
++ |
8970 |
++ if (!meta_tp->inside_tk_table) { |
8971 |
++ /* Adding the meta_tp in the token hashtable - coming from server-side */ |
8972 |
++ rcu_read_lock(); |
8973 |
++ spin_lock(&mptcp_tk_hashlock); |
8974 |
++ |
8975 |
++ __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token); |
8976 |
++ |
8977 |
++ spin_unlock(&mptcp_tk_hashlock); |
8978 |
++ rcu_read_unlock(); |
8979 |
++ } |
8980 |
++ master_tp->inside_tk_table = 0; |
8981 |
++ |
8982 |
++ /* Init time-wait stuff */ |
8983 |
++ INIT_LIST_HEAD(&mpcb->tw_list); |
8984 |
++ spin_lock_init(&mpcb->tw_lock); |
8985 |
++ |
8986 |
++ INIT_HLIST_HEAD(&mpcb->callback_list); |
8987 |
++ |
8988 |
++ mptcp_mpcb_inherit_sockopts(meta_sk, master_sk); |
8989 |
++ |
8990 |
++ mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf; |
8991 |
++ mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf; |
8992 |
++ mpcb->orig_window_clamp = meta_tp->window_clamp; |
8993 |
++ |
8994 |
++ /* The meta is directly linked - set refcnt to 1 */ |
8995 |
++ atomic_set(&mpcb->mpcb_refcnt, 1); |
8996 |
++ |
8997 |
++ mptcp_init_path_manager(mpcb); |
8998 |
++ mptcp_init_scheduler(mpcb); |
8999 |
++ |
9000 |
++ setup_timer(&mpcb->synack_timer, mptcp_synack_timer_handler, |
9001 |
++ (unsigned long)meta_sk); |
9002 |
++ |
9003 |
++ mptcp_debug("%s: created mpcb with token %#x\n", |
9004 |
++ __func__, mpcb->mptcp_loc_token); |
9005 |
++ |
9006 |
++ return 0; |
9007 |
++} |
9008 |
++ |
9009 |
++void mptcp_fallback_meta_sk(struct sock *meta_sk) |
9010 |
++{ |
9011 |
++ kfree(inet_csk(meta_sk)->icsk_accept_queue.listen_opt); |
9012 |
++ kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb); |
9013 |
++} |
9014 |
++ |
9015 |
++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id, |
9016 |
++ gfp_t flags) |
9017 |
++{ |
9018 |
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
9019 |
++ struct tcp_sock *tp = tcp_sk(sk); |
9020 |
++ |
9021 |
++ tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags); |
9022 |
++ if (!tp->mptcp) |
9023 |
++ return -ENOMEM; |
9024 |
++ |
9025 |
++ tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb); |
9026 |
++ /* No more space for more subflows? */ |
9027 |
++ if (!tp->mptcp->path_index) { |
9028 |
++ kmem_cache_free(mptcp_sock_cache, tp->mptcp); |
9029 |
++ return -EPERM; |
9030 |
++ } |
9031 |
++ |
9032 |
++ INIT_HLIST_NODE(&tp->mptcp->cb_list); |
9033 |
++ |
9034 |
++ tp->mptcp->tp = tp; |
9035 |
++ tp->mpcb = mpcb; |
9036 |
++ tp->meta_sk = meta_sk; |
9037 |
++ |
9038 |
++ static_key_slow_inc(&mptcp_static_key); |
9039 |
++ tp->mpc = 1; |
9040 |
++ tp->ops = &mptcp_sub_specific; |
9041 |
++ |
9042 |
++ tp->mptcp->loc_id = loc_id; |
9043 |
++ tp->mptcp->rem_id = rem_id; |
9044 |
++ if (mpcb->sched_ops->init) |
9045 |
++ mpcb->sched_ops->init(sk); |
9046 |
++ |
9047 |
++ /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be |
9048 |
++ * included in mptcp_del_sock(), because the mpcb must remain alive |
9049 |
++ * until the last subsocket is completely destroyed. |
9050 |
++ */ |
9051 |
++ sock_hold(meta_sk); |
9052 |
++ atomic_inc(&mpcb->mpcb_refcnt); |
9053 |
++ |
9054 |
++ tp->mptcp->next = mpcb->connection_list; |
9055 |
++ mpcb->connection_list = tp; |
9056 |
++ tp->mptcp->attached = 1; |
9057 |
++ |
9058 |
++ mpcb->cnt_subflows++; |
9059 |
++ atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc), |
9060 |
++ &meta_sk->sk_rmem_alloc); |
9061 |
++ |
9062 |
++ mptcp_sub_inherit_sockopts(meta_sk, sk); |
9063 |
++ INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq); |
9064 |
++ |
9065 |
++ /* As we successfully allocated the mptcp_tcp_sock, we have to |
9066 |
++ * change the function-pointers here (for sk_destruct to work correctly) |
9067 |
++ */ |
9068 |
++ sk->sk_error_report = mptcp_sock_def_error_report; |
9069 |
++ sk->sk_data_ready = mptcp_data_ready; |
9070 |
++ sk->sk_write_space = mptcp_write_space; |
9071 |
++ sk->sk_state_change = mptcp_set_state; |
9072 |
++ sk->sk_destruct = mptcp_sock_destruct; |
9073 |
++ |
9074 |
++ if (sk->sk_family == AF_INET) |
9075 |
++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n", |
9076 |
++ __func__ , mpcb->mptcp_loc_token, |
9077 |
++ tp->mptcp->path_index, |
9078 |
++ &((struct inet_sock *)tp)->inet_saddr, |
9079 |
++ ntohs(((struct inet_sock *)tp)->inet_sport), |
9080 |
++ &((struct inet_sock *)tp)->inet_daddr, |
9081 |
++ ntohs(((struct inet_sock *)tp)->inet_dport), |
9082 |
++ mpcb->cnt_subflows); |
9083 |
++#if IS_ENABLED(CONFIG_IPV6) |
9084 |
++ else |
9085 |
++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n", |
9086 |
++ __func__ , mpcb->mptcp_loc_token, |
9087 |
++ tp->mptcp->path_index, &inet6_sk(sk)->saddr, |
9088 |
++ ntohs(((struct inet_sock *)tp)->inet_sport), |
9089 |
++ &sk->sk_v6_daddr, |
9090 |
++ ntohs(((struct inet_sock *)tp)->inet_dport), |
9091 |
++ mpcb->cnt_subflows); |
9092 |
++#endif |
9093 |
++ |
9094 |
++ return 0; |
9095 |
++} |
9096 |
++ |
9097 |
++void mptcp_del_sock(struct sock *sk) |
9098 |
++{ |
9099 |
++ struct tcp_sock *tp = tcp_sk(sk), *tp_prev; |
9100 |
++ struct mptcp_cb *mpcb; |
9101 |
++ |
9102 |
++ if (!tp->mptcp || !tp->mptcp->attached) |
9103 |
++ return; |
9104 |
++ |
9105 |
++ mpcb = tp->mpcb; |
9106 |
++ tp_prev = mpcb->connection_list; |
9107 |
++ |
9108 |
++ mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n", |
9109 |
++ __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index, |
9110 |
++ sk->sk_state, is_meta_sk(sk)); |
9111 |
++ |
9112 |
++ if (tp_prev == tp) { |
9113 |
++ mpcb->connection_list = tp->mptcp->next; |
9114 |
++ } else { |
9115 |
++ for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) { |
9116 |
++ if (tp_prev->mptcp->next == tp) { |
9117 |
++ tp_prev->mptcp->next = tp->mptcp->next; |
9118 |
++ break; |
9119 |
++ } |
9120 |
++ } |
9121 |
++ } |
9122 |
++ mpcb->cnt_subflows--; |
9123 |
++ if (tp->mptcp->establish_increased) |
9124 |
++ mpcb->cnt_established--; |
9125 |
++ |
9126 |
++ tp->mptcp->next = NULL; |
9127 |
++ tp->mptcp->attached = 0; |
9128 |
++ mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index); |
9129 |
++ |
9130 |
++ if (!skb_queue_empty(&sk->sk_write_queue)) |
9131 |
++ mptcp_reinject_data(sk, 0); |
9132 |
++ |
9133 |
++ if (is_master_tp(tp)) |
9134 |
++ mpcb->master_sk = NULL; |
9135 |
++ else if (tp->mptcp->pre_established) |
9136 |
++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); |
9137 |
++ |
9138 |
++ rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL); |
9139 |
++} |
9140 |
++ |
9141 |
++/* Updates the metasocket ULID/port data, based on the given sock. |
9142 |
++ * The argument sock must be the sock accessible to the application. |
9143 |
++ * In this function, we update the meta socket info, based on the changes |
9144 |
++ * in the application socket (bind, address allocation, ...) |
9145 |
++ */ |
9146 |
++void mptcp_update_metasocket(struct sock *sk, const struct sock *meta_sk) |
9147 |
++{ |
9148 |
++ if (tcp_sk(sk)->mpcb->pm_ops->new_session) |
9149 |
++ tcp_sk(sk)->mpcb->pm_ops->new_session(meta_sk); |
9150 |
++ |
9151 |
++ tcp_sk(sk)->mptcp->send_mp_prio = tcp_sk(sk)->mptcp->low_prio; |
9152 |
++} |
9153 |
++ |
9154 |
++/* Clean up the receive buffer for full frames taken by the user, |
9155 |
++ * then send an ACK if necessary. COPIED is the number of bytes |
9156 |
++ * tcp_recvmsg has given to the user so far, it speeds up the |
9157 |
++ * calculation of whether or not we must ACK for the sake of |
9158 |
++ * a window update. |
9159 |
++ */ |
9160 |
++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied) |
9161 |
++{ |
9162 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
9163 |
++ struct sock *sk; |
9164 |
++ __u32 rcv_window_now = 0; |
9165 |
++ |
9166 |
++ if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) { |
9167 |
++ rcv_window_now = tcp_receive_window(meta_tp); |
9168 |
++ |
9169 |
++ if (2 * rcv_window_now > meta_tp->window_clamp) |
9170 |
++ rcv_window_now = 0; |
9171 |
++ } |
9172 |
++ |
9173 |
++ mptcp_for_each_sk(meta_tp->mpcb, sk) { |
9174 |
++ struct tcp_sock *tp = tcp_sk(sk); |
9175 |
++ const struct inet_connection_sock *icsk = inet_csk(sk); |
9176 |
++ |
9177 |
++ if (!mptcp_sk_can_send_ack(sk)) |
9178 |
++ continue; |
9179 |
++ |
9180 |
++ if (!inet_csk_ack_scheduled(sk)) |
9181 |
++ goto second_part; |
9182 |
++ /* Delayed ACKs frequently hit locked sockets during bulk |
9183 |
++ * receive. |
9184 |
++ */ |
9185 |
++ if (icsk->icsk_ack.blocked || |
9186 |
++ /* Once-per-two-segments ACK was not sent by tcp_input.c */ |
9187 |
++ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || |
9188 |
++ /* If this read emptied read buffer, we send ACK, if |
9189 |
++ * connection is not bidirectional, user drained |
9190 |
++ * receive buffer and there was a small segment |
9191 |
++ * in queue. |
9192 |
++ */ |
9193 |
++ (copied > 0 && |
9194 |
++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || |
9195 |
++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && |
9196 |
++ !icsk->icsk_ack.pingpong)) && |
9197 |
++ !atomic_read(&meta_sk->sk_rmem_alloc))) { |
9198 |
++ tcp_send_ack(sk); |
9199 |
++ continue; |
9200 |
++ } |
9201 |
++ |
9202 |
++second_part: |
9203 |
++ /* This here is the second part of tcp_cleanup_rbuf */ |
9204 |
++ if (rcv_window_now) { |
9205 |
++ __u32 new_window = tp->ops->__select_window(sk); |
9206 |
++ |
9207 |
++ /* Send ACK now, if this read freed lots of space |
9208 |
++ * in our buffer. Certainly, new_window is new window. |
9209 |
++ * We can advertise it now, if it is not less than |
9210 |
++ * current one. |
9211 |
++ * "Lots" means "at least twice" here. |
9212 |
++ */ |
9213 |
++ if (new_window && new_window >= 2 * rcv_window_now) |
9214 |
++ tcp_send_ack(sk); |
9215 |
++ } |
9216 |
++ } |
9217 |
++} |
9218 |
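A worked example of the window-update heuristic in the second part (hypothetical |
numbers): if tcp_receive_window() returned 32 KiB on entry and, after the reader |
drained the queue, __select_window() now offers 80 KiB on a subflow, then |
80 KiB >= 2 * 32 KiB and a pure ACK advertising the larger window is sent there; |
at only 48 KiB it would not be. |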
++ |
9219 |
++static int mptcp_sub_send_fin(struct sock *sk) |
9220 |
++{ |
9221 |
++ struct tcp_sock *tp = tcp_sk(sk); |
9222 |
++ struct sk_buff *skb = tcp_write_queue_tail(sk); |
9223 |
++ int mss_now; |
9224 |
++ |
9225 |
++ /* Optimization, tack on the FIN if we have a queue of |
9226 |
++ * unsent frames. But be careful about outgoing SACKS |
9227 |
++ * and IP options. |
9228 |
++ */ |
9229 |
++ mss_now = tcp_current_mss(sk); |
9230 |
++ |
9231 |
++ if (tcp_send_head(sk) != NULL) { |
9232 |
++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; |
9233 |
++ TCP_SKB_CB(skb)->end_seq++; |
9234 |
++ tp->write_seq++; |
9235 |
++ } else { |
9236 |
++ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC); |
9237 |
++ if (!skb) |
9238 |
++ return 1; |
9239 |
++ |
9240 |
++ /* Reserve space for headers and prepare control bits. */ |
9241 |
++ skb_reserve(skb, MAX_TCP_HEADER); |
9242 |
++ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ |
9243 |
++ tcp_init_nondata_skb(skb, tp->write_seq, |
9244 |
++ TCPHDR_ACK | TCPHDR_FIN); |
9245 |
++ tcp_queue_skb(sk, skb); |
9246 |
++ } |
9247 |
++ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); |
9248 |
++ |
9249 |
++ return 0; |
9250 |
++} |
9251 |
++ |
9252 |
++void mptcp_sub_close_wq(struct work_struct *work) |
9253 |
++{ |
9254 |
++ struct tcp_sock *tp = container_of(work, struct mptcp_tcp_sock, work.work)->tp; |
9255 |
++ struct sock *sk = (struct sock *)tp; |
9256 |
++ struct sock *meta_sk = mptcp_meta_sk(sk); |
9257 |
++ |
9258 |
++ mutex_lock(&tp->mpcb->mpcb_mutex); |
9259 |
++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); |
9260 |
++ |
9261 |
++ if (sock_flag(sk, SOCK_DEAD)) |
9262 |
++ goto exit; |
9263 |
++ |
9264 |
++ /* We come from tcp_disconnect. We are sure that meta_sk is set */ |
9265 |
++ if (!mptcp(tp)) { |
9266 |
++ tp->closing = 1; |
9267 |
++ sock_rps_reset_flow(sk); |
9268 |
++ tcp_close(sk, 0); |
9269 |
++ goto exit; |
9270 |
++ } |
9271 |
++ |
9272 |
++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) { |
9273 |
++ tp->closing = 1; |
9274 |
++ sock_rps_reset_flow(sk); |
9275 |
++ tcp_close(sk, 0); |
9276 |
++ } else if (tcp_close_state(sk)) { |
9277 |
++ sk->sk_shutdown |= SEND_SHUTDOWN; |
9278 |
++ tcp_send_fin(sk); |
9279 |
++ } |
9280 |
++ |
9281 |
++exit: |
9282 |
++ release_sock(meta_sk); |
9283 |
++ mutex_unlock(&tp->mpcb->mpcb_mutex); |
9284 |
++ sock_put(sk); |
9285 |
++} |
9286 |
++ |
9287 |
++void mptcp_sub_close(struct sock *sk, unsigned long delay) |
9288 |
++{ |
9289 |
++ struct tcp_sock *tp = tcp_sk(sk); |
9290 |
++ struct delayed_work *work = &tcp_sk(sk)->mptcp->work; |
9291 |
++ |
9292 |
++ /* We are already closing - e.g., call from sock_def_error_report upon |
9293 |
++ * tcp_disconnect in tcp_close. |
9294 |
++ */ |
9295 |
++ if (tp->closing) |
9296 |
++ return; |
9297 |
++ |
9298 |
++ /* Work already scheduled? */ |
9299 |
++ if (work_pending(&work->work)) { |
9300 |
++ /* Work present - who will be first? */ |
9301 |
++ if (jiffies + delay > work->timer.expires) |
9302 |
++ return; |
9303 |
++ |
9304 |
++ /* Try canceling - if it fails, work will be executed soon */ |
9305 |
++ if (!cancel_delayed_work(work)) |
9306 |
++ return; |
9307 |
++ sock_put(sk); |
9308 |
++ } |
9309 |
++ |
9310 |
++ if (!delay) { |
9311 |
++ unsigned char old_state = sk->sk_state; |
9312 |
++ |
9313 |
++ /* If we are in user-context we can directly do the closing |
9314 |
++ * procedure. No need to schedule a work-queue. |
9315 |
++ */ |
9316 |
++ if (!in_softirq()) { |
9317 |
++ if (sock_flag(sk, SOCK_DEAD)) |
9318 |
++ return; |
9319 |
++ |
9320 |
++ if (!mptcp(tp)) { |
9321 |
++ tp->closing = 1; |
9322 |
++ sock_rps_reset_flow(sk); |
9323 |
++ tcp_close(sk, 0); |
9324 |
++ return; |
9325 |
++ } |
9326 |
++ |
9327 |
++ if (mptcp_meta_sk(sk)->sk_shutdown == SHUTDOWN_MASK || |
9328 |
++ sk->sk_state == TCP_CLOSE) { |
9329 |
++ tp->closing = 1; |
9330 |
++ sock_rps_reset_flow(sk); |
9331 |
++ tcp_close(sk, 0); |
9332 |
++ } else if (tcp_close_state(sk)) { |
9333 |
++ sk->sk_shutdown |= SEND_SHUTDOWN; |
9334 |
++ tcp_send_fin(sk); |
9335 |
++ } |
9336 |
++ |
9337 |
++ return; |
9338 |
++ } |
9339 |
++ |
9340 |
++ /* We send the FIN directly, because it may take a long time |
9341 |
++ * until the work-queue gets scheduled... |
9342 |
++ * |
9343 |
++ * If mptcp_sub_send_fin returns 1, it failed and thus we reset |
9344 |
++ * the old state so that tcp_close will finally send the fin |
9345 |
++ * in user-context. |
9346 |
++ */ |
9347 |
++ if (!sk->sk_err && old_state != TCP_CLOSE && |
9348 |
++ tcp_close_state(sk) && mptcp_sub_send_fin(sk)) { |
9349 |
++ if (old_state == TCP_ESTABLISHED) |
9350 |
++ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); |
9351 |
++ sk->sk_state = old_state; |
9352 |
++ } |
9353 |
++ } |
9354 |
++ |
9355 |
++ sock_hold(sk); |
9356 |
++ queue_delayed_work(mptcp_wq, work, delay); |
9357 |
++} |
9358 |
++ |
9359 |
++void mptcp_sub_force_close(struct sock *sk) |
9360 |
++{ |
9361 |
++ /* The below tcp_done may have freed the socket, if it is already dead. |
9362 |
++ * Thus, we are not allowed to access it afterwards. That's why |
9363 |
++ * we have to store the dead-state in this local variable. |
9364 |
++ */ |
9365 |
++ int sock_is_dead = sock_flag(sk, SOCK_DEAD); |
9366 |
++ |
9367 |
++ tcp_sk(sk)->mp_killed = 1; |
9368 |
++ |
9369 |
++ if (sk->sk_state != TCP_CLOSE) |
9370 |
++ tcp_done(sk); |
9371 |
++ |
9372 |
++ if (!sock_is_dead) |
9373 |
++ mptcp_sub_close(sk, 0); |
9374 |
++} |
9375 |
++EXPORT_SYMBOL(mptcp_sub_force_close); |
9376 |
++ |
9377 |
++/* Update the mpcb send buffer, based on the contributions |
9378 |
++ * of each subflow |
9379 |
++ */ |
9380 |
++void mptcp_update_sndbuf(const struct tcp_sock *tp) |
9381 |
++{ |
9382 |
++ struct sock *meta_sk = tp->meta_sk, *sk; |
9383 |
++ int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf; |
9384 |
++ |
9385 |
++ mptcp_for_each_sk(tp->mpcb, sk) { |
9386 |
++ if (!mptcp_sk_can_send(sk)) |
9387 |
++ continue; |
9388 |
++ |
9389 |
++ new_sndbuf += sk->sk_sndbuf; |
9390 |
++ |
9391 |
++ if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) { |
9392 |
++ new_sndbuf = sysctl_tcp_wmem[2]; |
9393 |
++ break; |
9394 |
++ } |
9395 |
++ } |
9396 |
++ meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf); |
9397 |
++ |
9398 |
++ /* The subflow's call to sk_write_space in tcp_new_space ends up in |
9399 |
++ * mptcp_write_space. |
9400 |
++ * It has nothing to do with waking up the application. |
9401 |
++ * So, we do it here. |
9402 |
++ */ |
9403 |
++ if (old_sndbuf != meta_sk->sk_sndbuf) |
9404 |
++ meta_sk->sk_write_space(meta_sk); |
9405 |
++} |
9406 |
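The aggregation above is a clamped sum. With hypothetical numbers: two sendable |
subflows with sk_sndbuf of 131072 and 262144 bytes and sysctl_tcp_wmem[2] = |
4194304 give new_sndbuf = 393216, so the meta socket gets |
max(min(393216, 4194304), old sndbuf) = 393216 - the meta send buffer can grow |
here, but never shrink. |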
++ |
9407 |
++void mptcp_close(struct sock *meta_sk, long timeout) |
9408 |
++{ |
9409 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
9410 |
++ struct sock *sk_it, *tmpsk; |
9411 |
++ struct mptcp_cb *mpcb = meta_tp->mpcb; |
9412 |
++ struct sk_buff *skb; |
9413 |
++ int data_was_unread = 0; |
9414 |
++ int state; |
9415 |
++ |
9416 |
++ mptcp_debug("%s: Close of meta_sk with tok %#x\n", |
9417 |
++ __func__, mpcb->mptcp_loc_token); |
9418 |
++ |
9419 |
++ mutex_lock(&mpcb->mpcb_mutex); |
9420 |
++ lock_sock(meta_sk); |
9421 |
++ |
9422 |
++ if (meta_tp->inside_tk_table) { |
9423 |
++ /* Detach the mpcb from the token hashtable */ |
9424 |
++ mptcp_hash_remove_bh(meta_tp); |
9425 |
++ reqsk_queue_destroy(&inet_csk(meta_sk)->icsk_accept_queue); |
9426 |
++ } |
9427 |
++ |
9428 |
++ meta_sk->sk_shutdown = SHUTDOWN_MASK; |
9429 |
++ /* We need to flush the recv. buffs. We do this only on the |
9430 |
++ * descriptor close, not protocol-sourced closes, because the |
9431 |
++ * reader process may not have drained the data yet! |
9432 |
++ */ |
9433 |
++ while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) { |
9434 |
++ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - |
9435 |
++ tcp_hdr(skb)->fin; |
9436 |
++ data_was_unread += len; |
9437 |
++ __kfree_skb(skb); |
9438 |
++ } |
9439 |
++ |
9440 |
++ sk_mem_reclaim(meta_sk); |
9441 |
++ |
9442 |
++ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ |
9443 |
++ if (meta_sk->sk_state == TCP_CLOSE) { |
9444 |
++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { |
9445 |
++ if (tcp_sk(sk_it)->send_mp_fclose) |
9446 |
++ continue; |
9447 |
++ mptcp_sub_close(sk_it, 0); |
9448 |
++ } |
9449 |
++ goto adjudge_to_death; |
9450 |
++ } |
9451 |
++ |
9452 |
++ if (data_was_unread) { |
9453 |
++ /* Unread data was tossed, zap the connection. */ |
9454 |
++ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE); |
9455 |
++ tcp_set_state(meta_sk, TCP_CLOSE); |
9456 |
++ tcp_sk(meta_sk)->ops->send_active_reset(meta_sk, |
9457 |
++ meta_sk->sk_allocation); |
9458 |
++ } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) { |
9459 |
++ /* Check zero linger _after_ checking for unread data. */ |
9460 |
++ meta_sk->sk_prot->disconnect(meta_sk, 0); |
9461 |
++ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); |
9462 |
++ } else if (tcp_close_state(meta_sk)) { |
9463 |
++ mptcp_send_fin(meta_sk); |
9464 |
++ } else if (meta_tp->snd_una == meta_tp->write_seq) { |
9465 |
++ /* The DATA_FIN has been sent and acknowledged |
9466 |
++ * (e.g., by sk_shutdown). Close all the other subflows |
9467 |
++ */ |
9468 |
++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { |
9469 |
++ unsigned long delay = 0; |
9470 |
++ /* If we are the passive closer, don't trigger |
9471 |
++ * the subflow-FIN until the peer has sent its FIN |
9472 |
++ * on the subflow - thus we add a delay |
9473 |
++ */ |
9474 |
++ if (mpcb->passive_close && |
9475 |
++ sk_it->sk_state == TCP_ESTABLISHED) |
9476 |
++ delay = inet_csk(sk_it)->icsk_rto << 3; |
9477 |
++ |
9478 |
++ mptcp_sub_close(sk_it, delay); |
9479 |
++ } |
9480 |
++ } |
9481 |
++ |
9482 |
++ sk_stream_wait_close(meta_sk, timeout); |
9483 |
++ |
9484 |
++adjudge_to_death: |
9485 |
++ state = meta_sk->sk_state; |
9486 |
++ sock_hold(meta_sk); |
9487 |
++ sock_orphan(meta_sk); |
9488 |
++ |
9489 |
++ /* socket will be freed after mptcp_close - we have to prevent |
9490 |
++ * access from the subflows. |
9491 |
++ */ |
9492 |
++ mptcp_for_each_sk(mpcb, sk_it) { |
9493 |
++ /* Similar to sock_orphan, but we don't set it DEAD, because |
9494 |
++ * the callbacks are still set and must be called. |
9495 |
++ */ |
9496 |
++ write_lock_bh(&sk_it->sk_callback_lock); |
9497 |
++ sk_set_socket(sk_it, NULL); |
9498 |
++ sk_it->sk_wq = NULL; |
9499 |
++ write_unlock_bh(&sk_it->sk_callback_lock); |
9500 |
++ } |
9501 |
++ |
9502 |
++ /* It is the last release_sock in its life. It will remove backlog. */ |
9503 |
++ release_sock(meta_sk); |
9504 |
++ |
9505 |
++ /* Now socket is owned by kernel and we acquire BH lock |
9506 |
++ * to finish close. No need to check for user refs. |
9507 |
++ */ |
9508 |
++ local_bh_disable(); |
9509 |
++ bh_lock_sock(meta_sk); |
9510 |
++ WARN_ON(sock_owned_by_user(meta_sk)); |
9511 |
++ |
9512 |
++ percpu_counter_inc(meta_sk->sk_prot->orphan_count); |
9513 |
++ |
9514 |
++ /* Have we already been destroyed by a softirq or backlog? */ |
9515 |
++ if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE) |
9516 |
++ goto out; |
9517 |
++ |
9518 |
++ /* This is a (useful) BSD violation of the RFC. There is a |
9519 |
++ * problem with TCP as specified in that the other end could |
9520 |
++ * keep a socket open forever with no application left this end. |
9521 |
++ * We use a 3 minute timeout (about the same as BSD) then kill |
9522 |
++ * our end. If they send after that then tough - BUT: long enough |
9523 |
++ * that we won't make the old 4*rto = almost no time - whoops |
9524 |
++ * reset mistake. |
9525 |
++ * |
9526 |
++ * Nope, it was not a mistake. It is really desired behaviour |
9527 |
++ * f.e. on http servers, when such sockets are useless, but |
9528 |
++ * consume significant resources. Let's do it with special |
9529 |
++ * linger2 option. --ANK |
9530 |
++ */ |
9531 |
++ |
9532 |
++ if (meta_sk->sk_state == TCP_FIN_WAIT2) { |
9533 |
++ if (meta_tp->linger2 < 0) { |
9534 |
++ tcp_set_state(meta_sk, TCP_CLOSE); |
9535 |
++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC); |
9536 |
++ NET_INC_STATS_BH(sock_net(meta_sk), |
9537 |
++ LINUX_MIB_TCPABORTONLINGER); |
9538 |
++ } else { |
9539 |
++ const int tmo = tcp_fin_time(meta_sk); |
9540 |
++ |
9541 |
++ if (tmo > TCP_TIMEWAIT_LEN) { |
9542 |
++ inet_csk_reset_keepalive_timer(meta_sk, |
9543 |
++ tmo - TCP_TIMEWAIT_LEN); |
9544 |
++ } else { |
9545 |
++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, |
9546 |
++ tmo); |
9547 |
++ goto out; |
9548 |
++ } |
9549 |
++ } |
9550 |
++ } |
9551 |
++ if (meta_sk->sk_state != TCP_CLOSE) { |
9552 |
++ sk_mem_reclaim(meta_sk); |
9553 |
++ if (tcp_too_many_orphans(meta_sk, 0)) { |
9554 |
++ if (net_ratelimit()) |
9555 |
++ pr_info("MPTCP: too many of orphaned sockets\n"); |
9556 |
++ tcp_set_state(meta_sk, TCP_CLOSE); |
9557 |
++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC); |
9558 |
++ NET_INC_STATS_BH(sock_net(meta_sk), |
9559 |
++ LINUX_MIB_TCPABORTONMEMORY); |
9560 |
++ } |
9561 |
++ } |
9562 |
++ |
9563 |
++ |
9564 |
++ if (meta_sk->sk_state == TCP_CLOSE) |
9565 |
++ inet_csk_destroy_sock(meta_sk); |
9566 |
++ /* Otherwise, socket is reprieved until protocol close. */ |
9567 |
++ |
9568 |
++out: |
9569 |
++ bh_unlock_sock(meta_sk); |
9570 |
++ local_bh_enable(); |
9571 |
++ mutex_unlock(&mpcb->mpcb_mutex); |
9572 |
++ sock_put(meta_sk); /* Taken by sock_hold */ |
9573 |
++} |
9574 |
++ |
9575 |
++void mptcp_disconnect(struct sock *sk) |
9576 |
++{ |
9577 |
++ struct sock *subsk, *tmpsk; |
9578 |
++ struct tcp_sock *tp = tcp_sk(sk); |
9579 |
++ |
9580 |
++ mptcp_delete_synack_timer(sk); |
9581 |
++ |
9582 |
++ __skb_queue_purge(&tp->mpcb->reinject_queue); |
9583 |
++ |
9584 |
++ if (tp->inside_tk_table) { |
9585 |
++ mptcp_hash_remove_bh(tp); |
9586 |
++ reqsk_queue_destroy(&inet_csk(tp->meta_sk)->icsk_accept_queue); |
9587 |
++ } |
9588 |
++ |
9589 |
++ local_bh_disable(); |
9590 |
++ mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) { |
9591 |
++ /* The socket will get removed from the subsocket-list |
9592 |
++ * and made non-mptcp by setting mpc to 0. |
9593 |
++ * |
9594 |
++ * This is necessary, because tcp_disconnect assumes |
9595 |
++ * that the connection is completely dead afterwards. |
9596 |
++ * Thus we need to do a mptcp_del_sock. Due to this call |
9597 |
++ * we have to make it non-mptcp. |
9598 |
++ * |
9599 |
++ * We have to lock the socket, because we set mpc to 0. |
9600 |
++ * An incoming packet would take the subsocket's lock |
9601 |
++ * and go on into the receive-path. |
9602 |
++ * This would be a race. |
9603 |
++ */ |
9604 |
++ |
9605 |
++ bh_lock_sock(subsk); |
9606 |
++ mptcp_del_sock(subsk); |
9607 |
++ tcp_sk(subsk)->mpc = 0; |
9608 |
++ tcp_sk(subsk)->ops = &tcp_specific; |
9609 |
++ mptcp_sub_force_close(subsk); |
9610 |
++ bh_unlock_sock(subsk); |
9611 |
++ } |
9612 |
++ local_bh_enable(); |
9613 |
++ |
9614 |
++ tp->was_meta_sk = 1; |
9615 |
++ tp->mpc = 0; |
9616 |
++ tp->ops = &tcp_specific; |
9617 |
++} |
9618 |
++ |
9619 |
++ |
9620 |
++/* Returns 1 if we should enable MPTCP for that socket. */ |
9621 |
++int mptcp_doit(struct sock *sk) |
9622 |
++{ |
9623 |
++ /* Do not allow MPTCP enabling if the MPTCP initialization failed */ |
9624 |
++ if (mptcp_init_failed) |
9625 |
++ return 0; |
9626 |
++ |
9627 |
++ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled) |
9628 |
++ return 0; |
9629 |
++ |
9630 |
++ /* Socket may already be established (e.g., called from tcp_recvmsg) */ |
9631 |
++ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->request_mptcp) |
9632 |
++ return 1; |
9633 |
++ |
9634 |
++ /* Don't do mptcp over loopback */ |
9635 |
++ if (sk->sk_family == AF_INET && |
9636 |
++ (ipv4_is_loopback(inet_sk(sk)->inet_daddr) || |
9637 |
++ ipv4_is_loopback(inet_sk(sk)->inet_saddr))) |
9638 |
++ return 0; |
9639 |
++#if IS_ENABLED(CONFIG_IPV6) |
9640 |
++ if (sk->sk_family == AF_INET6 && |
9641 |
++ (ipv6_addr_loopback(&sk->sk_v6_daddr) || |
9642 |
++ ipv6_addr_loopback(&inet6_sk(sk)->saddr))) |
9643 |
++ return 0; |
9644 |
++#endif |
9645 |
++ if (mptcp_v6_is_v4_mapped(sk) && |
9646 |
++ ipv4_is_loopback(inet_sk(sk)->inet_saddr)) |
9647 |
++ return 0; |
9648 |
++ |
9649 |
++#ifdef CONFIG_TCP_MD5SIG |
9650 |
++ /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */ |
9651 |
++ if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk)) |
9652 |
++ return 0; |
9653 |
++#endif |
9654 |
++ |
9655 |
++ return 1; |
9656 |
++} |
9657 |
++ |
9658 |
++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window) |
9659 |
++{ |
9660 |
++ struct tcp_sock *master_tp; |
9661 |
++ struct sock *master_sk; |
9662 |
++ |
9663 |
++ if (mptcp_alloc_mpcb(meta_sk, remote_key, window)) |
9664 |
++ goto err_alloc_mpcb; |
9665 |
++ |
9666 |
++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk; |
9667 |
++ master_tp = tcp_sk(master_sk); |
9668 |
++ |
9669 |
++ if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC)) |
9670 |
++ goto err_add_sock; |
9671 |
++ |
9672 |
++ if (__inet_inherit_port(meta_sk, master_sk) < 0) |
9673 |
++ goto err_add_sock; |
9674 |
++ |
9675 |
++ meta_sk->sk_prot->unhash(meta_sk); |
9676 |
++ |
9677 |
++ if (master_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(master_sk)) |
9678 |
++ __inet_hash_nolisten(master_sk, NULL); |
9679 |
++#if IS_ENABLED(CONFIG_IPV6) |
9680 |
++ else |
9681 |
++ __inet6_hash(master_sk, NULL); |
9682 |
++#endif |
9683 |
++ |
9684 |
++ master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd; |
9685 |
++ |
9686 |
++ return 0; |
9687 |
++ |
9688 |
++err_add_sock: |
9689 |
++ mptcp_fallback_meta_sk(meta_sk); |
9690 |
++ |
9691 |
++ inet_csk_prepare_forced_close(master_sk); |
9692 |
++ tcp_done(master_sk); |
9693 |
++ inet_csk_prepare_forced_close(meta_sk); |
9694 |
++ tcp_done(meta_sk); |
9695 |
++ |
9696 |
++err_alloc_mpcb: |
9697 |
++ return -ENOBUFS; |
9698 |
++} |
9699 |
++ |
9700 |
++static int __mptcp_check_req_master(struct sock *child, |
9701 |
++ struct request_sock *req) |
9702 |
++{ |
9703 |
++ struct tcp_sock *child_tp = tcp_sk(child); |
9704 |
++ struct sock *meta_sk = child; |
9705 |
++ struct mptcp_cb *mpcb; |
9706 |
++ struct mptcp_request_sock *mtreq; |
9707 |
++ |
9708 |
++ /* Never contained an MP_CAPABLE */ |
9709 |
++ if (!inet_rsk(req)->mptcp_rqsk) |
9710 |
++ return 1; |
9711 |
++ |
9712 |
++ if (!inet_rsk(req)->saw_mpc) { |
9713 |
++ /* Fallback to regular TCP, because we saw one SYN without |
9714 |
++ * MP_CAPABLE. In tcp_check_req we continue the regular path. |
9715 |
++ * But, the socket has been added to the reqsk_tk_htb, so we |
9716 |
++ * must still remove it. |
9717 |
++ */ |
9718 |
++ mptcp_reqsk_remove_tk(req); |
9719 |
++ return 1; |
9720 |
++ } |
9721 |
++ |
9722 |
++ /* Just set these values to pass them to mptcp_alloc_mpcb */ |
9723 |
++ mtreq = mptcp_rsk(req); |
9724 |
++ child_tp->mptcp_loc_key = mtreq->mptcp_loc_key; |
9725 |
++ child_tp->mptcp_loc_token = mtreq->mptcp_loc_token; |
9726 |
++ |
9727 |
++ if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key, |
9728 |
++ child_tp->snd_wnd)) |
9729 |
++ return -ENOBUFS; |
9730 |
++ |
9731 |
++ child = tcp_sk(child)->mpcb->master_sk; |
9732 |
++ child_tp = tcp_sk(child); |
9733 |
++ mpcb = child_tp->mpcb; |
9734 |
++ |
9735 |
++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn; |
9736 |
++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn; |
9737 |
++ |
9738 |
++ mpcb->dss_csum = mtreq->dss_csum; |
9739 |
++ mpcb->server_side = 1; |
9740 |
++ |
9741 |
++ /* Will be moved to ESTABLISHED by tcp_rcv_state_process() */ |
9742 |
++ mptcp_update_metasocket(child, meta_sk); |
9743 |
++ |
9744 |
++ /* Needs to be done here additionally, because when accepting a |
9745 |
++ * new connection we pass by __reqsk_free and not reqsk_free. |
9746 |
++ */ |
9747 |
++ mptcp_reqsk_remove_tk(req); |
9748 |
++ |
9749 |
++ /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */ |
9750 |
++ sock_put(meta_sk); |
9751 |
++ |
9752 |
++ return 0; |
9753 |
++} |
9754 |
++ |
9755 |
++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req) |
9756 |
++{ |
9757 |
++ struct sock *meta_sk = child, *master_sk; |
9758 |
++ struct sk_buff *skb; |
9759 |
++ u32 new_mapping; |
9760 |
++ int ret; |
9761 |
++ |
9762 |
++ ret = __mptcp_check_req_master(child, req); |
9763 |
++ if (ret) |
9764 |
++ return ret; |
9765 |
++ |
9766 |
++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk; |
9767 |
++ |
9768 |
++ /* We need to rewind copied_seq as it is set to IDSN + 1 and as we have |
9769 |
++ * pre-MPTCP data in the receive queue. |
9770 |
++ */ |
9771 |
++ tcp_sk(meta_sk)->copied_seq -= tcp_sk(master_sk)->rcv_nxt - |
9772 |
++ tcp_rsk(req)->rcv_isn - 1; |
9773 |
++ |
9774 |
++ /* Map subflow sequence number to data sequence numbers. We need to map |
9775 |
++ * these data to [IDSN - len - 1, IDSN[. |
9776 |
++ */ |
9777 |
++ new_mapping = tcp_sk(meta_sk)->copied_seq - tcp_rsk(req)->rcv_isn - 1; |
9778 |
++ |
9779 |
++ /* There should be only one skb: the SYN + data. */ |
9780 |
++ skb_queue_walk(&meta_sk->sk_receive_queue, skb) { |
9781 |
++ TCP_SKB_CB(skb)->seq += new_mapping; |
9782 |
++ TCP_SKB_CB(skb)->end_seq += new_mapping; |
9783 |
++ } |
9784 |
++ |
9785 |
++ /* With fastopen we change the semantics of the relative subflow |
9786 |
++ * sequence numbers to deal with middleboxes that could add/remove |
9787 |
++ * multiple bytes in the SYN. We chose to start counting at rcv_nxt - 1 |
9788 |
++ * instead of the regular TCP ISN. |
9789 |
++ */ |
9790 |
++ tcp_sk(master_sk)->mptcp->rcv_isn = tcp_sk(master_sk)->rcv_nxt - 1; |
9791 |
++ |
9792 |
++ /* We need to update copied_seq of the master_sk to account for the |
9793 |
++ * already moved data to the meta receive queue. |
9794 |
++ */ |
9795 |
++ tcp_sk(master_sk)->copied_seq = tcp_sk(master_sk)->rcv_nxt; |
9796 |
++ |
9797 |
++ /* Handled by the master_sk */ |
9798 |
++ tcp_sk(meta_sk)->fastopen_rsk = NULL; |
9799 |
++ |
9800 |
++ return 0; |
9801 |
++} |
9802 |
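A worked example of the remapping above (hypothetical numbers): with |
rcv_isn = 1000 and 100 bytes of data in the SYN, rcv_nxt = 1101, so copied_seq |
is rewound by 100 (say from 50000 to 49900) and new_mapping = |
49900 - 1000 - 1 = 48899; the queued skb's subflow range [1001, 1101) then maps |
to [49900, 50000), ending exactly at the meta's original copied_seq. |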
++ |
9803 |
++int mptcp_check_req_master(struct sock *sk, struct sock *child, |
9804 |
++ struct request_sock *req, |
9805 |
++ struct request_sock **prev) |
9806 |
++{ |
9807 |
++ struct sock *meta_sk = child; |
9808 |
++ int ret; |
9809 |
++ |
9810 |
++ ret = __mptcp_check_req_master(child, req); |
9811 |
++ if (ret) |
9812 |
++ return ret; |
9813 |
++ |
9814 |
++ inet_csk_reqsk_queue_unlink(sk, req, prev); |
9815 |
++ inet_csk_reqsk_queue_removed(sk, req); |
9816 |
++ inet_csk_reqsk_queue_add(sk, req, meta_sk); |
9817 |
++ |
9818 |
++ return 0; |
9819 |
++} |
9820 |
++ |
9821 |
++struct sock *mptcp_check_req_child(struct sock *meta_sk, struct sock *child, |
9822 |
++ struct request_sock *req, |
9823 |
++ struct request_sock **prev, |
9824 |
++ const struct mptcp_options_received *mopt) |
9825 |
++{ |
9826 |
++ struct tcp_sock *child_tp = tcp_sk(child); |
9827 |
++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); |
9828 |
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
9829 |
++ u8 hash_mac_check[20]; |
9830 |
++ |
9831 |
++ child_tp->inside_tk_table = 0; |
9832 |
++ |
9833 |
++ if (!mopt->join_ack) |
9834 |
++ goto teardown; |
9835 |
++ |
9836 |
++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, |
9837 |
++ (u8 *)&mpcb->mptcp_loc_key, |
9838 |
++ (u8 *)&mtreq->mptcp_rem_nonce, |
9839 |
++ (u8 *)&mtreq->mptcp_loc_nonce, |
9840 |
++ (u32 *)hash_mac_check); |
9841 |
++ |
9842 |
++ if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20)) |
9843 |
++ goto teardown; |
9844 |
++ |
9845 |
++ /* Point it to the same struct socket and wq as the meta_sk */ |
9846 |
++ sk_set_socket(child, meta_sk->sk_socket); |
9847 |
++ child->sk_wq = meta_sk->sk_wq; |
9848 |
++ |
9849 |
++ if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) { |
9850 |
++ /* Has been inherited, but now child_tp->mptcp is NULL */ |
9851 |
++ child_tp->mpc = 0; |
9852 |
++ child_tp->ops = &tcp_specific; |
9853 |
++ |
9854 |
++ /* TODO when we support acking the third ack for new subflows, |
9855 |
++ * we should silently discard this third ack, by returning NULL. |
9856 |
++ * |
9857 |
++ * Maybe, at the retransmission we will have enough memory to |
9858 |
++ * fully add the socket to the meta-sk. |
9859 |
++ */ |
9860 |
++ goto teardown; |
9861 |
++ } |
9862 |
++ |
9863 |
++ /* The child is a clone of the meta socket; we must now reset |
9864 |
++ * some of the fields |
9865 |
++ */ |
9866 |
++ child_tp->mptcp->rcv_low_prio = mtreq->rcv_low_prio; |
9867 |
++ |
9868 |
++ /* We should allow proper increase of the snd/rcv-buffers. Thus, we |
9869 |
++ * use the original values instead of the bloated up ones from the |
9870 |
++ * clone. |
9871 |
++ */ |
9872 |
++ child->sk_sndbuf = mpcb->orig_sk_sndbuf; |
9873 |
++ child->sk_rcvbuf = mpcb->orig_sk_rcvbuf; |
9874 |
++ |
9875 |
++ child_tp->mptcp->slave_sk = 1; |
9876 |
++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn; |
9877 |
++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn; |
9878 |
++ child_tp->mptcp->init_rcv_wnd = req->rcv_wnd; |
9879 |
++ |
9880 |
++ child_tp->tsq_flags = 0; |
9881 |
++ |
9882 |
++ /* Subflows do not use the accept queue, as they |
9883 |
++ * are attached immediately to the mpcb. |
9884 |
++ */ |
9885 |
++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev); |
9886 |
++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req); |
9887 |
++ reqsk_free(req); |
9888 |
++ return child; |
9889 |
++ |
9890 |
++teardown: |
9891 |
++ /* Drop this request - sock creation failed. */ |
9892 |
++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev); |
9893 |
++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req); |
9894 |
++ reqsk_free(req); |
9895 |
++ inet_csk_prepare_forced_close(child); |
9896 |
++ tcp_done(child); |
9897 |
++ return meta_sk; |
9898 |
++} |
9899 |
++ |
9900 |
++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw) |
9901 |
++{ |
9902 |
++ struct mptcp_tw *mptw; |
9903 |
++ struct tcp_sock *tp = tcp_sk(sk); |
9904 |
++ struct mptcp_cb *mpcb = tp->mpcb; |
9905 |
++ |
9906 |
++ /* A subsocket in tw can only receive data. So, if we are in |
9907 |
++ * infinite-receive, then we should not reply with a data-ack or act |
9908 |
++ * upon general MPTCP-signaling. We prevent this by simply not creating |
9909 |
++ * the mptcp_tw_sock. |
9910 |
++ */ |
9911 |
++ if (mpcb->infinite_mapping_rcv) { |
9912 |
++ tw->mptcp_tw = NULL; |
9913 |
++ return 0; |
9914 |
++ } |
9915 |
++ |
9916 |
++ /* Alloc MPTCP-tw-sock */ |
9917 |
++ mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC); |
9918 |
++ if (!mptw) |
9919 |
++ return -ENOBUFS; |
9920 |
++ |
9921 |
++ atomic_inc(&mpcb->mpcb_refcnt); |
9922 |
++ |
9923 |
++ tw->mptcp_tw = mptw; |
9924 |
++ mptw->loc_key = mpcb->mptcp_loc_key; |
9925 |
++ mptw->meta_tw = mpcb->in_time_wait; |
9926 |
++ if (mptw->meta_tw) { |
9927 |
++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp)); |
9928 |
++ if (mpcb->mptw_state != TCP_TIME_WAIT) |
9929 |
++ mptw->rcv_nxt++; |
9930 |
++ } |
9931 |
++ rcu_assign_pointer(mptw->mpcb, mpcb); |
9932 |
++ |
9933 |
++ spin_lock(&mpcb->tw_lock); |
9934 |
++ list_add_rcu(&mptw->list, &tp->mpcb->tw_list); |
9935 |
++ mptw->in_list = 1; |
9936 |
++ spin_unlock(&mpcb->tw_lock); |
9937 |
++ |
9938 |
++ return 0; |
9939 |
++} |
9940 |
++ |
9941 |
++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) |
9942 |
++{ |
9943 |
++ struct mptcp_cb *mpcb; |
9944 |
++ |
9945 |
++ rcu_read_lock(); |
9946 |
++ mpcb = rcu_dereference(tw->mptcp_tw->mpcb); |
9947 |
++ |
9948 |
++ /* If we are still holding a ref to the mpcb, we have to remove ourself |
9949 |
++ * from the list and drop the ref properly. |
9950 |
++ */ |
9951 |
++ if (mpcb && atomic_inc_not_zero(&mpcb->mpcb_refcnt)) { |
9952 |
++ spin_lock(&mpcb->tw_lock); |
9953 |
++ if (tw->mptcp_tw->in_list) { |
9954 |
++ list_del_rcu(&tw->mptcp_tw->list); |
9955 |
++ tw->mptcp_tw->in_list = 0; |
9956 |
++ } |
9957 |
++ spin_unlock(&mpcb->tw_lock); |
9958 |
++ |
9959 |
++ /* Twice, because we increased it above */ |
9960 |
++ mptcp_mpcb_put(mpcb); |
9961 |
++ mptcp_mpcb_put(mpcb); |
9962 |
++ } |
9963 |
++ |
9964 |
++ rcu_read_unlock(); |
9965 |
++ |
9966 |
++ kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw); |
9967 |
++} |
9968 |
++ |
9969 |
++/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a |
9970 |
++ * data-fin. |
9971 |
++ */ |
9972 |
++void mptcp_time_wait(struct sock *sk, int state, int timeo) |
9973 |
++{ |
9974 |
++ struct tcp_sock *tp = tcp_sk(sk); |
9975 |
++ struct mptcp_tw *mptw; |
9976 |
++ |
9977 |
++ /* Used for sockets that go into tw after the meta |
9978 |
++ * (see mptcp_init_tw_sock()) |
9979 |
++ */ |
9980 |
++ tp->mpcb->in_time_wait = 1; |
9981 |
++ tp->mpcb->mptw_state = state; |
9982 |
++ |
9983 |
++ /* Update the time-wait-sock's information */ |
9984 |
++ rcu_read_lock_bh(); |
9985 |
++ list_for_each_entry_rcu(mptw, &tp->mpcb->tw_list, list) { |
9986 |
++ mptw->meta_tw = 1; |
9987 |
++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(tp); |
9988 |
++ |
9989 |
++ /* We want to ack a DATA_FIN, but are still in FIN_WAIT_2 - |
9990 |
++ * pretend as if the DATA_FIN has already reached us, that way |
9991 |
++ * the checks in tcp_timewait_state_process will pass when the |
9992 |
++ * DATA_FIN comes in. |
9993 |
++ */ |
9994 |
++ if (state != TCP_TIME_WAIT) |
9995 |
++ mptw->rcv_nxt++; |
9996 |
++ } |
9997 |
++ rcu_read_unlock_bh(); |
9998 |
++ |
9999 |
++ tcp_done(sk); |
10000 |
++} |
10001 |
++ |
10002 |
++void mptcp_tsq_flags(struct sock *sk) |
10003 |
++{ |
10004 |
++ struct tcp_sock *tp = tcp_sk(sk); |
10005 |
++ struct sock *meta_sk = mptcp_meta_sk(sk); |
10006 |
++ |
10007 |
++ /* It will be handled as a regular deferred-call */ |
10008 |
++ if (is_meta_sk(sk)) |
10009 |
++ return; |
10010 |
++ |
10011 |
++ if (hlist_unhashed(&tp->mptcp->cb_list)) { |
10012 |
++ hlist_add_head(&tp->mptcp->cb_list, &tp->mpcb->callback_list); |
10013 |
++ /* We need to hold it here, as the sock_hold is not ensured |
10014 |
++ * by release_sock as it is in regular TCP. |
10015 |
++ * |
10016 |
++ * The subsocket may get inet_csk_destroy'd while it is inside |
10017 |
++ * the callback_list. |
10018 |
++ */ |
10019 |
++ sock_hold(sk); |
10020 |
++ } |
10021 |
++ |
10022 |
++ if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &tcp_sk(meta_sk)->tsq_flags)) |
10023 |
++ sock_hold(meta_sk); |
10024 |
++} |
10025 |
++ |
10026 |
++void mptcp_tsq_sub_deferred(struct sock *meta_sk) |
10027 |
++{ |
10028 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
10029 |
++ struct mptcp_tcp_sock *mptcp; |
10030 |
++ struct hlist_node *tmp; |
10031 |
++ |
10032 |
++ BUG_ON(!is_meta_sk(meta_sk) && !meta_tp->was_meta_sk); |
10033 |
++ |
10034 |
++ __sock_put(meta_sk); |
10035 |
++ hlist_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) { |
10036 |
++ struct tcp_sock *tp = mptcp->tp; |
10037 |
++ struct sock *sk = (struct sock *)tp; |
10038 |
++ |
10039 |
++ hlist_del_init(&mptcp->cb_list); |
10040 |
++ sk->sk_prot->release_cb(sk); |
10041 |
++ /* Final sock_put (cfr. mptcp_tsq_flags) */ |
10042 |
++ sock_put(sk); |
10043 |
++ } |
10044 |
++} |
10045 |
++ |
10046 |
++void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req, |
10047 |
++ struct sk_buff *skb) |
10048 |
++{ |
10049 |
++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); |
10050 |
++ struct mptcp_options_received mopt; |
10051 |
++ u8 mptcp_hash_mac[20]; |
10052 |
++ |
10053 |
++ mptcp_init_mp_opt(&mopt); |
10054 |
++ tcp_parse_mptcp_options(skb, &mopt); |
10055 |
++ |
10056 |
++ mtreq = mptcp_rsk(req); |
10057 |
++ mtreq->mptcp_mpcb = mpcb; |
10058 |
++ mtreq->is_sub = 1; |
10059 |
++ inet_rsk(req)->mptcp_rqsk = 1; |
10060 |
++ |
10061 |
++ mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce; |
10062 |
++ |
10063 |
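++ /* MAC over both nonces, keyed with the local and remote keys; only |
++ * the first 64 bits are kept to validate this MP_JOIN |
++ */ |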
++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key, |
10064 |
++ (u8 *)&mpcb->mptcp_rem_key, |
10065 |
++ (u8 *)&mtreq->mptcp_loc_nonce, |
10066 |
++ (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac); |
10067 |
++ mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac; |
10068 |
++ |
10069 |
++ mtreq->rem_id = mopt.rem_id; |
10070 |
++ mtreq->rcv_low_prio = mopt.low_prio; |
10071 |
++ inet_rsk(req)->saw_mpc = 1; |
10072 |
++} |
10073 |
++ |
10074 |
++void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb) |
10075 |
++{ |
10076 |
++ struct mptcp_options_received mopt; |
10077 |
++ struct mptcp_request_sock *mreq = mptcp_rsk(req); |
10078 |
++ |
10079 |
++ mptcp_init_mp_opt(&mopt); |
10080 |
++ tcp_parse_mptcp_options(skb, &mopt); |
10081 |
++ |
10082 |
++ mreq->is_sub = 0; |
10083 |
++ inet_rsk(req)->mptcp_rqsk = 1; |
10084 |
++ mreq->dss_csum = mopt.dss_csum; |
10085 |
++ mreq->hash_entry.pprev = NULL; |
10086 |
++ |
10087 |
++ mptcp_reqsk_new_mptcp(req, &mopt, skb); |
10088 |
++} |
10089 |
++ |
10090 |
++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb) |
10091 |
++{ |
10092 |
++ struct mptcp_options_received mopt; |
10093 |
++ const struct tcp_sock *tp = tcp_sk(sk); |
10094 |
++ __u32 isn = TCP_SKB_CB(skb)->when; |
10095 |
++ bool want_cookie = false; |
10096 |
++ |
10097 |
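++ /* As in regular TCP's conn_request: resort to syncookies when the |
++ * request-queue is full and no ISN has been chosen yet |
++ */ |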
++ if ((sysctl_tcp_syncookies == 2 || |
10098 |
++ inet_csk_reqsk_queue_is_full(sk)) && !isn) { |
10099 |
++ want_cookie = tcp_syn_flood_action(sk, skb, |
10100 |
++ mptcp_request_sock_ops.slab_name); |
10101 |
++ if (!want_cookie) |
10102 |
++ goto drop; |
10103 |
++ } |
10104 |
++ |
10105 |
++ mptcp_init_mp_opt(&mopt); |
10106 |
++ tcp_parse_mptcp_options(skb, &mopt); |
10107 |
++ |
10108 |
++ if (mopt.is_mp_join) |
10109 |
++ return mptcp_do_join_short(skb, &mopt, sock_net(sk)); |
10110 |
++ if (mopt.drop_me) |
10111 |
++ goto drop; |
10112 |
++ |
10113 |
++ if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled) |
10114 |
++ mopt.saw_mpc = 0; |
10115 |
++ |
10116 |
++ if (skb->protocol == htons(ETH_P_IP)) { |
10117 |
++ if (mopt.saw_mpc && !want_cookie) { |
10118 |
++ if (skb_rtable(skb)->rt_flags & |
10119 |
++ (RTCF_BROADCAST | RTCF_MULTICAST)) |
10120 |
++ goto drop; |
10121 |
++ |
10122 |
++ return tcp_conn_request(&mptcp_request_sock_ops, |
10123 |
++ &mptcp_request_sock_ipv4_ops, |
10124 |
++ sk, skb); |
10125 |
++ } |
10126 |
++ |
10127 |
++ return tcp_v4_conn_request(sk, skb); |
10128 |
++#if IS_ENABLED(CONFIG_IPV6) |
10129 |
++ } else { |
10130 |
++ if (mopt.saw_mpc && !want_cookie) { |
10131 |
++ if (!ipv6_unicast_destination(skb)) |
10132 |
++ goto drop; |
10133 |
++ |
10134 |
++ return tcp_conn_request(&mptcp6_request_sock_ops, |
10135 |
++ &mptcp_request_sock_ipv6_ops, |
10136 |
++ sk, skb); |
10137 |
++ } |
10138 |
++ |
10139 |
++ return tcp_v6_conn_request(sk, skb); |
10140 |
++#endif |
10141 |
++ } |
10142 |
++drop: |
10143 |
++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); |
10144 |
++ return 0; |
10145 |
++} |
10146 |
++ |
10147 |
++struct workqueue_struct *mptcp_wq; |
10148 |
++EXPORT_SYMBOL(mptcp_wq); |
10149 |
++ |
10150 |
++/* Output /proc/net/mptcp */ |
10151 |
++static int mptcp_pm_seq_show(struct seq_file *seq, void *v) |
10152 |
++{ |
10153 |
++ struct tcp_sock *meta_tp; |
10154 |
++ const struct net *net = seq->private; |
10155 |
++ int i, n = 0; |
10156 |
++ |
10157 |
++ seq_printf(seq, " sl loc_tok rem_tok v6 local_address remote_address st ns tx_queue rx_queue inode"); |
10158 |
++ seq_putc(seq, '\n'); |
10159 |
++ |
10160 |
++ for (i = 0; i < MPTCP_HASH_SIZE; i++) { |
10161 |
++ struct hlist_nulls_node *node; |
10162 |
++ rcu_read_lock_bh(); |
10163 |
++ hlist_nulls_for_each_entry_rcu(meta_tp, node, |
10164 |
++ &tk_hashtable[i], tk_table) { |
10165 |
++ struct mptcp_cb *mpcb = meta_tp->mpcb; |
10166 |
++ struct sock *meta_sk = (struct sock *)meta_tp; |
10167 |
++ struct inet_sock *isk = inet_sk(meta_sk); |
10168 |
++ |
10169 |
++ if (!mptcp(meta_tp) || !net_eq(net, sock_net(meta_sk))) |
10170 |
++ continue; |
10171 |
++ |
10172 |
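++ /* The real tokens are only shown to privileged readers */ |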
++ if (capable(CAP_NET_ADMIN)) { |
10173 |
++ seq_printf(seq, "%4d: %04X %04X ", n++, |
10174 |
++ mpcb->mptcp_loc_token, |
10175 |
++ mpcb->mptcp_rem_token); |
10176 |
++ } else { |
10177 |
++ seq_printf(seq, "%4d: %04X %04X ", n++, -1, -1); |
10178 |
++ } |
10179 |
++ if (meta_sk->sk_family == AF_INET || |
10180 |
++ mptcp_v6_is_v4_mapped(meta_sk)) { |
10181 |
++ seq_printf(seq, " 0 %08X:%04X %08X:%04X ", |
10182 |
++ isk->inet_rcv_saddr, |
10183 |
++ ntohs(isk->inet_sport), |
10184 |
++ isk->inet_daddr, |
10185 |
++ ntohs(isk->inet_dport)); |
10186 |
++#if IS_ENABLED(CONFIG_IPV6) |
10187 |
++ } else if (meta_sk->sk_family == AF_INET6) { |
10188 |
++ struct in6_addr *src = &meta_sk->sk_v6_rcv_saddr; |
10189 |
++ struct in6_addr *dst = &meta_sk->sk_v6_daddr; |
10190 |
++ seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X", |
10191 |
++ src->s6_addr32[0], src->s6_addr32[1], |
10192 |
++ src->s6_addr32[2], src->s6_addr32[3], |
10193 |
++ ntohs(isk->inet_sport), |
10194 |
++ dst->s6_addr32[0], dst->s6_addr32[1], |
10195 |
++ dst->s6_addr32[2], dst->s6_addr32[3], |
10196 |
++ ntohs(isk->inet_dport)); |
10197 |
++#endif |
10198 |
++ } |
10199 |
++ seq_printf(seq, " %02X %02X %08X:%08X %lu", |
10200 |
++ meta_sk->sk_state, mpcb->cnt_subflows, |
10201 |
++ meta_tp->write_seq - meta_tp->snd_una, |
10202 |
++ max_t(int, meta_tp->rcv_nxt - |
10203 |
++ meta_tp->copied_seq, 0), |
10204 |
++ sock_i_ino(meta_sk)); |
10205 |
++ seq_putc(seq, '\n'); |
10206 |
++ } |
10207 |
++ |
10208 |
++ rcu_read_unlock_bh(); |
10209 |
++ } |
10210 |
++ |
10211 |
++ return 0; |
10212 |
++} |
10213 |
++ |
10214 |
++static int mptcp_pm_seq_open(struct inode *inode, struct file *file) |
10215 |
++{ |
10216 |
++ return single_open_net(inode, file, mptcp_pm_seq_show); |
10217 |
++} |
10218 |
++ |
10219 |
++static const struct file_operations mptcp_pm_seq_fops = { |
10220 |
++ .owner = THIS_MODULE, |
10221 |
++ .open = mptcp_pm_seq_open, |
10222 |
++ .read = seq_read, |
10223 |
++ .llseek = seq_lseek, |
10224 |
++ .release = single_release_net, |
10225 |
++}; |
10226 |
++ |
10227 |
++static int mptcp_pm_init_net(struct net *net) |
10228 |
++{ |
10229 |
++ if (!proc_create("mptcp", S_IRUGO, net->proc_net, &mptcp_pm_seq_fops)) |
10230 |
++ return -ENOMEM; |
10231 |
++ |
10232 |
++ return 0; |
10233 |
++} |
10234 |
++ |
10235 |
++static void mptcp_pm_exit_net(struct net *net) |
10236 |
++{ |
10237 |
++ remove_proc_entry("mptcp", net->proc_net); |
10238 |
++} |
10239 |
++ |
10240 |
++static struct pernet_operations mptcp_pm_proc_ops = { |
10241 |
++ .init = mptcp_pm_init_net, |
10242 |
++ .exit = mptcp_pm_exit_net, |
10243 |
++}; |
10244 |
++ |
10245 |
++/* General initialization of mptcp */ |
10246 |
++void __init mptcp_init(void) |
10247 |
++{ |
10248 |
++ int i; |
10249 |
++ struct ctl_table_header *mptcp_sysctl; |
10250 |
++ |
10251 |
++ mptcp_sock_cache = kmem_cache_create("mptcp_sock", |
10252 |
++ sizeof(struct mptcp_tcp_sock), |
10253 |
++ 0, SLAB_HWCACHE_ALIGN, |
10254 |
++ NULL); |
10255 |
++ if (!mptcp_sock_cache) |
10256 |
++ goto mptcp_sock_cache_failed; |
10257 |
++ |
10258 |
++ mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb), |
10259 |
++ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN, |
10260 |
++ NULL); |
10261 |
++ if (!mptcp_cb_cache) |
10262 |
++ goto mptcp_cb_cache_failed; |
10263 |
++ |
10264 |
++ mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw), |
10265 |
++ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN, |
10266 |
++ NULL); |
10267 |
++ if (!mptcp_tw_cache) |
10268 |
++ goto mptcp_tw_cache_failed; |
10269 |
++ |
10270 |
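++ /* Per-boot random secret (feeds the nonce/key generation elsewhere |
++ * in the stack) |
++ */ |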
++ get_random_bytes(mptcp_secret, sizeof(mptcp_secret)); |
10271 |
++ |
10272 |
++ mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); |
10273 |
++ if (!mptcp_wq) |
10274 |
++ goto alloc_workqueue_failed; |
10275 |
++ |
10276 |
++ for (i = 0; i < MPTCP_HASH_SIZE; i++) { |
10277 |
++ INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i); |
10278 |
++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_htb[i], |
10279 |
++ i + MPTCP_REQSK_NULLS_BASE); |
10280 |
++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i); |
10281 |
++ } |
10282 |
++ |
10283 |
++ spin_lock_init(&mptcp_reqsk_hlock); |
10284 |
++ spin_lock_init(&mptcp_tk_hashlock); |
10285 |
++ |
10286 |
++ if (register_pernet_subsys(&mptcp_pm_proc_ops)) |
10287 |
++ goto pernet_failed; |
10288 |
++ |
10289 |
++#if IS_ENABLED(CONFIG_IPV6) |
10290 |
++ if (mptcp_pm_v6_init()) |
10291 |
++ goto mptcp_pm_v6_failed; |
10292 |
++#endif |
10293 |
++ if (mptcp_pm_v4_init()) |
10294 |
++ goto mptcp_pm_v4_failed; |
10295 |
++ |
10296 |
++ mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table); |
10297 |
++ if (!mptcp_sysctl) |
10298 |
++ goto register_sysctl_failed; |
10299 |
++ |
10300 |
++ if (mptcp_register_path_manager(&mptcp_pm_default)) |
10301 |
++ goto register_pm_failed; |
10302 |
++ |
10303 |
++ if (mptcp_register_scheduler(&mptcp_sched_default)) |
10304 |
++ goto register_sched_failed; |
10305 |
++ |
10306 |
++ pr_info("MPTCP: Stable release v0.89.0-rc\n"); |
10307 |
++ |
10308 |
++ mptcp_init_failed = false; |
10309 |
++ |
10310 |
++ return; |
10311 |
++ |
10312 |
++register_sched_failed: |
10313 |
++ mptcp_unregister_path_manager(&mptcp_pm_default); |
10314 |
++register_pm_failed: |
10315 |
++ unregister_net_sysctl_table(mptcp_sysctl); |
10316 |
++register_sysctl_failed: |
10317 |
++ mptcp_pm_v4_undo(); |
10318 |
++mptcp_pm_v4_failed: |
10319 |
++#if IS_ENABLED(CONFIG_IPV6) |
10320 |
++ mptcp_pm_v6_undo(); |
10321 |
++mptcp_pm_v6_failed: |
10322 |
++#endif |
10323 |
++ unregister_pernet_subsys(&mptcp_pm_proc_ops); |
10324 |
++pernet_failed: |
10325 |
++ destroy_workqueue(mptcp_wq); |
10326 |
++alloc_workqueue_failed: |
10327 |
++ kmem_cache_destroy(mptcp_tw_cache); |
10328 |
++mptcp_tw_cache_failed: |
10329 |
++ kmem_cache_destroy(mptcp_cb_cache); |
10330 |
++mptcp_cb_cache_failed: |
10331 |
++ kmem_cache_destroy(mptcp_sock_cache); |
10332 |
++mptcp_sock_cache_failed: |
10333 |
++ mptcp_init_failed = true; |
10334 |
++} |
10335 |
+diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c |
10336 |
+new file mode 100644 |
10337 |
+index 000000000000..3a54413ce25b |
10338 |
+--- /dev/null |
10339 |
++++ b/net/mptcp/mptcp_fullmesh.c |
10340 |
+@@ -0,0 +1,1722 @@ |
10341 |
++#include <linux/module.h> |
10342 |
++ |
10343 |
++#include <net/mptcp.h> |
10344 |
++#include <net/mptcp_v4.h> |
10345 |
++ |
10346 |
++#if IS_ENABLED(CONFIG_IPV6) |
10347 |
++#include <net/mptcp_v6.h> |
10348 |
++#include <net/addrconf.h> |
10349 |
++#endif |
10350 |
++ |
10351 |
++enum { |
10352 |
++ MPTCP_EVENT_ADD = 1, |
10353 |
++ MPTCP_EVENT_DEL, |
10354 |
++ MPTCP_EVENT_MOD, |
10355 |
++}; |
10356 |
++ |
10357 |
++#define MPTCP_SUBFLOW_RETRY_DELAY 1000 |
10358 |
++ |
10359 |
++/* Max number of local or remote addresses we can store. |
10360 |
++ * When changing, see the bitfield below in fullmesh_rem4/6. |
10361 |
++ */ |
10362 |
++#define MPTCP_MAX_ADDR 8 |
10363 |
++ |
10364 |
++struct fullmesh_rem4 { |
10365 |
++ u8 rem4_id; |
10366 |
++ u8 bitfield; |
10367 |
++ u8 retry_bitfield; |
10368 |
++ __be16 port; |
10369 |
++ struct in_addr addr; |
10370 |
++}; |
10371 |
++ |
10372 |
++struct fullmesh_rem6 { |
10373 |
++ u8 rem6_id; |
10374 |
++ u8 bitfield; |
10375 |
++ u8 retry_bitfield; |
10376 |
++ __be16 port; |
10377 |
++ struct in6_addr addr; |
10378 |
++}; |
10379 |
++ |
10380 |
++struct mptcp_loc_addr { |
10381 |
++ struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR]; |
10382 |
++ u8 loc4_bits; |
10383 |
++ u8 next_v4_index; |
10384 |
++ |
10385 |
++ struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR]; |
10386 |
++ u8 loc6_bits; |
10387 |
++ u8 next_v6_index; |
10388 |
++}; |
10389 |
++ |
10390 |
++struct mptcp_addr_event { |
10391 |
++ struct list_head list; |
10392 |
++ unsigned short family; |
10393 |
++ u8 code:7, |
10394 |
++ low_prio:1; |
10395 |
++ union inet_addr addr; |
10396 |
++}; |
10397 |
++ |
10398 |
++struct fullmesh_priv { |
10399 |
++ /* Worker struct for subflow establishment */ |
10400 |
++ struct work_struct subflow_work; |
10401 |
++ /* Delayed worker, when the routing-tables are not yet ready. */ |
10402 |
++ struct delayed_work subflow_retry_work; |
10403 |
++ |
10404 |
++ /* Remote addresses */ |
10405 |
++ struct fullmesh_rem4 remaddr4[MPTCP_MAX_ADDR]; |
10406 |
++ struct fullmesh_rem6 remaddr6[MPTCP_MAX_ADDR]; |
10407 |
++ |
10408 |
++ struct mptcp_cb *mpcb; |
10409 |
++ |
10410 |
++ u16 remove_addrs; /* Addresses to remove */ |
10411 |
++ u8 announced_addrs_v4; /* IPv4 Addresses we did announce */ |
10412 |
++ u8 announced_addrs_v6; /* IPv6 Addresses we did announce */ |
10413 |
++ |
10414 |
++ u8 add_addr; /* Are we sending an add_addr? */ |
10415 |
++ |
10416 |
++ u8 rem4_bits; |
10417 |
++ u8 rem6_bits; |
10418 |
++}; |
10419 |
++ |
10420 |
++struct mptcp_fm_ns { |
10421 |
++ struct mptcp_loc_addr __rcu *local; |
10422 |
++ spinlock_t local_lock; /* Protecting the above pointer */ |
10423 |
++ struct list_head events; |
10424 |
++ struct delayed_work address_worker; |
10425 |
++ |
10426 |
++ struct net *net; |
10427 |
++}; |
10428 |
++ |
10429 |
++static struct mptcp_pm_ops full_mesh __read_mostly; |
10430 |
++ |
10431 |
++static void full_mesh_create_subflows(struct sock *meta_sk); |
10432 |
++ |
10433 |
++static struct mptcp_fm_ns *fm_get_ns(const struct net *net) |
10434 |
++{ |
10435 |
++ return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH]; |
10436 |
++} |
10437 |
++ |
10438 |
++static struct fullmesh_priv *fullmesh_get_priv(const struct mptcp_cb *mpcb) |
10439 |
++{ |
10440 |
++ return (struct fullmesh_priv *)&mpcb->mptcp_pm[0]; |
10441 |
++} |
10442 |
++ |
10443 |
++/* Find the first free index in the bitfield */ |
10444 |
++static int __mptcp_find_free_index(u8 bitfield, u8 base) |
10445 |
++{ |
10446 |
++ int i; |
10447 |
++ |
10448 |
++ /* There are anyway no free bits... */ |
10449 |
++ if (bitfield == 0xff) |
10450 |
++ goto exit; |
10451 |
++ |
10452 |
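++ /* ffs() on the inverted field finds the lowest zero bit >= base */ |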
++ i = ffs(~(bitfield >> base)) - 1; |
10453 |
++ if (i < 0) |
10454 |
++ goto exit; |
10455 |
++ |
10456 |
++ /* No free bits when starting at base, try from 0 on */ |
10457 |
++ if (i + base >= sizeof(bitfield) * 8) |
10458 |
++ return __mptcp_find_free_index(bitfield, 0); |
10459 |
++ |
10460 |
++ return i + base; |
10461 |
++exit: |
10462 |
++ return -1; |
10463 |
++} |
10464 |
++ |
10465 |
++static int mptcp_find_free_index(u8 bitfield) |
10466 |
++{ |
10467 |
++ return __mptcp_find_free_index(bitfield, 0); |
10468 |
++} |
10469 |
++ |
10470 |
++static void mptcp_addv4_raddr(struct mptcp_cb *mpcb, |
10471 |
++ const struct in_addr *addr, |
10472 |
++ __be16 port, u8 id) |
10473 |
++{ |
10474 |
++ int i; |
10475 |
++ struct fullmesh_rem4 *rem4; |
10476 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
10477 |
++ |
10478 |
++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { |
10479 |
++ rem4 = &fmp->remaddr4[i]; |
10480 |
++ |
10481 |
++ /* Address is already in the list --- continue */ |
10482 |
++ if (rem4->rem4_id == id && |
10483 |
++ rem4->addr.s_addr == addr->s_addr && rem4->port == port) |
10484 |
++ return; |
10485 |
++ |
10486 |
++ /* This may happen when the peer is behind a NAT. It is |
10487 |
++ * trying to JOIN, thus sending the JOIN with a certain ID. |
10488 |
++ * However the src_addr of the IP-packet has been changed. We |
10489 |
++ * update the addr in the list, because this is the address as |
10490 |
++ * OUR BOX sees it. |
10491 |
++ */ |
10492 |
++ if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) { |
10493 |
++ /* update the address */ |
10494 |
++ mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n", |
10495 |
++ __func__, &rem4->addr.s_addr, |
10496 |
++ &addr->s_addr, id); |
10497 |
++ rem4->addr.s_addr = addr->s_addr; |
10498 |
++ rem4->port = port; |
10499 |
++ mpcb->list_rcvd = 1; |
10500 |
++ return; |
10501 |
++ } |
10502 |
++ } |
10503 |
++ |
10504 |
++ i = mptcp_find_free_index(fmp->rem4_bits); |
10505 |
++ /* Are we already at the maximum number of remote addresses? */ |
10506 |
++ if (i < 0) { |
10507 |
++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n", |
10508 |
++ __func__, MPTCP_MAX_ADDR, &addr->s_addr); |
10509 |
++ return; |
10510 |
++ } |
10511 |
++ |
10512 |
++ rem4 = &fmp->remaddr4[i]; |
10513 |
++ |
10514 |
++ /* Address is not known yet, store it */ |
10515 |
++ rem4->addr.s_addr = addr->s_addr; |
10516 |
++ rem4->port = port; |
10517 |
++ rem4->bitfield = 0; |
10518 |
++ rem4->retry_bitfield = 0; |
10519 |
++ rem4->rem4_id = id; |
10520 |
++ mpcb->list_rcvd = 1; |
10521 |
++ fmp->rem4_bits |= (1 << i); |
10522 |
++ |
10523 |
++ return; |
10524 |
++} |
10525 |
++ |
10526 |
++static void mptcp_addv6_raddr(struct mptcp_cb *mpcb, |
10527 |
++ const struct in6_addr *addr, |
10528 |
++ __be16 port, u8 id) |
10529 |
++{ |
10530 |
++ int i; |
10531 |
++ struct fullmesh_rem6 *rem6; |
10532 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
10533 |
++ |
10534 |
++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { |
10535 |
++ rem6 = &fmp->remaddr6[i]; |
10536 |
++ |
10537 |
++ /* Address is already in the list --- continue */ |
10538 |
++ if (rem6->rem6_id == id && |
10539 |
++ ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port) |
10540 |
++ return; |
10541 |
++ |
10542 |
++ /* This may happen when the peer is behind a NAT. It is |
10543 |
++ * trying to JOIN, thus sending the JOIN with a certain ID. |
10544 |
++ * However the src_addr of the IP-packet has been changed. We |
10545 |
++ * update the addr in the list, because this is the address as |
10546 |
++ * OUR BOX sees it. |
10547 |
++ */ |
10548 |
++ if (rem6->rem6_id == id) { |
10549 |
++ /* update the address */ |
10550 |
++ mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n", |
10551 |
++ __func__, &rem6->addr, addr, id); |
10552 |
++ rem6->addr = *addr; |
10553 |
++ rem6->port = port; |
10554 |
++ mpcb->list_rcvd = 1; |
10555 |
++ return; |
10556 |
++ } |
10557 |
++ } |
10558 |
++ |
10559 |
++ i = mptcp_find_free_index(fmp->rem6_bits); |
10560 |
++ /* Do we have already the maximum number of local/remote addresses? */ |
10561 |
++ if (i < 0) { |
10562 |
++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n", |
10563 |
++ __func__, MPTCP_MAX_ADDR, addr); |
10564 |
++ return; |
10565 |
++ } |
10566 |
++ |
10567 |
++ rem6 = &fmp->remaddr6[i]; |
10568 |
++ |
10569 |
++ /* Address is not known yet, store it */ |
10570 |
++ rem6->addr = *addr; |
10571 |
++ rem6->port = port; |
10572 |
++ rem6->bitfield = 0; |
10573 |
++ rem6->retry_bitfield = 0; |
10574 |
++ rem6->rem6_id = id; |
10575 |
++ mpcb->list_rcvd = 1; |
10576 |
++ fmp->rem6_bits |= (1 << i); |
10577 |
++ |
10578 |
++ return; |
10579 |
++} |
10580 |
++ |
10581 |
++static void mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id) |
10582 |
++{ |
10583 |
++ int i; |
10584 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
10585 |
++ |
10586 |
++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { |
10587 |
++ if (fmp->remaddr4[i].rem4_id == id) { |
10588 |
++ /* remove address from bitfield */ |
10589 |
++ fmp->rem4_bits &= ~(1 << i); |
10590 |
++ |
10591 |
++ break; |
10592 |
++ } |
10593 |
++ } |
10594 |
++} |
10595 |
++ |
10596 |
++static void mptcp_v6_rem_raddress(const struct mptcp_cb *mpcb, u8 id) |
10597 |
++{ |
10598 |
++ int i; |
10599 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
10600 |
++ |
10601 |
++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { |
10602 |
++ if (fmp->remaddr6[i].rem6_id == id) { |
10603 |
++ /* remove address from bitfield */ |
10604 |
++ fmp->rem6_bits &= ~(1 << i); |
10605 |
++ |
10606 |
++ break; |
10607 |
++ } |
10608 |
++ } |
10609 |
++} |
10610 |
++ |
10611 |
++/* Sets the bitfield of the remote-address field */ |
10612 |
++static void mptcp_v4_set_init_addr_bit(const struct mptcp_cb *mpcb, |
10613 |
++ const struct in_addr *addr, u8 index) |
10614 |
++{ |
10615 |
++ int i; |
10616 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
10617 |
++ |
10618 |
++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { |
10619 |
++ if (fmp->remaddr4[i].addr.s_addr == addr->s_addr) { |
10620 |
++ fmp->remaddr4[i].bitfield |= (1 << index); |
10621 |
++ return; |
10622 |
++ } |
10623 |
++ } |
10624 |
++} |
10625 |
++ |
10626 |
++/* Sets the bitfield of the remote-address field */ |
10627 |
++static void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb, |
10628 |
++ const struct in6_addr *addr, u8 index) |
10629 |
++{ |
10630 |
++ int i; |
10631 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
10632 |
++ |
10633 |
++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { |
10634 |
++ if (ipv6_addr_equal(&fmp->remaddr6[i].addr, addr)) { |
10635 |
++ fmp->remaddr6[i].bitfield |= (1 << index); |
10636 |
++ return; |
10637 |
++ } |
10638 |
++ } |
10639 |
++} |
10640 |
++ |
10641 |
++static void mptcp_set_init_addr_bit(struct mptcp_cb *mpcb, |
10642 |
++ const union inet_addr *addr, |
10643 |
++ sa_family_t family, u8 id) |
10644 |
++{ |
10645 |
++ if (family == AF_INET) |
10646 |
++ mptcp_v4_set_init_addr_bit(mpcb, &addr->in, id); |
10647 |
++ else |
10648 |
++ mptcp_v6_set_init_addr_bit(mpcb, &addr->in6, id); |
10649 |
++} |
10650 |
++ |
10651 |
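++/* Retries the subflows for which no route was available on the first |
++ * attempt (see the -ENETUNREACH handling in create_subflow_worker()). |
++ */ |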
++static void retry_subflow_worker(struct work_struct *work) |
10652 |
++{ |
10653 |
++ struct delayed_work *delayed_work = container_of(work, |
10654 |
++ struct delayed_work, |
10655 |
++ work); |
10656 |
++ struct fullmesh_priv *fmp = container_of(delayed_work, |
10657 |
++ struct fullmesh_priv, |
10658 |
++ subflow_retry_work); |
10659 |
++ struct mptcp_cb *mpcb = fmp->mpcb; |
10660 |
++ struct sock *meta_sk = mpcb->meta_sk; |
10661 |
++ struct mptcp_loc_addr *mptcp_local; |
10662 |
++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); |
10663 |
++ int iter = 0, i; |
10664 |
++ |
10665 |
++ /* We need a local (stable) copy of the address-list. Really, it is not |
10666 |
++ * such a big deal if the address-list is not 100% up-to-date. |
10667 |
++ */ |
10668 |
++ rcu_read_lock_bh(); |
10669 |
++ mptcp_local = rcu_dereference_bh(fm_ns->local); |
10670 |
++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC); |
10671 |
++ rcu_read_unlock_bh(); |
10672 |
++ |
10673 |
++ if (!mptcp_local) |
10674 |
++ return; |
10675 |
++ |
10676 |
++next_subflow: |
10677 |
++ if (iter) { |
10678 |
++ release_sock(meta_sk); |
10679 |
++ mutex_unlock(&mpcb->mpcb_mutex); |
10680 |
++ |
10681 |
++ cond_resched(); |
10682 |
++ } |
10683 |
++ mutex_lock(&mpcb->mpcb_mutex); |
10684 |
++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); |
10685 |
++ |
10686 |
++ iter++; |
10687 |
++ |
10688 |
++ if (sock_flag(meta_sk, SOCK_DEAD)) |
10689 |
++ goto exit; |
10690 |
++ |
10691 |
++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { |
10692 |
++ struct fullmesh_rem4 *rem = &fmp->remaddr4[i]; |
10693 |
++ /* Do we need to retry establishing a subflow ? */ |
10694 |
++ if (rem->retry_bitfield) { |
10695 |
++ int i = mptcp_find_free_index(~rem->retry_bitfield); |
10696 |
++ struct mptcp_rem4 rem4; |
10697 |
++ |
10698 |
++ rem->bitfield |= (1 << i); |
10699 |
++ rem->retry_bitfield &= ~(1 << i); |
10700 |
++ |
10701 |
++ rem4.addr = rem->addr; |
10702 |
++ rem4.port = rem->port; |
10703 |
++ rem4.rem4_id = rem->rem4_id; |
10704 |
++ |
10705 |
++ mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], &rem4); |
10706 |
++ goto next_subflow; |
10707 |
++ } |
10708 |
++ } |
10709 |
++ |
10710 |
++#if IS_ENABLED(CONFIG_IPV6) |
10711 |
++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { |
10712 |
++ struct fullmesh_rem6 *rem = &fmp->remaddr6[i]; |
10713 |
++ |
10714 |
++ /* Do we need to retry establishing a subflow ? */ |
10715 |
++ if (rem->retry_bitfield) { |
10716 |
++ int i = mptcp_find_free_index(~rem->retry_bitfield); |
10717 |
++ struct mptcp_rem6 rem6; |
10718 |
++ |
10719 |
++ rem->bitfield |= (1 << i); |
10720 |
++ rem->retry_bitfield &= ~(1 << i); |
10721 |
++ |
10722 |
++ rem6.addr = rem->addr; |
10723 |
++ rem6.port = rem->port; |
10724 |
++ rem6.rem6_id = rem->rem6_id; |
10725 |
++ |
10726 |
++ mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], &rem6); |
10727 |
++ goto next_subflow; |
10728 |
++ } |
10729 |
++ } |
10730 |
++#endif |
10731 |
++ |
10732 |
++exit: |
10733 |
++ kfree(mptcp_local); |
10734 |
++ release_sock(meta_sk); |
10735 |
++ mutex_unlock(&mpcb->mpcb_mutex); |
10736 |
++ sock_put(meta_sk); |
10737 |
++} |
10738 |
++ |
10739 |
++/** |
10740 |
++ * Create all new subflows by calling mptcp_initX_subsockets() |
10741 |
++ * |
10742 |
++ * This function uses a goto next_subflow to allow releasing the lock between |
10743 |
++ * new subflows and giving other processes a chance to do some work on the |
10744 |
++ * socket and potentially finishing the communication. |
10745 |
++ **/ |
10746 |
++static void create_subflow_worker(struct work_struct *work) |
10747 |
++{ |
10748 |
++ struct fullmesh_priv *fmp = container_of(work, struct fullmesh_priv, |
10749 |
++ subflow_work); |
10750 |
++ struct mptcp_cb *mpcb = fmp->mpcb; |
10751 |
++ struct sock *meta_sk = mpcb->meta_sk; |
10752 |
++ struct mptcp_loc_addr *mptcp_local; |
10753 |
++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); |
10754 |
++ int iter = 0, retry = 0; |
10755 |
++ int i; |
10756 |
++ |
10757 |
++ /* We need a local (stable) copy of the address-list. Really, it is not |
10758 |
++ * such a big deal if the address-list is not 100% up-to-date. |
10759 |
++ */ |
10760 |
++ rcu_read_lock_bh(); |
10761 |
++ mptcp_local = rcu_dereference_bh(fm_ns->local); |
10762 |
++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC); |
10763 |
++ rcu_read_unlock_bh(); |
10764 |
++ |
10765 |
++ if (!mptcp_local) |
10766 |
++ return; |
10767 |
++ |
10768 |
++next_subflow: |
10769 |
++ if (iter) { |
10770 |
++ release_sock(meta_sk); |
10771 |
++ mutex_unlock(&mpcb->mpcb_mutex); |
10772 |
++ |
10773 |
++ cond_resched(); |
10774 |
++ } |
10775 |
++ mutex_lock(&mpcb->mpcb_mutex); |
10776 |
++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); |
10777 |
++ |
10778 |
++ iter++; |
10779 |
++ |
10780 |
++ if (sock_flag(meta_sk, SOCK_DEAD)) |
10781 |
++ goto exit; |
10782 |
++ |
10783 |
++ if (mpcb->master_sk && |
10784 |
++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) |
10785 |
++ goto exit; |
10786 |
++ |
10787 |
++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { |
10788 |
++ struct fullmesh_rem4 *rem; |
10789 |
++ u8 remaining_bits; |
10790 |
++ |
10791 |
++ rem = &fmp->remaddr4[i]; |
10792 |
++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits; |
10793 |
++ |
10794 |
++ /* Are there still combinations to handle? */ |
10795 |
++ if (remaining_bits) { |
10796 |
++ int i = mptcp_find_free_index(~remaining_bits); |
10797 |
++ struct mptcp_rem4 rem4; |
10798 |
++ |
10799 |
++ rem->bitfield |= (1 << i); |
10800 |
++ |
10801 |
++ rem4.addr = rem->addr; |
10802 |
++ rem4.port = rem->port; |
10803 |
++ rem4.rem4_id = rem->rem4_id; |
10804 |
++ |
10805 |
++ /* If a route is not yet available then retry once */ |
10806 |
++ if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], |
10807 |
++ &rem4) == -ENETUNREACH) |
10808 |
++ retry = rem->retry_bitfield |= (1 << i); |
10809 |
++ goto next_subflow; |
10810 |
++ } |
10811 |
++ } |
10812 |
++ |
10813 |
++#if IS_ENABLED(CONFIG_IPV6) |
10814 |
++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { |
10815 |
++ struct fullmesh_rem6 *rem; |
10816 |
++ u8 remaining_bits; |
10817 |
++ |
10818 |
++ rem = &fmp->remaddr6[i]; |
10819 |
++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits; |
10820 |
++ |
10821 |
++ /* Are there still combinations to handle? */ |
10822 |
++ if (remaining_bits) { |
10823 |
++ int i = mptcp_find_free_index(~remaining_bits); |
10824 |
++ struct mptcp_rem6 rem6; |
10825 |
++ |
10826 |
++ rem->bitfield |= (1 << i); |
10827 |
++ |
10828 |
++ rem6.addr = rem->addr; |
10829 |
++ rem6.port = rem->port; |
10830 |
++ rem6.rem6_id = rem->rem6_id; |
10831 |
++ |
10832 |
++ /* If a route is not yet available then retry once */ |
10833 |
++ if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], |
10834 |
++ &rem6) == -ENETUNREACH) |
10835 |
++ retry = rem->retry_bitfield |= (1 << i); |
10836 |
++ goto next_subflow; |
10837 |
++ } |
10838 |
++ } |
10839 |
++#endif |
10840 |
++ |
10841 |
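++ /* Some subflows had no route yet - retry them in a second */ |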
++ if (retry && !delayed_work_pending(&fmp->subflow_retry_work)) { |
10842 |
++ sock_hold(meta_sk); |
10843 |
++ queue_delayed_work(mptcp_wq, &fmp->subflow_retry_work, |
10844 |
++ msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY)); |
10845 |
++ } |
10846 |
++ |
10847 |
++exit: |
10848 |
++ kfree(mptcp_local); |
10849 |
++ release_sock(meta_sk); |
10850 |
++ mutex_unlock(&mpcb->mpcb_mutex); |
10851 |
++ sock_put(meta_sk); |
10852 |
++} |
10853 |
++ |
10854 |
++static void announce_remove_addr(u8 addr_id, struct sock *meta_sk) |
10855 |
++{ |
10856 |
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
10857 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
10858 |
++ struct sock *sk = mptcp_select_ack_sock(meta_sk); |
10859 |
++ |
10860 |
++ fmp->remove_addrs |= (1 << addr_id); |
10861 |
++ mpcb->addr_signal = 1; |
10862 |
++ |
10863 |
++ if (sk) |
10864 |
++ tcp_send_ack(sk); |
10865 |
++} |
10866 |
++ |
10867 |
++static void update_addr_bitfields(struct sock *meta_sk, |
10868 |
++ const struct mptcp_loc_addr *mptcp_local) |
10869 |
++{ |
10870 |
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
10871 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
10872 |
++ int i; |
10873 |
++ |
10874 |
++ /* The bits in announced_addrs_* always match with loc*_bits. So, a |
10875 |
++ * simply & operation unsets the correct bits, because these go from |
10876 |
++ * announced to non-announced |
10877 |
++ */ |
10878 |
++ fmp->announced_addrs_v4 &= mptcp_local->loc4_bits; |
10879 |
++ |
10880 |
++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { |
10881 |
++ fmp->remaddr4[i].bitfield &= mptcp_local->loc4_bits; |
10882 |
++ fmp->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits; |
10883 |
++ } |
10884 |
++ |
10885 |
++ fmp->announced_addrs_v6 &= mptcp_local->loc6_bits; |
10886 |
++ |
10887 |
++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { |
10888 |
++ fmp->remaddr6[i].bitfield &= mptcp_local->loc6_bits; |
10889 |
++ fmp->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits; |
10890 |
++ } |
10891 |
++} |
10892 |
++ |
10893 |
++static int mptcp_find_address(const struct mptcp_loc_addr *mptcp_local, |
10894 |
++ sa_family_t family, const union inet_addr *addr) |
10895 |
++{ |
10896 |
++ int i; |
10897 |
++ u8 loc_bits; |
10898 |
++ bool found = false; |
10899 |
++ |
10900 |
++ if (family == AF_INET) |
10901 |
++ loc_bits = mptcp_local->loc4_bits; |
10902 |
++ else |
10903 |
++ loc_bits = mptcp_local->loc6_bits; |
10904 |
++ |
10905 |
++ mptcp_for_each_bit_set(loc_bits, i) { |
10906 |
++ if (family == AF_INET && |
10907 |
++ mptcp_local->locaddr4[i].addr.s_addr == addr->in.s_addr) { |
10908 |
++ found = true; |
10909 |
++ break; |
10910 |
++ } |
10911 |
++ if (family == AF_INET6 && |
10912 |
++ ipv6_addr_equal(&mptcp_local->locaddr6[i].addr, |
10913 |
++ &addr->in6)) { |
10914 |
++ found = true; |
10915 |
++ break; |
10916 |
++ } |
10917 |
++ } |
10918 |
++ |
10919 |
++ if (!found) |
10920 |
++ return -1; |
10921 |
++ |
10922 |
++ return i; |
10923 |
++} |
10924 |
++ |
10925 |
++static void mptcp_address_worker(struct work_struct *work) |
10926 |
++{ |
10927 |
++ const struct delayed_work *delayed_work = container_of(work, |
10928 |
++ struct delayed_work, |
10929 |
++ work); |
10930 |
++ struct mptcp_fm_ns *fm_ns = container_of(delayed_work, |
10931 |
++ struct mptcp_fm_ns, |
10932 |
++ address_worker); |
10933 |
++ struct net *net = fm_ns->net; |
10934 |
++ struct mptcp_addr_event *event = NULL; |
10935 |
++ struct mptcp_loc_addr *mptcp_local, *old; |
10936 |
++ int i, id = -1; /* id is used in the socket-code on a delete-event */ |
10937 |
++ bool success; /* Whether we succeeded in handling the event */ |
10938 |
++ |
10939 |
++next_event: |
10940 |
++ success = false; |
10941 |
++ kfree(event); |
10942 |
++ |
10943 |
++ /* First, let's dequeue an event from our event-list */ |
10944 |
++ rcu_read_lock_bh(); |
10945 |
++ spin_lock(&fm_ns->local_lock); |
10946 |
++ |
10947 |
++ event = list_first_entry_or_null(&fm_ns->events, |
10948 |
++ struct mptcp_addr_event, list); |
10949 |
++ if (!event) { |
10950 |
++ spin_unlock(&fm_ns->local_lock); |
10951 |
++ rcu_read_unlock_bh(); |
10952 |
++ return; |
10953 |
++ } |
10954 |
++ |
10955 |
++ list_del(&event->list); |
10956 |
++ |
10957 |
++ mptcp_local = rcu_dereference_bh(fm_ns->local); |
10958 |
++ |
10959 |
++ if (event->code == MPTCP_EVENT_DEL) { |
10960 |
++ id = mptcp_find_address(mptcp_local, event->family, &event->addr); |
10961 |
++ |
10962 |
++ /* Not in the list - so we don't care */ |
10963 |
++ if (id < 0) { |
10964 |
++ mptcp_debug("%s could not find id\n", __func__); |
10965 |
++ goto duno; |
10966 |
++ } |
10967 |
++ |
10968 |
++ old = mptcp_local; |
10969 |
++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), |
10970 |
++ GFP_ATOMIC); |
10971 |
++ if (!mptcp_local) |
10972 |
++ goto duno; |
10973 |
++ |
10974 |
++ if (event->family == AF_INET) |
10975 |
++ mptcp_local->loc4_bits &= ~(1 << id); |
10976 |
++ else |
10977 |
++ mptcp_local->loc6_bits &= ~(1 << id); |
10978 |
++ |
10979 |
++ rcu_assign_pointer(fm_ns->local, mptcp_local); |
10980 |
++ kfree(old); |
10981 |
++ } else { |
10982 |
++ int i = mptcp_find_address(mptcp_local, event->family, &event->addr); |
10983 |
++ int j = i; |
10984 |
++ |
10985 |
++ if (j < 0) { |
10986 |
++ /* Not in the list, so we have to find an empty slot */ |
10987 |
++ if (event->family == AF_INET) |
10988 |
++ i = __mptcp_find_free_index(mptcp_local->loc4_bits, |
10989 |
++ mptcp_local->next_v4_index); |
10990 |
++ if (event->family == AF_INET6) |
10991 |
++ i = __mptcp_find_free_index(mptcp_local->loc6_bits, |
10992 |
++ mptcp_local->next_v6_index); |
10993 |
++ |
10994 |
++ if (i < 0) { |
10995 |
++ mptcp_debug("%s no more space\n", __func__); |
10996 |
++ goto duno; |
10997 |
++ } |
10998 |
++ |
10999 |
++ /* It might have been a MOD-event. */ |
11000 |
++ event->code = MPTCP_EVENT_ADD; |
11001 |
++ } else { |
11002 |
++ /* Let's check if anything changes */ |
11003 |
++ if (event->family == AF_INET && |
11004 |
++ event->low_prio == mptcp_local->locaddr4[i].low_prio) |
11005 |
++ goto duno; |
11006 |
++ |
11007 |
++ if (event->family == AF_INET6 && |
11008 |
++ event->low_prio == mptcp_local->locaddr6[i].low_prio) |
11009 |
++ goto duno; |
11010 |
++ } |
11011 |
++ |
11012 |
++ old = mptcp_local; |
11013 |
++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), |
11014 |
++ GFP_ATOMIC); |
11015 |
++ if (!mptcp_local) |
11016 |
++ goto duno; |
11017 |
++ |
11018 |
++ if (event->family == AF_INET) { |
11019 |
++ mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr; |
11020 |
++ mptcp_local->locaddr4[i].loc4_id = i + 1; |
11021 |
++ mptcp_local->locaddr4[i].low_prio = event->low_prio; |
11022 |
++ } else { |
11023 |
++ mptcp_local->locaddr6[i].addr = event->addr.in6; |
11024 |
++ mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR; |
11025 |
++ mptcp_local->locaddr6[i].low_prio = event->low_prio; |
11026 |
++ } |
11027 |
++ |
11028 |
++ if (j < 0) { |
11029 |
++ if (event->family == AF_INET) { |
11030 |
++ mptcp_local->loc4_bits |= (1 << i); |
11031 |
++ mptcp_local->next_v4_index = i + 1; |
11032 |
++ } else { |
11033 |
++ mptcp_local->loc6_bits |= (1 << i); |
11034 |
++ mptcp_local->next_v6_index = i + 1; |
11035 |
++ } |
11036 |
++ } |
11037 |
++ |
11038 |
++ rcu_assign_pointer(fm_ns->local, mptcp_local); |
11039 |
++ kfree(old); |
11040 |
++ } |
11041 |
++ success = true; |
11042 |
++ |
11043 |
++duno: |
11044 |
++ spin_unlock(&fm_ns->local_lock); |
11045 |
++ rcu_read_unlock_bh(); |
11046 |
++ |
11047 |
++ if (!success) |
11048 |
++ goto next_event; |
11049 |
++ |
11050 |
++ /* Now we iterate over the MPTCP-sockets and apply the event. */ |
11051 |
++ for (i = 0; i < MPTCP_HASH_SIZE; i++) { |
11052 |
++ const struct hlist_nulls_node *node; |
11053 |
++ struct tcp_sock *meta_tp; |
11054 |
++ |
11055 |
++ rcu_read_lock_bh(); |
11056 |
++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i], |
11057 |
++ tk_table) { |
11058 |
++ struct mptcp_cb *mpcb = meta_tp->mpcb; |
11059 |
++ struct sock *meta_sk = (struct sock *)meta_tp, *sk; |
11060 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
11061 |
++ bool meta_v4 = meta_sk->sk_family == AF_INET; |
11062 |
++ |
11063 |
++ if (sock_net(meta_sk) != net) |
11064 |
++ continue; |
11065 |
++ |
11066 |
++ if (meta_v4) { |
11067 |
++ /* skip IPv6 events if meta is IPv4 */ |
11068 |
++ if (event->family == AF_INET6) |
11069 |
++ continue; |
11070 |
++ } |
11071 |
++ /* skip IPv4 events if IPV6_V6ONLY is set */ |
11072 |
++ else if (event->family == AF_INET && |
11073 |
++ inet6_sk(meta_sk)->ipv6only) |
11074 |
++ continue; |
11075 |
++ |
11076 |
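++ /* Skip sockets that are already on their way to being freed */ |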
++ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt))) |
11077 |
++ continue; |
11078 |
++ |
11079 |
++ bh_lock_sock(meta_sk); |
11080 |
++ |
11081 |
++ if (!mptcp(meta_tp) || !is_meta_sk(meta_sk) || |
11082 |
++ mpcb->infinite_mapping_snd || |
11083 |
++ mpcb->infinite_mapping_rcv || |
11084 |
++ mpcb->send_infinite_mapping) |
11085 |
++ goto next; |
11086 |
++ |
11087 |
++ /* May be that the pm has changed in-between */ |
11088 |
++ if (mpcb->pm_ops != &full_mesh) |
11089 |
++ goto next; |
11090 |
++ |
11091 |
++ if (sock_owned_by_user(meta_sk)) { |
11092 |
++ if (!test_and_set_bit(MPTCP_PATH_MANAGER, |
11093 |
++ &meta_tp->tsq_flags)) |
11094 |
++ sock_hold(meta_sk); |
11095 |
++ |
11096 |
++ goto next; |
11097 |
++ } |
11098 |
++ |
11099 |
++ if (event->code == MPTCP_EVENT_ADD) { |
11100 |
++ fmp->add_addr++; |
11101 |
++ mpcb->addr_signal = 1; |
11102 |
++ |
11103 |
++ sk = mptcp_select_ack_sock(meta_sk); |
11104 |
++ if (sk) |
11105 |
++ tcp_send_ack(sk); |
11106 |
++ |
11107 |
++ full_mesh_create_subflows(meta_sk); |
11108 |
++ } |
11109 |
++ |
11110 |
++ if (event->code == MPTCP_EVENT_DEL) { |
11111 |
++ struct sock *sk, *tmpsk; |
11112 |
++ struct mptcp_loc_addr *mptcp_local; |
11113 |
++ bool found = false; |
11114 |
++ |
11115 |
++ mptcp_local = rcu_dereference_bh(fm_ns->local); |
11116 |
++ |
11117 |
++ /* In any case, we need to update our bitfields */ |
11118 |
++ if (id >= 0) |
11119 |
++ update_addr_bitfields(meta_sk, mptcp_local); |
11120 |
++ |
11121 |
++ /* Look for the matching subflow-sockets and remove them */ |
11122 |
++ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) { |
11123 |
++ if ((event->family == AF_INET6 && |
11124 |
++ (sk->sk_family == AF_INET || |
11125 |
++ mptcp_v6_is_v4_mapped(sk))) || |
11126 |
++ (event->family == AF_INET && |
11127 |
++ (sk->sk_family == AF_INET6 && |
11128 |
++ !mptcp_v6_is_v4_mapped(sk)))) |
11129 |
++ continue; |
11130 |
++ |
11131 |
++ if (event->family == AF_INET && |
11132 |
++ (sk->sk_family == AF_INET || |
11133 |
++ mptcp_v6_is_v4_mapped(sk)) && |
11134 |
++ inet_sk(sk)->inet_saddr != event->addr.in.s_addr) |
11135 |
++ continue; |
11136 |
++ |
11137 |
++ if (event->family == AF_INET6 && |
11138 |
++ sk->sk_family == AF_INET6 && |
11139 |
++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) |
11140 |
++ continue; |
11141 |
++ |
11142 |
++ /* Reinject, so that pf = 1 and so we |
11143 |
++ * won't select this one as the |
11144 |
++ * ack-sock. |
11145 |
++ */ |
11146 |
++ mptcp_reinject_data(sk, 0); |
11147 |
++ |
11148 |
++ /* We announce the removal of this id */ |
11149 |
++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, meta_sk); |
11150 |
++ |
11151 |
++ mptcp_sub_force_close(sk); |
11152 |
++ found = true; |
11153 |
++ } |
11154 |
++ |
11155 |
++ if (found) |
11156 |
++ goto next; |
11157 |
++ |
11158 |
++ /* The id may have been given by the event, |
11159 |
++ * matching on a local address. And it may not |
11160 |
++ * have matched on one of the above sockets, |
11161 |
++ * because the client never created a subflow. |
11162 |
++ * So, announce its removal here. |
11163 |
++ */ |
11164 |
++ if (id > 0) |
11165 |
++ announce_remove_addr(id, meta_sk); |
11166 |
++ } |
11167 |
++ |
11168 |
++ if (event->code == MPTCP_EVENT_MOD) { |
11169 |
++ struct sock *sk; |
11170 |
++ |
11171 |
++ mptcp_for_each_sk(mpcb, sk) { |
11172 |
++ struct tcp_sock *tp = tcp_sk(sk); |
11173 |
++ if (event->family == AF_INET && |
11174 |
++ (sk->sk_family == AF_INET || |
11175 |
++ mptcp_v6_is_v4_mapped(sk)) && |
11176 |
++ inet_sk(sk)->inet_saddr == event->addr.in.s_addr) { |
11177 |
++ if (event->low_prio != tp->mptcp->low_prio) { |
11178 |
++ tp->mptcp->send_mp_prio = 1; |
11179 |
++ tp->mptcp->low_prio = event->low_prio; |
11180 |
++ |
11181 |
++ tcp_send_ack(sk); |
11182 |
++ } |
11183 |
++ } |
11184 |
++ |
11185 |
++ if (event->family == AF_INET6 && |
11186 |
++ sk->sk_family == AF_INET6 && |
11187 |
++ ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) { |
11188 |
++ if (event->low_prio != tp->mptcp->low_prio) { |
11189 |
++ tp->mptcp->send_mp_prio = 1; |
11190 |
++ tp->mptcp->low_prio = event->low_prio; |
11191 |
++ |
11192 |
++ tcp_send_ack(sk); |
11193 |
++ } |
11194 |
++ } |
11195 |
++ } |
11196 |
++ } |
11197 |
++next: |
11198 |
++ bh_unlock_sock(meta_sk); |
11199 |
++ sock_put(meta_sk); |
11200 |
++ } |
11201 |
++ rcu_read_unlock_bh(); |
11202 |
++ } |
11203 |
++ goto next_event; |
11204 |
++} |
11205 |
++ |
11206 |
++static struct mptcp_addr_event *lookup_similar_event(const struct net *net, |
11207 |
++ const struct mptcp_addr_event *event) |
11208 |
++{ |
11209 |
++ struct mptcp_addr_event *eventq; |
11210 |
++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net); |
11211 |
++ |
11212 |
++ list_for_each_entry(eventq, &fm_ns->events, list) { |
11213 |
++ if (eventq->family != event->family) |
11214 |
++ continue; |
11215 |
++ if (event->family == AF_INET) { |
11216 |
++ if (eventq->addr.in.s_addr == event->addr.in.s_addr) |
11217 |
++ return eventq; |
11218 |
++ } else { |
11219 |
++ if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6)) |
11220 |
++ return eventq; |
11221 |
++ } |
11222 |
++ } |
11223 |
++ return NULL; |
11224 |
++} |
11225 |
++ |
11226 |
++/* We already hold the net-namespace MPTCP-lock */ |
11227 |
++static void add_pm_event(struct net *net, const struct mptcp_addr_event *event) |
11228 |
++{ |
11229 |
++ struct mptcp_addr_event *eventq = lookup_similar_event(net, event); |
11230 |
++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net); |
11231 |
++ |
11232 |
++ if (eventq) { |
11233 |
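++ /* Coalesce with the pending event for the same address */ |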
++ switch (event->code) { |
11234 |
++ case MPTCP_EVENT_DEL: |
11235 |
++ mptcp_debug("%s del old_code %u\n", __func__, eventq->code); |
11236 |
++ list_del(&eventq->list); |
11237 |
++ kfree(eventq); |
11238 |
++ break; |
11239 |
++ case MPTCP_EVENT_ADD: |
11240 |
++ mptcp_debug("%s add old_code %u\n", __func__, eventq->code); |
11241 |
++ eventq->low_prio = event->low_prio; |
11242 |
++ eventq->code = MPTCP_EVENT_ADD; |
11243 |
++ return; |
11244 |
++ case MPTCP_EVENT_MOD: |
11245 |
++ mptcp_debug("%s mod old_code %u\n", __func__, eventq->code); |
11246 |
++ eventq->low_prio = event->low_prio; |
11247 |
++ eventq->code = MPTCP_EVENT_MOD; |
11248 |
++ return; |
11249 |
++ } |
11250 |
++ } |
11251 |
++ |
11252 |
++ /* OK, we have to add the new address to the wait queue */ |
11253 |
++ eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC); |
11254 |
++ if (!eventq) |
11255 |
++ return; |
11256 |
++ |
11257 |
++ list_add_tail(&eventq->list, &fm_ns->events); |
11258 |
++ |
11259 |
++ /* Create work-queue */ |
11260 |
++ if (!delayed_work_pending(&fm_ns->address_worker)) |
11261 |
++ queue_delayed_work(mptcp_wq, &fm_ns->address_worker, |
11262 |
++ msecs_to_jiffies(500)); |
11263 |
++} |
11264 |
++ |
11265 |
++static void addr4_event_handler(const struct in_ifaddr *ifa, unsigned long event, |
11266 |
++ struct net *net) |
11267 |
++{ |
11268 |
++ const struct net_device *netdev = ifa->ifa_dev->dev; |
11269 |
++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net); |
11270 |
++ struct mptcp_addr_event mpevent; |
11271 |
++ |
11272 |
++ if (ifa->ifa_scope > RT_SCOPE_LINK || |
11273 |
++ ipv4_is_loopback(ifa->ifa_local)) |
11274 |
++ return; |
11275 |
++ |
11276 |
++ spin_lock_bh(&fm_ns->local_lock); |
11277 |
++ |
11278 |
++ mpevent.family = AF_INET; |
11279 |
++ mpevent.addr.in.s_addr = ifa->ifa_local; |
11280 |
++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0; |
11281 |
++ |
11282 |
++ if (event == NETDEV_DOWN || !netif_running(netdev) || |
11283 |
++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP)) |
11284 |
++ mpevent.code = MPTCP_EVENT_DEL; |
11285 |
++ else if (event == NETDEV_UP) |
11286 |
++ mpevent.code = MPTCP_EVENT_ADD; |
11287 |
++ else if (event == NETDEV_CHANGE) |
11288 |
++ mpevent.code = MPTCP_EVENT_MOD; |
11289 |
++ |
11290 |
++ mptcp_debug("%s created event for %pI4, code %u prio %u\n", __func__, |
11291 |
++ &ifa->ifa_local, mpevent.code, mpevent.low_prio); |
11292 |
++ add_pm_event(net, &mpevent); |
11293 |
++ |
11294 |
++ spin_unlock_bh(&fm_ns->local_lock); |
11295 |
++ return; |
11296 |
++} |
11297 |
++ |
11298 |
++/* React on IPv4-addr add/rem-events */ |
11299 |
++static int mptcp_pm_inetaddr_event(struct notifier_block *this, |
11300 |
++ unsigned long event, void *ptr) |
11301 |
++{ |
11302 |
++ const struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; |
11303 |
++ struct net *net = dev_net(ifa->ifa_dev->dev); |
11304 |
++ |
11305 |
++ if (!(event == NETDEV_UP || event == NETDEV_DOWN || |
11306 |
++ event == NETDEV_CHANGE)) |
11307 |
++ return NOTIFY_DONE; |
11308 |
++ |
11309 |
++ addr4_event_handler(ifa, event, net); |
11310 |
++ |
11311 |
++ return NOTIFY_DONE; |
11312 |
++} |
11313 |
++ |
11314 |
++static struct notifier_block mptcp_pm_inetaddr_notifier = { |
11315 |
++ .notifier_call = mptcp_pm_inetaddr_event, |
11316 |
++}; |
11317 |
++ |
11318 |
++#if IS_ENABLED(CONFIG_IPV6) |
11319 |
++ |
11320 |
++/* IPV6-related address/interface watchers */ |
11321 |
++struct mptcp_dad_data { |
11322 |
++ struct timer_list timer; |
11323 |
++ struct inet6_ifaddr *ifa; |
11324 |
++}; |
11325 |
++ |
11326 |
++static void dad_callback(unsigned long arg); |
11327 |
++static int inet6_addr_event(struct notifier_block *this, |
11328 |
++ unsigned long event, void *ptr); |
11329 |
++ |
11330 |
++static int ipv6_is_in_dad_state(const struct inet6_ifaddr *ifa) |
11331 |
++{ |
11332 |
++ return (ifa->flags & IFA_F_TENTATIVE) && |
11333 |
++ ifa->state == INET6_IFADDR_STATE_DAD; |
11334 |
++} |
11335 |
++ |
11336 |
++static void dad_init_timer(struct mptcp_dad_data *data, |
11337 |
++ struct inet6_ifaddr *ifa) |
11338 |
++{ |
11339 |
++ data->ifa = ifa; |
11340 |
++ data->timer.data = (unsigned long)data; |
11341 |
++ data->timer.function = dad_callback; |
11342 |
++ if (ifa->idev->cnf.rtr_solicit_delay) |
11343 |
++ data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay; |
11344 |
++ else |
11345 |
++ data->timer.expires = jiffies + (HZ/10); |
11346 |
++} |
11347 |
++ |
11348 |
++static void dad_callback(unsigned long arg) |
11349 |
++{ |
11350 |
++ struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg; |
11351 |
++ |
11352 |
++ if (ipv6_is_in_dad_state(data->ifa)) { |
11353 |
++ dad_init_timer(data, data->ifa); |
11354 |
++ add_timer(&data->timer); |
11355 |
++ } else { |
11356 |
++ inet6_addr_event(NULL, NETDEV_UP, data->ifa); |
11357 |
++ in6_ifa_put(data->ifa); |
11358 |
++ kfree(data); |
11359 |
++ } |
11360 |
++} |
11361 |
++ |
11362 |
++static inline void dad_setup_timer(struct inet6_ifaddr *ifa) |
11363 |
++{ |
11364 |
++ struct mptcp_dad_data *data; |
11365 |
++ |
11366 |
++ data = kmalloc(sizeof(*data), GFP_ATOMIC); |
11367 |
++ |
11368 |
++ if (!data) |
11369 |
++ return; |
11370 |
++ |
11371 |
++ init_timer(&data->timer); |
11372 |
++ dad_init_timer(data, ifa); |
11373 |
++ add_timer(&data->timer); |
11374 |
++ in6_ifa_hold(ifa); |
11375 |
++} |
11376 |
++ |
11377 |
++static void addr6_event_handler(const struct inet6_ifaddr *ifa, unsigned long event, |
11378 |
++ struct net *net) |
11379 |
++{ |
11380 |
++ const struct net_device *netdev = ifa->idev->dev; |
11381 |
++ int addr_type = ipv6_addr_type(&ifa->addr); |
11382 |
++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net); |
11383 |
++ struct mptcp_addr_event mpevent; |
11384 |
++ |
11385 |
++ if (ifa->scope > RT_SCOPE_LINK || |
11386 |
++ addr_type == IPV6_ADDR_ANY || |
11387 |
++ (addr_type & IPV6_ADDR_LOOPBACK) || |
11388 |
++ (addr_type & IPV6_ADDR_LINKLOCAL)) |
11389 |
++ return; |
11390 |
++ |
11391 |
++ spin_lock_bh(&fm_ns->local_lock); |
11392 |
++ |
11393 |
++ mpevent.family = AF_INET6; |
11394 |
++ mpevent.addr.in6 = ifa->addr; |
11395 |
++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0; |
11396 |
++ |
11397 |
++ if (event == NETDEV_DOWN || !netif_running(netdev) || |
11398 |
++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP)) |
11399 |
++ mpevent.code = MPTCP_EVENT_DEL; |
11400 |
++ else if (event == NETDEV_UP) |
11401 |
++ mpevent.code = MPTCP_EVENT_ADD; |
11402 |
++ else if (event == NETDEV_CHANGE) |
11403 |
++ mpevent.code = MPTCP_EVENT_MOD; |
11404 |
++ |
11405 |
++ mptcp_debug("%s created event for %pI6, code %u prio %u\n", __func__, |
11406 |
++ &ifa->addr, mpevent.code, mpevent.low_prio); |
11407 |
++ add_pm_event(net, &mpevent); |
11408 |
++ |
11409 |
++ spin_unlock_bh(&fm_ns->local_lock); |
11410 |
++ return; |
11411 |
++} |
11412 |
++ |
11413 |
++/* React on IPv6-addr add/rem-events */ |
11414 |
++static int inet6_addr_event(struct notifier_block *this, unsigned long event, |
11415 |
++ void *ptr) |
11416 |
++{ |
11417 |
++ struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr; |
11418 |
++ struct net *net = dev_net(ifa6->idev->dev); |
11419 |
++ |
11420 |
++ if (!(event == NETDEV_UP || event == NETDEV_DOWN || |
11421 |
++ event == NETDEV_CHANGE)) |
11422 |
++ return NOTIFY_DONE; |
11423 |
++ |
11424 |
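++ /* Wait until duplicate address detection has finished */ |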
++ if (ipv6_is_in_dad_state(ifa6)) |
11425 |
++ dad_setup_timer(ifa6); |
11426 |
++ else |
11427 |
++ addr6_event_handler(ifa6, event, net); |
11428 |
++ |
11429 |
++ return NOTIFY_DONE; |
11430 |
++} |
11431 |
++ |
11432 |
++static struct notifier_block inet6_addr_notifier = { |
11433 |
++ .notifier_call = inet6_addr_event, |
11434 |
++}; |
11435 |
++ |
11436 |
++#endif |
11437 |
++ |
11438 |
++/* React on ifup/down-events */ |
11439 |
++static int netdev_event(struct notifier_block *this, unsigned long event, |
11440 |
++ void *ptr) |
11441 |
++{ |
11442 |
++ const struct net_device *dev = netdev_notifier_info_to_dev(ptr); |
11443 |
++ struct in_device *in_dev; |
11444 |
++#if IS_ENABLED(CONFIG_IPV6) |
11445 |
++ struct inet6_dev *in6_dev; |
11446 |
++#endif |
11447 |
++ |
11448 |
++ if (!(event == NETDEV_UP || event == NETDEV_DOWN || |
11449 |
++ event == NETDEV_CHANGE)) |
11450 |
++ return NOTIFY_DONE; |
11451 |
++ |
11452 |
++ rcu_read_lock(); |
11453 |
++ in_dev = __in_dev_get_rtnl(dev); |
11454 |
++ |
11455 |
++ if (in_dev) { |
11456 |
++ for_ifa(in_dev) { |
11457 |
++ mptcp_pm_inetaddr_event(NULL, event, ifa); |
11458 |
++ } endfor_ifa(in_dev); |
11459 |
++ } |
11460 |
++ |
11461 |
++#if IS_ENABLED(CONFIG_IPV6) |
11462 |
++ in6_dev = __in6_dev_get(dev); |
11463 |
++ |
11464 |
++ if (in6_dev) { |
11465 |
++ struct inet6_ifaddr *ifa6; |
11466 |
++ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) |
11467 |
++ inet6_addr_event(NULL, event, ifa6); |
11468 |
++ } |
11469 |
++#endif |
11470 |
++ |
11471 |
++ rcu_read_unlock(); |
11472 |
++ return NOTIFY_DONE; |
11473 |
++} |
11474 |
++ |
11475 |
++static struct notifier_block mptcp_pm_netdev_notifier = { |
11476 |
++ .notifier_call = netdev_event, |
11477 |
++}; |
11478 |
++ |
11479 |
++static void full_mesh_add_raddr(struct mptcp_cb *mpcb, |
11480 |
++ const union inet_addr *addr, |
11481 |
++ sa_family_t family, __be16 port, u8 id) |
11482 |
++{ |
11483 |
++ if (family == AF_INET) |
11484 |
++ mptcp_addv4_raddr(mpcb, &addr->in, port, id); |
11485 |
++ else |
11486 |
++ mptcp_addv6_raddr(mpcb, &addr->in6, port, id); |
11487 |
++} |
11488 |
++ |
11489 |
++static void full_mesh_new_session(const struct sock *meta_sk) |
11490 |
++{ |
11491 |
++ struct mptcp_loc_addr *mptcp_local; |
11492 |
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
11493 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
11494 |
++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); |
11495 |
++ int i, index; |
11496 |
++ union inet_addr saddr, daddr; |
11497 |
++ sa_family_t family; |
11498 |
++ bool meta_v4 = meta_sk->sk_family == AF_INET; |
11499 |
++ |
11500 |
++ /* Init local variables necessary for the rest */ |
11501 |
++ if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk)) { |
11502 |
++ saddr.ip = inet_sk(meta_sk)->inet_saddr; |
11503 |
++ daddr.ip = inet_sk(meta_sk)->inet_daddr; |
11504 |
++ family = AF_INET; |
11505 |
++#if IS_ENABLED(CONFIG_IPV6) |
11506 |
++ } else { |
11507 |
++ saddr.in6 = inet6_sk(meta_sk)->saddr; |
11508 |
++ daddr.in6 = meta_sk->sk_v6_daddr; |
11509 |
++ family = AF_INET6; |
11510 |
++#endif |
11511 |
++ } |
11512 |
++ |
11513 |
++ rcu_read_lock(); |
11514 |
++ mptcp_local = rcu_dereference(fm_ns->local); |
11515 |
++ |
11516 |
++ index = mptcp_find_address(mptcp_local, family, &saddr); |
11517 |
++ if (index < 0) |
11518 |
++ goto fallback; |
11519 |
++ |
11520 |
++ full_mesh_add_raddr(mpcb, &daddr, family, 0, 0); |
11521 |
++ mptcp_set_init_addr_bit(mpcb, &daddr, family, index); |
11522 |
++ |
11523 |
++ /* Initialize workqueue-struct */ |
11524 |
++ INIT_WORK(&fmp->subflow_work, create_subflow_worker); |
11525 |
++ INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker); |
11526 |
++ fmp->mpcb = mpcb; |
11527 |
++ |
11528 |
++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only) |
11529 |
++ goto skip_ipv4; |
11530 |
++ |
11531 |
++ /* Look for the address among the local addresses */ |
11532 |
++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { |
11533 |
++ __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr; |
11534 |
++ |
11535 |
++ /* We do not need to announce the initial subflow's address again */ |
11536 |
++ if (family == AF_INET && saddr.ip == ifa_address) |
11537 |
++ continue; |
11538 |
++ |
11539 |
++ fmp->add_addr++; |
11540 |
++ mpcb->addr_signal = 1; |
11541 |
++ } |
11542 |
++ |
11543 |
++skip_ipv4: |
11544 |
++#if IS_ENABLED(CONFIG_IPV6) |
11545 |
++ /* skip IPv6 addresses if meta-socket is IPv4 */ |
11546 |
++ if (meta_v4) |
11547 |
++ goto skip_ipv6; |
11548 |
++ |
11549 |
++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { |
11550 |
++ const struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr; |
11551 |
++ |
11552 |
++ /* We do not need to announce the initial subflow's address again */ |
11553 |
++ if (family == AF_INET6 && ipv6_addr_equal(&saddr.in6, ifa6)) |
11554 |
++ continue; |
11555 |
++ |
11556 |
++ fmp->add_addr++; |
11557 |
++ mpcb->addr_signal = 1; |
11558 |
++ } |
11559 |
++ |
11560 |
++skip_ipv6: |
11561 |
++#endif |
11562 |
++ |
11563 |
++ rcu_read_unlock(); |
11564 |
++ |
11565 |
++ if (family == AF_INET) |
11566 |
++ fmp->announced_addrs_v4 |= (1 << index); |
11567 |
++ else |
11568 |
++ fmp->announced_addrs_v6 |= (1 << index); |
11569 |
++ |
11570 |
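++ /* One ack per pending ADD_ADDR announcement; the loop also stops |
++ * early if add_addr gets cleared while the acks are sent |
++ */ |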
++ for (i = fmp->add_addr; i && fmp->add_addr; i--) |
11571 |
++ tcp_send_ack(mpcb->master_sk); |
11572 |
++ |
11573 |
++ return; |
11574 |
++ |
11575 |
++fallback: |
11576 |
++ rcu_read_unlock(); |
11577 |
++ mptcp_fallback_default(mpcb); |
11578 |
++ return; |
11579 |
++} |
11580 |
++ |
11581 |
++static void full_mesh_create_subflows(struct sock *meta_sk) |
11582 |
++{ |
11583 |
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
11584 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
11585 |
++ |
11586 |
++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv || |
11587 |
++ mpcb->send_infinite_mapping || |
11588 |
++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) |
11589 |
++ return; |
11590 |
++ |
11591 |
++ if (mpcb->master_sk && |
11592 |
++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) |
11593 |
++ return; |
11594 |
++ |
11595 |
++ if (!work_pending(&fmp->subflow_work)) { |
11596 |
++ sock_hold(meta_sk); |
11597 |
++ queue_work(mptcp_wq, &fmp->subflow_work); |
11598 |
++ } |
11599 |
++} |
11600 |
++ |
11601 |
++/* Called upon release_sock, if the socket was owned by the user during |
11602 |
++ * a path-management event. |
11603 |
++ */ |
11604 |
++static void full_mesh_release_sock(struct sock *meta_sk) |
11605 |
++{ |
11606 |
++ struct mptcp_loc_addr *mptcp_local; |
11607 |
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
11608 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
11609 |
++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); |
11610 |
++ struct sock *sk, *tmpsk; |
11611 |
++ bool meta_v4 = meta_sk->sk_family == AF_INET; |
11612 |
++ int i; |
11613 |
++ |
11614 |
++ rcu_read_lock(); |
11615 |
++ mptcp_local = rcu_dereference(fm_ns->local); |
11616 |
++ |
11617 |
++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only) |
11618 |
++ goto skip_ipv4; |
11619 |
++ |
11620 |
++ /* First, detect modifications or additions */ |
11621 |
++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { |
11622 |
++ struct in_addr ifa = mptcp_local->locaddr4[i].addr; |
11623 |
++ bool found = false; |
11624 |
++ |
11625 |
++ mptcp_for_each_sk(mpcb, sk) { |
11626 |
++ struct tcp_sock *tp = tcp_sk(sk); |
11627 |
++ |
11628 |
++ if (sk->sk_family == AF_INET6 && |
11629 |
++ !mptcp_v6_is_v4_mapped(sk)) |
11630 |
++ continue; |
11631 |
++ |
11632 |
++ if (inet_sk(sk)->inet_saddr != ifa.s_addr) |
11633 |
++ continue; |
11634 |
++ |
11635 |
++ found = true; |
11636 |
++ |
11637 |
++ if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) { |
11638 |
++ tp->mptcp->send_mp_prio = 1; |
11639 |
++ tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio; |
11640 |
++ |
11641 |
++ tcp_send_ack(sk); |
11642 |
++ } |
11643 |
++ } |
11644 |
++ |
11645 |
++ if (!found) { |
11646 |
++ fmp->add_addr++; |
11647 |
++ mpcb->addr_signal = 1; |
11648 |
++ |
11649 |
++ sk = mptcp_select_ack_sock(meta_sk); |
11650 |
++ if (sk) |
11651 |
++ tcp_send_ack(sk); |
11652 |
++ full_mesh_create_subflows(meta_sk); |
11653 |
++ } |
11654 |
++ } |
11655 |
++ |
11656 |
++skip_ipv4: |
11657 |
++#if IS_ENABLED(CONFIG_IPV6) |
11658 |
++ /* skip IPv6 addresses if meta-socket is IPv4 */ |
11659 |
++ if (meta_v4) |
11660 |
++ goto removal; |
11661 |
++ |
11662 |
++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { |
11663 |
++ struct in6_addr ifa = mptcp_local->locaddr6[i].addr; |
11664 |
++ bool found = false; |
11665 |
++ |
11666 |
++ mptcp_for_each_sk(mpcb, sk) { |
11667 |
++ struct tcp_sock *tp = tcp_sk(sk); |
11668 |
++ |
11669 |
++ if (sk->sk_family == AF_INET || |
11670 |
++ mptcp_v6_is_v4_mapped(sk)) |
11671 |
++ continue; |
11672 |
++ |
11673 |
++ if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa)) |
11674 |
++ continue; |
11675 |
++ |
11676 |
++ found = true; |
11677 |
++ |
11678 |
++ if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) { |
11679 |
++ tp->mptcp->send_mp_prio = 1; |
11680 |
++ tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio; |
11681 |
++ |
11682 |
++ tcp_send_ack(sk); |
11683 |
++ } |
11684 |
++ } |
11685 |
++ |
11686 |
++ if (!found) { |
11687 |
++ fmp->add_addr++; |
11688 |
++ mpcb->addr_signal = 1; |
11689 |
++ |
11690 |
++ sk = mptcp_select_ack_sock(meta_sk); |
11691 |
++ if (sk) |
11692 |
++ tcp_send_ack(sk); |
11693 |
++ full_mesh_create_subflows(meta_sk); |
11694 |
++ } |
11695 |
++ } |
11696 |
++ |
11697 |
++removal: |
11698 |
++#endif |
11699 |
++ |
11700 |
++ /* Now, detect address-removals */ |
11701 |
++ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) { |
11702 |
++ bool shall_remove = true; |
11703 |
++ |
11704 |
++ if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) { |
11705 |
++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { |
11706 |
++ if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) { |
11707 |
++ shall_remove = false; |
11708 |
++ break; |
11709 |
++ } |
11710 |
++ } |
11711 |
++ } else { |
11712 |
++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { |
11713 |
++ if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) { |
11714 |
++ shall_remove = false; |
11715 |
++ break; |
11716 |
++ } |
11717 |
++ } |
11718 |
++ } |
11719 |
++ |
11720 |
++ if (shall_remove) { |
11721 |
++ /* Reinject, so that pf = 1 and we |
11722 |
++ * won't select this one as the |
11723 |
++ * ack-sock. |
11724 |
++ */ |
11725 |
++ mptcp_reinject_data(sk, 0); |
11726 |
++ |
11727 |
++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, |
11728 |
++ meta_sk); |
11729 |
++ |
11730 |
++ mptcp_sub_force_close(sk); |
11731 |
++ } |
11732 |
++ } |
11733 |
++ |
11734 |
++ /* Just call it optimistically. It actually cannot do any harm */ |
11735 |
++ update_addr_bitfields(meta_sk, mptcp_local); |
11736 |
++ |
11737 |
++ rcu_read_unlock(); |
11738 |
++} |
11739 |
++ |
11740 |
++static int full_mesh_get_local_id(sa_family_t family, union inet_addr *addr, |
11741 |
++ struct net *net, bool *low_prio) |
11742 |
++{ |
11743 |
++ struct mptcp_loc_addr *mptcp_local; |
11744 |
++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net); |
11745 |
++ int index, id = -1; |
11746 |
++ |
11747 |
++ /* Handle the backup-flows */ |
11748 |
++ rcu_read_lock(); |
11749 |
++ mptcp_local = rcu_dereference(fm_ns->local); |
11750 |
++ |
11751 |
++ index = mptcp_find_address(mptcp_local, family, addr); |
11752 |
++ |
11753 |
++ if (index != -1) { |
11754 |
++ if (family == AF_INET) { |
11755 |
++ id = mptcp_local->locaddr4[index].loc4_id; |
11756 |
++ *low_prio = mptcp_local->locaddr4[index].low_prio; |
11757 |
++ } else { |
11758 |
++ id = mptcp_local->locaddr6[index].loc6_id; |
11759 |
++ *low_prio = mptcp_local->locaddr6[index].low_prio; |
11760 |
++ } |
11761 |
++ } |
11762 |
++ |
11763 |
++ |
11764 |
++ rcu_read_unlock(); |
11765 |
++ |
11766 |
++ return id; |
11767 |
++} |
11768 |
++ |
11769 |
++static void full_mesh_addr_signal(struct sock *sk, unsigned *size, |
11770 |
++ struct tcp_out_options *opts, |
11771 |
++ struct sk_buff *skb) |
11772 |
++{ |
11773 |
++ const struct tcp_sock *tp = tcp_sk(sk); |
11774 |
++ struct mptcp_cb *mpcb = tp->mpcb; |
11775 |
++ struct sock *meta_sk = mpcb->meta_sk; |
11776 |
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); |
11777 |
++ struct mptcp_loc_addr *mptcp_local; |
11778 |
++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk)); |
11779 |
++ int remove_addr_len; |
11780 |
++ u8 unannouncedv4 = 0, unannouncedv6 = 0; |
11781 |
++ bool meta_v4 = meta_sk->sk_family == AF_INET; |
11782 |
++ |
11783 |
++ mpcb->addr_signal = 0; |
11784 |
++ |
11785 |
++ if (likely(!fmp->add_addr)) |
11786 |
++ goto remove_addr; |
11787 |
++ |
11788 |
++ rcu_read_lock(); |
11789 |
++ mptcp_local = rcu_dereference(fm_ns->local); |
11790 |
++ |
11791 |
++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only) |
11792 |
++ goto skip_ipv4; |
11793 |
++ |
11794 |
++ /* IPv4 */ |
11795 |
++ unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits; |
11796 |
++ if (unannouncedv4 && |
11797 |
++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) { |
11798 |
++ int ind = mptcp_find_free_index(~unannouncedv4); |
11799 |
++ |
11800 |
++ opts->options |= OPTION_MPTCP; |
11801 |
++ opts->mptcp_options |= OPTION_ADD_ADDR; |
11802 |
++ opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id; |
11803 |
++ opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr; |
11804 |
++ opts->add_addr_v4 = 1; |
11805 |
++ |
11806 |
++ if (skb) { |
11807 |
++ fmp->announced_addrs_v4 |= (1 << ind); |
11808 |
++ fmp->add_addr--; |
11809 |
++ } |
11810 |
++ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN; |
11811 |
++ } |
11812 |
++ |
11813 |
++ if (meta_v4) |
11814 |
++ goto skip_ipv6; |
11815 |
++ |
11816 |
++skip_ipv4: |
11817 |
++ /* IPv6 */ |
11818 |
++ unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits; |
11819 |
++ if (unannouncedv6 && |
11820 |
++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) { |
11821 |
++ int ind = mptcp_find_free_index(~unannouncedv6); |
11822 |
++ |
11823 |
++ opts->options |= OPTION_MPTCP; |
11824 |
++ opts->mptcp_options |= OPTION_ADD_ADDR; |
11825 |
++ opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id; |
11826 |
++ opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr; |
11827 |
++ opts->add_addr_v6 = 1; |
11828 |
++ |
11829 |
++ if (skb) { |
11830 |
++ fmp->announced_addrs_v6 |= (1 << ind); |
11831 |
++ fmp->add_addr--; |
11832 |
++ } |
11833 |
++ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN; |
11834 |
++ } |
11835 |
++ |
11836 |
++skip_ipv6: |
11837 |
++ rcu_read_unlock(); |
11838 |
++ |
11839 |
++ if (!unannouncedv4 && !unannouncedv6 && skb) |
11840 |
++ fmp->add_addr--; |
11841 |
++ |
11842 |
++remove_addr: |
11843 |
++ if (likely(!fmp->remove_addrs)) |
11844 |
++ goto exit; |
11845 |
++ |
11846 |
++ remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs); |
11847 |
++ if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len) |
11848 |
++ goto exit; |
11849 |
++ |
11850 |
++ opts->options |= OPTION_MPTCP; |
11851 |
++ opts->mptcp_options |= OPTION_REMOVE_ADDR; |
11852 |
++ opts->remove_addrs = fmp->remove_addrs; |
11853 |
++ *size += remove_addr_len; |
11854 |
++ if (skb) |
11855 |
++ fmp->remove_addrs = 0; |
11856 |
++ |
11857 |
++exit: |
11858 |
++ mpcb->addr_signal = !!(fmp->add_addr || fmp->remove_addrs); |
11859 |
++} |
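full_mesh_addr_signal() only emits an ADD_ADDR or REMOVE_ADDR option when it fits into the remaining TCP option space; anything left over keeps mpcb->addr_signal set and goes out on a later segment. A minimal user-space sketch of that budget check follows. MAX_TCP_OPTION_SPACE is 40 bytes as in net/tcp.h, but the two *_ALIGN sizes below are illustrative stand-ins, not the real MPTCP_SUB_LEN_ADD_ADDR{4,6}_ALIGN values.

#include <stdbool.h>
#include <stdio.h>

#define MAX_TCP_OPTION_SPACE 40	/* as in net/tcp.h */

/* Illustrative aligned option sizes (assumed, not the kernel's). */
#define ADD_ADDR4_ALIGN 8
#define ADD_ADDR6_ALIGN 20

/* Same shape as the checks above: announce only what still fits. */
static bool fits(unsigned int used, unsigned int opt_len)
{
	return MAX_TCP_OPTION_SPACE - used >= opt_len;
}

int main(void)
{
	unsigned int used = 24;	/* bytes already taken by other options */

	if (fits(used, ADD_ADDR4_ALIGN))
		used += ADD_ADDR4_ALIGN;	/* 24 + 8 = 32 */
	printf("used %u of %d option bytes; v6 ADD_ADDR fits: %d\n",
	       used, MAX_TCP_OPTION_SPACE, fits(used, ADD_ADDR6_ALIGN));
	return 0;
}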
11860 |
++ |
11861 |
++static void full_mesh_rem_raddr(struct mptcp_cb *mpcb, u8 rem_id) |
11862 |
++{ |
11863 |
++ mptcp_v4_rem_raddress(mpcb, rem_id); |
11864 |
++ mptcp_v6_rem_raddress(mpcb, rem_id); |
11865 |
++} |
11866 |
++ |
11867 |
++/* Output /proc/net/mptcp_fullmesh */ |
11868 |
++static int mptcp_fm_seq_show(struct seq_file *seq, void *v) |
11869 |
++{ |
11870 |
++ const struct net *net = seq->private; |
11871 |
++ struct mptcp_loc_addr *mptcp_local; |
11872 |
++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net); |
11873 |
++ int i; |
11874 |
++ |
11875 |
++ seq_printf(seq, "Index, Address-ID, Backup, IP-address\n"); |
11876 |
++ |
11877 |
++ rcu_read_lock_bh(); |
11878 |
++ mptcp_local = rcu_dereference(fm_ns->local); |
11879 |
++ |
11880 |
++ seq_printf(seq, "IPv4, next v4-index: %u\n", mptcp_local->next_v4_index); |
11881 |
++ |
11882 |
++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { |
11883 |
++ struct mptcp_loc4 *loc4 = &mptcp_local->locaddr4[i]; |
11884 |
++ |
11885 |
++ seq_printf(seq, "%u, %u, %u, %pI4\n", i, loc4->loc4_id, |
11886 |
++ loc4->low_prio, &loc4->addr); |
11887 |
++ } |
11888 |
++ |
11889 |
++ seq_printf(seq, "IPv6, next v6-index: %u\n", mptcp_local->next_v6_index); |
11890 |
++ |
11891 |
++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { |
11892 |
++ struct mptcp_loc6 *loc6 = &mptcp_local->locaddr6[i]; |
11893 |
++ |
11894 |
++ seq_printf(seq, "%u, %u, %u, %pI6\n", i, loc6->loc6_id, |
11895 |
++ loc6->low_prio, &loc6->addr); |
11896 |
++ } |
11897 |
++ rcu_read_unlock_bh(); |
11898 |
++ |
11899 |
++ return 0; |
11900 |
++} |
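For reference, the seq_printf() calls above produce /proc/net/mptcp_fullmesh output of roughly this shape (indices, address IDs, backup flags and addresses below are made up):

Index, Address-ID, Backup, IP-address
IPv4, next v4-index: 3
0, 1, 0, 10.0.0.1
1, 2, 1, 192.168.1.2
IPv6, next v6-index: 2
0, 3, 0, 2001:db8::1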
11901 |
++ |
11902 |
++static int mptcp_fm_seq_open(struct inode *inode, struct file *file) |
11903 |
++{ |
11904 |
++ return single_open_net(inode, file, mptcp_fm_seq_show); |
11905 |
++} |
11906 |
++ |
11907 |
++static const struct file_operations mptcp_fm_seq_fops = { |
11908 |
++ .owner = THIS_MODULE, |
11909 |
++ .open = mptcp_fm_seq_open, |
11910 |
++ .read = seq_read, |
11911 |
++ .llseek = seq_lseek, |
11912 |
++ .release = single_release_net, |
11913 |
++}; |
11914 |
++ |
11915 |
++static int mptcp_fm_init_net(struct net *net) |
11916 |
++{ |
11917 |
++ struct mptcp_loc_addr *mptcp_local; |
11918 |
++ struct mptcp_fm_ns *fm_ns; |
11919 |
++ int err = 0; |
11920 |
++ |
11921 |
++ fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL); |
11922 |
++ if (!fm_ns) |
11923 |
++ return -ENOBUFS; |
11924 |
++ |
11925 |
++ mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL); |
11926 |
++ if (!mptcp_local) { |
11927 |
++ err = -ENOBUFS; |
11928 |
++ goto err_mptcp_local; |
11929 |
++ } |
11930 |
++ |
11931 |
++ if (!proc_create("mptcp_fullmesh", S_IRUGO, net->proc_net, |
11932 |
++ &mptcp_fm_seq_fops)) { |
11933 |
++ err = -ENOMEM; |
11934 |
++ goto err_seq_fops; |
11935 |
++ } |
11936 |
++ |
11937 |
++ mptcp_local->next_v4_index = 1; |
11938 |
++ |
11939 |
++ rcu_assign_pointer(fm_ns->local, mptcp_local); |
11940 |
++ INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker); |
11941 |
++ INIT_LIST_HEAD(&fm_ns->events); |
11942 |
++ spin_lock_init(&fm_ns->local_lock); |
11943 |
++ fm_ns->net = net; |
11944 |
++ net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns; |
11945 |
++ |
11946 |
++ return 0; |
11947 |
++err_seq_fops: |
11948 |
++ kfree(mptcp_local); |
11949 |
++err_mptcp_local: |
11950 |
++ kfree(fm_ns); |
11951 |
++ return err; |
11952 |
++} |
11953 |
++ |
11954 |
++static void mptcp_fm_exit_net(struct net *net) |
11955 |
++{ |
11956 |
++ struct mptcp_addr_event *eventq, *tmp; |
11957 |
++ struct mptcp_fm_ns *fm_ns; |
11958 |
++ struct mptcp_loc_addr *mptcp_local; |
11959 |
++ |
11960 |
++ fm_ns = fm_get_ns(net); |
11961 |
++ cancel_delayed_work_sync(&fm_ns->address_worker); |
11962 |
++ |
11963 |
++ rcu_read_lock_bh(); |
11964 |
++ |
11965 |
++ mptcp_local = rcu_dereference_bh(fm_ns->local); |
11966 |
++ kfree(mptcp_local); |
11967 |
++ |
11968 |
++ spin_lock(&fm_ns->local_lock); |
11969 |
++ list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) { |
11970 |
++ list_del(&eventq->list); |
11971 |
++ kfree(eventq); |
11972 |
++ } |
11973 |
++ spin_unlock(&fm_ns->local_lock); |
11974 |
++ |
11975 |
++ rcu_read_unlock_bh(); |
11976 |
++ |
11977 |
++ remove_proc_entry("mptcp_fullmesh", net->proc_net); |
11978 |
++ |
11979 |
++ kfree(fm_ns); |
11980 |
++} |
11981 |
++ |
11982 |
++static struct pernet_operations full_mesh_net_ops = { |
11983 |
++ .init = mptcp_fm_init_net, |
11984 |
++ .exit = mptcp_fm_exit_net, |
11985 |
++}; |
11986 |
++ |
11987 |
++static struct mptcp_pm_ops full_mesh __read_mostly = { |
11988 |
++ .new_session = full_mesh_new_session, |
11989 |
++ .release_sock = full_mesh_release_sock, |
11990 |
++ .fully_established = full_mesh_create_subflows, |
11991 |
++ .new_remote_address = full_mesh_create_subflows, |
11992 |
++ .get_local_id = full_mesh_get_local_id, |
11993 |
++ .addr_signal = full_mesh_addr_signal, |
11994 |
++ .add_raddr = full_mesh_add_raddr, |
11995 |
++ .rem_raddr = full_mesh_rem_raddr, |
11996 |
++ .name = "fullmesh", |
11997 |
++ .owner = THIS_MODULE, |
11998 |
++}; |
11999 |
++ |
12000 |
++/* General initialization of MPTCP_PM */ |
12001 |
++static int __init full_mesh_register(void) |
12002 |
++{ |
12003 |
++ int ret; |
12004 |
++ |
12005 |
++ BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE); |
12006 |
++ |
12007 |
++ ret = register_pernet_subsys(&full_mesh_net_ops); |
12008 |
++ if (ret) |
12009 |
++ goto out; |
12010 |
++ |
12011 |
++ ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); |
12012 |
++ if (ret) |
12013 |
++ goto err_reg_inetaddr; |
12014 |
++ ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier); |
12015 |
++ if (ret) |
12016 |
++ goto err_reg_netdev; |
12017 |
++ |
12018 |
++#if IS_ENABLED(CONFIG_IPV6) |
12019 |
++ ret = register_inet6addr_notifier(&inet6_addr_notifier); |
12020 |
++ if (ret) |
12021 |
++ goto err_reg_inet6addr; |
12022 |
++#endif |
12023 |
++ |
12024 |
++ ret = mptcp_register_path_manager(&full_mesh); |
12025 |
++ if (ret) |
12026 |
++ goto err_reg_pm; |
12027 |
++ |
12028 |
++out: |
12029 |
++ return ret; |
12030 |
++ |
12031 |
++ |
12032 |
++err_reg_pm: |
12033 |
++#if IS_ENABLED(CONFIG_IPV6) |
12034 |
++ unregister_inet6addr_notifier(&inet6_addr_notifier); |
12035 |
++err_reg_inet6addr: |
12036 |
++#endif |
12037 |
++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier); |
12038 |
++err_reg_netdev: |
12039 |
++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); |
12040 |
++err_reg_inetaddr: |
12041 |
++ unregister_pernet_subsys(&full_mesh_net_ops); |
12042 |
++ goto out; |
12043 |
++} |
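The error path above unwinds the completed registrations in reverse order and then reuses the single out label, the usual kernel goto-cleanup idiom. A stripped-down sketch of the same shape, with hypothetical register_a()/register_b() steps:

#include <stdio.h>

static int register_a(void) { return 0; }
static int register_b(void) { return -1; }	/* pretend step b fails */
static void unregister_a(void) { puts("undo a"); }

static int my_init(void)
{
	int ret = register_a();
	if (ret)
		goto out;

	ret = register_b();
	if (ret)
		goto err_b;
out:
	return ret;

err_b:
	unregister_a();	/* undo in reverse order of registration */
	goto out;
}

int main(void)
{
	return my_init() ? 1 : 0;
}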
12044 |
++ |
12045 |
++static void full_mesh_unregister(void) |
12046 |
++{ |
12047 |
++#if IS_ENABLED(CONFIG_IPV6) |
12048 |
++ unregister_inet6addr_notifier(&inet6_addr_notifier); |
12049 |
++#endif |
12050 |
++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier); |
12051 |
++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); |
12052 |
++ unregister_pernet_subsys(&full_mesh_net_ops); |
12053 |
++ mptcp_unregister_path_manager(&full_mesh); |
12054 |
++} |
12055 |
++ |
12056 |
++module_init(full_mesh_register); |
12057 |
++module_exit(full_mesh_unregister); |
12058 |
++ |
12059 |
++MODULE_AUTHOR("Christoph Paasch"); |
12060 |
++MODULE_LICENSE("GPL"); |
12061 |
++MODULE_DESCRIPTION("Full-Mesh MPTCP"); |
12062 |
++MODULE_VERSION("0.88"); |
12063 |
+diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c |
12064 |
+new file mode 100644 |
12065 |
+index 000000000000..43704ccb639e |
12066 |
+--- /dev/null |
12067 |
++++ b/net/mptcp/mptcp_input.c |
12068 |
+@@ -0,0 +1,2405 @@ |
12069 |
++/* |
12070 |
++ * MPTCP implementation - Receiving side |
12071 |
++ * |
12072 |
++ * Initial Design & Implementation: |
12073 |
++ * Sébastien Barré <sebastien.barre@×××××××××.be> |
12074 |
++ * |
12075 |
++ * Current Maintainer & Author: |
12076 |
++ * Christoph Paasch <christoph.paasch@×××××××××.be> |
12077 |
++ * |
12078 |
++ * Additional authors: |
12079 |
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi> |
12080 |
++ * Gregory Detal <gregory.detal@×××××××××.be> |
12081 |
++ * Fabien Duchêne <fabien.duchene@×××××××××.be> |
12082 |
++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de> |
12083 |
++ * Lavkesh Lahngir <lavkesh51@×××××.com> |
12084 |
++ * Andreas Ripke <ripke@××××××.eu> |
12085 |
++ * Vlad Dogaru <vlad.dogaru@×××××.com> |
12086 |
++ * Octavian Purdila <octavian.purdila@×××××.com> |
12087 |
++ * John Ronan <jronan@××××.org> |
12088 |
++ * Catalin Nicutar <catalin.nicutar@×××××.com> |
12089 |
++ * Brandon Heller <brandonh@××××××××.edu> |
12090 |
++ * |
12091 |
++ * |
12092 |
++ * This program is free software; you can redistribute it and/or |
12093 |
++ * modify it under the terms of the GNU General Public License |
12094 |
++ * as published by the Free Software Foundation; either version |
12095 |
++ * 2 of the License, or (at your option) any later version. |
12096 |
++ */ |
12097 |
++ |
12098 |
++#include <asm/unaligned.h> |
12099 |
++ |
12100 |
++#include <net/mptcp.h> |
12101 |
++#include <net/mptcp_v4.h> |
12102 |
++#include <net/mptcp_v6.h> |
12103 |
++ |
12104 |
++#include <linux/kconfig.h> |
12105 |
++ |
12106 |
++/* is seq1 < seq2 ? */ |
12107 |
++static inline bool before64(const u64 seq1, const u64 seq2) |
12108 |
++{ |
12109 |
++ return (s64)(seq1 - seq2) < 0; |
12110 |
++} |
12111 |
++ |
12112 |
++/* is seq1 > seq2 ? */ |
12113 |
++#define after64(seq1, seq2) before64(seq2, seq1) |
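before64()/after64() are the 64-bit analogues of TCP's before()/after(): unsigned modular subtraction reinterpreted as signed stays correct across a sequence-number wrap. A small self-contained illustration:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Same wrap-safe comparison as before64() above. */
static bool before64(uint64_t seq1, uint64_t seq2)
{
	return (int64_t)(seq1 - seq2) < 0;
}

int main(void)
{
	assert(before64(1, 2));			/* ordinary case */
	assert(before64(UINT64_MAX, 1));	/* across the wrap */
	assert(!before64(2, UINT64_MAX));	/* 2 is "after" near-wrap */
	return 0;
}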
12114 |
++ |
12115 |
++static inline void mptcp_become_fully_estab(struct sock *sk) |
12116 |
++{ |
12117 |
++ tcp_sk(sk)->mptcp->fully_established = 1; |
12118 |
++ |
12119 |
++ if (is_master_tp(tcp_sk(sk)) && |
12120 |
++ tcp_sk(sk)->mpcb->pm_ops->fully_established) |
12121 |
++ tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk)); |
12122 |
++} |
12123 |
++ |
12124 |
++/* Similar to tcp_tso_acked without any memory accounting */ |
12125 |
++static inline int mptcp_tso_acked_reinject(const struct sock *meta_sk, |
12126 |
++ struct sk_buff *skb) |
12127 |
++{ |
12128 |
++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
12129 |
++ u32 packets_acked, len; |
12130 |
++ |
12131 |
++ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)); |
12132 |
++ |
12133 |
++ packets_acked = tcp_skb_pcount(skb); |
12134 |
++ |
12135 |
++ if (skb_unclone(skb, GFP_ATOMIC)) |
12136 |
++ return 0; |
12137 |
++ |
12138 |
++ len = meta_tp->snd_una - TCP_SKB_CB(skb)->seq; |
12139 |
++ __pskb_trim_head(skb, len); |
12140 |
++ |
12141 |
++ TCP_SKB_CB(skb)->seq += len; |
12142 |
++ skb->ip_summed = CHECKSUM_PARTIAL; |
12143 |
++ skb->truesize -= len; |
12144 |
++ |
12145 |
++ /* Any change of skb->len requires recalculation of tso factor. */ |
12146 |
++ if (tcp_skb_pcount(skb) > 1) |
12147 |
++ tcp_set_skb_tso_segs(meta_sk, skb, tcp_skb_mss(skb)); |
12148 |
++ packets_acked -= tcp_skb_pcount(skb); |
12149 |
++ |
12150 |
++ if (packets_acked) { |
12151 |
++ BUG_ON(tcp_skb_pcount(skb) == 0); |
12152 |
++ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); |
12153 |
++ } |
12154 |
++ |
12155 |
++ return packets_acked; |
12156 |
++} |
12157 |
++ |
12158 |
++/** |
12159 |
++ * Cleans the meta-socket retransmission queue and the reinject-queue. |
12160 |
++ * @meta_sk must be the meta-socket. |
12161 |
++ */ |
12162 |
++static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una) |
12163 |
++{ |
12164 |
++ struct sk_buff *skb, *tmp; |
12165 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
12166 |
++ struct mptcp_cb *mpcb = meta_tp->mpcb; |
12167 |
++ bool acked = false; |
12168 |
++ u32 acked_pcount; |
12169 |
++ |
12170 |
++ while ((skb = tcp_write_queue_head(meta_sk)) && |
12171 |
++ skb != tcp_send_head(meta_sk)) { |
12172 |
++ bool fully_acked = true; |
12173 |
++ |
12174 |
++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { |
12175 |
++ if (tcp_skb_pcount(skb) == 1 || |
12176 |
++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) |
12177 |
++ break; |
12178 |
++ |
12179 |
++ acked_pcount = tcp_tso_acked(meta_sk, skb); |
12180 |
++ if (!acked_pcount) |
12181 |
++ break; |
12182 |
++ |
12183 |
++ fully_acked = false; |
12184 |
++ } else { |
12185 |
++ acked_pcount = tcp_skb_pcount(skb); |
12186 |
++ } |
12187 |
++ |
12188 |
++ acked = true; |
12189 |
++ meta_tp->packets_out -= acked_pcount; |
12190 |
++ meta_tp->retrans_stamp = 0; |
12191 |
++ |
12192 |
++ if (!fully_acked) |
12193 |
++ break; |
12194 |
++ |
12195 |
++ tcp_unlink_write_queue(skb, meta_sk); |
12196 |
++ |
12197 |
++ if (mptcp_is_data_fin(skb)) { |
12198 |
++ struct sock *sk_it; |
12199 |
++ |
12200 |
++ /* DATA_FIN has been acknowledged - now we can close |
12201 |
++ * the subflows |
12202 |
++ */ |
12203 |
++ mptcp_for_each_sk(mpcb, sk_it) { |
12204 |
++ unsigned long delay = 0; |
12205 |
++ |
12206 |
++ /* If we are the passive closer, don't trigger |
12207 |
++ * subflow-fin until the subflow has been finned |
12208 |
++ * by the peer - thus we add a delay. |
12209 |
++ */ |
12210 |
++ if (mpcb->passive_close && |
12211 |
++ sk_it->sk_state == TCP_ESTABLISHED) |
12212 |
++ delay = inet_csk(sk_it)->icsk_rto << 3; |
12213 |
++ |
12214 |
++ mptcp_sub_close(sk_it, delay); |
12215 |
++ } |
12216 |
++ } |
12217 |
++ sk_wmem_free_skb(meta_sk, skb); |
12218 |
++ } |
12219 |
++ /* Remove acknowledged data from the reinject queue */ |
12220 |
++ skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) { |
12221 |
++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { |
12222 |
++ if (tcp_skb_pcount(skb) == 1 || |
12223 |
++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) |
12224 |
++ break; |
12225 |
++ |
12226 |
++ mptcp_tso_acked_reinject(meta_sk, skb); |
12227 |
++ break; |
12228 |
++ } |
12229 |
++ |
12230 |
++ __skb_unlink(skb, &mpcb->reinject_queue); |
12231 |
++ __kfree_skb(skb); |
12232 |
++ } |
12233 |
++ |
12234 |
++ if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una))) |
12235 |
++ meta_tp->snd_up = meta_tp->snd_una; |
12236 |
++ |
12237 |
++ if (acked) { |
12238 |
++ tcp_rearm_rto(meta_sk); |
12239 |
++ /* Normally this is done in tcp_try_undo_loss - but MPTCP |
12240 |
++ * does not call this function. |
12241 |
++ */ |
12242 |
++ inet_csk(meta_sk)->icsk_retransmits = 0; |
12243 |
++ } |
12244 |
++} |
12245 |
++ |
12246 |
++/* Inspired by tcp_rcv_state_process */ |
12247 |
++static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk, |
12248 |
++ const struct sk_buff *skb, u32 data_seq, |
12249 |
++ u16 data_len) |
12250 |
++{ |
12251 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); |
12252 |
++ const struct tcphdr *th = tcp_hdr(skb); |
12253 |
++ |
12254 |
++ /* State-machine handling if FIN has been enqueued and it has |
12255 |
++ * been acked (snd_una == write_seq) - it's important that this |
12256 |
++ * runs after sk_wmem_free_skb because otherwise |
12257 |
++ * sk_forward_alloc is wrong upon inet_csk_destroy_sock() |
12258 |
++ */ |
12259 |
++ switch (meta_sk->sk_state) { |
12260 |
++ case TCP_FIN_WAIT1: { |
12261 |
++ struct dst_entry *dst; |
12262 |
++ int tmo; |
12263 |
++ |
12264 |
++ if (meta_tp->snd_una != meta_tp->write_seq) |
12265 |
++ break; |
12266 |
++ |
12267 |
++ tcp_set_state(meta_sk, TCP_FIN_WAIT2); |
12268 |
++ meta_sk->sk_shutdown |= SEND_SHUTDOWN; |
12269 |
++ |
12270 |
++ dst = __sk_dst_get(sk); |
12271 |
++ if (dst) |
12272 |
++ dst_confirm(dst); |
12273 |
++ |
12274 |
++ if (!sock_flag(meta_sk, SOCK_DEAD)) { |
12275 |
++ /* Wake up lingering close() */ |
12276 |
++ meta_sk->sk_state_change(meta_sk); |
12277 |
++ break; |
12278 |
++ } |
12279 |
++ |
12280 |
++ if (meta_tp->linger2 < 0 || |
12281 |
++ (data_len && |
12282 |
++ after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0), |
12283 |
++ meta_tp->rcv_nxt))) { |
12284 |
++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC); |
12285 |
++ tcp_done(meta_sk); |
12286 |
++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); |
12287 |
++ return 1; |
12288 |
++ } |
12289 |
++ |
12290 |
++ tmo = tcp_fin_time(meta_sk); |
12291 |
++ if (tmo > TCP_TIMEWAIT_LEN) { |
12292 |
++ inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN); |
12293 |
++ } else if (mptcp_is_data_fin2(skb, tp) || sock_owned_by_user(meta_sk)) { |
12294 |
++ /* Bad case. We could lose such FIN otherwise. |
12295 |
++ * It is not a big problem, but it looks confusing |
12296 |
++ * and not so rare event. We still can lose it now, |
12297 |
++ * if it spins in bh_lock_sock(), but it is really |
12298 |
++ * marginal case. |
12299 |
++ */ |
12300 |
++ inet_csk_reset_keepalive_timer(meta_sk, tmo); |
12301 |
++ } else { |
12302 |
++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, tmo); |
12303 |
++ } |
12304 |
++ break; |
12305 |
++ } |
12306 |
++ case TCP_CLOSING: |
12307 |
++ case TCP_LAST_ACK: |
12308 |
++ if (meta_tp->snd_una == meta_tp->write_seq) { |
12309 |
++ tcp_done(meta_sk); |
12310 |
++ return 1; |
12311 |
++ } |
12312 |
++ break; |
12313 |
++ } |
12314 |
++ |
12315 |
++ /* step 7: process the segment text */ |
12316 |
++ switch (meta_sk->sk_state) { |
12317 |
++ case TCP_FIN_WAIT1: |
12318 |
++ case TCP_FIN_WAIT2: |
12319 |
++ /* RFC 793 says to queue data in these states, |
12320 |
++ * RFC 1122 says we MUST send a reset. |
12321 |
++ * BSD 4.4 also does reset. |
12322 |
++ */ |
12323 |
++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) { |
12324 |
++ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && |
12325 |
++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) && |
12326 |
++ !mptcp_is_data_fin2(skb, tp)) { |
12327 |
++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); |
12328 |
++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC); |
12329 |
++ tcp_reset(meta_sk); |
12330 |
++ return 1; |
12331 |
++ } |
12332 |
++ } |
12333 |
++ break; |
12334 |
++ } |
12335 |
++ |
12336 |
++ return 0; |
12337 |
++} |
12338 |
++ |
12339 |
++/** |
12340 |
++ * @return: |
12341 |
++ * i) 1: Everything's fine. |
12342 |
++ * ii) -1: A reset has been sent on the subflow - csum-failure |
12343 |
++ * iii) 0: csum-failure but no reset sent, because it's the last subflow. |
12344 |
++ * Last packet should not be destroyed by the caller because it has |
12345 |
++ * been done here. |
12346 |
++ */ |
12347 |
++static int mptcp_verif_dss_csum(struct sock *sk) |
12348 |
++{ |
12349 |
++ struct tcp_sock *tp = tcp_sk(sk); |
12350 |
++ struct sk_buff *tmp, *tmp1, *last = NULL; |
12351 |
++ __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */ |
12352 |
++ int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0; |
12353 |
++ int iter = 0; |
12354 |
++ |
12355 |
++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) { |
12356 |
++ unsigned int csum_len; |
12357 |
++ |
12358 |
++ if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq)) |
12359 |
++ /* Mapping ends in the middle of the packet - |
12360 |
++ * csum only these bytes |
12361 |
++ */ |
12362 |
++ csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq; |
12363 |
++ else |
12364 |
++ csum_len = tmp->len; |
12365 |
++ |
12366 |
++ offset = 0; |
12367 |
++ if (overflowed) { |
12368 |
++ char first_word[4]; |
12369 |
++ first_word[0] = 0; |
12370 |
++ first_word[1] = 0; |
12371 |
++ first_word[2] = 0; |
12372 |
++ first_word[3] = *(tmp->data); |
12373 |
++ csum_tcp = csum_partial(first_word, 4, csum_tcp); |
12374 |
++ offset = 1; |
12375 |
++ csum_len--; |
12376 |
++ overflowed = 0; |
12377 |
++ } |
12378 |
++ |
12379 |
++ csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp); |
12380 |
++ |
12381 |
++ /* Did it end on an odd length? Then we have to merge the next byte |
12382 |
++ * correctly (see above) |
12383 |
++ */ |
12384 |
++ if (csum_len != (csum_len & (~1))) |
12385 |
++ overflowed = 1; |
12386 |
++ |
12387 |
++ if (mptcp_is_data_seq(tmp) && !dss_csum_added) { |
12388 |
++ __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32)); |
12389 |
++ |
12390 |
++ /* If a 64-bit dss is present, we increase the offset |
12391 |
++ * by 4 bytes, as the high-order 64-bits will be added |
12392 |
++ * in the final csum_partial-call. |
12393 |
++ */ |
12394 |
++ u32 offset = skb_transport_offset(tmp) + |
12395 |
++ TCP_SKB_CB(tmp)->dss_off; |
12396 |
++ if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET) |
12397 |
++ offset += 4; |
12398 |
++ |
12399 |
++ csum_tcp = skb_checksum(tmp, offset, |
12400 |
++ MPTCP_SUB_LEN_SEQ_CSUM, |
12401 |
++ csum_tcp); |
12402 |
++ |
12403 |
++ csum_tcp = csum_partial(&data_seq, |
12404 |
++ sizeof(data_seq), csum_tcp); |
12405 |
++ |
12406 |
++ dss_csum_added = 1; /* Just do it once */ |
12407 |
++ } |
12408 |
++ last = tmp; |
12409 |
++ iter++; |
12410 |
++ |
12411 |
++ if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) && |
12412 |
++ !before(TCP_SKB_CB(tmp1)->seq, |
12413 |
++ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) |
12414 |
++ break; |
12415 |
++ } |
12416 |
++ |
12417 |
++ /* Now, checksum must be 0 */ |
12418 |
++ if (unlikely(csum_fold(csum_tcp))) { |
12419 |
++ pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n", |
12420 |
++ __func__, csum_fold(csum_tcp), TCP_SKB_CB(last)->seq, |
12421 |
++ dss_csum_added, overflowed, iter); |
12422 |
++ |
12423 |
++ tp->mptcp->send_mp_fail = 1; |
12424 |
++ |
12425 |
++ /* map_data_seq is the data-seq number of the |
12426 |
++ * mapping we are currently checking |
12427 |
++ */ |
12428 |
++ tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq; |
12429 |
++ |
12430 |
++ if (tp->mpcb->cnt_subflows > 1) { |
12431 |
++ mptcp_send_reset(sk); |
12432 |
++ ans = -1; |
12433 |
++ } else { |
12434 |
++ tp->mpcb->send_infinite_mapping = 1; |
12435 |
++ |
12436 |
++ /* Need to purge the rcv-queue as it's no longer valid */ |
12437 |
++ while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { |
12438 |
++ tp->copied_seq = TCP_SKB_CB(tmp)->end_seq; |
12439 |
++ kfree_skb(tmp); |
12440 |
++ } |
12441 |
++ |
12442 |
++ ans = 0; |
12443 |
++ } |
12444 |
++ } |
12445 |
++ |
12446 |
++ return ans; |
12447 |
++} |
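The walk above is ordinary RFC 1071 ones'-complement summation; the first_word handling only exists because a 16-bit word of the checksummed data may straddle two skbs. A compact user-space reference of the semantics (not the kernel's optimized csum_partial()/csum_fold()): a verifier that sums the payload plus the transmitted checksum must fold to zero, the same test as csum_fold(csum_tcp) == 0 above.

#include <stdint.h>
#include <stdio.h>

/* RFC 1071 ones' complement sum; a trailing odd byte is zero-padded,
 * which is what the first_word carry above emulates across skbs.
 */
static uint32_t csum_add(uint32_t sum, const uint8_t *p, size_t len)
{
	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;
	return sum;
}

static uint16_t csum_fold16(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t buf[32] = "an odd-length payload";	/* 21 bytes, rest 0 */
	uint16_t csum = csum_fold16(csum_add(0, buf, 21));

	buf[22] = csum >> 8;	/* store after the zero pad byte */
	buf[23] = csum & 0xff;
	printf("folds to zero: %d\n",
	       csum_fold16(csum_add(0, buf, 24)) == 0);
	return 0;
}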
12448 |
++ |
12449 |
++static inline void mptcp_prepare_skb(struct sk_buff *skb, |
12450 |
++ const struct sock *sk) |
12451 |
++{ |
12452 |
++ const struct tcp_sock *tp = tcp_sk(sk); |
12453 |
++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
12454 |
++ u32 inc = 0; |
12455 |
++ |
12456 |
++ /* If skb is the end of this mapping (end is always at mapping-boundary |
12457 |
++ * thanks to the splitting/trimming), then we need to increase |
12458 |
++ * data-end-seq by 1 if this is a data-fin. |
12459 |
++ * |
12460 |
++ * We need to do -1 because end_seq includes the subflow-FIN. |
12461 |
++ */ |
12462 |
++ if (tp->mptcp->map_data_fin && |
12463 |
++ (tcb->end_seq - (tcp_hdr(skb)->fin ? 1 : 0)) == |
12464 |
++ (tp->mptcp->map_subseq + tp->mptcp->map_data_len)) { |
12465 |
++ inc = 1; |
12466 |
++ |
12467 |
++ /* We manually set the fin-flag if it is a data-fin. For easy |
12468 |
++ * processing in tcp_recvmsg. |
12469 |
++ */ |
12470 |
++ tcp_hdr(skb)->fin = 1; |
12471 |
++ } else { |
12472 |
++ /* We may have a subflow-fin with data but without data-fin */ |
12473 |
++ tcp_hdr(skb)->fin = 0; |
12474 |
++ } |
12475 |
++ |
12476 |
++ /* Adapt data-seq's to the packet itself. We kinda transform the |
12477 |
++ * dss-mapping to a per-packet granularity. This is necessary to |
12478 |
++ * correctly handle overlapping mappings coming from different |
12479 |
++ * subflows. Otherwise it would be a complete mess. |
12480 |
++ */ |
12481 |
++ tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq; |
12482 |
++ tcb->end_seq = tcb->seq + skb->len + inc; |
12483 |
++} |
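The two assignments at the end of mptcp_prepare_skb() re-express the subflow sequence numbers at the data level: the skb's offset inside the mapping, tcb->seq - map_subseq, is added to the mapping's data sequence number, and end_seq grows by one only for a DATA_FIN. A worked example with made-up numbers:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical mapping: data_seq 1000 covers subflow bytes
	 * 5000..5999 (map_subseq 5000, map_data_len 1000).
	 */
	uint32_t map_data_seq = 1000, map_subseq = 5000;
	/* An skb carrying subflow bytes 5300..5499, not a DATA_FIN. */
	uint32_t seq = 5300, len = 200, inc = 0;

	seq = map_data_seq + seq - map_subseq;
	assert(seq == 1300);			/* data-level seq */
	assert(seq + len + inc == 1500);	/* data-level end_seq */
	return 0;
}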
12484 |
++ |
12485 |
++/** |
12486 |
++ * @return: 1 if the segment has been eaten and can be suppressed, |
12487 |
++ * otherwise 0. |
12488 |
++ */ |
12489 |
++static inline int mptcp_direct_copy(const struct sk_buff *skb, |
12490 |
++ struct sock *meta_sk) |
12491 |
++{ |
12492 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
12493 |
++ int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len); |
12494 |
++ int eaten = 0; |
12495 |
++ |
12496 |
++ __set_current_state(TASK_RUNNING); |
12497 |
++ |
12498 |
++ local_bh_enable(); |
12499 |
++ if (!skb_copy_datagram_iovec(skb, 0, meta_tp->ucopy.iov, chunk)) { |
12500 |
++ meta_tp->ucopy.len -= chunk; |
12501 |
++ meta_tp->copied_seq += chunk; |
12502 |
++ eaten = (chunk == skb->len); |
12503 |
++ tcp_rcv_space_adjust(meta_sk); |
12504 |
++ } |
12505 |
++ local_bh_disable(); |
12506 |
++ return eaten; |
12507 |
++} |
12508 |
++ |
12509 |
++static inline void mptcp_reset_mapping(struct tcp_sock *tp) |
12510 |
++{ |
12511 |
++ tp->mptcp->map_data_len = 0; |
12512 |
++ tp->mptcp->map_data_seq = 0; |
12513 |
++ tp->mptcp->map_subseq = 0; |
12514 |
++ tp->mptcp->map_data_fin = 0; |
12515 |
++ tp->mptcp->mapping_present = 0; |
12516 |
++} |
12517 |
++ |
12518 |
++/* The DSS-mapping received on the sk only covers the second half of the skb |
12519 |
++ * (cut at seq). We trim the head from the skb. |
12520 |
++ * Data will be freed upon kfree(). |
12521 |
++ * |
12522 |
++ * Inspired by tcp_trim_head(). |
12523 |
++ */ |
12524 |
++static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq) |
12525 |
++{ |
12526 |
++ int len = seq - TCP_SKB_CB(skb)->seq; |
12527 |
++ u32 new_seq = TCP_SKB_CB(skb)->seq + len; |
12528 |
++ |
12529 |
++ if (len < skb_headlen(skb)) |
12530 |
++ __skb_pull(skb, len); |
12531 |
++ else |
12532 |
++ __pskb_trim_head(skb, len - skb_headlen(skb)); |
12533 |
++ |
12534 |
++ TCP_SKB_CB(skb)->seq = new_seq; |
12535 |
++ |
12536 |
++ skb->truesize -= len; |
12537 |
++ atomic_sub(len, &sk->sk_rmem_alloc); |
12538 |
++ sk_mem_uncharge(sk, len); |
12539 |
++} |
12540 |
++ |
12541 |
++/* The DSS-mapping received on the sk only covers the first half of the skb |
12542 |
++ * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue |
12543 |
++ * as further packets may resolve the mapping of the second half of data. |
12544 |
++ * |
12545 |
++ * Inspired by tcp_fragment(). |
12546 |
++ */ |
12547 |
++static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq) |
12548 |
++{ |
12549 |
++ struct sk_buff *buff; |
12550 |
++ int nsize; |
12551 |
++ int nlen, len; |
12552 |
++ |
12553 |
++ len = seq - TCP_SKB_CB(skb)->seq; |
12554 |
++ nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len; |
12555 |
++ if (nsize < 0) |
12556 |
++ nsize = 0; |
12557 |
++ |
12558 |
++ /* Get a new skb... force flag on. */ |
12559 |
++ buff = alloc_skb(nsize, GFP_ATOMIC); |
12560 |
++ if (buff == NULL) |
12561 |
++ return -ENOMEM; |
12562 |
++ |
12563 |
++ skb_reserve(buff, tcp_sk(sk)->tcp_header_len); |
12564 |
++ skb_reset_transport_header(buff); |
12565 |
++ |
12566 |
++ tcp_hdr(buff)->fin = tcp_hdr(skb)->fin; |
12567 |
++ tcp_hdr(skb)->fin = 0; |
12568 |
++ |
12569 |
++ /* We absolutely need to call skb_set_owner_r before refreshing the |
12570 |
++ * truesize of buff, otherwise the moved data will account twice. |
12571 |
++ */ |
12572 |
++ skb_set_owner_r(buff, sk); |
12573 |
++ nlen = skb->len - len - nsize; |
12574 |
++ buff->truesize += nlen; |
12575 |
++ skb->truesize -= nlen; |
12576 |
++ |
12577 |
++ /* Correct the sequence numbers. */ |
12578 |
++ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; |
12579 |
++ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; |
12580 |
++ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; |
12581 |
++ |
12582 |
++ skb_split(skb, buff, len); |
12583 |
++ |
12584 |
++ __skb_queue_after(&sk->sk_receive_queue, skb, buff); |
12585 |
++ |
12586 |
++ return 0; |
12587 |
++} |
12588 |
++ |
12589 |
++/* @return: 0 everything is fine. Just continue processing |
12590 |
++ * 1 subflow is broken stop everything |
12591 |
++ * -1 this packet was broken - continue with the next one. |
12592 |
++ */ |
12593 |
++static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb) |
12594 |
++{ |
12595 |
++ struct tcp_sock *tp = tcp_sk(sk); |
12596 |
++ |
12597 |
++ /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */ |
12598 |
++ if (!skb->len && tcp_hdr(skb)->fin && !mptcp_is_data_fin(skb) && |
12599 |
++ !tp->mpcb->infinite_mapping_rcv) { |
12600 |
++ /* Remove a pure subflow-fin from the queue and increase |
12601 |
++ * copied_seq. |
12602 |
++ */ |
12603 |
++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq; |
12604 |
++ __skb_unlink(skb, &sk->sk_receive_queue); |
12605 |
++ __kfree_skb(skb); |
12606 |
++ return -1; |
12607 |
++ } |
12608 |
++ |
12609 |
++ /* If we are not yet fully established and do not know the mapping for |
12610 |
++ * this segment, this path has to fall back to infinite or be torn down. |
12611 |
++ */ |
12612 |
++ if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) && |
12613 |
++ !tp->mptcp->mapping_present && !tp->mpcb->infinite_mapping_rcv) { |
12614 |
++ pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n", |
12615 |
++ __func__, tp->mpcb->mptcp_loc_token, |
12616 |
++ tp->mptcp->path_index, __builtin_return_address(0), |
12617 |
++ TCP_SKB_CB(skb)->seq); |
12618 |
++ |
12619 |
++ if (!is_master_tp(tp)) { |
12620 |
++ mptcp_send_reset(sk); |
12621 |
++ return 1; |
12622 |
++ } |
12623 |
++ |
12624 |
++ tp->mpcb->infinite_mapping_snd = 1; |
12625 |
++ tp->mpcb->infinite_mapping_rcv = 1; |
12626 |
++ /* We do a seamless fallback and should not send an inf. mapping. */ |
12627 |
++ tp->mpcb->send_infinite_mapping = 0; |
12628 |
++ tp->mptcp->fully_established = 1; |
12629 |
++ } |
12630 |
++ |
12631 |
++ /* Receiver-side becomes fully established when a whole rcv-window has |
12632 |
++ * been received without the need to fallback due to the previous |
12633 |
++ * condition. |
12634 |
++ */ |
12635 |
++ if (!tp->mptcp->fully_established) { |
12636 |
++ tp->mptcp->init_rcv_wnd -= skb->len; |
12637 |
++ if (tp->mptcp->init_rcv_wnd < 0) |
12638 |
++ mptcp_become_fully_estab(sk); |
12639 |
++ } |
12640 |
++ |
12641 |
++ return 0; |
12642 |
++} |
12643 |
++ |
12644 |
++/* @return: 0 everything is fine. Just continue processing |
12645 |
++ * 1 subflow is broken stop everything |
12646 |
++ * -1 this packet was broken - continue with the next one. |
12647 |
++ */ |
12648 |
++static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb) |
12649 |
++{ |
12650 |
++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); |
12651 |
++ struct mptcp_cb *mpcb = tp->mpcb; |
12652 |
++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
12653 |
++ u32 *ptr; |
12654 |
++ u32 data_seq, sub_seq, data_len, tcp_end_seq; |
12655 |
++ |
12656 |
++ /* If we are in infinite-mapping-mode, the subflow is guaranteed to be |
12657 |
++ * in-order at the data-level. Thus data-seq-numbers can be inferred |
12658 |
++ * from what is expected at the data-level. |
12659 |
++ */ |
12660 |
++ if (mpcb->infinite_mapping_rcv) { |
12661 |
++ tp->mptcp->map_data_seq = mptcp_get_rcv_nxt_64(meta_tp); |
12662 |
++ tp->mptcp->map_subseq = tcb->seq; |
12663 |
++ tp->mptcp->map_data_len = skb->len; |
12664 |
++ tp->mptcp->map_data_fin = tcp_hdr(skb)->fin; |
12665 |
++ tp->mptcp->mapping_present = 1; |
12666 |
++ return 0; |
12667 |
++ } |
12668 |
++ |
12669 |
++ /* No mapping here? Exit - it is either already set or still on its way */ |
12670 |
++ if (!mptcp_is_data_seq(skb)) { |
12671 |
++ /* Too many packets without a mapping - this subflow is broken */ |
12672 |
++ if (!tp->mptcp->mapping_present && |
12673 |
++ tp->rcv_nxt - tp->copied_seq > 65536) { |
12674 |
++ mptcp_send_reset(sk); |
12675 |
++ return 1; |
12676 |
++ } |
12677 |
++ |
12678 |
++ return 0; |
12679 |
++ } |
12680 |
++ |
12681 |
++ ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb); |
12682 |
++ ptr++; |
12683 |
++ sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn; |
12684 |
++ ptr++; |
12685 |
++ data_len = get_unaligned_be16(ptr); |
12686 |
++ |
12687 |
++ /* If it's an empty skb with DATA_FIN, sub_seq must get fixed. |
12688 |
++ * The draft sets it to 0, but we really would like to have the |
12689 |
++ * real value, to simplify the handling later in this |
12690 |
++ * function. |
12691 |
++ */ |
12692 |
++ if (mptcp_is_data_fin(skb) && skb->len == 0) |
12693 |
++ sub_seq = TCP_SKB_CB(skb)->seq; |
12694 |
++ |
12695 |
++ /* If there is already a mapping - we check if it maps with the current |
12696 |
++ * one. If not - we reset. |
12697 |
++ */ |
12698 |
++ if (tp->mptcp->mapping_present && |
12699 |
++ (data_seq != (u32)tp->mptcp->map_data_seq || |
12700 |
++ sub_seq != tp->mptcp->map_subseq || |
12701 |
++ data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin || |
12702 |
++ mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) { |
12703 |
++ /* Mapping in packet is different from what we want */ |
12704 |
++ pr_err("%s Mappings do not match!\n", __func__); |
12705 |
++ pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n", |
12706 |
++ __func__, data_seq, (u32)tp->mptcp->map_data_seq, |
12707 |
++ sub_seq, tp->mptcp->map_subseq, data_len, |
12708 |
++ tp->mptcp->map_data_len, mptcp_is_data_fin(skb), |
12709 |
++ tp->mptcp->map_data_fin); |
12710 |
++ mptcp_send_reset(sk); |
12711 |
++ return 1; |
12712 |
++ } |
12713 |
++ |
12714 |
++ /* If the previous check was good, the current mapping is valid and we exit. */ |
12715 |
++ if (tp->mptcp->mapping_present) |
12716 |
++ return 0; |
12717 |
++ |
12718 |
++ /* Mapping not yet set on this subflow - we set it here! */ |
12719 |
++ |
12720 |
++ if (!data_len) { |
12721 |
++ mpcb->infinite_mapping_rcv = 1; |
12722 |
++ tp->mptcp->fully_established = 1; |
12723 |
++ /* We need to repeat mp_fail's until the sender felt |
12724 |
++ * back to infinite-mapping - here we stop repeating it. |
12725 |
++ */ |
12726 |
++ tp->mptcp->send_mp_fail = 0; |
12727 |
++ |
12728 |
++ /* We have to fixup data_len - it must be the same as skb->len */ |
12729 |
++ data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0); |
12730 |
++ sub_seq = tcb->seq; |
12731 |
++ |
12732 |
++ /* TODO kill all other subflows than this one */ |
12733 |
++ /* data_seq and so on are set correctly */ |
12734 |
++ |
12735 |
++ /* At this point, the meta-ofo-queue has to be emptied, |
12736 |
++ * as the following data is guaranteed to be in-order at |
12737 |
++ * the data and subflow-level |
12738 |
++ */ |
12739 |
++ mptcp_purge_ofo_queue(meta_tp); |
12740 |
++ } |
12741 |
++ |
12742 |
++ /* We are sending mp-fail's and thus are in fallback mode. |
12743 |
++ * Ignore packets which do not announce the fallback and still |
12744 |
++ * want to provide a mapping. |
12745 |
++ */ |
12746 |
++ if (tp->mptcp->send_mp_fail) { |
12747 |
++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq; |
12748 |
++ __skb_unlink(skb, &sk->sk_receive_queue); |
12749 |
++ __kfree_skb(skb); |
12750 |
++ return -1; |
12751 |
++ } |
12752 |
++ |
12753 |
++ /* FIN increased the mapping-length by 1 */ |
12754 |
++ if (mptcp_is_data_fin(skb)) |
12755 |
++ data_len--; |
12756 |
++ |
12757 |
++ /* Subflow-sequences of the packet must be |
12758 |
++ * (at least partially) part of the DSS-mapping's |
12759 |
++ * subflow-sequence-space. |
12760 |
++ * |
12761 |
++ * Basically the mapping is not valid, if either of the |
12762 |
++ * following conditions is true: |
12763 |
++ * |
12764 |
++ * 1. It's not a data_fin and |
12765 |
++ * MPTCP-sub_seq >= TCP-end_seq |
12766 |
++ * |
12767 |
++ * 2. It's a data_fin and TCP-end_seq > TCP-seq and |
12768 |
++ * MPTCP-sub_seq >= TCP-end_seq |
12769 |
++ * |
12770 |
++ * The previous two can be merged into: |
12771 |
++ * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq |
12772 |
++ * Because if it's not a data-fin, TCP-end_seq > TCP-seq |
12773 |
++ * |
12774 |
++ * 3. It's a data_fin and skb->len == 0 and |
12775 |
++ * MPTCP-sub_seq > TCP-end_seq |
12776 |
++ * |
12777 |
++ * 4. It's not a data_fin and TCP-end_seq > TCP-seq and |
12778 |
++ * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq |
12779 |
++ * |
12780 |
++ * 5. MPTCP-sub_seq is prior to what we already copied (copied_seq) |
12781 |
++ */ |
12782 |
++ |
12783 |
++ /* subflow-fin is not part of the mapping - ignore it here! */ |
12784 |
++ tcp_end_seq = tcb->end_seq - tcp_hdr(skb)->fin; |
12785 |
++ if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) || |
12786 |
++ (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) || |
12787 |
++ (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq)) || |
12788 |
++ before(sub_seq, tp->copied_seq)) { |
12789 |
++ /* The packet's subflow-sequences differ from what is in the |
12790 |
++ * packet's dss-mapping. The peer is misbehaving - reset |
12791 |
++ */ |
12792 |
++ pr_err("%s Packet's mapping does not map to the DSS sub_seq %u " |
12793 |
++ "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u" |
12794 |
++ "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb), |
12795 |
++ skb->len, data_len, tp->copied_seq); |
12796 |
++ mptcp_send_reset(sk); |
12797 |
++ return 1; |
12798 |
++ } |
12799 |
++ |
12800 |
++ /* Did the DSS have 64-bit seqnums? */ |
12801 |
++ if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) { |
12802 |
++ /* Wrapped around? */ |
12803 |
++ if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) { |
12804 |
++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq); |
12805 |
++ } else { |
12806 |
++ /* Else, access the default high-order bits */ |
12807 |
++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq); |
12808 |
++ } |
12809 |
++ } else { |
12810 |
++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq); |
12811 |
++ |
12812 |
++ if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) { |
12813 |
++ /* We make sure that the data_seq is invalid. |
12814 |
++ * It will be dropped later. |
12815 |
++ */ |
12816 |
++ tp->mptcp->map_data_seq += 0xFFFFFFFF; |
12817 |
++ tp->mptcp->map_data_seq += 0xFFFFFFFF; |
12818 |
++ } |
12819 |
++ } |
12820 |
++ |
12821 |
++ tp->mptcp->map_data_len = data_len; |
12822 |
++ tp->mptcp->map_subseq = sub_seq; |
12823 |
++ tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0; |
12824 |
++ tp->mptcp->mapping_present = 1; |
12825 |
++ |
12826 |
++ return 0; |
12827 |
++} |
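The merged conditions 1-5 in the long comment above become the four-clause if right after it. Pulled out as a standalone predicate for readability, this is a sketch with hypothetical parameter names, reusing the kernel's wrap-safe before()/after() semantics:

#include <stdbool.h>
#include <stdint.h>

static bool before(uint32_t s1, uint32_t s2)
{
	return (int32_t)(s1 - s2) < 0;
}
#define after(s1, s2) before(s2, s1)

/* True when the DSS mapping cannot cover this TCP segment. */
static bool mapping_invalid(uint32_t sub_seq, uint32_t data_len, bool dfin,
			    uint32_t seq, uint32_t end_seq,
			    uint32_t tcp_end_seq, uint32_t skb_len,
			    uint32_t copied_seq)
{
	return (!before(sub_seq, end_seq) && after(tcp_end_seq, seq)) ||
	       (dfin && skb_len == 0 && after(sub_seq, end_seq)) ||
	       (!after(sub_seq + data_len, seq) && after(tcp_end_seq, seq)) ||
	       before(sub_seq, copied_seq);
}

int main(void)
{
	/* Mapping starts at 300 but the segment covers 100..200: invalid. */
	return mapping_invalid(300, 100, false, 100, 200, 200, 100, 50) ? 0 : 1;
}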
12828 |
++ |
12829 |
++/* Similar to tcp_sequence(...) */ |
12830 |
++static inline bool mptcp_sequence(const struct tcp_sock *meta_tp, |
12831 |
++ u64 data_seq, u64 end_data_seq) |
12832 |
++{ |
12833 |
++ const struct mptcp_cb *mpcb = meta_tp->mpcb; |
12834 |
++ u64 rcv_wup64; |
12835 |
++ |
12836 |
++ /* Wrap-around? */ |
12837 |
++ if (meta_tp->rcv_wup > meta_tp->rcv_nxt) { |
12838 |
++ rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) | |
12839 |
++ meta_tp->rcv_wup; |
12840 |
++ } else { |
12841 |
++ rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, |
12842 |
++ meta_tp->rcv_wup); |
12843 |
++ } |
12844 |
++ |
12845 |
++ return !before64(end_data_seq, rcv_wup64) && |
12846 |
++ !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp)); |
12847 |
++} |
12848 |
++ |
12849 |
++/* @return: 0 everything is fine. Just continue processing |
12850 |
++ * -1 this packet was broken - continue with the next one. |
12851 |
++ */ |
12852 |
++static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb) |
12853 |
++{ |
12854 |
++ struct tcp_sock *tp = tcp_sk(sk); |
12855 |
++ struct sk_buff *tmp, *tmp1; |
12856 |
++ u32 tcp_end_seq; |
12857 |
++ |
12858 |
++ if (!tp->mptcp->mapping_present) |
12859 |
++ return 0; |
12860 |
++ |
12861 |
++ /* either, the new skb gave us the mapping and the first segment |
12862 |
++ * in the sub-rcv-queue has to be trimmed ... |
12863 |
++ */ |
12864 |
++ tmp = skb_peek(&sk->sk_receive_queue); |
12865 |
++ if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) && |
12866 |
++ after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq)) |
12867 |
++ mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq); |
12868 |
++ |
12869 |
++ /* ... or the new skb (tail) has to be split at the end. */ |
12870 |
++ tcp_end_seq = TCP_SKB_CB(skb)->end_seq - (tcp_hdr(skb)->fin ? 1 : 0); |
12871 |
++ if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) { |
12872 |
++ u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len; |
12873 |
++ if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */ |
12874 |
++ /* TODO : maybe handle this here better. |
12875 |
++ * We now just force meta-retransmission. |
12876 |
++ */ |
12877 |
++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq; |
12878 |
++ __skb_unlink(skb, &sk->sk_receive_queue); |
12879 |
++ __kfree_skb(skb); |
12880 |
++ return -1; |
12881 |
++ } |
12882 |
++ } |
12883 |
++ |
12884 |
++ /* Now, remove old sk_buff's from the receive-queue. |
12885 |
++ * This may happen if the mapping has been lost for these segments and |
12886 |
++ * the next mapping has already been received. |
12887 |
++ */ |
12888 |
++ if (before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) { |
12889 |
++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { |
12890 |
++ if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq)) |
12891 |
++ break; |
12892 |
++ |
12893 |
++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; |
12894 |
++ __skb_unlink(tmp1, &sk->sk_receive_queue); |
12895 |
++ |
12896 |
++ /* Impossible that we could free the skb here, because its |
12897 |
++ * mapping is known to be valid from previous checks |
12898 |
++ */ |
12899 |
++ __kfree_skb(tmp1); |
12900 |
++ } |
12901 |
++ } |
12902 |
++ |
12903 |
++ return 0; |
12904 |
++} |
12905 |
++ |
12906 |
++/* @return: 0 everything is fine. Just continue processing |
12907 |
++ * 1 subflow is broken stop everything |
12908 |
++ * -1 this mapping has been put in the meta-receive-queue |
12909 |
++ * -2 this mapping has been eaten by the application |
12910 |
++ */ |
12911 |
++static int mptcp_queue_skb(struct sock *sk) |
12912 |
++{ |
12913 |
++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); |
12914 |
++ struct sock *meta_sk = mptcp_meta_sk(sk); |
12915 |
++ struct mptcp_cb *mpcb = tp->mpcb; |
12916 |
++ struct sk_buff *tmp, *tmp1; |
12917 |
++ u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp); |
12918 |
++ bool data_queued = false; |
12919 |
++ |
12920 |
++ /* Have we not yet received the full mapping? */ |
12921 |
++ if (!tp->mptcp->mapping_present || |
12922 |
++ before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) |
12923 |
++ return 0; |
12924 |
++ |
12925 |
++ /* Is this an overlapping mapping? rcv_nxt >= end_data_seq |
12926 |
++ * OR |
12927 |
++ * This mapping is out of window |
12928 |
++ */ |
12929 |
++ if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) || |
12930 |
++ !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq, |
12931 |
++ tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) { |
12932 |
++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { |
12933 |
++ __skb_unlink(tmp1, &sk->sk_receive_queue); |
12934 |
++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; |
12935 |
++ __kfree_skb(tmp1); |
12936 |
++ |
12937 |
++ if (!skb_queue_empty(&sk->sk_receive_queue) && |
12938 |
++ !before(TCP_SKB_CB(tmp)->seq, |
12939 |
++ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) |
12940 |
++ break; |
12941 |
++ } |
12942 |
++ |
12943 |
++ mptcp_reset_mapping(tp); |
12944 |
++ |
12945 |
++ return -1; |
12946 |
++ } |
12947 |
++ |
12948 |
++ /* Record it, because we want to send our data_fin on the same path */ |
12949 |
++ if (tp->mptcp->map_data_fin) { |
12950 |
++ mpcb->dfin_path_index = tp->mptcp->path_index; |
12951 |
++ mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN); |
12952 |
++ } |
12953 |
++ |
12954 |
++ /* Verify the checksum */ |
12955 |
++ if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) { |
12956 |
++ int ret = mptcp_verif_dss_csum(sk); |
12957 |
++ |
++		if (ret <= 0) {
++			mptcp_reset_mapping(tp);
++			return 1;
++		}
++	}
++
++	if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) {
++		/* Segments have to go to the meta-ofo-queue */
++		skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
++			tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
++			mptcp_prepare_skb(tmp1, sk);
++			__skb_unlink(tmp1, &sk->sk_receive_queue);
++			/* MUST be done here, because fragstolen may be true later.
++			 * Then, kfree_skb_partial will not account the memory.
++			 */
++			skb_orphan(tmp1);
++
++			if (!mpcb->in_time_wait) /* In time-wait, do not receive data */
++				mptcp_add_meta_ofo_queue(meta_sk, tmp1, sk);
++			else
++				__kfree_skb(tmp1);
++
++			if (!skb_queue_empty(&sk->sk_receive_queue) &&
++			    !before(TCP_SKB_CB(tmp)->seq,
++				    tp->mptcp->map_subseq + tp->mptcp->map_data_len))
++				break;
++		}
++		tcp_enter_quickack_mode(sk);
++	} else {
++		/* Ready for the meta-rcv-queue */
++		skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
++			int eaten = 0;
++			const bool copied_early = false;
++			bool fragstolen = false;
++			u32 old_rcv_nxt = meta_tp->rcv_nxt;
++
++			tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
++			mptcp_prepare_skb(tmp1, sk);
++			__skb_unlink(tmp1, &sk->sk_receive_queue);
++			/* MUST be done here, because fragstolen may be true.
++			 * Then, kfree_skb_partial will not account the memory.
++			 */
++			skb_orphan(tmp1);
++
++			/* This segment has already been received */
++			if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) {
++				__kfree_skb(tmp1);
++				goto next;
++			}
++
++#ifdef CONFIG_NET_DMA
++			if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
++			    meta_tp->ucopy.task == current &&
++			    meta_tp->copied_seq == meta_tp->rcv_nxt &&
++			    tmp1->len <= meta_tp->ucopy.len &&
++			    sock_owned_by_user(meta_sk) &&
++			    tcp_dma_try_early_copy(meta_sk, tmp1, 0)) {
++				copied_early = true;
++				eaten = 1;
++			}
++#endif
++
++			/* Is direct copy possible ? */
++			if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
++			    meta_tp->ucopy.task == current &&
++			    meta_tp->copied_seq == meta_tp->rcv_nxt &&
++			    meta_tp->ucopy.len && sock_owned_by_user(meta_sk) &&
++			    !copied_early)
++				eaten = mptcp_direct_copy(tmp1, meta_sk);
++
++			if (mpcb->in_time_wait) /* In time-wait, do not receive data */
++				eaten = 1;
++
++			if (!eaten)
++				eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen);
++
++			meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq;
++			mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
++
++#ifdef CONFIG_NET_DMA
++			if (copied_early)
++				meta_tp->cleanup_rbuf(meta_sk, tmp1->len);
++#endif
++
++			if (tcp_hdr(tmp1)->fin && !mpcb->in_time_wait)
++				mptcp_fin(meta_sk);
++
++			/* Check if this fills a gap in the ofo queue */
++			if (!skb_queue_empty(&meta_tp->out_of_order_queue))
++				mptcp_ofo_queue(meta_sk);
++
++#ifdef CONFIG_NET_DMA
++			if (copied_early)
++				__skb_queue_tail(&meta_sk->sk_async_wait_queue,
++						 tmp1);
++			else
++#endif
++			if (eaten)
++				kfree_skb_partial(tmp1, fragstolen);
++
++			data_queued = true;
++next:
++			if (!skb_queue_empty(&sk->sk_receive_queue) &&
++			    !before(TCP_SKB_CB(tmp)->seq,
++				    tp->mptcp->map_subseq + tp->mptcp->map_data_len))
++				break;
++		}
++	}
++
++	inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp;
++	mptcp_reset_mapping(tp);
++
++	return data_queued ? -1 : -2;
++}
++
++void mptcp_data_ready(struct sock *sk)
++{
++	struct sock *meta_sk = mptcp_meta_sk(sk);
++	struct sk_buff *skb, *tmp;
++	int queued = 0;
++
++	/* The restart-label is placed before the check, because mptcp_fin
++	 * might have changed the state.
++	 */
++restart:
++	/* If the meta cannot receive data, there is no point in pushing data.
++	 * If we are in time-wait, we may still be waiting for the final FIN.
++	 * So, we should proceed with the processing.
++	 */
++	if (!mptcp_sk_can_recv(meta_sk) && !tcp_sk(sk)->mpcb->in_time_wait) {
++		skb_queue_purge(&sk->sk_receive_queue);
++		tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt;
++		goto exit;
++	}
++
++	/* Iterate over all segments, detect their mapping (if we don't have
++	 * one yet), validate them and push everything one level higher.
++	 */
++	skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
++		int ret;
++		/* Pre-validation - e.g., early fallback */
++		ret = mptcp_prevalidate_skb(sk, skb);
++		if (ret < 0)
++			goto restart;
++		else if (ret > 0)
++			break;
++
++		/* Set the current mapping */
++		ret = mptcp_detect_mapping(sk, skb);
++		if (ret < 0)
++			goto restart;
++		else if (ret > 0)
++			break;
++
++		/* Validation */
++		if (mptcp_validate_mapping(sk, skb) < 0)
++			goto restart;
++
++		/* Push a level higher */
++		ret = mptcp_queue_skb(sk);
++		if (ret < 0) {
++			if (ret == -1)
++				queued = ret;
++			goto restart;
++		} else if (ret == 0) {
++			continue;
++		} else { /* ret == 1 */
++			break;
++		}
++	}
++
++exit:
++	if (tcp_sk(sk)->close_it) {
++		tcp_send_ack(sk);
++		tcp_sk(sk)->ops->time_wait(sk, TCP_TIME_WAIT, 0);
++	}
++
++	if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD))
++		meta_sk->sk_data_ready(meta_sk);
++}
++
++
++int mptcp_check_req(struct sk_buff *skb, struct net *net)
++{
++	const struct tcphdr *th = tcp_hdr(skb);
++	struct sock *meta_sk = NULL;
++
++	/* MPTCP structures not initialized */
++	if (mptcp_init_failed)
++		return 0;
++
++	if (skb->protocol == htons(ETH_P_IP))
++		meta_sk = mptcp_v4_search_req(th->source, ip_hdr(skb)->saddr,
++					      ip_hdr(skb)->daddr, net);
++#if IS_ENABLED(CONFIG_IPV6)
++	else /* IPv6 */
++		meta_sk = mptcp_v6_search_req(th->source, &ipv6_hdr(skb)->saddr,
++					      &ipv6_hdr(skb)->daddr, net);
++#endif /* CONFIG_IPV6 */
++
++	if (!meta_sk)
++		return 0;
++
++	TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
++
++	bh_lock_sock_nested(meta_sk);
++	if (sock_owned_by_user(meta_sk)) {
++		skb->sk = meta_sk;
++		if (unlikely(sk_add_backlog(meta_sk, skb,
++					    meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
++			bh_unlock_sock(meta_sk);
++			NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
++			sock_put(meta_sk); /* Taken by mptcp_search_req */
++			kfree_skb(skb);
++			return 1;
++		}
++	} else if (skb->protocol == htons(ETH_P_IP)) {
++		tcp_v4_do_rcv(meta_sk, skb);
++#if IS_ENABLED(CONFIG_IPV6)
++	} else { /* IPv6 */
++		tcp_v6_do_rcv(meta_sk, skb);
++#endif /* CONFIG_IPV6 */
++	}
++	bh_unlock_sock(meta_sk);
++	sock_put(meta_sk); /* Taken by mptcp_vX_search_req */
++	return 1;
++}
++
++struct mp_join *mptcp_find_join(const struct sk_buff *skb)
++{
++	const struct tcphdr *th = tcp_hdr(skb);
++	unsigned char *ptr;
++	int length = (th->doff * 4) - sizeof(struct tcphdr);
++
++	/* Jump through the options to check whether JOIN is there */
++	ptr = (unsigned char *)(th + 1);
++	while (length > 0) {
++		int opcode = *ptr++;
++		int opsize;
++
++		switch (opcode) {
++		case TCPOPT_EOL:
++			return NULL;
++		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
++			length--;
++			continue;
++		default:
++			opsize = *ptr++;
++			if (opsize < 2)	/* "silly options" */
++				return NULL;
++			if (opsize > length)
++				return NULL;	/* don't parse partial options */
++			if (opcode == TCPOPT_MPTCP &&
++			    ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) {
++				return (struct mp_join *)(ptr - 2);
++			}
++			ptr += opsize - 2;
++			length -= opsize;
++		}
++	}
++	return NULL;
++}
++
++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw)
++{
++	const struct mptcp_cb *mpcb;
++	struct sock *meta_sk;
++	u32 token;
++	bool meta_v4;
++	struct mp_join *join_opt = mptcp_find_join(skb);
++	if (!join_opt)
++		return 0;
++
++	/* MPTCP structures were not initialized, so return error */
++	if (mptcp_init_failed)
++		return -1;
++
++	token = join_opt->u.syn.token;
++	meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token);
++	if (!meta_sk) {
++		mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
++		return -1;
++	}
++
++	meta_v4 = meta_sk->sk_family == AF_INET;
++	if (meta_v4) {
++		if (skb->protocol == htons(ETH_P_IPV6)) {
++			mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
++			sock_put(meta_sk); /* Taken by mptcp_hash_find */
++			return -1;
++		}
++	} else if (skb->protocol == htons(ETH_P_IP) &&
++		   inet6_sk(meta_sk)->ipv6only) {
++		mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
++		sock_put(meta_sk); /* Taken by mptcp_hash_find */
++		return -1;
++	}
++
++	mpcb = tcp_sk(meta_sk)->mpcb;
++	if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) {
++		/* We are in fallback-mode on the reception-side -
++		 * no new subflows!
++		 */
++		sock_put(meta_sk); /* Taken by mptcp_hash_find */
++		return -1;
++	}
++
++	/* Coming from time-wait-sock processing in tcp_v4_rcv.
++	 * We have to deschedule it before continuing, because otherwise
++	 * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req.
++	 */
++	if (tw) {
++		inet_twsk_deschedule(tw, &tcp_death_row);
++		inet_twsk_put(tw);
++	}
++
++	TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
++	/* OK, this is a new syn/join, let's create a new open request and
++	 * send syn+ack
++	 */
++	bh_lock_sock_nested(meta_sk);
++	if (sock_owned_by_user(meta_sk)) {
++		skb->sk = meta_sk;
++		if (unlikely(sk_add_backlog(meta_sk, skb,
++					    meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
++			bh_unlock_sock(meta_sk);
++			NET_INC_STATS_BH(sock_net(meta_sk),
++					 LINUX_MIB_TCPBACKLOGDROP);
++			sock_put(meta_sk); /* Taken by mptcp_hash_find */
++			kfree_skb(skb);
++			return 1;
++		}
++	} else if (skb->protocol == htons(ETH_P_IP)) {
++		tcp_v4_do_rcv(meta_sk, skb);
++#if IS_ENABLED(CONFIG_IPV6)
++	} else {
++		tcp_v6_do_rcv(meta_sk, skb);
++#endif /* CONFIG_IPV6 */
++	}
++	bh_unlock_sock(meta_sk);
++	sock_put(meta_sk); /* Taken by mptcp_hash_find */
++	return 1;
++}
++
++int mptcp_do_join_short(struct sk_buff *skb,
++			const struct mptcp_options_received *mopt,
++			struct net *net)
++{
++	struct sock *meta_sk;
++	u32 token;
++	bool meta_v4;
++
++	token = mopt->mptcp_rem_token;
++	meta_sk = mptcp_hash_find(net, token);
++	if (!meta_sk) {
++		mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
++		return -1;
++	}
++
++	meta_v4 = meta_sk->sk_family == AF_INET;
++	if (meta_v4) {
++		if (skb->protocol == htons(ETH_P_IPV6)) {
++			mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
++			sock_put(meta_sk); /* Taken by mptcp_hash_find */
++			return -1;
++		}
++	} else if (skb->protocol == htons(ETH_P_IP) &&
++		   inet6_sk(meta_sk)->ipv6only) {
++		mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
++		sock_put(meta_sk); /* Taken by mptcp_hash_find */
++		return -1;
++	}
++
++	TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
++
++	/* OK, this is a new syn/join, let's create a new open request and
++	 * send syn+ack
++	 */
++	bh_lock_sock(meta_sk);
++
++	/* This check is also done in mptcp_vX_do_rcv. But, there we cannot
++	 * call tcp_vX_send_reset, because we already hold two socket-locks
++	 * (the listener and the meta from above).
++	 *
++	 * And the send-reset will try to take yet another one (ip_send_reply).
++	 * Thus, we propagate the reset up to tcp_rcv_state_process.
++	 */
++	if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv ||
++	    tcp_sk(meta_sk)->mpcb->send_infinite_mapping ||
++	    meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) {
++		bh_unlock_sock(meta_sk);
++		sock_put(meta_sk); /* Taken by mptcp_hash_find */
++		return -1;
++	}
++
++	if (sock_owned_by_user(meta_sk)) {
++		skb->sk = meta_sk;
++		if (unlikely(sk_add_backlog(meta_sk, skb,
++					    meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf)))
++			NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
++		else
++			/* Must make sure that upper layers won't free the
++			 * skb if it is added to the backlog-queue.
++			 */
++			skb_get(skb);
++	} else {
++		/* mptcp_v4_do_rcv tries to free the skb - we prevent this, as
++		 * the skb will finally be freed by tcp_v4_do_rcv (where we are
++		 * coming from)
++		 */
++		skb_get(skb);
++		if (skb->protocol == htons(ETH_P_IP)) {
++			tcp_v4_do_rcv(meta_sk, skb);
++#if IS_ENABLED(CONFIG_IPV6)
++		} else { /* IPv6 */
++			tcp_v6_do_rcv(meta_sk, skb);
++#endif /* CONFIG_IPV6 */
++		}
++	}
++
++	bh_unlock_sock(meta_sk);
++	sock_put(meta_sk); /* Taken by mptcp_hash_find */
++	return 0;
++}
++
++/**
++ * Equivalent of tcp_fin() for MPTCP
++ * Can be called only when the FIN is validly part
++ * of the data seqnum space - not earlier, while holes remain.
++ */
++void mptcp_fin(struct sock *meta_sk)
++{
++	struct sock *sk = NULL, *sk_it;
++	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++	struct mptcp_cb *mpcb = meta_tp->mpcb;
++
++	mptcp_for_each_sk(mpcb, sk_it) {
++		if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) {
++			sk = sk_it;
++			break;
++		}
++	}
++
++	if (!sk || sk->sk_state == TCP_CLOSE)
++		sk = mptcp_select_ack_sock(meta_sk);
++
++	inet_csk_schedule_ack(sk);
++
++	meta_sk->sk_shutdown |= RCV_SHUTDOWN;
++	sock_set_flag(meta_sk, SOCK_DONE);
++
++	switch (meta_sk->sk_state) {
++	case TCP_SYN_RECV:
++	case TCP_ESTABLISHED:
++		/* Move to CLOSE_WAIT */
++		tcp_set_state(meta_sk, TCP_CLOSE_WAIT);
++		inet_csk(sk)->icsk_ack.pingpong = 1;
++		break;
++
++	case TCP_CLOSE_WAIT:
++	case TCP_CLOSING:
++		/* Received a retransmission of the FIN, do
++		 * nothing.
++		 */
++		break;
++	case TCP_LAST_ACK:
++		/* RFC793: Remain in the LAST-ACK state. */
++		break;
++
++	case TCP_FIN_WAIT1:
++		/* This case occurs when a simultaneous close
++		 * happens, we must ack the received FIN and
++		 * enter the CLOSING state.
++		 */
++		tcp_send_ack(sk);
++		tcp_set_state(meta_sk, TCP_CLOSING);
++		break;
++	case TCP_FIN_WAIT2:
++		/* Received a FIN -- send ACK and enter TIME_WAIT. */
++		tcp_send_ack(sk);
++		meta_tp->ops->time_wait(meta_sk, TCP_TIME_WAIT, 0);
++		break;
++	default:
++		/* Only TCP_LISTEN and TCP_CLOSE are left, in these
++		 * cases we should never reach this piece of code.
++		 */
++		pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__,
++		       meta_sk->sk_state);
++		break;
++	}
++
++	/* It _is_ possible, that we have something out-of-order _after_ FIN.
++	 * Probably, we should reset in this case. For now drop them.
++	 */
++	mptcp_purge_ofo_queue(meta_tp);
++	sk_mem_reclaim(meta_sk);
++
++	if (!sock_flag(meta_sk, SOCK_DEAD)) {
++		meta_sk->sk_state_change(meta_sk);
++
++		/* Do not send POLL_HUP for half duplex close. */
++		if (meta_sk->sk_shutdown == SHUTDOWN_MASK ||
++		    meta_sk->sk_state == TCP_CLOSE)
++			sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP);
++		else
++			sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN);
++	}
++
++	return;
++}
++
++static void mptcp_xmit_retransmit_queue(struct sock *meta_sk)
++{
++	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++	struct sk_buff *skb;
++
++	if (!meta_tp->packets_out)
++		return;
++
++	tcp_for_write_queue(skb, meta_sk) {
++		if (skb == tcp_send_head(meta_sk))
++			break;
++
++		if (mptcp_retransmit_skb(meta_sk, skb))
++			return;
++
++		if (skb == tcp_write_queue_head(meta_sk))
++			inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
++						  inet_csk(meta_sk)->icsk_rto,
++						  TCP_RTO_MAX);
++	}
++}
++
++/* Handle the DATA_ACK */
++static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb)
++{
++	struct sock *meta_sk = mptcp_meta_sk(sk);
++	struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
++	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
++	u32 prior_snd_una = meta_tp->snd_una;
++	int prior_packets;
++	u32 nwin, data_ack, data_seq;
++	u16 data_len = 0;
++
++	/* A valid packet came in - subflow is operational again */
++	tp->pf = 0;
++
++	/* Even if there is no data-ack, we stop retransmitting.
++	 * Except if this is a SYN/ACK. Then it is just a retransmission.
++	 */
++	if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) {
++		tp->mptcp->pre_established = 0;
++		sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
++	}
++
++	/* If we are in infinite mapping mode, rx_opt.data_ack has been
++	 * set by mptcp_clean_rtx_infinite.
++	 */
++	if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd)
++		goto exit;
++
++	data_ack = tp->mptcp->rx_opt.data_ack;
++
++	if (unlikely(!tp->mptcp->fully_established) &&
++	    tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)
++		/* As soon as a subflow-data-ack (not acking syn, thus snt_isn + 1)
++		 * includes a data-ack, we are fully established
++		 */
++		mptcp_become_fully_estab(sk);
++
++	/* Get the data_seq */
++	if (mptcp_is_data_seq(skb)) {
++		data_seq = tp->mptcp->rx_opt.data_seq;
++		data_len = tp->mptcp->rx_opt.data_len;
++	} else {
++		data_seq = meta_tp->snd_wl1;
++	}
++
++	/* If the ack is older than previous acks
++	 * then we can probably ignore it.
++	 */
++	if (before(data_ack, prior_snd_una))
++		goto exit;
++
++	/* If the ack includes data we haven't sent yet, discard
++	 * this segment (RFC793 Section 3.9).
++	 */
++	if (after(data_ack, meta_tp->snd_nxt))
++		goto exit;
++
++	/*** Now, update the window - inspired by tcp_ack_update_window ***/
++	nwin = ntohs(tcp_hdr(skb)->window);
++
++	if (likely(!tcp_hdr(skb)->syn))
++		nwin <<= tp->rx_opt.snd_wscale;
++
++	if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) {
++		tcp_update_wl(meta_tp, data_seq);
++
++		/* Draft v09, Section 3.3.5:
++		 * [...] It should only update its local receive window values
++		 * when the largest sequence number allowed (i.e. DATA_ACK +
++		 * receive window) increases. [...]
++		 */
++		if (meta_tp->snd_wnd != nwin &&
++		    !before(data_ack + nwin, tcp_wnd_end(meta_tp))) {
++			meta_tp->snd_wnd = nwin;
++
++			if (nwin > meta_tp->max_window)
++				meta_tp->max_window = nwin;
++		}
++	}
++	/*** Done, update the window ***/
++
++	/* We passed data and got it acked, remove any soft error
++	 * log. Something worked...
++	 */
++	sk->sk_err_soft = 0;
++	inet_csk(meta_sk)->icsk_probes_out = 0;
++	meta_tp->rcv_tstamp = tcp_time_stamp;
++	prior_packets = meta_tp->packets_out;
++	if (!prior_packets)
++		goto no_queue;
++
++	meta_tp->snd_una = data_ack;
++
++	mptcp_clean_rtx_queue(meta_sk, prior_snd_una);
++
++	/* We are in loss-state, and something got acked, retransmit the whole
++	 * queue now!
++	 */
++	if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss &&
++	    after(data_ack, prior_snd_una)) {
++		mptcp_xmit_retransmit_queue(meta_sk);
++		inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open;
++	}
++
++	/* Simplified version of tcp_new_space, because the snd-buffer
++	 * is handled by all the subflows.
++	 */
++	if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) {
++		sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK);
++		if (meta_sk->sk_socket &&
++		    test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
++			meta_sk->sk_write_space(meta_sk);
++	}
++
++	if (meta_sk->sk_state != TCP_ESTABLISHED &&
++	    mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len))
++		return;
++
++exit:
++	mptcp_push_pending_frames(meta_sk);
++
++	return;
++
++no_queue:
++	if (tcp_send_head(meta_sk))
++		tcp_ack_probe(meta_sk);
++
++	mptcp_push_pending_frames(meta_sk);
++
++	return;
++}
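
The Draft v09 rule quoted in mptcp_data_ack is easiest to see with concrete numbers. The helper below is an editorial sketch (hypothetical name, not part of the patch): the meta-level send window is only updated when the largest allowed data-level sequence number, data_ack + nwin, does not fall behind the current window edge snd_una + snd_wnd.

/* Editorial sketch (hypothetical helper, not from the patch). With
 * snd_una = 1000 and snd_wnd = 500 the edge is 1500; an ACK with
 * data_ack = 1200 and nwin = 400 allows 1600 and updates the window,
 * while nwin = 200 (edge 1400) is ignored.
 */
#include <stdint.h>

static int win_before(uint32_t a, uint32_t b)	/* before() on wrapping u32 */
{
	return (int32_t)(a - b) < 0;
}

static int window_update_allowed(uint32_t snd_una, uint32_t snd_wnd,
				 uint32_t data_ack, uint32_t nwin)
{
	return !win_before(data_ack + nwin, snd_una + snd_wnd);
}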
++
++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk)
++{
++	struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk));
++
++	if (!tp->mpcb->infinite_mapping_snd)
++		return;
++
++	/* The difference between both write_seq's represents the offset between
++	 * data-sequence and subflow-sequence. As we are infinite, this must
++	 * match.
++	 *
++	 * Thus, from this difference we can infer the meta snd_una.
++	 */
++	tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt +
++				     tp->snd_una;
++
++	mptcp_data_ack(sk, skb);
++}
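
Under an infinite mapping the data-level and subflow-level sequence spaces advance in lockstep, so the constant offset between the two snd_nxt values maps any subflow-level cumulative ACK onto a data-level one. A worked sketch (editorial, hypothetical helper and numbers):

#include <stdint.h>

/* Mirror of the computation in mptcp_clean_rtx_infinite above. With
 * meta_snd_nxt = 5000 and sub_snd_nxt = 3000 the offset is 2000, so a
 * subflow-level snd_una of 2800 implies a data-level ack of 4800.
 */
static uint32_t infer_data_ack(uint32_t meta_snd_nxt,
			       uint32_t sub_snd_nxt, uint32_t sub_snd_una)
{
	return meta_snd_nxt - sub_snd_nxt + sub_snd_una;
}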
++
++/**** static functions used by mptcp_parse_options */
++
++static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id)
++{
++	struct sock *sk_it, *tmpsk;
++
++	mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
++		if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) {
++			mptcp_reinject_data(sk_it, 0);
++			sk_it->sk_err = ECONNRESET;
++			if (tcp_need_reset(sk_it->sk_state))
++				tcp_sk(sk_it)->ops->send_active_reset(sk_it,
++								      GFP_ATOMIC);
++			mptcp_sub_force_close(sk_it);
++		}
++	}
++}
++
++void mptcp_parse_options(const uint8_t *ptr, int opsize,
++			 struct mptcp_options_received *mopt,
++			 const struct sk_buff *skb)
++{
++	const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr;
++
++	/* If the socket is mp-capable we would have a mopt. */
++	if (!mopt)
++		return;
++
++	switch (mp_opt->sub) {
++	case MPTCP_SUB_CAPABLE:
++	{
++		const struct mp_capable *mpcapable = (struct mp_capable *)ptr;
++
++		if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN &&
++		    opsize != MPTCP_SUB_LEN_CAPABLE_ACK) {
++			mptcp_debug("%s: mp_capable: bad option size %d\n",
++				    __func__, opsize);
++			break;
++		}
++
++		if (!sysctl_mptcp_enabled)
++			break;
++
++		/* We only support MPTCP version 0 */
++		if (mpcapable->ver != 0)
++			break;
++
++		/* MPTCP-RFC 6824:
++		 * "If receiving a message with the 'B' flag set to 1, and this
++		 * is not understood, then this SYN MUST be silently ignored;
++		 */
++		if (mpcapable->b) {
++			mopt->drop_me = 1;
++			break;
++		}
++
++		/* MPTCP-RFC 6824:
++		 * "An implementation that only supports this method MUST set
++		 * bit "H" to 1, and bits "C" through "G" to 0."
++		 */
++		if (!mpcapable->h)
++			break;
++
++		mopt->saw_mpc = 1;
++		mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a;
++
++		if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN)
++			mopt->mptcp_key = mpcapable->sender_key;
++
++		break;
++	}
++	case MPTCP_SUB_JOIN:
++	{
++		const struct mp_join *mpjoin = (struct mp_join *)ptr;
++
++		if (opsize != MPTCP_SUB_LEN_JOIN_SYN &&
++		    opsize != MPTCP_SUB_LEN_JOIN_SYNACK &&
++		    opsize != MPTCP_SUB_LEN_JOIN_ACK) {
++			mptcp_debug("%s: mp_join: bad option size %d\n",
++				    __func__, opsize);
++			break;
++		}
++
++		/* saw_mpc must be set, because in tcp_check_req we assume that
++		 * it is set to support falling back to reg. TCP if a rexmitted
++		 * SYN has no MP_CAPABLE or MP_JOIN
++		 */
++		switch (opsize) {
++		case MPTCP_SUB_LEN_JOIN_SYN:
++			mopt->is_mp_join = 1;
++			mopt->saw_mpc = 1;
++			mopt->low_prio = mpjoin->b;
++			mopt->rem_id = mpjoin->addr_id;
++			mopt->mptcp_rem_token = mpjoin->u.syn.token;
++			mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce;
++			break;
++		case MPTCP_SUB_LEN_JOIN_SYNACK:
++			mopt->saw_mpc = 1;
++			mopt->low_prio = mpjoin->b;
++			mopt->rem_id = mpjoin->addr_id;
++			mopt->mptcp_recv_tmac = mpjoin->u.synack.mac;
++			mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce;
++			break;
++		case MPTCP_SUB_LEN_JOIN_ACK:
++			mopt->saw_mpc = 1;
++			mopt->join_ack = 1;
++			memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20);
++			break;
++		}
++		break;
++	}
++	case MPTCP_SUB_DSS:
++	{
++		const struct mp_dss *mdss = (struct mp_dss *)ptr;
++		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
++
++		/* We check opsize for the csum and non-csum case. We do this,
++		 * because the draft says that the csum SHOULD be ignored if
++		 * it has not been negotiated in the MP_CAPABLE but still is
++		 * present in the data.
++		 *
++		 * It will get ignored later in mptcp_queue_skb.
++		 */
++		if (opsize != mptcp_sub_len_dss(mdss, 0) &&
++		    opsize != mptcp_sub_len_dss(mdss, 1)) {
++			mptcp_debug("%s: mp_dss: bad option size %d\n",
++				    __func__, opsize);
++			break;
++		}
++
++		ptr += 4;
++
++		if (mdss->A) {
++			tcb->mptcp_flags |= MPTCPHDR_ACK;
++
++			if (mdss->a) {
++				mopt->data_ack = (u32) get_unaligned_be64(ptr);
++				ptr += MPTCP_SUB_LEN_ACK_64;
++			} else {
++				mopt->data_ack = get_unaligned_be32(ptr);
++				ptr += MPTCP_SUB_LEN_ACK;
++			}
++		}
++
++		tcb->dss_off = (ptr - skb_transport_header(skb));
++
++		if (mdss->M) {
++			if (mdss->m) {
++				u64 data_seq64 = get_unaligned_be64(ptr);
++
++				tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET;
++				mopt->data_seq = (u32) data_seq64;
++
++				ptr += 12; /* 64-bit dseq + subseq */
++			} else {
++				mopt->data_seq = get_unaligned_be32(ptr);
++				ptr += 8; /* 32-bit dseq + subseq */
++			}
++			mopt->data_len = get_unaligned_be16(ptr);
++
++			tcb->mptcp_flags |= MPTCPHDR_SEQ;
++
++			/* Is a check-sum present? */
++			if (opsize == mptcp_sub_len_dss(mdss, 1))
++				tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM;
++
++			/* DATA_FIN only possible with DSS-mapping */
++			if (mdss->F)
++				tcb->mptcp_flags |= MPTCPHDR_FIN;
++		}
++
++		break;
++	}
++	case MPTCP_SUB_ADD_ADDR:
++	{
++#if IS_ENABLED(CONFIG_IPV6)
++		const struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
++
++		if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
++		     opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
++		    (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
++		     opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) {
++#else
++		if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
++		    opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) {
++#endif /* CONFIG_IPV6 */
++			mptcp_debug("%s: mp_add_addr: bad option size %d\n",
++				    __func__, opsize);
++			break;
++		}
++
++		/* We have to manually parse the options if we got two of them. */
++		if (mopt->saw_add_addr) {
++			mopt->more_add_addr = 1;
++			break;
++		}
++		mopt->saw_add_addr = 1;
++		mopt->add_addr_ptr = ptr;
++		break;
++	}
++	case MPTCP_SUB_REMOVE_ADDR:
++		if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
++			mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
++				    __func__, opsize);
++			break;
++		}
++
++		if (mopt->saw_rem_addr) {
++			mopt->more_rem_addr = 1;
++			break;
++		}
++		mopt->saw_rem_addr = 1;
++		mopt->rem_addr_ptr = ptr;
++		break;
++	case MPTCP_SUB_PRIO:
++	{
++		const struct mp_prio *mpprio = (struct mp_prio *)ptr;
++
++		if (opsize != MPTCP_SUB_LEN_PRIO &&
++		    opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
++			mptcp_debug("%s: mp_prio: bad option size %d\n",
++				    __func__, opsize);
++			break;
++		}
++
++		mopt->saw_low_prio = 1;
++		mopt->low_prio = mpprio->b;
++
++		if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
++			mopt->saw_low_prio = 2;
++			mopt->prio_addr_id = mpprio->addr_id;
++		}
++		break;
++	}
++	case MPTCP_SUB_FAIL:
++		if (opsize != MPTCP_SUB_LEN_FAIL) {
++			mptcp_debug("%s: mp_fail: bad option size %d\n",
++				    __func__, opsize);
++			break;
++		}
++		mopt->mp_fail = 1;
++		break;
++	case MPTCP_SUB_FCLOSE:
++		if (opsize != MPTCP_SUB_LEN_FCLOSE) {
++			mptcp_debug("%s: mp_fclose: bad option size %d\n",
++				    __func__, opsize);
++			break;
++		}
++
++		mopt->mp_fclose = 1;
++		mopt->mptcp_key = ((struct mp_fclose *)ptr)->key;
++
++		break;
++	default:
++		mptcp_debug("%s: Received unknown subtype: %d\n",
++			    __func__, mp_opt->sub);
++		break;
++	}
++}
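
The pointer arithmetic in the MPTCP_SUB_DSS case (advance by 4, then by 4 or 8 for the DATA_ACK, then by 8 or 12 for the mapping, plus a 2-byte data-level length and an optional 2-byte checksum) follows the DSS layout of RFC 6824, Section 3.3. The sketch below is editorial (a hypothetical stand-in for what mptcp_sub_len_dss is expected to compute), showing the resulting option length:

/* Editorial sketch, not part of the patch: expected DSS option length per
 * RFC 6824 Section 3.3. A DSS carrying a 32-bit DATA_ACK plus a 32-bit
 * mapping with checksum is 4 + 4 + (8 + 2 + 2) = 20 bytes.
 */
static int dss_expected_len(int has_ack, int ack64,
			    int has_map, int dseq64, int has_csum)
{
	int len = 4;			/* kind, length, subtype, flags */

	if (has_ack)
		len += ack64 ? 8 : 4;	/* DATA_ACK */
	if (has_map) {
		len += dseq64 ? 12 : 8;	/* data seq + subflow seq */
		len += 2;		/* data-level length */
		if (has_csum)
			len += 2;	/* DSS checksum */
	}
	return len;
}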
++
++/** Parse only MPTCP options */
++void tcp_parse_mptcp_options(const struct sk_buff *skb,
++			     struct mptcp_options_received *mopt)
++{
++	const struct tcphdr *th = tcp_hdr(skb);
++	int length = (th->doff * 4) - sizeof(struct tcphdr);
++	const unsigned char *ptr = (const unsigned char *)(th + 1);
++
++	while (length > 0) {
++		int opcode = *ptr++;
++		int opsize;
++
++		switch (opcode) {
++		case TCPOPT_EOL:
++			return;
++		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
++			length--;
++			continue;
++		default:
++			opsize = *ptr++;
++			if (opsize < 2)	/* "silly options" */
++				return;
++			if (opsize > length)
++				return;	/* don't parse partial options */
++			if (opcode == TCPOPT_MPTCP)
++				mptcp_parse_options(ptr - 2, opsize, mopt, skb);
++		}
++		ptr += opsize - 2;
++		length -= opsize;
++	}
++}
++
++int mptcp_check_rtt(const struct tcp_sock *tp, int time)
++{
++	struct mptcp_cb *mpcb = tp->mpcb;
++	struct sock *sk;
++	u32 rtt_max = 0;
++
++	/* In MPTCP, we take the max delay across all flows,
++	 * in order to take into account meta-reordering buffers.
++	 */
++	mptcp_for_each_sk(mpcb, sk) {
++		if (!mptcp_sk_can_recv(sk))
++			continue;
++
++		if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt)
++			rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt;
++	}
++	if (time < (rtt_max >> 3) || !rtt_max)
++		return 1;
++
++	return 0;
++}
++
++static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk)
++{
++	struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
++	struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++	__be16 port = 0;
++	union inet_addr addr;
++	sa_family_t family;
++
++	if (mpadd->ipver == 4) {
++		if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2)
++			port = mpadd->u.v4.port;
++		family = AF_INET;
++		addr.in = mpadd->u.v4.addr;
++#if IS_ENABLED(CONFIG_IPV6)
++	} else if (mpadd->ipver == 6) {
++		if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2)
++			port = mpadd->u.v6.port;
++		family = AF_INET6;
++		addr.in6 = mpadd->u.v6.addr;
++#endif /* CONFIG_IPV6 */
++	} else {
++		return;
++	}
++
++	if (mpcb->pm_ops->add_raddr)
++		mpcb->pm_ops->add_raddr(mpcb, &addr, family, port, mpadd->addr_id);
++}
++
++static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk)
++{
++	struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
++	int i;
++	u8 rem_id;
++	struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++
++	for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) {
++		rem_id = (&mprem->addrs_id)[i];
++
++		if (mpcb->pm_ops->rem_raddr)
++			mpcb->pm_ops->rem_raddr(mpcb, rem_id);
++		mptcp_send_reset_rem_id(mpcb, rem_id);
++	}
++}
++
++static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk)
++{
++	struct tcphdr *th = tcp_hdr(skb);
++	unsigned char *ptr;
++	int length = (th->doff * 4) - sizeof(struct tcphdr);
++
++	/* Jump through the options to check whether ADD_ADDR is there */
++	ptr = (unsigned char *)(th + 1);
++	while (length > 0) {
++		int opcode = *ptr++;
++		int opsize;
++
++		switch (opcode) {
++		case TCPOPT_EOL:
++			return;
++		case TCPOPT_NOP:
++			length--;
++			continue;
++		default:
++			opsize = *ptr++;
++			if (opsize < 2)
++				return;
++			if (opsize > length)
++				return;	/* don't parse partial options */
++			if (opcode == TCPOPT_MPTCP &&
++			    ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) {
++#if IS_ENABLED(CONFIG_IPV6)
++				struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
++				if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
++				     opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
++				    (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
++				     opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2))
++#else
++				if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
++				    opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2)
++#endif /* CONFIG_IPV6 */
++					goto cont;
++
++				mptcp_handle_add_addr(ptr, sk);
++			}
++			if (opcode == TCPOPT_MPTCP &&
++			    ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) {
++				if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0)
++					goto cont;
++
++				mptcp_handle_rem_addr(ptr, sk);
++			}
++cont:
++			ptr += opsize - 2;
++			length -= opsize;
++		}
++	}
++	return;
++}
++
++static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th)
++{
++	struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
++	struct sock *meta_sk = mptcp_meta_sk(sk);
++	struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++
++	if (unlikely(mptcp->rx_opt.mp_fail)) {
++		mptcp->rx_opt.mp_fail = 0;
++
++		if (!th->rst && !mpcb->infinite_mapping_snd) {
++			struct sock *sk_it;
++
++			mpcb->send_infinite_mapping = 1;
++			/* We resend everything that has not been acknowledged */
++			meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
++
++			/* We artificially restart the whole send-queue. Thus,
++			 * it is as if no packets are in flight
++			 */
++			tcp_sk(meta_sk)->packets_out = 0;
++
++			/* If the snd_nxt already wrapped around, we have to
++			 * undo the wrapping, as we are restarting from snd_una
++			 * on.
++			 */
++			if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) {
++				mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2;
++				mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
++			}
++			tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una;
++
++			/* Trigger a sending on the meta. */
++			mptcp_push_pending_frames(meta_sk);
++
++			mptcp_for_each_sk(mpcb, sk_it) {
++				if (sk != sk_it)
++					mptcp_sub_force_close(sk_it);
++			}
++		}
++
++		return 0;
++	}
++
++	if (unlikely(mptcp->rx_opt.mp_fclose)) {
++		struct sock *sk_it, *tmpsk;
++
++		mptcp->rx_opt.mp_fclose = 0;
++		if (mptcp->rx_opt.mptcp_key != mpcb->mptcp_loc_key)
++			return 0;
++
++		if (tcp_need_reset(sk->sk_state))
++			tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
++
++		mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk)
++			mptcp_sub_force_close(sk_it);
++
++		tcp_reset(meta_sk);
++
++		return 1;
++	}
++
++	return 0;
++}
++
++static inline void mptcp_path_array_check(struct sock *meta_sk)
++{
++	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++
++	if (unlikely(mpcb->list_rcvd)) {
++		mpcb->list_rcvd = 0;
++		if (mpcb->pm_ops->new_remote_address)
++			mpcb->pm_ops->new_remote_address(meta_sk);
++	}
++}
++
++int mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
++			 const struct sk_buff *skb)
++{
++	struct tcp_sock *tp = tcp_sk(sk);
++	struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
++
++	if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd)
++		return 0;
++
++	if (mptcp_mp_fail_rcvd(sk, th))
++		return 1;
++
++	/* RFC 6824, Section 3.3:
++	 * If a checksum is not present when its use has been negotiated, the
++	 * receiver MUST close the subflow with a RST as it is considered broken.
++	 */
++	if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum &&
++	    !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) {
++		if (tcp_need_reset(sk->sk_state))
++			tp->ops->send_active_reset(sk, GFP_ATOMIC);
++
++		mptcp_sub_force_close(sk);
++		return 1;
++	}
++
++	/* We have to acknowledge retransmissions of the third
++	 * ack.
++	 */
++	if (mopt->join_ack) {
++		tcp_send_delayed_ack(sk);
++		mopt->join_ack = 0;
++	}
++
++	if (mopt->saw_add_addr || mopt->saw_rem_addr) {
++		if (mopt->more_add_addr || mopt->more_rem_addr) {
++			mptcp_parse_addropt(skb, sk);
++		} else {
++			if (mopt->saw_add_addr)
++				mptcp_handle_add_addr(mopt->add_addr_ptr, sk);
++			if (mopt->saw_rem_addr)
++				mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk);
++		}
++
++		mopt->more_add_addr = 0;
++		mopt->saw_add_addr = 0;
++		mopt->more_rem_addr = 0;
++		mopt->saw_rem_addr = 0;
++	}
++	if (mopt->saw_low_prio) {
++		if (mopt->saw_low_prio == 1) {
++			tp->mptcp->rcv_low_prio = mopt->low_prio;
++		} else {
++			struct sock *sk_it;
++			mptcp_for_each_sk(tp->mpcb, sk_it) {
++				struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp;
++				if (mptcp->rem_id == mopt->prio_addr_id)
++					mptcp->rcv_low_prio = mopt->low_prio;
++			}
++		}
++		mopt->saw_low_prio = 0;
++	}
++
++	mptcp_data_ack(sk, skb);
++
++	mptcp_path_array_check(mptcp_meta_sk(sk));
++	/* Socket may have been mp_killed by a REMOVE_ADDR */
++	if (tp->mp_killed)
++		return 1;
++
++	return 0;
++}
++
++/* In case of fastopen, some data can already be in the write queue.
++ * We need to update the sequence numbers of those segments, since they
++ * initially carried TCP (subflow) sequence numbers.
++ */
++static void mptcp_rcv_synsent_fastopen(struct sock *meta_sk)
++{
++	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++	struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk);
++	struct sk_buff *skb;
++	u32 new_mapping = meta_tp->write_seq - master_tp->snd_una;
++
++	/* There should only be one skb in write queue: the data not
++	 * acknowledged in the SYN+ACK. In this case, we need to map
++	 * this data to data sequence numbers.
++	 */
++	skb_queue_walk(&meta_sk->sk_write_queue, skb) {
++		/* If the server only partially acknowledges the data sent in
++		 * the SYN, we need to trim the acknowledged part, because
++		 * we don't want to retransmit this already received data.
++		 * When we reach this point, tcp_ack() has already cleaned up
++		 * fully acked segments. However, tcp trims partially acked
++		 * segments only when retransmitting. Since MPTCP comes into
++		 * play only now, we will fake an initial transmit, and
++		 * retransmit_skb() will not be called. The following fragment
++		 * comes from __tcp_retransmit_skb().
++		 */
++		if (before(TCP_SKB_CB(skb)->seq, master_tp->snd_una)) {
++			BUG_ON(before(TCP_SKB_CB(skb)->end_seq,
++				      master_tp->snd_una));
++			/* tcp_trim_head can only return ENOMEM if the skb is
++			 * cloned. That is not the case here (see
++			 * tcp_send_syn_data).
++			 */
++			BUG_ON(tcp_trim_head(meta_sk, skb, master_tp->snd_una -
++					     TCP_SKB_CB(skb)->seq));
++		}
++
++		TCP_SKB_CB(skb)->seq += new_mapping;
++		TCP_SKB_CB(skb)->end_seq += new_mapping;
++	}
++
++	/* We can advance write_seq by the number of bytes unacknowledged
++	 * and that were mapped in the previous loop.
++	 */
++	meta_tp->write_seq += master_tp->write_seq - master_tp->snd_una;
++
++	/* The packets from the master_sk will be entailed to it later.
++	 * Until that time, its write queue is empty, and
++	 * write_seq must align with snd_una.
++	 */
++	master_tp->snd_nxt = master_tp->write_seq = master_tp->snd_una;
++	master_tp->packets_out = 0;
++
++	/* Although this data has already been sent over the subsk,
++	 * it has never been sent over the meta_sk, so we rewind
++	 * the send_head so that tcp considers it as an initial send
++	 * (instead of a retransmit).
++	 */
++	meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
++}
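
The remapping above is pure offset arithmetic: every byte still unacknowledged at subflow level is shifted into the data-sequence space by new_mapping = meta write_seq - master snd_una. A worked sketch (editorial, hypothetical helper and numbers, not part of the patch):

#include <stdint.h>

/* Suppose the TFO SYN carried subflow bytes 101..200 and the server acked
 * up to 141 (snd_una = 141, 60 bytes left). With meta write_seq = 1,
 * new_mapping = 1 - 141 (modulo 2^32), so the remaining bytes are
 * remapped to start at data-level sequence 1.
 */
static void remap_seq(uint32_t *seq, uint32_t *end_seq,
		      uint32_t meta_write_seq, uint32_t master_snd_una)
{
	uint32_t new_mapping = meta_write_seq - master_snd_una;

	*seq += new_mapping;	/* wraps modulo 2^32, like the patch */
	*end_seq += new_mapping;
}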
14262 |
++ |
14263 |
++/* The skptr is needed, because if we become MPTCP-capable, we have to switch |
14264 |
++ * from meta-socket to master-socket. |
14265 |
++ * |
14266 |
++ * @return: 1 - we want to reset this connection |
14267 |
++ * 2 - we want to discard the received syn/ack |
14268 |
++ * 0 - everything is fine - continue |
14269 |
++ */ |
14270 |
++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr, |
14271 |
++ const struct sk_buff *skb, |
14272 |
++ const struct mptcp_options_received *mopt) |
14273 |
++{ |
14274 |
++ struct tcp_sock *tp = tcp_sk(sk); |
14275 |
++ |
14276 |
++ if (mptcp(tp)) { |
14277 |
++ u8 hash_mac_check[20]; |
14278 |
++ struct mptcp_cb *mpcb = tp->mpcb; |
14279 |
++ |
14280 |
++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, |
14281 |
++ (u8 *)&mpcb->mptcp_loc_key, |
14282 |
++ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce, |
14283 |
++ (u8 *)&tp->mptcp->mptcp_loc_nonce, |
14284 |
++ (u32 *)hash_mac_check); |
14285 |
++ if (memcmp(hash_mac_check, |
14286 |
++ (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) { |
14287 |
++ mptcp_sub_force_close(sk); |
14288 |
++ return 1; |
14289 |
++ } |
14290 |
++ |
14291 |
++ /* Set this flag in order to postpone data sending |
14292 |
++ * until the 4th ack arrives. |
14293 |
++ */ |
14294 |
++ tp->mptcp->pre_established = 1; |
14295 |
++ tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio; |
14296 |
++ |
14297 |
++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key, |
14298 |
++ (u8 *)&mpcb->mptcp_rem_key, |
14299 |
++ (u8 *)&tp->mptcp->mptcp_loc_nonce, |
14300 |
++ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce, |
14301 |
++ (u32 *)&tp->mptcp->sender_mac[0]); |
14302 |
++ |
14303 |
++ } else if (mopt->saw_mpc) { |
14304 |
++ struct sock *meta_sk = sk; |
14305 |
++ |
14306 |
++ if (mptcp_create_master_sk(sk, mopt->mptcp_key, |
14307 |
++ ntohs(tcp_hdr(skb)->window))) |
14308 |
++ return 2; |
14309 |
++ |
14310 |
++ sk = tcp_sk(sk)->mpcb->master_sk; |
14311 |
++ *skptr = sk; |
14312 |
++ tp = tcp_sk(sk); |
14313 |
++ |
14314 |
++ /* If fastopen was used data might be in the send queue. We |
14315 |
++ * need to update their sequence number to MPTCP-level seqno. |
14316 |
++ * Note that it can happen in rare cases that fastopen_req is |
14317 |
++ * NULL and syn_data is 0 but fastopen indeed occurred and |
14318 |
++ * data has been queued in the write queue (but not sent). |
14319 |
++ * Example of such rare cases: connect is non-blocking and |
14320 |
++ * TFO is configured to work without cookies. |
14321 |
++ */ |
14322 |
++ if (!skb_queue_empty(&meta_sk->sk_write_queue)) |
14323 |
++ mptcp_rcv_synsent_fastopen(meta_sk); |
14324 |
++ |
14325 |
++ /* -1, because the SYN consumed 1 byte. In case of TFO, we |
14326 |
++ * start the subflow-sequence number as if the data of the SYN |
14327 |
++ * is not part of any mapping. |
14328 |
++ */ |
14329 |
++ tp->mptcp->snt_isn = tp->snd_una - 1; |
14330 |
++ tp->mpcb->dss_csum = mopt->dss_csum; |
14331 |
++ tp->mptcp->include_mpc = 1; |
14332 |
++ |
14333 |
++ /* Ensure that fastopen is handled at the meta-level. */ |
14334 |
++ tp->fastopen_req = NULL; |
14335 |
++ |
14336 |
++ sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket); |
14337 |
++ sk->sk_wq = mptcp_meta_sk(sk)->sk_wq; |
14338 |
++ |
14339 |
++ /* hold in sk_clone_lock due to initialization to 2 */ |
14340 |
++ sock_put(sk); |
14341 |
++ } else { |
14342 |
++ tp->request_mptcp = 0; |
14343 |
++ |
14344 |
++ if (tp->inside_tk_table) |
14345 |
++ mptcp_hash_remove(tp); |
14346 |
++ } |
14347 |
++ |
14348 |
++ if (mptcp(tp)) |
14349 |
++ tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq; |
14350 |
++ |
14351 |
++ return 0; |
14352 |
++} |
14353 |
++ |
14354 |
++bool mptcp_should_expand_sndbuf(const struct sock *sk) |
14355 |
++{ |
14356 |
++ const struct sock *sk_it; |
14357 |
++ const struct sock *meta_sk = mptcp_meta_sk(sk); |
14358 |
++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
14359 |
++ int cnt_backups = 0; |
14360 |
++ int backup_available = 0; |
14361 |
++ |
14362 |
++ /* We circumvent this check in tcp_check_space, because we want to |
14363 |
++ * always call sk_write_space. So, we reproduce the check here. |
14364 |
++ */ |
14365 |
++ if (!meta_sk->sk_socket || |
14366 |
++ !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) |
14367 |
++ return false; |
14368 |
++ |
14369 |
++ /* If the user specified a specific send buffer setting, do |
14370 |
++ * not modify it. |
14371 |
++ */ |
14372 |
++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK) |
14373 |
++ return false; |
14374 |
++ |
14375 |
++ /* If we are under global TCP memory pressure, do not expand. */ |
14376 |
++ if (sk_under_memory_pressure(meta_sk)) |
14377 |
++ return false; |
14378 |
++ |
14379 |
++ /* If we are under soft global TCP memory pressure, do not expand. */ |
14380 |
++ if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0)) |
14381 |
++ return false; |
14382 |
++ |
14383 |
++ |
14384 |
++ /* For MPTCP we look for a subsocket that could send data. |
14385 |
++ * If we found one, then we update the send-buffer. |
14386 |
++ */ |
14387 |
++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) { |
14388 |
++ struct tcp_sock *tp_it = tcp_sk(sk_it); |
14389 |
++ |
14390 |
++ if (!mptcp_sk_can_send(sk_it)) |
14391 |
++ continue; |
14392 |
++ |
14393 |
++ /* Backup-flows have to be counted - if there is no other |
14394 |
++ * subflow we take the backup-flow into account. |
14395 |
++ */ |
14396 |
++ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) |
14397 |
++ cnt_backups++; |
14398 |
++ |
14399 |
++ if (tp_it->packets_out < tp_it->snd_cwnd) { |
14400 |
++ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) { |
14401 |
++ backup_available = 1; |
14402 |
++ continue; |
14403 |
++ } |
14404 |
++ return true; |
14405 |
++ } |
14406 |
++ } |
14407 |
++ |
14408 |
++ /* Backup-flow is available for sending - update send-buffer */ |
14409 |
++ if (meta_tp->mpcb->cnt_established == cnt_backups && backup_available) |
14410 |
++ return true; |
14411 |
++ return false; |
14412 |
++} |
14413 |
++ |
14414 |
++void mptcp_init_buffer_space(struct sock *sk) |
14415 |
++{ |
14416 |
++ struct tcp_sock *tp = tcp_sk(sk); |
14417 |
++ struct sock *meta_sk = mptcp_meta_sk(sk); |
14418 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
14419 |
++ int space; |
14420 |
++ |
14421 |
++ tcp_init_buffer_space(sk); |
14422 |
++ |
14423 |
++ if (is_master_tp(tp)) { |
14424 |
++ meta_tp->rcvq_space.space = meta_tp->rcv_wnd; |
14425 |
++ meta_tp->rcvq_space.time = tcp_time_stamp; |
14426 |
++ meta_tp->rcvq_space.seq = meta_tp->copied_seq; |
14427 |
++ |
14428 |
++ /* If there is only one subflow, we just use regular TCP |
14429 |
++ * autotuning. User-locks are handled already by |
14430 |
++ * tcp_init_buffer_space |
14431 |
++ */ |
14432 |
++ meta_tp->window_clamp = tp->window_clamp; |
14433 |
++ meta_tp->rcv_ssthresh = tp->rcv_ssthresh; |
14434 |
++ meta_sk->sk_rcvbuf = sk->sk_rcvbuf; |
14435 |
++ meta_sk->sk_sndbuf = sk->sk_sndbuf; |
14436 |
++ |
14437 |
++ return; |
14438 |
++ } |
14439 |
++ |
14440 |
++ if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK) |
14441 |
++ goto snd_buf; |
14442 |
++ |
14443 |
++ /* Adding a new subflow to the rcv-buffer space. We make a simple |
14444 |
++ * addition, to give some space to allow traffic on the new subflow. |
14445 |
++ * Autotuning will increase it further later on. |
14446 |
++ */ |
14447 |
++ space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]); |
14448 |
++ if (space > meta_sk->sk_rcvbuf) { |
14449 |
++ meta_tp->window_clamp += tp->window_clamp; |
14450 |
++ meta_tp->rcv_ssthresh += tp->rcv_ssthresh; |
14451 |
++ meta_sk->sk_rcvbuf = space; |
14452 |
++ } |
14453 |
++ |
14454 |
++snd_buf: |
14455 |
++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK) |
14456 |
++ return; |
14457 |
++ |
14458 |
++ /* Adding a new subflow to the send-buffer space. We make a simple |
14459 |
++ * addition, to give some space to allow traffic on the new subflow. |
14460 |
++ * Autotuning will increase it further later on. |
14461 |
++ */ |
14462 |
++ space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, sysctl_tcp_wmem[2]); |
14463 |
++ if (space > meta_sk->sk_sndbuf) { |
14464 |
++ meta_sk->sk_sndbuf = space; |
14465 |
++ meta_sk->sk_write_space(meta_sk); |
14466 |
++ } |
14467 |
++} |
14468 |
++ |
14469 |
++void mptcp_tcp_set_rto(struct sock *sk) |
14470 |
++{ |
14471 |
++ tcp_set_rto(sk); |
14472 |
++ mptcp_set_rto(sk); |
14473 |
++} |
14474 |
+diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c
+new file mode 100644
+index 000000000000..1183d1305d35
+--- /dev/null
++++ b/net/mptcp/mptcp_ipv4.c
+@@ -0,0 +1,483 @@
++/*
++ * MPTCP implementation - IPv4-specific functions
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@×××××××××.be>
++ *
++ * Current Maintainer:
++ * Christoph Paasch <christoph.paasch@×××××××××.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
++ * Gregory Detal <gregory.detal@×××××××××.be>
++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
++ * Lavkesh Lahngir <lavkesh51@×××××.com>
++ * Andreas Ripke <ripke@××××××.eu>
++ * Vlad Dogaru <vlad.dogaru@×××××.com>
++ * Octavian Purdila <octavian.purdila@×××××.com>
++ * John Ronan <jronan@××××.org>
++ * Catalin Nicutar <catalin.nicutar@×××××.com>
++ * Brandon Heller <brandonh@××××××××.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/export.h>
++#include <linux/ip.h>
++#include <linux/list.h>
++#include <linux/skbuff.h>
++#include <linux/spinlock.h>
++#include <linux/tcp.h>
++
++#include <net/inet_common.h>
++#include <net/inet_connection_sock.h>
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
++#include <net/request_sock.h>
++#include <net/tcp.h>
++
++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
++{
++	u32 hash[MD5_DIGEST_WORDS];
++
++	hash[0] = (__force u32)saddr;
++	hash[1] = (__force u32)daddr;
++	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
++	hash[3] = mptcp_seed++;
++
++	md5_transform(hash, mptcp_secret);
++
++	return hash[0];
++}
++
++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
++{
++	u32 hash[MD5_DIGEST_WORDS];
++
++	hash[0] = (__force u32)saddr;
++	hash[1] = (__force u32)daddr;
++	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
++	hash[3] = mptcp_seed++;
++
++	md5_transform(hash, mptcp_secret);
++
++	return *((u64 *)hash);
++}
++
++
++static void mptcp_v4_reqsk_destructor(struct request_sock *req)
++{
++	mptcp_reqsk_destructor(req);
++
++	tcp_v4_reqsk_destructor(req);
++}
++
++static int mptcp_v4_init_req(struct request_sock *req, struct sock *sk,
++			     struct sk_buff *skb)
++{
++	tcp_request_sock_ipv4_ops.init_req(req, sk, skb);
++	mptcp_reqsk_init(req, skb);
++
++	return 0;
++}
++
++static int mptcp_v4_join_init_req(struct request_sock *req, struct sock *sk,
++				  struct sk_buff *skb)
++{
++	struct mptcp_request_sock *mtreq = mptcp_rsk(req);
++	struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++	union inet_addr addr;
++	int loc_id;
++	bool low_prio = false;
++
++	/* We need to do this as early as possible, because if we fail later
++	 * (e.g., get_local_id), then reqsk_free tries to remove the
++	 * request-socket from the htb in mptcp_hash_request_remove as pprev
++	 * may be different from NULL.
++	 */
++	mtreq->hash_entry.pprev = NULL;
++
++	tcp_request_sock_ipv4_ops.init_req(req, sk, skb);
++
++	mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(ip_hdr(skb)->saddr,
++						    ip_hdr(skb)->daddr,
++						    tcp_hdr(skb)->source,
++						    tcp_hdr(skb)->dest);
++	addr.ip = inet_rsk(req)->ir_loc_addr;
++	loc_id = mpcb->pm_ops->get_local_id(AF_INET, &addr, sock_net(sk), &low_prio);
++	if (loc_id == -1)
++		return -1;
++	mtreq->loc_id = loc_id;
++	mtreq->low_prio = low_prio;
++
++	mptcp_join_reqsk_init(mpcb, req, skb);
++
++	return 0;
++}
++
++/* Similar to tcp_request_sock_ops */
++struct request_sock_ops mptcp_request_sock_ops __read_mostly = {
++	.family = PF_INET,
++	.obj_size = sizeof(struct mptcp_request_sock),
++	.rtx_syn_ack = tcp_rtx_synack,
++	.send_ack = tcp_v4_reqsk_send_ack,
++	.destructor = mptcp_v4_reqsk_destructor,
++	.send_reset = tcp_v4_send_reset,
++	.syn_ack_timeout = tcp_syn_ack_timeout,
++};
++
++static void mptcp_v4_reqsk_queue_hash_add(struct sock *meta_sk,
++					  struct request_sock *req,
++					  const unsigned long timeout)
++{
++	const u32 h1 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
++				      inet_rsk(req)->ir_rmt_port,
++				      0, MPTCP_HASH_SIZE);
++	/* We cannot call inet_csk_reqsk_queue_hash_add(), because we do not
++	 * want to reset the keepalive-timer (responsible for retransmitting
++	 * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
++	 * overload the keepalive timer. Also, it's not a big deal, because the
++	 * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
++	 * if the third ACK gets lost, the client will handle the retransmission
++	 * anyways. If our SYN/ACK gets lost, the client will retransmit the
++	 * SYN.
++	 */
++	struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
++	struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
++	const u32 h2 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
++				      inet_rsk(req)->ir_rmt_port,
++				      lopt->hash_rnd, lopt->nr_table_entries);
++
++	reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
++	if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0)
++		mptcp_reset_synack_timer(meta_sk, timeout);
++
++	rcu_read_lock();
++	spin_lock(&mptcp_reqsk_hlock);
++	hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]);
++	spin_unlock(&mptcp_reqsk_hlock);
++	rcu_read_unlock();
++}
++
++/* Similar to tcp_v4_conn_request */
++static int mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb)
++{
++	return tcp_conn_request(&mptcp_request_sock_ops,
++				&mptcp_join_request_sock_ipv4_ops,
++				meta_sk, skb);
++}
++
++/* We only process join requests here (either the SYN or the final ACK). */
++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
++{
++	const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++	struct sock *child, *rsk = NULL;
++	int ret;
++
++	if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
++		struct tcphdr *th = tcp_hdr(skb);
++		const struct iphdr *iph = ip_hdr(skb);
++		struct sock *sk;
++
++		sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
++					     iph->saddr, th->source, iph->daddr,
++					     th->dest, inet_iif(skb));
++
++		if (!sk) {
++			kfree_skb(skb);
++			return 0;
++		}
++		if (is_meta_sk(sk)) {
++			WARN("%s Did not find a sub-sk - did find the meta!\n", __func__);
++			kfree_skb(skb);
++			sock_put(sk);
++			return 0;
++		}
++
++		if (sk->sk_state == TCP_TIME_WAIT) {
++			inet_twsk_put(inet_twsk(sk));
++			kfree_skb(skb);
++			return 0;
++		}
++
++		ret = tcp_v4_do_rcv(sk, skb);
++		sock_put(sk);
++
++		return ret;
++	}
++	TCP_SKB_CB(skb)->mptcp_flags = 0;
++
++	/* Has been removed from the tk-table. Thus, no new subflows.
++	 *
++	 * Check for close-state is necessary, because we may have been closed
++	 * without passing by mptcp_close().
++	 *
++	 * When falling back, no new subflows are allowed either.
++	 */
++	if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
++	    mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
++		goto reset_and_discard;
++
++	child = tcp_v4_hnd_req(meta_sk, skb);
++
++	if (!child)
++		goto discard;
++
++	if (child != meta_sk) {
++		sock_rps_save_rxhash(child, skb);
++		/* We don't call tcp_child_process here, because we already
++		 * hold the meta-sk-lock and are sure that it is not owned
++		 * by the user.
++		 */
++		ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
++		bh_unlock_sock(child);
++		sock_put(child);
++		if (ret) {
++			rsk = child;
++			goto reset_and_discard;
++		}
++	} else {
++		if (tcp_hdr(skb)->syn) {
++			mptcp_v4_join_request(meta_sk, skb);
++			goto discard;
++		}
++		goto reset_and_discard;
++	}
++	return 0;
++
++reset_and_discard:
++	if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) {
++		const struct tcphdr *th = tcp_hdr(skb);
++		const struct iphdr *iph = ip_hdr(skb);
++		struct request_sock **prev, *req;
++		/* If we end up here, it means we should not have matched on the
++		 * request-socket. But, because the request-sock queue is only
++		 * destroyed in mptcp_close, the socket may actually already be
++		 * in close-state (e.g., through shutdown()) while still having
++		 * pending request sockets.
++		 */
++		req = inet_csk_search_req(meta_sk, &prev, th->source,
++					  iph->saddr, iph->daddr);
++		if (req) {
++			inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
++			reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue,
++					    req);
++			reqsk_free(req);
++		}
++	}
++
++	tcp_v4_send_reset(rsk, skb);
++discard:
++	kfree_skb(skb);
++	return 0;
++}
++
++/* After this, the ref count of the meta_sk associated with the request_sock
++ * is incremented. Thus it is the responsibility of the caller
++ * to call sock_put() when the reference is not needed anymore.
++ */
++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
++				 const __be32 laddr, const struct net *net)
++{
++	const struct mptcp_request_sock *mtreq;
++	struct sock *meta_sk = NULL;
++	const struct hlist_nulls_node *node;
++	const u32 hash = inet_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE);
++
++	rcu_read_lock();
++begin:
++	hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash],
++				       hash_entry) {
++		struct inet_request_sock *ireq = inet_rsk(rev_mptcp_rsk(mtreq));
++		meta_sk = mtreq->mptcp_mpcb->meta_sk;
++
++		if (ireq->ir_rmt_port == rport &&
++		    ireq->ir_rmt_addr == raddr &&
++		    ireq->ir_loc_addr == laddr &&
++		    rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET &&
++		    net_eq(net, sock_net(meta_sk)))
++			goto found;
++		meta_sk = NULL;
++	}
++	/* A request-socket is destroyed by RCU. So, it might have been recycled
++	 * and put into another hash-table list. So, after the lookup we may
++	 * end up in a different list. So, we may need to restart.
++	 *
++	 * See also the comment in __inet_lookup_established.
++	 */
++	if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE)
++		goto begin;
++
++found:
++	if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
++		meta_sk = NULL;
++	rcu_read_unlock();
++
++	return meta_sk;
++}
++
++/* Create a new IPv4 subflow.
++ *
++ * We are in user-context and the meta-sock-lock is held.
++ */
++int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
++			   struct mptcp_rem4 *rem)
++{
++	struct tcp_sock *tp;
++	struct sock *sk;
++	struct sockaddr_in loc_in, rem_in;
++	struct socket sock;
++	int ret;
++
++	/** First, create and prepare the new socket */
++
++	sock.type = meta_sk->sk_socket->type;
++	sock.state = SS_UNCONNECTED;
++	sock.wq = meta_sk->sk_socket->wq;
++	sock.file = meta_sk->sk_socket->file;
++	sock.ops = NULL;
++
++	ret = inet_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
++	if (unlikely(ret < 0)) {
++		mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret);
++		return ret;
++	}
++
++	sk = sock.sk;
++	tp = tcp_sk(sk);
++
++	/* All subsockets need the MPTCP-lock-class */
++	lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
++	lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
++
++	if (mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL))
++		goto error;
++
++	tp->mptcp->slave_sk = 1;
++	tp->mptcp->low_prio = loc->low_prio;
++
++	/* Initializing the timer for an MPTCP subflow */
++	setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
++
++	/** Then, connect the socket to the peer */
++	loc_in.sin_family = AF_INET;
++	rem_in.sin_family = AF_INET;
++	loc_in.sin_port = 0;
++	if (rem->port)
++		rem_in.sin_port = rem->port;
++	else
++		rem_in.sin_port = inet_sk(meta_sk)->inet_dport;
++	loc_in.sin_addr = loc->addr;
++	rem_in.sin_addr = rem->addr;
++
++	ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in));
++	if (ret < 0) {
++		mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
++			    __func__, ret);
++		goto error;
++	}
++
++	mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d\n",
++		    __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
++		    tp->mptcp->path_index, &loc_in.sin_addr,
++		    ntohs(loc_in.sin_port), &rem_in.sin_addr,
++		    ntohs(rem_in.sin_port));
++
++	if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4)
++		tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr);
++
++	ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
++				sizeof(struct sockaddr_in), O_NONBLOCK);
++	if (ret < 0 && ret != -EINPROGRESS) {
++		mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
++			    __func__, ret);
++		goto error;
++	}
++
++	sk_set_socket(sk, meta_sk->sk_socket);
++	sk->sk_wq = meta_sk->sk_wq;
++
++	return 0;
++
++error:
++	/* May happen if mptcp_add_sock fails first */
++	if (!mptcp(tp)) {
++		tcp_close(sk, 0);
++	} else {
++		local_bh_disable();
++		mptcp_sub_force_close(sk);
++		local_bh_enable();
++	}
++	return ret;
++}
++EXPORT_SYMBOL(mptcp_init4_subsockets);
++
++const struct inet_connection_sock_af_ops mptcp_v4_specific = {
++	.queue_xmit = ip_queue_xmit,
++	.send_check = tcp_v4_send_check,
++	.rebuild_header = inet_sk_rebuild_header,
++	.sk_rx_dst_set = inet_sk_rx_dst_set,
++	.conn_request = mptcp_conn_request,
++	.syn_recv_sock = tcp_v4_syn_recv_sock,
++	.net_header_len = sizeof(struct iphdr),
++	.setsockopt = ip_setsockopt,
++	.getsockopt = ip_getsockopt,
++	.addr2sockaddr = inet_csk_addr2sockaddr,
++	.sockaddr_len = sizeof(struct sockaddr_in),
++	.bind_conflict = inet_csk_bind_conflict,
++#ifdef CONFIG_COMPAT
++	.compat_setsockopt = compat_ip_setsockopt,
++	.compat_getsockopt = compat_ip_getsockopt,
++#endif
++};
++
++struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
++struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
++
++/* General initialization of IPv4 for MPTCP */
++int mptcp_pm_v4_init(void)
++{
++	int ret = 0;
++	struct request_sock_ops *ops = &mptcp_request_sock_ops;
++
++	mptcp_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
++	mptcp_request_sock_ipv4_ops.init_req = mptcp_v4_init_req;
++
++	mptcp_join_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
++	mptcp_join_request_sock_ipv4_ops.init_req = mptcp_v4_join_init_req;
++	mptcp_join_request_sock_ipv4_ops.queue_hash_add = mptcp_v4_reqsk_queue_hash_add;
++
++	ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP");
++	if (ops->slab_name == NULL) {
++		ret = -ENOMEM;
++		goto out;
++	}
++
++	ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
++				      SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
++				      NULL);
++
++	if (ops->slab == NULL) {
++		ret = -ENOMEM;
++		goto err_reqsk_create;
++	}
++
++out:
++	return ret;
++
++err_reqsk_create:
++	kfree(ops->slab_name);
++	ops->slab_name = NULL;
++	goto out;
++}
++
++void mptcp_pm_v4_undo(void)
++{
++	kmem_cache_destroy(mptcp_request_sock_ops.slab);
++	kfree(mptcp_request_sock_ops.slab_name);
++}
+diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c
+new file mode 100644
+index 000000000000..1036973aa855
+--- /dev/null
++++ b/net/mptcp/mptcp_ipv6.c
+@@ -0,0 +1,518 @@
++/*
++ * MPTCP implementation - IPv6-specific functions
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@×××××××××.be>
++ *
++ * Current Maintainer:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
++ * Gregory Detal <gregory.detal@×××××××××.be>
++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
++ * Lavkesh Lahngir <lavkesh51@×××××.com>
++ * Andreas Ripke <ripke@××××××.eu>
++ * Vlad Dogaru <vlad.dogaru@×××××.com>
++ * Octavian Purdila <octavian.purdila@×××××.com>
++ * John Ronan <jronan@××××.org>
++ * Catalin Nicutar <catalin.nicutar@×××××.com>
++ * Brandon Heller <brandonh@××××××××.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/export.h>
++#include <linux/in6.h>
++#include <linux/kernel.h>
++
++#include <net/addrconf.h>
++#include <net/flow.h>
++#include <net/inet6_connection_sock.h>
++#include <net/inet6_hashtables.h>
++#include <net/inet_common.h>
++#include <net/ipv6.h>
++#include <net/ip6_checksum.h>
++#include <net/ip6_route.h>
++#include <net/mptcp.h>
++#include <net/mptcp_v6.h>
++#include <net/tcp.h>
++#include <net/transp_v6.h>
++
++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
++			 __be16 sport, __be16 dport)
++{
++	u32 secret[MD5_MESSAGE_BYTES / 4];
++	u32 hash[MD5_DIGEST_WORDS];
++	u32 i;
++
++	memcpy(hash, saddr, 16);
++	for (i = 0; i < 4; i++)
++		secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
++	secret[4] = mptcp_secret[4] +
++		    (((__force u16)sport << 16) + (__force u16)dport);
++	secret[5] = mptcp_seed++;
++	for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
++		secret[i] = mptcp_secret[i];
++
++	md5_transform(hash, secret);
++
++	return hash[0];
++}
++
++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
++		     __be16 sport, __be16 dport)
++{
++	u32 secret[MD5_MESSAGE_BYTES / 4];
++	u32 hash[MD5_DIGEST_WORDS];
++	u32 i;
++
++	memcpy(hash, saddr, 16);
++	for (i = 0; i < 4; i++)
++		secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
++	secret[4] = mptcp_secret[4] +
++		    (((__force u16)sport << 16) + (__force u16)dport);
++	secret[5] = mptcp_seed++;
++	for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
++		secret[i] = mptcp_secret[i];
++
++	md5_transform(hash, secret);
++
++	return *((u64 *)hash);
++}
++
++static void mptcp_v6_reqsk_destructor(struct request_sock *req)
++{
++	mptcp_reqsk_destructor(req);
++
++	tcp_v6_reqsk_destructor(req);
++}
++
++static int mptcp_v6_init_req(struct request_sock *req, struct sock *sk,
++			     struct sk_buff *skb)
++{
++	tcp_request_sock_ipv6_ops.init_req(req, sk, skb);
++	mptcp_reqsk_init(req, skb);
++
++	return 0;
++}
++
++static int mptcp_v6_join_init_req(struct request_sock *req, struct sock *sk,
++				  struct sk_buff *skb)
++{
++	struct mptcp_request_sock *mtreq = mptcp_rsk(req);
++	struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++	union inet_addr addr;
++	int loc_id;
++	bool low_prio = false;
++
++	/* We need to do this as early as possible, because if we fail later
++	 * (e.g., get_local_id), then reqsk_free tries to remove the
++	 * request-socket from the htb in mptcp_hash_request_remove as pprev
++	 * may be different from NULL.
++	 */
++	mtreq->hash_entry.pprev = NULL;
++
++	tcp_request_sock_ipv6_ops.init_req(req, sk, skb);
++
++	mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->saddr.s6_addr32,
++						    ipv6_hdr(skb)->daddr.s6_addr32,
++						    tcp_hdr(skb)->source,
++						    tcp_hdr(skb)->dest);
++	addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
++	loc_id = mpcb->pm_ops->get_local_id(AF_INET6, &addr, sock_net(sk), &low_prio);
++	if (loc_id == -1)
++		return -1;
++	mtreq->loc_id = loc_id;
++	mtreq->low_prio = low_prio;
++
++	mptcp_join_reqsk_init(mpcb, req, skb);
++
++	return 0;
++}
++
++/* Similar to tcp6_request_sock_ops */
++struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
++	.family = AF_INET6,
++	.obj_size = sizeof(struct mptcp_request_sock),
++	.rtx_syn_ack = tcp_v6_rtx_synack,
++	.send_ack = tcp_v6_reqsk_send_ack,
++	.destructor = mptcp_v6_reqsk_destructor,
++	.send_reset = tcp_v6_send_reset,
++	.syn_ack_timeout = tcp_syn_ack_timeout,
++};
++
++static void mptcp_v6_reqsk_queue_hash_add(struct sock *meta_sk,
++					  struct request_sock *req,
++					  const unsigned long timeout)
++{
++	const u32 h1 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
++				       inet_rsk(req)->ir_rmt_port,
++				       0, MPTCP_HASH_SIZE);
++	/* We cannot call inet6_csk_reqsk_queue_hash_add(), because we do not
++	 * want to reset the keepalive-timer (responsible for retransmitting
++	 * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
++	 * overload the keepalive timer. Also, it's not a big deal, because the
++	 * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
++	 * if the third ACK gets lost, the client will handle the retransmission
++	 * anyways. If our SYN/ACK gets lost, the client will retransmit the
++	 * SYN.
++	 */
++	struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
++	struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
++	const u32 h2 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
++				       inet_rsk(req)->ir_rmt_port,
++				       lopt->hash_rnd, lopt->nr_table_entries);
++
++	reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
++	if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0)
++		mptcp_reset_synack_timer(meta_sk, timeout);
++
++	rcu_read_lock();
++	spin_lock(&mptcp_reqsk_hlock);
++	hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]);
++	spin_unlock(&mptcp_reqsk_hlock);
++	rcu_read_unlock();
++}
++
++static int mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
++{
++	return tcp_conn_request(&mptcp6_request_sock_ops,
++				&mptcp_join_request_sock_ipv6_ops,
++				meta_sk, skb);
++}
++
++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
++{
++	const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++	struct sock *child, *rsk = NULL;
++	int ret;
++
++	if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
++		struct tcphdr *th = tcp_hdr(skb);
++		const struct ipv6hdr *ip6h = ipv6_hdr(skb);
++		struct sock *sk;
++
++		sk = __inet6_lookup_established(sock_net(meta_sk),
++						&tcp_hashinfo,
++						&ip6h->saddr, th->source,
++						&ip6h->daddr, ntohs(th->dest),
++						inet6_iif(skb));
++
++		if (!sk) {
++			kfree_skb(skb);
++			return 0;
++		}
++		if (is_meta_sk(sk)) {
++			WARN("%s Did not find a sub-sk!\n", __func__);
++			kfree_skb(skb);
++			sock_put(sk);
++			return 0;
++		}
++
++		if (sk->sk_state == TCP_TIME_WAIT) {
++			inet_twsk_put(inet_twsk(sk));
++			kfree_skb(skb);
++			return 0;
++		}
++
++		ret = tcp_v6_do_rcv(sk, skb);
++		sock_put(sk);
++
++		return ret;
++	}
++	TCP_SKB_CB(skb)->mptcp_flags = 0;
++
++	/* Has been removed from the tk-table. Thus, no new subflows.
++	 *
++	 * Check for close-state is necessary, because we may have been closed
++	 * without passing by mptcp_close().
++	 *
++	 * When falling back, no new subflows are allowed either.
++	 */
++	if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
++	    mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
++		goto reset_and_discard;
++
++	child = tcp_v6_hnd_req(meta_sk, skb);
++
++	if (!child)
++		goto discard;
++
++	if (child != meta_sk) {
++		sock_rps_save_rxhash(child, skb);
++		/* We don't call tcp_child_process here, because we already
++		 * hold the meta-sk-lock and are sure that it is not owned
++		 * by the user.
++		 */
++		ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
++		bh_unlock_sock(child);
++		sock_put(child);
++		if (ret) {
++			rsk = child;
++			goto reset_and_discard;
++		}
++	} else {
++		if (tcp_hdr(skb)->syn) {
++			mptcp_v6_join_request(meta_sk, skb);
++			goto discard;
++		}
++		goto reset_and_discard;
++	}
++	return 0;
++
++reset_and_discard:
++	if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) {
++		const struct tcphdr *th = tcp_hdr(skb);
++		struct request_sock **prev, *req;
++		/* If we end up here, it means we should not have matched on the
++		 * request-socket. But, because the request-sock queue is only
++		 * destroyed in mptcp_close, the socket may actually already be
++		 * in close-state (e.g., through shutdown()) while still having
++		 * pending request sockets.
++		 */
++		req = inet6_csk_search_req(meta_sk, &prev, th->source,
++					   &ipv6_hdr(skb)->saddr,
++					   &ipv6_hdr(skb)->daddr, inet6_iif(skb));
++		if (req) {
++			inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
++			reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue,
++					    req);
++			reqsk_free(req);
++		}
++	}
++
++	tcp_v6_send_reset(rsk, skb);
++discard:
++	kfree_skb(skb);
++	return 0;
++}
++
++/* After this, the ref count of the meta_sk associated with the request_sock
++ * is incremented. Thus it is the responsibility of the caller
++ * to call sock_put() when the reference is not needed anymore.
++ */
++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
++				 const struct in6_addr *laddr, const struct net *net)
++{
++	const struct mptcp_request_sock *mtreq;
++	struct sock *meta_sk = NULL;
++	const struct hlist_nulls_node *node;
++	const u32 hash = inet6_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE);
++
++	rcu_read_lock();
++begin:
++	hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash],
++				       hash_entry) {
++		struct inet_request_sock *treq = inet_rsk(rev_mptcp_rsk(mtreq));
++		meta_sk = mtreq->mptcp_mpcb->meta_sk;
++
++		if (inet_rsk(rev_mptcp_rsk(mtreq))->ir_rmt_port == rport &&
++		    rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET6 &&
++		    ipv6_addr_equal(&treq->ir_v6_rmt_addr, raddr) &&
++		    ipv6_addr_equal(&treq->ir_v6_loc_addr, laddr) &&
++		    net_eq(net, sock_net(meta_sk)))
++			goto found;
++		meta_sk = NULL;
++	}
++	/* A request-socket is destroyed by RCU. So, it might have been recycled
++	 * and put into another hash-table list. So, after the lookup we may
++	 * end up in a different list. So, we may need to restart.
++	 *
++	 * See also the comment in __inet_lookup_established.
++	 */
++	if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE)
++		goto begin;
++
++found:
++	if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
++		meta_sk = NULL;
++	rcu_read_unlock();
++
++	return meta_sk;
++}
++
++/* Create a new IPv6 subflow.
++ *
++ * We are in user-context and the meta-sock-lock is held.
++ */
++int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
++			   struct mptcp_rem6 *rem)
++{
++	struct tcp_sock *tp;
++	struct sock *sk;
++	struct sockaddr_in6 loc_in, rem_in;
++	struct socket sock;
++	int ret;
++
++	/** First, create and prepare the new socket */
++
++	sock.type = meta_sk->sk_socket->type;
++	sock.state = SS_UNCONNECTED;
++	sock.wq = meta_sk->sk_socket->wq;
++	sock.file = meta_sk->sk_socket->file;
++	sock.ops = NULL;
++
++	ret = inet6_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
++	if (unlikely(ret < 0)) {
++		mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret);
++		return ret;
++	}
++
++	sk = sock.sk;
++	tp = tcp_sk(sk);
++
++	/* All subsockets need the MPTCP-lock-class */
++	lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
++	lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
++
++	if (mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL))
++		goto error;
++
++	tp->mptcp->slave_sk = 1;
++	tp->mptcp->low_prio = loc->low_prio;
++
++	/* Initializing the timer for an MPTCP subflow */
++	setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
++
++	/** Then, connect the socket to the peer */
++	loc_in.sin6_family = AF_INET6;
++	rem_in.sin6_family = AF_INET6;
++	loc_in.sin6_port = 0;
++	if (rem->port)
++		rem_in.sin6_port = rem->port;
++	else
++		rem_in.sin6_port = inet_sk(meta_sk)->inet_dport;
++	loc_in.sin6_addr = loc->addr;
++	rem_in.sin6_addr = rem->addr;
++
++	ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in6));
++	if (ret < 0) {
++		mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
++			    __func__, ret);
++		goto error;
++	}
++
++	mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d\n",
++		    __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
++		    tp->mptcp->path_index, &loc_in.sin6_addr,
++		    ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
++		    ntohs(rem_in.sin6_port));
++
++	if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6)
++		tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr);
++
++	ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
++				sizeof(struct sockaddr_in6), O_NONBLOCK);
++	if (ret < 0 && ret != -EINPROGRESS) {
++		mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
++			    __func__, ret);
++		goto error;
++	}
++
++	sk_set_socket(sk, meta_sk->sk_socket);
++	sk->sk_wq = meta_sk->sk_wq;
++
++	return 0;
++
++error:
++	/* May happen if mptcp_add_sock fails first */
++	if (!mptcp(tp)) {
++		tcp_close(sk, 0);
++	} else {
++		local_bh_disable();
++		mptcp_sub_force_close(sk);
++		local_bh_enable();
++	}
++	return ret;
++}
++EXPORT_SYMBOL(mptcp_init6_subsockets);
++
++const struct inet_connection_sock_af_ops mptcp_v6_specific = {
++	.queue_xmit = inet6_csk_xmit,
++	.send_check = tcp_v6_send_check,
++	.rebuild_header = inet6_sk_rebuild_header,
++	.sk_rx_dst_set = inet6_sk_rx_dst_set,
++	.conn_request = mptcp_conn_request,
++	.syn_recv_sock = tcp_v6_syn_recv_sock,
++	.net_header_len = sizeof(struct ipv6hdr),
++	.net_frag_header_len = sizeof(struct frag_hdr),
++	.setsockopt = ipv6_setsockopt,
++	.getsockopt = ipv6_getsockopt,
++	.addr2sockaddr = inet6_csk_addr2sockaddr,
++	.sockaddr_len = sizeof(struct sockaddr_in6),
++	.bind_conflict = inet6_csk_bind_conflict,
++#ifdef CONFIG_COMPAT
++	.compat_setsockopt = compat_ipv6_setsockopt,
++	.compat_getsockopt = compat_ipv6_getsockopt,
++#endif
++};
++
++const struct inet_connection_sock_af_ops mptcp_v6_mapped = {
++	.queue_xmit = ip_queue_xmit,
++	.send_check = tcp_v4_send_check,
++	.rebuild_header = inet_sk_rebuild_header,
++	.sk_rx_dst_set = inet_sk_rx_dst_set,
++	.conn_request = mptcp_conn_request,
++	.syn_recv_sock = tcp_v6_syn_recv_sock,
++	.net_header_len = sizeof(struct iphdr),
++	.setsockopt = ipv6_setsockopt,
++	.getsockopt = ipv6_getsockopt,
++	.addr2sockaddr = inet6_csk_addr2sockaddr,
++	.sockaddr_len = sizeof(struct sockaddr_in6),
++	.bind_conflict = inet6_csk_bind_conflict,
++#ifdef CONFIG_COMPAT
++	.compat_setsockopt = compat_ipv6_setsockopt,
++	.compat_getsockopt = compat_ipv6_getsockopt,
++#endif
++};
++
++struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
++struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
++
++int mptcp_pm_v6_init(void)
++{
++	int ret = 0;
++	struct request_sock_ops *ops = &mptcp6_request_sock_ops;
++
++	mptcp_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
++	mptcp_request_sock_ipv6_ops.init_req = mptcp_v6_init_req;
++
++	mptcp_join_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
++	mptcp_join_request_sock_ipv6_ops.init_req = mptcp_v6_join_init_req;
++	mptcp_join_request_sock_ipv6_ops.queue_hash_add = mptcp_v6_reqsk_queue_hash_add;
++
++	ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6");
++	if (ops->slab_name == NULL) {
++		ret = -ENOMEM;
++		goto out;
++	}
++
++	ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
++				      SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
++				      NULL);
++
++	if (ops->slab == NULL) {
++		ret = -ENOMEM;
++		goto err_reqsk_create;
++	}
++
++out:
++	return ret;
++
++err_reqsk_create:
++	kfree(ops->slab_name);
++	ops->slab_name = NULL;
++	goto out;
++}
++
++void mptcp_pm_v6_undo(void)
++{
++	kmem_cache_destroy(mptcp6_request_sock_ops.slab);
++	kfree(mptcp6_request_sock_ops.slab_name);
++}
+diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c
+new file mode 100644
+index 000000000000..6f5087983175
+--- /dev/null
++++ b/net/mptcp/mptcp_ndiffports.c
+@@ -0,0 +1,161 @@
++#include <linux/module.h>
++
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
++
++#if IS_ENABLED(CONFIG_IPV6)
++#include <net/mptcp_v6.h>
++#endif
++
++struct ndiffports_priv {
++	/* Worker struct for subflow establishment */
++	struct work_struct subflow_work;
++
++	struct mptcp_cb *mpcb;
++};
++
++static int num_subflows __read_mostly = 2;
++module_param(num_subflows, int, 0644);
++MODULE_PARM_DESC(num_subflows, "choose the number of subflows per MPTCP connection");
++
++/**
++ * Create all new subflows, by calling mptcp_initX_subsockets
++ *
++ * This function uses 'goto next_subflow' so that the lock can be released
++ * between new subflows, giving other processes a chance to do some work on
++ * the socket and potentially finish the communication.
++ **/
++static void create_subflow_worker(struct work_struct *work)
++{
++	const struct ndiffports_priv *pm_priv = container_of(work,
++							     struct ndiffports_priv,
++							     subflow_work);
++	struct mptcp_cb *mpcb = pm_priv->mpcb;
++	struct sock *meta_sk = mpcb->meta_sk;
++	int iter = 0;
++
++next_subflow:
++	if (iter) {
++		release_sock(meta_sk);
++		mutex_unlock(&mpcb->mpcb_mutex);
++
++		cond_resched();
++	}
++	mutex_lock(&mpcb->mpcb_mutex);
++	lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
++
++	iter++;
++
++	if (sock_flag(meta_sk, SOCK_DEAD))
++		goto exit;
++
++	if (mpcb->master_sk &&
++	    !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
++		goto exit;
++
++	if (num_subflows > iter && num_subflows > mpcb->cnt_subflows) {
++		if (meta_sk->sk_family == AF_INET ||
++		    mptcp_v6_is_v4_mapped(meta_sk)) {
++			struct mptcp_loc4 loc;
++			struct mptcp_rem4 rem;
++
++			loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
++			loc.loc4_id = 0;
++			loc.low_prio = 0;
++
++			rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
++			rem.port = inet_sk(meta_sk)->inet_dport;
++			rem.rem4_id = 0; /* Default 0 */
++
++			mptcp_init4_subsockets(meta_sk, &loc, &rem);
++		} else {
++#if IS_ENABLED(CONFIG_IPV6)
++			struct mptcp_loc6 loc;
++			struct mptcp_rem6 rem;
++
++			loc.addr = inet6_sk(meta_sk)->saddr;
++			loc.loc6_id = 0;
++			loc.low_prio = 0;
++
++			rem.addr = meta_sk->sk_v6_daddr;
++			rem.port = inet_sk(meta_sk)->inet_dport;
++			rem.rem6_id = 0; /* Default 0 */
++
++			mptcp_init6_subsockets(meta_sk, &loc, &rem);
++#endif
++		}
++		goto next_subflow;
++	}
++
++exit:
++	release_sock(meta_sk);
++	mutex_unlock(&mpcb->mpcb_mutex);
++	sock_put(meta_sk);
++}
++
++static void ndiffports_new_session(const struct sock *meta_sk)
++{
++	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++	struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
++
++	/* Initialize workqueue-struct */
++	INIT_WORK(&fmp->subflow_work, create_subflow_worker);
++	fmp->mpcb = mpcb;
++}
++
++static void ndiffports_create_subflows(struct sock *meta_sk)
++{
++	const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++	struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
++
++	if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
++	    mpcb->send_infinite_mapping ||
++	    mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
++		return;
++
++	if (!work_pending(&pm_priv->subflow_work)) {
++		sock_hold(meta_sk);
++		queue_work(mptcp_wq, &pm_priv->subflow_work);
++	}
++}
++
++static int ndiffports_get_local_id(sa_family_t family, union inet_addr *addr,
++				   struct net *net, bool *low_prio)
++{
++	return 0;
++}
++
++static struct mptcp_pm_ops ndiffports __read_mostly = {
++	.new_session = ndiffports_new_session,
++	.fully_established = ndiffports_create_subflows,
++	.get_local_id = ndiffports_get_local_id,
++	.name = "ndiffports",
++	.owner = THIS_MODULE,
++};
++
++/* General initialization of MPTCP_PM */
++static int __init ndiffports_register(void)
++{
++	BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE);
++
++	if (mptcp_register_path_manager(&ndiffports))
++		goto exit;
++
++	return 0;
++
++exit:
++	return -1;
++}
++
++static void ndiffports_unregister(void)
++{
++	mptcp_unregister_path_manager(&ndiffports);
++}
++
++module_init(ndiffports_register);
++module_exit(ndiffports_unregister);
++
++MODULE_AUTHOR("Christoph Paasch");
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("NDIFF-PORTS MPTCP");
++MODULE_VERSION("0.88");
+diff --git a/net/mptcp/mptcp_ofo_queue.c b/net/mptcp/mptcp_ofo_queue.c
+new file mode 100644
+index 000000000000..ec4e98622637
+--- /dev/null
++++ b/net/mptcp/mptcp_ofo_queue.c
+@@ -0,0 +1,295 @@
++/*
++ * MPTCP implementation - Fast algorithm for MPTCP meta-reordering
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@×××××××××.be>
++ *
++ * Current Maintainer & Author:
++ * Christoph Paasch <christoph.paasch@×××××××××.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
++ * Gregory Detal <gregory.detal@×××××××××.be>
++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
++ * Lavkesh Lahngir <lavkesh51@×××××.com>
++ * Andreas Ripke <ripke@××××××.eu>
++ * Vlad Dogaru <vlad.dogaru@×××××.com>
++ * Octavian Purdila <octavian.purdila@×××××.com>
++ * John Ronan <jronan@××××.org>
++ * Catalin Nicutar <catalin.nicutar@×××××.com>
++ * Brandon Heller <brandonh@××××××××.edu>
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/skbuff.h>
++#include <linux/slab.h>
++#include <net/tcp.h>
++#include <net/mptcp.h>
++
++void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
++			    const struct sk_buff *skb)
++{
++	struct tcp_sock *tp;
++
++	mptcp_for_each_tp(mpcb, tp) {
++		if (tp->mptcp->shortcut_ofoqueue == skb) {
++			tp->mptcp->shortcut_ofoqueue = NULL;
++			return;
++		}
++	}
++}
++
++/* Does 'skb' fit after 'here' in the queue 'head'?
++ * If yes, we queue it and return 1.
++ */
++static int mptcp_ofo_queue_after(struct sk_buff_head *head,
++				 struct sk_buff *skb, struct sk_buff *here,
++				 const struct tcp_sock *tp)
++{
++	struct sock *meta_sk = tp->meta_sk;
++	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++	u32 seq = TCP_SKB_CB(skb)->seq;
++	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
++
++	/* We want to queue skb after here, thus seq >= end_seq */
++	if (before(seq, TCP_SKB_CB(here)->end_seq))
++		return 0;
++
++	if (seq == TCP_SKB_CB(here)->end_seq) {
++		bool fragstolen = false;
++
++		if (!tcp_try_coalesce(meta_sk, here, skb, &fragstolen)) {
++			__skb_queue_after(&meta_tp->out_of_order_queue, here, skb);
++			return 1;
++		} else {
++			kfree_skb_partial(skb, fragstolen);
++			return -1;
++		}
++	}
++
++	/* If here is the last one, we can always queue it */
++	if (skb_queue_is_last(head, here)) {
++		__skb_queue_after(head, here, skb);
++		return 1;
++	} else {
++		struct sk_buff *skb1 = skb_queue_next(head, here);
++		/* It's not the last one, but does it fit between 'here' and
++		 * the one after 'here'? Thus, is end_seq <= after_here->seq?
++		 */
++		if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) {
++			__skb_queue_after(head, here, skb);
++			return 1;
++		}
++	}
++
++	return 0;
++}
++
++static void try_shortcut(struct sk_buff *shortcut, struct sk_buff *skb,
++			 struct sk_buff_head *head, struct tcp_sock *tp)
++{
++	struct sock *meta_sk = tp->meta_sk;
++	struct tcp_sock *tp_it, *meta_tp = tcp_sk(meta_sk);
++	struct mptcp_cb *mpcb = meta_tp->mpcb;
++	struct sk_buff *skb1, *best_shortcut = NULL;
++	u32 seq = TCP_SKB_CB(skb)->seq;
++	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
++	u32 distance = 0xffffffff;
++
++	/* First, check the tp's shortcut */
++	if (!shortcut) {
++		if (skb_queue_empty(head)) {
++			__skb_queue_head(head, skb);
++			goto end;
++		}
++	} else {
++		int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
++		/* Is the tp's shortcut a hit? If yes, we insert. */
++
++		if (ret) {
++			skb = (ret > 0) ? skb : NULL;
++			goto end;
++		}
++	}
++
++	/* Check the shortcuts of the other subsockets. */
++	mptcp_for_each_tp(mpcb, tp_it) {
++		shortcut = tp_it->mptcp->shortcut_ofoqueue;
++		/* Can we queue it here? If yes, do so! */
++		if (shortcut) {
++			int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
++
++			if (ret) {
++				skb = (ret > 0) ? skb : NULL;
++				goto end;
++			}
++		}
++
++		/* Could not queue it, check if we are close.
++		 * We are looking for a shortcut, close enough to seq to
++		 * set skb1 prematurely and thus improve the subsequent lookup,
++		 * which tries to find a skb1 so that skb1->seq <= seq.
++		 *
++		 * So, here we only take shortcuts, whose shortcut->seq > seq,
++		 * and minimize the distance between shortcut->seq and seq and
++		 * set best_shortcut to this one with the minimal distance.
++		 *
++		 * That way, the subsequent while-loop is shortest.
++		 */
++		if (shortcut && after(TCP_SKB_CB(shortcut)->seq, seq)) {
++			/* Are we closer than the current best shortcut? */
++			if ((u32)(TCP_SKB_CB(shortcut)->seq - seq) < distance) {
++				distance = (u32)(TCP_SKB_CB(shortcut)->seq - seq);
++				best_shortcut = shortcut;
++			}
++		}
++	}
++
++	if (best_shortcut)
++		skb1 = best_shortcut;
++	else
++		skb1 = skb_peek_tail(head);
++
++	if (seq == TCP_SKB_CB(skb1)->end_seq) {
++		bool fragstolen = false;
++
++		if (!tcp_try_coalesce(meta_sk, skb1, skb, &fragstolen)) {
++			__skb_queue_after(&meta_tp->out_of_order_queue, skb1, skb);
++		} else {
++			kfree_skb_partial(skb, fragstolen);
++			skb = NULL;
++		}
++
++		goto end;
++	}
++
++	/* Find the insertion point, starting from best_shortcut if available.
++	 *
++	 * Inspired by tcp_data_queue_ofo.
++	 */
++	while (1) {
++		/* skb1->seq <= seq */
++		if (!after(TCP_SKB_CB(skb1)->seq, seq))
++			break;
++		if (skb_queue_is_first(head, skb1)) {
++			skb1 = NULL;
++			break;
++		}
++		skb1 = skb_queue_prev(head, skb1);
++	}
++
++	/* Does skb overlap the previous one? */
++	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
++		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
++			/* All the bits are present. */
++			__kfree_skb(skb);
++			skb = NULL;
++			goto end;
++		}
++		if (seq == TCP_SKB_CB(skb1)->seq) {
++			if (skb_queue_is_first(head, skb1))
++				skb1 = NULL;
++			else
++				skb1 = skb_queue_prev(head, skb1);
++		}
++	}
++	if (!skb1)
++		__skb_queue_head(head, skb);
++	else
++		__skb_queue_after(head, skb1, skb);
++
++	/* And clean segments covered by new one as whole. */
++	while (!skb_queue_is_last(head, skb)) {
++		skb1 = skb_queue_next(head, skb);
++
++		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
++			break;
++
++		__skb_unlink(skb1, head);
++		mptcp_remove_shortcuts(mpcb, skb1);
++		__kfree_skb(skb1);
++	}
++
++end:
++	if (skb) {
++		skb_set_owner_r(skb, meta_sk);
++		tp->mptcp->shortcut_ofoqueue = skb;
++	}
++
++	return;
++}
++
++/**
++ * @sk: the subflow that received this skb.
++ */
++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
++			      struct sock *sk)
++{
++	struct tcp_sock *tp = tcp_sk(sk);
++
++	try_shortcut(tp->mptcp->shortcut_ofoqueue, skb,
++		     &tcp_sk(meta_sk)->out_of_order_queue, tp);
++}
++
++bool mptcp_prune_ofo_queue(struct sock *sk)
++{
++	struct tcp_sock *tp = tcp_sk(sk);
++	bool res = false;
++
++	if (!skb_queue_empty(&tp->out_of_order_queue)) {
++		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
++		mptcp_purge_ofo_queue(tp);
++
++		/* No sack at the mptcp-level */
++		sk_mem_reclaim(sk);
++		res = true;
++	}
++
++	return res;
++}
++
++void mptcp_ofo_queue(struct sock *meta_sk)
++{
++	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++	struct sk_buff *skb;
++
++	while ((skb = skb_peek(&meta_tp->out_of_order_queue)) != NULL) {
++		u32 old_rcv_nxt = meta_tp->rcv_nxt;
++		if (after(TCP_SKB_CB(skb)->seq, meta_tp->rcv_nxt))
++			break;
++
++		if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->rcv_nxt)) {
++			__skb_unlink(skb, &meta_tp->out_of_order_queue);
++			mptcp_remove_shortcuts(meta_tp->mpcb, skb);
++			__kfree_skb(skb);
++			continue;
++		}
++
++		__skb_unlink(skb, &meta_tp->out_of_order_queue);
++		mptcp_remove_shortcuts(meta_tp->mpcb, skb);
++
++		__skb_queue_tail(&meta_sk->sk_receive_queue, skb);
++		meta_tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
++		mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
++
++		if (tcp_hdr(skb)->fin)
++			mptcp_fin(meta_sk);
++	}
++}
++
++void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp)
++{
++	struct sk_buff_head *head = &meta_tp->out_of_order_queue;
++	struct sk_buff *skb, *tmp;
++
++	skb_queue_walk_safe(head, skb, tmp) {
++		__skb_unlink(skb, head);
++		mptcp_remove_shortcuts(meta_tp->mpcb, skb);
++		kfree_skb(skb);
++	}
++}
+diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c |
15956 |
+new file mode 100644 |
15957 |
+index 000000000000..53f5c43bb488 |
15958 |
+--- /dev/null |
15959 |
++++ b/net/mptcp/mptcp_olia.c |
15960 |
+@@ -0,0 +1,311 @@ |
15961 |
++/* |
15962 |
++ * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL: |
15963 |
++ * |
15964 |
++ * Algorithm design: |
15965 |
++ * Ramin Khalili <ramin.khalili@××××.ch> |
15966 |
++ * Nicolas Gast <nicolas.gast@××××.ch> |
15967 |
++ * Jean-Yves Le Boudec <jean-yves.leboudec@××××.ch> |
15968 |
++ * |
15969 |
++ * Implementation: |
15970 |
++ * Ramin Khalili <ramin.khalili@××××.ch> |
15971 |
++ * |
15972 |
++ * Ported to the official MPTCP-kernel: |
15973 |
++ * Christoph Paasch <christoph.paasch@×××××××××.be> |
15974 |
++ * |
15975 |
++ * This program is free software; you can redistribute it and/or |
15976 |
++ * modify it under the terms of the GNU General Public License |
15977 |
++ * as published by the Free Software Foundation; either version |
15978 |
++ * 2 of the License, or (at your option) any later version. |
15979 |
++ */ |
15980 |
++ |
15981 |
++ |
15982 |
++#include <net/tcp.h> |
15983 |
++#include <net/mptcp.h> |
15984 |
++ |
15985 |
++#include <linux/module.h> |
15986 |
++ |
15987 |
++static int scale = 10; |
15988 |
++ |
15989 |
++struct mptcp_olia { |
15990 |
++ u32 mptcp_loss1; |
15991 |
++ u32 mptcp_loss2; |
15992 |
++ u32 mptcp_loss3; |
15993 |
++ int epsilon_num; |
15994 |
++ u32 epsilon_den; |
15995 |
++ int mptcp_snd_cwnd_cnt; |
15996 |
++}; |
15997 |
++ |
15998 |
++static inline int mptcp_olia_sk_can_send(const struct sock *sk) |
15999 |
++{ |
16000 |
++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us; |
16001 |
++} |
16002 |
++ |
16003 |
++static inline u64 mptcp_olia_scale(u64 val, int scale) |
16004 |
++{ |
16005 |
++ return (u64) val << scale; |
16006 |
++} |
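A quick numeric illustration of the fixed-point convention used throughout this file (a reading of the code, not part of the patch): with the default scale = 10, every scaled quantity carries 10 fractional bits, so mptcp_olia_scale(3, 10) == 3 << 10 == 3072 stands for 3.0, and one full congestion-window unit corresponds to 2^10 = 1024 in the mptcp_snd_cwnd_cnt accumulator used by mptcp_olia_cong_avoid() below.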
16007 |
++ |
16008 |
++/* Take care of the artificial inflation of cwnd (see RFC 5681) |
16009 |
++ * during the fast-retransmit phase. |
16010 |
++ */ |
16011 |
++static u32 mptcp_get_crt_cwnd(struct sock *sk) |
16012 |
++{ |
16013 |
++ const struct inet_connection_sock *icsk = inet_csk(sk); |
16014 |
++ |
16015 |
++ if (icsk->icsk_ca_state == TCP_CA_Recovery) |
16016 |
++ return tcp_sk(sk)->snd_ssthresh; |
16017 |
++ else |
16018 |
++ return tcp_sk(sk)->snd_cwnd; |
16019 |
++} |
16020 |
++ |
16021 |
++/* return the denominator of the first term of the increase formula */ |
16022 |
++static u64 mptcp_get_rate(const struct mptcp_cb *mpcb , u32 path_rtt) |
16023 |
++{ |
16024 |
++ struct sock *sk; |
16025 |
++ u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */ |
16026 |
++ |
16027 |
++ mptcp_for_each_sk(mpcb, sk) { |
16028 |
++ struct tcp_sock *tp = tcp_sk(sk); |
16029 |
++ u64 scaled_num; |
16030 |
++ u32 tmp_cwnd; |
16031 |
++ |
16032 |
++ if (!mptcp_olia_sk_can_send(sk)) |
16033 |
++ continue; |
16034 |
++ |
16035 |
++ tmp_cwnd = mptcp_get_crt_cwnd(sk); |
16036 |
++ scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt; |
16037 |
++ rate += div_u64(scaled_num , tp->srtt_us); |
16038 |
++ } |
16039 |
++ rate *= rate; |
16040 |
++ return rate; |
16041 |
++} |
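In the notation of the OLIA paper, writing $w_p$ for the (recovery-adjusted) window and $\mathrm{rtt}_p$ for the smoothed RTT of path $p$, the loop above computes, up to the $2^{2\,\mathrm{scale}}$ fixed-point factor and the +1 zero-divisor guard:

$$\mathrm{rate} \;=\; \Bigl(\sum_p \frac{w_p\,\mathrm{rtt}_r}{\mathrm{rtt}_p}\Bigr)^{2} \;=\; \mathrm{rtt}_r^{2}\,\Bigl(\sum_p \frac{w_p}{\mathrm{rtt}_p}\Bigr)^{2},$$

where $r$ is the path whose RTT is passed in as path_rtt. This is the squared denominator of OLIA's first increase term (hence the comment above).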
16042 |
++ |
16043 |
++/* find the maximum cwnd, used to find set M */ |
16044 |
++static u32 mptcp_get_max_cwnd(const struct mptcp_cb *mpcb) |
16045 |
++{ |
16046 |
++ struct sock *sk; |
16047 |
++ u32 best_cwnd = 0; |
16048 |
++ |
16049 |
++ mptcp_for_each_sk(mpcb, sk) { |
16050 |
++ u32 tmp_cwnd; |
16051 |
++ |
16052 |
++ if (!mptcp_olia_sk_can_send(sk)) |
16053 |
++ continue; |
16054 |
++ |
16055 |
++ tmp_cwnd = mptcp_get_crt_cwnd(sk); |
16056 |
++ if (tmp_cwnd > best_cwnd) |
16057 |
++ best_cwnd = tmp_cwnd; |
16058 |
++ } |
16059 |
++ return best_cwnd; |
16060 |
++} |
16061 |
++ |
16062 |
++static void mptcp_get_epsilon(const struct mptcp_cb *mpcb) |
16063 |
++{ |
16064 |
++ struct mptcp_olia *ca; |
16065 |
++ struct tcp_sock *tp; |
16066 |
++ struct sock *sk; |
16067 |
++ u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1; |
16068 |
++ u32 max_cwnd = 1, best_cwnd = 1, tmp_cwnd; |
16069 |
++ u8 M = 0, B_not_M = 0; |
16070 |
++ |
16071 |
++ /* TODO - integrate this in the following loop - we just want to iterate once */ |
16072 |
++ |
16073 |
++ max_cwnd = mptcp_get_max_cwnd(mpcb); |
16074 |
++ |
16075 |
++ /* find the best path */ |
16076 |
++ mptcp_for_each_sk(mpcb, sk) { |
16077 |
++ tp = tcp_sk(sk); |
16078 |
++ ca = inet_csk_ca(sk); |
16079 |
++ |
16080 |
++ if (!mptcp_olia_sk_can_send(sk)) |
16081 |
++ continue; |
16082 |
++ |
16083 |
++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us; |
16084 |
++ /* TODO - check here and rename variables */ |
16085 |
++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, |
16086 |
++ ca->mptcp_loss2 - ca->mptcp_loss1); |
16087 |
++ |
16088 |
++ tmp_cwnd = mptcp_get_crt_cwnd(sk); |
16089 |
++ if ((u64)tmp_int * best_rtt >= (u64)best_int * tmp_rtt) { |
16090 |
++ best_rtt = tmp_rtt; |
16091 |
++ best_int = tmp_int; |
16092 |
++ best_cwnd = tmp_cwnd; |
16093 |
++ } |
16094 |
++ } |
16095 |
++ |
16096 |
++ /* TODO - integrate this into mptcp_get_max_cwnd and the previous loop */ |
16097 |
++ /* find the size of M and B_not_M */ |
16098 |
++ mptcp_for_each_sk(mpcb, sk) { |
16099 |
++ tp = tcp_sk(sk); |
16100 |
++ ca = inet_csk_ca(sk); |
16101 |
++ |
16102 |
++ if (!mptcp_olia_sk_can_send(sk)) |
16103 |
++ continue; |
16104 |
++ |
16105 |
++ tmp_cwnd = mptcp_get_crt_cwnd(sk); |
16106 |
++ if (tmp_cwnd == max_cwnd) { |
16107 |
++ M++; |
16108 |
++ } else { |
16109 |
++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us; |
16110 |
++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, |
16111 |
++ ca->mptcp_loss2 - ca->mptcp_loss1); |
16112 |
++ |
16113 |
++ if ((u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) |
16114 |
++ B_not_M++; |
16115 |
++ } |
16116 |
++ } |
16117 |
++ |
16118 |
++ /* check if the path is in M or B_not_M and set the value of epsilon accordingly */ |
16119 |
++ mptcp_for_each_sk(mpcb, sk) { |
16120 |
++ tp = tcp_sk(sk); |
16121 |
++ ca = inet_csk_ca(sk); |
16122 |
++ |
16123 |
++ if (!mptcp_olia_sk_can_send(sk)) |
16124 |
++ continue; |
16125 |
++ |
16126 |
++ if (B_not_M == 0) { |
16127 |
++ ca->epsilon_num = 0; |
16128 |
++ ca->epsilon_den = 1; |
16129 |
++ } else { |
16130 |
++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us; |
16131 |
++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, |
16132 |
++ ca->mptcp_loss2 - ca->mptcp_loss1); |
16133 |
++ tmp_cwnd = mptcp_get_crt_cwnd(sk); |
16134 |
++ |
16135 |
++ if (tmp_cwnd < max_cwnd && |
16136 |
++ (u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) { |
16137 |
++ ca->epsilon_num = 1; |
16138 |
++ ca->epsilon_den = mpcb->cnt_established * B_not_M; |
16139 |
++ } else if (tmp_cwnd == max_cwnd) { |
16140 |
++ ca->epsilon_num = -1; |
16141 |
++ ca->epsilon_den = mpcb->cnt_established * M; |
16142 |
++ } else { |
16143 |
++ ca->epsilon_num = 0; |
16144 |
++ ca->epsilon_den = 1; |
16145 |
++ } |
16146 |
++ } |
16147 |
++ } |
16148 |
++} |
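The three passes above together implement the $\epsilon_r$ of the OLIA paper (a reading of the code, not part of the patch): with $n$ the number of established subflows, $\mathcal{M}$ the paths with maximal window, and $\mathcal{B}$ the "best" paths by the inter-loss-distance metric,

$$\epsilon_r = \begin{cases} \dfrac{1}{n\,|\mathcal{B}\setminus\mathcal{M}|} & \text{if } r \in \mathcal{B}\setminus\mathcal{M},\\[4pt] -\dfrac{1}{n\,|\mathcal{M}|} & \text{if } r \in \mathcal{M} \text{ and } \mathcal{B}\setminus\mathcal{M} \neq \emptyset,\\[4pt] 0 & \text{otherwise,} \end{cases}$$

stored per subflow as the integer pair (epsilon_num, epsilon_den).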
16149 |
++ |
16150 |
++/* setting the initial values */ |
16151 |
++static void mptcp_olia_init(struct sock *sk) |
16152 |
++{ |
16153 |
++ const struct tcp_sock *tp = tcp_sk(sk); |
16154 |
++ struct mptcp_olia *ca = inet_csk_ca(sk); |
16155 |
++ |
16156 |
++ if (mptcp(tp)) { |
16157 |
++ ca->mptcp_loss1 = tp->snd_una; |
16158 |
++ ca->mptcp_loss2 = tp->snd_una; |
16159 |
++ ca->mptcp_loss3 = tp->snd_una; |
16160 |
++ ca->mptcp_snd_cwnd_cnt = 0; |
16161 |
++ ca->epsilon_num = 0; |
16162 |
++ ca->epsilon_den = 1; |
16163 |
++ } |
16164 |
++} |
16165 |
++ |
16166 |
++/* updating inter-loss distance and ssthresh */ |
16167 |
++static void mptcp_olia_set_state(struct sock *sk, u8 new_state) |
16168 |
++{ |
16169 |
++ if (!mptcp(tcp_sk(sk))) |
16170 |
++ return; |
16171 |
++ |
16172 |
++ if (new_state == TCP_CA_Loss || |
16173 |
++ new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) { |
16174 |
++ struct mptcp_olia *ca = inet_csk_ca(sk); |
16175 |
++ |
16176 |
++ if (ca->mptcp_loss3 != ca->mptcp_loss2 && |
16177 |
++ !inet_csk(sk)->icsk_retransmits) { |
16178 |
++ ca->mptcp_loss1 = ca->mptcp_loss2; |
16179 |
++ ca->mptcp_loss2 = ca->mptcp_loss3; |
16180 |
++ } |
16181 |
++ } |
16182 |
++} |
16183 |
++ |
16184 |
++/* main algorithm */ |
16185 |
++static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked) |
16186 |
++{ |
16187 |
++ struct tcp_sock *tp = tcp_sk(sk); |
16188 |
++ struct mptcp_olia *ca = inet_csk_ca(sk); |
16189 |
++ const struct mptcp_cb *mpcb = tp->mpcb; |
16190 |
++ |
16191 |
++ u64 inc_num, inc_den, rate, cwnd_scaled; |
16192 |
++ |
16193 |
++ if (!mptcp(tp)) { |
16194 |
++ tcp_reno_cong_avoid(sk, ack, acked); |
16195 |
++ return; |
16196 |
++ } |
16197 |
++ |
16198 |
++ ca->mptcp_loss3 = tp->snd_una; |
16199 |
++ |
16200 |
++ if (!tcp_is_cwnd_limited(sk)) |
16201 |
++ return; |
16202 |
++ |
16203 |
++ /* slow start if it is in the safe area */ |
16204 |
++ if (tp->snd_cwnd <= tp->snd_ssthresh) { |
16205 |
++ tcp_slow_start(tp, acked); |
16206 |
++ return; |
16207 |
++ } |
16208 |
++ |
16209 |
++ mptcp_get_epsilon(mpcb); |
16210 |
++ rate = mptcp_get_rate(mpcb, tp->srtt_us); |
16211 |
++ cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale); |
16212 |
++ inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1; |
16213 |
++ |
16214 |
++ /* calculate the increasing term, scaling is used to reduce the rounding effect */ |
16215 |
++ if (ca->epsilon_num == -1) { |
16216 |
++ if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) { |
16217 |
++ inc_num = rate - ca->epsilon_den * |
16218 |
++ cwnd_scaled * cwnd_scaled; |
16219 |
++ ca->mptcp_snd_cwnd_cnt -= div64_u64( |
16220 |
++ mptcp_olia_scale(inc_num , scale) , inc_den); |
16221 |
++ } else { |
16222 |
++ inc_num = ca->epsilon_den * |
16223 |
++ cwnd_scaled * cwnd_scaled - rate; |
16224 |
++ ca->mptcp_snd_cwnd_cnt += div64_u64( |
16225 |
++ mptcp_olia_scale(inc_num , scale) , inc_den); |
16226 |
++ } |
16227 |
++ } else { |
16228 |
++ inc_num = ca->epsilon_num * rate + |
16229 |
++ ca->epsilon_den * cwnd_scaled * cwnd_scaled; |
16230 |
++ ca->mptcp_snd_cwnd_cnt += div64_u64( |
16231 |
++ mptcp_olia_scale(inc_num , scale) , inc_den); |
16232 |
++ } |
16233 |
++ |
16234 |
++ |
16235 |
++ if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) { |
16236 |
++ if (tp->snd_cwnd < tp->snd_cwnd_clamp) |
16237 |
++ tp->snd_cwnd++; |
16238 |
++ ca->mptcp_snd_cwnd_cnt = 0; |
16239 |
++ } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) { |
16240 |
++ tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1); |
16241 |
++ ca->mptcp_snd_cwnd_cnt = 0; |
16242 |
++ } |
16243 |
++} |
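Unwinding the fixed-point arithmetic, the quantity accumulated into mptcp_snd_cwnd_cnt per ACK is $2^{\mathrm{scale}}$ times OLIA's window increase (paper notation; a reading of the code, not part of the patch):

$$\Delta w_r \;=\; \frac{w_r/\mathrm{rtt}_r^{2}}{\bigl(\sum_p w_p/\mathrm{rtt}_p\bigr)^{2}} \;+\; \frac{\epsilon_r}{w_r}.$$

Once the accumulator reaches $2^{\mathrm{scale}} - 1$ the window grows by one segment; at $-2^{\mathrm{scale}} + 1$ (possible only on the $\epsilon_r < 0$ branch) it shrinks by one, never below 1.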
16244 |
++ |
16245 |
++static struct tcp_congestion_ops mptcp_olia = { |
16246 |
++ .init = mptcp_olia_init, |
16247 |
++ .ssthresh = tcp_reno_ssthresh, |
16248 |
++ .cong_avoid = mptcp_olia_cong_avoid, |
16249 |
++ .set_state = mptcp_olia_set_state, |
16250 |
++ .owner = THIS_MODULE, |
16251 |
++ .name = "olia", |
16252 |
++}; |
16253 |
++ |
16254 |
++static int __init mptcp_olia_register(void) |
16255 |
++{ |
16256 |
++ BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE); |
16257 |
++ return tcp_register_congestion_control(&mptcp_olia); |
16258 |
++} |
16259 |
++ |
16260 |
++static void __exit mptcp_olia_unregister(void) |
16261 |
++{ |
16262 |
++ tcp_unregister_congestion_control(&mptcp_olia); |
16263 |
++} |
16264 |
++ |
16265 |
++module_init(mptcp_olia_register); |
16266 |
++module_exit(mptcp_olia_unregister); |
16267 |
++ |
16268 |
++MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec"); |
16269 |
++MODULE_LICENSE("GPL"); |
16270 |
++MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL"); |
16271 |
++MODULE_VERSION("0.1"); |
16272 |
+diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c |
16273 |
+new file mode 100644 |
16274 |
+index 000000000000..400ea254c078 |
16275 |
+--- /dev/null |
16276 |
++++ b/net/mptcp/mptcp_output.c |
16277 |
+@@ -0,0 +1,1743 @@ |
16278 |
++/* |
16279 |
++ * MPTCP implementation - Sending side |
16280 |
++ * |
16281 |
++ * Initial Design & Implementation: |
16282 |
++ * Sébastien Barré <sebastien.barre@×××××××××.be> |
16283 |
++ * |
16284 |
++ * Current Maintainer & Author: |
16285 |
++ * Christoph Paasch <christoph.paasch@×××××××××.be> |
16286 |
++ * |
16287 |
++ * Additional authors: |
16288 |
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi> |
16289 |
++ * Gregory Detal <gregory.detal@×××××××××.be> |
16290 |
++ * Fabien Duchêne <fabien.duchene@×××××××××.be> |
16291 |
++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de> |
16292 |
++ * Lavkesh Lahngir <lavkesh51@×××××.com> |
16293 |
++ * Andreas Ripke <ripke@××××××.eu> |
16294 |
++ * Vlad Dogaru <vlad.dogaru@×××××.com> |
16295 |
++ * Octavian Purdila <octavian.purdila@×××××.com> |
16296 |
++ * John Ronan <jronan@××××.org> |
16297 |
++ * Catalin Nicutar <catalin.nicutar@×××××.com> |
16298 |
++ * Brandon Heller <brandonh@××××××××.edu> |
16299 |
++ * |
16300 |
++ * |
16301 |
++ * This program is free software; you can redistribute it and/or |
16302 |
++ * modify it under the terms of the GNU General Public License |
16303 |
++ * as published by the Free Software Foundation; either version |
16304 |
++ * 2 of the License, or (at your option) any later version. |
16305 |
++ */ |
16306 |
++ |
16307 |
++#include <linux/kconfig.h> |
16308 |
++#include <linux/skbuff.h> |
16309 |
++#include <linux/tcp.h> |
16310 |
++ |
16311 |
++#include <net/mptcp.h> |
16312 |
++#include <net/mptcp_v4.h> |
16313 |
++#include <net/mptcp_v6.h> |
16314 |
++#include <net/sock.h> |
16315 |
++ |
16316 |
++static const int mptcp_dss_len = MPTCP_SUB_LEN_DSS_ALIGN + |
16317 |
++ MPTCP_SUB_LEN_ACK_ALIGN + |
16318 |
++ MPTCP_SUB_LEN_SEQ_ALIGN; |
16319 |
++ |
16320 |
++static inline int mptcp_sub_len_remove_addr(u16 bitfield) |
16321 |
++{ |
16322 |
++ unsigned int c; |
16323 |
++ for (c = 0; bitfield; c++) |
16324 |
++ bitfield &= bitfield - 1; |
16325 |
++ return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1; |
16326 |
++} |
16327 |
++ |
16328 |
++int mptcp_sub_len_remove_addr_align(u16 bitfield) |
16329 |
++{ |
16330 |
++ return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4); |
16331 |
++} |
16332 |
++EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align); |
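mptcp_sub_len_remove_addr() sizes a REMOVE_ADDR option by counting the set bits of the address-ID bitfield with Kernighan's trick - each bitfield &= bitfield - 1 clears the lowest set bit - and subtracting 1 because the base length already covers one address ID. A standalone illustration of the counting step (hypothetical user-space sketch, not part of the patch):

	#include <stdio.h>

	/* Kernighan's bit count: one loop iteration per set bit. */
	static unsigned int count_bits(unsigned short bitfield)
	{
		unsigned int c;

		for (c = 0; bitfield; c++)
			bitfield &= bitfield - 1;	/* clear the lowest set bit */
		return c;
	}

	int main(void)
	{
		/* Address IDs 1, 3 and 9 set -> three IDs in the option. */
		printf("%u\n", count_bits(0x020a));	/* prints 3 */
		return 0;
	}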
16333 |
++ |
16334 |
++/* get the data-seq and end-data-seq and store them again in the |
16335 |
++ * tcp_skb_cb |
16336 |
++ */ |
16337 |
++static int mptcp_reconstruct_mapping(struct sk_buff *skb) |
16338 |
++{ |
16339 |
++ const struct mp_dss *mpdss = (struct mp_dss *)TCP_SKB_CB(skb)->dss; |
16340 |
++ u32 *p32; |
16341 |
++ u16 *p16; |
16342 |
++ |
16343 |
++ if (!mpdss->M) |
16344 |
++ return 1; |
16345 |
++ |
16346 |
++ /* Move the pointer to the data-seq */ |
16347 |
++ p32 = (u32 *)mpdss; |
16348 |
++ p32++; |
16349 |
++ if (mpdss->A) { |
16350 |
++ p32++; |
16351 |
++ if (mpdss->a) |
16352 |
++ p32++; |
16353 |
++ } |
16354 |
++ |
16355 |
++ TCP_SKB_CB(skb)->seq = ntohl(*p32); |
16356 |
++ |
16357 |
++ /* Get the data_len to calculate the end_data_seq */ |
16358 |
++ p32++; |
16359 |
++ p32++; |
16360 |
++ p16 = (u16 *)p32; |
16361 |
++ TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq; |
16362 |
++ |
16363 |
++ return 0; |
16364 |
++} |
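The pointer walk above steps over the variable-length head of the DSS option. For orientation, the on-wire layout per RFC 6824, Section 3.3 (the A/a flags select a 4- or 8-octet Data ACK, M/m a 4- or 8-octet data sequence number; reproduced here as a reference, not part of the patch):

	+---------------+---------------+-------+----------------------+
	|     Kind      |    Length     |Subtype| (reserved) |F|m|M|a|A|
	+---------------+---------------+-------+----------------------+
	|      Data ACK (4 or 8 octets, depending on flags)            |
	+--------------------------------------------------------------+
	|      Data Sequence Number (4 or 8 octets, depending on flags)|
	+--------------------------------------------------------------+
	|              Subflow Sequence Number (4 octets)              |
	+-------------------------------+------------------------------+
	|  Data-Level Length (2 octets) |     Checksum (2 octets)      |
	+-------------------------------+------------------------------+

Hence the code: one u32 for kind/length/flags, then 4 or 8 octets of Data ACK when A (and a) are set, after which p32 points at the data sequence number.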
16365 |
++ |
16366 |
++static void mptcp_find_and_set_pathmask(const struct sock *meta_sk, struct sk_buff *skb) |
16367 |
++{ |
16368 |
++ struct sk_buff *skb_it; |
16369 |
++ |
16370 |
++ skb_it = tcp_write_queue_head(meta_sk); |
16371 |
++ |
16372 |
++ tcp_for_write_queue_from(skb_it, meta_sk) { |
16373 |
++ if (skb_it == tcp_send_head(meta_sk)) |
16374 |
++ break; |
16375 |
++ |
16376 |
++ if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) { |
16377 |
++ TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask; |
16378 |
++ break; |
16379 |
++ } |
16380 |
++ } |
16381 |
++} |
16382 |
++ |
16383 |
++/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are |
16384 |
++ * coming from the meta-retransmit-timer |
16385 |
++ */ |
16386 |
++static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk, |
16387 |
++ struct sock *sk, int clone_it) |
16388 |
++{ |
16389 |
++ struct sk_buff *skb, *skb1; |
16390 |
++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
16391 |
++ struct mptcp_cb *mpcb = meta_tp->mpcb; |
16392 |
++ u32 seq, end_seq; |
16393 |
++ |
16394 |
++ if (clone_it) { |
16395 |
++ /* pskb_copy is necessary here, because the TCP/IP-headers |
16396 |
++ * will be changed when it's going to be reinjected on another |
16397 |
++ * subflow. |
16398 |
++ */ |
16399 |
++ skb = pskb_copy_for_clone(orig_skb, GFP_ATOMIC); |
16400 |
++ } else { |
16401 |
++ __skb_unlink(orig_skb, &sk->sk_write_queue); |
16402 |
++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK); |
16403 |
++ sk->sk_wmem_queued -= orig_skb->truesize; |
16404 |
++ sk_mem_uncharge(sk, orig_skb->truesize); |
16405 |
++ skb = orig_skb; |
16406 |
++ } |
16407 |
++ if (unlikely(!skb)) |
16408 |
++ return; |
16409 |
++ |
16410 |
++ if (sk && mptcp_reconstruct_mapping(skb)) { |
16411 |
++ __kfree_skb(skb); |
16412 |
++ return; |
16413 |
++ } |
16414 |
++ |
16415 |
++ skb->sk = meta_sk; |
16416 |
++ |
16417 |
++ /* If it reached already the destination, we don't have to reinject it */ |
16418 |
++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { |
16419 |
++ __kfree_skb(skb); |
16420 |
++ return; |
16421 |
++ } |
16422 |
++ |
16423 |
++ /* Only reinject segments that are fully covered by the mapping */ |
16424 |
++ if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) != |
16425 |
++ TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { |
16426 |
++ u32 seq = TCP_SKB_CB(skb)->seq; |
16427 |
++ u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
16428 |
++ |
16429 |
++ __kfree_skb(skb); |
16430 |
++ |
16431 |
++ /* Ok, now we have to look for the full mapping in the meta |
16432 |
++ * send-queue :S |
16433 |
++ */ |
16434 |
++ tcp_for_write_queue(skb, meta_sk) { |
16435 |
++ /* Not yet at the mapping? */ |
16436 |
++ if (before(TCP_SKB_CB(skb)->seq, seq)) |
16437 |
++ continue; |
16438 |
++ /* We have passed by the mapping */ |
16439 |
++ if (after(TCP_SKB_CB(skb)->end_seq, end_seq)) |
16440 |
++ return; |
16441 |
++ |
16442 |
++ __mptcp_reinject_data(skb, meta_sk, NULL, 1); |
16443 |
++ } |
16444 |
++ return; |
16445 |
++ } |
16446 |
++ |
16447 |
++ /* Segment goes back to the MPTCP-layer. So, we need to zero the |
16448 |
++ * path_mask/dss. |
16449 |
++ */ |
16450 |
++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len); |
16451 |
++ |
16452 |
++ /* We need to find out the path-mask from the meta-write-queue |
16453 |
++ * to properly select a subflow. |
16454 |
++ */ |
16455 |
++ mptcp_find_and_set_pathmask(meta_sk, skb); |
16456 |
++ |
16457 |
++ /* If it's empty, just add */ |
16458 |
++ if (skb_queue_empty(&mpcb->reinject_queue)) { |
16459 |
++ skb_queue_head(&mpcb->reinject_queue, skb); |
16460 |
++ return; |
16461 |
++ } |
16462 |
++ |
16463 |
++ /* Find the place to insert the skb - or even 'drop' it, if the |
16464 |
++ * data is already covered by other skbs in the reinject-queue. |
16465 |
++ * |
16466 |
++ * This is inspired by code from tcp_data_queue. |
16467 |
++ */ |
16468 |
++ |
16469 |
++ skb1 = skb_peek_tail(&mpcb->reinject_queue); |
16470 |
++ seq = TCP_SKB_CB(skb)->seq; |
16471 |
++ while (1) { |
16472 |
++ if (!after(TCP_SKB_CB(skb1)->seq, seq)) |
16473 |
++ break; |
16474 |
++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) { |
16475 |
++ skb1 = NULL; |
16476 |
++ break; |
16477 |
++ } |
16478 |
++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); |
16479 |
++ } |
16480 |
++ |
16481 |
++ /* Does the skb overlap the previous one? */ |
16482 |
++ end_seq = TCP_SKB_CB(skb)->end_seq; |
16483 |
++ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { |
16484 |
++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { |
16485 |
++ /* All the bits are present. Don't reinject */ |
16486 |
++ __kfree_skb(skb); |
16487 |
++ return; |
16488 |
++ } |
16489 |
++ if (seq == TCP_SKB_CB(skb1)->seq) { |
16490 |
++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) |
16491 |
++ skb1 = NULL; |
16492 |
++ else |
16493 |
++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); |
16494 |
++ } |
16495 |
++ } |
16496 |
++ if (!skb1) |
16497 |
++ __skb_queue_head(&mpcb->reinject_queue, skb); |
16498 |
++ else |
16499 |
++ __skb_queue_after(&mpcb->reinject_queue, skb1, skb); |
16500 |
++ |
16501 |
++ /* And clean segments covered by new one as whole. */ |
16502 |
++ while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) { |
16503 |
++ skb1 = skb_queue_next(&mpcb->reinject_queue, skb); |
16504 |
++ |
16505 |
++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) |
16506 |
++ break; |
16507 |
++ |
16508 |
++ __skb_unlink(skb1, &mpcb->reinject_queue); |
16509 |
++ __kfree_skb(skb1); |
16510 |
++ } |
16511 |
++ return; |
16512 |
++} |
16513 |
++ |
16514 |
++/* Inserts data into the reinject queue */ |
16515 |
++void mptcp_reinject_data(struct sock *sk, int clone_it) |
16516 |
++{ |
16517 |
++ struct sk_buff *skb_it, *tmp; |
16518 |
++ struct tcp_sock *tp = tcp_sk(sk); |
16519 |
++ struct sock *meta_sk = tp->meta_sk; |
16520 |
++ |
16521 |
++ /* It has already been closed - there is really no point in reinjecting */ |
16522 |
++ if (meta_sk->sk_state == TCP_CLOSE) |
16523 |
++ return; |
16524 |
++ |
16525 |
++ skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) { |
16526 |
++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it); |
16527 |
++ /* Subflow syn's and fin's are not reinjected. |
16528 |
++ * |
16529 |
++ * Nor are empty subflow-fins that carry a data-fin; |
16530 |
++ * those are reinjected below (without the subflow-fin-flag). |
16531 |
++ */ |
16532 |
++ if (tcb->tcp_flags & TCPHDR_SYN || |
16533 |
++ (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) || |
16534 |
++ (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len)) |
16535 |
++ continue; |
16536 |
++ |
16537 |
++ __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it); |
16538 |
++ } |
16539 |
++ |
16540 |
++ skb_it = tcp_write_queue_tail(meta_sk); |
16541 |
++ /* If sk has sent the empty data-fin, we have to reinject it too. */ |
16542 |
++ if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 && |
16543 |
++ TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) { |
16544 |
++ __mptcp_reinject_data(skb_it, meta_sk, NULL, 1); |
16545 |
++ } |
16546 |
++ |
16547 |
++ mptcp_push_pending_frames(meta_sk); |
16548 |
++ |
16549 |
++ tp->pf = 1; |
16550 |
++} |
16551 |
++EXPORT_SYMBOL(mptcp_reinject_data); |
16552 |
++ |
16553 |
++static void mptcp_combine_dfin(const struct sk_buff *skb, const struct sock *meta_sk, |
16554 |
++ struct sock *subsk) |
16555 |
++{ |
16556 |
++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
16557 |
++ struct mptcp_cb *mpcb = meta_tp->mpcb; |
16558 |
++ struct sock *sk_it; |
16559 |
++ int all_empty = 1, all_acked; |
16560 |
++ |
16561 |
++ /* In infinite mapping we always try to combine */ |
16562 |
++ if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) { |
16563 |
++ subsk->sk_shutdown |= SEND_SHUTDOWN; |
16564 |
++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; |
16565 |
++ return; |
16566 |
++ } |
16567 |
++ |
16568 |
++ /* Don't combine if the peer didn't combine - otherwise we end up in |
16569 |
++ * TIME_WAIT, even if our app is smart enough to avoid it |
16570 |
++ */ |
16571 |
++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) { |
16572 |
++ if (!mpcb->dfin_combined) |
16573 |
++ return; |
16574 |
++ } |
16575 |
++ |
16576 |
++ /* If no other subflow has data to send, we can combine */ |
16577 |
++ mptcp_for_each_sk(mpcb, sk_it) { |
16578 |
++ if (!mptcp_sk_can_send(sk_it)) |
16579 |
++ continue; |
16580 |
++ |
16581 |
++ if (!tcp_write_queue_empty(sk_it)) |
16582 |
++ all_empty = 0; |
16583 |
++ } |
16584 |
++ |
16585 |
++ /* If all data has been DATA_ACKed, we can combine. |
16586 |
++ * -1, because the data_fin consumed one byte |
16587 |
++ */ |
16588 |
++ all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1)); |
16589 |
++ |
16590 |
++ if ((all_empty || all_acked) && tcp_close_state(subsk)) { |
16591 |
++ subsk->sk_shutdown |= SEND_SHUTDOWN; |
16592 |
++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; |
16593 |
++ } |
16594 |
++} |
16595 |
++ |
16596 |
++static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb, |
16597 |
++ __be32 *ptr) |
16598 |
++{ |
16599 |
++ const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
16600 |
++ __be32 *start = ptr; |
16601 |
++ __u16 data_len; |
16602 |
++ |
16603 |
++ *ptr++ = htonl(tcb->seq); /* data_seq */ |
16604 |
++ |
16605 |
++ /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */ |
16606 |
++ if (mptcp_is_data_fin(skb) && skb->len == 0) |
16607 |
++ *ptr++ = 0; /* subseq */ |
16608 |
++ else |
16609 |
++ *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */ |
16610 |
++ |
16611 |
++ if (tcb->mptcp_flags & MPTCPHDR_INF) |
16612 |
++ data_len = 0; |
16613 |
++ else |
16614 |
++ data_len = tcb->end_seq - tcb->seq; |
16615 |
++ |
16616 |
++ if (tp->mpcb->dss_csum && data_len) { |
16617 |
++ __be16 *p16 = (__be16 *)ptr; |
16618 |
++ __be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb); |
16619 |
++ __wsum csum; |
16620 |
++ |
16621 |
++ *ptr = htonl(((data_len) << 16) | |
16622 |
++ (TCPOPT_EOL << 8) | |
16623 |
++ (TCPOPT_EOL)); |
16624 |
++ csum = csum_partial(ptr - 2, 12, skb->csum); |
16625 |
++ p16++; |
16626 |
++ *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum)); |
16627 |
++ } else { |
16628 |
++ *ptr++ = htonl(((data_len) << 16) | |
16629 |
++ (TCPOPT_NOP << 8) | |
16630 |
++ (TCPOPT_NOP)); |
16631 |
++ } |
16632 |
++ |
16633 |
++ return ptr - start; |
16634 |
++} |
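A note on the checksum branch above (a reading of the code against RFC 6824, not part of the patch): the DSS checksum is computed over the payload (seeded from skb->csum) plus the DSS pseudo-header - the 64-bit data sequence number (hdseq contributes the high-order 32 bits), the 32-bit subflow sequence number, the 16-bit data-level length, and a zero checksum field, which is what the two TCPOPT_EOL bytes temporarily provide. The folded result then overwrites those two bytes via p16.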
16635 |
++ |
16636 |
++static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb, |
16637 |
++ __be32 *ptr) |
16638 |
++{ |
16639 |
++ struct mp_dss *mdss = (struct mp_dss *)ptr; |
16640 |
++ __be32 *start = ptr; |
16641 |
++ |
16642 |
++ mdss->kind = TCPOPT_MPTCP; |
16643 |
++ mdss->sub = MPTCP_SUB_DSS; |
16644 |
++ mdss->rsv1 = 0; |
16645 |
++ mdss->rsv2 = 0; |
16646 |
++ mdss->F = mptcp_is_data_fin(skb) ? 1 : 0; |
16647 |
++ mdss->m = 0; |
16648 |
++ mdss->M = mptcp_is_data_seq(skb) ? 1 : 0; |
16649 |
++ mdss->a = 0; |
16650 |
++ mdss->A = 1; |
16651 |
++ mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum); |
16652 |
++ ptr++; |
16653 |
++ |
16654 |
++ *ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt); |
16655 |
++ |
16656 |
++ return ptr - start; |
16657 |
++} |
16658 |
++ |
16659 |
++/* RFC6824 states that once a particular subflow mapping has been sent |
16660 |
++ * out it must never be changed. However, packets may be split while |
16661 |
++ * they are in the retransmission queue (due to SACK or ACKs) and that |
16662 |
++ * arguably means that we would change the mapping (e.g. it splits it, |
16663 |
++ * or sends out a subset of the initial mapping). |
16664 |
++ * |
16665 |
++ * Furthermore, the skb checksum is not always preserved across splits |
16666 |
++ * (e.g. mptcp_fragment) which would mean that we need to recompute |
16667 |
++ * the DSS checksum in this case. |
16668 |
++ * |
16669 |
++ * To avoid this we save the initial DSS mapping which allows us to |
16670 |
++ * send the same DSS mapping even for fragmented retransmits. |
16671 |
++ */ |
16672 |
++static void mptcp_save_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb) |
16673 |
++{ |
16674 |
++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
16675 |
++ __be32 *ptr = (__be32 *)tcb->dss; |
16676 |
++ |
16677 |
++ tcb->mptcp_flags |= MPTCPHDR_SEQ; |
16678 |
++ |
16679 |
++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr); |
16680 |
++ ptr += mptcp_write_dss_mapping(tp, skb, ptr); |
16681 |
++} |
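In other words: the full option block is rendered once into tcb->dss here, and mptcp_write_dss_data_seq() below replays those bytes verbatim on every (re)transmission, refreshing only the data_ack word. That is what keeps the announced mapping immutable even when the skb is later fragmented.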
16682 |
++ |
16683 |
++/* Write the saved DSS mapping to the header */ |
16684 |
++static int mptcp_write_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb, |
16685 |
++ __be32 *ptr) |
16686 |
++{ |
16687 |
++ __be32 *start = ptr; |
16688 |
++ |
16689 |
++ memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len); |
16690 |
++ |
16691 |
++ /* update the data_ack */ |
16692 |
++ start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt); |
16693 |
++ |
16694 |
++ /* dss is in a union with inet_skb_parm and |
16695 |
++ * the IP layer expects zeroed IPCB fields. |
16696 |
++ */ |
16697 |
++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len); |
16698 |
++ |
16699 |
++ return mptcp_dss_len/sizeof(*ptr); |
16700 |
++} |
16701 |
++ |
16702 |
++static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject) |
16703 |
++{ |
16704 |
++ struct tcp_sock *tp = tcp_sk(sk); |
16705 |
++ const struct sock *meta_sk = mptcp_meta_sk(sk); |
16706 |
++ const struct mptcp_cb *mpcb = tp->mpcb; |
16707 |
++ struct tcp_skb_cb *tcb; |
16708 |
++ struct sk_buff *subskb = NULL; |
16709 |
++ |
16710 |
++ if (!reinject) |
16711 |
++ TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ? |
16712 |
++ MPTCPHDR_SEQ64_INDEX : 0); |
16713 |
++ |
16714 |
++ subskb = pskb_copy_for_clone(skb, GFP_ATOMIC); |
16715 |
++ if (!subskb) |
16716 |
++ return false; |
16717 |
++ |
16718 |
++ /* At the subflow-level we need to call tcp_init_tso_segs again. We |
16719 |
++ * force this, by setting gso_segs to 0. It has been set to 1 prior to |
16720 |
++ * the call to mptcp_skb_entail. |
16721 |
++ */ |
16722 |
++ skb_shinfo(subskb)->gso_segs = 0; |
16723 |
++ |
16724 |
++ TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index); |
16725 |
++ |
16726 |
++ if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) && |
16727 |
++ skb->ip_summed == CHECKSUM_PARTIAL) { |
16728 |
++ subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0); |
16729 |
++ subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE; |
16730 |
++ } |
16731 |
++ |
16732 |
++ tcb = TCP_SKB_CB(subskb); |
16733 |
++ |
16734 |
++ if (tp->mpcb->send_infinite_mapping && |
16735 |
++ !tp->mpcb->infinite_mapping_snd && |
16736 |
++ !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) { |
16737 |
++ tp->mptcp->fully_established = 1; |
16738 |
++ tp->mpcb->infinite_mapping_snd = 1; |
16739 |
++ tp->mptcp->infinite_cutoff_seq = tp->write_seq; |
16740 |
++ tcb->mptcp_flags |= MPTCPHDR_INF; |
16741 |
++ } |
16742 |
++ |
16743 |
++ if (mptcp_is_data_fin(subskb)) |
16744 |
++ mptcp_combine_dfin(subskb, meta_sk, sk); |
16745 |
++ |
16746 |
++ mptcp_save_dss_data_seq(tp, subskb); |
16747 |
++ |
16748 |
++ tcb->seq = tp->write_seq; |
16749 |
++ tcb->sacked = 0; /* reset the sacked field: from the point of view |
16750 |
++ * of this subflow, we are sending a brand new |
16751 |
++ * segment |
16752 |
++ */ |
16753 |
++ /* Take into account seg len */ |
16754 |
++ tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0); |
16755 |
++ tcb->end_seq = tp->write_seq; |
16756 |
++ |
16757 |
++ /* If it's a non-payload DATA_FIN (also no subflow-fin), the |
16758 |
++ * segment is not part of the subflow but on a meta-only-level. |
16759 |
++ */ |
16760 |
++ if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) { |
16761 |
++ tcp_add_write_queue_tail(sk, subskb); |
16762 |
++ sk->sk_wmem_queued += subskb->truesize; |
16763 |
++ sk_mem_charge(sk, subskb->truesize); |
16764 |
++ } else { |
16765 |
++ int err; |
16766 |
++ |
16767 |
++ /* Necessary to initialize for tcp_transmit_skb. mss of 1, as |
16768 |
++ * skb->len = 0 will force tso_segs to 1. |
16769 |
++ */ |
16770 |
++ tcp_init_tso_segs(sk, subskb, 1); |
16771 |
++ /* Empty data-fins are sent immediately on the subflow */ |
16772 |
++ TCP_SKB_CB(subskb)->when = tcp_time_stamp; |
16773 |
++ err = tcp_transmit_skb(sk, subskb, 1, GFP_ATOMIC); |
16774 |
++ |
16775 |
++ /* It has not been queued, we can free it now. */ |
16776 |
++ kfree_skb(subskb); |
16777 |
++ |
16778 |
++ if (err) |
16779 |
++ return false; |
16780 |
++ } |
16781 |
++ |
16782 |
++ if (!tp->mptcp->fully_established) { |
16783 |
++ tp->mptcp->second_packet = 1; |
16784 |
++ tp->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq; |
16785 |
++ } |
16786 |
++ |
16787 |
++ return true; |
16788 |
++} |
16789 |
++ |
16790 |
++/* Fragment an skb and update the mptcp meta-data. Due to reinject, we |
16791 |
++ * might need to undo some operations done by tcp_fragment. |
16792 |
++ */ |
16793 |
++static int mptcp_fragment(struct sock *meta_sk, struct sk_buff *skb, u32 len, |
16794 |
++ gfp_t gfp, int reinject) |
16795 |
++{ |
16796 |
++ int ret, diff, old_factor; |
16797 |
++ struct sk_buff *buff; |
16798 |
++ u8 flags; |
16799 |
++ |
16800 |
++ if (skb_headlen(skb) < len) |
16801 |
++ diff = skb->len - len; |
16802 |
++ else |
16803 |
++ diff = skb->data_len; |
16804 |
++ old_factor = tcp_skb_pcount(skb); |
16805 |
++ |
16806 |
++ /* The mss_now in tcp_fragment is used to set the tso_segs of the skb. |
16807 |
++ * At the MPTCP-level we do not care about the absolute value. All we |
16808 |
++ * care about is that it is set to 1 for accurate packets_out |
16809 |
++ * accounting. |
16810 |
++ */ |
16811 |
++ ret = tcp_fragment(meta_sk, skb, len, UINT_MAX, gfp); |
16812 |
++ if (ret) |
16813 |
++ return ret; |
16814 |
++ |
16815 |
++ buff = skb->next; |
16816 |
++ |
16817 |
++ flags = TCP_SKB_CB(skb)->mptcp_flags; |
16818 |
++ TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN); |
16819 |
++ TCP_SKB_CB(buff)->mptcp_flags = flags; |
16820 |
++ TCP_SKB_CB(buff)->path_mask = TCP_SKB_CB(skb)->path_mask; |
16821 |
++ |
16822 |
++ /* If reinject == 1, the buff will be added to the reinject |
16823 |
++ * queue, which is currently not part of memory accounting. So |
16824 |
++ * undo the changes done by tcp_fragment and update the |
16825 |
++ * reinject queue. Also, undo changes to the packet counters. |
16826 |
++ */ |
16827 |
++ if (reinject == 1) { |
16828 |
++ int undo = buff->truesize - diff; |
16829 |
++ meta_sk->sk_wmem_queued -= undo; |
16830 |
++ sk_mem_uncharge(meta_sk, undo); |
16831 |
++ |
16832 |
++ tcp_sk(meta_sk)->mpcb->reinject_queue.qlen++; |
16833 |
++ meta_sk->sk_write_queue.qlen--; |
16834 |
++ |
16835 |
++ if (!before(tcp_sk(meta_sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { |
16836 |
++ undo = old_factor - tcp_skb_pcount(skb) - |
16837 |
++ tcp_skb_pcount(buff); |
16838 |
++ if (undo) |
16839 |
++ tcp_adjust_pcount(meta_sk, skb, -undo); |
16840 |
++ } |
16841 |
++ } |
16842 |
++ |
16843 |
++ return 0; |
16844 |
++} |
16845 |
++ |
16846 |
++/* Inspired by tcp_write_wakeup */ |
16847 |
++int mptcp_write_wakeup(struct sock *meta_sk) |
16848 |
++{ |
16849 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
16850 |
++ struct sk_buff *skb; |
16851 |
++ struct sock *sk_it; |
16852 |
++ int ans = 0; |
16853 |
++ |
16854 |
++ if (meta_sk->sk_state == TCP_CLOSE) |
16855 |
++ return -1; |
16856 |
++ |
16857 |
++ skb = tcp_send_head(meta_sk); |
16858 |
++ if (skb && |
16859 |
++ before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) { |
16860 |
++ unsigned int mss; |
16861 |
++ unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq; |
16862 |
++ struct sock *subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, true); |
16863 |
++ struct tcp_sock *subtp; |
16864 |
++ if (!subsk) |
16865 |
++ goto window_probe; |
16866 |
++ subtp = tcp_sk(subsk); |
16867 |
++ mss = tcp_current_mss(subsk); |
16868 |
++ |
16869 |
++ seg_size = min(tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq, |
16870 |
++ tcp_wnd_end(subtp) - subtp->write_seq); |
16871 |
++ |
16872 |
++ if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) |
16873 |
++ meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; |
16874 |
++ |
16875 |
++ /* We are probing the opening of a window |
16876 |
++ * but the window size is != 0 |
16877 |
++ * must have been a result of SWS avoidance (sender). |
16878 |
++ */ |
16879 |
++ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || |
16880 |
++ skb->len > mss) { |
16881 |
++ seg_size = min(seg_size, mss); |
16882 |
++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; |
16883 |
++ if (mptcp_fragment(meta_sk, skb, seg_size, |
16884 |
++ GFP_ATOMIC, 0)) |
16885 |
++ return -1; |
16886 |
++ } else if (!tcp_skb_pcount(skb)) { |
16887 |
++ /* see mptcp_write_xmit on why we use UINT_MAX */ |
16888 |
++ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX); |
16889 |
++ } |
16890 |
++ |
16891 |
++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; |
16892 |
++ if (!mptcp_skb_entail(subsk, skb, 0)) |
16893 |
++ return -1; |
16894 |
++ TCP_SKB_CB(skb)->when = tcp_time_stamp; |
16895 |
++ |
16896 |
++ mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq - |
16897 |
++ TCP_SKB_CB(skb)->seq); |
16898 |
++ tcp_event_new_data_sent(meta_sk, skb); |
16899 |
++ |
16900 |
++ __tcp_push_pending_frames(subsk, mss, TCP_NAGLE_PUSH); |
16901 |
++ |
16902 |
++ return 0; |
16903 |
++ } else { |
16904 |
++window_probe: |
16905 |
++ if (between(meta_tp->snd_up, meta_tp->snd_una + 1, |
16906 |
++ meta_tp->snd_una + 0xFFFF)) { |
16907 |
++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) { |
16908 |
++ if (mptcp_sk_can_send_ack(sk_it)) |
16909 |
++ tcp_xmit_probe_skb(sk_it, 1); |
16910 |
++ } |
16911 |
++ } |
16912 |
++ |
16913 |
++ /* At least one of the tcp_xmit_probe_skb's has to succeed */ |
16914 |
++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) { |
16915 |
++ int ret; |
16916 |
++ |
16917 |
++ if (!mptcp_sk_can_send_ack(sk_it)) |
16918 |
++ continue; |
16919 |
++ |
16920 |
++ ret = tcp_xmit_probe_skb(sk_it, 0); |
16921 |
++ if (unlikely(ret > 0)) |
16922 |
++ ans = ret; |
16923 |
++ } |
16924 |
++ return ans; |
16925 |
++ } |
16926 |
++} |
16927 |
++ |
16928 |
++bool mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle, |
16929 |
++ int push_one, gfp_t gfp) |
16930 |
++{ |
16931 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp; |
16932 |
++ struct sock *subsk = NULL; |
16933 |
++ struct mptcp_cb *mpcb = meta_tp->mpcb; |
16934 |
++ struct sk_buff *skb; |
16935 |
++ unsigned int sent_pkts; |
16936 |
++ int reinject = 0; |
16937 |
++ unsigned int sublimit; |
16938 |
++ |
16939 |
++ sent_pkts = 0; |
16940 |
++ |
16941 |
++ while ((skb = mpcb->sched_ops->next_segment(meta_sk, &reinject, &subsk, |
16942 |
++ &sublimit))) { |
16943 |
++ unsigned int limit; |
16944 |
++ |
16945 |
++ subtp = tcp_sk(subsk); |
16946 |
++ mss_now = tcp_current_mss(subsk); |
16947 |
++ |
16948 |
++ if (reinject == 1) { |
16949 |
++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { |
16950 |
++ /* Segment already reached the peer, take the next one */ |
16951 |
++ __skb_unlink(skb, &mpcb->reinject_queue); |
16952 |
++ __kfree_skb(skb); |
16953 |
++ continue; |
16954 |
++ } |
16955 |
++ } |
16956 |
++ |
16957 |
++ /* If the segment was cloned (e.g. a meta retransmission), |
16958 |
++ * the header must be expanded/copied so that there is no |
16959 |
++ * corruption of TSO information. |
16960 |
++ */ |
16961 |
++ if (skb_unclone(skb, GFP_ATOMIC)) |
16962 |
++ break; |
16963 |
++ |
16964 |
++ if (unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now))) |
16965 |
++ break; |
16966 |
++ |
16967 |
++ /* Force tso_segs to 1 by using UINT_MAX. |
16968 |
++ * We actually don't care about the exact number of segments |
16969 |
++ * emitted on the subflow. We need just to set tso_segs, because |
16970 |
++ * we still need an accurate packets_out count in |
16971 |
++ * tcp_event_new_data_sent. |
16972 |
++ */ |
16973 |
++ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX); |
16974 |
++ |
16975 |
++ /* Check for Nagle, regardless of tso_segs. If the segment is |
16976 |
++ * actually larger than mss_now (TSO segment), then |
16977 |
++ * tcp_nagle_check will have partial == false and always trigger |
16978 |
++ * the transmission. |
16979 |
++ * tcp_write_xmit has a TSO-level Nagle check of its own; it is |
16980 |
++ * based on the properties of the subflow and therefore does not |
16981 |
++ * apply at the MPTCP level. |
16982 |
++ */ |
16983 |
++ if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now, |
16984 |
++ (tcp_skb_is_last(meta_sk, skb) ? |
16985 |
++ nonagle : TCP_NAGLE_PUSH)))) |
16986 |
++ break; |
16987 |
++ |
16988 |
++ limit = mss_now; |
16989 |
++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in |
16990 |
++ * tcp_write_xmit. Otherwise split-point would return 0. |
16991 |
++ */ |
16992 |
++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp)) |
16993 |
++ /* We limit the size of the skb so that it fits into the |
16994 |
++ * window. Call tcp_mss_split_point to avoid duplicating |
16995 |
++ * code. |
16996 |
++ * We really only care about fitting the skb into the |
16997 |
++ * window. That's why we use UINT_MAX. If the skb does |
16998 |
++ * not fit into the cwnd_quota or the NIC's max-segs |
16999 |
++ * limitation, it will be split by the subflow's |
17000 |
++ * tcp_write_xmit which does the appropriate call to |
17001 |
++ * tcp_mss_split_point. |
17002 |
++ */ |
17003 |
++ limit = tcp_mss_split_point(meta_sk, skb, mss_now, |
17004 |
++ UINT_MAX / mss_now, |
17005 |
++ nonagle); |
17006 |
++ |
17007 |
++ if (sublimit) |
17008 |
++ limit = min(limit, sublimit); |
17009 |
++ |
17010 |
++ if (skb->len > limit && |
17011 |
++ unlikely(mptcp_fragment(meta_sk, skb, limit, gfp, reinject))) |
17012 |
++ break; |
17013 |
++ |
17014 |
++ if (!mptcp_skb_entail(subsk, skb, reinject)) |
17015 |
++ break; |
17016 |
++ /* Nagle is handled at the MPTCP-layer, so |
17017 |
++ * always push on the subflow |
17018 |
++ */ |
17019 |
++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH); |
17020 |
++ TCP_SKB_CB(skb)->when = tcp_time_stamp; |
17021 |
++ |
17022 |
++ if (!reinject) { |
17023 |
++ mptcp_check_sndseq_wrap(meta_tp, |
17024 |
++ TCP_SKB_CB(skb)->end_seq - |
17025 |
++ TCP_SKB_CB(skb)->seq); |
17026 |
++ tcp_event_new_data_sent(meta_sk, skb); |
17027 |
++ } |
17028 |
++ |
17029 |
++ tcp_minshall_update(meta_tp, mss_now, skb); |
17030 |
++ sent_pkts += tcp_skb_pcount(skb); |
17031 |
++ |
17032 |
++ if (reinject > 0) { |
17033 |
++ __skb_unlink(skb, &mpcb->reinject_queue); |
17034 |
++ kfree_skb(skb); |
17035 |
++ } |
17036 |
++ |
17037 |
++ if (push_one) |
17038 |
++ break; |
17039 |
++ } |
17040 |
++ |
17041 |
++ return !meta_tp->packets_out && tcp_send_head(meta_sk); |
17042 |
++} |
17043 |
++ |
17044 |
++void mptcp_write_space(struct sock *sk) |
17045 |
++{ |
17046 |
++ mptcp_push_pending_frames(mptcp_meta_sk(sk)); |
17047 |
++} |
17048 |
++ |
17049 |
++u32 __mptcp_select_window(struct sock *sk) |
17050 |
++{ |
17051 |
++ struct inet_connection_sock *icsk = inet_csk(sk); |
17052 |
++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); |
17053 |
++ int mss, free_space, full_space, window; |
17054 |
++ |
17055 |
++ /* MSS for the peer's data. Previous versions used mss_clamp |
17056 |
++ * here. I don't know if the value based on our guesses |
17057 |
++ * of peer's MSS is better for the performance. It's more correct |
17058 |
++ * but may be worse for the performance because of rcv_mss |
17059 |
++ * fluctuations. --SAW 1998/11/1 |
17060 |
++ */ |
17061 |
++ mss = icsk->icsk_ack.rcv_mss; |
17062 |
++ free_space = tcp_space(sk); |
17063 |
++ full_space = min_t(int, meta_tp->window_clamp, |
17064 |
++ tcp_full_space(sk)); |
17065 |
++ |
17066 |
++ if (mss > full_space) |
17067 |
++ mss = full_space; |
17068 |
++ |
17069 |
++ if (free_space < (full_space >> 1)) { |
17070 |
++ icsk->icsk_ack.quick = 0; |
17071 |
++ |
17072 |
++ if (tcp_memory_pressure) |
17073 |
++ /* TODO this has to be adapted when we support different |
17074 |
++ * MSS's among the subflows. |
17075 |
++ */ |
17076 |
++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh, |
17077 |
++ 4U * meta_tp->advmss); |
17078 |
++ |
17079 |
++ if (free_space < mss) |
17080 |
++ return 0; |
17081 |
++ } |
17082 |
++ |
17083 |
++ if (free_space > meta_tp->rcv_ssthresh) |
17084 |
++ free_space = meta_tp->rcv_ssthresh; |
17085 |
++ |
17086 |
++ /* Don't do rounding if we are using window scaling, since the |
17087 |
++ * scaled window will not line up with the MSS boundary anyway. |
17088 |
++ */ |
17089 |
++ window = meta_tp->rcv_wnd; |
17090 |
++ if (tp->rx_opt.rcv_wscale) { |
17091 |
++ window = free_space; |
17092 |
++ |
17093 |
++ /* Advertise enough space so that it won't get scaled away. |
17094 |
++ * Important case: prevent a zero window announcement if |
17095 |
++ * 1<<rcv_wscale > mss. |
17096 |
++ */ |
17097 |
++ if (((window >> tp->rx_opt.rcv_wscale) << tp-> |
17098 |
++ rx_opt.rcv_wscale) != window) |
17099 |
++ window = (((window >> tp->rx_opt.rcv_wscale) + 1) |
17100 |
++ << tp->rx_opt.rcv_wscale); |
17101 |
++ } else { |
17102 |
++ /* Get the largest window that is a nice multiple of mss. |
17103 |
++ * Window clamp already applied above. |
17104 |
++ * If our current window offering is within 1 mss of the |
17105 |
++ * free space we just keep it. This prevents the divide |
17106 |
++ * and multiply from happening most of the time. |
17107 |
++ * We also don't do any window rounding when the free space |
17108 |
++ * is too small. |
17109 |
++ */ |
17110 |
++ if (window <= free_space - mss || window > free_space) |
17111 |
++ window = (free_space / mss) * mss; |
17112 |
++ else if (mss == full_space && |
17113 |
++ free_space > window + (full_space >> 1)) |
17114 |
++ window = free_space; |
17115 |
++ } |
17116 |
++ |
17117 |
++ return window; |
17118 |
++} |
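A worked example of the no-window-scaling branch (illustrative numbers only, not from the patch): with mss = 1460 and free_space = 10000, and the current offer outside the (free_space - mss, free_space] band, the advertisement is rounded down to (10000 / 1460) * 1460 = 8760, so it moves in whole-MSS steps, and the divide/multiply is skipped whenever the old window is already within one MSS of the free space.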
17119 |
++ |
17120 |
++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts, |
17121 |
++ unsigned *remaining) |
17122 |
++{ |
17123 |
++ const struct tcp_sock *tp = tcp_sk(sk); |
17124 |
++ |
17125 |
++ opts->options |= OPTION_MPTCP; |
17126 |
++ if (is_master_tp(tp)) { |
17127 |
++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN; |
17128 |
++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; |
17129 |
++ opts->mp_capable.sender_key = tp->mptcp_loc_key; |
17130 |
++ opts->dss_csum = !!sysctl_mptcp_checksum; |
17131 |
++ } else { |
17132 |
++ const struct mptcp_cb *mpcb = tp->mpcb; |
17133 |
++ |
17134 |
++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN; |
17135 |
++ *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN; |
17136 |
++ opts->mp_join_syns.token = mpcb->mptcp_rem_token; |
17137 |
++ opts->mp_join_syns.low_prio = tp->mptcp->low_prio; |
17138 |
++ opts->addr_id = tp->mptcp->loc_id; |
17139 |
++ opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce; |
17140 |
++ } |
17141 |
++} |
17142 |
++ |
17143 |
++void mptcp_synack_options(struct request_sock *req, |
17144 |
++ struct tcp_out_options *opts, unsigned *remaining) |
17145 |
++{ |
17146 |
++ struct mptcp_request_sock *mtreq; |
17147 |
++ mtreq = mptcp_rsk(req); |
17148 |
++ |
17149 |
++ opts->options |= OPTION_MPTCP; |
17150 |
++ /* MPCB not yet set - thus it's a new MPTCP-session */ |
17151 |
++ if (!mtreq->is_sub) { |
17152 |
++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK; |
17153 |
++ opts->mp_capable.sender_key = mtreq->mptcp_loc_key; |
17154 |
++ opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum; |
17155 |
++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; |
17156 |
++ } else { |
17157 |
++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK; |
17158 |
++ opts->mp_join_syns.sender_truncated_mac = |
17159 |
++ mtreq->mptcp_hash_tmac; |
17160 |
++ opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce; |
17161 |
++ opts->mp_join_syns.low_prio = mtreq->low_prio; |
17162 |
++ opts->addr_id = mtreq->loc_id; |
17163 |
++ *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN; |
17164 |
++ } |
17165 |
++} |
17166 |
++ |
17167 |
++void mptcp_established_options(struct sock *sk, struct sk_buff *skb, |
17168 |
++ struct tcp_out_options *opts, unsigned *size) |
17169 |
++{ |
17170 |
++ struct tcp_sock *tp = tcp_sk(sk); |
17171 |
++ struct mptcp_cb *mpcb = tp->mpcb; |
17172 |
++ const struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; |
17173 |
++ |
17174 |
++ /* We are coming from tcp_current_mss with the meta_sk as an argument. |
17175 |
++ * It does not make sense to check for the options, because when the |
17176 |
++ * segment gets sent, another subflow will be chosen. |
17177 |
++ */ |
17178 |
++ if (!skb && is_meta_sk(sk)) |
17179 |
++ return; |
17180 |
++ |
17181 |
++ /* In fallback mp_fail-mode, we have to repeat it until the fallback |
17182 |
++ * has been done by the sender |
17183 |
++ */ |
17184 |
++ if (unlikely(tp->mptcp->send_mp_fail)) { |
17185 |
++ opts->options |= OPTION_MPTCP; |
17186 |
++ opts->mptcp_options |= OPTION_MP_FAIL; |
17187 |
++ *size += MPTCP_SUB_LEN_FAIL; |
17188 |
++ return; |
17189 |
++ } |
17190 |
++ |
17191 |
++ if (unlikely(tp->send_mp_fclose)) { |
17192 |
++ opts->options |= OPTION_MPTCP; |
17193 |
++ opts->mptcp_options |= OPTION_MP_FCLOSE; |
17194 |
++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; |
17195 |
++ *size += MPTCP_SUB_LEN_FCLOSE_ALIGN; |
17196 |
++ return; |
17197 |
++ } |
17198 |
++ |
17199 |
++ /* 1. If we are the sender of the infinite-mapping, we need the |
17200 |
++ * MPTCPHDR_INF-flag, because a retransmission of the |
17201 |
++ * infinite-announcement still needs the mptcp-option. |
17202 |
++ * |
17203 |
++ * We need infinite_cutoff_seq, because retransmissions from before |
17204 |
++ * the infinite-cutoff-moment still need the MPTCP-signalling to stay |
17205 |
++ * consistent. |
17206 |
++ * |
17207 |
++ * 2. If we are the receiver of the infinite-mapping, we always skip |
17208 |
++ * mptcp-options, because acknowledgments from before the |
17209 |
++ * infinite-mapping point have already been sent out. |
17210 |
++ * |
17211 |
++ * I know, the whole infinite-mapping stuff is ugly... |
17212 |
++ * |
17213 |
++ * TODO: Handle wrapped data-sequence numbers |
17214 |
++ * (even if it's very unlikely) |
17215 |
++ */ |
17216 |
++ if (unlikely(mpcb->infinite_mapping_snd) && |
17217 |
++ ((mpcb->send_infinite_mapping && tcb && |
17218 |
++ mptcp_is_data_seq(skb) && |
17219 |
++ !(tcb->mptcp_flags & MPTCPHDR_INF) && |
17220 |
++ !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) || |
17221 |
++ !mpcb->send_infinite_mapping)) |
17222 |
++ return; |
17223 |
++ |
17224 |
++ if (unlikely(tp->mptcp->include_mpc)) { |
17225 |
++ opts->options |= OPTION_MPTCP; |
17226 |
++ opts->mptcp_options |= OPTION_MP_CAPABLE | |
17227 |
++ OPTION_TYPE_ACK; |
17228 |
++ *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN; |
17229 |
++ opts->mp_capable.sender_key = mpcb->mptcp_loc_key; |
17230 |
++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; |
17231 |
++ opts->dss_csum = mpcb->dss_csum; |
17232 |
++ |
17233 |
++ if (skb) |
17234 |
++ tp->mptcp->include_mpc = 0; |
17235 |
++ } |
17236 |
++ if (unlikely(tp->mptcp->pre_established)) { |
17237 |
++ opts->options |= OPTION_MPTCP; |
17238 |
++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK; |
17239 |
++ *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN; |
17240 |
++ } |
17241 |
++ |
17242 |
++ if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) { |
17243 |
++ opts->options |= OPTION_MPTCP; |
17244 |
++ opts->mptcp_options |= OPTION_DATA_ACK; |
17245 |
++ /* If !skb, we come from tcp_current_mss and thus we always |
17246 |
++ * assume that the DSS-option will be set for the data-packet. |
17247 |
++ */ |
17248 |
++ if (skb && !mptcp_is_data_seq(skb)) { |
17249 |
++ *size += MPTCP_SUB_LEN_ACK_ALIGN; |
17250 |
++ } else { |
17251 |
++ /* Doesn't matter, if csum included or not. It will be |
17252 |
++ * either 10 or 12, and thus aligned = 12 |
17253 |
++ */ |
17254 |
++ *size += MPTCP_SUB_LEN_ACK_ALIGN + |
17255 |
++ MPTCP_SUB_LEN_SEQ_ALIGN; |
17256 |
++ } |
17257 |
++ |
17258 |
++ *size += MPTCP_SUB_LEN_DSS_ALIGN; |
17259 |
++ } |
17260 |
++ |
17261 |
++ if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal) |
17262 |
++ mpcb->pm_ops->addr_signal(sk, size, opts, skb); |
17263 |
++ |
17264 |
++ if (unlikely(tp->mptcp->send_mp_prio) && |
17265 |
++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) { |
17266 |
++ opts->options |= OPTION_MPTCP; |
17267 |
++ opts->mptcp_options |= OPTION_MP_PRIO; |
17268 |
++ if (skb) |
17269 |
++ tp->mptcp->send_mp_prio = 0; |
17270 |
++ *size += MPTCP_SUB_LEN_PRIO_ALIGN; |
17271 |
++ } |
17272 |
++ |
17273 |
++ return; |
17274 |
++} |
17275 |
++ |
17276 |
++u16 mptcp_select_window(struct sock *sk) |
17277 |
++{ |
17278 |
++ u16 new_win = tcp_select_window(sk); |
17279 |
++ struct tcp_sock *tp = tcp_sk(sk); |
17280 |
++ struct tcp_sock *meta_tp = mptcp_meta_tp(tp); |
17281 |
++ |
17282 |
++ meta_tp->rcv_wnd = tp->rcv_wnd; |
17283 |
++ meta_tp->rcv_wup = meta_tp->rcv_nxt; |
17284 |
++ |
17285 |
++ return new_win; |
17286 |
++} |
17287 |
++ |
17288 |
++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, |
17289 |
++ const struct tcp_out_options *opts, |
17290 |
++ struct sk_buff *skb) |
17291 |
++{ |
17292 |
++ if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) { |
17293 |
++ struct mp_capable *mpc = (struct mp_capable *)ptr; |
17294 |
++ |
17295 |
++ mpc->kind = TCPOPT_MPTCP; |
17296 |
++ |
17297 |
++ if ((OPTION_TYPE_SYN & opts->mptcp_options) || |
17298 |
++ (OPTION_TYPE_SYNACK & opts->mptcp_options)) { |
17299 |
++ mpc->sender_key = opts->mp_capable.sender_key; |
17300 |
++ mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN; |
17301 |
++ ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2; |
17302 |
++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) { |
17303 |
++ mpc->sender_key = opts->mp_capable.sender_key; |
17304 |
++ mpc->receiver_key = opts->mp_capable.receiver_key; |
17305 |
++ mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK; |
17306 |
++ ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2; |
17307 |
++ } |
17308 |
++ |
17309 |
++ mpc->sub = MPTCP_SUB_CAPABLE; |
17310 |
++ mpc->ver = 0; |
17311 |
++ mpc->a = opts->dss_csum; |
17312 |
++ mpc->b = 0; |
17313 |
++ mpc->rsv = 0; |
17314 |
++ mpc->h = 1; |
17315 |
++ } |
17316 |
++ |
17317 |
++ if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) { |
17318 |
++ struct mp_join *mpj = (struct mp_join *)ptr; |
17319 |
++ |
17320 |
++ mpj->kind = TCPOPT_MPTCP; |
17321 |
++ mpj->sub = MPTCP_SUB_JOIN; |
17322 |
++ mpj->rsv = 0; |
17323 |
++ |
17324 |
++ if (OPTION_TYPE_SYN & opts->mptcp_options) { |
17325 |
++ mpj->len = MPTCP_SUB_LEN_JOIN_SYN; |
17326 |
++ mpj->u.syn.token = opts->mp_join_syns.token; |
17327 |
++ mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce; |
17328 |
++ mpj->b = opts->mp_join_syns.low_prio; |
17329 |
++ mpj->addr_id = opts->addr_id; |
17330 |
++ ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2; |
17331 |
++ } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) { |
17332 |
++ mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK; |
17333 |
++ mpj->u.synack.mac = |
17334 |
++ opts->mp_join_syns.sender_truncated_mac; |
17335 |
++ mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce; |
17336 |
++ mpj->b = opts->mp_join_syns.low_prio; |
17337 |
++ mpj->addr_id = opts->addr_id; |
17338 |
++ ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2; |
17339 |
++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) { |
17340 |
++ mpj->len = MPTCP_SUB_LEN_JOIN_ACK; |
17341 |
++ mpj->addr_id = 0; /* addr_id is rsv (RFC 6824, p. 21) */ |
17342 |
++ memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20); |
17343 |
++ ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2; |
17344 |
++ } |
17345 |
++ } |
17346 |
++ if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) { |
17347 |
++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; |
17348 |
++ |
17349 |
++ mpadd->kind = TCPOPT_MPTCP; |
17350 |
++ if (opts->add_addr_v4) { |
17351 |
++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4; |
17352 |
++ mpadd->sub = MPTCP_SUB_ADD_ADDR; |
17353 |
++ mpadd->ipver = 4; |
17354 |
++ mpadd->addr_id = opts->add_addr4.addr_id; |
17355 |
++ mpadd->u.v4.addr = opts->add_addr4.addr; |
17356 |
++ ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2; |
17357 |
++ } else if (opts->add_addr_v6) { |
17358 |
++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6; |
17359 |
++ mpadd->sub = MPTCP_SUB_ADD_ADDR; |
17360 |
++ mpadd->ipver = 6; |
17361 |
++ mpadd->addr_id = opts->add_addr6.addr_id; |
17362 |
++ memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr, |
17363 |
++ sizeof(mpadd->u.v6.addr)); |
17364 |
++ ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2; |
17365 |
++ } |
17366 |
++ } |
17367 |
++ if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) { |
17368 |
++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; |
17369 |
++ u8 *addrs_id; |
17370 |
++ int id, len, len_align; |
17371 |
++ |
17372 |
++ len = mptcp_sub_len_remove_addr(opts->remove_addrs); |
17373 |
++ len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs); |
17374 |
++ |
17375 |
++ mprem->kind = TCPOPT_MPTCP; |
17376 |
++ mprem->len = len; |
17377 |
++ mprem->sub = MPTCP_SUB_REMOVE_ADDR; |
17378 |
++ mprem->rsv = 0; |
17379 |
++ addrs_id = &mprem->addrs_id; |
17380 |
++ |
17381 |
++ mptcp_for_each_bit_set(opts->remove_addrs, id) |
17382 |
++ *(addrs_id++) = id; |
17383 |
++ |
17384 |
++ /* Fill the rest with NOP's */ |
17385 |
++ if (len_align > len) { |
17386 |
++ int i; |
17387 |
++ for (i = 0; i < len_align - len; i++) |
17388 |
++ *(addrs_id++) = TCPOPT_NOP; |
17389 |
++ } |
17390 |
++ |
17391 |
++ ptr += len_align >> 2; |
17392 |
++ } |
17393 |
++ if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) { |
17394 |
++ struct mp_fail *mpfail = (struct mp_fail *)ptr; |
17395 |
++ |
17396 |
++ mpfail->kind = TCPOPT_MPTCP; |
17397 |
++ mpfail->len = MPTCP_SUB_LEN_FAIL; |
17398 |
++ mpfail->sub = MPTCP_SUB_FAIL; |
17399 |
++ mpfail->rsv1 = 0; |
17400 |
++ mpfail->rsv2 = 0; |
17401 |
++ mpfail->data_seq = htonll(tp->mpcb->csum_cutoff_seq); |
17402 |
++ |
17403 |
++ ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2; |
17404 |
++ } |
17405 |
++ if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) { |
17406 |
++ struct mp_fclose *mpfclose = (struct mp_fclose *)ptr; |
17407 |
++ |
17408 |
++ mpfclose->kind = TCPOPT_MPTCP; |
17409 |
++ mpfclose->len = MPTCP_SUB_LEN_FCLOSE; |
17410 |
++ mpfclose->sub = MPTCP_SUB_FCLOSE; |
17411 |
++ mpfclose->rsv1 = 0; |
17412 |
++ mpfclose->rsv2 = 0; |
17413 |
++ mpfclose->key = opts->mp_capable.receiver_key; |
17414 |
++ |
17415 |
++ ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2; |
17416 |
++ } |
17417 |
++ |
17418 |
++ if (OPTION_DATA_ACK & opts->mptcp_options) { |
17419 |
++ if (!mptcp_is_data_seq(skb)) |
17420 |
++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr); |
17421 |
++ else |
17422 |
++ ptr += mptcp_write_dss_data_seq(tp, skb, ptr); |
17423 |
++ } |
17424 |
++ if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) { |
17425 |
++ struct mp_prio *mpprio = (struct mp_prio *)ptr; |
17426 |
++ |
17427 |
++ mpprio->kind = TCPOPT_MPTCP; |
17428 |
++ mpprio->len = MPTCP_SUB_LEN_PRIO; |
17429 |
++ mpprio->sub = MPTCP_SUB_PRIO; |
17430 |
++ mpprio->rsv = 0; |
17431 |
++ mpprio->b = tp->mptcp->low_prio; |
17432 |
++ mpprio->addr_id = TCPOPT_NOP; |
17433 |
++ |
17434 |
++ ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2; |
17435 |
++ } |
17436 |
++} |
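The REMOVE_ADDR branch above computes two lengths: the real option length and its 32-bit-aligned size, filling the difference with TCPOPT_NOP bytes before advancing the word-sized option pointer. A minimal user-space sketch of that arithmetic, assuming the RFC 6824 layout of kind(1) + length(1) + subtype/reserved(1) + one byte per address-id (the bodies of the in-kernel mptcp_sub_len_remove_addr helpers are not visible in this hunk):

#include <stdio.h>

/* Assumed layout per RFC 6824: kind(1) + len(1) + sub/rsv(1) followed
 * by one byte per address id, NOP-padded to a 4-byte boundary.
 */
static int sub_len_remove_addr(unsigned int nr_ids)
{
	return 3 + nr_ids;
}

static int sub_len_remove_addr_align(unsigned int nr_ids)
{
	return (sub_len_remove_addr(nr_ids) + 3) & ~3;
}

int main(void)
{
	unsigned int n;

	for (n = 1; n <= 5; n++) {
		int len = sub_len_remove_addr(n);
		int aligned = sub_len_remove_addr_align(n);

		printf("%u id(s): len=%d aligned=%d NOPs=%d\n",
		       n, len, aligned, aligned - len);
	}
	return 0;
}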
17437 |
++ |
17438 |
++/* Sends the datafin */ |
17439 |
++void mptcp_send_fin(struct sock *meta_sk) |
17440 |
++{ |
17441 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
17442 |
++ struct sk_buff *skb = tcp_write_queue_tail(meta_sk); |
17443 |
++ int mss_now; |
17444 |
++ |
17445 |
++ if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) |
17446 |
++ meta_tp->mpcb->passive_close = 1; |
17447 |
++ |
17448 |
++ /* Optimization, tack on the FIN if we have a queue of |
17449 |
++ * unsent frames. But be careful about outgoing SACKS |
17450 |
++ * and IP options. |
17451 |
++ */ |
17452 |
++ mss_now = mptcp_current_mss(meta_sk); |
17453 |
++ |
17454 |
++ if (tcp_send_head(meta_sk) != NULL) { |
17455 |
++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN; |
17456 |
++ TCP_SKB_CB(skb)->end_seq++; |
17457 |
++ meta_tp->write_seq++; |
17458 |
++ } else { |
17459 |
++ /* Socket is locked, keep trying until memory is available. */ |
17460 |
++ for (;;) { |
17461 |
++ skb = alloc_skb_fclone(MAX_TCP_HEADER, |
17462 |
++ meta_sk->sk_allocation); |
17463 |
++ if (skb) |
17464 |
++ break; |
17465 |
++ yield(); |
17466 |
++ } |
17467 |
++ /* Reserve space for headers and prepare control bits. */ |
17468 |
++ skb_reserve(skb, MAX_TCP_HEADER); |
17469 |
++ |
17470 |
++ tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK); |
17471 |
++ TCP_SKB_CB(skb)->end_seq++; |
17472 |
++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN; |
17473 |
++ tcp_queue_skb(meta_sk, skb); |
17474 |
++ } |
17475 |
++ __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF); |
17476 |
++} |
17477 |
++ |
17478 |
++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority) |
17479 |
++{ |
17480 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
17481 |
++ struct mptcp_cb *mpcb = meta_tp->mpcb; |
17482 |
++ struct sock *sk = NULL, *sk_it = NULL, *tmpsk; |
17483 |
++ |
17484 |
++ if (!mpcb->cnt_subflows) |
17485 |
++ return; |
17486 |
++ |
17487 |
++ WARN_ON(meta_tp->send_mp_fclose); |
17488 |
++ |
17489 |
++ /* First - select a socket */ |
17490 |
++ sk = mptcp_select_ack_sock(meta_sk); |
17491 |
++ |
17492 |
++ /* May happen if no subflow is in an appropriate state */ |
17493 |
++ if (!sk) |
17494 |
++ return; |
17495 |
++ |
17496 |
++ /* We are in infinite mode - just send a reset */ |
17497 |
++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) { |
17498 |
++ sk->sk_err = ECONNRESET; |
17499 |
++ if (tcp_need_reset(sk->sk_state)) |
17500 |
++ tcp_send_active_reset(sk, priority); |
17501 |
++ mptcp_sub_force_close(sk); |
17502 |
++ return; |
17503 |
++ } |
17504 |
++ |
17505 |
++ |
17506 |
++ tcp_sk(sk)->send_mp_fclose = 1; |
17507 |
++ /* Reset all other subflows */ |
17508 |
++ |
17509 |
++ /* tcp_done must be handled with bh disabled */ |
17510 |
++ if (!in_serving_softirq()) |
17511 |
++ local_bh_disable(); |
17512 |
++ |
17513 |
++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { |
17514 |
++ if (tcp_sk(sk_it)->send_mp_fclose) |
17515 |
++ continue; |
17516 |
++ |
17517 |
++ sk_it->sk_err = ECONNRESET; |
17518 |
++ if (tcp_need_reset(sk_it->sk_state)) |
17519 |
++ tcp_send_active_reset(sk_it, GFP_ATOMIC); |
17520 |
++ mptcp_sub_force_close(sk_it); |
17521 |
++ } |
17522 |
++ |
17523 |
++ if (!in_serving_softirq()) |
17524 |
++ local_bh_enable(); |
17525 |
++ |
17526 |
++ tcp_send_ack(sk); |
17527 |
++ inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto); |
17528 |
++ |
17529 |
++ meta_tp->send_mp_fclose = 1; |
17530 |
++} |
17531 |
++ |
17532 |
++static void mptcp_ack_retransmit_timer(struct sock *sk) |
17533 |
++{ |
17534 |
++ struct sk_buff *skb; |
17535 |
++ struct tcp_sock *tp = tcp_sk(sk); |
17536 |
++ struct inet_connection_sock *icsk = inet_csk(sk); |
17537 |
++ |
17538 |
++ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) |
17539 |
++ goto out; /* Routing failure or similar */ |
17540 |
++ |
17541 |
++ if (!tp->retrans_stamp) |
17542 |
++ tp->retrans_stamp = tcp_time_stamp ? : 1; |
17543 |
++ |
17544 |
++ if (tcp_write_timeout(sk)) { |
17545 |
++ tp->mptcp->pre_established = 0; |
17546 |
++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); |
17547 |
++ tp->ops->send_active_reset(sk, GFP_ATOMIC); |
17548 |
++ goto out; |
17549 |
++ } |
17550 |
++ |
17551 |
++ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); |
17552 |
++ if (skb == NULL) { |
17553 |
++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, |
17554 |
++ jiffies + icsk->icsk_rto); |
17555 |
++ return; |
17556 |
++ } |
17557 |
++ |
17558 |
++ /* Reserve space for headers and prepare control bits */ |
17559 |
++ skb_reserve(skb, MAX_TCP_HEADER); |
17560 |
++ tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK); |
17561 |
++ |
17562 |
++ TCP_SKB_CB(skb)->when = tcp_time_stamp; |
17563 |
++ if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) { |
17564 |
++ /* Retransmission failed because of local congestion, |
17565 |
++ * do not backoff. |
17566 |
++ */ |
17567 |
++ if (!icsk->icsk_retransmits) |
17568 |
++ icsk->icsk_retransmits = 1; |
17569 |
++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, |
17570 |
++ jiffies + icsk->icsk_rto); |
17571 |
++ return; |
17572 |
++ } |
17573 |
++ |
17574 |
++ |
17575 |
++ icsk->icsk_retransmits++; |
17576 |
++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); |
17577 |
++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, |
17578 |
++ jiffies + icsk->icsk_rto); |
17579 |
++ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) |
17580 |
++ __sk_dst_reset(sk); |
17581 |
++ |
17582 |
++out:; |
17583 |
++} |
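The tail of mptcp_ack_retransmit_timer applies the usual binary exponential backoff: the retransmission timeout doubles on every attempt and is clamped at TCP_RTO_MAX. A toy illustration of how the retry interval grows, assuming HZ = 1000 and a 200 ms starting RTO (both values are illustrative, not taken from the patch):

#include <stdio.h>

#define HZ          1000
#define TCP_RTO_MAX (120 * HZ)	/* 120 s, as in Linux */

int main(void)
{
	unsigned int rto = HZ / 5;	/* assumed 200 ms initial RTO */
	int retry;

	for (retry = 0; retry < 12; retry++) {
		printf("retry %2d: next timer in %u ms\n", retry, rto);
		rto = (rto << 1) < TCP_RTO_MAX ? rto << 1 : TCP_RTO_MAX;
	}
	return 0;
}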
17584 |
++ |
17585 |
++void mptcp_ack_handler(unsigned long data) |
17586 |
++{ |
17587 |
++ struct sock *sk = (struct sock *)data; |
17588 |
++ struct sock *meta_sk = mptcp_meta_sk(sk); |
17589 |
++ |
17590 |
++ bh_lock_sock(meta_sk); |
17591 |
++ if (sock_owned_by_user(meta_sk)) { |
17592 |
++ /* Try again later */ |
17593 |
++ sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer, |
17594 |
++ jiffies + (HZ / 20)); |
17595 |
++ goto out_unlock; |
17596 |
++ } |
17597 |
++ |
17598 |
++ if (sk->sk_state == TCP_CLOSE) |
17599 |
++ goto out_unlock; |
17600 |
++ if (!tcp_sk(sk)->mptcp->pre_established) |
17601 |
++ goto out_unlock; |
17602 |
++ |
17603 |
++ mptcp_ack_retransmit_timer(sk); |
17604 |
++ |
17605 |
++ sk_mem_reclaim(sk); |
17606 |
++ |
17607 |
++out_unlock: |
17608 |
++ bh_unlock_sock(meta_sk); |
17609 |
++ sock_put(sk); |
17610 |
++} |
17611 |
++ |
17612 |
++/* Similar to tcp_retransmit_skb |
17613 |
++ * |
17614 |
++ * The diff is that we handle the retransmission-stats (retrans_stamp) at the |
17615 |
++ * meta-level. |
17616 |
++ */ |
17617 |
++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb) |
17618 |
++{ |
17619 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
17620 |
++ struct sock *subsk; |
17621 |
++ unsigned int limit, mss_now; |
17622 |
++ int err = -1; |
17623 |
++ |
17624 |
++ /* Do not send more than we queued. 1/4 is reserved for possible |
17625 |
++ * copying overhead: fragmentation, tunneling, mangling etc. |
17626 |
++ * |
17627 |
++ * This is a meta-retransmission thus we check on the meta-socket. |
17628 |
++ */ |
17629 |
++ if (atomic_read(&meta_sk->sk_wmem_alloc) > |
17630 |
++ min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) { |
17631 |
++ return -EAGAIN; |
17632 |
++ } |
17633 |
++ |
17634 |
++ /* We need to make sure that the retransmitted segment can be sent on a |
17635 |
++ * subflow right now. If it is too big, it needs to be fragmented. |
17636 |
++ */ |
17637 |
++ subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, false); |
17638 |
++ if (!subsk) { |
17639 |
++ /* We want to increase icsk_retransmits, thus return 0, so that |
17640 |
++ * mptcp_retransmit_timer enters the desired branch. |
17641 |
++ */ |
17642 |
++ err = 0; |
17643 |
++ goto failed; |
17644 |
++ } |
17645 |
++ mss_now = tcp_current_mss(subsk); |
17646 |
++ |
17647 |
++ /* If the segment was cloned (e.g. a meta retransmission), the header |
17648 |
++ * must be expanded/copied so that there is no corruption of TSO |
17649 |
++ * information. |
17650 |
++ */ |
17651 |
++ if (skb_unclone(skb, GFP_ATOMIC)) { |
17652 |
++ err = -ENOMEM; |
17653 |
++ goto failed; |
17654 |
++ } |
17655 |
++ |
17656 |
++ /* Must have been set by mptcp_write_xmit before */ |
17657 |
++ BUG_ON(!tcp_skb_pcount(skb)); |
17658 |
++ |
17659 |
++ limit = mss_now; |
17660 |
++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in |
17661 |
++ * tcp_write_xmit. Otherwise split-point would return 0. |
17662 |
++ */ |
17663 |
++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp)) |
17664 |
++ limit = tcp_mss_split_point(meta_sk, skb, mss_now, |
17665 |
++ UINT_MAX / mss_now, |
17666 |
++ TCP_NAGLE_OFF); |
17667 |
++ |
17668 |
++ if (skb->len > limit && |
17669 |
++ unlikely(mptcp_fragment(meta_sk, skb, limit, |
17670 |
++ GFP_ATOMIC, 0))) |
17671 |
++ goto failed; |
17672 |
++ |
17673 |
++ if (!mptcp_skb_entail(subsk, skb, -1)) |
17674 |
++ goto failed; |
17675 |
++ TCP_SKB_CB(skb)->when = tcp_time_stamp; |
17676 |
++ |
17677 |
++ /* Update global TCP statistics. */ |
17678 |
++ TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS); |
17679 |
++ |
17680 |
++ /* Diff to tcp_retransmit_skb */ |
17681 |
++ |
17682 |
++ /* Save stamp of the first retransmit. */ |
17683 |
++ if (!meta_tp->retrans_stamp) |
17684 |
++ meta_tp->retrans_stamp = TCP_SKB_CB(skb)->when; |
17685 |
++ |
17686 |
++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH); |
17687 |
++ |
17688 |
++ return 0; |
17689 |
++ |
17690 |
++failed: |
17691 |
++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPRETRANSFAIL); |
17692 |
++ return err; |
17693 |
++} |
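The early bail-out at the top of mptcp_retransmit_skb enforces a simple retransmit budget: return -EAGAIN once the allocated write memory exceeds min(queued + queued/4, sndbuf), keeping a quarter of the queued bytes in reserve for fragmentation, tunneling and mangling overhead. The same check as a standalone sketch:

#include <stdio.h>

/* Mirror of the budget test above (values in main are illustrative). */
static int retransmit_allowed(unsigned int wmem_alloc,
			      unsigned int wmem_queued,
			      unsigned int sndbuf)
{
	unsigned int budget = wmem_queued + (wmem_queued >> 2);

	if (budget > sndbuf)
		budget = sndbuf;
	return wmem_alloc <= budget;
}

int main(void)
{
	printf("%d\n", retransmit_allowed(120000, 100000, 200000)); /* 1 */
	printf("%d\n", retransmit_allowed(130000, 100000, 200000)); /* 0 */
	return 0;
}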
17694 |
++ |
17695 |
++/* Similar to tcp_retransmit_timer |
17696 |
++ * |
17697 |
++ * The diff is that we have to handle retransmissions of the FAST_CLOSE-message |
17698 |
++ * and that we don't have an srtt estimation at the meta-level. |
17699 |
++ */ |
17700 |
++void mptcp_retransmit_timer(struct sock *meta_sk) |
17701 |
++{ |
17702 |
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
17703 |
++ struct mptcp_cb *mpcb = meta_tp->mpcb; |
17704 |
++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); |
17705 |
++ int err; |
17706 |
++ |
17707 |
++ /* In fallback, retransmission is handled at the subflow-level */ |
17708 |
++ if (!meta_tp->packets_out || mpcb->infinite_mapping_snd || |
17709 |
++ mpcb->send_infinite_mapping) |
17710 |
++ return; |
17711 |
++ |
17712 |
++ WARN_ON(tcp_write_queue_empty(meta_sk)); |
17713 |
++ |
17714 |
++ if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) && |
17715 |
++ !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { |
17716 |
++ /* Receiver dastardly shrinks window. Our retransmits |
17717 |
++ * become zero probes, but we should not timeout this |
17718 |
++ * connection. If the socket is an orphan, time it out, |
17719 |
++ * we cannot allow such beasts to hang infinitely. |
17720 |
++ */ |
17721 |
++ struct inet_sock *meta_inet = inet_sk(meta_sk); |
17722 |
++ if (meta_sk->sk_family == AF_INET) { |
17723 |
++ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", |
17724 |
++ &meta_inet->inet_daddr, |
17725 |
++ ntohs(meta_inet->inet_dport), |
17726 |
++ meta_inet->inet_num, meta_tp->snd_una, |
17727 |
++ meta_tp->snd_nxt); |
17728 |
++ } |
17729 |
++#if IS_ENABLED(CONFIG_IPV6) |
17730 |
++ else if (meta_sk->sk_family == AF_INET6) { |
17731 |
++ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", |
17732 |
++ &meta_sk->sk_v6_daddr, |
17733 |
++ ntohs(meta_inet->inet_dport), |
17734 |
++ meta_inet->inet_num, meta_tp->snd_una, |
17735 |
++ meta_tp->snd_nxt); |
17736 |
++ } |
17737 |
++#endif |
17738 |
++ if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) { |
17739 |
++ tcp_write_err(meta_sk); |
17740 |
++ return; |
17741 |
++ } |
17742 |
++ |
17743 |
++ mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk)); |
17744 |
++ goto out_reset_timer; |
17745 |
++ } |
17746 |
++ |
17747 |
++ if (tcp_write_timeout(meta_sk)) |
17748 |
++ return; |
17749 |
++ |
17750 |
++ if (meta_icsk->icsk_retransmits == 0) |
17751 |
++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS); |
17752 |
++ |
17753 |
++ meta_icsk->icsk_ca_state = TCP_CA_Loss; |
17754 |
++ |
17755 |
++ err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk)); |
17756 |
++ if (err > 0) { |
17757 |
++ /* Retransmission failed because of local congestion, |
17758 |
++ * do not backoff. |
17759 |
++ */ |
17760 |
++ if (!meta_icsk->icsk_retransmits) |
17761 |
++ meta_icsk->icsk_retransmits = 1; |
17762 |
++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, |
17763 |
++ min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), |
17764 |
++ TCP_RTO_MAX); |
17765 |
++ return; |
17766 |
++ } |
17767 |
++ |
17768 |
++ /* Increase the timeout each time we retransmit. Note that |
17769 |
++ * we do not increase the rtt estimate. rto is initialized |
17770 |
++ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests |
17771 |
++ * that doubling rto each time is the least we can get away with. |
17772 |
++ * In KA9Q, Karn uses this for the first few times, and then |
17773 |
++ * goes to quadratic. netBSD doubles, but only goes up to *64, |
17774 |
++ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is |
17775 |
++ * defined in the protocol as the maximum possible RTT. I guess |
17776 |
++ * we'll have to use something other than TCP to talk to the |
17777 |
++ * University of Mars. |
17778 |
++ * |
17779 |
++ * PAWS allows us longer timeouts and large windows, so once |
17780 |
++ * implemented ftp to mars will work nicely. We will have to fix |
17781 |
++ * the 120 second clamps though! |
17782 |
++ */ |
17783 |
++ meta_icsk->icsk_backoff++; |
17784 |
++ meta_icsk->icsk_retransmits++; |
17785 |
++ |
17786 |
++out_reset_timer: |
17787 |
++ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is |
17788 |
++ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this |
17789 |
++ * might be increased if the stream oscillates between thin and thick, |
17790 |
++ * thus the old value might already be too high compared to the value |
17791 |
++ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without |
17792 |
++ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating |
17793 |
++ * exponential backoff behaviour to avoid continuing to hammer |
17794 |
++ * linear-timeout retransmissions into a black hole |
17795 |
++ */ |
17796 |
++ if (meta_sk->sk_state == TCP_ESTABLISHED && |
17797 |
++ (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && |
17798 |
++ tcp_stream_is_thin(meta_tp) && |
17799 |
++ meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { |
17800 |
++ meta_icsk->icsk_backoff = 0; |
17801 |
++ /* We cannot do the same as in tcp_write_timer because the |
17802 |
++ * srtt is not set here. |
17803 |
++ */ |
17804 |
++ mptcp_set_rto(meta_sk); |
17805 |
++ } else { |
17806 |
++ /* Use normal (exponential) backoff */ |
17807 |
++ meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX); |
17808 |
++ } |
17809 |
++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX); |
17810 |
++ |
17811 |
++ return; |
17812 |
++} |
17813 |
++ |
17814 |
++/* Modify values to an mptcp-level for the initial window of new subflows */ |
17815 |
++void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, |
17816 |
++ __u32 *window_clamp, int wscale_ok, |
17817 |
++ __u8 *rcv_wscale, __u32 init_rcv_wnd, |
17818 |
++ const struct sock *sk) |
17819 |
++{ |
17820 |
++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; |
17821 |
++ |
17822 |
++ *window_clamp = mpcb->orig_window_clamp; |
17823 |
++ __space = tcp_win_from_space(mpcb->orig_sk_rcvbuf); |
17824 |
++ |
17825 |
++ tcp_select_initial_window(__space, mss, rcv_wnd, window_clamp, |
17826 |
++ wscale_ok, rcv_wscale, init_rcv_wnd, sk); |
17827 |
++} |
17828 |
++ |
17829 |
++static inline u64 mptcp_calc_rate(const struct sock *meta_sk, unsigned int mss, |
17830 |
++ unsigned int (*mss_cb)(struct sock *sk)) |
17831 |
++{ |
17832 |
++ struct sock *sk; |
17833 |
++ u64 rate = 0; |
17834 |
++ |
17835 |
++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { |
17836 |
++ struct tcp_sock *tp = tcp_sk(sk); |
17837 |
++ int this_mss; |
17838 |
++ u64 this_rate; |
17839 |
++ |
17840 |
++ if (!mptcp_sk_can_send(sk)) |
17841 |
++ continue; |
17842 |
++ |
17843 |
++ /* Do not consider subflows without a RTT estimation yet |
17844 |
++ * otherwise this_rate >>> rate. |
17845 |
++ */ |
17846 |
++ if (unlikely(!tp->srtt_us)) |
17847 |
++ continue; |
17848 |
++ |
17849 |
++ this_mss = mss_cb(sk); |
17850 |
++ |
17851 |
++ /* If this_mss is smaller than mss, it means that a segment will |
17852 |
++ * be splitted in two (or more) when pushed on this subflow. If |
17853 |
++ * you consider that mss = 1428 and this_mss = 1420 then two |
17854 |
++ * segments will be generated: a 1420-byte and 8-byte segment. |
17855 |
++ * The latter will introduce a large overhead as for a single |
17856 |
++ * data segment 2 slots will be used in the congestion window. |
17857 |
++ * This roughly halves the potential throughput of this |
17858 |
++ * subflow. Indeed, only 1428 bytes will be sent while 2840 could have |
17859 |
++ * been sent if mss == 1420, a reduction by a factor of 2840 / 1428. |
17860 |
++ * |
17861 |
++ * The following algorithm takes into account this overhead |
17862 |
++ * when computing the potential throughput that MPTCP can |
17863 |
++ * achieve when generating mss-byte segments. |
17864 |
++ * |
17865 |
++ * The formula is the following: |
17866 |
++ * \sum_{\forall sub} ratio * \frac{mss * cwnd_sub}{rtt_sub} |
17867 |
++ * Where ratio is computed as follows: |
17868 |
++ * \frac{mss}{\ceil{mss / mss_sub} * mss_sub} |
17869 |
++ * |
17870 |
++ * ratio gives the reduction factor of the theoretical |
17871 |
++ * throughput a subflow can achieve if MPTCP uses a specific |
17872 |
++ * MSS value. |
17873 |
++ */ |
17874 |
++ this_rate = div64_u64((u64)mss * mss * (USEC_PER_SEC << 3) * |
17875 |
++ max(tp->snd_cwnd, tp->packets_out), |
17876 |
++ (u64)tp->srtt_us * |
17877 |
++ DIV_ROUND_UP(mss, this_mss) * this_mss); |
17878 |
++ rate += this_rate; |
17879 |
++ } |
17880 |
++ |
17881 |
++ return rate; |
17882 |
++} |
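In conventional notation, the rate estimate computed above is

    \mathrm{rate}(mss) = \sum_{\mathrm{sub}} \underbrace{\frac{mss}{\lceil mss / mss_{\mathrm{sub}} \rceil \cdot mss_{\mathrm{sub}}}}_{\text{ratio}} \cdot \frac{mss \cdot cwnd_{\mathrm{sub}}}{rtt_{\mathrm{sub}}}

where the code substitutes max(snd_cwnd, packets_out) for the window and multiplies by USEC_PER_SEC << 3 to undo the left-by-3 scaling of srtt_us; the absolute unit does not matter since rates are only compared against each other. For the example in the comment, mss = 1428 against mss_sub = 1420 gives ratio = 1428 / (2 * 1420), roughly 0.5, i.e. the ~2x throughput reduction described above.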
17883 |
++ |
17884 |
++static unsigned int __mptcp_current_mss(const struct sock *meta_sk, |
17885 |
++ unsigned int (*mss_cb)(struct sock *sk)) |
17886 |
++{ |
17887 |
++ unsigned int mss = 0; |
17888 |
++ u64 rate = 0; |
17889 |
++ struct sock *sk; |
17890 |
++ |
17891 |
++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { |
17892 |
++ int this_mss; |
17893 |
++ u64 this_rate; |
17894 |
++ |
17895 |
++ if (!mptcp_sk_can_send(sk)) |
17896 |
++ continue; |
17897 |
++ |
17898 |
++ this_mss = mss_cb(sk); |
17899 |
++ |
17900 |
++ /* Same mss values will produce the same throughput. */ |
17901 |
++ if (this_mss == mss) |
17902 |
++ continue; |
17903 |
++ |
17904 |
++ /* See whether using this mss value can theoretically improve |
17905 |
++ * the performance. |
17906 |
++ */ |
17907 |
++ this_rate = mptcp_calc_rate(meta_sk, this_mss, mss_cb); |
17908 |
++ if (this_rate >= rate) { |
17909 |
++ mss = this_mss; |
17910 |
++ rate = this_rate; |
17911 |
++ } |
17912 |
++ } |
17913 |
++ |
17914 |
++ return mss; |
17915 |
++} |
17916 |
++ |
17917 |
++unsigned int mptcp_current_mss(struct sock *meta_sk) |
17918 |
++{ |
17919 |
++ unsigned int mss = __mptcp_current_mss(meta_sk, tcp_current_mss); |
17920 |
++ |
17921 |
++ /* If no subflow is available, we take a default-mss from the |
17922 |
++ * meta-socket. |
17923 |
++ */ |
17924 |
++ return !mss ? tcp_current_mss(meta_sk) : mss; |
17925 |
++} |
17926 |
++ |
17927 |
++static unsigned int mptcp_select_size_mss(struct sock *sk) |
17928 |
++{ |
17929 |
++ return tcp_sk(sk)->mss_cache; |
17930 |
++} |
17931 |
++ |
17932 |
++int mptcp_select_size(const struct sock *meta_sk, bool sg) |
17933 |
++{ |
17934 |
++ unsigned int mss = __mptcp_current_mss(meta_sk, mptcp_select_size_mss); |
17935 |
++ |
17936 |
++ if (sg) { |
17937 |
++ if (mptcp_sk_can_gso(meta_sk)) { |
17938 |
++ mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); |
17939 |
++ } else { |
17940 |
++ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); |
17941 |
++ |
17942 |
++ if (mss >= pgbreak && |
17943 |
++ mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) |
17944 |
++ mss = pgbreak; |
17945 |
++ } |
17946 |
++ } |
17947 |
++ |
17948 |
++ return !mss ? tcp_sk(meta_sk)->mss_cache : mss; |
17949 |
++} |
17950 |
++ |
17951 |
++int mptcp_check_snd_buf(const struct tcp_sock *tp) |
17952 |
++{ |
17953 |
++ const struct sock *sk; |
17954 |
++ u32 rtt_max = tp->srtt_us; |
17955 |
++ u64 bw_est; |
17956 |
++ |
17957 |
++ if (!tp->srtt_us) |
17958 |
++ return tp->reordering + 1; |
17959 |
++ |
17960 |
++ mptcp_for_each_sk(tp->mpcb, sk) { |
17961 |
++ if (!mptcp_sk_can_send(sk)) |
17962 |
++ continue; |
17963 |
++ |
17964 |
++ if (rtt_max < tcp_sk(sk)->srtt_us) |
17965 |
++ rtt_max = tcp_sk(sk)->srtt_us; |
17966 |
++ } |
17967 |
++ |
17968 |
++ bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16, |
17969 |
++ (u64)tp->srtt_us); |
17970 |
++ |
17971 |
++ return max_t(unsigned int, (u32)(bw_est >> 16), |
17972 |
++ tp->reordering + 1); |
17973 |
++} |
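The bandwidth estimate above works in 16.16 fixed point: this subflow's cwnd is scaled by rtt_max / srtt (how much more data fits in flight while the slowest subflow completes one round trip), shifted up by 16 bits before the division and back down afterwards, and never allowed below reordering + 1 packets. A user-space sketch with illustrative numbers:

#include <stdio.h>
#include <stdint.h>

/* Q16.16 version of the computation above. */
static unsigned int snd_buf_packets(uint32_t snd_cwnd, uint32_t srtt_us,
				    uint32_t rtt_max_us, uint32_t reordering)
{
	uint64_t bw_est = (((uint64_t)snd_cwnd * rtt_max_us) << 16) / srtt_us;
	uint32_t pkts = (uint32_t)(bw_est >> 16);

	return pkts > reordering + 1 ? pkts : reordering + 1;
}

int main(void)
{
	/* assume cwnd 10 on a 20 ms subflow, slowest subflow at 80 ms */
	printf("%u packets\n", snd_buf_packets(10, 20000, 80000, 3));
	return 0;
}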
17974 |
++ |
17975 |
++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now, |
17976 |
++ int large_allowed) |
17977 |
++{ |
17978 |
++ struct sock *sk; |
17979 |
++ u32 xmit_size_goal = 0; |
17980 |
++ |
17981 |
++ if (large_allowed && mptcp_sk_can_gso(meta_sk)) { |
17982 |
++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { |
17983 |
++ int this_size_goal; |
17984 |
++ |
17985 |
++ if (!mptcp_sk_can_send(sk)) |
17986 |
++ continue; |
17987 |
++ |
17988 |
++ this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1); |
17989 |
++ if (this_size_goal > xmit_size_goal) |
17990 |
++ xmit_size_goal = this_size_goal; |
17991 |
++ } |
17992 |
++ } |
17993 |
++ |
17994 |
++ return max(xmit_size_goal, mss_now); |
17995 |
++} |
17996 |
++ |
17997 |
++/* Similar to tcp_trim_head - but we correctly copy the DSS-option */ |
17998 |
++int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) |
17999 |
++{ |
18000 |
++ if (skb_cloned(skb)) { |
18001 |
++ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) |
18002 |
++ return -ENOMEM; |
18003 |
++ } |
18004 |
++ |
18005 |
++ __pskb_trim_head(skb, len); |
18006 |
++ |
18007 |
++ TCP_SKB_CB(skb)->seq += len; |
18008 |
++ skb->ip_summed = CHECKSUM_PARTIAL; |
18009 |
++ |
18010 |
++ skb->truesize -= len; |
18011 |
++ sk->sk_wmem_queued -= len; |
18012 |
++ sk_mem_uncharge(sk, len); |
18013 |
++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK); |
18014 |
++ |
18015 |
++ /* Any change of skb->len requires recalculation of tso factor. */ |
18016 |
++ if (tcp_skb_pcount(skb) > 1) |
18017 |
++ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); |
18018 |
++ |
18019 |
++ return 0; |
18020 |
++} |
18021 |
+diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c |
18022 |
+new file mode 100644 |
18023 |
+index 000000000000..9542f950729f |
18024 |
+--- /dev/null |
18025 |
++++ b/net/mptcp/mptcp_pm.c |
18026 |
+@@ -0,0 +1,169 @@ |
18027 |
++/* |
18028 |
++ * MPTCP implementation - MPTCP-subflow-management |
18029 |
++ * |
18030 |
++ * Initial Design & Implementation: |
18031 |
++ * Sébastien Barré <sebastien.barre@×××××××××.be> |
18032 |
++ * |
18033 |
++ * Current Maintainer & Author: |
18034 |
++ * Christoph Paasch <christoph.paasch@×××××××××.be> |
18035 |
++ * |
18036 |
++ * Additional authors: |
18037 |
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi> |
18038 |
++ * Gregory Detal <gregory.detal@×××××××××.be> |
18039 |
++ * Fabien Duchêne <fabien.duchene@×××××××××.be> |
18040 |
++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de> |
18041 |
++ * Lavkesh Lahngir <lavkesh51@×××××.com> |
18042 |
++ * Andreas Ripke <ripke@××××××.eu> |
18043 |
++ * Vlad Dogaru <vlad.dogaru@×××××.com> |
18044 |
++ * Octavian Purdila <octavian.purdila@×××××.com> |
18045 |
++ * John Ronan <jronan@××××.org> |
18046 |
++ * Catalin Nicutar <catalin.nicutar@×××××.com> |
18047 |
++ * Brandon Heller <brandonh@××××××××.edu> |
18048 |
++ * |
18049 |
++ * |
18050 |
++ * This program is free software; you can redistribute it and/or |
18051 |
++ * modify it under the terms of the GNU General Public License |
18052 |
++ * as published by the Free Software Foundation; either version |
18053 |
++ * 2 of the License, or (at your option) any later version. |
18054 |
++ */ |
18055 |
++ |
18056 |
++ |
18057 |
++#include <linux/module.h> |
18058 |
++#include <net/mptcp.h> |
18059 |
++ |
18060 |
++static DEFINE_SPINLOCK(mptcp_pm_list_lock); |
18061 |
++static LIST_HEAD(mptcp_pm_list); |
18062 |
++ |
18063 |
++static int mptcp_default_id(sa_family_t family, union inet_addr *addr, |
18064 |
++ struct net *net, bool *low_prio) |
18065 |
++{ |
18066 |
++ return 0; |
18067 |
++} |
18068 |
++ |
18069 |
++struct mptcp_pm_ops mptcp_pm_default = { |
18070 |
++ .get_local_id = mptcp_default_id, /* We do not care */ |
18071 |
++ .name = "default", |
18072 |
++ .owner = THIS_MODULE, |
18073 |
++}; |
18074 |
++ |
18075 |
++static struct mptcp_pm_ops *mptcp_pm_find(const char *name) |
18076 |
++{ |
18077 |
++ struct mptcp_pm_ops *e; |
18078 |
++ |
18079 |
++ list_for_each_entry_rcu(e, &mptcp_pm_list, list) { |
18080 |
++ if (strcmp(e->name, name) == 0) |
18081 |
++ return e; |
18082 |
++ } |
18083 |
++ |
18084 |
++ return NULL; |
18085 |
++} |
18086 |
++ |
18087 |
++int mptcp_register_path_manager(struct mptcp_pm_ops *pm) |
18088 |
++{ |
18089 |
++ int ret = 0; |
18090 |
++ |
18091 |
++ if (!pm->get_local_id) |
18092 |
++ return -EINVAL; |
18093 |
++ |
18094 |
++ spin_lock(&mptcp_pm_list_lock); |
18095 |
++ if (mptcp_pm_find(pm->name)) { |
18096 |
++ pr_notice("%s already registered\n", pm->name); |
18097 |
++ ret = -EEXIST; |
18098 |
++ } else { |
18099 |
++ list_add_tail_rcu(&pm->list, &mptcp_pm_list); |
18100 |
++ pr_info("%s registered\n", pm->name); |
18101 |
++ } |
18102 |
++ spin_unlock(&mptcp_pm_list_lock); |
18103 |
++ |
18104 |
++ return ret; |
18105 |
++} |
18106 |
++EXPORT_SYMBOL_GPL(mptcp_register_path_manager); |
18107 |
++ |
18108 |
++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm) |
18109 |
++{ |
18110 |
++ spin_lock(&mptcp_pm_list_lock); |
18111 |
++ list_del_rcu(&pm->list); |
18112 |
++ spin_unlock(&mptcp_pm_list_lock); |
18113 |
++} |
18114 |
++EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager); |
18115 |
++ |
18116 |
++void mptcp_get_default_path_manager(char *name) |
18117 |
++{ |
18118 |
++ struct mptcp_pm_ops *pm; |
18119 |
++ |
18120 |
++ BUG_ON(list_empty(&mptcp_pm_list)); |
18121 |
++ |
18122 |
++ rcu_read_lock(); |
18123 |
++ pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list); |
18124 |
++ strncpy(name, pm->name, MPTCP_PM_NAME_MAX); |
18125 |
++ rcu_read_unlock(); |
18126 |
++} |
18127 |
++ |
18128 |
++int mptcp_set_default_path_manager(const char *name) |
18129 |
++{ |
18130 |
++ struct mptcp_pm_ops *pm; |
18131 |
++ int ret = -ENOENT; |
18132 |
++ |
18133 |
++ spin_lock(&mptcp_pm_list_lock); |
18134 |
++ pm = mptcp_pm_find(name); |
18135 |
++#ifdef CONFIG_MODULES |
18136 |
++ if (!pm && capable(CAP_NET_ADMIN)) { |
18137 |
++ spin_unlock(&mptcp_pm_list_lock); |
18138 |
++ |
18139 |
++ request_module("mptcp_%s", name); |
18140 |
++ spin_lock(&mptcp_pm_list_lock); |
18141 |
++ pm = mptcp_pm_find(name); |
18142 |
++ } |
18143 |
++#endif |
18144 |
++ |
18145 |
++ if (pm) { |
18146 |
++ list_move(&pm->list, &mptcp_pm_list); |
18147 |
++ ret = 0; |
18148 |
++ } else { |
18149 |
++ pr_info("%s is not available\n", name); |
18150 |
++ } |
18151 |
++ spin_unlock(&mptcp_pm_list_lock); |
18152 |
++ |
18153 |
++ return ret; |
18154 |
++} |
18155 |
++ |
18156 |
++void mptcp_init_path_manager(struct mptcp_cb *mpcb) |
18157 |
++{ |
18158 |
++ struct mptcp_pm_ops *pm; |
18159 |
++ |
18160 |
++ rcu_read_lock(); |
18161 |
++ list_for_each_entry_rcu(pm, &mptcp_pm_list, list) { |
18162 |
++ if (try_module_get(pm->owner)) { |
18163 |
++ mpcb->pm_ops = pm; |
18164 |
++ break; |
18165 |
++ } |
18166 |
++ } |
18167 |
++ rcu_read_unlock(); |
18168 |
++} |
18169 |
++ |
18170 |
++/* Manage refcounts on socket close. */ |
18171 |
++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb) |
18172 |
++{ |
18173 |
++ module_put(mpcb->pm_ops->owner); |
18174 |
++} |
18175 |
++ |
18176 |
++/* Fallback to the default path-manager. */ |
18177 |
++void mptcp_fallback_default(struct mptcp_cb *mpcb) |
18178 |
++{ |
18179 |
++ struct mptcp_pm_ops *pm; |
18180 |
++ |
18181 |
++ mptcp_cleanup_path_manager(mpcb); |
18182 |
++ pm = mptcp_pm_find("default"); |
18183 |
++ |
18184 |
++ /* Cannot fail - it's the default module */ |
18185 |
++ try_module_get(pm->owner); |
18186 |
++ mpcb->pm_ops = pm; |
18187 |
++} |
18188 |
++EXPORT_SYMBOL_GPL(mptcp_fallback_default); |
18189 |
++ |
18190 |
++/* Set default value from kernel configuration at bootup */ |
18191 |
++static int __init mptcp_path_manager_default(void) |
18192 |
++{ |
18193 |
++ return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM); |
18194 |
++} |
18195 |
++late_initcall(mptcp_path_manager_default); |
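With the registry above, a path manager is just an mptcp_pm_ops that is registered at module load. A hypothetical out-of-tree module, written against this patch's API only ("noop" is an invented name, and the callback mirrors mptcp_default_id):

#include <linux/module.h>
#include <net/mptcp.h>

/* Hypothetical path manager: never announces or requests addresses. */
static int noop_get_local_id(sa_family_t family, union inet_addr *addr,
			     struct net *net, bool *low_prio)
{
	return 0;
}

static struct mptcp_pm_ops noop_pm = {
	.get_local_id	= noop_get_local_id,
	.name		= "noop",
	.owner		= THIS_MODULE,
};

static int __init noop_pm_init(void)
{
	return mptcp_register_path_manager(&noop_pm);
}

static void __exit noop_pm_exit(void)
{
	mptcp_unregister_path_manager(&noop_pm);
}

module_init(noop_pm_init);
module_exit(noop_pm_exit);
MODULE_LICENSE("GPL");

Naming such a module mptcp_noop would also let the request_module("mptcp_%s", ...) call in mptcp_set_default_path_manager auto-load it.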
18196 |
+diff --git a/net/mptcp/mptcp_rr.c b/net/mptcp/mptcp_rr.c |
18197 |
+new file mode 100644 |
18198 |
+index 000000000000..93278f684069 |
18199 |
+--- /dev/null |
18200 |
++++ b/net/mptcp/mptcp_rr.c |
18201 |
+@@ -0,0 +1,301 @@ |
18202 |
++/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */ |
18203 |
++ |
18204 |
++#include <linux/module.h> |
18205 |
++#include <net/mptcp.h> |
18206 |
++ |
18207 |
++static unsigned char num_segments __read_mostly = 1; |
18208 |
++module_param(num_segments, byte, 0644); |
18209 |
++MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst"); |
18210 |
++ |
18211 |
++static bool cwnd_limited __read_mostly = 1; |
18212 |
++module_param(cwnd_limited, bool, 0644); |
18213 |
++MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows"); |
18214 |
++ |
18215 |
++struct rrsched_priv { |
18216 |
++ unsigned char quota; |
18217 |
++}; |
18218 |
++ |
18219 |
++static struct rrsched_priv *rrsched_get_priv(const struct tcp_sock *tp) |
18220 |
++{ |
18221 |
++ return (struct rrsched_priv *)&tp->mptcp->mptcp_sched[0]; |
18222 |
++} |
18223 |
++ |
18224 |
++/* Is the sub-socket sk available to send the skb? */ |
18225 |
++static bool mptcp_rr_is_available(const struct sock *sk, const struct sk_buff *skb, |
18226 |
++ bool zero_wnd_test, bool cwnd_test) |
18227 |
++{ |
18228 |
++ const struct tcp_sock *tp = tcp_sk(sk); |
18229 |
++ unsigned int space, in_flight; |
18230 |
++ |
18231 |
++ /* Set of states for which we are allowed to send data */ |
18232 |
++ if (!mptcp_sk_can_send(sk)) |
18233 |
++ return false; |
18234 |
++ |
18235 |
++ /* We do not send data on this subflow unless it is |
18236 |
++ * fully established, i.e. the 4th ack has been received. |
18237 |
++ */ |
18238 |
++ if (tp->mptcp->pre_established) |
18239 |
++ return false; |
18240 |
++ |
18241 |
++ if (tp->pf) |
18242 |
++ return false; |
18243 |
++ |
18244 |
++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { |
18245 |
++ /* If SACK is disabled, and we got a loss, TCP does not exit |
18246 |
++ * the loss-state until something above high_seq has been acked. |
18247 |
++ * (see tcp_try_undo_recovery) |
18248 |
++ * |
18249 |
++ * high_seq is the snd_nxt at the moment of the RTO. As soon |
18250 |
++ * as we have an RTO, we won't push data on the subflow. |
18251 |
++ * Thus, snd_una can never go beyond high_seq. |
18252 |
++ */ |
18253 |
++ if (!tcp_is_reno(tp)) |
18254 |
++ return false; |
18255 |
++ else if (tp->snd_una != tp->high_seq) |
18256 |
++ return false; |
18257 |
++ } |
18258 |
++ |
18259 |
++ if (!tp->mptcp->fully_established) { |
18260 |
++ /* Make sure that we send in-order data */ |
18261 |
++ if (skb && tp->mptcp->second_packet && |
18262 |
++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq) |
18263 |
++ return false; |
18264 |
++ } |
18265 |
++ |
18266 |
++ if (!cwnd_test) |
18267 |
++ goto zero_wnd_test; |
18268 |
++ |
18269 |
++ in_flight = tcp_packets_in_flight(tp); |
18270 |
++ /* Not even a single spot in the cwnd */ |
18271 |
++ if (in_flight >= tp->snd_cwnd) |
18272 |
++ return false; |
18273 |
++ |
18274 |
++ /* Now, check if what is queued in the subflow's send-queue |
18275 |
++ * already fills the cwnd. |
18276 |
++ */ |
18277 |
++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache; |
18278 |
++ |
18279 |
++ if (tp->write_seq - tp->snd_nxt > space) |
18280 |
++ return false; |
18281 |
++ |
18282 |
++zero_wnd_test: |
18283 |
++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp))) |
18284 |
++ return false; |
18285 |
++ |
18286 |
++ return true; |
18287 |
++} |
18288 |
++ |
18289 |
++/* Are we not allowed to reinject this skb on tp? */ |
18290 |
++static int mptcp_rr_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb) |
18291 |
++{ |
18292 |
++ /* If the skb has already been enqueued in this sk, try to find |
18293 |
++ * another one. |
18294 |
++ */ |
18295 |
++ return skb && |
18296 |
++ /* Has the skb already been enqueued into this subsocket? */ |
18297 |
++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask; |
18298 |
++} |
18299 |
++ |
18300 |
++/* We just look for any subflow that is available */ |
18301 |
++static struct sock *rr_get_available_subflow(struct sock *meta_sk, |
18302 |
++ struct sk_buff *skb, |
18303 |
++ bool zero_wnd_test) |
18304 |
++{ |
18305 |
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
18306 |
++ struct sock *sk, *bestsk = NULL, *backupsk = NULL; |
18307 |
++ |
18308 |
++ /* Answer data_fin on same subflow!!! */ |
18309 |
++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN && |
18310 |
++ skb && mptcp_is_data_fin(skb)) { |
18311 |
++ mptcp_for_each_sk(mpcb, sk) { |
18312 |
++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && |
18313 |
++ mptcp_rr_is_available(sk, skb, zero_wnd_test, true)) |
18314 |
++ return sk; |
18315 |
++ } |
18316 |
++ } |
18317 |
++ |
18318 |
++ /* First, find the best subflow */ |
18319 |
++ mptcp_for_each_sk(mpcb, sk) { |
18320 |
++ struct tcp_sock *tp = tcp_sk(sk); |
18321 |
++ |
18322 |
++ if (!mptcp_rr_is_available(sk, skb, zero_wnd_test, true)) |
18323 |
++ continue; |
18324 |
++ |
18325 |
++ if (mptcp_rr_dont_reinject_skb(tp, skb)) { |
18326 |
++ backupsk = sk; |
18327 |
++ continue; |
18328 |
++ } |
18329 |
++ |
18330 |
++ bestsk = sk; |
18331 |
++ } |
18332 |
++ |
18333 |
++ if (bestsk) { |
18334 |
++ sk = bestsk; |
18335 |
++ } else if (backupsk) { |
18336 |
++ /* It has been sent on all subflows once - let's give it a |
18337 |
++ * chance again by restarting its pathmask. |
18338 |
++ */ |
18339 |
++ if (skb) |
18340 |
++ TCP_SKB_CB(skb)->path_mask = 0; |
18341 |
++ sk = backupsk; |
18342 |
++ } |
18343 |
++ |
18344 |
++ return sk; |
18345 |
++} |
18346 |
++ |
18347 |
++/* Returns the next segment to be sent from the mptcp meta-queue. |
18348 |
++ * (chooses the reinject queue if any segment is waiting in it, otherwise, |
18349 |
++ * chooses the normal write queue). |
18350 |
++ * Sets *@reinject to 1 if the returned segment comes from the |
18351 |
++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, |
18352 |
++ * and sets it to -1 if it is a meta-level retransmission to optimize the |
18353 |
++ * receive-buffer. |
18354 |
++ */ |
18355 |
++static struct sk_buff *__mptcp_rr_next_segment(const struct sock *meta_sk, int *reinject) |
18356 |
++{ |
18357 |
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
18358 |
++ struct sk_buff *skb = NULL; |
18359 |
++ |
18360 |
++ *reinject = 0; |
18361 |
++ |
18362 |
++ /* If we are in fallback-mode, just take from the meta-send-queue */ |
18363 |
++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) |
18364 |
++ return tcp_send_head(meta_sk); |
18365 |
++ |
18366 |
++ skb = skb_peek(&mpcb->reinject_queue); |
18367 |
++ |
18368 |
++ if (skb) |
18369 |
++ *reinject = 1; |
18370 |
++ else |
18371 |
++ skb = tcp_send_head(meta_sk); |
18372 |
++ return skb; |
18373 |
++} |
18374 |
++ |
18375 |
++static struct sk_buff *mptcp_rr_next_segment(struct sock *meta_sk, |
18376 |
++ int *reinject, |
18377 |
++ struct sock **subsk, |
18378 |
++ unsigned int *limit) |
18379 |
++{ |
18380 |
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
18381 |
++ struct sock *sk_it, *choose_sk = NULL; |
18382 |
++ struct sk_buff *skb = __mptcp_rr_next_segment(meta_sk, reinject); |
18383 |
++ unsigned char split = num_segments; |
18384 |
++ unsigned char iter = 0, full_subs = 0; |
18385 |
++ |
18386 |
++ /* As we set it, we have to reset it as well. */ |
18387 |
++ *limit = 0; |
18388 |
++ |
18389 |
++ if (!skb) |
18390 |
++ return NULL; |
18391 |
++ |
18392 |
++ if (*reinject) { |
18393 |
++ *subsk = rr_get_available_subflow(meta_sk, skb, false); |
18394 |
++ if (!*subsk) |
18395 |
++ return NULL; |
18396 |
++ |
18397 |
++ return skb; |
18398 |
++ } |
18399 |
++ |
18400 |
++retry: |
18401 |
++ |
18402 |
++ /* First, we look for a subflow that is currently being used */ |
18403 |
++ mptcp_for_each_sk(mpcb, sk_it) { |
18404 |
++ struct tcp_sock *tp_it = tcp_sk(sk_it); |
18405 |
++ struct rrsched_priv *rsp = rrsched_get_priv(tp_it); |
18406 |
++ |
18407 |
++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited)) |
18408 |
++ continue; |
18409 |
++ |
18410 |
++ iter++; |
18411 |
++ |
18412 |
++ /* Is this subflow currently being used? */ |
18413 |
++ if (rsp->quota > 0 && rsp->quota < num_segments) { |
18414 |
++ split = num_segments - rsp->quota; |
18415 |
++ choose_sk = sk_it; |
18416 |
++ goto found; |
18417 |
++ } |
18418 |
++ |
18419 |
++ /* Or, it's totally unused */ |
18420 |
++ if (!rsp->quota) { |
18421 |
++ split = num_segments; |
18422 |
++ choose_sk = sk_it; |
18423 |
++ } |
18424 |
++ |
18425 |
++ /* Or, it must then be fully used */ |
18426 |
++ if (rsp->quota == num_segments) |
18427 |
++ full_subs++; |
18428 |
++ } |
18429 |
++ |
18430 |
++ /* All considered subflows have a full quota, and we considered at |
18431 |
++ * least one. |
18432 |
++ */ |
18433 |
++ if (iter && iter == full_subs) { |
18434 |
++ /* So, we restart this round by setting quota to 0 and retry |
18435 |
++ * to find a subflow. |
18436 |
++ */ |
18437 |
++ mptcp_for_each_sk(mpcb, sk_it) { |
18438 |
++ struct tcp_sock *tp_it = tcp_sk(sk_it); |
18439 |
++ struct rrsched_priv *rsp = rrsched_get_priv(tp_it); |
18440 |
++ |
18441 |
++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited)) |
18442 |
++ continue; |
18443 |
++ |
18444 |
++ rsp->quota = 0; |
18445 |
++ } |
18446 |
++ |
18447 |
++ goto retry; |
18448 |
++ } |
18449 |
++ |
18450 |
++found: |
18451 |
++ if (choose_sk) { |
18452 |
++ unsigned int mss_now; |
18453 |
++ struct tcp_sock *choose_tp = tcp_sk(choose_sk); |
18454 |
++ struct rrsched_priv *rsp = rrsched_get_priv(choose_tp); |
18455 |
++ |
18456 |
++ if (!mptcp_rr_is_available(choose_sk, skb, false, true)) |
18457 |
++ return NULL; |
18458 |
++ |
18459 |
++ *subsk = choose_sk; |
18460 |
++ mss_now = tcp_current_mss(*subsk); |
18461 |
++ *limit = split * mss_now; |
18462 |
++ |
18463 |
++ if (skb->len > mss_now) |
18464 |
++ rsp->quota += DIV_ROUND_UP(skb->len, mss_now); |
18465 |
++ else |
18466 |
++ rsp->quota++; |
18467 |
++ |
18468 |
++ return skb; |
18469 |
++ } |
18470 |
++ |
18471 |
++ return NULL; |
18472 |
++} |
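Stripped of the availability checks, the quota rotation above amounts to: keep a subflow mid-burst until it has sent num_segments segments, prefer an unused subflow otherwise, and reset all quotas once every candidate is exhausted. A user-space toy model of that rotation (three subflows, bursts of two, all subflows assumed always available):

#include <stdio.h>

#define NUM_SUBFLOWS 3
#define NUM_SEGMENTS 2	/* the num_segments module parameter */

int main(void)
{
	unsigned char quota[NUM_SUBFLOWS] = { 0 };
	int seg;

	for (seg = 0; seg < 12; seg++) {
		int pick = -1, full = 0, i;
again:
		for (i = 0; i < NUM_SUBFLOWS; i++) {
			if (quota[i] > 0 && quota[i] < NUM_SEGMENTS) {
				pick = i;	/* mid-burst: keep using it */
				break;
			}
			if (quota[i] == 0)
				pick = i;	/* fresh candidate */
			else
				full++;		/* quota exhausted */
		}
		if (pick < 0 && full == NUM_SUBFLOWS) {
			for (i = 0; i < NUM_SUBFLOWS; i++)
				quota[i] = 0;	/* everyone is full: new round */
			full = 0;
			goto again;
		}
		quota[pick]++;
		printf("segment %2d -> subflow %d\n", seg, pick);
	}
	return 0;
}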
18473 |
++ |
18474 |
++static struct mptcp_sched_ops mptcp_sched_rr = { |
18475 |
++ .get_subflow = rr_get_available_subflow, |
18476 |
++ .next_segment = mptcp_rr_next_segment, |
18477 |
++ .name = "roundrobin", |
18478 |
++ .owner = THIS_MODULE, |
18479 |
++}; |
18480 |
++ |
18481 |
++static int __init rr_register(void) |
18482 |
++{ |
18483 |
++ BUILD_BUG_ON(sizeof(struct rrsched_priv) > MPTCP_SCHED_SIZE); |
18484 |
++ |
18485 |
++ if (mptcp_register_scheduler(&mptcp_sched_rr)) |
18486 |
++ return -1; |
18487 |
++ |
18488 |
++ return 0; |
18489 |
++} |
18490 |
++ |
18491 |
++static void rr_unregister(void) |
18492 |
++{ |
18493 |
++ mptcp_unregister_scheduler(&mptcp_sched_rr); |
18494 |
++} |
18495 |
++ |
18496 |
++module_init(rr_register); |
18497 |
++module_exit(rr_unregister); |
18498 |
++ |
18499 |
++MODULE_AUTHOR("Christoph Paasch"); |
18500 |
++MODULE_LICENSE("GPL"); |
18501 |
++MODULE_DESCRIPTION("ROUNDROBIN MPTCP"); |
18502 |
++MODULE_VERSION("0.89"); |
18503 |
+diff --git a/net/mptcp/mptcp_sched.c b/net/mptcp/mptcp_sched.c |
18504 |
+new file mode 100644 |
18505 |
+index 000000000000..6c7ff4eceac1 |
18506 |
+--- /dev/null |
18507 |
++++ b/net/mptcp/mptcp_sched.c |
18508 |
+@@ -0,0 +1,493 @@ |
18509 |
++/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */ |
18510 |
++ |
18511 |
++#include <linux/module.h> |
18512 |
++#include <net/mptcp.h> |
18513 |
++ |
18514 |
++static DEFINE_SPINLOCK(mptcp_sched_list_lock); |
18515 |
++static LIST_HEAD(mptcp_sched_list); |
18516 |
++ |
18517 |
++struct defsched_priv { |
18518 |
++ u32 last_rbuf_opti; |
18519 |
++}; |
18520 |
++ |
18521 |
++static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp) |
18522 |
++{ |
18523 |
++ return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0]; |
18524 |
++} |
18525 |
++ |
18526 |
++/* Is the sub-socket sk available to send the skb? */ |
18527 |
++static bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb, |
18528 |
++ bool zero_wnd_test) |
18529 |
++{ |
18530 |
++ const struct tcp_sock *tp = tcp_sk(sk); |
18531 |
++ unsigned int mss_now, space, in_flight; |
18532 |
++ |
18533 |
++ /* Set of states for which we are allowed to send data */ |
18534 |
++ if (!mptcp_sk_can_send(sk)) |
18535 |
++ return false; |
18536 |
++ |
18537 |
++ /* We do not send data on this subflow unless it is |
18538 |
++ * fully established, i.e. the 4th ack has been received. |
18539 |
++ */ |
18540 |
++ if (tp->mptcp->pre_established) |
18541 |
++ return false; |
18542 |
++ |
18543 |
++ if (tp->pf) |
18544 |
++ return false; |
18545 |
++ |
18546 |
++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { |
18547 |
++ /* If SACK is disabled, and we got a loss, TCP does not exit |
18548 |
++ * the loss-state until something above high_seq has been acked. |
18549 |
++ * (see tcp_try_undo_recovery) |
18550 |
++ * |
18551 |
++ * high_seq is the snd_nxt at the moment of the RTO. As soon |
18552 |
++ * as we have an RTO, we won't push data on the subflow. |
18553 |
++ * Thus, snd_una can never go beyond high_seq. |
18554 |
++ */ |
18555 |
++ if (!tcp_is_reno(tp)) |
18556 |
++ return false; |
18557 |
++ else if (tp->snd_una != tp->high_seq) |
18558 |
++ return false; |
18559 |
++ } |
18560 |
++ |
18561 |
++ if (!tp->mptcp->fully_established) { |
18562 |
++ /* Make sure that we send in-order data */ |
18563 |
++ if (skb && tp->mptcp->second_packet && |
18564 |
++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq) |
18565 |
++ return false; |
18566 |
++ } |
18567 |
++ |
18568 |
++ /* If TSQ is already throttling us, do not send on this subflow. When |
18569 |
++ * TSQ gets cleared the subflow becomes eligible again. |
18570 |
++ */ |
18571 |
++ if (test_bit(TSQ_THROTTLED, &tp->tsq_flags)) |
18572 |
++ return false; |
18573 |
++ |
18574 |
++ in_flight = tcp_packets_in_flight(tp); |
18575 |
++ /* Not even a single spot in the cwnd */ |
18576 |
++ if (in_flight >= tp->snd_cwnd) |
18577 |
++ return false; |
18578 |
++ |
18579 |
++ /* Now, check if what is queued in the subflow's send-queue |
18580 |
++ * already fills the cwnd. |
18581 |
++ */ |
18582 |
++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache; |
18583 |
++ |
18584 |
++ if (tp->write_seq - tp->snd_nxt > space) |
18585 |
++ return false; |
18586 |
++ |
18587 |
++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp))) |
18588 |
++ return false; |
18589 |
++ |
18590 |
++ mss_now = tcp_current_mss(sk); |
18591 |
++ |
18592 |
++ /* Don't send on this subflow if we bypass the allowed send-window at |
18593 |
++ * the per-subflow level. Similar to tcp_snd_wnd_test, but manually |
18594 |
++ * calculated end_seq (because here at this point end_seq is still at |
18595 |
++ * the meta-level). |
18596 |
++ */ |
18597 |
++ if (skb && !zero_wnd_test && |
18598 |
++ after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp))) |
18599 |
++ return false; |
18600 |
++ |
18601 |
++ return true; |
18602 |
++} |
18603 |
++ |
18604 |
++/* Are we not allowed to reinject this skb on tp? */ |
18605 |
++static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb) |
18606 |
++{ |
18607 |
++ /* If the skb has already been enqueued in this sk, try to find |
18608 |
++ * another one. |
18609 |
++ */ |
18610 |
++ return skb && |
18611 |
++ /* Has the skb already been enqueued into this subsocket? */ |
18612 |
++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask; |
18613 |
++} |
18614 |
++ |
18615 |
++/* This is the scheduler. This function decides on which flow to send |
18616 |
++ * a given MSS. If all subflows are found to be busy, NULL is returned |
18617 |
++ * The flow is selected based on the shortest RTT. |
18618 |
++ * If all paths have full cong windows, we simply return NULL. |
18619 |
++ * |
18620 |
++ * Additionally, this function is aware of the backup-subflows. |
18621 |
++ */ |
18622 |
++static struct sock *get_available_subflow(struct sock *meta_sk, |
18623 |
++ struct sk_buff *skb, |
18624 |
++ bool zero_wnd_test) |
18625 |
++{ |
18626 |
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
18627 |
++ struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL; |
18628 |
++ u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff; |
18629 |
++ int cnt_backups = 0; |
18630 |
++ |
18631 |
++ /* if there is only one subflow, bypass the scheduling function */ |
18632 |
++ if (mpcb->cnt_subflows == 1) { |
18633 |
++ bestsk = (struct sock *)mpcb->connection_list; |
18634 |
++ if (!mptcp_is_available(bestsk, skb, zero_wnd_test)) |
18635 |
++ bestsk = NULL; |
18636 |
++ return bestsk; |
18637 |
++ } |
18638 |
++ |
18639 |
++ /* Answer data_fin on same subflow!!! */ |
18640 |
++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN && |
18641 |
++ skb && mptcp_is_data_fin(skb)) { |
18642 |
++ mptcp_for_each_sk(mpcb, sk) { |
18643 |
++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && |
18644 |
++ mptcp_is_available(sk, skb, zero_wnd_test)) |
18645 |
++ return sk; |
18646 |
++ } |
18647 |
++ } |
18648 |
++ |
18649 |
++ /* First, find the best subflow */ |
18650 |
++ mptcp_for_each_sk(mpcb, sk) { |
18651 |
++ struct tcp_sock *tp = tcp_sk(sk); |
18652 |
++ |
18653 |
++ if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) |
18654 |
++ cnt_backups++; |
18655 |
++ |
18656 |
++ if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) && |
18657 |
++ tp->srtt_us < lowprio_min_time_to_peer) { |
18658 |
++ if (!mptcp_is_available(sk, skb, zero_wnd_test)) |
18659 |
++ continue; |
18660 |
++ |
18661 |
++ if (mptcp_dont_reinject_skb(tp, skb)) { |
18662 |
++ backupsk = sk; |
18663 |
++ continue; |
18664 |
++ } |
18665 |
++ |
18666 |
++ lowprio_min_time_to_peer = tp->srtt_us; |
18667 |
++ lowpriosk = sk; |
18668 |
++ } else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) && |
18669 |
++ tp->srtt_us < min_time_to_peer) { |
18670 |
++ if (!mptcp_is_available(sk, skb, zero_wnd_test)) |
18671 |
++ continue; |
18672 |
++ |
18673 |
++ if (mptcp_dont_reinject_skb(tp, skb)) { |
18674 |
++ backupsk = sk; |
18675 |
++ continue; |
18676 |
++ } |
18677 |
++ |
18678 |
++ min_time_to_peer = tp->srtt_us; |
18679 |
++ bestsk = sk; |
18680 |
++ } |
18681 |
++ } |
18682 |
++ |
18683 |
++ if (mpcb->cnt_established == cnt_backups && lowpriosk) { |
18684 |
++ sk = lowpriosk; |
18685 |
++ } else if (bestsk) { |
18686 |
++ sk = bestsk; |
18687 |
++ } else if (backupsk) { |
18688 |
++ /* It has been sent on all subflows once - let's give it a |
18689 |
++ * chance again by restarting its pathmask. |
18690 |
++ */ |
18691 |
++ if (skb) |
18692 |
++ TCP_SKB_CB(skb)->path_mask = 0; |
18693 |
++ sk = backupsk; |
18694 |
++ } |
18695 |
++ |
18696 |
++ return sk; |
18697 |
++} |
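Ignoring the reinjection and availability tests, the selection above is an argmin over smoothed RTT in two classes: the fastest regular subflow wins, and a backup subflow is considered only when every established subflow is a backup. A condensed sketch of just that choice:

#include <stdio.h>

struct subflow {
	unsigned int srtt_us;
	int low_prio;	/* backup flag */
};

/* Argmin over srtt, preferring regular subflows over backups; the
 * real code's availability and pathmask checks are omitted here.
 */
static int pick_subflow(const struct subflow *sf, int n)
{
	unsigned int best = 0xffffffffu, best_backup = 0xffffffffu;
	int pick = -1, pick_backup = -1, i;

	for (i = 0; i < n; i++) {
		if (sf[i].low_prio) {
			if (sf[i].srtt_us < best_backup) {
				best_backup = sf[i].srtt_us;
				pick_backup = i;
			}
		} else if (sf[i].srtt_us < best) {
			best = sf[i].srtt_us;
			pick = i;
		}
	}
	return pick >= 0 ? pick : pick_backup;
}

int main(void)
{
	struct subflow sf[] = {
		{ 40000, 0 }, { 15000, 0 }, { 8000, 1 },
	};

	/* Prints 1: the fastest regular path beats the faster backup. */
	printf("picked subflow %d\n", pick_subflow(sf, 3));
	return 0;
}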
18698 |
++ |
18699 |
++static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal) |
18700 |
++{ |
18701 |
++ struct sock *meta_sk; |
18702 |
++ const struct tcp_sock *tp = tcp_sk(sk); |
18703 |
++ struct tcp_sock *tp_it; |
18704 |
++ struct sk_buff *skb_head; |
18705 |
++ struct defsched_priv *dsp = defsched_get_priv(tp); |
18706 |
++ |
18707 |
++ if (tp->mpcb->cnt_subflows == 1) |
18708 |
++ return NULL; |
18709 |
++ |
18710 |
++ meta_sk = mptcp_meta_sk(sk); |
18711 |
++ skb_head = tcp_write_queue_head(meta_sk); |
18712 |
++ |
18713 |
++ if (!skb_head || skb_head == tcp_send_head(meta_sk)) |
18714 |
++ return NULL; |
18715 |
++ |
18716 |
++ /* If penalization is optional (coming from mptcp_next_segment()) and |
18717 |
++ * we are not send-buffer-limited, we do not penalize. The retransmission |
18718 |
++ * is just an optimization to fix the idle-time due to the delay before |
18719 |
++ * we wake up the application. |
18720 |
++ */ |
18721 |
++ if (!penal && sk_stream_memory_free(meta_sk)) |
18722 |
++ goto retrans; |
18723 |
++ |
18724 |
++ /* Only penalize again after an RTT has elapsed */ |
18725 |
++ if (tcp_time_stamp - dsp->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3)) |
18726 |
++ goto retrans; |
18727 |
++ |
18728 |
++ /* Half the cwnd of the slow flow */ |
18729 |
++ mptcp_for_each_tp(tp->mpcb, tp_it) { |
18730 |
++ if (tp_it != tp && |
18731 |
++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { |
18732 |
++ if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) { |
18733 |
++ tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U); |
18734 |
++ if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH) |
18735 |
++ tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U); |
18736 |
++ |
18737 |
++ dsp->last_rbuf_opti = tcp_time_stamp; |
18738 |
++ } |
18739 |
++ break; |
18740 |
++ } |
18741 |
++ } |
18742 |
++ |
18743 |
++retrans: |
18744 |
++ |
18745 |
++ /* Segment not yet injected into this path? Take it!!! */ |
18746 |
++ if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) { |
18747 |
++ bool do_retrans = false; |
18748 |
++ mptcp_for_each_tp(tp->mpcb, tp_it) { |
18749 |
++ if (tp_it != tp && |
18750 |
++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { |
18751 |
++ if (tp_it->snd_cwnd <= 4) { |
18752 |
++ do_retrans = true; |
18753 |
++ break; |
18754 |
++ } |
18755 |
++ |
18756 |
++ if (4 * tp->srtt_us >= tp_it->srtt_us) { |
18757 |
++ do_retrans = false; |
18758 |
++ break; |
18759 |
++ } else { |
18760 |
++ do_retrans = true; |
18761 |
++ } |
18762 |
++ } |
18763 |
++ } |
18764 |
++ |
18765 |
++ if (do_retrans && mptcp_is_available(sk, skb_head, false)) |
18766 |
++ return skb_head; |
18767 |
++ } |
18768 |
++ return NULL; |
18769 |
++} |
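The do_retrans loop above reduces to a per-subflow predicate: opportunistically reinject the queue head on this (fast) subflow when another subflow already carrying it is either cwnd-starved or at least four times slower. As a standalone sketch (the real code additionally rate-limits the cwnd penalization to once per RTT):

#include <stdio.h>

/* Reinject when the other carrying subflow has cwnd <= 4, or when its
 * smoothed RTT exceeds four times ours (values in main are illustrative).
 */
static int should_reinject(unsigned int my_srtt_us,
			   unsigned int other_srtt_us,
			   unsigned int other_cwnd)
{
	if (other_cwnd <= 4)
		return 1;
	return 4 * my_srtt_us < other_srtt_us;
}

int main(void)
{
	printf("%d\n", should_reinject(10000, 50000, 10));	/* 1: 5x slower */
	printf("%d\n", should_reinject(10000, 30000, 10));	/* 0: only 3x */
	printf("%d\n", should_reinject(10000, 12000, 3));	/* 1: tiny cwnd */
	return 0;
}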
18770 |
++ |
18771 |
++/* Returns the next segment to be sent from the mptcp meta-queue. |
18772 |
++ * (chooses the reinject queue if any segment is waiting in it, otherwise, |
18773 |
++ * chooses the normal write queue). |
18774 |
++ * Sets *@reinject to 1 if the returned segment comes from the |
18775 |
++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, |
18776 |
++ * and sets it to -1 if it is a meta-level retransmission to optimize the |
18777 |
++ * receive-buffer. |
18778 |
++ */ |
18779 |
++static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject) |
18780 |
++{ |
18781 |
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
18782 |
++ struct sk_buff *skb = NULL; |
18783 |
++ |
18784 |
++ *reinject = 0; |
18785 |
++ |
18786 |
++ /* If we are in fallback-mode, just take from the meta-send-queue */ |
18787 |
++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) |
18788 |
++ return tcp_send_head(meta_sk); |
18789 |
++ |
18790 |
++ skb = skb_peek(&mpcb->reinject_queue); |
18791 |
++ |
18792 |
++ if (skb) { |
18793 |
++ *reinject = 1; |
18794 |
++ } else { |
18795 |
++ skb = tcp_send_head(meta_sk); |
18796 |
++ |
18797 |
++ if (!skb && meta_sk->sk_socket && |
18798 |
++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) && |
18799 |
++ sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) { |
18800 |
++ struct sock *subsk = get_available_subflow(meta_sk, NULL, |
18801 |
++ false); |
18802 |
++ if (!subsk) |
18803 |
++ return NULL; |
18804 |
++ |
18805 |
++ skb = mptcp_rcv_buf_optimization(subsk, 0); |
18806 |
++ if (skb) |
18807 |
++ *reinject = -1; |
18808 |
++ } |
18809 |
++ } |
18810 |
++ return skb; |
18811 |
++} |
18812 |
++ |
18813 |
++static struct sk_buff *mptcp_next_segment(struct sock *meta_sk, |
18814 |
++ int *reinject, |
18815 |
++ struct sock **subsk, |
18816 |
++ unsigned int *limit) |
18817 |
++{ |
18818 |
++ struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject); |
18819 |
++ unsigned int mss_now; |
18820 |
++ struct tcp_sock *subtp; |
18821 |
++ u16 gso_max_segs; |
18822 |
++ u32 max_len, max_segs, window, needed; |
18823 |
++ |
18824 |
++ /* As we set it, we have to reset it as well. */ |
18825 |
++ *limit = 0; |
18826 |
++ |
18827 |
++ if (!skb) |
18828 |
++ return NULL; |
18829 |
++ |
18830 |
++ *subsk = get_available_subflow(meta_sk, skb, false); |
18831 |
++ if (!*subsk) |
18832 |
++ return NULL; |
18833 |
++ |
18834 |
++ subtp = tcp_sk(*subsk); |
18835 |
++ mss_now = tcp_current_mss(*subsk); |
18836 |
++ |
18837 |
++ if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) { |
18838 |
++ skb = mptcp_rcv_buf_optimization(*subsk, 1); |
18839 |
++ if (skb) |
18840 |
++ *reinject = -1; |
18841 |
++ else |
18842 |
++ return NULL; |
18843 |
++ } |
18844 |
++ |
18845 |
++ /* No splitting required, as we will only send one single segment */ |
18846 |
++ if (skb->len <= mss_now) |
18847 |
++ return skb; |
18848 |
++ |
18849 |
++ /* The following is similar to tcp_mss_split_point, but |
18850 |
++ * we do not care about Nagle, because we will |
18851 |
++ * use TCP_NAGLE_PUSH, which overrides this. |
18852 |
++ * |
18853 |
++ * So, we first limit according to the cwnd/gso-size and then according |
18854 |
++ * to the subflow's window. |
18855 |
++ */ |
18856 |
++ |
18857 |
++ gso_max_segs = (*subsk)->sk_gso_max_segs; |
18858 |
++ if (!gso_max_segs) /* No gso supported on the subflow's NIC */ |
18859 |
++ gso_max_segs = 1; |
18860 |
++ max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs); |
18861 |
++ if (!max_segs) |
18862 |
++ return NULL; |
18863 |
++ |
18864 |
++ max_len = mss_now * max_segs; |
18865 |
++ window = tcp_wnd_end(subtp) - subtp->write_seq; |
18866 |
++ |
18867 |
++ needed = min(skb->len, window); |
18868 |
++ if (max_len <= skb->len) |
18869 |
++ /* Take max_win, which is actually the cwnd/gso-size */ |
18870 |
++ *limit = max_len; |
18871 |
++ else |
18872 |
++ /* Or, take the window */ |
18873 |
++ *limit = needed; |
18874 |
++ |
18875 |
++ return skb; |
18876 |
++} |
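The split computation at the end of mptcp_next_segment caps the burst first by the cwnd/GSO budget (mss_now * max_segs) and only then by the subflow's remaining send window. Condensed into a pure function, with assumed example inputs:

#include <stdio.h>

static unsigned int min_u32(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

/* Mirror of the *limit computation above. */
static unsigned int split_limit(unsigned int skb_len, unsigned int mss_now,
				unsigned int max_segs, unsigned int window)
{
	unsigned int max_len = mss_now * max_segs;
	unsigned int needed = min_u32(skb_len, window);

	return max_len <= skb_len ? max_len : needed;
}

int main(void)
{
	/* assume a 10000-byte skb, mss 1400, 4-segment budget, 8 KB window */
	printf("limit = %u\n", split_limit(10000, 1400, 4, 8192));	/* 5600 */
	return 0;
}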
18877 |
++
++static void defsched_init(struct sock *sk)
++{
++	struct defsched_priv *dsp = defsched_get_priv(tcp_sk(sk));
++
++	dsp->last_rbuf_opti = tcp_time_stamp;
++}
++
++struct mptcp_sched_ops mptcp_sched_default = {
++	.get_subflow	= get_available_subflow,
++	.next_segment	= mptcp_next_segment,
++	.init		= defsched_init,
++	.name		= "default",
++	.owner		= THIS_MODULE,
++};
++
++static struct mptcp_sched_ops *mptcp_sched_find(const char *name)
++{
++	struct mptcp_sched_ops *e;
++
++	list_for_each_entry_rcu(e, &mptcp_sched_list, list) {
++		if (strcmp(e->name, name) == 0)
++			return e;
++	}
++
++	return NULL;
++}
++
++int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
++{
++	int ret = 0;
++
++	if (!sched->get_subflow || !sched->next_segment)
++		return -EINVAL;
++
++	spin_lock(&mptcp_sched_list_lock);
++	if (mptcp_sched_find(sched->name)) {
++		pr_notice("%s already registered\n", sched->name);
++		ret = -EEXIST;
++	} else {
++		list_add_tail_rcu(&sched->list, &mptcp_sched_list);
++		pr_info("%s registered\n", sched->name);
++	}
++	spin_unlock(&mptcp_sched_list_lock);
++
++	return ret;
++}
++EXPORT_SYMBOL_GPL(mptcp_register_scheduler);
++
++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
++{
++	spin_lock(&mptcp_sched_list_lock);
++	list_del_rcu(&sched->list);
++	spin_unlock(&mptcp_sched_list_lock);
++}
++EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler);
++
++void mptcp_get_default_scheduler(char *name)
++{
++	struct mptcp_sched_ops *sched;
++
++	BUG_ON(list_empty(&mptcp_sched_list));
++
++	rcu_read_lock();
++	sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list);
++	strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX);
++	rcu_read_unlock();
++}
++
++int mptcp_set_default_scheduler(const char *name)
++{
++	struct mptcp_sched_ops *sched;
++	int ret = -ENOENT;
++
++	spin_lock(&mptcp_sched_list_lock);
++	sched = mptcp_sched_find(name);
++#ifdef CONFIG_MODULES
++	if (!sched && capable(CAP_NET_ADMIN)) {
++		spin_unlock(&mptcp_sched_list_lock);
++
++		request_module("mptcp_%s", name);
++		spin_lock(&mptcp_sched_list_lock);
++		sched = mptcp_sched_find(name);
++	}
++#endif
++
++	if (sched) {
++		list_move(&sched->list, &mptcp_sched_list);
++		ret = 0;
++	} else {
++		pr_info("%s is not available\n", name);
++	}
++	spin_unlock(&mptcp_sched_list_lock);
++
++	return ret;
++}
++
++void mptcp_init_scheduler(struct mptcp_cb *mpcb)
++{
++	struct mptcp_sched_ops *sched;
++
++	rcu_read_lock();
++	list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
++		if (try_module_get(sched->owner)) {
++			mpcb->sched_ops = sched;
++			break;
++		}
++	}
++	rcu_read_unlock();
++}
++
++/* Manage refcounts on socket close. */
++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb)
++{
++	module_put(mpcb->sched_ops->owner);
++}
++
++/* Set default value from kernel configuration at bootup */
++static int __init mptcp_scheduler_default(void)
++{
++	BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE);
++
++	return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED);
++}
++late_initcall(mptcp_scheduler_default);
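
(Aside: the registration machinery above deliberately mirrors the congestion-control framework in tcp_cong.c, so alternative schedulers can be built as out-of-tree modules. The following is an illustrative sketch only, not part of the patch: the callback signatures are inferred from get_available_subflow() and mptcp_next_segment() above, the name "first_example" is made up, and reinject-queue handling is skipped.)

/* Sketch of a minimal out-of-tree MPTCP scheduler ("first_example" is a
 * hypothetical name): always pick the first sendable subflow and hand
 * back the meta-level send head without splitting it.
 */
#include <linux/module.h>
#include <net/mptcp.h>
#include <net/tcp.h>

static struct sock *first_get_subflow(struct sock *meta_sk,
				      struct sk_buff *skb,
				      bool zero_wnd_test)
{
	struct sock *sk;

	/* mptcp_for_each_sk()/mptcp_sk_can_send() as used by wVegas below */
	mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
		if (mptcp_sk_can_send(sk))
			return sk;
	}
	return NULL;
}

static struct sk_buff *first_next_segment(struct sock *meta_sk, int *reinject,
					  struct sock **subsk,
					  unsigned int *limit)
{
	struct sk_buff *skb = tcp_send_head(meta_sk);

	*limit = 0;		/* 0: do not ask the caller to split */
	*reinject = 0;		/* naive: ignore the reinject queue */
	if (!skb)
		return NULL;

	*subsk = first_get_subflow(meta_sk, skb, false);
	if (!*subsk)
		return NULL;

	return skb;
}

static struct mptcp_sched_ops mptcp_sched_first = {
	.get_subflow	= first_get_subflow,
	.next_segment	= first_next_segment,
	.name		= "first_example",	/* must fit MPTCP_SCHED_NAME_MAX */
	.owner		= THIS_MODULE,
};

static int __init first_sched_init(void)
{
	/* Rejected with -EINVAL if either mandatory callback were missing,
	 * and with -EEXIST if the name were already taken (see above).
	 */
	return mptcp_register_scheduler(&mptcp_sched_first);
}

static void __exit first_sched_exit(void)
{
	mptcp_unregister_scheduler(&mptcp_sched_first);
}

module_init(first_sched_init);
module_exit(first_sched_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Example MPTCP scheduler (sketch)");

Because mptcp_set_default_scheduler() falls back to request_module("mptcp_%s", name), building such a sketch as mptcp_first_example.ko would let it be loaded on demand when selected as the default scheduler.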
+diff --git a/net/mptcp/mptcp_wvegas.c b/net/mptcp/mptcp_wvegas.c
+new file mode 100644
+index 000000000000..29ca1d868d17
+--- /dev/null
++++ b/net/mptcp/mptcp_wvegas.c
+@@ -0,0 +1,268 @@
++/*
++ * MPTCP implementation - WEIGHTED VEGAS
++ *
++ * Algorithm design:
++ * Yu Cao <cyAnalyst@×××.com>
++ * Mingwei Xu <xmw@××××××××××××××××××××××.cn>
++ * Xiaoming Fu <fu@××××××××××××××××××.de>
++ *
++ * Implementation:
++ * Yu Cao <cyAnalyst@×××.com>
++ * Enhuan Dong <deh13@××××××××××××××××××.cn>
++ *
++ * Ported to the official MPTCP-kernel:
++ * Christoph Paasch <christoph.paasch@×××××××××.be>
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/skbuff.h>
++#include <net/tcp.h>
++#include <net/mptcp.h>
++#include <linux/module.h>
++#include <linux/tcp.h>
++
++static int initial_alpha = 2;
++static int total_alpha = 10;
++static int gamma = 1;
++
++module_param(initial_alpha, int, 0644);
++MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows");
++module_param(total_alpha, int, 0644);
++MODULE_PARM_DESC(total_alpha, "total alpha for all subflows");
++module_param(gamma, int, 0644);
++MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
++
++#define MPTCP_WVEGAS_SCALE 16
++
++/* wVegas variables */
++struct wvegas {
++	u32	beg_snd_nxt;	/* right edge during last RTT */
++	u8	doing_wvegas_now; /* if true, do wvegas for this RTT */
++
++	u16	cnt_rtt;	/* # of RTTs measured within last RTT */
++	u32	sampled_rtt;	/* cumulative RTTs measured within last RTT (in usec) */
++	u32	base_rtt;	/* the min of all wVegas RTT measurements seen (in usec) */
++
++	u64	instant_rate;	/* cwnd / srtt_us, unit: pkts/us * 2^16 */
++	u64	weight;		/* the ratio of subflow's rate to the total rate, * 2^16 */
++	int	alpha;		/* alpha for each subflow */
++
++	u32	queue_delay;	/* queue delay */
++};
++
++
++static inline u64 mptcp_wvegas_scale(u32 val, int scale)
++{
++	return (u64) val << scale;
++}
++
++static void wvegas_enable(const struct sock *sk)
++{
++	const struct tcp_sock *tp = tcp_sk(sk);
++	struct wvegas *wvegas = inet_csk_ca(sk);
++
++	wvegas->doing_wvegas_now = 1;
++
++	wvegas->beg_snd_nxt = tp->snd_nxt;
++
++	wvegas->cnt_rtt = 0;
++	wvegas->sampled_rtt = 0;
++
++	wvegas->instant_rate = 0;
++	wvegas->alpha = initial_alpha;
++	wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE);
++
++	wvegas->queue_delay = 0;
++}
++
++static inline void wvegas_disable(const struct sock *sk)
++{
++	struct wvegas *wvegas = inet_csk_ca(sk);
++
++	wvegas->doing_wvegas_now = 0;
++}
++
++static void mptcp_wvegas_init(struct sock *sk)
++{
++	struct wvegas *wvegas = inet_csk_ca(sk);
++
++	wvegas->base_rtt = 0x7fffffff;
++	wvegas_enable(sk);
++}
++
++static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us)
++{
++	return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us);
++}
++
++static void mptcp_wvegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
++{
++	struct wvegas *wvegas = inet_csk_ca(sk);
++	u32 vrtt;
++
++	if (rtt_us < 0)
++		return;
++
++	vrtt = rtt_us + 1;
++
++	if (vrtt < wvegas->base_rtt)
++		wvegas->base_rtt = vrtt;
++
++	wvegas->sampled_rtt += vrtt;
++	wvegas->cnt_rtt++;
++}
++
++static void mptcp_wvegas_state(struct sock *sk, u8 ca_state)
++{
++	if (ca_state == TCP_CA_Open)
++		wvegas_enable(sk);
++	else
++		wvegas_disable(sk);
++}
++
++static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
++{
++	if (event == CA_EVENT_CWND_RESTART) {
++		mptcp_wvegas_init(sk);
++	} else if (event == CA_EVENT_LOSS) {
++		struct wvegas *wvegas = inet_csk_ca(sk);
++		wvegas->instant_rate = 0;
++	}
++}
++
++static inline u32 mptcp_wvegas_ssthresh(const struct tcp_sock *tp)
++{
++	return min(tp->snd_ssthresh, tp->snd_cwnd - 1);
++}
++
++static u64 mptcp_wvegas_weight(const struct mptcp_cb *mpcb, const struct sock *sk)
++{
++	u64 total_rate = 0;
++	struct sock *sub_sk;
++	const struct wvegas *wvegas = inet_csk_ca(sk);
++
++	if (!mpcb)
++		return wvegas->weight;
++
++	mptcp_for_each_sk(mpcb, sub_sk) {
++		struct wvegas *sub_wvegas = inet_csk_ca(sub_sk);
++
++		/* sampled_rtt is initialized to 0 */
++		if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0))
++			total_rate += sub_wvegas->instant_rate;
++	}
++
++	if (total_rate && wvegas->instant_rate)
++		return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate);
++	else
++		return wvegas->weight;
++}
++
++static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
++{
++	struct tcp_sock *tp = tcp_sk(sk);
++	struct wvegas *wvegas = inet_csk_ca(sk);
++
++	if (!wvegas->doing_wvegas_now) {
++		tcp_reno_cong_avoid(sk, ack, acked);
++		return;
++	}
++
++	if (after(ack, wvegas->beg_snd_nxt)) {
++		wvegas->beg_snd_nxt = tp->snd_nxt;
++
++		if (wvegas->cnt_rtt <= 2) {
++			tcp_reno_cong_avoid(sk, ack, acked);
++		} else {
++			u32 rtt, diff, q_delay;
++			u64 target_cwnd;
++
++			rtt = wvegas->sampled_rtt / wvegas->cnt_rtt;
++			target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt);
++
++			diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt);
++
++			if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
++				tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
++				tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
++
++			} else if (tp->snd_cwnd <= tp->snd_ssthresh) {
++				tcp_slow_start(tp, acked);
++			} else {
++				if (diff >= wvegas->alpha) {
++					wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt);
++					wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk);
++					wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE));
++				}
++				if (diff > wvegas->alpha) {
++					tp->snd_cwnd--;
++					tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
++				} else if (diff < wvegas->alpha) {
++					tp->snd_cwnd++;
++				}
++
++				/* Try to drain link queue if needed */
++				q_delay = rtt - wvegas->base_rtt;
++				if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay))
++					wvegas->queue_delay = q_delay;
++
++				if (q_delay >= 2 * wvegas->queue_delay) {
++					u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt);
++					tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE;
++					wvegas->queue_delay = 0;
++				}
++			}
++
++			if (tp->snd_cwnd < 2)
++				tp->snd_cwnd = 2;
++			else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
++				tp->snd_cwnd = tp->snd_cwnd_clamp;
++
++			tp->snd_ssthresh = tcp_current_ssthresh(sk);
++		}
++
++		wvegas->cnt_rtt = 0;
++		wvegas->sampled_rtt = 0;
++	}
++	/* Use normal slow start */
++	else if (tp->snd_cwnd <= tp->snd_ssthresh)
++		tcp_slow_start(tp, acked);
++}
++
++
++static struct tcp_congestion_ops mptcp_wvegas __read_mostly = {
++	.init		= mptcp_wvegas_init,
++	.ssthresh	= tcp_reno_ssthresh,
++	.cong_avoid	= mptcp_wvegas_cong_avoid,
++	.pkts_acked	= mptcp_wvegas_pkts_acked,
++	.set_state	= mptcp_wvegas_state,
++	.cwnd_event	= mptcp_wvegas_cwnd_event,
++
++	.owner		= THIS_MODULE,
++	.name		= "wvegas",
++};
++
++static int __init mptcp_wvegas_register(void)
++{
++	BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE);
++	tcp_register_congestion_control(&mptcp_wvegas);
++	return 0;
++}
++
++static void __exit mptcp_wvegas_unregister(void)
++{
++	tcp_unregister_congestion_control(&mptcp_wvegas);
++}
++
++module_init(mptcp_wvegas_register);
++module_exit(mptcp_wvegas_unregister);
++
++MODULE_AUTHOR("Yu Cao, Enhuan Dong");
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("MPTCP wVegas");
++MODULE_VERSION("0.1");
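
(Aside, for intuition on the wVegas control law above: diff = snd_cwnd * (rtt - base_rtt) / rtt estimates how many of this subflow's segments are sitting in network queues, and each subflow is steered toward its share of total_alpha, weighted by its fraction of the aggregate rate. For example, with snd_cwnd = 20, base_rtt = 100 ms and rtt = 125 ms, diff = 20 * 25 / 125 = 4; a subflow carrying half the total rate gets alpha = 0.5 * 10 = 5 in congestion avoidance, so diff = 4 < alpha lets the window grow by one segment.)

Once registered, the algorithm is selected like any other congestion control. The following userspace sketch, not part of the patch, opts a single socket into it via the standard TCP_CONGESTION socket option; the string must match .name above, and the module must be loaded:

/* Sketch: select "wvegas" on one TCP socket via TCP_CONGESTION. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	static const char name[] = "wvegas";
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* Fails with ENOENT if the wvegas module is not registered. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
		       name, strlen(name)) < 0)
		perror("setsockopt(TCP_CONGESTION)");

	close(fd);
	return 0;
}

System-wide selection works through the usual net.ipv4.tcp_congestion_control sysctl, which tcp_register_congestion_control() ties the algorithm into.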