From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:3.16 commit in: /
Date: Sat, 27 Sep 2014 13:37:52
Message-Id: 1411825057.1b28da13cd7150f66fae58043d3de661105a513a.mpagano@gentoo
1 commit: 1b28da13cd7150f66fae58043d3de661105a513a
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Sat Sep 27 13:37:37 2014 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Sat Sep 27 13:37:37 2014 +0000
6 URL: http://sources.gentoo.org/gitweb/?p=proj/linux-patches.git;a=commit;h=1b28da13
7
8 Move mptcp patch to experimental
9
10 ---
11 0000_README | 9 +-
12 5010_multipath-tcp-v3.16-872d7f6c6f4e.patch | 19230 ++++++++++++++++++++++++++
13 2 files changed, 19235 insertions(+), 4 deletions(-)
14
15 diff --git a/0000_README b/0000_README
16 index d92e6b7..3cc9441 100644
17 --- a/0000_README
18 +++ b/0000_README
19 @@ -58,10 +58,6 @@ Patch: 2400_kcopy-patch-for-infiniband-driver.patch
20 From: Alexey Shvetsov <alexxy@g.o>
21 Desc: Zero copy for infiniband psm userspace driver
22
23 -Patch: 2500_multipath-tcp-v3.16-872d7f6c6f4e.patch
24 -From: http://multipath-tcp.org/
25 -Desc: Patch for simultaneous use of several IP-addresses/interfaces in TCP for better resource utilization, better throughput and smoother reaction to failures.
26 -
27 Patch: 2700_ThinkPad-30-brightness-control-fix.patch
28 From: Seth Forshee <seth.forshee@×××××××××.com>
29 Desc: ACPI: Disable Windows 8 compatibility for some Lenovo ThinkPads
30 @@ -101,3 +97,8 @@ Desc: BFQ v7r5 patch 2 for 3.16: BFQ Scheduler
31 Patch: 5003_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r5-for-3.16.0.patch
32 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
33 Desc: BFQ v7r5 patch 3 for 3.16: Early Queue Merge (EQM)
34 +
35 +Patch: 5010_multipath-tcp-v3.16-872d7f6c6f4e.patch
36 +From: http://multipath-tcp.org/
37 +Desc: Patch for simultaneous use of several IP-addresses/interfaces in TCP for better resource utilization, better throughput and smoother reaction to failures.
38 +
39
40 diff --git a/5010_multipath-tcp-v3.16-872d7f6c6f4e.patch b/5010_multipath-tcp-v3.16-872d7f6c6f4e.patch
41 new file mode 100644
42 index 0000000..3000da3
43 --- /dev/null
44 +++ b/5010_multipath-tcp-v3.16-872d7f6c6f4e.patch
45 @@ -0,0 +1,19230 @@
46 +diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
47 +index 768a0fb67dd6..5a46d91a8df9 100644
48 +--- a/drivers/infiniband/hw/cxgb4/cm.c
49 ++++ b/drivers/infiniband/hw/cxgb4/cm.c
50 +@@ -3432,7 +3432,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
51 + */
52 + memset(&tmp_opt, 0, sizeof(tmp_opt));
53 + tcp_clear_options(&tmp_opt);
54 +- tcp_parse_options(skb, &tmp_opt, 0, NULL);
55 ++ tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL);
56 +
57 + req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req));
58 + memset(req, 0, sizeof(*req));
59 +diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
60 +index 2faef339d8f2..d86c853ffaad 100644
61 +--- a/include/linux/ipv6.h
62 ++++ b/include/linux/ipv6.h
63 +@@ -256,16 +256,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
64 + return inet_sk(__sk)->pinet6;
65 + }
66 +
67 +-static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops)
68 +-{
69 +- struct request_sock *req = reqsk_alloc(ops);
70 +-
71 +- if (req)
72 +- inet_rsk(req)->pktopts = NULL;
73 +-
74 +- return req;
75 +-}
76 +-
77 + static inline struct raw6_sock *raw6_sk(const struct sock *sk)
78 + {
79 + return (struct raw6_sock *)sk;
80 +@@ -309,12 +299,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
81 + return NULL;
82 + }
83 +
84 +-static inline struct inet6_request_sock *
85 +- inet6_rsk(const struct request_sock *rsk)
86 +-{
87 +- return NULL;
88 +-}
89 +-
90 + static inline struct raw6_sock *raw6_sk(const struct sock *sk)
91 + {
92 + return NULL;
93 +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
94 +index ec89301ada41..99ea4b0e3693 100644
95 +--- a/include/linux/skbuff.h
96 ++++ b/include/linux/skbuff.h
97 +@@ -2784,8 +2784,10 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
98 + bool zero_okay,
99 + __sum16 check)
100 + {
101 +- if (skb_csum_unnecessary(skb) || (zero_okay && !check)) {
102 +- skb->csum_valid = 1;
103 ++ if (skb_csum_unnecessary(skb)) {
104 ++ return false;
105 ++ } else if (zero_okay && !check) {
106 ++ skb->ip_summed = CHECKSUM_UNNECESSARY;
107 + return false;
108 + }
109 +
110 +diff --git a/include/linux/tcp.h b/include/linux/tcp.h
111 +index a0513210798f..7bc2e078d6ca 100644
112 +--- a/include/linux/tcp.h
113 ++++ b/include/linux/tcp.h
114 +@@ -53,7 +53,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
115 + /* TCP Fast Open */
116 + #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */
117 + #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */
118 +-#define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */
119 ++#define TCP_FASTOPEN_COOKIE_SIZE 4 /* the size employed by this impl. */
120 +
121 + /* TCP Fast Open Cookie as stored in memory */
122 + struct tcp_fastopen_cookie {
123 +@@ -72,6 +72,51 @@ struct tcp_sack_block {
124 + u32 end_seq;
125 + };
126 +
127 ++struct tcp_out_options {
128 ++ u16 options; /* bit field of OPTION_* */
129 ++ u8 ws; /* window scale, 0 to disable */
130 ++ u8 num_sack_blocks;/* number of SACK blocks to include */
131 ++ u8 hash_size; /* bytes in hash_location */
132 ++ u16 mss; /* 0 to disable */
133 ++ __u8 *hash_location; /* temporary pointer, overloaded */
134 ++ __u32 tsval, tsecr; /* need to include OPTION_TS */
135 ++ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
136 ++#ifdef CONFIG_MPTCP
137 ++ u16 mptcp_options; /* bit field of MPTCP related OPTION_* */
138 ++ u8 dss_csum:1,
139 ++ add_addr_v4:1,
140 ++ add_addr_v6:1; /* dss-checksum required? */
141 ++
142 ++ union {
143 ++ struct {
144 ++ __u64 sender_key; /* sender's key for mptcp */
145 ++ __u64 receiver_key; /* receiver's key for mptcp */
146 ++ } mp_capable;
147 ++
148 ++ struct {
149 ++ __u64 sender_truncated_mac;
150 ++ __u32 sender_nonce;
151 ++ /* random number of the sender */
152 ++ __u32 token; /* token for mptcp */
153 ++ u8 low_prio:1;
154 ++ } mp_join_syns;
155 ++ };
156 ++
157 ++ struct {
158 ++ struct in_addr addr;
159 ++ u8 addr_id;
160 ++ } add_addr4;
161 ++
162 ++ struct {
163 ++ struct in6_addr addr;
164 ++ u8 addr_id;
165 ++ } add_addr6;
166 ++
167 ++ u16 remove_addrs; /* list of address id */
168 ++ u8 addr_id; /* address id (mp_join or add_address) */
169 ++#endif /* CONFIG_MPTCP */
170 ++};
171 ++
172 + /*These are used to set the sack_ok field in struct tcp_options_received */
173 + #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */
174 + #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/
175 +@@ -95,6 +140,9 @@ struct tcp_options_received {
176 + u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
177 + };
178 +
179 ++struct mptcp_cb;
180 ++struct mptcp_tcp_sock;
181 ++
182 + static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
183 + {
184 + rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
185 +@@ -111,10 +159,7 @@ struct tcp_request_sock_ops;
186 +
187 + struct tcp_request_sock {
188 + struct inet_request_sock req;
189 +-#ifdef CONFIG_TCP_MD5SIG
190 +- /* Only used by TCP MD5 Signature so far. */
191 + const struct tcp_request_sock_ops *af_specific;
192 +-#endif
193 + struct sock *listener; /* needed for TFO */
194 + u32 rcv_isn;
195 + u32 snt_isn;
196 +@@ -130,6 +175,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
197 + return (struct tcp_request_sock *)req;
198 + }
199 +
200 ++struct tcp_md5sig_key;
201 ++
202 + struct tcp_sock {
203 + /* inet_connection_sock has to be the first member of tcp_sock */
204 + struct inet_connection_sock inet_conn;
205 +@@ -326,6 +373,37 @@ struct tcp_sock {
206 + * socket. Used to retransmit SYNACKs etc.
207 + */
208 + struct request_sock *fastopen_rsk;
209 ++
210 ++ /* MPTCP/TCP-specific callbacks */
211 ++ const struct tcp_sock_ops *ops;
212 ++
213 ++ struct mptcp_cb *mpcb;
214 ++ struct sock *meta_sk;
215 ++ /* We keep these flags even if CONFIG_MPTCP is not checked, because
216 ++ * it allows checking MPTCP capability just by checking the mpc flag,
217 ++ * rather than adding ifdefs everywhere.
218 ++ */
219 ++ u16 mpc:1, /* Other end is multipath capable */
220 ++ inside_tk_table:1, /* Is the tcp_sock inside the token-table? */
221 ++ send_mp_fclose:1,
222 ++ request_mptcp:1, /* Did we send out an MP_CAPABLE?
223 ++ * (this speeds up mptcp_doit() in tcp_recvmsg)
224 ++ */
225 ++ mptcp_enabled:1, /* Is MPTCP enabled from the application ? */
226 ++ pf:1, /* Potentially Failed state: when this flag is set, we
227 ++ * stop using the subflow
228 ++ */
229 ++ mp_killed:1, /* Killed with a tcp_done in mptcp? */
230 ++ was_meta_sk:1, /* This was a meta sk (in case of reuse) */
231 ++ is_master_sk,
232 ++ close_it:1, /* Must close socket in mptcp_data_ready? */
233 ++ closing:1;
234 ++ struct mptcp_tcp_sock *mptcp;
235 ++#ifdef CONFIG_MPTCP
236 ++ struct hlist_nulls_node tk_table;
237 ++ u32 mptcp_loc_token;
238 ++ u64 mptcp_loc_key;
239 ++#endif /* CONFIG_MPTCP */
240 + };
241 +
242 + enum tsq_flags {
243 +@@ -337,6 +415,8 @@ enum tsq_flags {
244 + TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
245 + * tcp_v{4|6}_mtu_reduced()
246 + */
247 ++ MPTCP_PATH_MANAGER, /* MPTCP deferred creation of new subflows */
248 ++ MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */
249 + };
250 +
251 + static inline struct tcp_sock *tcp_sk(const struct sock *sk)
252 +@@ -355,6 +435,7 @@ struct tcp_timewait_sock {
253 + #ifdef CONFIG_TCP_MD5SIG
254 + struct tcp_md5sig_key *tw_md5_key;
255 + #endif
256 ++ struct mptcp_tw *mptcp_tw;
257 + };
258 +
259 + static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
260 +diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
261 +index 74af137304be..83f63033897a 100644
262 +--- a/include/net/inet6_connection_sock.h
263 ++++ b/include/net/inet6_connection_sock.h
264 +@@ -27,6 +27,8 @@ int inet6_csk_bind_conflict(const struct sock *sk,
265 +
266 + struct dst_entry *inet6_csk_route_req(struct sock *sk, struct flowi6 *fl6,
267 + const struct request_sock *req);
268 ++u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
269 ++ const u32 rnd, const u32 synq_hsize);
270 +
271 + struct request_sock *inet6_csk_search_req(const struct sock *sk,
272 + struct request_sock ***prevp,
273 +diff --git a/include/net/inet_common.h b/include/net/inet_common.h
274 +index fe7994c48b75..780f229f46a8 100644
275 +--- a/include/net/inet_common.h
276 ++++ b/include/net/inet_common.h
277 +@@ -1,6 +1,8 @@
278 + #ifndef _INET_COMMON_H
279 + #define _INET_COMMON_H
280 +
281 ++#include <net/sock.h>
282 ++
283 + extern const struct proto_ops inet_stream_ops;
284 + extern const struct proto_ops inet_dgram_ops;
285 +
286 +@@ -13,6 +15,8 @@ struct sock;
287 + struct sockaddr;
288 + struct socket;
289 +
290 ++int inet_create(struct net *net, struct socket *sock, int protocol, int kern);
291 ++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern);
292 + int inet_release(struct socket *sock);
293 + int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
294 + int addr_len, int flags);
295 +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
296 +index 7a4313887568..f62159e39839 100644
297 +--- a/include/net/inet_connection_sock.h
298 ++++ b/include/net/inet_connection_sock.h
299 +@@ -30,6 +30,7 @@
300 +
301 + struct inet_bind_bucket;
302 + struct tcp_congestion_ops;
303 ++struct tcp_options_received;
304 +
305 + /*
306 + * Pointers to address related TCP functions
307 +@@ -243,6 +244,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
308 +
309 + struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
310 +
311 ++u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
312 ++ const u32 synq_hsize);
313 ++
314 + struct request_sock *inet_csk_search_req(const struct sock *sk,
315 + struct request_sock ***prevp,
316 + const __be16 rport,
317 +diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
318 +index b1edf17bec01..6a32d8d6b85e 100644
319 +--- a/include/net/inet_sock.h
320 ++++ b/include/net/inet_sock.h
321 +@@ -86,10 +86,14 @@ struct inet_request_sock {
322 + wscale_ok : 1,
323 + ecn_ok : 1,
324 + acked : 1,
325 +- no_srccheck: 1;
326 ++ no_srccheck: 1,
327 ++ mptcp_rqsk : 1,
328 ++ saw_mpc : 1;
329 + kmemcheck_bitfield_end(flags);
330 +- struct ip_options_rcu *opt;
331 +- struct sk_buff *pktopts;
332 ++ union {
333 ++ struct ip_options_rcu *opt;
334 ++ struct sk_buff *pktopts;
335 ++ };
336 + u32 ir_mark;
337 + };
338 +
339 +diff --git a/include/net/mptcp.h b/include/net/mptcp.h
340 +new file mode 100644
341 +index 000000000000..712780fc39e4
342 +--- /dev/null
343 ++++ b/include/net/mptcp.h
344 +@@ -0,0 +1,1439 @@
345 ++/*
346 ++ * MPTCP implementation
347 ++ *
348 ++ * Initial Design & Implementation:
349 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
350 ++ *
351 ++ * Current Maintainer & Author:
352 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
353 ++ *
354 ++ * Additional authors:
355 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
356 ++ * Gregory Detal <gregory.detal@×××××××××.be>
357 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
358 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
359 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
360 ++ * Andreas Ripke <ripke@××××××.eu>
361 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
362 ++ * Octavian Purdila <octavian.purdila@×××××.com>
363 ++ * John Ronan <jronan@××××.org>
364 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
365 ++ * Brandon Heller <brandonh@××××××××.edu>
366 ++ *
367 ++ *
368 ++ * This program is free software; you can redistribute it and/or
369 ++ * modify it under the terms of the GNU General Public License
370 ++ * as published by the Free Software Foundation; either version
371 ++ * 2 of the License, or (at your option) any later version.
372 ++ */
373 ++
374 ++#ifndef _MPTCP_H
375 ++#define _MPTCP_H
376 ++
377 ++#include <linux/inetdevice.h>
378 ++#include <linux/ipv6.h>
379 ++#include <linux/list.h>
380 ++#include <linux/net.h>
381 ++#include <linux/netpoll.h>
382 ++#include <linux/skbuff.h>
383 ++#include <linux/socket.h>
384 ++#include <linux/tcp.h>
385 ++#include <linux/kernel.h>
386 ++
387 ++#include <asm/byteorder.h>
388 ++#include <asm/unaligned.h>
389 ++#include <crypto/hash.h>
390 ++#include <net/tcp.h>
391 ++
392 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
393 ++ #define ntohll(x) be64_to_cpu(x)
394 ++ #define htonll(x) cpu_to_be64(x)
395 ++#elif defined(__BIG_ENDIAN_BITFIELD)
396 ++ #define ntohll(x) (x)
397 ++ #define htonll(x) (x)
398 ++#endif
399 ++
400 ++struct mptcp_loc4 {
401 ++ u8 loc4_id;
402 ++ u8 low_prio:1;
403 ++ struct in_addr addr;
404 ++};
405 ++
406 ++struct mptcp_rem4 {
407 ++ u8 rem4_id;
408 ++ __be16 port;
409 ++ struct in_addr addr;
410 ++};
411 ++
412 ++struct mptcp_loc6 {
413 ++ u8 loc6_id;
414 ++ u8 low_prio:1;
415 ++ struct in6_addr addr;
416 ++};
417 ++
418 ++struct mptcp_rem6 {
419 ++ u8 rem6_id;
420 ++ __be16 port;
421 ++ struct in6_addr addr;
422 ++};
423 ++
424 ++struct mptcp_request_sock {
425 ++ struct tcp_request_sock req;
426 ++ /* hlist-nulls entry to the hash-table. Depending on whether this is a
427 ++ * a new MPTCP connection or an additional subflow, the request-socket
428 ++ * is either in the mptcp_reqsk_tk_htb or mptcp_reqsk_htb.
429 ++ */
430 ++ struct hlist_nulls_node hash_entry;
431 ++
432 ++ union {
433 ++ struct {
434 ++ /* Only on initial subflows */
435 ++ u64 mptcp_loc_key;
436 ++ u64 mptcp_rem_key;
437 ++ u32 mptcp_loc_token;
438 ++ };
439 ++
440 ++ struct {
441 ++ /* Only on additional subflows */
442 ++ struct mptcp_cb *mptcp_mpcb;
443 ++ u32 mptcp_rem_nonce;
444 ++ u32 mptcp_loc_nonce;
445 ++ u64 mptcp_hash_tmac;
446 ++ };
447 ++ };
448 ++
449 ++ u8 loc_id;
450 ++ u8 rem_id; /* Address-id in the MP_JOIN */
451 ++ u8 dss_csum:1,
452 ++ is_sub:1, /* Is this a new subflow? */
453 ++ low_prio:1, /* Interface set to low-prio? */
454 ++ rcv_low_prio:1;
455 ++};
456 ++
457 ++struct mptcp_options_received {
458 ++ u16 saw_mpc:1,
459 ++ dss_csum:1,
460 ++ drop_me:1,
461 ++
462 ++ is_mp_join:1,
463 ++ join_ack:1,
464 ++
465 ++ saw_low_prio:2, /* 0x1 - low-prio set for this subflow
466 ++ * 0x2 - low-prio set for another subflow
467 ++ */
468 ++ low_prio:1,
469 ++
470 ++ saw_add_addr:2, /* Saw at least one add_addr option:
471 ++ * 0x1: IPv4 - 0x2: IPv6
472 ++ */
473 ++ more_add_addr:1, /* Saw one more add-addr. */
474 ++
475 ++ saw_rem_addr:1, /* Saw at least one rem_addr option */
476 ++ more_rem_addr:1, /* Saw one more rem-addr. */
477 ++
478 ++ mp_fail:1,
479 ++ mp_fclose:1;
480 ++ u8 rem_id; /* Address-id in the MP_JOIN */
481 ++ u8 prio_addr_id; /* Address-id in the MP_PRIO */
482 ++
483 ++ const unsigned char *add_addr_ptr; /* Pointer to add-address option */
484 ++ const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */
485 ++
486 ++ u32 data_ack;
487 ++ u32 data_seq;
488 ++ u16 data_len;
489 ++
490 ++ u32 mptcp_rem_token;/* Remote token */
491 ++
492 ++ /* Key inside the option (from mp_capable or fast_close) */
493 ++ u64 mptcp_key;
494 ++
495 ++ u32 mptcp_recv_nonce;
496 ++ u64 mptcp_recv_tmac;
497 ++ u8 mptcp_recv_mac[20];
498 ++};
499 ++
500 ++struct mptcp_tcp_sock {
501 ++ struct tcp_sock *next; /* Next subflow socket */
502 ++ struct hlist_node cb_list;
503 ++ struct mptcp_options_received rx_opt;
504 ++
505 ++ /* Those three fields record the current mapping */
506 ++ u64 map_data_seq;
507 ++ u32 map_subseq;
508 ++ u16 map_data_len;
509 ++ u16 slave_sk:1,
510 ++ fully_established:1,
511 ++ establish_increased:1,
512 ++ second_packet:1,
513 ++ attached:1,
514 ++ send_mp_fail:1,
515 ++ include_mpc:1,
516 ++ mapping_present:1,
517 ++ map_data_fin:1,
518 ++ low_prio:1, /* use this socket as backup */
519 ++ rcv_low_prio:1, /* Peer sent low-prio option to us */
520 ++ send_mp_prio:1, /* Trigger to send mp_prio on this socket */
521 ++ pre_established:1; /* State between sending 3rd ACK and
522 ++ * receiving the fourth ack of new subflows.
523 ++ */
524 ++
525 ++ /* isn: needed to translate abs to relative subflow seqnums */
526 ++ u32 snt_isn;
527 ++ u32 rcv_isn;
528 ++ u8 path_index;
529 ++ u8 loc_id;
530 ++ u8 rem_id;
531 ++
532 ++#define MPTCP_SCHED_SIZE 4
533 ++ u8 mptcp_sched[MPTCP_SCHED_SIZE] __aligned(8);
534 ++
535 ++ struct sk_buff *shortcut_ofoqueue; /* Shortcut to the current modified
536 ++ * skb in the ofo-queue.
537 ++ */
538 ++
539 ++ int init_rcv_wnd;
540 ++ u32 infinite_cutoff_seq;
541 ++ struct delayed_work work;
542 ++ u32 mptcp_loc_nonce;
543 ++ struct tcp_sock *tp; /* Where is my daddy? */
544 ++ u32 last_end_data_seq;
545 ++
546 ++ /* MP_JOIN subflow: timer for retransmitting the 3rd ack */
547 ++ struct timer_list mptcp_ack_timer;
548 ++
549 ++ /* HMAC of the third ack */
550 ++ char sender_mac[20];
551 ++};
552 ++
553 ++struct mptcp_tw {
554 ++ struct list_head list;
555 ++ u64 loc_key;
556 ++ u64 rcv_nxt;
557 ++ struct mptcp_cb __rcu *mpcb;
558 ++ u8 meta_tw:1,
559 ++ in_list:1;
560 ++};
561 ++
562 ++#define MPTCP_PM_NAME_MAX 16
563 ++struct mptcp_pm_ops {
564 ++ struct list_head list;
565 ++
566 ++ /* Signal the creation of a new MPTCP-session. */
567 ++ void (*new_session)(const struct sock *meta_sk);
568 ++ void (*release_sock)(struct sock *meta_sk);
569 ++ void (*fully_established)(struct sock *meta_sk);
570 ++ void (*new_remote_address)(struct sock *meta_sk);
571 ++ int (*get_local_id)(sa_family_t family, union inet_addr *addr,
572 ++ struct net *net, bool *low_prio);
573 ++ void (*addr_signal)(struct sock *sk, unsigned *size,
574 ++ struct tcp_out_options *opts, struct sk_buff *skb);
575 ++ void (*add_raddr)(struct mptcp_cb *mpcb, const union inet_addr *addr,
576 ++ sa_family_t family, __be16 port, u8 id);
577 ++ void (*rem_raddr)(struct mptcp_cb *mpcb, u8 rem_id);
578 ++ void (*init_subsocket_v4)(struct sock *sk, struct in_addr addr);
579 ++ void (*init_subsocket_v6)(struct sock *sk, struct in6_addr addr);
580 ++
581 ++ char name[MPTCP_PM_NAME_MAX];
582 ++ struct module *owner;
583 ++};
584 ++
585 ++#define MPTCP_SCHED_NAME_MAX 16
586 ++struct mptcp_sched_ops {
587 ++ struct list_head list;
588 ++
589 ++ struct sock * (*get_subflow)(struct sock *meta_sk,
590 ++ struct sk_buff *skb,
591 ++ bool zero_wnd_test);
592 ++ struct sk_buff * (*next_segment)(struct sock *meta_sk,
593 ++ int *reinject,
594 ++ struct sock **subsk,
595 ++ unsigned int *limit);
596 ++ void (*init)(struct sock *sk);
597 ++
598 ++ char name[MPTCP_SCHED_NAME_MAX];
599 ++ struct module *owner;
600 ++};
601 ++
602 ++struct mptcp_cb {
603 ++ /* list of sockets in this multipath connection */
604 ++ struct tcp_sock *connection_list;
605 ++ /* list of sockets that need a call to release_cb */
606 ++ struct hlist_head callback_list;
607 ++
608 ++ /* High-order bits of 64-bit sequence numbers */
609 ++ u32 snd_high_order[2];
610 ++ u32 rcv_high_order[2];
611 ++
612 ++ u16 send_infinite_mapping:1,
613 ++ in_time_wait:1,
614 ++ list_rcvd:1, /* XXX TO REMOVE */
615 ++ addr_signal:1, /* Path-manager wants us to call addr_signal */
616 ++ dss_csum:1,
617 ++ server_side:1,
618 ++ infinite_mapping_rcv:1,
619 ++ infinite_mapping_snd:1,
620 ++ dfin_combined:1, /* Was the DFIN combined with subflow-fin? */
621 ++ passive_close:1,
622 ++ snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */
623 ++ rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */
624 ++
625 ++ /* socket count in this connection */
626 ++ u8 cnt_subflows;
627 ++ u8 cnt_established;
628 ++
629 ++ struct mptcp_sched_ops *sched_ops;
630 ++
631 ++ struct sk_buff_head reinject_queue;
632 ++ /* First cache-line boundary is here minus 8 bytes. But from the
633 ++ * reinject-queue only the next and prev pointers are regularly
634 ++ * accessed. Thus, the whole data-path is on a single cache-line.
635 ++ */
636 ++
637 ++ u64 csum_cutoff_seq;
638 ++
639 ++ /***** Start of fields, used for connection closure */
640 ++ spinlock_t tw_lock;
641 ++ unsigned char mptw_state;
642 ++ u8 dfin_path_index;
643 ++
644 ++ struct list_head tw_list;
645 ++
646 ++ /***** Start of fields, used for subflow establishment and closure */
647 ++ atomic_t mpcb_refcnt;
648 ++
649 ++ /* Mutex needed, because otherwise mptcp_close will complain that the
650 ++ * socket is owned by the user.
651 ++ * E.g., mptcp_sub_close_wq is taking the meta-lock.
652 ++ */
653 ++ struct mutex mpcb_mutex;
654 ++
655 ++ /***** Start of fields, used for subflow establishment */
656 ++ struct sock *meta_sk;
657 ++
658 ++ /* Master socket, also part of the connection_list, this
659 ++ * socket is the one that the application sees.
660 ++ */
661 ++ struct sock *master_sk;
662 ++
663 ++ __u64 mptcp_loc_key;
664 ++ __u64 mptcp_rem_key;
665 ++ __u32 mptcp_loc_token;
666 ++ __u32 mptcp_rem_token;
667 ++
668 ++#define MPTCP_PM_SIZE 608
669 ++ u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8);
670 ++ struct mptcp_pm_ops *pm_ops;
671 ++
672 ++ u32 path_index_bits;
673 ++ /* Next pi to pick up in case a new path becomes available */
674 ++ u8 next_path_index;
675 ++
676 ++ /* Original snd/rcvbuf of the initial subflow.
677 ++ * Used for the new subflows on the server-side to allow correct
678 ++ * autotuning
679 ++ */
680 ++ int orig_sk_rcvbuf;
681 ++ int orig_sk_sndbuf;
682 ++ u32 orig_window_clamp;
683 ++
684 ++ /* Timer for retransmitting SYN/ACK+MP_JOIN */
685 ++ struct timer_list synack_timer;
686 ++};
687 ++
688 ++#define MPTCP_SUB_CAPABLE 0
689 ++#define MPTCP_SUB_LEN_CAPABLE_SYN 12
690 ++#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12
691 ++#define MPTCP_SUB_LEN_CAPABLE_ACK 20
692 ++#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20
693 ++
694 ++#define MPTCP_SUB_JOIN 1
695 ++#define MPTCP_SUB_LEN_JOIN_SYN 12
696 ++#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12
697 ++#define MPTCP_SUB_LEN_JOIN_SYNACK 16
698 ++#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16
699 ++#define MPTCP_SUB_LEN_JOIN_ACK 24
700 ++#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24
701 ++
702 ++#define MPTCP_SUB_DSS 2
703 ++#define MPTCP_SUB_LEN_DSS 4
704 ++#define MPTCP_SUB_LEN_DSS_ALIGN 4
705 ++
706 ++/* Lengths for seq and ack are the ones without the generic MPTCP-option header,
707 ++ * as they are part of the DSS-option.
708 ++ * To get the total length, just add the different options together.
709 ++ */
710 ++#define MPTCP_SUB_LEN_SEQ 10
711 ++#define MPTCP_SUB_LEN_SEQ_CSUM 12
712 ++#define MPTCP_SUB_LEN_SEQ_ALIGN 12
713 ++
714 ++#define MPTCP_SUB_LEN_SEQ_64 14
715 ++#define MPTCP_SUB_LEN_SEQ_CSUM_64 16
716 ++#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16
717 ++
718 ++#define MPTCP_SUB_LEN_ACK 4
719 ++#define MPTCP_SUB_LEN_ACK_ALIGN 4
720 ++
721 ++#define MPTCP_SUB_LEN_ACK_64 8
722 ++#define MPTCP_SUB_LEN_ACK_64_ALIGN 8
723 ++
724 ++/* This is the "default" option-length we will send out most often.
725 ++ * MPTCP DSS-header
726 ++ * 32-bit data sequence number
727 ++ * 32-bit data ack
728 ++ *
729 ++ * It is necessary to calculate the effective MSS we will be using when
730 ++ * sending data.
731 ++ */
732 ++#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \
733 ++ MPTCP_SUB_LEN_SEQ_ALIGN + \
734 ++ MPTCP_SUB_LEN_ACK_ALIGN)
735 ++
736 ++#define MPTCP_SUB_ADD_ADDR 3
737 ++#define MPTCP_SUB_LEN_ADD_ADDR4 8
738 ++#define MPTCP_SUB_LEN_ADD_ADDR6 20
739 ++#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8
740 ++#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20
741 ++
742 ++#define MPTCP_SUB_REMOVE_ADDR 4
743 ++#define MPTCP_SUB_LEN_REMOVE_ADDR 4
744 ++
745 ++#define MPTCP_SUB_PRIO 5
746 ++#define MPTCP_SUB_LEN_PRIO 3
747 ++#define MPTCP_SUB_LEN_PRIO_ADDR 4
748 ++#define MPTCP_SUB_LEN_PRIO_ALIGN 4
749 ++
750 ++#define MPTCP_SUB_FAIL 6
751 ++#define MPTCP_SUB_LEN_FAIL 12
752 ++#define MPTCP_SUB_LEN_FAIL_ALIGN 12
753 ++
754 ++#define MPTCP_SUB_FCLOSE 7
755 ++#define MPTCP_SUB_LEN_FCLOSE 12
756 ++#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12
757 ++
758 ++
759 ++#define OPTION_MPTCP (1 << 5)
760 ++
761 ++#ifdef CONFIG_MPTCP
762 ++
763 ++/* Used for checking if the mptcp initialization has been successful */
764 ++extern bool mptcp_init_failed;
765 ++
766 ++/* MPTCP options */
767 ++#define OPTION_TYPE_SYN (1 << 0)
768 ++#define OPTION_TYPE_SYNACK (1 << 1)
769 ++#define OPTION_TYPE_ACK (1 << 2)
770 ++#define OPTION_MP_CAPABLE (1 << 3)
771 ++#define OPTION_DATA_ACK (1 << 4)
772 ++#define OPTION_ADD_ADDR (1 << 5)
773 ++#define OPTION_MP_JOIN (1 << 6)
774 ++#define OPTION_MP_FAIL (1 << 7)
775 ++#define OPTION_MP_FCLOSE (1 << 8)
776 ++#define OPTION_REMOVE_ADDR (1 << 9)
777 ++#define OPTION_MP_PRIO (1 << 10)
778 ++
779 ++/* MPTCP flags: both TX and RX */
780 ++#define MPTCPHDR_SEQ 0x01 /* DSS.M option is present */
781 ++#define MPTCPHDR_FIN 0x02 /* DSS.F option is present */
782 ++#define MPTCPHDR_SEQ64_INDEX 0x04 /* index of seq in mpcb->snd_high_order */
783 ++/* MPTCP flags: RX only */
784 ++#define MPTCPHDR_ACK 0x08
785 ++#define MPTCPHDR_SEQ64_SET 0x10 /* Did we received a 64-bit seq number? */
786 ++#define MPTCPHDR_SEQ64_OFO 0x20 /* Is it not in our circular array? */
787 ++#define MPTCPHDR_DSS_CSUM 0x40
788 ++#define MPTCPHDR_JOIN 0x80
789 ++/* MPTCP flags: TX only */
790 ++#define MPTCPHDR_INF 0x08
791 ++
792 ++struct mptcp_option {
793 ++ __u8 kind;
794 ++ __u8 len;
795 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
796 ++ __u8 ver:4,
797 ++ sub:4;
798 ++#elif defined(__BIG_ENDIAN_BITFIELD)
799 ++ __u8 sub:4,
800 ++ ver:4;
801 ++#else
802 ++#error "Adjust your <asm/byteorder.h> defines"
803 ++#endif
804 ++};
805 ++
806 ++struct mp_capable {
807 ++ __u8 kind;
808 ++ __u8 len;
809 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
810 ++ __u8 ver:4,
811 ++ sub:4;
812 ++ __u8 h:1,
813 ++ rsv:5,
814 ++ b:1,
815 ++ a:1;
816 ++#elif defined(__BIG_ENDIAN_BITFIELD)
817 ++ __u8 sub:4,
818 ++ ver:4;
819 ++ __u8 a:1,
820 ++ b:1,
821 ++ rsv:5,
822 ++ h:1;
823 ++#else
824 ++#error "Adjust your <asm/byteorder.h> defines"
825 ++#endif
826 ++ __u64 sender_key;
827 ++ __u64 receiver_key;
828 ++} __attribute__((__packed__));
829 ++
830 ++struct mp_join {
831 ++ __u8 kind;
832 ++ __u8 len;
833 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
834 ++ __u8 b:1,
835 ++ rsv:3,
836 ++ sub:4;
837 ++#elif defined(__BIG_ENDIAN_BITFIELD)
838 ++ __u8 sub:4,
839 ++ rsv:3,
840 ++ b:1;
841 ++#else
842 ++#error "Adjust your <asm/byteorder.h> defines"
843 ++#endif
844 ++ __u8 addr_id;
845 ++ union {
846 ++ struct {
847 ++ u32 token;
848 ++ u32 nonce;
849 ++ } syn;
850 ++ struct {
851 ++ __u64 mac;
852 ++ u32 nonce;
853 ++ } synack;
854 ++ struct {
855 ++ __u8 mac[20];
856 ++ } ack;
857 ++ } u;
858 ++} __attribute__((__packed__));
859 ++
860 ++struct mp_dss {
861 ++ __u8 kind;
862 ++ __u8 len;
863 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
864 ++ __u16 rsv1:4,
865 ++ sub:4,
866 ++ A:1,
867 ++ a:1,
868 ++ M:1,
869 ++ m:1,
870 ++ F:1,
871 ++ rsv2:3;
872 ++#elif defined(__BIG_ENDIAN_BITFIELD)
873 ++ __u16 sub:4,
874 ++ rsv1:4,
875 ++ rsv2:3,
876 ++ F:1,
877 ++ m:1,
878 ++ M:1,
879 ++ a:1,
880 ++ A:1;
881 ++#else
882 ++#error "Adjust your <asm/byteorder.h> defines"
883 ++#endif
884 ++};
885 ++
886 ++struct mp_add_addr {
887 ++ __u8 kind;
888 ++ __u8 len;
889 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
890 ++ __u8 ipver:4,
891 ++ sub:4;
892 ++#elif defined(__BIG_ENDIAN_BITFIELD)
893 ++ __u8 sub:4,
894 ++ ipver:4;
895 ++#else
896 ++#error "Adjust your <asm/byteorder.h> defines"
897 ++#endif
898 ++ __u8 addr_id;
899 ++ union {
900 ++ struct {
901 ++ struct in_addr addr;
902 ++ __be16 port;
903 ++ } v4;
904 ++ struct {
905 ++ struct in6_addr addr;
906 ++ __be16 port;
907 ++ } v6;
908 ++ } u;
909 ++} __attribute__((__packed__));
910 ++
911 ++struct mp_remove_addr {
912 ++ __u8 kind;
913 ++ __u8 len;
914 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
915 ++ __u8 rsv:4,
916 ++ sub:4;
917 ++#elif defined(__BIG_ENDIAN_BITFIELD)
918 ++ __u8 sub:4,
919 ++ rsv:4;
920 ++#else
921 ++#error "Adjust your <asm/byteorder.h> defines"
922 ++#endif
923 ++ /* list of addr_id */
924 ++ __u8 addrs_id;
925 ++};
926 ++
927 ++struct mp_fail {
928 ++ __u8 kind;
929 ++ __u8 len;
930 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
931 ++ __u16 rsv1:4,
932 ++ sub:4,
933 ++ rsv2:8;
934 ++#elif defined(__BIG_ENDIAN_BITFIELD)
935 ++ __u16 sub:4,
936 ++ rsv1:4,
937 ++ rsv2:8;
938 ++#else
939 ++#error "Adjust your <asm/byteorder.h> defines"
940 ++#endif
941 ++ __be64 data_seq;
942 ++} __attribute__((__packed__));
943 ++
944 ++struct mp_fclose {
945 ++ __u8 kind;
946 ++ __u8 len;
947 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
948 ++ __u16 rsv1:4,
949 ++ sub:4,
950 ++ rsv2:8;
951 ++#elif defined(__BIG_ENDIAN_BITFIELD)
952 ++ __u16 sub:4,
953 ++ rsv1:4,
954 ++ rsv2:8;
955 ++#else
956 ++#error "Adjust your <asm/byteorder.h> defines"
957 ++#endif
958 ++ __u64 key;
959 ++} __attribute__((__packed__));
960 ++
961 ++struct mp_prio {
962 ++ __u8 kind;
963 ++ __u8 len;
964 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
965 ++ __u8 b:1,
966 ++ rsv:3,
967 ++ sub:4;
968 ++#elif defined(__BIG_ENDIAN_BITFIELD)
969 ++ __u8 sub:4,
970 ++ rsv:3,
971 ++ b:1;
972 ++#else
973 ++#error "Adjust your <asm/byteorder.h> defines"
974 ++#endif
975 ++ __u8 addr_id;
976 ++} __attribute__((__packed__));
977 ++
978 ++static inline int mptcp_sub_len_dss(const struct mp_dss *m, const int csum)
979 ++{
980 ++ return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2);
981 ++}
982 ++
983 ++#define MPTCP_APP 2
984 ++
985 ++extern int sysctl_mptcp_enabled;
986 ++extern int sysctl_mptcp_checksum;
987 ++extern int sysctl_mptcp_debug;
988 ++extern int sysctl_mptcp_syn_retries;
989 ++
990 ++extern struct workqueue_struct *mptcp_wq;
991 ++
992 ++#define mptcp_debug(fmt, args...) \
993 ++ do { \
994 ++ if (unlikely(sysctl_mptcp_debug)) \
995 ++ pr_err(__FILE__ ": " fmt, ##args); \
996 ++ } while (0)
997 ++
998 ++/* Iterates over all subflows */
999 ++#define mptcp_for_each_tp(mpcb, tp) \
1000 ++ for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next)
1001 ++
1002 ++#define mptcp_for_each_sk(mpcb, sk) \
1003 ++ for ((sk) = (struct sock *)(mpcb)->connection_list; \
1004 ++ sk; \
1005 ++ sk = (struct sock *)tcp_sk(sk)->mptcp->next)
1006 ++
1007 ++#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) \
1008 ++ for (__sk = (struct sock *)(__mpcb)->connection_list, \
1009 ++ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \
1010 ++ __sk; \
1011 ++ __sk = __temp, \
1012 ++ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL)
1013 ++
1014 ++/* Iterates over all bit set to 1 in a bitset */
1015 ++#define mptcp_for_each_bit_set(b, i) \
1016 ++ for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1)
1017 ++
1018 ++#define mptcp_for_each_bit_unset(b, i) \
1019 ++ mptcp_for_each_bit_set(~b, i)
1020 ++
1021 ++extern struct lock_class_key meta_key;
1022 ++extern struct lock_class_key meta_slock_key;
1023 ++extern u32 mptcp_secret[MD5_MESSAGE_BYTES / 4];
1024 ++
1025 ++/* This is needed to ensure that two subsequent key/nonce-generation result in
1026 ++ * different keys/nonces if the IPs and ports are the same.
1027 ++ */
1028 ++extern u32 mptcp_seed;
1029 ++
1030 ++#define MPTCP_HASH_SIZE 1024
1031 ++
1032 ++extern struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
1033 ++
1034 ++/* This second hashtable is needed to retrieve request socks
1035 ++ * created as a result of a join request. While the SYN contains
1036 ++ * the token, the final ack does not, so we need a separate hashtable
1037 ++ * to retrieve the mpcb.
1038 ++ */
1039 ++extern struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
1040 ++extern spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
1041 ++
1042 ++/* Lock, protecting the two hash-tables that hold the token. Namely,
1043 ++ * mptcp_reqsk_tk_htb and tk_hashtable
1044 ++ */
1045 ++extern spinlock_t mptcp_tk_hashlock; /* hashtable protection */
1046 ++
1047 ++/* Request-sockets can be hashed in the tk_htb for collision-detection or in
1048 ++ * the regular htb for join-connections. We need to define different NULLS
1049 ++ * values so that we can correctly detect a request-socket that has been
1050 ++ * recycled. See also c25eb3bfb9729.
1051 ++ */
1052 ++#define MPTCP_REQSK_NULLS_BASE (1U << 29)
1053 ++
1054 ++
1055 ++void mptcp_data_ready(struct sock *sk);
1056 ++void mptcp_write_space(struct sock *sk);
1057 ++
1058 ++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
1059 ++ struct sock *sk);
1060 ++void mptcp_ofo_queue(struct sock *meta_sk);
1061 ++void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp);
1062 ++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied);
1063 ++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
1064 ++ gfp_t flags);
1065 ++void mptcp_del_sock(struct sock *sk);
1066 ++void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk);
1067 ++void mptcp_reinject_data(struct sock *orig_sk, int clone_it);
1068 ++void mptcp_update_sndbuf(const struct tcp_sock *tp);
1069 ++void mptcp_send_fin(struct sock *meta_sk);
1070 ++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority);
1071 ++bool mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1072 ++ int push_one, gfp_t gfp);
1073 ++void tcp_parse_mptcp_options(const struct sk_buff *skb,
1074 ++ struct mptcp_options_received *mopt);
1075 ++void mptcp_parse_options(const uint8_t *ptr, int opsize,
1076 ++ struct mptcp_options_received *mopt,
1077 ++ const struct sk_buff *skb);
1078 ++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
1079 ++ unsigned *remaining);
1080 ++void mptcp_synack_options(struct request_sock *req,
1081 ++ struct tcp_out_options *opts,
1082 ++ unsigned *remaining);
1083 ++void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
1084 ++ struct tcp_out_options *opts, unsigned *size);
1085 ++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
1086 ++ const struct tcp_out_options *opts,
1087 ++ struct sk_buff *skb);
1088 ++void mptcp_close(struct sock *meta_sk, long timeout);
1089 ++int mptcp_doit(struct sock *sk);
1090 ++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window);
1091 ++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req);
1092 ++int mptcp_check_req_master(struct sock *sk, struct sock *child,
1093 ++ struct request_sock *req,
1094 ++ struct request_sock **prev);
1095 ++struct sock *mptcp_check_req_child(struct sock *sk, struct sock *child,
1096 ++ struct request_sock *req,
1097 ++ struct request_sock **prev,
1098 ++ const struct mptcp_options_received *mopt);
1099 ++u32 __mptcp_select_window(struct sock *sk);
1100 ++void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
1101 ++ __u32 *window_clamp, int wscale_ok,
1102 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
1103 ++ const struct sock *sk);
1104 ++unsigned int mptcp_current_mss(struct sock *meta_sk);
1105 ++int mptcp_select_size(const struct sock *meta_sk, bool sg);
1106 ++void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn);
1107 ++void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
1108 ++ u32 *hash_out);
1109 ++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk);
1110 ++void mptcp_fin(struct sock *meta_sk);
1111 ++void mptcp_retransmit_timer(struct sock *meta_sk);
1112 ++int mptcp_write_wakeup(struct sock *meta_sk);
1113 ++void mptcp_sub_close_wq(struct work_struct *work);
1114 ++void mptcp_sub_close(struct sock *sk, unsigned long delay);
1115 ++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk);
1116 ++void mptcp_fallback_meta_sk(struct sock *meta_sk);
1117 ++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb);
1118 ++void mptcp_ack_handler(unsigned long);
1119 ++int mptcp_check_rtt(const struct tcp_sock *tp, int time);
1120 ++int mptcp_check_snd_buf(const struct tcp_sock *tp);
1121 ++int mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
1122 ++ const struct sk_buff *skb);
1123 ++void __init mptcp_init(void);
1124 ++int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len);
1125 ++void mptcp_destroy_sock(struct sock *sk);
1126 ++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
1127 ++ const struct sk_buff *skb,
1128 ++ const struct mptcp_options_received *mopt);
1129 ++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
1130 ++ int large_allowed);
1131 ++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw);
1132 ++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw);
1133 ++void mptcp_time_wait(struct sock *sk, int state, int timeo);
1134 ++void mptcp_disconnect(struct sock *sk);
1135 ++bool mptcp_should_expand_sndbuf(const struct sock *sk);
1136 ++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb);
1137 ++void mptcp_tsq_flags(struct sock *sk);
1138 ++void mptcp_tsq_sub_deferred(struct sock *meta_sk);
1139 ++struct mp_join *mptcp_find_join(const struct sk_buff *skb);
1140 ++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp);
1141 ++void mptcp_hash_remove(struct tcp_sock *meta_tp);
1142 ++struct sock *mptcp_hash_find(const struct net *net, const u32 token);
1143 ++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw);
1144 ++int mptcp_do_join_short(struct sk_buff *skb,
1145 ++ const struct mptcp_options_received *mopt,
1146 ++ struct net *net);
1147 ++void mptcp_reqsk_destructor(struct request_sock *req);
1148 ++void mptcp_reqsk_new_mptcp(struct request_sock *req,
1149 ++ const struct mptcp_options_received *mopt,
1150 ++ const struct sk_buff *skb);
1151 ++int mptcp_check_req(struct sk_buff *skb, struct net *net);
1152 ++void mptcp_connect_init(struct sock *sk);
1153 ++void mptcp_sub_force_close(struct sock *sk);
1154 ++int mptcp_sub_len_remove_addr_align(u16 bitfield);
1155 ++void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
1156 ++ const struct sk_buff *skb);
1157 ++void mptcp_init_buffer_space(struct sock *sk);
1158 ++void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req,
1159 ++ struct sk_buff *skb);
1160 ++void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb);
1161 ++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb);
1162 ++void mptcp_init_congestion_control(struct sock *sk);
1163 ++
1164 ++/* MPTCP-path-manager registration/initialization functions */
1165 ++int mptcp_register_path_manager(struct mptcp_pm_ops *pm);
1166 ++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm);
1167 ++void mptcp_init_path_manager(struct mptcp_cb *mpcb);
1168 ++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb);
1169 ++void mptcp_fallback_default(struct mptcp_cb *mpcb);
1170 ++void mptcp_get_default_path_manager(char *name);
1171 ++int mptcp_set_default_path_manager(const char *name);
1172 ++extern struct mptcp_pm_ops mptcp_pm_default;
1173 ++
1174 ++/* MPTCP-scheduler registration/initialization functions */
1175 ++int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
1176 ++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
1177 ++void mptcp_init_scheduler(struct mptcp_cb *mpcb);
1178 ++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb);
1179 ++void mptcp_get_default_scheduler(char *name);
1180 ++int mptcp_set_default_scheduler(const char *name);
1181 ++extern struct mptcp_sched_ops mptcp_sched_default;
1182 ++
1183 ++static inline void mptcp_reset_synack_timer(struct sock *meta_sk,
1184 ++ unsigned long len)
1185 ++{
1186 ++ sk_reset_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer,
1187 ++ jiffies + len);
1188 ++}
1189 ++
1190 ++static inline void mptcp_delete_synack_timer(struct sock *meta_sk)
1191 ++{
1192 ++ sk_stop_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer);
1193 ++}
1194 ++
1195 ++static inline bool is_mptcp_enabled(const struct sock *sk)
1196 ++{
1197 ++ if (!sysctl_mptcp_enabled || mptcp_init_failed)
1198 ++ return false;
1199 ++
1200 ++ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
1201 ++ return false;
1202 ++
1203 ++ return true;
1204 ++}
1205 ++
1206 ++static inline int mptcp_pi_to_flag(int pi)
1207 ++{
1208 ++ return 1 << (pi - 1);
1209 ++}
1210 ++
1211 ++static inline
1212 ++struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req)
1213 ++{
1214 ++ return (struct mptcp_request_sock *)req;
1215 ++}
1216 ++
1217 ++static inline
1218 ++struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req)
1219 ++{
1220 ++ return (struct request_sock *)req;
1221 ++}
1222 ++
1223 ++static inline bool mptcp_can_sendpage(struct sock *sk)
1224 ++{
1225 ++ struct sock *sk_it;
1226 ++
1227 ++ if (tcp_sk(sk)->mpcb->dss_csum)
1228 ++ return false;
1229 ++
1230 ++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) {
1231 ++ if (!(sk_it->sk_route_caps & NETIF_F_SG) ||
1232 ++ !(sk_it->sk_route_caps & NETIF_F_ALL_CSUM))
1233 ++ return false;
1234 ++ }
1235 ++
1236 ++ return true;
1237 ++}
1238 ++
1239 ++static inline void mptcp_push_pending_frames(struct sock *meta_sk)
1240 ++{
1241 ++ /* We check packets out and send-head here. TCP only checks the
1242 ++ * send-head. But, MPTCP also checks packets_out, as this is an
1243 ++ * indication that we might want to do opportunistic reinjection.
1244 ++ */
1245 ++ if (tcp_sk(meta_sk)->packets_out || tcp_send_head(meta_sk)) {
1246 ++ struct tcp_sock *tp = tcp_sk(meta_sk);
1247 ++
1248 ++ /* We don't care about the MSS, because it will be set in
1249 ++ * mptcp_write_xmit.
1250 ++ */
1251 ++ __tcp_push_pending_frames(meta_sk, 0, tp->nonagle);
1252 ++ }
1253 ++}
1254 ++
1255 ++static inline void mptcp_send_reset(struct sock *sk)
1256 ++{
1257 ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
1258 ++ mptcp_sub_force_close(sk);
1259 ++}
1260 ++
1261 ++static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
1262 ++{
1263 ++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ;
1264 ++}
1265 ++
1266 ++static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
1267 ++{
1268 ++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN;
1269 ++}
1270 ++
1271 ++/* Is it a data-fin while in infinite mapping mode?
1272 ++ * In infinite mode, a subflow-fin is in fact a data-fin.
1273 ++ */
1274 ++static inline bool mptcp_is_data_fin2(const struct sk_buff *skb,
1275 ++ const struct tcp_sock *tp)
1276 ++{
1277 ++ return mptcp_is_data_fin(skb) ||
1278 ++ (tp->mpcb->infinite_mapping_rcv && tcp_hdr(skb)->fin);
1279 ++}
1280 ++
1281 ++static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
1282 ++{
1283 ++ u64 data_seq_high = (u32)(data_seq >> 32);
1284 ++
1285 ++ if (mpcb->rcv_high_order[0] == data_seq_high)
1286 ++ return 0;
1287 ++ else if (mpcb->rcv_high_order[1] == data_seq_high)
1288 ++ return MPTCPHDR_SEQ64_INDEX;
1289 ++ else
1290 ++ return MPTCPHDR_SEQ64_OFO;
1291 ++}
1292 ++
1293 ++/* Sets the data_seq and returns pointer to the in-skb field of the data_seq.
1294 ++ * If the packet has a 64-bit dseq, the pointer points to the last 32 bits.
1295 ++ */
1296 ++static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb,
1297 ++ u32 *data_seq,
1298 ++ struct mptcp_cb *mpcb)
1299 ++{
1300 ++ __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
1301 ++
1302 ++ if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
1303 ++ u64 data_seq64 = get_unaligned_be64(ptr);
1304 ++
1305 ++ if (mpcb)
1306 ++ TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);
1307 ++
1308 ++ *data_seq = (u32)data_seq64;
1309 ++ ptr++;
1310 ++ } else {
1311 ++ *data_seq = get_unaligned_be32(ptr);
1312 ++ }
1313 ++
1314 ++ return ptr;
1315 ++}
1316 ++
1317 ++static inline struct sock *mptcp_meta_sk(const struct sock *sk)
1318 ++{
1319 ++ return tcp_sk(sk)->meta_sk;
1320 ++}
1321 ++
1322 ++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
1323 ++{
1324 ++ return tcp_sk(tp->meta_sk);
1325 ++}
1326 ++
1327 ++static inline int is_meta_tp(const struct tcp_sock *tp)
1328 ++{
1329 ++ return tp->mpcb && mptcp_meta_tp(tp) == tp;
1330 ++}
1331 ++
1332 ++static inline int is_meta_sk(const struct sock *sk)
1333 ++{
1334 ++ return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP &&
1335 ++ mptcp(tcp_sk(sk)) && mptcp_meta_sk(sk) == sk;
1336 ++}
1337 ++
1338 ++static inline int is_master_tp(const struct tcp_sock *tp)
1339 ++{
1340 ++ return !mptcp(tp) || (!tp->mptcp->slave_sk && !is_meta_tp(tp));
1341 ++}
1342 ++
1343 ++static inline void mptcp_hash_request_remove(struct request_sock *req)
1344 ++{
1345 ++ int in_softirq = 0;
1346 ++
1347 ++ if (hlist_nulls_unhashed(&mptcp_rsk(req)->hash_entry))
1348 ++ return;
1349 ++
1350 ++ if (in_softirq()) {
1351 ++ spin_lock(&mptcp_reqsk_hlock);
1352 ++ in_softirq = 1;
1353 ++ } else {
1354 ++ spin_lock_bh(&mptcp_reqsk_hlock);
1355 ++ }
1356 ++
1357 ++ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry);
1358 ++
1359 ++ if (in_softirq)
1360 ++ spin_unlock(&mptcp_reqsk_hlock);
1361 ++ else
1362 ++ spin_unlock_bh(&mptcp_reqsk_hlock);
1363 ++}
1364 ++
1365 ++static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt)
1366 ++{
1367 ++ mopt->saw_mpc = 0;
1368 ++ mopt->dss_csum = 0;
1369 ++ mopt->drop_me = 0;
1370 ++
1371 ++ mopt->is_mp_join = 0;
1372 ++ mopt->join_ack = 0;
1373 ++
1374 ++ mopt->saw_low_prio = 0;
1375 ++ mopt->low_prio = 0;
1376 ++
1377 ++ mopt->saw_add_addr = 0;
1378 ++ mopt->more_add_addr = 0;
1379 ++
1380 ++ mopt->saw_rem_addr = 0;
1381 ++ mopt->more_rem_addr = 0;
1382 ++
1383 ++ mopt->mp_fail = 0;
1384 ++ mopt->mp_fclose = 0;
1385 ++}
1386 ++
1387 ++static inline void mptcp_reset_mopt(struct tcp_sock *tp)
1388 ++{
1389 ++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
1390 ++
1391 ++ mopt->saw_low_prio = 0;
1392 ++ mopt->saw_add_addr = 0;
1393 ++ mopt->more_add_addr = 0;
1394 ++ mopt->saw_rem_addr = 0;
1395 ++ mopt->more_rem_addr = 0;
1396 ++ mopt->join_ack = 0;
1397 ++ mopt->mp_fail = 0;
1398 ++ mopt->mp_fclose = 0;
1399 ++}
1400 ++
1401 ++static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb,
1402 ++ const struct mptcp_cb *mpcb)
1403 ++{
1404 ++ return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
1405 ++ MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
1406 ++}
1407 ++
1408 ++static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index,
1409 ++ u32 data_seq_32)
1410 ++{
1411 ++ return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
1412 ++}
1413 ++
1414 ++static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
1415 ++{
1416 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
1417 ++ return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
1418 ++ meta_tp->rcv_nxt);
1419 ++}
1420 ++
1421 ++static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
1422 ++{
1423 ++ if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
1424 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
1425 ++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
1426 ++ mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
1427 ++ }
1428 ++}
1429 ++
1430 ++static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp,
1431 ++ u32 old_rcv_nxt)
1432 ++{
1433 ++ if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
1434 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
1435 ++ mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
1436 ++ mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
1437 ++ }
1438 ++}
1439 ++
1440 ++static inline int mptcp_sk_can_send(const struct sock *sk)
1441 ++{
1442 ++ return tcp_passive_fastopen(sk) ||
1443 ++ ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1444 ++ !tcp_sk(sk)->mptcp->pre_established);
1445 ++}
1446 ++
1447 ++static inline int mptcp_sk_can_recv(const struct sock *sk)
1448 ++{
1449 ++ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2);
1450 ++}
1451 ++
1452 ++static inline int mptcp_sk_can_send_ack(const struct sock *sk)
1453 ++{
1454 ++ return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV |
1455 ++ TCPF_CLOSE | TCPF_LISTEN)) &&
1456 ++ !tcp_sk(sk)->mptcp->pre_established;
1457 ++}
1458 ++
1459 ++/* Only support GSO if all subflows supports it */
1460 ++static inline bool mptcp_sk_can_gso(const struct sock *meta_sk)
1461 ++{
1462 ++ struct sock *sk;
1463 ++
1464 ++ if (tcp_sk(meta_sk)->mpcb->dss_csum)
1465 ++ return false;
1466 ++
1467 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
1468 ++ if (!mptcp_sk_can_send(sk))
1469 ++ continue;
1470 ++ if (!sk_can_gso(sk))
1471 ++ return false;
1472 ++ }
1473 ++ return true;
1474 ++}
1475 ++
1476 ++static inline bool mptcp_can_sg(const struct sock *meta_sk)
1477 ++{
1478 ++ struct sock *sk;
1479 ++
1480 ++ if (tcp_sk(meta_sk)->mpcb->dss_csum)
1481 ++ return false;
1482 ++
1483 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
1484 ++ if (!mptcp_sk_can_send(sk))
1485 ++ continue;
1486 ++ if (!(sk->sk_route_caps & NETIF_F_SG))
1487 ++ return false;
1488 ++ }
1489 ++ return true;
1490 ++}
1491 ++
1492 ++static inline void mptcp_set_rto(struct sock *sk)
1493 ++{
1494 ++ struct tcp_sock *tp = tcp_sk(sk);
1495 ++ struct sock *sk_it;
1496 ++ struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk));
1497 ++ __u32 max_rto = 0;
1498 ++
1499 ++ /* We are in recovery-phase on the MPTCP-level. Do not update the
1500 ++ * RTO, because this would kill exponential backoff.
1501 ++ */
1502 ++ if (micsk->icsk_retransmits)
1503 ++ return;
1504 ++
1505 ++ mptcp_for_each_sk(tp->mpcb, sk_it) {
1506 ++ if (mptcp_sk_can_send(sk_it) &&
1507 ++ inet_csk(sk_it)->icsk_rto > max_rto)
1508 ++ max_rto = inet_csk(sk_it)->icsk_rto;
1509 ++ }
1510 ++ if (max_rto) {
1511 ++ micsk->icsk_rto = max_rto << 1;
1512 ++
1513 ++ /* A successfull rto-measurement - reset backoff counter */
1514 ++ micsk->icsk_backoff = 0;
1515 ++ }
1516 ++}
1517 ++
1518 ++static inline int mptcp_sysctl_syn_retries(void)
1519 ++{
1520 ++ return sysctl_mptcp_syn_retries;
1521 ++}
1522 ++
1523 ++static inline void mptcp_sub_close_passive(struct sock *sk)
1524 ++{
1525 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
1526 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk);
1527 ++
1528 ++ /* Only close, if the app did a send-shutdown (passive close), and we
1529 ++ * received the data-ack of the data-fin.
1530 ++ */
1531 ++ if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq)
1532 ++ mptcp_sub_close(sk, 0);
1533 ++}
1534 ++
1535 ++static inline bool mptcp_fallback_infinite(struct sock *sk, int flag)
1536 ++{
1537 ++ struct tcp_sock *tp = tcp_sk(sk);
1538 ++
1539 ++ /* If data has been acknowleged on the meta-level, fully_established
1540 ++ * will have been set before and thus we will not fall back to infinite
1541 ++ * mapping.
1542 ++ */
1543 ++ if (likely(tp->mptcp->fully_established))
1544 ++ return false;
1545 ++
1546 ++ if (!(flag & MPTCP_FLAG_DATA_ACKED))
1547 ++ return false;
1548 ++
1549 ++ /* Don't fallback twice ;) */
1550 ++ if (tp->mpcb->infinite_mapping_snd)
1551 ++ return false;
1552 ++
1553 ++ pr_err("%s %#x will fallback - pi %d, src %pI4 dst %pI4 from %pS\n",
1554 ++ __func__, tp->mpcb->mptcp_loc_token, tp->mptcp->path_index,
1555 ++ &inet_sk(sk)->inet_saddr, &inet_sk(sk)->inet_daddr,
1556 ++ __builtin_return_address(0));
1557 ++ if (!is_master_tp(tp))
1558 ++ return true;
1559 ++
1560 ++ tp->mpcb->infinite_mapping_snd = 1;
1561 ++ tp->mpcb->infinite_mapping_rcv = 1;
1562 ++ tp->mptcp->fully_established = 1;
1563 ++
1564 ++ return false;
1565 ++}
1566 ++
1567 ++/* Find the first index whose bit in the bit-field == 0 */
1568 ++static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb)
1569 ++{
1570 ++ u8 base = mpcb->next_path_index;
1571 ++ int i;
1572 ++
1573 ++ /* Start at 1, because 0 is reserved for the meta-sk */
1574 ++ mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) {
1575 ++ if (i + base < 1)
1576 ++ continue;
1577 ++ if (i + base >= sizeof(mpcb->path_index_bits) * 8)
1578 ++ break;
1579 ++ i += base;
1580 ++ mpcb->path_index_bits |= (1 << i);
1581 ++ mpcb->next_path_index = i + 1;
1582 ++ return i;
1583 ++ }
1584 ++ mptcp_for_each_bit_unset(mpcb->path_index_bits, i) {
1585 ++ if (i >= sizeof(mpcb->path_index_bits) * 8)
1586 ++ break;
1587 ++ if (i < 1)
1588 ++ continue;
1589 ++ mpcb->path_index_bits |= (1 << i);
1590 ++ mpcb->next_path_index = i + 1;
1591 ++ return i;
1592 ++ }
1593 ++
1594 ++ return 0;
1595 ++}
1596 ++
1597 ++static inline bool mptcp_v6_is_v4_mapped(const struct sock *sk)
1598 ++{
1599 ++ return sk->sk_family == AF_INET6 &&
1600 ++ ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED;
1601 ++}
1602 ++
1603 ++/* TCP and MPTCP mpc flag-depending functions */
1604 ++u16 mptcp_select_window(struct sock *sk);
1605 ++void mptcp_init_buffer_space(struct sock *sk);
1606 ++void mptcp_tcp_set_rto(struct sock *sk);
1607 ++
1608 ++/* TCP and MPTCP flag-depending functions */
1609 ++bool mptcp_prune_ofo_queue(struct sock *sk);
1610 ++
1611 ++#else /* CONFIG_MPTCP */
1612 ++#define mptcp_debug(fmt, args...) \
1613 ++ do { \
1614 ++ } while (0)
1615 ++
1616 ++/* Without MPTCP, we just do one iteration
1617 ++ * over the only socket available. This assumes that
1618 ++ * the sk/tp arg is the socket in that case.
1619 ++ */
1620 ++#define mptcp_for_each_sk(mpcb, sk)
1621 ++#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp)
1622 ++
1623 ++static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
1624 ++{
1625 ++ return false;
1626 ++}
1627 ++static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
1628 ++{
1629 ++ return false;
1630 ++}
1631 ++static inline struct sock *mptcp_meta_sk(const struct sock *sk)
1632 ++{
1633 ++ return NULL;
1634 ++}
1635 ++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
1636 ++{
1637 ++ return NULL;
1638 ++}
1639 ++static inline int is_meta_sk(const struct sock *sk)
1640 ++{
1641 ++ return 0;
1642 ++}
1643 ++static inline int is_master_tp(const struct tcp_sock *tp)
1644 ++{
1645 ++ return 0;
1646 ++}
1647 ++static inline void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) {}
1648 ++static inline void mptcp_del_sock(const struct sock *sk) {}
1649 ++static inline void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk) {}
1650 ++static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {}
1651 ++static inline void mptcp_update_sndbuf(const struct tcp_sock *tp) {}
1652 ++static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb,
1653 ++ const struct sock *sk) {}
1654 ++static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {}
1655 ++static inline void mptcp_set_rto(const struct sock *sk) {}
1656 ++static inline void mptcp_send_fin(const struct sock *meta_sk) {}
1657 ++static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize,
1658 ++ const struct mptcp_options_received *mopt,
1659 ++ const struct sk_buff *skb) {}
1660 ++static inline void mptcp_syn_options(const struct sock *sk,
1661 ++ struct tcp_out_options *opts,
1662 ++ unsigned *remaining) {}
1663 ++static inline void mptcp_synack_options(struct request_sock *req,
1664 ++ struct tcp_out_options *opts,
1665 ++ unsigned *remaining) {}
1666 ++
1667 ++static inline void mptcp_established_options(struct sock *sk,
1668 ++ struct sk_buff *skb,
1669 ++ struct tcp_out_options *opts,
1670 ++ unsigned *size) {}
1671 ++static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
1672 ++ const struct tcp_out_options *opts,
1673 ++ struct sk_buff *skb) {}
1674 ++static inline void mptcp_close(struct sock *meta_sk, long timeout) {}
1675 ++static inline int mptcp_doit(struct sock *sk)
1676 ++{
1677 ++ return 0;
1678 ++}
1679 ++static inline int mptcp_check_req_fastopen(struct sock *child,
1680 ++ struct request_sock *req)
1681 ++{
1682 ++ return 1;
1683 ++}
1684 ++static inline int mptcp_check_req_master(const struct sock *sk,
1685 ++ const struct sock *child,
1686 ++ struct request_sock *req,
1687 ++ struct request_sock **prev)
1688 ++{
1689 ++ return 1;
1690 ++}
1691 ++static inline struct sock *mptcp_check_req_child(struct sock *sk,
1692 ++ struct sock *child,
1693 ++ struct request_sock *req,
1694 ++ struct request_sock **prev,
1695 ++ const struct mptcp_options_received *mopt)
1696 ++{
1697 ++ return NULL;
1698 ++}
1699 ++static inline unsigned int mptcp_current_mss(struct sock *meta_sk)
1700 ++{
1701 ++ return 0;
1702 ++}
1703 ++static inline int mptcp_select_size(const struct sock *meta_sk, bool sg)
1704 ++{
1705 ++ return 0;
1706 ++}
1707 ++static inline void mptcp_sub_close_passive(struct sock *sk) {}
1708 ++static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag)
1709 ++{
1710 ++ return false;
1711 ++}
1712 ++static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {}
1713 ++static inline int mptcp_check_rtt(const struct tcp_sock *tp, int time)
1714 ++{
1715 ++ return 0;
1716 ++}
1717 ++static inline int mptcp_check_snd_buf(const struct tcp_sock *tp)
1718 ++{
1719 ++ return 0;
1720 ++}
1721 ++static inline int mptcp_sysctl_syn_retries(void)
1722 ++{
1723 ++ return 0;
1724 ++}
1725 ++static inline void mptcp_send_reset(const struct sock *sk) {}
1726 ++static inline int mptcp_handle_options(struct sock *sk,
1727 ++ const struct tcphdr *th,
1728 ++ struct sk_buff *skb)
1729 ++{
1730 ++ return 0;
1731 ++}
1732 ++static inline void mptcp_reset_mopt(struct tcp_sock *tp) {}
1733 ++static inline void __init mptcp_init(void) {}
1734 ++static inline int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1735 ++{
1736 ++ return 0;
1737 ++}
1738 ++static inline bool mptcp_sk_can_gso(const struct sock *sk)
1739 ++{
1740 ++ return false;
1741 ++}
1742 ++static inline bool mptcp_can_sg(const struct sock *meta_sk)
1743 ++{
1744 ++ return false;
1745 ++}
1746 ++static inline unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk,
1747 ++ u32 mss_now, int large_allowed)
1748 ++{
1749 ++ return 0;
1750 ++}
1751 ++static inline void mptcp_destroy_sock(struct sock *sk) {}
1752 ++static inline int mptcp_rcv_synsent_state_process(struct sock *sk,
1753 ++ struct sock **skptr,
1754 ++ struct sk_buff *skb,
1755 ++ const struct mptcp_options_received *mopt)
1756 ++{
1757 ++ return 0;
1758 ++}
1759 ++static inline bool mptcp_can_sendpage(struct sock *sk)
1760 ++{
1761 ++ return false;
1762 ++}
1763 ++static inline int mptcp_init_tw_sock(struct sock *sk,
1764 ++ struct tcp_timewait_sock *tw)
1765 ++{
1766 ++ return 0;
1767 ++}
1768 ++static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {}
1769 ++static inline void mptcp_disconnect(struct sock *sk) {}
1770 ++static inline void mptcp_tsq_flags(struct sock *sk) {}
1771 ++static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {}
1772 ++static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {}
1773 ++static inline void mptcp_hash_remove(struct tcp_sock *meta_tp) {}
1774 ++static inline void mptcp_reqsk_new_mptcp(struct request_sock *req,
1775 ++ const struct tcp_options_received *rx_opt,
1776 ++ const struct mptcp_options_received *mopt,
1777 ++ const struct sk_buff *skb) {}
1778 ++static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
1779 ++ const struct sk_buff *skb) {}
1780 ++static inline void mptcp_delete_synack_timer(struct sock *meta_sk) {}
1781 ++#endif /* CONFIG_MPTCP */
1782 ++
1783 ++#endif /* _MPTCP_H */
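The header above hands out subflow path indices from a bit-field: index 0 is reserved for the meta-socket, the first pass starts at next_path_index so indices are assigned round-robin, and a second pass wraps around to pick up freed slots. A minimal user-space sketch of the same allocation strategy follows; the field names mirror the header, but the stand-alone helpers and the struct are illustrative, not kernel code.

#include <stdint.h>
#include <stdio.h>

struct pi_state {
	uint32_t path_index_bits;	/* bit i set => index i in use */
	uint8_t  next_path_index;	/* round-robin starting point */
};

/* Return the first free index >= 1, preferring indices >= next_path_index,
 * then wrapping around once; 0 means "nothing free" (0 is the meta-sk). */
static uint8_t pi_alloc(struct pi_state *s)
{
	unsigned int i;

	for (i = s->next_path_index; i < 32; i++) {
		if (i >= 1 && !(s->path_index_bits & (1u << i))) {
			s->path_index_bits |= 1u << i;
			s->next_path_index = i + 1;
			return i;
		}
	}
	for (i = 1; i < 32; i++) {	/* wrap-around pass over freed slots */
		if (!(s->path_index_bits & (1u << i))) {
			s->path_index_bits |= 1u << i;
			s->next_path_index = i + 1;
			return i;
		}
	}
	return 0;
}

static void pi_free(struct pi_state *s, uint8_t idx)
{
	s->path_index_bits &= ~(1u << idx);
}

int main(void)
{
	struct pi_state s = { .path_index_bits = 1, .next_path_index = 1 };

	printf("%u %u\n", pi_alloc(&s), pi_alloc(&s));	/* prints "1 2" */
	pi_free(&s, 1);
	printf("%u\n", pi_alloc(&s));	/* prints "3": round-robin, index 1 is reused only after wrap */
	return 0;
}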
1784 +diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h
1785 +new file mode 100644
1786 +index 000000000000..93ad97c77c5a
1787 +--- /dev/null
1788 ++++ b/include/net/mptcp_v4.h
1789 +@@ -0,0 +1,67 @@
1790 ++/*
1791 ++ * MPTCP implementation
1792 ++ *
1793 ++ * Initial Design & Implementation:
1794 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
1795 ++ *
1796 ++ * Current Maintainer & Author:
1797 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
1798 ++ *
1799 ++ * Additional authors:
1800 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1801 ++ * Gregory Detal <gregory.detal@×××××××××.be>
1802 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
1803 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
1804 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
1805 ++ * Andreas Ripke <ripke@××××××.eu>
1806 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
1807 ++ * Octavian Purdila <octavian.purdila@×××××.com>
1808 ++ * John Ronan <jronan@××××.org>
1809 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
1810 ++ * Brandon Heller <brandonh@××××××××.edu>
1811 ++ *
1812 ++ *
1813 ++ * This program is free software; you can redistribute it and/or
1814 ++ * modify it under the terms of the GNU General Public License
1815 ++ * as published by the Free Software Foundation; either version
1816 ++ * 2 of the License, or (at your option) any later version.
1817 ++ */
1818 ++
1819 ++#ifndef MPTCP_V4_H_
1820 ++#define MPTCP_V4_H_
1821 ++
1822 ++
1823 ++#include <linux/in.h>
1824 ++#include <linux/skbuff.h>
1825 ++#include <net/mptcp.h>
1826 ++#include <net/request_sock.h>
1827 ++#include <net/sock.h>
1828 ++
1829 ++extern struct request_sock_ops mptcp_request_sock_ops;
1830 ++extern const struct inet_connection_sock_af_ops mptcp_v4_specific;
1831 ++extern struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
1832 ++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
1833 ++
1834 ++#ifdef CONFIG_MPTCP
1835 ++
1836 ++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
1837 ++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
1838 ++ const __be32 laddr, const struct net *net);
1839 ++int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
1840 ++ struct mptcp_rem4 *rem);
1841 ++int mptcp_pm_v4_init(void);
1842 ++void mptcp_pm_v4_undo(void);
1843 ++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
1844 ++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
1845 ++
1846 ++#else
1847 ++
1848 ++static inline int mptcp_v4_do_rcv(const struct sock *meta_sk,
1849 ++ const struct sk_buff *skb)
1850 ++{
1851 ++ return 0;
1852 ++}
1853 ++
1854 ++#endif /* CONFIG_MPTCP */
1855 ++
1856 ++#endif /* MPTCP_V4_H_ */
1857 +diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h
1858 +new file mode 100644
1859 +index 000000000000..49a4f30ccd4d
1860 +--- /dev/null
1861 ++++ b/include/net/mptcp_v6.h
1862 +@@ -0,0 +1,69 @@
1863 ++/*
1864 ++ * MPTCP implementation
1865 ++ *
1866 ++ * Initial Design & Implementation:
1867 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
1868 ++ *
1869 ++ * Current Maintainer & Author:
1870 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1871 ++ *
1872 ++ * Additional authors:
1873 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1874 ++ * Gregory Detal <gregory.detal@×××××××××.be>
1875 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
1876 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
1877 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
1878 ++ * Andreas Ripke <ripke@××××××.eu>
1879 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
1880 ++ * Octavian Purdila <octavian.purdila@×××××.com>
1881 ++ * John Ronan <jronan@××××.org>
1882 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
1883 ++ * Brandon Heller <brandonh@××××××××.edu>
1884 ++ *
1885 ++ *
1886 ++ * This program is free software; you can redistribute it and/or
1887 ++ * modify it under the terms of the GNU General Public License
1888 ++ * as published by the Free Software Foundation; either version
1889 ++ * 2 of the License, or (at your option) any later version.
1890 ++ */
1891 ++
1892 ++#ifndef _MPTCP_V6_H
1893 ++#define _MPTCP_V6_H
1894 ++
1895 ++#include <linux/in6.h>
1896 ++#include <net/if_inet6.h>
1897 ++
1898 ++#include <net/mptcp.h>
1899 ++
1900 ++
1901 ++#ifdef CONFIG_MPTCP
1902 ++extern const struct inet_connection_sock_af_ops mptcp_v6_mapped;
1903 ++extern const struct inet_connection_sock_af_ops mptcp_v6_specific;
1904 ++extern struct request_sock_ops mptcp6_request_sock_ops;
1905 ++extern struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
1906 ++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
1907 ++
1908 ++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
1909 ++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
1910 ++ const struct in6_addr *laddr, const struct net *net);
1911 ++int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
1912 ++ struct mptcp_rem6 *rem);
1913 ++int mptcp_pm_v6_init(void);
1914 ++void mptcp_pm_v6_undo(void);
1915 ++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
1916 ++ __be16 sport, __be16 dport);
1917 ++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
1918 ++ __be16 sport, __be16 dport);
1919 ++
1920 ++#else /* CONFIG_MPTCP */
1921 ++
1922 ++#define mptcp_v6_mapped ipv6_mapped
1923 ++
1924 ++static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
1925 ++{
1926 ++ return 0;
1927 ++}
1928 ++
1929 ++#endif /* CONFIG_MPTCP */
1930 ++
1931 ++#endif /* _MPTCP_V6_H */
1932 +diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
1933 +index 361d26077196..bae95a11c531 100644
1934 +--- a/include/net/net_namespace.h
1935 ++++ b/include/net/net_namespace.h
1936 +@@ -16,6 +16,7 @@
1937 + #include <net/netns/packet.h>
1938 + #include <net/netns/ipv4.h>
1939 + #include <net/netns/ipv6.h>
1940 ++#include <net/netns/mptcp.h>
1941 + #include <net/netns/ieee802154_6lowpan.h>
1942 + #include <net/netns/sctp.h>
1943 + #include <net/netns/dccp.h>
1944 +@@ -92,6 +93,9 @@ struct net {
1945 + #if IS_ENABLED(CONFIG_IPV6)
1946 + struct netns_ipv6 ipv6;
1947 + #endif
1948 ++#if IS_ENABLED(CONFIG_MPTCP)
1949 ++ struct netns_mptcp mptcp;
1950 ++#endif
1951 + #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
1952 + struct netns_ieee802154_lowpan ieee802154_lowpan;
1953 + #endif
1954 +diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h
1955 +new file mode 100644
1956 +index 000000000000..bad418b04cc8
1957 +--- /dev/null
1958 ++++ b/include/net/netns/mptcp.h
1959 +@@ -0,0 +1,44 @@
1960 ++/*
1961 ++ * MPTCP implementation - MPTCP namespace
1962 ++ *
1963 ++ * Initial Design & Implementation:
1964 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
1965 ++ *
1966 ++ * Current Maintainer:
1967 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
1968 ++ *
1969 ++ * Additional authors:
1970 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1971 ++ * Gregory Detal <gregory.detal@×××××××××.be>
1972 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
1973 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
1974 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
1975 ++ * Andreas Ripke <ripke@××××××.eu>
1976 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
1977 ++ * Octavian Purdila <octavian.purdila@×××××.com>
1978 ++ * John Ronan <jronan@××××.org>
1979 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
1980 ++ * Brandon Heller <brandonh@××××××××.edu>
1981 ++ *
1982 ++ *
1983 ++ * This program is free software; you can redistribute it and/or
1984 ++ * modify it under the terms of the GNU General Public License
1985 ++ * as published by the Free Software Foundation; either version
1986 ++ * 2 of the License, or (at your option) any later version.
1987 ++ */
1988 ++
1989 ++#ifndef __NETNS_MPTCP_H__
1990 ++#define __NETNS_MPTCP_H__
1991 ++
1992 ++#include <linux/compiler.h>
1993 ++
1994 ++enum {
1995 ++ MPTCP_PM_FULLMESH = 0,
1996 ++ MPTCP_PM_MAX
1997 ++};
1998 ++
1999 ++struct netns_mptcp {
2000 ++ void *path_managers[MPTCP_PM_MAX];
2001 ++};
2002 ++
2003 ++#endif /* __NETNS_MPTCP_H__ */
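struct netns_mptcp gives every network namespace its own array of path-manager slots indexed by the MPTCP_PM_* enum, so a path manager (here only the full-mesh one) can keep per-namespace state by registering a pointer in its slot. Below is a compact stand-alone model of that registry pattern; the struct and function names are invented for the example, only the slot-array idea comes from the header.

#include <stdio.h>
#include <stdlib.h>

enum { PM_FULLMESH = 0, PM_MAX };	/* mirrors MPTCP_PM_FULLMESH / MPTCP_PM_MAX */

struct netns {				/* one instance per network namespace */
	void *path_managers[PM_MAX];
};

struct fullmesh_priv {			/* per-namespace state of one path manager */
	int announced_addrs;
};

static int fullmesh_net_init(struct netns *net)
{
	struct fullmesh_priv *priv = calloc(1, sizeof(*priv));

	if (!priv)
		return -1;
	net->path_managers[PM_FULLMESH] = priv;	/* register in this namespace's slot */
	return 0;
}

static void fullmesh_net_exit(struct netns *net)
{
	free(net->path_managers[PM_FULLMESH]);
	net->path_managers[PM_FULLMESH] = NULL;
}

int main(void)
{
	struct netns init_net = { { NULL } };

	if (fullmesh_net_init(&init_net) == 0) {
		struct fullmesh_priv *priv = init_net.path_managers[PM_FULLMESH];

		priv->announced_addrs = 2;
		printf("fullmesh state registered, %d addrs\n", priv->announced_addrs);
		fullmesh_net_exit(&init_net);
	}
	return 0;
}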
2004 +diff --git a/include/net/request_sock.h b/include/net/request_sock.h
2005 +index 7f830ff67f08..e79e87a8e1a6 100644
2006 +--- a/include/net/request_sock.h
2007 ++++ b/include/net/request_sock.h
2008 +@@ -164,7 +164,7 @@ struct request_sock_queue {
2009 + };
2010 +
2011 + int reqsk_queue_alloc(struct request_sock_queue *queue,
2012 +- unsigned int nr_table_entries);
2013 ++ unsigned int nr_table_entries, gfp_t flags);
2014 +
2015 + void __reqsk_queue_destroy(struct request_sock_queue *queue);
2016 + void reqsk_queue_destroy(struct request_sock_queue *queue);
2017 +diff --git a/include/net/sock.h b/include/net/sock.h
2018 +index 156350745700..0e23cae8861f 100644
2019 +--- a/include/net/sock.h
2020 ++++ b/include/net/sock.h
2021 +@@ -901,6 +901,16 @@ void sk_clear_memalloc(struct sock *sk);
2022 +
2023 + int sk_wait_data(struct sock *sk, long *timeo);
2024 +
2025 ++/* START - needed for MPTCP */
2026 ++struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family);
2027 ++void sock_lock_init(struct sock *sk);
2028 ++
2029 ++extern struct lock_class_key af_callback_keys[AF_MAX];
2030 ++extern char *const af_family_clock_key_strings[AF_MAX+1];
2031 ++
2032 ++#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
2033 ++/* END - needed for MPTCP */
2034 ++
2035 + struct request_sock_ops;
2036 + struct timewait_sock_ops;
2037 + struct inet_hashinfo;
2038 +diff --git a/include/net/tcp.h b/include/net/tcp.h
2039 +index 7286db80e8b8..ff92e74cd684 100644
2040 +--- a/include/net/tcp.h
2041 ++++ b/include/net/tcp.h
2042 +@@ -177,6 +177,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
2043 + #define TCPOPT_SACK 5 /* SACK Block */
2044 + #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
2045 + #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
2046 ++#define TCPOPT_MPTCP 30
2047 + #define TCPOPT_EXP 254 /* Experimental */
2048 + /* Magic number to be after the option value for sharing TCP
2049 + * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
2050 +@@ -229,6 +230,27 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
2051 + #define TFO_SERVER_WO_SOCKOPT1 0x400
2052 + #define TFO_SERVER_WO_SOCKOPT2 0x800
2053 +
2054 ++/* Flags from tcp_input.c for tcp_ack */
2055 ++#define FLAG_DATA 0x01 /* Incoming frame contained data. */
2056 ++#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
2057 ++#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
2058 ++#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
2059 ++#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
2060 ++#define FLAG_DATA_SACKED 0x20 /* New SACK. */
2061 ++#define FLAG_ECE 0x40 /* ECE in this ACK */
2062 ++#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
2063 ++#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
2064 ++#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
2065 ++#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
2066 ++#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
2067 ++#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
2068 ++#define MPTCP_FLAG_DATA_ACKED 0x8000
2069 ++
2070 ++#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
2071 ++#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
2072 ++#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
2073 ++#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
2074 ++
2075 + extern struct inet_timewait_death_row tcp_death_row;
2076 +
2077 + /* sysctl variables for tcp */
2078 +@@ -344,6 +366,107 @@ extern struct proto tcp_prot;
2079 + #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
2080 + #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
2081 +
2082 ++/**** START - Exports needed for MPTCP ****/
2083 ++extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
2084 ++extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
2085 ++
2086 ++struct mptcp_options_received;
2087 ++
2088 ++void tcp_enter_quickack_mode(struct sock *sk);
2089 ++int tcp_close_state(struct sock *sk);
2090 ++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
2091 ++ const struct sk_buff *skb);
2092 ++int tcp_xmit_probe_skb(struct sock *sk, int urgent);
2093 ++void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb);
2094 ++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
2095 ++ gfp_t gfp_mask);
2096 ++unsigned int tcp_mss_split_point(const struct sock *sk,
2097 ++ const struct sk_buff *skb,
2098 ++ unsigned int mss_now,
2099 ++ unsigned int max_segs,
2100 ++ int nonagle);
2101 ++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
2102 ++ unsigned int cur_mss, int nonagle);
2103 ++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
2104 ++ unsigned int cur_mss);
2105 ++unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb);
2106 ++int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
2107 ++ unsigned int mss_now);
2108 ++void __pskb_trim_head(struct sk_buff *skb, int len);
2109 ++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
2110 ++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
2111 ++void tcp_reset(struct sock *sk);
2112 ++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
2113 ++ const u32 ack_seq, const u32 nwin);
2114 ++bool tcp_urg_mode(const struct tcp_sock *tp);
2115 ++void tcp_ack_probe(struct sock *sk);
2116 ++void tcp_rearm_rto(struct sock *sk);
2117 ++int tcp_write_timeout(struct sock *sk);
2118 ++bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
2119 ++ unsigned int timeout, bool syn_set);
2120 ++void tcp_write_err(struct sock *sk);
2121 ++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr);
2122 ++void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
2123 ++ unsigned int mss_now);
2124 ++
2125 ++int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req);
2126 ++void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
2127 ++ struct request_sock *req);
2128 ++__u32 tcp_v4_init_sequence(const struct sk_buff *skb);
2129 ++int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
2130 ++ struct flowi *fl,
2131 ++ struct request_sock *req,
2132 ++ u16 queue_mapping,
2133 ++ struct tcp_fastopen_cookie *foc);
2134 ++void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb);
2135 ++struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb);
2136 ++struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb);
2137 ++void tcp_v4_reqsk_destructor(struct request_sock *req);
2138 ++
2139 ++int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req);
2140 ++void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
2141 ++ struct request_sock *req);
2142 ++__u32 tcp_v6_init_sequence(const struct sk_buff *skb);
2143 ++int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
2144 ++ struct flowi *fl, struct request_sock *req,
2145 ++ u16 queue_mapping, struct tcp_fastopen_cookie *foc);
2146 ++void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
2147 ++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
2148 ++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
2149 ++void tcp_v6_destroy_sock(struct sock *sk);
2150 ++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
2151 ++void tcp_v6_hash(struct sock *sk);
2152 ++struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb);
2153 ++struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
2154 ++ struct request_sock *req,
2155 ++ struct dst_entry *dst);
2156 ++void tcp_v6_reqsk_destructor(struct request_sock *req);
2157 ++
2158 ++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
2159 ++ int large_allowed);
2160 ++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb);
2161 ++
2162 ++void skb_clone_fraglist(struct sk_buff *skb);
2163 ++void copy_skb_header(struct sk_buff *new, const struct sk_buff *old);
2164 ++
2165 ++void inet_twsk_free(struct inet_timewait_sock *tw);
2166 ++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb);
2167 ++/* These states need RST on ABORT according to RFC793 */
2168 ++static inline bool tcp_need_reset(int state)
2169 ++{
2170 ++ return (1 << state) &
2171 ++ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2172 ++ TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2173 ++}
2174 ++
2175 ++bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
2176 ++ int hlen);
2177 ++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
2178 ++ bool *fragstolen);
2179 ++bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to,
2180 ++ struct sk_buff *from, bool *fragstolen);
2181 ++/**** END - Exports needed for MPTCP ****/
2182 ++
2183 + void tcp_tasklet_init(void);
2184 +
2185 + void tcp_v4_err(struct sk_buff *skb, u32);
2186 +@@ -440,6 +563,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
2187 + size_t len, int nonblock, int flags, int *addr_len);
2188 + void tcp_parse_options(const struct sk_buff *skb,
2189 + struct tcp_options_received *opt_rx,
2190 ++ struct mptcp_options_received *mopt_rx,
2191 + int estab, struct tcp_fastopen_cookie *foc);
2192 + const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
2193 +
2194 +@@ -493,14 +617,8 @@ static inline u32 tcp_cookie_time(void)
2195 +
2196 + u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
2197 + u16 *mssp);
2198 +-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss);
2199 +-#else
2200 +-static inline __u32 cookie_v4_init_sequence(struct sock *sk,
2201 +- struct sk_buff *skb,
2202 +- __u16 *mss)
2203 +-{
2204 +- return 0;
2205 +-}
2206 ++__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
2207 ++ __u16 *mss);
2208 + #endif
2209 +
2210 + __u32 cookie_init_timestamp(struct request_sock *req);
2211 +@@ -516,13 +634,6 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
2212 + const struct tcphdr *th, u16 *mssp);
2213 + __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb,
2214 + __u16 *mss);
2215 +-#else
2216 +-static inline __u32 cookie_v6_init_sequence(struct sock *sk,
2217 +- struct sk_buff *skb,
2218 +- __u16 *mss)
2219 +-{
2220 +- return 0;
2221 +-}
2222 + #endif
2223 + /* tcp_output.c */
2224 +
2225 +@@ -551,10 +662,17 @@ void tcp_send_delayed_ack(struct sock *sk);
2226 + void tcp_send_loss_probe(struct sock *sk);
2227 + bool tcp_schedule_loss_probe(struct sock *sk);
2228 +
2229 ++u16 tcp_select_window(struct sock *sk);
2230 ++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2231 ++ int push_one, gfp_t gfp);
2232 ++
2233 + /* tcp_input.c */
2234 + void tcp_resume_early_retransmit(struct sock *sk);
2235 + void tcp_rearm_rto(struct sock *sk);
2236 + void tcp_reset(struct sock *sk);
2237 ++void tcp_set_rto(struct sock *sk);
2238 ++bool tcp_should_expand_sndbuf(const struct sock *sk);
2239 ++bool tcp_prune_ofo_queue(struct sock *sk);
2240 +
2241 + /* tcp_timer.c */
2242 + void tcp_init_xmit_timers(struct sock *);
2243 +@@ -703,14 +821,27 @@ void tcp_send_window_probe(struct sock *sk);
2244 + */
2245 + struct tcp_skb_cb {
2246 + union {
2247 +- struct inet_skb_parm h4;
2248 ++ union {
2249 ++ struct inet_skb_parm h4;
2250 + #if IS_ENABLED(CONFIG_IPV6)
2251 +- struct inet6_skb_parm h6;
2252 ++ struct inet6_skb_parm h6;
2253 + #endif
2254 +- } header; /* For incoming frames */
2255 ++ } header; /* For incoming frames */
2256 ++#ifdef CONFIG_MPTCP
2257 ++ union { /* For MPTCP outgoing frames */
2258 ++ __u32 path_mask; /* paths that tried to send this skb */
2259 ++ __u32 dss[6]; /* DSS options */
2260 ++ };
2261 ++#endif
2262 ++ };
2263 + __u32 seq; /* Starting sequence number */
2264 + __u32 end_seq; /* SEQ + FIN + SYN + datalen */
2265 + __u32 when; /* used to compute rtt's */
2266 ++#ifdef CONFIG_MPTCP
2267 ++ __u8 mptcp_flags; /* flags for the MPTCP layer */
2268 ++ __u8 dss_off; /* Number of 4-byte words until
2269 ++ * seq-number */
2270 ++#endif
2271 + __u8 tcp_flags; /* TCP header flags. (tcp[13]) */
2272 +
2273 + __u8 sacked; /* State flags for SACK/FACK. */
2274 +@@ -1075,7 +1206,8 @@ u32 tcp_default_init_rwnd(u32 mss);
2275 + /* Determine a window scaling and initial window to offer. */
2276 + void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
2277 + __u32 *window_clamp, int wscale_ok,
2278 +- __u8 *rcv_wscale, __u32 init_rcv_wnd);
2279 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
2280 ++ const struct sock *sk);
2281 +
2282 + static inline int tcp_win_from_space(int space)
2283 + {
2284 +@@ -1084,15 +1216,34 @@ static inline int tcp_win_from_space(int space)
2285 + space - (space>>sysctl_tcp_adv_win_scale);
2286 + }
2287 +
2288 ++#ifdef CONFIG_MPTCP
2289 ++extern struct static_key mptcp_static_key;
2290 ++static inline bool mptcp(const struct tcp_sock *tp)
2291 ++{
2292 ++ return static_key_false(&mptcp_static_key) && tp->mpc;
2293 ++}
2294 ++#else
2295 ++static inline bool mptcp(const struct tcp_sock *tp)
2296 ++{
2297 ++ return 0;
2298 ++}
2299 ++#endif
2300 ++
2301 + /* Note: caller must be prepared to deal with negative returns */
2302 + static inline int tcp_space(const struct sock *sk)
2303 + {
2304 ++ if (mptcp(tcp_sk(sk)))
2305 ++ sk = tcp_sk(sk)->meta_sk;
2306 ++
2307 + return tcp_win_from_space(sk->sk_rcvbuf -
2308 + atomic_read(&sk->sk_rmem_alloc));
2309 + }
2310 +
2311 + static inline int tcp_full_space(const struct sock *sk)
2312 + {
2313 ++ if (mptcp(tcp_sk(sk)))
2314 ++ sk = tcp_sk(sk)->meta_sk;
2315 ++
2316 + return tcp_win_from_space(sk->sk_rcvbuf);
2317 + }
2318 +
2319 +@@ -1115,6 +1266,8 @@ static inline void tcp_openreq_init(struct request_sock *req,
2320 + ireq->wscale_ok = rx_opt->wscale_ok;
2321 + ireq->acked = 0;
2322 + ireq->ecn_ok = 0;
2323 ++ ireq->mptcp_rqsk = 0;
2324 ++ ireq->saw_mpc = 0;
2325 + ireq->ir_rmt_port = tcp_hdr(skb)->source;
2326 + ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
2327 + }
2328 +@@ -1585,6 +1738,11 @@ int tcp4_proc_init(void);
2329 + void tcp4_proc_exit(void);
2330 + #endif
2331 +
2332 ++int tcp_rtx_synack(struct sock *sk, struct request_sock *req);
2333 ++int tcp_conn_request(struct request_sock_ops *rsk_ops,
2334 ++ const struct tcp_request_sock_ops *af_ops,
2335 ++ struct sock *sk, struct sk_buff *skb);
2336 ++
2337 + /* TCP af-specific functions */
2338 + struct tcp_sock_af_ops {
2339 + #ifdef CONFIG_TCP_MD5SIG
2340 +@@ -1601,7 +1759,32 @@ struct tcp_sock_af_ops {
2341 + #endif
2342 + };
2343 +
2344 ++/* TCP/MPTCP-specific functions */
2345 ++struct tcp_sock_ops {
2346 ++ u32 (*__select_window)(struct sock *sk);
2347 ++ u16 (*select_window)(struct sock *sk);
2348 ++ void (*select_initial_window)(int __space, __u32 mss, __u32 *rcv_wnd,
2349 ++ __u32 *window_clamp, int wscale_ok,
2350 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
2351 ++ const struct sock *sk);
2352 ++ void (*init_buffer_space)(struct sock *sk);
2353 ++ void (*set_rto)(struct sock *sk);
2354 ++ bool (*should_expand_sndbuf)(const struct sock *sk);
2355 ++ void (*send_fin)(struct sock *sk);
2356 ++ bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle,
2357 ++ int push_one, gfp_t gfp);
2358 ++ void (*send_active_reset)(struct sock *sk, gfp_t priority);
2359 ++ int (*write_wakeup)(struct sock *sk);
2360 ++ bool (*prune_ofo_queue)(struct sock *sk);
2361 ++ void (*retransmit_timer)(struct sock *sk);
2362 ++ void (*time_wait)(struct sock *sk, int state, int timeo);
2363 ++ void (*cleanup_rbuf)(struct sock *sk, int copied);
2364 ++ void (*init_congestion_control)(struct sock *sk);
2365 ++};
2366 ++extern const struct tcp_sock_ops tcp_specific;
2367 ++
2368 + struct tcp_request_sock_ops {
2369 ++ u16 mss_clamp;
2370 + #ifdef CONFIG_TCP_MD5SIG
2371 + struct tcp_md5sig_key *(*md5_lookup) (struct sock *sk,
2372 + struct request_sock *req);
2373 +@@ -1611,8 +1794,39 @@ struct tcp_request_sock_ops {
2374 + const struct request_sock *req,
2375 + const struct sk_buff *skb);
2376 + #endif
2377 ++ int (*init_req)(struct request_sock *req, struct sock *sk,
2378 ++ struct sk_buff *skb);
2379 ++#ifdef CONFIG_SYN_COOKIES
2380 ++ __u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb,
2381 ++ __u16 *mss);
2382 ++#endif
2383 ++ struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl,
2384 ++ const struct request_sock *req,
2385 ++ bool *strict);
2386 ++ __u32 (*init_seq)(const struct sk_buff *skb);
2387 ++ int (*send_synack)(struct sock *sk, struct dst_entry *dst,
2388 ++ struct flowi *fl, struct request_sock *req,
2389 ++ u16 queue_mapping, struct tcp_fastopen_cookie *foc);
2390 ++ void (*queue_hash_add)(struct sock *sk, struct request_sock *req,
2391 ++ const unsigned long timeout);
2392 + };
2393 +
2394 ++#ifdef CONFIG_SYN_COOKIES
2395 ++static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
2396 ++ struct sock *sk, struct sk_buff *skb,
2397 ++ __u16 *mss)
2398 ++{
2399 ++ return ops->cookie_init_seq(sk, skb, mss);
2400 ++}
2401 ++#else
2402 ++static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
2403 ++ struct sock *sk, struct sk_buff *skb,
2404 ++ __u16 *mss)
2405 ++{
2406 ++ return 0;
2407 ++}
2408 ++#endif
2409 ++
2410 + int tcpv4_offload_init(void);
2411 +
2412 + void tcp_v4_init(void);
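The largest change to tcp.h is the new struct tcp_sock_ops: call sites that used to invoke tcp_select_window(), tcp_send_fin(), tcp_write_xmit() and friends directly now go through tp->ops, so a plain TCP socket points at the tcp_specific table while an MPTCP meta-socket can install its own. A small stand-alone illustration of that function-pointer dispatch follows; the two ops tables and the connection struct are invented for the example, only the pattern follows the header.

#include <stdio.h>

struct conn;

struct conn_ops {			/* analogue of struct tcp_sock_ops */
	unsigned int (*select_window)(const struct conn *c);
	void (*send_fin)(struct conn *c);
};

struct conn {
	unsigned int rcv_buf;
	const struct conn_ops *ops;	/* analogue of tp->ops */
};

/* Plain variant: analogue of the tcp_specific table. */
static unsigned int plain_select_window(const struct conn *c) { return c->rcv_buf; }
static void plain_send_fin(struct conn *c) { printf("FIN on this connection\n"); }
static const struct conn_ops plain_ops = {
	.select_window = plain_select_window,
	.send_fin = plain_send_fin,
};

/* Multipath variant: advertises a shared window, closes at the meta level. */
static unsigned int mp_select_window(const struct conn *c) { return c->rcv_buf / 2; }
static void mp_send_fin(struct conn *c) { printf("DATA_FIN on the meta connection\n"); }
static const struct conn_ops mp_ops = {
	.select_window = mp_select_window,
	.send_fin = mp_send_fin,
};

static void shutdown_conn(struct conn *c)
{
	/* Call sites stay generic, just as the patched tcp_shutdown() in
	 * net/ipv4/tcp.c calls tcp_sk(sk)->ops->send_fin(sk). */
	printf("window %u\n", c->ops->select_window(c));
	c->ops->send_fin(c);
}

int main(void)
{
	struct conn a = { .rcv_buf = 65535, .ops = &plain_ops };
	struct conn b = { .rcv_buf = 65535, .ops = &mp_ops };

	shutdown_conn(&a);
	shutdown_conn(&b);
	return 0;
}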
2413 +diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
2414 +index 9cf2394f0bcf..c2634b6ed854 100644
2415 +--- a/include/uapi/linux/if.h
2416 ++++ b/include/uapi/linux/if.h
2417 +@@ -109,6 +109,9 @@ enum net_device_flags {
2418 + #define IFF_DORMANT IFF_DORMANT
2419 + #define IFF_ECHO IFF_ECHO
2420 +
2421 ++#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */
2422 ++#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */
2423 ++
2424 + #define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
2425 + IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
2426 +
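IFF_NOMULTIPATH (0x80000) and IFF_MPBACKUP (0x100000) are new net_device flag bits that exclude an interface from MPTCP or mark it as a backup path, and the net/core/dev.c hunk further down whitelists them in __dev_change_flags(). They sit above the 16-bit range the SIOCSIFFLAGS ioctl can carry, so the hedged sketch below simply reads the full 32-bit flag word the kernel already exposes in /sys/class/net/<dev>/flags and tests the bits; the flag values are taken from this patch, and on an unpatched kernel they will never be set.

#include <stdio.h>

#define IFF_NOMULTIPATH	0x80000		/* values from the patched include/uapi/linux/if.h */
#define IFF_MPBACKUP	0x100000

int main(int argc, char **argv)
{
	const char *dev = argc > 1 ? argv[1] : "eth0";
	char path[128];
	unsigned long flags;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/class/net/%s/flags", dev);
	f = fopen(path, "r");
	if (!f || fscanf(f, "%lx", &flags) != 1) {
		perror(path);
		return 1;
	}
	fclose(f);

	printf("%s: multipath %s%s\n", dev,
	       (flags & IFF_NOMULTIPATH) ? "disabled" : "enabled",
	       (flags & IFF_MPBACKUP) ? " (backup path)" : "");
	return 0;
}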
2427 +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
2428 +index 3b9718328d8b..487475681d84 100644
2429 +--- a/include/uapi/linux/tcp.h
2430 ++++ b/include/uapi/linux/tcp.h
2431 +@@ -112,6 +112,7 @@ enum {
2432 + #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
2433 + #define TCP_TIMESTAMP 24
2434 + #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */
2435 ++#define MPTCP_ENABLED 26
2436 +
2437 + struct tcp_repair_opt {
2438 + __u32 opt_code;
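The new socket-option number MPTCP_ENABLED (26) sits next to the existing TCP_* options. Assuming the rest of this patch wires it into do_tcp_setsockopt() (the handler is not visible in this excerpt, so treat the semantics as an assumption), an application could opt a single socket in or out of MPTCP before connecting, independent of any global sysctl. A hedged usage sketch:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>

#ifndef MPTCP_ENABLED
#define MPTCP_ENABLED 26	/* value from the patched include/uapi/linux/tcp.h */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	int on = 1;

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Must happen before connect()/listen(); on an unpatched kernel this
	 * fails with ENOPROTOOPT and the socket stays plain TCP. */
	if (setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &on, sizeof(on)) < 0)
		perror("setsockopt(MPTCP_ENABLED)");

	/* ... connect() and use the socket as usual; if the peer agrees,
	 * MPTCP is negotiated transparently in the SYN exchange ... */
	close(fd);
	return 0;
}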
2439 +diff --git a/net/Kconfig b/net/Kconfig
2440 +index d92afe4204d9..96b58593ad5e 100644
2441 +--- a/net/Kconfig
2442 ++++ b/net/Kconfig
2443 +@@ -79,6 +79,7 @@ if INET
2444 + source "net/ipv4/Kconfig"
2445 + source "net/ipv6/Kconfig"
2446 + source "net/netlabel/Kconfig"
2447 ++source "net/mptcp/Kconfig"
2448 +
2449 + endif # if INET
2450 +
2451 +diff --git a/net/Makefile b/net/Makefile
2452 +index cbbbe6d657ca..244bac1435b1 100644
2453 +--- a/net/Makefile
2454 ++++ b/net/Makefile
2455 +@@ -20,6 +20,7 @@ obj-$(CONFIG_INET) += ipv4/
2456 + obj-$(CONFIG_XFRM) += xfrm/
2457 + obj-$(CONFIG_UNIX) += unix/
2458 + obj-$(CONFIG_NET) += ipv6/
2459 ++obj-$(CONFIG_MPTCP) += mptcp/
2460 + obj-$(CONFIG_PACKET) += packet/
2461 + obj-$(CONFIG_NET_KEY) += key/
2462 + obj-$(CONFIG_BRIDGE) += bridge/
2463 +diff --git a/net/core/dev.c b/net/core/dev.c
2464 +index 367a586d0c8a..215d2757fbf6 100644
2465 +--- a/net/core/dev.c
2466 ++++ b/net/core/dev.c
2467 +@@ -5420,7 +5420,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
2468 +
2469 + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2470 + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2471 +- IFF_AUTOMEDIA)) |
2472 ++ IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) |
2473 + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2474 + IFF_ALLMULTI));
2475 +
2476 +diff --git a/net/core/request_sock.c b/net/core/request_sock.c
2477 +index 467f326126e0..909dfa13f499 100644
2478 +--- a/net/core/request_sock.c
2479 ++++ b/net/core/request_sock.c
2480 +@@ -38,7 +38,8 @@ int sysctl_max_syn_backlog = 256;
2481 + EXPORT_SYMBOL(sysctl_max_syn_backlog);
2482 +
2483 + int reqsk_queue_alloc(struct request_sock_queue *queue,
2484 +- unsigned int nr_table_entries)
2485 ++ unsigned int nr_table_entries,
2486 ++ gfp_t flags)
2487 + {
2488 + size_t lopt_size = sizeof(struct listen_sock);
2489 + struct listen_sock *lopt;
2490 +@@ -48,9 +49,11 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
2491 + nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
2492 + lopt_size += nr_table_entries * sizeof(struct request_sock *);
2493 + if (lopt_size > PAGE_SIZE)
2494 +- lopt = vzalloc(lopt_size);
2495 ++ lopt = __vmalloc(lopt_size,
2496 ++ flags | __GFP_HIGHMEM | __GFP_ZERO,
2497 ++ PAGE_KERNEL);
2498 + else
2499 +- lopt = kzalloc(lopt_size, GFP_KERNEL);
2500 ++ lopt = kzalloc(lopt_size, flags);
2501 + if (lopt == NULL)
2502 + return -ENOMEM;
2503 +
2504 +diff --git a/net/core/skbuff.c b/net/core/skbuff.c
2505 +index c1a33033cbe2..8abc5d60fbe3 100644
2506 +--- a/net/core/skbuff.c
2507 ++++ b/net/core/skbuff.c
2508 +@@ -472,7 +472,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb)
2509 + skb_drop_list(&skb_shinfo(skb)->frag_list);
2510 + }
2511 +
2512 +-static void skb_clone_fraglist(struct sk_buff *skb)
2513 ++void skb_clone_fraglist(struct sk_buff *skb)
2514 + {
2515 + struct sk_buff *list;
2516 +
2517 +@@ -897,7 +897,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
2518 + skb->inner_mac_header += off;
2519 + }
2520 +
2521 +-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
2522 ++void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
2523 + {
2524 + __copy_skb_header(new, old);
2525 +
2526 +diff --git a/net/core/sock.c b/net/core/sock.c
2527 +index 026e01f70274..359295523177 100644
2528 +--- a/net/core/sock.c
2529 ++++ b/net/core/sock.c
2530 +@@ -136,6 +136,11 @@
2531 +
2532 + #include <trace/events/sock.h>
2533 +
2534 ++#ifdef CONFIG_MPTCP
2535 ++#include <net/mptcp.h>
2536 ++#include <net/inet_common.h>
2537 ++#endif
2538 ++
2539 + #ifdef CONFIG_INET
2540 + #include <net/tcp.h>
2541 + #endif
2542 +@@ -280,7 +285,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
2543 + "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
2544 + "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
2545 + };
2546 +-static const char *const af_family_clock_key_strings[AF_MAX+1] = {
2547 ++char *const af_family_clock_key_strings[AF_MAX+1] = {
2548 + "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
2549 + "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
2550 + "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
2551 +@@ -301,7 +306,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
2552 + * sk_callback_lock locking rules are per-address-family,
2553 + * so split the lock classes by using a per-AF key:
2554 + */
2555 +-static struct lock_class_key af_callback_keys[AF_MAX];
2556 ++struct lock_class_key af_callback_keys[AF_MAX];
2557 +
2558 + /* Take into consideration the size of the struct sk_buff overhead in the
2559 + * determination of these values, since that is non-constant across
2560 +@@ -422,8 +427,6 @@ static void sock_warn_obsolete_bsdism(const char *name)
2561 + }
2562 + }
2563 +
2564 +-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
2565 +-
2566 + static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
2567 + {
2568 + if (sk->sk_flags & flags) {
2569 +@@ -1253,8 +1256,25 @@ lenout:
2570 + *
2571 + * (We also register the sk_lock with the lock validator.)
2572 + */
2573 +-static inline void sock_lock_init(struct sock *sk)
2574 +-{
2575 ++void sock_lock_init(struct sock *sk)
2576 ++{
2577 ++#ifdef CONFIG_MPTCP
2578 ++ /* Reclassify the lock-class for subflows */
2579 ++ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
2580 ++ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->is_master_sk) {
2581 ++ sock_lock_init_class_and_name(sk, "slock-AF_INET-MPTCP",
2582 ++ &meta_slock_key,
2583 ++ "sk_lock-AF_INET-MPTCP",
2584 ++ &meta_key);
2585 ++
2586 ++ /* We don't yet have the mptcp-point.
2587 ++ * Thus we still need inet_sock_destruct
2588 ++ */
2589 ++ sk->sk_destruct = inet_sock_destruct;
2590 ++ return;
2591 ++ }
2592 ++#endif
2593 ++
2594 + sock_lock_init_class_and_name(sk,
2595 + af_family_slock_key_strings[sk->sk_family],
2596 + af_family_slock_keys + sk->sk_family,
2597 +@@ -1301,7 +1321,7 @@ void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
2598 + }
2599 + EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
2600 +
2601 +-static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2602 ++struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2603 + int family)
2604 + {
2605 + struct sock *sk;
2606 +diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
2607 +index 4db3c2a1679c..04cb17d4b0ce 100644
2608 +--- a/net/dccp/ipv6.c
2609 ++++ b/net/dccp/ipv6.c
2610 +@@ -386,7 +386,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
2611 + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
2612 + goto drop;
2613 +
2614 +- req = inet6_reqsk_alloc(&dccp6_request_sock_ops);
2615 ++ req = inet_reqsk_alloc(&dccp6_request_sock_ops);
2616 + if (req == NULL)
2617 + goto drop;
2618 +
2619 +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
2620 +index 05c57f0fcabe..630434db0085 100644
2621 +--- a/net/ipv4/Kconfig
2622 ++++ b/net/ipv4/Kconfig
2623 +@@ -556,6 +556,30 @@ config TCP_CONG_ILLINOIS
2624 + For further details see:
2625 + http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
2626 +
2627 ++config TCP_CONG_COUPLED
2628 ++ tristate "MPTCP COUPLED CONGESTION CONTROL"
2629 ++ depends on MPTCP
2630 ++ default n
2631 ++ ---help---
2632 ++ MultiPath TCP Coupled Congestion Control
2633 ++ To enable it, just put 'coupled' in tcp_congestion_control
2634 ++
2635 ++config TCP_CONG_OLIA
2636 ++ tristate "MPTCP Opportunistic Linked Increase"
2637 ++ depends on MPTCP
2638 ++ default n
2639 ++ ---help---
2640 ++ MultiPath TCP Opportunistic Linked Increase Congestion Control
2641 ++ To enable it, just put 'olia' in tcp_congestion_control
2642 ++
2643 ++config TCP_CONG_WVEGAS
2644 ++ tristate "MPTCP WVEGAS CONGESTION CONTROL"
2645 ++ depends on MPTCP
2646 ++ default n
2647 ++ ---help---
2648 ++ wVegas congestion control for MPTCP
2649 ++ To enable it, just put 'wvegas' in tcp_congestion_control
2650 ++
2651 + choice
2652 + prompt "Default TCP congestion control"
2653 + default DEFAULT_CUBIC
2654 +@@ -584,6 +608,15 @@ choice
2655 + config DEFAULT_WESTWOOD
2656 + bool "Westwood" if TCP_CONG_WESTWOOD=y
2657 +
2658 ++ config DEFAULT_COUPLED
2659 ++ bool "Coupled" if TCP_CONG_COUPLED=y
2660 ++
2661 ++ config DEFAULT_OLIA
2662 ++ bool "Olia" if TCP_CONG_OLIA=y
2663 ++
2664 ++ config DEFAULT_WVEGAS
2665 ++ bool "Wvegas" if TCP_CONG_WVEGAS=y
2666 ++
2667 + config DEFAULT_RENO
2668 + bool "Reno"
2669 +
2670 +@@ -605,6 +638,8 @@ config DEFAULT_TCP_CONG
2671 + default "vegas" if DEFAULT_VEGAS
2672 + default "westwood" if DEFAULT_WESTWOOD
2673 + default "veno" if DEFAULT_VENO
2674 ++ default "coupled" if DEFAULT_COUPLED
2675 ++ default "wvegas" if DEFAULT_WVEGAS
2676 + default "reno" if DEFAULT_RENO
2677 + default "cubic"
2678 +
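The Kconfig additions build three MPTCP-aware congestion-control modules (coupled, olia, wvegas) and allow one of them to become the system default. At run time they are selected like any other TCP congestion control, either through net.ipv4.tcp_congestion_control or per socket with the standard TCP_CONGESTION option, as sketched below. This assumes the chosen module is built and loaded; the names "coupled", "olia" and "wvegas" only exist with these config options enabled.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	char name[16] = "olia";		/* or "coupled", "wvegas" */
	char cur[16];
	socklen_t len = sizeof(cur);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name)) < 0)
		perror("setsockopt(TCP_CONGESTION)");	/* ENOENT if the module is absent */
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cur, &len) == 0)
		printf("congestion control now: %.*s\n", (int)len, cur);
	close(fd);
	return 0;
}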
2679 +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
2680 +index d156b3c5f363..4afd6d8d9028 100644
2681 +--- a/net/ipv4/af_inet.c
2682 ++++ b/net/ipv4/af_inet.c
2683 +@@ -104,6 +104,7 @@
2684 + #include <net/ip_fib.h>
2685 + #include <net/inet_connection_sock.h>
2686 + #include <net/tcp.h>
2687 ++#include <net/mptcp.h>
2688 + #include <net/udp.h>
2689 + #include <net/udplite.h>
2690 + #include <net/ping.h>
2691 +@@ -246,8 +247,7 @@ EXPORT_SYMBOL(inet_listen);
2692 + * Create an inet socket.
2693 + */
2694 +
2695 +-static int inet_create(struct net *net, struct socket *sock, int protocol,
2696 +- int kern)
2697 ++int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
2698 + {
2699 + struct sock *sk;
2700 + struct inet_protosw *answer;
2701 +@@ -676,6 +676,23 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
2702 + lock_sock(sk2);
2703 +
2704 + sock_rps_record_flow(sk2);
2705 ++
2706 ++ if (sk2->sk_protocol == IPPROTO_TCP && mptcp(tcp_sk(sk2))) {
2707 ++ struct sock *sk_it = sk2;
2708 ++
2709 ++ mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it)
2710 ++ sock_rps_record_flow(sk_it);
2711 ++
2712 ++ if (tcp_sk(sk2)->mpcb->master_sk) {
2713 ++ sk_it = tcp_sk(sk2)->mpcb->master_sk;
2714 ++
2715 ++ write_lock_bh(&sk_it->sk_callback_lock);
2716 ++ sk_it->sk_wq = newsock->wq;
2717 ++ sk_it->sk_socket = newsock;
2718 ++ write_unlock_bh(&sk_it->sk_callback_lock);
2719 ++ }
2720 ++ }
2721 ++
2722 + WARN_ON(!((1 << sk2->sk_state) &
2723 + (TCPF_ESTABLISHED | TCPF_SYN_RECV |
2724 + TCPF_CLOSE_WAIT | TCPF_CLOSE)));
2725 +@@ -1763,6 +1780,9 @@ static int __init inet_init(void)
2726 +
2727 + ip_init();
2728 +
2729 ++ /* We must initialize MPTCP before TCP. */
2730 ++ mptcp_init();
2731 ++
2732 + tcp_v4_init();
2733 +
2734 + /* Setup TCP slab cache for open requests. */
2735 +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
2736 +index 14d02ea905b6..7d734d8af19b 100644
2737 +--- a/net/ipv4/inet_connection_sock.c
2738 ++++ b/net/ipv4/inet_connection_sock.c
2739 +@@ -23,6 +23,7 @@
2740 + #include <net/route.h>
2741 + #include <net/tcp_states.h>
2742 + #include <net/xfrm.h>
2743 ++#include <net/mptcp.h>
2744 +
2745 + #ifdef INET_CSK_DEBUG
2746 + const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
2747 +@@ -465,8 +466,8 @@ no_route:
2748 + }
2749 + EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
2750 +
2751 +-static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
2752 +- const u32 rnd, const u32 synq_hsize)
2753 ++u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
2754 ++ const u32 synq_hsize)
2755 + {
2756 + return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
2757 + }
2758 +@@ -647,7 +648,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
2759 +
2760 + lopt->clock_hand = i;
2761 +
2762 +- if (lopt->qlen)
2763 ++ if (lopt->qlen && !is_meta_sk(parent))
2764 + inet_csk_reset_keepalive_timer(parent, interval);
2765 + }
2766 + EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
2767 +@@ -664,7 +665,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
2768 + const struct request_sock *req,
2769 + const gfp_t priority)
2770 + {
2771 +- struct sock *newsk = sk_clone_lock(sk, priority);
2772 ++ struct sock *newsk;
2773 ++
2774 ++ newsk = sk_clone_lock(sk, priority);
2775 +
2776 + if (newsk != NULL) {
2777 + struct inet_connection_sock *newicsk = inet_csk(newsk);
2778 +@@ -743,7 +746,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
2779 + {
2780 + struct inet_sock *inet = inet_sk(sk);
2781 + struct inet_connection_sock *icsk = inet_csk(sk);
2782 +- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
2783 ++ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries,
2784 ++ GFP_KERNEL);
2785 +
2786 + if (rc != 0)
2787 + return rc;
2788 +@@ -801,9 +805,14 @@ void inet_csk_listen_stop(struct sock *sk)
2789 +
2790 + while ((req = acc_req) != NULL) {
2791 + struct sock *child = req->sk;
2792 ++ bool mutex_taken = false;
2793 +
2794 + acc_req = req->dl_next;
2795 +
2796 ++ if (is_meta_sk(child)) {
2797 ++ mutex_lock(&tcp_sk(child)->mpcb->mpcb_mutex);
2798 ++ mutex_taken = true;
2799 ++ }
2800 + local_bh_disable();
2801 + bh_lock_sock(child);
2802 + WARN_ON(sock_owned_by_user(child));
2803 +@@ -832,6 +841,8 @@ void inet_csk_listen_stop(struct sock *sk)
2804 +
2805 + bh_unlock_sock(child);
2806 + local_bh_enable();
2807 ++ if (mutex_taken)
2808 ++ mutex_unlock(&tcp_sk(child)->mpcb->mpcb_mutex);
2809 + sock_put(child);
2810 +
2811 + sk_acceptq_removed(sk);
2812 +diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
2813 +index c86624b36a62..0ff3fe004d62 100644
2814 +--- a/net/ipv4/syncookies.c
2815 ++++ b/net/ipv4/syncookies.c
2816 +@@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
2817 + }
2818 + EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
2819 +
2820 +-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
2821 ++__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
2822 ++ __u16 *mssp)
2823 + {
2824 + const struct iphdr *iph = ip_hdr(skb);
2825 + const struct tcphdr *th = tcp_hdr(skb);
2826 +@@ -284,7 +285,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
2827 +
2828 + /* check for timestamp cookie support */
2829 + memset(&tcp_opt, 0, sizeof(tcp_opt));
2830 +- tcp_parse_options(skb, &tcp_opt, 0, NULL);
2831 ++ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
2832 +
2833 + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
2834 + goto out;
2835 +@@ -355,10 +356,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
2836 + /* Try to redo what tcp_v4_send_synack did. */
2837 + req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
2838 +
2839 +- tcp_select_initial_window(tcp_full_space(sk), req->mss,
2840 +- &req->rcv_wnd, &req->window_clamp,
2841 +- ireq->wscale_ok, &rcv_wscale,
2842 +- dst_metric(&rt->dst, RTAX_INITRWND));
2843 ++ tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
2844 ++ &req->rcv_wnd, &req->window_clamp,
2845 ++ ireq->wscale_ok, &rcv_wscale,
2846 ++ dst_metric(&rt->dst, RTAX_INITRWND), sk);
2847 +
2848 + ireq->rcv_wscale = rcv_wscale;
2849 +
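Throughout the patch, tcp_parse_options() gains a third argument: a struct mptcp_options_received pointer that callers such as cookie_v4_check() pass as NULL when they do not need MPTCP options, while MPTCP-aware paths hand in a real structure so option kind 30 (TCPOPT_MPTCP) is collected instead of skipped. The stand-alone loop below models that convention on a raw option block; the two structs are simplified stand-ins, not the kernel's.

#include <stdint.h>
#include <stdio.h>

#define TCPOPT_NOP	1
#define TCPOPT_MSS	2
#define TCPOPT_MPTCP	30	/* option kind added by this patch */

struct opts    { uint16_t mss; };
struct mp_opts { int saw_mpc; };

/* Walk TCP options; record MPTCP options only if the caller supplied mopt,
 * mirroring the tcp_parse_options(skb, &opt_rx, mopt_or_NULL, ...) convention. */
static void parse_options(const uint8_t *ptr, int len,
			  struct opts *opt, struct mp_opts *mopt)
{
	while (len > 0) {
		uint8_t kind = ptr[0], size;

		if (kind == TCPOPT_NOP) {
			ptr++; len--;
			continue;
		}
		if (len < 2 || (size = ptr[1]) < 2 || size > len)
			return;		/* malformed option block, stop parsing */
		if (kind == TCPOPT_MSS && size == 4)
			opt->mss = (ptr[2] << 8) | ptr[3];
		else if (kind == TCPOPT_MPTCP && mopt)
			mopt->saw_mpc = 1;	/* a real parser decodes the subtype here */
		ptr += size;
		len -= size;
	}
}

int main(void)
{
	/* MSS 1460, a NOP, then a minimal 4-byte option of kind 30. */
	const uint8_t block[] = { 2, 4, 0x05, 0xb4, 1, 30, 4, 0, 0 };
	struct opts o = { 0 };
	struct mp_opts m = { 0 };

	parse_options(block, sizeof(block), &o, NULL);	/* MPTCP option ignored */
	printf("mss=%u mpc=%d\n", o.mss, m.saw_mpc);
	parse_options(block, sizeof(block), &o, &m);	/* MPTCP option collected */
	printf("mss=%u mpc=%d\n", o.mss, m.saw_mpc);
	return 0;
}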
2850 +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
2851 +index 9d2118e5fbc7..2cb89f886d45 100644
2852 +--- a/net/ipv4/tcp.c
2853 ++++ b/net/ipv4/tcp.c
2854 +@@ -271,6 +271,7 @@
2855 +
2856 + #include <net/icmp.h>
2857 + #include <net/inet_common.h>
2858 ++#include <net/mptcp.h>
2859 + #include <net/tcp.h>
2860 + #include <net/xfrm.h>
2861 + #include <net/ip.h>
2862 +@@ -371,6 +372,24 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
2863 + return period;
2864 + }
2865 +
2866 ++const struct tcp_sock_ops tcp_specific = {
2867 ++ .__select_window = __tcp_select_window,
2868 ++ .select_window = tcp_select_window,
2869 ++ .select_initial_window = tcp_select_initial_window,
2870 ++ .init_buffer_space = tcp_init_buffer_space,
2871 ++ .set_rto = tcp_set_rto,
2872 ++ .should_expand_sndbuf = tcp_should_expand_sndbuf,
2873 ++ .init_congestion_control = tcp_init_congestion_control,
2874 ++ .send_fin = tcp_send_fin,
2875 ++ .write_xmit = tcp_write_xmit,
2876 ++ .send_active_reset = tcp_send_active_reset,
2877 ++ .write_wakeup = tcp_write_wakeup,
2878 ++ .prune_ofo_queue = tcp_prune_ofo_queue,
2879 ++ .retransmit_timer = tcp_retransmit_timer,
2880 ++ .time_wait = tcp_time_wait,
2881 ++ .cleanup_rbuf = tcp_cleanup_rbuf,
2882 ++};
2883 ++
2884 + /* Address-family independent initialization for a tcp_sock.
2885 + *
2886 + * NOTE: A lot of things set to zero explicitly by call to
2887 +@@ -419,6 +438,8 @@ void tcp_init_sock(struct sock *sk)
2888 + sk->sk_sndbuf = sysctl_tcp_wmem[1];
2889 + sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2890 +
2891 ++ tp->ops = &tcp_specific;
2892 ++
2893 + local_bh_disable();
2894 + sock_update_memcg(sk);
2895 + sk_sockets_allocated_inc(sk);
2896 +@@ -726,6 +747,14 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
2897 + int ret;
2898 +
2899 + sock_rps_record_flow(sk);
2900 ++
2901 ++#ifdef CONFIG_MPTCP
2902 ++ if (mptcp(tcp_sk(sk))) {
2903 ++ struct sock *sk_it;
2904 ++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it)
2905 ++ sock_rps_record_flow(sk_it);
2906 ++ }
2907 ++#endif
2908 + /*
2909 + * We can't seek on a socket input
2910 + */
2911 +@@ -821,8 +850,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
2912 + return NULL;
2913 + }
2914 +
2915 +-static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
2916 +- int large_allowed)
2917 ++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed)
2918 + {
2919 + struct tcp_sock *tp = tcp_sk(sk);
2920 + u32 xmit_size_goal, old_size_goal;
2921 +@@ -872,8 +900,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
2922 + {
2923 + int mss_now;
2924 +
2925 +- mss_now = tcp_current_mss(sk);
2926 +- *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2927 ++ if (mptcp(tcp_sk(sk))) {
2928 ++ mss_now = mptcp_current_mss(sk);
2929 ++ *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2930 ++ } else {
2931 ++ mss_now = tcp_current_mss(sk);
2932 ++ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2933 ++ }
2934 +
2935 + return mss_now;
2936 + }
2937 +@@ -892,11 +925,32 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
2938 + * is fully established.
2939 + */
2940 + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
2941 +- !tcp_passive_fastopen(sk)) {
2942 ++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
2943 ++ tp->mpcb->master_sk : sk)) {
2944 + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
2945 + goto out_err;
2946 + }
2947 +
2948 ++ if (mptcp(tp)) {
2949 ++ struct sock *sk_it = sk;
2950 ++
2951 ++ /* We must check this with socket-lock hold because we iterate
2952 ++ * over the subflows.
2953 ++ */
2954 ++ if (!mptcp_can_sendpage(sk)) {
2955 ++ ssize_t ret;
2956 ++
2957 ++ release_sock(sk);
2958 ++ ret = sock_no_sendpage(sk->sk_socket, page, offset,
2959 ++ size, flags);
2960 ++ lock_sock(sk);
2961 ++ return ret;
2962 ++ }
2963 ++
2964 ++ mptcp_for_each_sk(tp->mpcb, sk_it)
2965 ++ sock_rps_record_flow(sk_it);
2966 ++ }
2967 ++
2968 + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2969 +
2970 + mss_now = tcp_send_mss(sk, &size_goal, flags);
2971 +@@ -1001,8 +1055,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
2972 + {
2973 + ssize_t res;
2974 +
2975 +- if (!(sk->sk_route_caps & NETIF_F_SG) ||
2976 +- !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
2977 ++ /* If MPTCP is enabled, we check it later after establishment */
2978 ++ if (!mptcp(tcp_sk(sk)) && (!(sk->sk_route_caps & NETIF_F_SG) ||
2979 ++ !(sk->sk_route_caps & NETIF_F_ALL_CSUM)))
2980 + return sock_no_sendpage(sk->sk_socket, page, offset, size,
2981 + flags);
2982 +
2983 +@@ -1018,6 +1073,9 @@ static inline int select_size(const struct sock *sk, bool sg)
2984 + const struct tcp_sock *tp = tcp_sk(sk);
2985 + int tmp = tp->mss_cache;
2986 +
2987 ++ if (mptcp(tp))
2988 ++ return mptcp_select_size(sk, sg);
2989 ++
2990 + if (sg) {
2991 + if (sk_can_gso(sk)) {
2992 + /* Small frames wont use a full page:
2993 +@@ -1100,11 +1158,18 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
2994 + * is fully established.
2995 + */
2996 + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
2997 +- !tcp_passive_fastopen(sk)) {
2998 ++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
2999 ++ tp->mpcb->master_sk : sk)) {
3000 + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
3001 + goto do_error;
3002 + }
3003 +
3004 ++ if (mptcp(tp)) {
3005 ++ struct sock *sk_it = sk;
3006 ++ mptcp_for_each_sk(tp->mpcb, sk_it)
3007 ++ sock_rps_record_flow(sk_it);
3008 ++ }
3009 ++
3010 + if (unlikely(tp->repair)) {
3011 + if (tp->repair_queue == TCP_RECV_QUEUE) {
3012 + copied = tcp_send_rcvq(sk, msg, size);
3013 +@@ -1132,7 +1197,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3014 + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
3015 + goto out_err;
3016 +
3017 +- sg = !!(sk->sk_route_caps & NETIF_F_SG);
3018 ++ if (mptcp(tp))
3019 ++ sg = mptcp_can_sg(sk);
3020 ++ else
3021 ++ sg = !!(sk->sk_route_caps & NETIF_F_SG);
3022 +
3023 + while (--iovlen >= 0) {
3024 + size_t seglen = iov->iov_len;
3025 +@@ -1183,8 +1251,15 @@ new_segment:
3026 +
3027 + /*
3028 + * Check whether we can use HW checksum.
3029 ++ *
3030 ++ * If dss-csum is enabled, we do not do hw-csum.
3031 ++ * In case of non-mptcp we check the
3032 ++ * device-capabilities.
3033 ++ * In case of mptcp, hw-csum's will be handled
3034 ++ * later in mptcp_write_xmit.
3035 + */
3036 +- if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
3037 ++ if (((mptcp(tp) && !tp->mpcb->dss_csum) || !mptcp(tp)) &&
3038 ++ (mptcp(tp) || sk->sk_route_caps & NETIF_F_ALL_CSUM))
3039 + skb->ip_summed = CHECKSUM_PARTIAL;
3040 +
3041 + skb_entail(sk, skb);
3042 +@@ -1422,7 +1497,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
3043 +
3044 + /* Optimize, __tcp_select_window() is not cheap. */
3045 + if (2*rcv_window_now <= tp->window_clamp) {
3046 +- __u32 new_window = __tcp_select_window(sk);
3047 ++ __u32 new_window = tp->ops->__select_window(sk);
3048 +
3049 + /* Send ACK now, if this read freed lots of space
3050 + * in our buffer. Certainly, new_window is new window.
3051 +@@ -1587,7 +1662,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
3052 + /* Clean up data we have read: This will do ACK frames. */
3053 + if (copied > 0) {
3054 + tcp_recv_skb(sk, seq, &offset);
3055 +- tcp_cleanup_rbuf(sk, copied);
3056 ++ tp->ops->cleanup_rbuf(sk, copied);
3057 + }
3058 + return copied;
3059 + }
3060 +@@ -1623,6 +1698,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3061 +
3062 + lock_sock(sk);
3063 +
3064 ++#ifdef CONFIG_MPTCP
3065 ++ if (mptcp(tp)) {
3066 ++ struct sock *sk_it;
3067 ++ mptcp_for_each_sk(tp->mpcb, sk_it)
3068 ++ sock_rps_record_flow(sk_it);
3069 ++ }
3070 ++#endif
3071 ++
3072 + err = -ENOTCONN;
3073 + if (sk->sk_state == TCP_LISTEN)
3074 + goto out;
3075 +@@ -1761,7 +1844,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3076 + }
3077 + }
3078 +
3079 +- tcp_cleanup_rbuf(sk, copied);
3080 ++ tp->ops->cleanup_rbuf(sk, copied);
3081 +
3082 + if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
3083 + /* Install new reader */
3084 +@@ -1813,7 +1896,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3085 + if (tp->rcv_wnd == 0 &&
3086 + !skb_queue_empty(&sk->sk_async_wait_queue)) {
3087 + tcp_service_net_dma(sk, true);
3088 +- tcp_cleanup_rbuf(sk, copied);
3089 ++ tp->ops->cleanup_rbuf(sk, copied);
3090 + } else
3091 + dma_async_issue_pending(tp->ucopy.dma_chan);
3092 + }
3093 +@@ -1993,7 +2076,7 @@ skip_copy:
3094 + */
3095 +
3096 + /* Clean up data we have read: This will do ACK frames. */
3097 +- tcp_cleanup_rbuf(sk, copied);
3098 ++ tp->ops->cleanup_rbuf(sk, copied);
3099 +
3100 + release_sock(sk);
3101 + return copied;
3102 +@@ -2070,7 +2153,7 @@ static const unsigned char new_state[16] = {
3103 + /* TCP_CLOSING */ TCP_CLOSING,
3104 + };
3105 +
3106 +-static int tcp_close_state(struct sock *sk)
3107 ++int tcp_close_state(struct sock *sk)
3108 + {
3109 + int next = (int)new_state[sk->sk_state];
3110 + int ns = next & TCP_STATE_MASK;
3111 +@@ -2100,7 +2183,7 @@ void tcp_shutdown(struct sock *sk, int how)
3112 + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
3113 + /* Clear out any half completed packets. FIN if needed. */
3114 + if (tcp_close_state(sk))
3115 +- tcp_send_fin(sk);
3116 ++ tcp_sk(sk)->ops->send_fin(sk);
3117 + }
3118 + }
3119 + EXPORT_SYMBOL(tcp_shutdown);
3120 +@@ -2125,6 +2208,11 @@ void tcp_close(struct sock *sk, long timeout)
3121 + int data_was_unread = 0;
3122 + int state;
3123 +
3124 ++ if (is_meta_sk(sk)) {
3125 ++ mptcp_close(sk, timeout);
3126 ++ return;
3127 ++ }
3128 ++
3129 + lock_sock(sk);
3130 + sk->sk_shutdown = SHUTDOWN_MASK;
3131 +
3132 +@@ -2167,7 +2255,7 @@ void tcp_close(struct sock *sk, long timeout)
3133 + /* Unread data was tossed, zap the connection. */
3134 + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
3135 + tcp_set_state(sk, TCP_CLOSE);
3136 +- tcp_send_active_reset(sk, sk->sk_allocation);
3137 ++ tcp_sk(sk)->ops->send_active_reset(sk, sk->sk_allocation);
3138 + } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
3139 + /* Check zero linger _after_ checking for unread data. */
3140 + sk->sk_prot->disconnect(sk, 0);
3141 +@@ -2247,7 +2335,7 @@ adjudge_to_death:
3142 + struct tcp_sock *tp = tcp_sk(sk);
3143 + if (tp->linger2 < 0) {
3144 + tcp_set_state(sk, TCP_CLOSE);
3145 +- tcp_send_active_reset(sk, GFP_ATOMIC);
3146 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
3147 + NET_INC_STATS_BH(sock_net(sk),
3148 + LINUX_MIB_TCPABORTONLINGER);
3149 + } else {
3150 +@@ -2257,7 +2345,8 @@ adjudge_to_death:
3151 + inet_csk_reset_keepalive_timer(sk,
3152 + tmo - TCP_TIMEWAIT_LEN);
3153 + } else {
3154 +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
3155 ++ tcp_sk(sk)->ops->time_wait(sk, TCP_FIN_WAIT2,
3156 ++ tmo);
3157 + goto out;
3158 + }
3159 + }
3160 +@@ -2266,7 +2355,7 @@ adjudge_to_death:
3161 + sk_mem_reclaim(sk);
3162 + if (tcp_check_oom(sk, 0)) {
3163 + tcp_set_state(sk, TCP_CLOSE);
3164 +- tcp_send_active_reset(sk, GFP_ATOMIC);
3165 ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
3166 + NET_INC_STATS_BH(sock_net(sk),
3167 + LINUX_MIB_TCPABORTONMEMORY);
3168 + }
3169 +@@ -2291,15 +2380,6 @@ out:
3170 + }
3171 + EXPORT_SYMBOL(tcp_close);
3172 +
3173 +-/* These states need RST on ABORT according to RFC793 */
3174 +-
3175 +-static inline bool tcp_need_reset(int state)
3176 +-{
3177 +- return (1 << state) &
3178 +- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
3179 +- TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
3180 +-}
3181 +-
3182 + int tcp_disconnect(struct sock *sk, int flags)
3183 + {
3184 + struct inet_sock *inet = inet_sk(sk);
3185 +@@ -2322,7 +2402,7 @@ int tcp_disconnect(struct sock *sk, int flags)
3186 + /* The last check adjusts for discrepancy of Linux wrt. RFC
3187 + * states
3188 + */
3189 +- tcp_send_active_reset(sk, gfp_any());
3190 ++ tp->ops->send_active_reset(sk, gfp_any());
3191 + sk->sk_err = ECONNRESET;
3192 + } else if (old_state == TCP_SYN_SENT)
3193 + sk->sk_err = ECONNRESET;
3194 +@@ -2340,6 +2420,13 @@ int tcp_disconnect(struct sock *sk, int flags)
3195 + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
3196 + inet_reset_saddr(sk);
3197 +
3198 ++ if (is_meta_sk(sk)) {
3199 ++ mptcp_disconnect(sk);
3200 ++ } else {
3201 ++ if (tp->inside_tk_table)
3202 ++ mptcp_hash_remove_bh(tp);
3203 ++ }
3204 ++
3205 + sk->sk_shutdown = 0;
3206 + sock_reset_flag(sk, SOCK_DONE);
3207 + tp->srtt_us = 0;
3208 +@@ -2632,6 +2719,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3209 + break;
3210 +
3211 + case TCP_DEFER_ACCEPT:
3212 ++ /* An established MPTCP-connection (mptcp(tp) only returns true
3213 ++ * if the socket is established) should not use DEFER on new
3214 ++ * subflows.
3215 ++ */
3216 ++ if (mptcp(tp))
3217 ++ break;
3218 + /* Translate value in seconds to number of retransmits */
3219 + icsk->icsk_accept_queue.rskq_defer_accept =
3220 + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3221 +@@ -2659,7 +2752,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3222 + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3223 + inet_csk_ack_scheduled(sk)) {
3224 + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
3225 +- tcp_cleanup_rbuf(sk, 1);
3226 ++ tp->ops->cleanup_rbuf(sk, 1);
3227 + if (!(val & 1))
3228 + icsk->icsk_ack.pingpong = 1;
3229 + }
3230 +@@ -2699,6 +2792,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3231 + tp->notsent_lowat = val;
3232 + sk->sk_write_space(sk);
3233 + break;
3234 ++#ifdef CONFIG_MPTCP
3235 ++ case MPTCP_ENABLED:
3236 ++ if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_LISTEN) {
3237 ++ if (val)
3238 ++ tp->mptcp_enabled = 1;
3239 ++ else
3240 ++ tp->mptcp_enabled = 0;
3241 ++ } else {
3242 ++ err = -EPERM;
3243 ++ }
3244 ++ break;
3245 ++#endif
3246 + default:
3247 + err = -ENOPROTOOPT;
3248 + break;
3249 +@@ -2931,6 +3036,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
3250 + case TCP_NOTSENT_LOWAT:
3251 + val = tp->notsent_lowat;
3252 + break;
3253 ++#ifdef CONFIG_MPTCP
3254 ++ case MPTCP_ENABLED:
3255 ++ val = tp->mptcp_enabled;
3256 ++ break;
3257 ++#endif
3258 + default:
3259 + return -ENOPROTOOPT;
3260 + }
3261 +@@ -3120,8 +3230,11 @@ void tcp_done(struct sock *sk)
3262 + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3263 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3264 +
3265 ++ WARN_ON(sk->sk_state == TCP_CLOSE);
3266 + tcp_set_state(sk, TCP_CLOSE);
3267 ++
3268 + tcp_clear_xmit_timers(sk);
3269 ++
3270 + if (req != NULL)
3271 + reqsk_fastopen_remove(sk, req, false);
3272 +
3273 +diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
3274 +index 9771563ab564..5c230d96c4c1 100644
3275 +--- a/net/ipv4/tcp_fastopen.c
3276 ++++ b/net/ipv4/tcp_fastopen.c
3277 +@@ -7,6 +7,7 @@
3278 + #include <linux/rculist.h>
3279 + #include <net/inetpeer.h>
3280 + #include <net/tcp.h>
3281 ++#include <net/mptcp.h>
3282 +
3283 + int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
3284 +
3285 +@@ -133,7 +134,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
3286 + {
3287 + struct tcp_sock *tp;
3288 + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
3289 +- struct sock *child;
3290 ++ struct sock *child, *meta_sk;
3291 +
3292 + req->num_retrans = 0;
3293 + req->num_timeout = 0;
3294 +@@ -176,13 +177,6 @@ static bool tcp_fastopen_create_child(struct sock *sk,
3295 + /* Add the child socket directly into the accept queue */
3296 + inet_csk_reqsk_queue_add(sk, req, child);
3297 +
3298 +- /* Now finish processing the fastopen child socket. */
3299 +- inet_csk(child)->icsk_af_ops->rebuild_header(child);
3300 +- tcp_init_congestion_control(child);
3301 +- tcp_mtup_init(child);
3302 +- tcp_init_metrics(child);
3303 +- tcp_init_buffer_space(child);
3304 +-
3305 + /* Queue the data carried in the SYN packet. We need to first
3306 + * bump skb's refcnt because the caller will attempt to free it.
3307 + *
3308 +@@ -199,8 +193,24 @@ static bool tcp_fastopen_create_child(struct sock *sk,
3309 + tp->syn_data_acked = 1;
3310 + }
3311 + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3312 ++
3313 ++ meta_sk = child;
3314 ++ if (!mptcp_check_req_fastopen(meta_sk, req)) {
3315 ++ child = tcp_sk(meta_sk)->mpcb->master_sk;
3316 ++ tp = tcp_sk(child);
3317 ++ }
3318 ++
3319 ++ /* Now finish processing the fastopen child socket. */
3320 ++ inet_csk(child)->icsk_af_ops->rebuild_header(child);
3321 ++ tp->ops->init_congestion_control(child);
3322 ++ tcp_mtup_init(child);
3323 ++ tcp_init_metrics(child);
3324 ++ tp->ops->init_buffer_space(child);
3325 ++
3326 + sk->sk_data_ready(sk);
3327 +- bh_unlock_sock(child);
3328 ++ if (mptcp(tcp_sk(child)))
3329 ++ bh_unlock_sock(child);
3330 ++ bh_unlock_sock(meta_sk);
3331 + sock_put(child);
3332 + WARN_ON(req->sk == NULL);
3333 + return true;
3334 +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
3335 +index 40639c288dc2..3273bb69f387 100644
3336 +--- a/net/ipv4/tcp_input.c
3337 ++++ b/net/ipv4/tcp_input.c
3338 +@@ -74,6 +74,9 @@
3339 + #include <linux/ipsec.h>
3340 + #include <asm/unaligned.h>
3341 + #include <net/netdma.h>
3342 ++#include <net/mptcp.h>
3343 ++#include <net/mptcp_v4.h>
3344 ++#include <net/mptcp_v6.h>
3345 +
3346 + int sysctl_tcp_timestamps __read_mostly = 1;
3347 + int sysctl_tcp_window_scaling __read_mostly = 1;
3348 +@@ -99,25 +102,6 @@ int sysctl_tcp_thin_dupack __read_mostly;
3349 + int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
3350 + int sysctl_tcp_early_retrans __read_mostly = 3;
3351 +
3352 +-#define FLAG_DATA 0x01 /* Incoming frame contained data. */
3353 +-#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
3354 +-#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
3355 +-#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
3356 +-#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
3357 +-#define FLAG_DATA_SACKED 0x20 /* New SACK. */
3358 +-#define FLAG_ECE 0x40 /* ECE in this ACK */
3359 +-#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
3360 +-#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
3361 +-#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
3362 +-#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
3363 +-#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
3364 +-#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
3365 +-
3366 +-#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
3367 +-#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
3368 +-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
3369 +-#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
3370 +-
3371 + #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
3372 + #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
3373 +
3374 +@@ -181,7 +165,7 @@ static void tcp_incr_quickack(struct sock *sk)
3375 + icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
3376 + }
3377 +
3378 +-static void tcp_enter_quickack_mode(struct sock *sk)
3379 ++void tcp_enter_quickack_mode(struct sock *sk)
3380 + {
3381 + struct inet_connection_sock *icsk = inet_csk(sk);
3382 + tcp_incr_quickack(sk);
3383 +@@ -283,8 +267,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
3384 + per_mss = roundup_pow_of_two(per_mss) +
3385 + SKB_DATA_ALIGN(sizeof(struct sk_buff));
3386 +
3387 +- nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
3388 +- nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
3389 ++ if (mptcp(tp)) {
3390 ++ nr_segs = mptcp_check_snd_buf(tp);
3391 ++ } else {
3392 ++ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
3393 ++ nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
3394 ++ }
3395 +
3396 + /* Fast Recovery (RFC 5681 3.2) :
3397 + * Cubic needs 1.7 factor, rounded to 2 to include
3398 +@@ -292,8 +280,16 @@ static void tcp_sndbuf_expand(struct sock *sk)
3399 + */
3400 + sndmem = 2 * nr_segs * per_mss;
3401 +
3402 +- if (sk->sk_sndbuf < sndmem)
3403 ++ /* MPTCP: after this sndmem is the new contribution of the
3404 ++ * current subflow to the aggregated sndbuf */
3405 ++ if (sk->sk_sndbuf < sndmem) {
3406 ++ int old_sndbuf = sk->sk_sndbuf;
3407 + sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
3408 ++ /* MPTCP: ok, the subflow sndbuf has grown, reflect
3409 ++ * this in the aggregate buffer.*/
3410 ++ if (mptcp(tp) && old_sndbuf != sk->sk_sndbuf)
3411 ++ mptcp_update_sndbuf(tp);
3412 ++ }
3413 + }
3414 +
3415 + /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
3416 +@@ -342,10 +338,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
3417 + static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
3418 + {
3419 + struct tcp_sock *tp = tcp_sk(sk);
3420 ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
3421 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
3422 +
3423 + /* Check #1 */
3424 +- if (tp->rcv_ssthresh < tp->window_clamp &&
3425 +- (int)tp->rcv_ssthresh < tcp_space(sk) &&
3426 ++ if (meta_tp->rcv_ssthresh < meta_tp->window_clamp &&
3427 ++ (int)meta_tp->rcv_ssthresh < tcp_space(sk) &&
3428 + !sk_under_memory_pressure(sk)) {
3429 + int incr;
3430 +
3431 +@@ -353,14 +351,14 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
3432 + * will fit to rcvbuf in future.
3433 + */
3434 + if (tcp_win_from_space(skb->truesize) <= skb->len)
3435 +- incr = 2 * tp->advmss;
3436 ++ incr = 2 * meta_tp->advmss;
3437 + else
3438 +- incr = __tcp_grow_window(sk, skb);
3439 ++ incr = __tcp_grow_window(meta_sk, skb);
3440 +
3441 + if (incr) {
3442 + incr = max_t(int, incr, 2 * skb->len);
3443 +- tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
3444 +- tp->window_clamp);
3445 ++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr,
3446 ++ meta_tp->window_clamp);
3447 + inet_csk(sk)->icsk_ack.quick |= 1;
3448 + }
3449 + }
3450 +@@ -543,7 +541,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
3451 + int copied;
3452 +
3453 + time = tcp_time_stamp - tp->rcvq_space.time;
3454 +- if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
3455 ++ if (mptcp(tp)) {
3456 ++ if (mptcp_check_rtt(tp, time))
3457 ++ return;
3458 ++ } else if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
3459 + return;
3460 +
3461 + /* Number of bytes copied to user in last RTT */
3462 +@@ -761,7 +762,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
3463 + /* Calculate rto without backoff. This is the second half of Van Jacobson's
3464 + * routine referred to above.
3465 + */
3466 +-static void tcp_set_rto(struct sock *sk)
3467 ++void tcp_set_rto(struct sock *sk)
3468 + {
3469 + const struct tcp_sock *tp = tcp_sk(sk);
3470 + /* Old crap is replaced with new one. 8)
3471 +@@ -1376,7 +1377,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
3472 + int len;
3473 + int in_sack;
3474 +
3475 +- if (!sk_can_gso(sk))
3476 ++ /* For MPTCP we cannot shift skb-data and remove one skb from the
3477 ++ * send-queue, because this will make us lose the DSS-option (which
3478 ++ * is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing.
3479 ++ */
3480 ++ if (!sk_can_gso(sk) || mptcp(tp))
3481 + goto fallback;
3482 +
3483 + /* Normally R but no L won't result in plain S */
3484 +@@ -2915,7 +2920,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
3485 + return false;
3486 +
3487 + tcp_rtt_estimator(sk, seq_rtt_us);
3488 +- tcp_set_rto(sk);
3489 ++ tp->ops->set_rto(sk);
3490 +
3491 + /* RFC6298: only reset backoff on valid RTT measurement. */
3492 + inet_csk(sk)->icsk_backoff = 0;
3493 +@@ -3000,7 +3005,7 @@ void tcp_resume_early_retransmit(struct sock *sk)
3494 + }
3495 +
3496 + /* If we get here, the whole TSO packet has not been acked. */
3497 +-static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3498 ++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3499 + {
3500 + struct tcp_sock *tp = tcp_sk(sk);
3501 + u32 packets_acked;
3502 +@@ -3095,6 +3100,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3503 + */
3504 + if (!(scb->tcp_flags & TCPHDR_SYN)) {
3505 + flag |= FLAG_DATA_ACKED;
3506 ++ if (mptcp(tp) && mptcp_is_data_seq(skb))
3507 ++ flag |= MPTCP_FLAG_DATA_ACKED;
3508 + } else {
3509 + flag |= FLAG_SYN_ACKED;
3510 + tp->retrans_stamp = 0;
3511 +@@ -3189,7 +3196,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3512 + return flag;
3513 + }
3514 +
3515 +-static void tcp_ack_probe(struct sock *sk)
3516 ++void tcp_ack_probe(struct sock *sk)
3517 + {
3518 + const struct tcp_sock *tp = tcp_sk(sk);
3519 + struct inet_connection_sock *icsk = inet_csk(sk);
3520 +@@ -3236,9 +3243,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3521 + /* Check that window update is acceptable.
3522 + * The function assumes that snd_una<=ack<=snd_next.
3523 + */
3524 +-static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3525 +- const u32 ack, const u32 ack_seq,
3526 +- const u32 nwin)
3527 ++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
3528 ++ const u32 ack_seq, const u32 nwin)
3529 + {
3530 + return after(ack, tp->snd_una) ||
3531 + after(ack_seq, tp->snd_wl1) ||
3532 +@@ -3357,7 +3363,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3533 + }
3534 +
3535 + /* This routine deals with incoming acks, but not outgoing ones. */
3536 +-static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3537 ++static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3538 + {
3539 + struct inet_connection_sock *icsk = inet_csk(sk);
3540 + struct tcp_sock *tp = tcp_sk(sk);
3541 +@@ -3449,6 +3455,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3542 + sack_rtt_us);
3543 + acked -= tp->packets_out;
3544 +
3545 ++ if (mptcp(tp)) {
3546 ++ if (mptcp_fallback_infinite(sk, flag)) {
3547 ++ pr_err("%s resetting flow\n", __func__);
3548 ++ mptcp_send_reset(sk);
3549 ++ goto invalid_ack;
3550 ++ }
3551 ++
3552 ++ mptcp_clean_rtx_infinite(skb, sk);
3553 ++ }
3554 ++
3555 + /* Advance cwnd if state allows */
3556 + if (tcp_may_raise_cwnd(sk, flag))
3557 + tcp_cong_avoid(sk, ack, acked);
3558 +@@ -3512,8 +3528,9 @@ old_ack:
3559 + * the fast version below fails.
3560 + */
3561 + void tcp_parse_options(const struct sk_buff *skb,
3562 +- struct tcp_options_received *opt_rx, int estab,
3563 +- struct tcp_fastopen_cookie *foc)
3564 ++ struct tcp_options_received *opt_rx,
3565 ++ struct mptcp_options_received *mopt,
3566 ++ int estab, struct tcp_fastopen_cookie *foc)
3567 + {
3568 + const unsigned char *ptr;
3569 + const struct tcphdr *th = tcp_hdr(skb);
3570 +@@ -3596,6 +3613,9 @@ void tcp_parse_options(const struct sk_buff *skb,
3571 + */
3572 + break;
3573 + #endif
3574 ++ case TCPOPT_MPTCP:
3575 ++ mptcp_parse_options(ptr - 2, opsize, mopt, skb);
3576 ++ break;
3577 + case TCPOPT_EXP:
3578 + /* Fast Open option shares code 254 using a
3579 + * 16 bits magic number. It's valid only in
3580 +@@ -3657,8 +3677,8 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
3581 + if (tcp_parse_aligned_timestamp(tp, th))
3582 + return true;
3583 + }
3584 +-
3585 +- tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3586 ++ tcp_parse_options(skb, &tp->rx_opt, mptcp(tp) ? &tp->mptcp->rx_opt : NULL,
3587 ++ 1, NULL);
3588 + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3589 + tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3590 +
3591 +@@ -3831,6 +3851,8 @@ static void tcp_fin(struct sock *sk)
3592 + dst = __sk_dst_get(sk);
3593 + if (!dst || !dst_metric(dst, RTAX_QUICKACK))
3594 + inet_csk(sk)->icsk_ack.pingpong = 1;
3595 ++ if (mptcp(tp))
3596 ++ mptcp_sub_close_passive(sk);
3597 + break;
3598 +
3599 + case TCP_CLOSE_WAIT:
3600 +@@ -3852,9 +3874,16 @@ static void tcp_fin(struct sock *sk)
3601 + tcp_set_state(sk, TCP_CLOSING);
3602 + break;
3603 + case TCP_FIN_WAIT2:
3604 ++ if (mptcp(tp)) {
3605 ++ /* The socket will get closed by mptcp_data_ready.
3606 ++ * We first have to process all data-sequences.
3607 ++ */
3608 ++ tp->close_it = 1;
3609 ++ break;
3610 ++ }
3611 + /* Received a FIN -- send ACK and enter TIME_WAIT. */
3612 + tcp_send_ack(sk);
3613 +- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
3614 ++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
3615 + break;
3616 + default:
3617 + /* Only TCP_LISTEN and TCP_CLOSE are left, in these
3618 +@@ -3876,6 +3905,10 @@ static void tcp_fin(struct sock *sk)
3619 + if (!sock_flag(sk, SOCK_DEAD)) {
3620 + sk->sk_state_change(sk);
3621 +
3622 ++ /* Don't wake up MPTCP-subflows */
3623 ++ if (mptcp(tp))
3624 ++ return;
3625 ++
3626 + /* Do not send POLL_HUP for half duplex close. */
3627 + if (sk->sk_shutdown == SHUTDOWN_MASK ||
3628 + sk->sk_state == TCP_CLOSE)
3629 +@@ -4073,7 +4106,11 @@ static void tcp_ofo_queue(struct sock *sk)
3630 + tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
3631 + }
3632 +
3633 +- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
3634 ++ /* In case of MPTCP, the segment may be empty if it's a
3635 ++ * non-data DATA_FIN. (see beginning of tcp_data_queue)
3636 ++ */
3637 ++ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) &&
3638 ++ !(mptcp(tp) && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)) {
3639 + SOCK_DEBUG(sk, "ofo packet was already received\n");
3640 + __skb_unlink(skb, &tp->out_of_order_queue);
3641 + __kfree_skb(skb);
3642 +@@ -4091,12 +4128,14 @@ static void tcp_ofo_queue(struct sock *sk)
3643 + }
3644 + }
3645 +
3646 +-static bool tcp_prune_ofo_queue(struct sock *sk);
3647 + static int tcp_prune_queue(struct sock *sk);
3648 +
3649 + static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3650 + unsigned int size)
3651 + {
3652 ++ if (mptcp(tcp_sk(sk)))
3653 ++ sk = mptcp_meta_sk(sk);
3654 ++
3655 + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
3656 + !sk_rmem_schedule(sk, skb, size)) {
3657 +
3658 +@@ -4104,7 +4143,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3659 + return -1;
3660 +
3661 + if (!sk_rmem_schedule(sk, skb, size)) {
3662 +- if (!tcp_prune_ofo_queue(sk))
3663 ++ if (!tcp_sk(sk)->ops->prune_ofo_queue(sk))
3664 + return -1;
3665 +
3666 + if (!sk_rmem_schedule(sk, skb, size))
3667 +@@ -4127,15 +4166,16 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3668 + * Better try to coalesce them right now to avoid future collapses.
3669 + * Returns true if caller should free @from instead of queueing it
3670 + */
3671 +-static bool tcp_try_coalesce(struct sock *sk,
3672 +- struct sk_buff *to,
3673 +- struct sk_buff *from,
3674 +- bool *fragstolen)
3675 ++bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from,
3676 ++ bool *fragstolen)
3677 + {
3678 + int delta;
3679 +
3680 + *fragstolen = false;
3681 +
3682 ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
3683 ++ return false;
3684 ++
3685 + if (tcp_hdr(from)->fin)
3686 + return false;
3687 +
3688 +@@ -4225,7 +4265,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
3689 +
3690 + /* Do skb overlap to previous one? */
3691 + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
3692 +- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
3693 ++ /* MPTCP allows non-data data-fin to be in the ofo-queue */
3694 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) &&
3695 ++ !(mptcp(tp) && end_seq == seq)) {
3696 + /* All the bits are present. Drop. */
3697 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
3698 + __kfree_skb(skb);
3699 +@@ -4263,6 +4305,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
3700 + end_seq);
3701 + break;
3702 + }
3703 ++ /* MPTCP allows non-data data-fin to be in the ofo-queue */
3704 ++ if (mptcp(tp) && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq)
3705 ++ continue;
3706 + __skb_unlink(skb1, &tp->out_of_order_queue);
3707 + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
3708 + TCP_SKB_CB(skb1)->end_seq);
3709 +@@ -4280,8 +4325,8 @@ end:
3710 + }
3711 + }
3712 +
3713 +-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
3714 +- bool *fragstolen)
3715 ++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
3716 ++ bool *fragstolen)
3717 + {
3718 + int eaten;
3719 + struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
3720 +@@ -4343,7 +4388,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
3721 + int eaten = -1;
3722 + bool fragstolen = false;
3723 +
3724 +- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
3725 ++ /* If no data is present, but a data_fin is in the options, we still
3726 ++ * have to call mptcp_queue_skb later on. */
3727 ++ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
3728 ++ !(mptcp(tp) && mptcp_is_data_fin(skb)))
3729 + goto drop;
3730 +
3731 + skb_dst_drop(skb);
3732 +@@ -4389,7 +4437,7 @@ queue_and_out:
3733 + eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
3734 + }
3735 + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3736 +- if (skb->len)
3737 ++ if (skb->len || mptcp_is_data_fin(skb))
3738 + tcp_event_data_recv(sk, skb);
3739 + if (th->fin)
3740 + tcp_fin(sk);
3741 +@@ -4411,7 +4459,11 @@ queue_and_out:
3742 +
3743 + if (eaten > 0)
3744 + kfree_skb_partial(skb, fragstolen);
3745 +- if (!sock_flag(sk, SOCK_DEAD))
3746 ++ if (!sock_flag(sk, SOCK_DEAD) || mptcp(tp))
3747 ++ /* MPTCP: we always have to call data_ready, because
3748 ++ * we may be about to receive a data-fin, which still
3749 ++ * must get queued.
3750 ++ */
3751 + sk->sk_data_ready(sk);
3752 + return;
3753 + }
3754 +@@ -4463,6 +4515,8 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
3755 + next = skb_queue_next(list, skb);
3756 +
3757 + __skb_unlink(skb, list);
3758 ++ if (mptcp(tcp_sk(sk)))
3759 ++ mptcp_remove_shortcuts(tcp_sk(sk)->mpcb, skb);
3760 + __kfree_skb(skb);
3761 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
3762 +
3763 +@@ -4630,7 +4684,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
3764 + * Purge the out-of-order queue.
3765 + * Return true if queue was pruned.
3766 + */
3767 +-static bool tcp_prune_ofo_queue(struct sock *sk)
3768 ++bool tcp_prune_ofo_queue(struct sock *sk)
3769 + {
3770 + struct tcp_sock *tp = tcp_sk(sk);
3771 + bool res = false;
3772 +@@ -4686,7 +4740,7 @@ static int tcp_prune_queue(struct sock *sk)
3773 + /* Collapsing did not help, destructive actions follow.
3774 + * This must not ever occur. */
3775 +
3776 +- tcp_prune_ofo_queue(sk);
3777 ++ tp->ops->prune_ofo_queue(sk);
3778 +
3779 + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
3780 + return 0;
3781 +@@ -4702,7 +4756,29 @@ static int tcp_prune_queue(struct sock *sk)
3782 + return -1;
3783 + }
3784 +
3785 +-static bool tcp_should_expand_sndbuf(const struct sock *sk)
3786 ++/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
3787 ++ * As additional protections, we do not touch cwnd in retransmission phases,
3788 ++ * and if application hit its sndbuf limit recently.
3789 ++ */
3790 ++void tcp_cwnd_application_limited(struct sock *sk)
3791 ++{
3792 ++ struct tcp_sock *tp = tcp_sk(sk);
3793 ++
3794 ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
3795 ++ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
3796 ++ /* Limited by application or receiver window. */
3797 ++ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
3798 ++ u32 win_used = max(tp->snd_cwnd_used, init_win);
3799 ++ if (win_used < tp->snd_cwnd) {
3800 ++ tp->snd_ssthresh = tcp_current_ssthresh(sk);
3801 ++ tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
3802 ++ }
3803 ++ tp->snd_cwnd_used = 0;
3804 ++ }
3805 ++ tp->snd_cwnd_stamp = tcp_time_stamp;
3806 ++}
3807 ++
3808 ++bool tcp_should_expand_sndbuf(const struct sock *sk)
3809 + {
3810 + const struct tcp_sock *tp = tcp_sk(sk);
3811 +
3812 +@@ -4737,7 +4813,7 @@ static void tcp_new_space(struct sock *sk)
3813 + {
3814 + struct tcp_sock *tp = tcp_sk(sk);
3815 +
3816 +- if (tcp_should_expand_sndbuf(sk)) {
3817 ++ if (tp->ops->should_expand_sndbuf(sk)) {
3818 + tcp_sndbuf_expand(sk);
3819 + tp->snd_cwnd_stamp = tcp_time_stamp;
3820 + }
3821 +@@ -4749,8 +4825,9 @@ static void tcp_check_space(struct sock *sk)
3822 + {
3823 + if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
3824 + sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
3825 +- if (sk->sk_socket &&
3826 +- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
3827 ++ if (mptcp(tcp_sk(sk)) ||
3828 ++ (sk->sk_socket &&
3829 ++ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)))
3830 + tcp_new_space(sk);
3831 + }
3832 + }
3833 +@@ -4773,7 +4850,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
3834 + /* ... and right edge of window advances far enough.
3835 + * (tcp_recvmsg() will send ACK otherwise). Or...
3836 + */
3837 +- __tcp_select_window(sk) >= tp->rcv_wnd) ||
3838 ++ tp->ops->__select_window(sk) >= tp->rcv_wnd) ||
3839 + /* We ACK each frame or... */
3840 + tcp_in_quickack_mode(sk) ||
3841 + /* We have out of order data. */
3842 +@@ -4875,6 +4952,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
3843 + {
3844 + struct tcp_sock *tp = tcp_sk(sk);
3845 +
3846 ++ /* MPTCP urgent data is not yet supported */
3847 ++ if (mptcp(tp))
3848 ++ return;
3849 ++
3850 + /* Check if we get a new urgent pointer - normally not. */
3851 + if (th->urg)
3852 + tcp_check_urg(sk, th);
3853 +@@ -4942,8 +5023,7 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
3854 + }
3855 +
3856 + #ifdef CONFIG_NET_DMA
3857 +-static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
3858 +- int hlen)
3859 ++bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
3860 + {
3861 + struct tcp_sock *tp = tcp_sk(sk);
3862 + int chunk = skb->len - hlen;
3863 +@@ -5052,9 +5132,15 @@ syn_challenge:
3864 + goto discard;
3865 + }
3866 +
3867 ++ /* If valid: post process the received MPTCP options. */
3868 ++ if (mptcp(tp) && mptcp_handle_options(sk, th, skb))
3869 ++ goto discard;
3870 ++
3871 + return true;
3872 +
3873 + discard:
3874 ++ if (mptcp(tp))
3875 ++ mptcp_reset_mopt(tp);
3876 + __kfree_skb(skb);
3877 + return false;
3878 + }
3879 +@@ -5106,6 +5192,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3880 +
3881 + tp->rx_opt.saw_tstamp = 0;
3882 +
3883 ++ /* MPTCP: force slowpath. */
3884 ++ if (mptcp(tp))
3885 ++ goto slow_path;
3886 ++
3887 + /* pred_flags is 0xS?10 << 16 + snd_wnd
3888 + * if header_prediction is to be made
3889 + * 'S' will always be tp->tcp_header_len >> 2
3890 +@@ -5205,7 +5295,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3891 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
3892 + }
3893 + if (copied_early)
3894 +- tcp_cleanup_rbuf(sk, skb->len);
3895 ++ tp->ops->cleanup_rbuf(sk, skb->len);
3896 + }
3897 + if (!eaten) {
3898 + if (tcp_checksum_complete_user(sk, skb))
3899 +@@ -5313,14 +5403,14 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
3900 +
3901 + tcp_init_metrics(sk);
3902 +
3903 +- tcp_init_congestion_control(sk);
3904 ++ tp->ops->init_congestion_control(sk);
3905 +
3906 + /* Prevent spurious tcp_cwnd_restart() on first data
3907 + * packet.
3908 + */
3909 + tp->lsndtime = tcp_time_stamp;
3910 +
3911 +- tcp_init_buffer_space(sk);
3912 ++ tp->ops->init_buffer_space(sk);
3913 +
3914 + if (sock_flag(sk, SOCK_KEEPOPEN))
3915 + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
3916 +@@ -5350,7 +5440,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
3917 + /* Get original SYNACK MSS value if user MSS sets mss_clamp */
3918 + tcp_clear_options(&opt);
3919 + opt.user_mss = opt.mss_clamp = 0;
3920 +- tcp_parse_options(synack, &opt, 0, NULL);
3921 ++ tcp_parse_options(synack, &opt, NULL, 0, NULL);
3922 + mss = opt.mss_clamp;
3923 + }
3924 +
3925 +@@ -5365,7 +5455,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
3926 +
3927 + tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
3928 +
3929 +- if (data) { /* Retransmit unacked data in SYN */
3930 ++ /* In the mptcp case, we do not rely on "retransmit", but instead on
3931 ++ * "transmit", because if fastopen data is not acked, the retransmission
3932 ++ * becomes the first MPTCP data (see mptcp_rcv_synsent_fastopen).
3933 ++ */
3934 ++ if (data && !mptcp(tp)) { /* Retransmit unacked data in SYN */
3935 + tcp_for_write_queue_from(data, sk) {
3936 + if (data == tcp_send_head(sk) ||
3937 + __tcp_retransmit_skb(sk, data))
3938 +@@ -5388,8 +5482,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3939 + struct tcp_sock *tp = tcp_sk(sk);
3940 + struct tcp_fastopen_cookie foc = { .len = -1 };
3941 + int saved_clamp = tp->rx_opt.mss_clamp;
3942 ++ struct mptcp_options_received mopt;
3943 ++ mptcp_init_mp_opt(&mopt);
3944 +
3945 +- tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
3946 ++ tcp_parse_options(skb, &tp->rx_opt,
3947 ++ mptcp(tp) ? &tp->mptcp->rx_opt : &mopt, 0, &foc);
3948 + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3949 + tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3950 +
3951 +@@ -5448,6 +5545,30 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3952 + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
3953 + tcp_ack(sk, skb, FLAG_SLOWPATH);
3954 +
3955 ++ if (tp->request_mptcp || mptcp(tp)) {
3956 ++ int ret;
3957 ++ ret = mptcp_rcv_synsent_state_process(sk, &sk,
3958 ++ skb, &mopt);
3959 ++
3960 ++ /* May have changed if we support MPTCP */
3961 ++ tp = tcp_sk(sk);
3962 ++ icsk = inet_csk(sk);
3963 ++
3964 ++ if (ret == 1)
3965 ++ goto reset_and_undo;
3966 ++ if (ret == 2)
3967 ++ goto discard;
3968 ++ }
3969 ++
3970 ++ if (mptcp(tp) && !is_master_tp(tp)) {
3971 ++ /* Timer for repeating the ACK until an answer
3972 ++ * arrives. Used only when establishing an additional
3973 ++ * subflow inside of an MPTCP connection.
3974 ++ */
3975 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
3976 ++ jiffies + icsk->icsk_rto);
3977 ++ }
3978 ++
3979 + /* Ok.. it's good. Set up sequence numbers and
3980 + * move to established.
3981 + */
3982 +@@ -5474,6 +5595,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3983 + tp->tcp_header_len = sizeof(struct tcphdr);
3984 + }
3985 +
3986 ++ if (mptcp(tp)) {
3987 ++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
3988 ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
3989 ++ }
3990 ++
3991 + if (tcp_is_sack(tp) && sysctl_tcp_fack)
3992 + tcp_enable_fack(tp);
3993 +
3994 +@@ -5494,9 +5620,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3995 + tcp_rcv_fastopen_synack(sk, skb, &foc))
3996 + return -1;
3997 +
3998 +- if (sk->sk_write_pending ||
3999 ++ /* With MPTCP we cannot send data on the third ack due to the
4000 ++ * lack of option-space to combine with an MP_CAPABLE.
4001 ++ */
4002 ++ if (!mptcp(tp) && (sk->sk_write_pending ||
4003 + icsk->icsk_accept_queue.rskq_defer_accept ||
4004 +- icsk->icsk_ack.pingpong) {
4005 ++ icsk->icsk_ack.pingpong)) {
4006 + /* Save one ACK. Data will be ready after
4007 + * several ticks, if write_pending is set.
4008 + *
4009 +@@ -5536,6 +5665,7 @@ discard:
4010 + tcp_paws_reject(&tp->rx_opt, 0))
4011 + goto discard_and_undo;
4012 +
4013 ++ /* TODO - check this here for MPTCP */
4014 + if (th->syn) {
4015 + /* We see SYN without ACK. It is attempt of
4016 + * simultaneous connect with crossed SYNs.
4017 +@@ -5552,6 +5682,11 @@ discard:
4018 + tp->tcp_header_len = sizeof(struct tcphdr);
4019 + }
4020 +
4021 ++ if (mptcp(tp)) {
4022 ++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
4023 ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
4024 ++ }
4025 ++
4026 + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
4027 + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
4028 +
4029 +@@ -5610,6 +5745,7 @@ reset_and_undo:
4030 +
4031 + int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4032 + const struct tcphdr *th, unsigned int len)
4033 ++ __releases(&sk->sk_lock.slock)
4034 + {
4035 + struct tcp_sock *tp = tcp_sk(sk);
4036 + struct inet_connection_sock *icsk = inet_csk(sk);
4037 +@@ -5661,6 +5797,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4038 +
4039 + case TCP_SYN_SENT:
4040 + queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4041 ++ if (is_meta_sk(sk)) {
4042 ++ sk = tcp_sk(sk)->mpcb->master_sk;
4043 ++ tp = tcp_sk(sk);
4044 ++
4045 ++ /* Need to call it here, because it will announce new
4046 ++ * addresses, which can only be done after the third ack
4047 ++ * of the 3-way handshake.
4048 ++ */
4049 ++ mptcp_update_metasocket(sk, tp->meta_sk);
4050 ++ }
4051 + if (queued >= 0)
4052 + return queued;
4053 +
4054 +@@ -5668,6 +5814,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4055 + tcp_urg(sk, skb, th);
4056 + __kfree_skb(skb);
4057 + tcp_data_snd_check(sk);
4058 ++ if (mptcp(tp) && is_master_tp(tp))
4059 ++ bh_unlock_sock(sk);
4060 + return 0;
4061 + }
4062 +
4063 +@@ -5706,11 +5854,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4064 + synack_stamp = tp->lsndtime;
4065 + /* Make sure socket is routed, for correct metrics. */
4066 + icsk->icsk_af_ops->rebuild_header(sk);
4067 +- tcp_init_congestion_control(sk);
4068 ++ tp->ops->init_congestion_control(sk);
4069 +
4070 + tcp_mtup_init(sk);
4071 + tp->copied_seq = tp->rcv_nxt;
4072 +- tcp_init_buffer_space(sk);
4073 ++ tp->ops->init_buffer_space(sk);
4074 + }
4075 + smp_mb();
4076 + tcp_set_state(sk, TCP_ESTABLISHED);
4077 +@@ -5730,6 +5878,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4078 +
4079 + if (tp->rx_opt.tstamp_ok)
4080 + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
4081 ++ if (mptcp(tp))
4082 ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
4083 +
4084 + if (req) {
4085 + /* Re-arm the timer because data may have been sent out.
4086 +@@ -5751,6 +5901,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4087 +
4088 + tcp_initialize_rcv_mss(sk);
4089 + tcp_fast_path_on(tp);
4090 ++ /* Send an ACK when establishing a new
4091 ++ * MPTCP subflow, i.e. using an MP_JOIN
4092 ++ * subtype.
4093 ++ */
4094 ++ if (mptcp(tp) && !is_master_tp(tp))
4095 ++ tcp_send_ack(sk);
4096 + break;
4097 +
4098 + case TCP_FIN_WAIT1: {
4099 +@@ -5802,7 +5958,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4100 + tmo = tcp_fin_time(sk);
4101 + if (tmo > TCP_TIMEWAIT_LEN) {
4102 + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
4103 +- } else if (th->fin || sock_owned_by_user(sk)) {
4104 ++ } else if (th->fin || mptcp_is_data_fin(skb) ||
4105 ++ sock_owned_by_user(sk)) {
4106 + /* Bad case. We could lose such FIN otherwise.
4107 + * It is not a big problem, but it looks confusing
4108 + * and not so rare event. We still can lose it now,
4109 +@@ -5811,7 +5968,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4110 + */
4111 + inet_csk_reset_keepalive_timer(sk, tmo);
4112 + } else {
4113 +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
4114 ++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
4115 + goto discard;
4116 + }
4117 + break;
4118 +@@ -5819,7 +5976,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4119 +
4120 + case TCP_CLOSING:
4121 + if (tp->snd_una == tp->write_seq) {
4122 +- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
4123 ++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
4124 + goto discard;
4125 + }
4126 + break;
4127 +@@ -5831,6 +5988,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4128 + goto discard;
4129 + }
4130 + break;
4131 ++ case TCP_CLOSE:
4132 ++ if (tp->mp_killed)
4133 ++ goto discard;
4134 + }
4135 +
4136 + /* step 6: check the URG bit */
4137 +@@ -5851,7 +6011,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4138 + */
4139 + if (sk->sk_shutdown & RCV_SHUTDOWN) {
4140 + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4141 +- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
4142 ++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
4143 ++ !mptcp(tp)) {
4144 ++ /* In case of mptcp, the reset is handled by
4145 ++ * mptcp_rcv_state_process
4146 ++ */
4147 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
4148 + tcp_reset(sk);
4149 + return 1;
4150 +@@ -5877,3 +6041,154 @@ discard:
4151 + return 0;
4152 + }
4153 + EXPORT_SYMBOL(tcp_rcv_state_process);
4154 ++
4155 ++static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
4156 ++{
4157 ++ struct inet_request_sock *ireq = inet_rsk(req);
4158 ++
4159 ++ if (family == AF_INET)
4160 ++ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
4161 ++ &ireq->ir_rmt_addr, port);
4162 ++#if IS_ENABLED(CONFIG_IPV6)
4163 ++ else if (family == AF_INET6)
4164 ++ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
4165 ++ &ireq->ir_v6_rmt_addr, port);
4166 ++#endif
4167 ++}
4168 ++
4169 ++int tcp_conn_request(struct request_sock_ops *rsk_ops,
4170 ++ const struct tcp_request_sock_ops *af_ops,
4171 ++ struct sock *sk, struct sk_buff *skb)
4172 ++{
4173 ++ struct tcp_options_received tmp_opt;
4174 ++ struct request_sock *req;
4175 ++ struct tcp_sock *tp = tcp_sk(sk);
4176 ++ struct dst_entry *dst = NULL;
4177 ++ __u32 isn = TCP_SKB_CB(skb)->when;
4178 ++ bool want_cookie = false, fastopen;
4179 ++ struct flowi fl;
4180 ++ struct tcp_fastopen_cookie foc = { .len = -1 };
4181 ++ int err;
4182 ++
4183 ++
4184 ++ /* TW buckets are converted to open requests without
4185 ++ * limitations, they conserve resources and peer is
4186 ++ * evidently real one.
4187 ++ */
4188 ++ if ((sysctl_tcp_syncookies == 2 ||
4189 ++ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
4190 ++ want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
4191 ++ if (!want_cookie)
4192 ++ goto drop;
4193 ++ }
4194 ++
4195 ++
4196 ++ /* Accept backlog is full. If we have already queued enough
4197 ++ * of warm entries in syn queue, drop request. It is better than
4198 ++ * clogging syn queue with openreqs with exponentially increasing
4199 ++ * timeout.
4200 ++ */
4201 ++ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
4202 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
4203 ++ goto drop;
4204 ++ }
4205 ++
4206 ++ req = inet_reqsk_alloc(rsk_ops);
4207 ++ if (!req)
4208 ++ goto drop;
4209 ++
4210 ++ tcp_rsk(req)->af_specific = af_ops;
4211 ++
4212 ++ tcp_clear_options(&tmp_opt);
4213 ++ tmp_opt.mss_clamp = af_ops->mss_clamp;
4214 ++ tmp_opt.user_mss = tp->rx_opt.user_mss;
4215 ++ tcp_parse_options(skb, &tmp_opt, NULL, 0, want_cookie ? NULL : &foc);
4216 ++
4217 ++ if (want_cookie && !tmp_opt.saw_tstamp)
4218 ++ tcp_clear_options(&tmp_opt);
4219 ++
4220 ++ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
4221 ++ tcp_openreq_init(req, &tmp_opt, skb);
4222 ++
4223 ++ if (af_ops->init_req(req, sk, skb))
4224 ++ goto drop_and_free;
4225 ++
4226 ++ if (security_inet_conn_request(sk, skb, req))
4227 ++ goto drop_and_free;
4228 ++
4229 ++ if (!want_cookie || tmp_opt.tstamp_ok)
4230 ++ TCP_ECN_create_request(req, skb, sock_net(sk));
4231 ++
4232 ++ if (want_cookie) {
4233 ++ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
4234 ++ req->cookie_ts = tmp_opt.tstamp_ok;
4235 ++ } else if (!isn) {
4236 ++ /* VJ's idea. We save last timestamp seen
4237 ++ * from the destination in peer table, when entering
4238 ++ * state TIME-WAIT, and check against it before
4239 ++ * accepting new connection request.
4240 ++ *
4241 ++ * If "isn" is not zero, this request hit alive
4242 ++ * timewait bucket, so that all the necessary checks
4243 ++ * are made in the function processing timewait state.
4244 ++ */
4245 ++ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
4246 ++ bool strict;
4247 ++
4248 ++ dst = af_ops->route_req(sk, &fl, req, &strict);
4249 ++ if (dst && strict &&
4250 ++ !tcp_peer_is_proven(req, dst, true)) {
4251 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
4252 ++ goto drop_and_release;
4253 ++ }
4254 ++ }
4255 ++ /* Kill the following clause, if you dislike this way. */
4256 ++ else if (!sysctl_tcp_syncookies &&
4257 ++ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
4258 ++ (sysctl_max_syn_backlog >> 2)) &&
4259 ++ !tcp_peer_is_proven(req, dst, false)) {
4260 ++ /* Without syncookies last quarter of
4261 ++ * backlog is filled with destinations,
4262 ++ * proven to be alive.
4263 ++ * It means that we continue to communicate
4264 ++ * to destinations, already remembered
4265 ++ * to the moment of synflood.
4266 ++ */
4267 ++ pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
4268 ++ rsk_ops->family);
4269 ++ goto drop_and_release;
4270 ++ }
4271 ++
4272 ++ isn = af_ops->init_seq(skb);
4273 ++ }
4274 ++ if (!dst) {
4275 ++ dst = af_ops->route_req(sk, &fl, req, NULL);
4276 ++ if (!dst)
4277 ++ goto drop_and_free;
4278 ++ }
4279 ++
4280 ++ tcp_rsk(req)->snt_isn = isn;
4281 ++ tcp_openreq_init_rwin(req, sk, dst);
4282 ++ fastopen = !want_cookie &&
4283 ++ tcp_try_fastopen(sk, skb, req, &foc, dst);
4284 ++ err = af_ops->send_synack(sk, dst, &fl, req,
4285 ++ skb_get_queue_mapping(skb), &foc);
4286 ++ if (!fastopen) {
4287 ++ if (err || want_cookie)
4288 ++ goto drop_and_free;
4289 ++
4290 ++ tcp_rsk(req)->listener = NULL;
4291 ++ af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
4292 ++ }
4293 ++
4294 ++ return 0;
4295 ++
4296 ++drop_and_release:
4297 ++ dst_release(dst);
4298 ++drop_and_free:
4299 ++ reqsk_free(req);
4300 ++drop:
4301 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
4302 ++ return 0;
4303 ++}
4304 ++EXPORT_SYMBOL(tcp_conn_request);
4305 +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
4306 +index 77cccda1ad0c..c77017f600f1 100644
4307 +--- a/net/ipv4/tcp_ipv4.c
4308 ++++ b/net/ipv4/tcp_ipv4.c
4309 +@@ -67,6 +67,8 @@
4310 + #include <net/icmp.h>
4311 + #include <net/inet_hashtables.h>
4312 + #include <net/tcp.h>
4313 ++#include <net/mptcp.h>
4314 ++#include <net/mptcp_v4.h>
4315 + #include <net/transp_v6.h>
4316 + #include <net/ipv6.h>
4317 + #include <net/inet_common.h>
4318 +@@ -99,7 +101,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
4319 + struct inet_hashinfo tcp_hashinfo;
4320 + EXPORT_SYMBOL(tcp_hashinfo);
4321 +
4322 +-static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
4323 ++__u32 tcp_v4_init_sequence(const struct sk_buff *skb)
4324 + {
4325 + return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
4326 + ip_hdr(skb)->saddr,
4327 +@@ -334,7 +336,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4328 + struct inet_sock *inet;
4329 + const int type = icmp_hdr(icmp_skb)->type;
4330 + const int code = icmp_hdr(icmp_skb)->code;
4331 +- struct sock *sk;
4332 ++ struct sock *sk, *meta_sk;
4333 + struct sk_buff *skb;
4334 + struct request_sock *fastopen;
4335 + __u32 seq, snd_una;
4336 +@@ -358,13 +360,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4337 + return;
4338 + }
4339 +
4340 +- bh_lock_sock(sk);
4341 ++ tp = tcp_sk(sk);
4342 ++ if (mptcp(tp))
4343 ++ meta_sk = mptcp_meta_sk(sk);
4344 ++ else
4345 ++ meta_sk = sk;
4346 ++
4347 ++ bh_lock_sock(meta_sk);
4348 + /* If too many ICMPs get dropped on busy
4349 + * servers this needs to be solved differently.
4350 + * We do take care of PMTU discovery (RFC1191) special case :
4351 + * we can receive locally generated ICMP messages while socket is held.
4352 + */
4353 +- if (sock_owned_by_user(sk)) {
4354 ++ if (sock_owned_by_user(meta_sk)) {
4355 + if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
4356 + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
4357 + }
4358 +@@ -377,7 +385,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4359 + }
4360 +
4361 + icsk = inet_csk(sk);
4362 +- tp = tcp_sk(sk);
4363 + seq = ntohl(th->seq);
4364 + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
4365 + fastopen = tp->fastopen_rsk;
4366 +@@ -411,11 +418,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4367 + goto out;
4368 +
4369 + tp->mtu_info = info;
4370 +- if (!sock_owned_by_user(sk)) {
4371 ++ if (!sock_owned_by_user(meta_sk)) {
4372 + tcp_v4_mtu_reduced(sk);
4373 + } else {
4374 + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
4375 + sock_hold(sk);
4376 ++ if (mptcp(tp))
4377 ++ mptcp_tsq_flags(sk);
4378 + }
4379 + goto out;
4380 + }
4381 +@@ -429,7 +438,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4382 + !icsk->icsk_backoff || fastopen)
4383 + break;
4384 +
4385 +- if (sock_owned_by_user(sk))
4386 ++ if (sock_owned_by_user(meta_sk))
4387 + break;
4388 +
4389 + icsk->icsk_backoff--;
4390 +@@ -463,7 +472,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4391 + switch (sk->sk_state) {
4392 + struct request_sock *req, **prev;
4393 + case TCP_LISTEN:
4394 +- if (sock_owned_by_user(sk))
4395 ++ if (sock_owned_by_user(meta_sk))
4396 + goto out;
4397 +
4398 + req = inet_csk_search_req(sk, &prev, th->dest,
4399 +@@ -499,7 +508,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4400 + if (fastopen && fastopen->sk == NULL)
4401 + break;
4402 +
4403 +- if (!sock_owned_by_user(sk)) {
4404 ++ if (!sock_owned_by_user(meta_sk)) {
4405 + sk->sk_err = err;
4406 +
4407 + sk->sk_error_report(sk);
4408 +@@ -528,7 +537,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4409 + */
4410 +
4411 + inet = inet_sk(sk);
4412 +- if (!sock_owned_by_user(sk) && inet->recverr) {
4413 ++ if (!sock_owned_by_user(meta_sk) && inet->recverr) {
4414 + sk->sk_err = err;
4415 + sk->sk_error_report(sk);
4416 + } else { /* Only an error on timeout */
4417 +@@ -536,7 +545,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4418 + }
4419 +
4420 + out:
4421 +- bh_unlock_sock(sk);
4422 ++ bh_unlock_sock(meta_sk);
4423 + sock_put(sk);
4424 + }
4425 +
4426 +@@ -578,7 +587,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
4427 + * Exception: precedence violation. We do not implement it in any case.
4428 + */
4429 +
4430 +-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
4431 ++void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
4432 + {
4433 + const struct tcphdr *th = tcp_hdr(skb);
4434 + struct {
4435 +@@ -702,10 +711,10 @@ release_sk1:
4436 + outside socket context is ugly, certainly. What can I do?
4437 + */
4438 +
4439 +-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
4440 ++static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
4441 + u32 win, u32 tsval, u32 tsecr, int oif,
4442 + struct tcp_md5sig_key *key,
4443 +- int reply_flags, u8 tos)
4444 ++ int reply_flags, u8 tos, int mptcp)
4445 + {
4446 + const struct tcphdr *th = tcp_hdr(skb);
4447 + struct {
4448 +@@ -714,6 +723,10 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
4449 + #ifdef CONFIG_TCP_MD5SIG
4450 + + (TCPOLEN_MD5SIG_ALIGNED >> 2)
4451 + #endif
4452 ++#ifdef CONFIG_MPTCP
4453 ++ + ((MPTCP_SUB_LEN_DSS >> 2) +
4454 ++ (MPTCP_SUB_LEN_ACK >> 2))
4455 ++#endif
4456 + ];
4457 + } rep;
4458 + struct ip_reply_arg arg;
4459 +@@ -758,6 +771,21 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
4460 + ip_hdr(skb)->daddr, &rep.th);
4461 + }
4462 + #endif
4463 ++#ifdef CONFIG_MPTCP
4464 ++ if (mptcp) {
4465 ++ int offset = (tsecr) ? 3 : 0;
4466 ++ /* Construction of 32-bit data_ack */
4467 ++ rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
4468 ++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
4469 ++ (0x20 << 8) |
4470 ++ (0x01));
4471 ++ rep.opt[offset] = htonl(data_ack);
4472 ++
4473 ++ arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
4474 ++ rep.th.doff = arg.iov[0].iov_len / 4;
4475 ++ }
4476 ++#endif /* CONFIG_MPTCP */
4477 ++
4478 + arg.flags = reply_flags;
4479 + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
4480 + ip_hdr(skb)->saddr, /* XXX */
4481 +@@ -776,36 +804,44 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
4482 + {
4483 + struct inet_timewait_sock *tw = inet_twsk(sk);
4484 + struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
4485 ++ u32 data_ack = 0;
4486 ++ int mptcp = 0;
4487 ++
4488 ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
4489 ++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
4490 ++ mptcp = 1;
4491 ++ }
4492 +
4493 + tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
4494 ++ data_ack,
4495 + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
4496 + tcp_time_stamp + tcptw->tw_ts_offset,
4497 + tcptw->tw_ts_recent,
4498 + tw->tw_bound_dev_if,
4499 + tcp_twsk_md5_key(tcptw),
4500 + tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
4501 +- tw->tw_tos
4502 ++ tw->tw_tos, mptcp
4503 + );
4504 +
4505 + inet_twsk_put(tw);
4506 + }
4507 +
4508 +-static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
4509 +- struct request_sock *req)
4510 ++void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
4511 ++ struct request_sock *req)
4512 + {
4513 + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
4514 + * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
4515 + */
4516 + tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
4517 + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
4518 +- tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
4519 ++ tcp_rsk(req)->rcv_nxt, 0, req->rcv_wnd,
4520 + tcp_time_stamp,
4521 + req->ts_recent,
4522 + 0,
4523 + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
4524 + AF_INET),
4525 + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
4526 +- ip_hdr(skb)->tos);
4527 ++ ip_hdr(skb)->tos, 0);
4528 + }
4529 +
4530 + /*
4531 +@@ -813,10 +849,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
4532 + * This still operates on a request_sock only, not on a big
4533 + * socket.
4534 + */
4535 +-static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
4536 +- struct request_sock *req,
4537 +- u16 queue_mapping,
4538 +- struct tcp_fastopen_cookie *foc)
4539 ++int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
4540 ++ struct flowi *fl,
4541 ++ struct request_sock *req,
4542 ++ u16 queue_mapping,
4543 ++ struct tcp_fastopen_cookie *foc)
4544 + {
4545 + const struct inet_request_sock *ireq = inet_rsk(req);
4546 + struct flowi4 fl4;
4547 +@@ -844,21 +881,10 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
4548 + return err;
4549 + }
4550 +
4551 +-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
4552 +-{
4553 +- int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
4554 +-
4555 +- if (!res) {
4556 +- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
4557 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
4558 +- }
4559 +- return res;
4560 +-}
4561 +-
4562 + /*
4563 + * IPv4 request_sock destructor.
4564 + */
4565 +-static void tcp_v4_reqsk_destructor(struct request_sock *req)
4566 ++void tcp_v4_reqsk_destructor(struct request_sock *req)
4567 + {
4568 + kfree(inet_rsk(req)->opt);
4569 + }
4570 +@@ -896,7 +922,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
4571 + /*
4572 + * Save and compile IPv4 options into the request_sock if needed.
4573 + */
4574 +-static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
4575 ++struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
4576 + {
4577 + const struct ip_options *opt = &(IPCB(skb)->opt);
4578 + struct ip_options_rcu *dopt = NULL;
4579 +@@ -1237,161 +1263,71 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
4580 +
4581 + #endif
4582 +
4583 ++static int tcp_v4_init_req(struct request_sock *req, struct sock *sk,
4584 ++ struct sk_buff *skb)
4585 ++{
4586 ++ struct inet_request_sock *ireq = inet_rsk(req);
4587 ++
4588 ++ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
4589 ++ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
4590 ++ ireq->no_srccheck = inet_sk(sk)->transparent;
4591 ++ ireq->opt = tcp_v4_save_options(skb);
4592 ++ ireq->ir_mark = inet_request_mark(sk, skb);
4593 ++
4594 ++ return 0;
4595 ++}
4596 ++
4597 ++static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
4598 ++ const struct request_sock *req,
4599 ++ bool *strict)
4600 ++{
4601 ++ struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
4602 ++
4603 ++ if (strict) {
4604 ++ if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
4605 ++ *strict = true;
4606 ++ else
4607 ++ *strict = false;
4608 ++ }
4609 ++
4610 ++ return dst;
4611 ++}
4612 ++
4613 + struct request_sock_ops tcp_request_sock_ops __read_mostly = {
4614 + .family = PF_INET,
4615 + .obj_size = sizeof(struct tcp_request_sock),
4616 +- .rtx_syn_ack = tcp_v4_rtx_synack,
4617 ++ .rtx_syn_ack = tcp_rtx_synack,
4618 + .send_ack = tcp_v4_reqsk_send_ack,
4619 + .destructor = tcp_v4_reqsk_destructor,
4620 + .send_reset = tcp_v4_send_reset,
4621 + .syn_ack_timeout = tcp_syn_ack_timeout,
4622 + };
4623 +
4624 ++const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
4625 ++ .mss_clamp = TCP_MSS_DEFAULT,
4626 + #ifdef CONFIG_TCP_MD5SIG
4627 +-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
4628 + .md5_lookup = tcp_v4_reqsk_md5_lookup,
4629 + .calc_md5_hash = tcp_v4_md5_hash_skb,
4630 +-};
4631 + #endif
4632 ++ .init_req = tcp_v4_init_req,
4633 ++#ifdef CONFIG_SYN_COOKIES
4634 ++ .cookie_init_seq = cookie_v4_init_sequence,
4635 ++#endif
4636 ++ .route_req = tcp_v4_route_req,
4637 ++ .init_seq = tcp_v4_init_sequence,
4638 ++ .send_synack = tcp_v4_send_synack,
4639 ++ .queue_hash_add = inet_csk_reqsk_queue_hash_add,
4640 ++};
4641 +
4642 + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
4643 + {
4644 +- struct tcp_options_received tmp_opt;
4645 +- struct request_sock *req;
4646 +- struct inet_request_sock *ireq;
4647 +- struct tcp_sock *tp = tcp_sk(sk);
4648 +- struct dst_entry *dst = NULL;
4649 +- __be32 saddr = ip_hdr(skb)->saddr;
4650 +- __be32 daddr = ip_hdr(skb)->daddr;
4651 +- __u32 isn = TCP_SKB_CB(skb)->when;
4652 +- bool want_cookie = false, fastopen;
4653 +- struct flowi4 fl4;
4654 +- struct tcp_fastopen_cookie foc = { .len = -1 };
4655 +- int err;
4656 +-
4657 + /* Never answer to SYNs send to broadcast or multicast */
4658 + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
4659 + goto drop;
4660 +
4661 +- /* TW buckets are converted to open requests without
4662 +- * limitations, they conserve resources and peer is
4663 +- * evidently real one.
4664 +- */
4665 +- if ((sysctl_tcp_syncookies == 2 ||
4666 +- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
4667 +- want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
4668 +- if (!want_cookie)
4669 +- goto drop;
4670 +- }
4671 +-
4672 +- /* Accept backlog is full. If we have already queued enough
4673 +- * of warm entries in syn queue, drop request. It is better than
4674 +- * clogging syn queue with openreqs with exponentially increasing
4675 +- * timeout.
4676 +- */
4677 +- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
4678 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
4679 +- goto drop;
4680 +- }
4681 +-
4682 +- req = inet_reqsk_alloc(&tcp_request_sock_ops);
4683 +- if (!req)
4684 +- goto drop;
4685 +-
4686 +-#ifdef CONFIG_TCP_MD5SIG
4687 +- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
4688 +-#endif
4689 +-
4690 +- tcp_clear_options(&tmp_opt);
4691 +- tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
4692 +- tmp_opt.user_mss = tp->rx_opt.user_mss;
4693 +- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
4694 +-
4695 +- if (want_cookie && !tmp_opt.saw_tstamp)
4696 +- tcp_clear_options(&tmp_opt);
4697 +-
4698 +- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
4699 +- tcp_openreq_init(req, &tmp_opt, skb);
4700 ++ return tcp_conn_request(&tcp_request_sock_ops,
4701 ++ &tcp_request_sock_ipv4_ops, sk, skb);
4702 +
4703 +- ireq = inet_rsk(req);
4704 +- ireq->ir_loc_addr = daddr;
4705 +- ireq->ir_rmt_addr = saddr;
4706 +- ireq->no_srccheck = inet_sk(sk)->transparent;
4707 +- ireq->opt = tcp_v4_save_options(skb);
4708 +- ireq->ir_mark = inet_request_mark(sk, skb);
4709 +-
4710 +- if (security_inet_conn_request(sk, skb, req))
4711 +- goto drop_and_free;
4712 +-
4713 +- if (!want_cookie || tmp_opt.tstamp_ok)
4714 +- TCP_ECN_create_request(req, skb, sock_net(sk));
4715 +-
4716 +- if (want_cookie) {
4717 +- isn = cookie_v4_init_sequence(sk, skb, &req->mss);
4718 +- req->cookie_ts = tmp_opt.tstamp_ok;
4719 +- } else if (!isn) {
4720 +- /* VJ's idea. We save last timestamp seen
4721 +- * from the destination in peer table, when entering
4722 +- * state TIME-WAIT, and check against it before
4723 +- * accepting new connection request.
4724 +- *
4725 +- * If "isn" is not zero, this request hit alive
4726 +- * timewait bucket, so that all the necessary checks
4727 +- * are made in the function processing timewait state.
4728 +- */
4729 +- if (tmp_opt.saw_tstamp &&
4730 +- tcp_death_row.sysctl_tw_recycle &&
4731 +- (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
4732 +- fl4.daddr == saddr) {
4733 +- if (!tcp_peer_is_proven(req, dst, true)) {
4734 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
4735 +- goto drop_and_release;
4736 +- }
4737 +- }
4738 +- /* Kill the following clause, if you dislike this way. */
4739 +- else if (!sysctl_tcp_syncookies &&
4740 +- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
4741 +- (sysctl_max_syn_backlog >> 2)) &&
4742 +- !tcp_peer_is_proven(req, dst, false)) {
4743 +- /* Without syncookies last quarter of
4744 +- * backlog is filled with destinations,
4745 +- * proven to be alive.
4746 +- * It means that we continue to communicate
4747 +- * to destinations, already remembered
4748 +- * to the moment of synflood.
4749 +- */
4750 +- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
4751 +- &saddr, ntohs(tcp_hdr(skb)->source));
4752 +- goto drop_and_release;
4753 +- }
4754 +-
4755 +- isn = tcp_v4_init_sequence(skb);
4756 +- }
4757 +- if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
4758 +- goto drop_and_free;
4759 +-
4760 +- tcp_rsk(req)->snt_isn = isn;
4761 +- tcp_rsk(req)->snt_synack = tcp_time_stamp;
4762 +- tcp_openreq_init_rwin(req, sk, dst);
4763 +- fastopen = !want_cookie &&
4764 +- tcp_try_fastopen(sk, skb, req, &foc, dst);
4765 +- err = tcp_v4_send_synack(sk, dst, req,
4766 +- skb_get_queue_mapping(skb), &foc);
4767 +- if (!fastopen) {
4768 +- if (err || want_cookie)
4769 +- goto drop_and_free;
4770 +-
4771 +- tcp_rsk(req)->snt_synack = tcp_time_stamp;
4772 +- tcp_rsk(req)->listener = NULL;
4773 +- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
4774 +- }
4775 +-
4776 +- return 0;
4777 +-
4778 +-drop_and_release:
4779 +- dst_release(dst);
4780 +-drop_and_free:
4781 +- reqsk_free(req);
4782 + drop:
4783 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
4784 + return 0;
4785 +@@ -1497,7 +1433,7 @@ put_and_exit:
4786 + }
4787 + EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
4788 +
4789 +-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
4790 ++struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
4791 + {
4792 + struct tcphdr *th = tcp_hdr(skb);
4793 + const struct iphdr *iph = ip_hdr(skb);
4794 +@@ -1514,8 +1450,15 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
4795 +
4796 + if (nsk) {
4797 + if (nsk->sk_state != TCP_TIME_WAIT) {
4798 ++		/* Don't lock the meta-sk again. It has been locked
4799 ++ * before mptcp_v4_do_rcv.
4800 ++ */
4801 ++ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk))
4802 ++ bh_lock_sock(mptcp_meta_sk(nsk));
4803 + bh_lock_sock(nsk);
4804 ++
4805 + return nsk;
4806 ++
4807 + }
4808 + inet_twsk_put(inet_twsk(nsk));
4809 + return NULL;
4810 +@@ -1550,6 +1493,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
4811 + goto discard;
4812 + #endif
4813 +
4814 ++ if (is_meta_sk(sk))
4815 ++ return mptcp_v4_do_rcv(sk, skb);
4816 ++
4817 + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
4818 + struct dst_entry *dst = sk->sk_rx_dst;
4819 +
4820 +@@ -1681,7 +1627,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
4821 + } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
4822 + wake_up_interruptible_sync_poll(sk_sleep(sk),
4823 + POLLIN | POLLRDNORM | POLLRDBAND);
4824 +- if (!inet_csk_ack_scheduled(sk))
4825 ++ if (!inet_csk_ack_scheduled(sk) && !mptcp(tp))
4826 + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
4827 + (3 * tcp_rto_min(sk)) / 4,
4828 + TCP_RTO_MAX);
4829 +@@ -1698,7 +1644,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
4830 + {
4831 + const struct iphdr *iph;
4832 + const struct tcphdr *th;
4833 +- struct sock *sk;
4834 ++ struct sock *sk, *meta_sk = NULL;
4835 + int ret;
4836 + struct net *net = dev_net(skb->dev);
4837 +
4838 +@@ -1732,18 +1678,42 @@ int tcp_v4_rcv(struct sk_buff *skb)
4839 + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
4840 + skb->len - th->doff * 4);
4841 + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
4842 ++#ifdef CONFIG_MPTCP
4843 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
4844 ++ TCP_SKB_CB(skb)->dss_off = 0;
4845 ++#endif
4846 + TCP_SKB_CB(skb)->when = 0;
4847 + TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
4848 + TCP_SKB_CB(skb)->sacked = 0;
4849 +
4850 + sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
4851 +- if (!sk)
4852 +- goto no_tcp_socket;
4853 +
4854 + process:
4855 +- if (sk->sk_state == TCP_TIME_WAIT)
4856 ++ if (sk && sk->sk_state == TCP_TIME_WAIT)
4857 + goto do_time_wait;
4858 +
4859 ++#ifdef CONFIG_MPTCP
4860 ++ if (!sk && th->syn && !th->ack) {
4861 ++ int ret = mptcp_lookup_join(skb, NULL);
4862 ++
4863 ++ if (ret < 0) {
4864 ++ tcp_v4_send_reset(NULL, skb);
4865 ++ goto discard_it;
4866 ++ } else if (ret > 0) {
4867 ++ return 0;
4868 ++ }
4869 ++ }
4870 ++
4871 ++ /* Is there a pending request sock for this segment ? */
4872 ++ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
4873 ++ if (sk)
4874 ++ sock_put(sk);
4875 ++ return 0;
4876 ++ }
4877 ++#endif
4878 ++ if (!sk)
4879 ++ goto no_tcp_socket;
4880 ++
4881 + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
4882 + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
4883 + goto discard_and_relse;
4884 +@@ -1759,11 +1729,21 @@ process:
4885 + sk_mark_napi_id(sk, skb);
4886 + skb->dev = NULL;
4887 +
4888 +- bh_lock_sock_nested(sk);
4889 ++ if (mptcp(tcp_sk(sk))) {
4890 ++ meta_sk = mptcp_meta_sk(sk);
4891 ++
4892 ++ bh_lock_sock_nested(meta_sk);
4893 ++ if (sock_owned_by_user(meta_sk))
4894 ++ skb->sk = sk;
4895 ++ } else {
4896 ++ meta_sk = sk;
4897 ++ bh_lock_sock_nested(sk);
4898 ++ }
4899 ++
4900 + ret = 0;
4901 +- if (!sock_owned_by_user(sk)) {
4902 ++ if (!sock_owned_by_user(meta_sk)) {
4903 + #ifdef CONFIG_NET_DMA
4904 +- struct tcp_sock *tp = tcp_sk(sk);
4905 ++ struct tcp_sock *tp = tcp_sk(meta_sk);
4906 + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
4907 + tp->ucopy.dma_chan = net_dma_find_channel();
4908 + if (tp->ucopy.dma_chan)
4909 +@@ -1771,16 +1751,16 @@ process:
4910 + else
4911 + #endif
4912 + {
4913 +- if (!tcp_prequeue(sk, skb))
4914 ++ if (!tcp_prequeue(meta_sk, skb))
4915 + ret = tcp_v4_do_rcv(sk, skb);
4916 + }
4917 +- } else if (unlikely(sk_add_backlog(sk, skb,
4918 +- sk->sk_rcvbuf + sk->sk_sndbuf))) {
4919 +- bh_unlock_sock(sk);
4920 ++ } else if (unlikely(sk_add_backlog(meta_sk, skb,
4921 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
4922 ++ bh_unlock_sock(meta_sk);
4923 + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
4924 + goto discard_and_relse;
4925 + }
4926 +- bh_unlock_sock(sk);
4927 ++ bh_unlock_sock(meta_sk);
4928 +
4929 + sock_put(sk);
4930 +
4931 +@@ -1835,6 +1815,18 @@ do_time_wait:
4932 + sk = sk2;
4933 + goto process;
4934 + }
4935 ++#ifdef CONFIG_MPTCP
4936 ++ if (th->syn && !th->ack) {
4937 ++ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
4938 ++
4939 ++ if (ret < 0) {
4940 ++ tcp_v4_send_reset(NULL, skb);
4941 ++ goto discard_it;
4942 ++ } else if (ret > 0) {
4943 ++ return 0;
4944 ++ }
4945 ++ }
4946 ++#endif
4947 + /* Fall through to ACK */
4948 + }
4949 + case TCP_TW_ACK:
4950 +@@ -1900,7 +1892,12 @@ static int tcp_v4_init_sock(struct sock *sk)
4951 +
4952 + tcp_init_sock(sk);
4953 +
4954 +- icsk->icsk_af_ops = &ipv4_specific;
4955 ++#ifdef CONFIG_MPTCP
4956 ++ if (is_mptcp_enabled(sk))
4957 ++ icsk->icsk_af_ops = &mptcp_v4_specific;
4958 ++ else
4959 ++#endif
4960 ++ icsk->icsk_af_ops = &ipv4_specific;
4961 +
4962 + #ifdef CONFIG_TCP_MD5SIG
4963 + tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
4964 +@@ -1917,6 +1914,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
4965 +
4966 + tcp_cleanup_congestion_control(sk);
4967 +
4968 ++ if (mptcp(tp))
4969 ++ mptcp_destroy_sock(sk);
4970 ++ if (tp->inside_tk_table)
4971 ++ mptcp_hash_remove(tp);
4972 ++
4973 + /* Cleanup up the write buffer. */
4974 + tcp_write_queue_purge(sk);
4975 +
4976 +@@ -2481,6 +2483,19 @@ void tcp4_proc_exit(void)
4977 + }
4978 + #endif /* CONFIG_PROC_FS */
4979 +
4980 ++#ifdef CONFIG_MPTCP
4981 ++static void tcp_v4_clear_sk(struct sock *sk, int size)
4982 ++{
4983 ++ struct tcp_sock *tp = tcp_sk(sk);
4984 ++
4985 ++	/* we do not want to clear the tk_table field because of RCU lookups */
4986 ++ sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table));
4987 ++
4988 ++ size -= offsetof(struct tcp_sock, tk_table) + sizeof(tp->tk_table);
4989 ++ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size);
4990 ++}
4991 ++#endif
4992 ++
4993 + struct proto tcp_prot = {
4994 + .name = "TCP",
4995 + .owner = THIS_MODULE,
4996 +@@ -2528,6 +2543,9 @@ struct proto tcp_prot = {
4997 + .destroy_cgroup = tcp_destroy_cgroup,
4998 + .proto_cgroup = tcp_proto_cgroup,
4999 + #endif
5000 ++#ifdef CONFIG_MPTCP
5001 ++ .clear_sk = tcp_v4_clear_sk,
5002 ++#endif
5003 + };
5004 + EXPORT_SYMBOL(tcp_prot);
5005 +
5006 +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
5007 +index e68e0d4af6c9..ae6946857dff 100644
5008 +--- a/net/ipv4/tcp_minisocks.c
5009 ++++ b/net/ipv4/tcp_minisocks.c
5010 +@@ -18,11 +18,13 @@
5011 + * Jorge Cwik, <jorge@×××××××××××××.net>
5012 + */
5013 +
5014 ++#include <linux/kconfig.h>
5015 + #include <linux/mm.h>
5016 + #include <linux/module.h>
5017 + #include <linux/slab.h>
5018 + #include <linux/sysctl.h>
5019 + #include <linux/workqueue.h>
5020 ++#include <net/mptcp.h>
5021 + #include <net/tcp.h>
5022 + #include <net/inet_common.h>
5023 + #include <net/xfrm.h>
5024 +@@ -95,10 +97,13 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
5025 + struct tcp_options_received tmp_opt;
5026 + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
5027 + bool paws_reject = false;
5028 ++ struct mptcp_options_received mopt;
5029 +
5030 + tmp_opt.saw_tstamp = 0;
5031 + if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
5032 +- tcp_parse_options(skb, &tmp_opt, 0, NULL);
5033 ++ mptcp_init_mp_opt(&mopt);
5034 ++
5035 ++ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
5036 +
5037 + if (tmp_opt.saw_tstamp) {
5038 + tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
5039 +@@ -106,6 +111,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
5040 + tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
5041 + paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
5042 + }
5043 ++
5044 ++ if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) {
5045 ++ if (mopt.mptcp_key == tcptw->mptcp_tw->loc_key)
5046 ++ goto kill_with_rst;
5047 ++ }
5048 + }
5049 +
5050 + if (tw->tw_substate == TCP_FIN_WAIT2) {
5051 +@@ -128,6 +138,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
5052 + if (!th->ack ||
5053 + !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
5054 + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
5055 ++ /* If mptcp_is_data_fin() returns true, we are sure that
5056 ++ * mopt has been initialized - otherwise it would not
5057 ++ * be a DATA_FIN.
5058 ++ */
5059 ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw &&
5060 ++ mptcp_is_data_fin(skb) &&
5061 ++ TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
5062 ++ mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt)
5063 ++ return TCP_TW_ACK;
5064 ++
5065 + inet_twsk_put(tw);
5066 + return TCP_TW_SUCCESS;
5067 + }
5068 +@@ -290,6 +310,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
5069 + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
5070 + tcptw->tw_ts_offset = tp->tsoffset;
5071 +
5072 ++ if (mptcp(tp)) {
5073 ++ if (mptcp_init_tw_sock(sk, tcptw)) {
5074 ++ inet_twsk_free(tw);
5075 ++ goto exit;
5076 ++ }
5077 ++ } else {
5078 ++ tcptw->mptcp_tw = NULL;
5079 ++ }
5080 ++
5081 + #if IS_ENABLED(CONFIG_IPV6)
5082 + if (tw->tw_family == PF_INET6) {
5083 + struct ipv6_pinfo *np = inet6_sk(sk);
5084 +@@ -347,15 +376,18 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
5085 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
5086 + }
5087 +
5088 ++exit:
5089 + tcp_update_metrics(sk);
5090 + tcp_done(sk);
5091 + }
5092 +
5093 + void tcp_twsk_destructor(struct sock *sk)
5094 + {
5095 +-#ifdef CONFIG_TCP_MD5SIG
5096 + struct tcp_timewait_sock *twsk = tcp_twsk(sk);
5097 +
5098 ++ if (twsk->mptcp_tw)
5099 ++ mptcp_twsk_destructor(twsk);
5100 ++#ifdef CONFIG_TCP_MD5SIG
5101 + if (twsk->tw_md5_key)
5102 + kfree_rcu(twsk->tw_md5_key, rcu);
5103 + #endif
5104 +@@ -382,13 +414,14 @@ void tcp_openreq_init_rwin(struct request_sock *req,
5105 + req->window_clamp = tcp_full_space(sk);
5106 +
5107 + /* tcp_full_space because it is guaranteed to be the first packet */
5108 +- tcp_select_initial_window(tcp_full_space(sk),
5109 +- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
5110 ++ tp->ops->select_initial_window(tcp_full_space(sk),
5111 ++ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) -
5112 ++ (ireq->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0),
5113 + &req->rcv_wnd,
5114 + &req->window_clamp,
5115 + ireq->wscale_ok,
5116 + &rcv_wscale,
5117 +- dst_metric(dst, RTAX_INITRWND));
5118 ++ dst_metric(dst, RTAX_INITRWND), sk);
5119 + ireq->rcv_wscale = rcv_wscale;
5120 + }
5121 + EXPORT_SYMBOL(tcp_openreq_init_rwin);
5122 +@@ -499,6 +532,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
5123 + newtp->rx_opt.ts_recent_stamp = 0;
5124 + newtp->tcp_header_len = sizeof(struct tcphdr);
5125 + }
5126 ++ if (ireq->saw_mpc)
5127 ++ newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
5128 + newtp->tsoffset = 0;
5129 + #ifdef CONFIG_TCP_MD5SIG
5130 + newtp->md5sig_info = NULL; /*XXX*/
5131 +@@ -535,16 +570,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
5132 + bool fastopen)
5133 + {
5134 + struct tcp_options_received tmp_opt;
5135 ++ struct mptcp_options_received mopt;
5136 + struct sock *child;
5137 + const struct tcphdr *th = tcp_hdr(skb);
5138 + __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
5139 + bool paws_reject = false;
5140 +
5141 +- BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
5142 ++ BUG_ON(!mptcp(tcp_sk(sk)) && fastopen == (sk->sk_state == TCP_LISTEN));
5143 +
5144 + tmp_opt.saw_tstamp = 0;
5145 ++
5146 ++ mptcp_init_mp_opt(&mopt);
5147 ++
5148 + if (th->doff > (sizeof(struct tcphdr)>>2)) {
5149 +- tcp_parse_options(skb, &tmp_opt, 0, NULL);
5150 ++ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
5151 +
5152 + if (tmp_opt.saw_tstamp) {
5153 + tmp_opt.ts_recent = req->ts_recent;
5154 +@@ -583,7 +622,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
5155 + *
5156 + * Reset timer after retransmitting SYNACK, similar to
5157 + * the idea of fast retransmit in recovery.
5158 ++ *
5159 ++ * Fall back to TCP if MP_CAPABLE is not set.
5160 + */
5161 ++
5162 ++ if (inet_rsk(req)->saw_mpc && !mopt.saw_mpc)
5163 ++ inet_rsk(req)->saw_mpc = false;
5164 ++
5165 ++
5166 + if (!inet_rtx_syn_ack(sk, req))
5167 + req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
5168 + TCP_RTO_MAX) + jiffies;
5169 +@@ -718,9 +764,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
5170 + * socket is created, wait for troubles.
5171 + */
5172 + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
5173 ++
5174 + if (child == NULL)
5175 + goto listen_overflow;
5176 +
5177 ++ if (!is_meta_sk(sk)) {
5178 ++ int ret = mptcp_check_req_master(sk, child, req, prev);
5179 ++ if (ret < 0)
5180 ++ goto listen_overflow;
5181 ++
5182 ++ /* MPTCP-supported */
5183 ++ if (!ret)
5184 ++ return tcp_sk(child)->mpcb->master_sk;
5185 ++ } else {
5186 ++ return mptcp_check_req_child(sk, child, req, prev, &mopt);
5187 ++ }
5188 + inet_csk_reqsk_queue_unlink(sk, req, prev);
5189 + inet_csk_reqsk_queue_removed(sk, req);
5190 +
5191 +@@ -746,7 +804,17 @@ embryonic_reset:
5192 + tcp_reset(sk);
5193 + }
5194 + if (!fastopen) {
5195 +- inet_csk_reqsk_queue_drop(sk, req, prev);
5196 ++ if (is_meta_sk(sk)) {
5197 ++		if (is_meta_sk(sk)) {
5198 ++ * avoid ending up in inet_csk_reqsk_queue_removed ...
5199 ++ */
5200 ++ inet_csk_reqsk_queue_unlink(sk, req, prev);
5201 ++ if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0)
5202 ++ mptcp_delete_synack_timer(sk);
5203 ++ reqsk_free(req);
5204 ++ } else {
5205 ++ inet_csk_reqsk_queue_drop(sk, req, prev);
5206 ++ }
5207 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
5208 + }
5209 + return NULL;
5210 +@@ -770,8 +838,9 @@ int tcp_child_process(struct sock *parent, struct sock *child,
5211 + {
5212 + int ret = 0;
5213 + int state = child->sk_state;
5214 ++ struct sock *meta_sk = mptcp(tcp_sk(child)) ? mptcp_meta_sk(child) : child;
5215 +
5216 +- if (!sock_owned_by_user(child)) {
5217 ++ if (!sock_owned_by_user(meta_sk)) {
5218 + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
5219 + skb->len);
5220 + /* Wakeup parent, send SIGIO */
5221 +@@ -782,10 +851,14 @@ int tcp_child_process(struct sock *parent, struct sock *child,
5222 + * in main socket hash table and lock on listening
5223 + * socket does not protect us more.
5224 + */
5225 +- __sk_add_backlog(child, skb);
5226 ++ if (mptcp(tcp_sk(child)))
5227 ++ skb->sk = child;
5228 ++ __sk_add_backlog(meta_sk, skb);
5229 + }
5230 +
5231 +- bh_unlock_sock(child);
5232 ++ if (mptcp(tcp_sk(child)))
5233 ++ bh_unlock_sock(child);
5234 ++ bh_unlock_sock(meta_sk);
5235 + sock_put(child);
5236 + return ret;
5237 + }
5238 +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
5239 +index 179b51e6bda3..efd31b6c5784 100644
5240 +--- a/net/ipv4/tcp_output.c
5241 ++++ b/net/ipv4/tcp_output.c
5242 +@@ -36,6 +36,12 @@
5243 +
5244 + #define pr_fmt(fmt) "TCP: " fmt
5245 +
5246 ++#include <net/mptcp.h>
5247 ++#include <net/mptcp_v4.h>
5248 ++#if IS_ENABLED(CONFIG_IPV6)
5249 ++#include <net/mptcp_v6.h>
5250 ++#endif
5251 ++#include <net/ipv6.h>
5252 + #include <net/tcp.h>
5253 +
5254 + #include <linux/compiler.h>
5255 +@@ -68,11 +74,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
5256 + unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
5257 + EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
5258 +
5259 +-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5260 +- int push_one, gfp_t gfp);
5261 +-
5262 + /* Account for new data that has been sent to the network. */
5263 +-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
5264 ++void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
5265 + {
5266 + struct inet_connection_sock *icsk = inet_csk(sk);
5267 + struct tcp_sock *tp = tcp_sk(sk);
5268 +@@ -214,7 +217,7 @@ u32 tcp_default_init_rwnd(u32 mss)
5269 + void tcp_select_initial_window(int __space, __u32 mss,
5270 + __u32 *rcv_wnd, __u32 *window_clamp,
5271 + int wscale_ok, __u8 *rcv_wscale,
5272 +- __u32 init_rcv_wnd)
5273 ++ __u32 init_rcv_wnd, const struct sock *sk)
5274 + {
5275 + unsigned int space = (__space < 0 ? 0 : __space);
5276 +
5277 +@@ -269,12 +272,16 @@ EXPORT_SYMBOL(tcp_select_initial_window);
5278 + * value can be stuffed directly into th->window for an outgoing
5279 + * frame.
5280 + */
5281 +-static u16 tcp_select_window(struct sock *sk)
5282 ++u16 tcp_select_window(struct sock *sk)
5283 + {
5284 + struct tcp_sock *tp = tcp_sk(sk);
5285 + u32 old_win = tp->rcv_wnd;
5286 +- u32 cur_win = tcp_receive_window(tp);
5287 +- u32 new_win = __tcp_select_window(sk);
5288 ++	/* The window must never shrink at the meta-level. At the subflow-level
5289 ++	 * we have to allow this. Otherwise we may announce a window too large
5290 ++ * for the current meta-level sk_rcvbuf.
5291 ++ */
5292 ++ u32 cur_win = tcp_receive_window(mptcp(tp) ? tcp_sk(mptcp_meta_sk(sk)) : tp);
5293 ++ u32 new_win = tp->ops->__select_window(sk);
5294 +
5295 + /* Never shrink the offered window */
5296 + if (new_win < cur_win) {
5297 +@@ -290,6 +297,7 @@ static u16 tcp_select_window(struct sock *sk)
5298 + LINUX_MIB_TCPWANTZEROWINDOWADV);
5299 + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
5300 + }
5301 ++
5302 + tp->rcv_wnd = new_win;
5303 + tp->rcv_wup = tp->rcv_nxt;
5304 +
5305 +@@ -374,7 +382,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
5306 + /* Constructs common control bits of non-data skb. If SYN/FIN is present,
5307 + * auto increment end seqno.
5308 + */
5309 +-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
5310 ++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
5311 + {
5312 + struct skb_shared_info *shinfo = skb_shinfo(skb);
5313 +
5314 +@@ -394,7 +402,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
5315 + TCP_SKB_CB(skb)->end_seq = seq;
5316 + }
5317 +
5318 +-static inline bool tcp_urg_mode(const struct tcp_sock *tp)
5319 ++bool tcp_urg_mode(const struct tcp_sock *tp)
5320 + {
5321 + return tp->snd_una != tp->snd_up;
5322 + }
5323 +@@ -404,17 +412,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
5324 + #define OPTION_MD5 (1 << 2)
5325 + #define OPTION_WSCALE (1 << 3)
5326 + #define OPTION_FAST_OPEN_COOKIE (1 << 8)
5327 +-
5328 +-struct tcp_out_options {
5329 +- u16 options; /* bit field of OPTION_* */
5330 +- u16 mss; /* 0 to disable */
5331 +- u8 ws; /* window scale, 0 to disable */
5332 +- u8 num_sack_blocks; /* number of SACK blocks to include */
5333 +- u8 hash_size; /* bytes in hash_location */
5334 +- __u8 *hash_location; /* temporary pointer, overloaded */
5335 +- __u32 tsval, tsecr; /* need to include OPTION_TS */
5336 +- struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
5337 +-};
5338 ++/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */
5339 +
5340 + /* Write previously computed TCP options to the packet.
5341 + *
5342 +@@ -430,7 +428,7 @@ struct tcp_out_options {
5343 + * (but it may well be that other scenarios fail similarly).
5344 + */
5345 + static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
5346 +- struct tcp_out_options *opts)
5347 ++ struct tcp_out_options *opts, struct sk_buff *skb)
5348 + {
5349 + u16 options = opts->options; /* mungable copy */
5350 +
5351 +@@ -513,6 +511,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
5352 + }
5353 + ptr += (foc->len + 3) >> 2;
5354 + }
5355 ++
5356 ++ if (unlikely(OPTION_MPTCP & opts->options))
5357 ++ mptcp_options_write(ptr, tp, opts, skb);
5358 + }
5359 +
5360 + /* Compute TCP options for SYN packets. This is not the final
5361 +@@ -564,6 +565,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
5362 + if (unlikely(!(OPTION_TS & opts->options)))
5363 + remaining -= TCPOLEN_SACKPERM_ALIGNED;
5364 + }
5365 ++ if (tp->request_mptcp || mptcp(tp))
5366 ++ mptcp_syn_options(sk, opts, &remaining);
5367 +
5368 + if (fastopen && fastopen->cookie.len >= 0) {
5369 + u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
5370 +@@ -637,6 +640,9 @@ static unsigned int tcp_synack_options(struct sock *sk,
5371 + }
5372 + }
5373 +
5374 ++ if (ireq->saw_mpc)
5375 ++ mptcp_synack_options(req, opts, &remaining);
5376 ++
5377 + return MAX_TCP_OPTION_SPACE - remaining;
5378 + }
5379 +
5380 +@@ -670,16 +676,22 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
5381 + opts->tsecr = tp->rx_opt.ts_recent;
5382 + size += TCPOLEN_TSTAMP_ALIGNED;
5383 + }
5384 ++ if (mptcp(tp))
5385 ++ mptcp_established_options(sk, skb, opts, &size);
5386 +
5387 + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
5388 + if (unlikely(eff_sacks)) {
5389 +- const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
5390 +- opts->num_sack_blocks =
5391 +- min_t(unsigned int, eff_sacks,
5392 +- (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
5393 +- TCPOLEN_SACK_PERBLOCK);
5394 +- size += TCPOLEN_SACK_BASE_ALIGNED +
5395 +- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
5396 ++ const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
5397 ++ if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
5398 ++ opts->num_sack_blocks = 0;
5399 ++ else
5400 ++ opts->num_sack_blocks =
5401 ++ min_t(unsigned int, eff_sacks,
5402 ++ (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
5403 ++ TCPOLEN_SACK_PERBLOCK);
5404 ++ if (opts->num_sack_blocks)
5405 ++ size += TCPOLEN_SACK_BASE_ALIGNED +
5406 ++ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
5407 + }
5408 +
5409 + return size;
5410 +@@ -711,8 +723,8 @@ static void tcp_tsq_handler(struct sock *sk)
5411 + if ((1 << sk->sk_state) &
5412 + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
5413 + TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
5414 +- tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
5415 +- 0, GFP_ATOMIC);
5416 ++ tcp_sk(sk)->ops->write_xmit(sk, tcp_current_mss(sk),
5417 ++ tcp_sk(sk)->nonagle, 0, GFP_ATOMIC);
5418 + }
5419 + /*
5420 + * One tasklet per cpu tries to send more skbs.
5421 +@@ -727,7 +739,7 @@ static void tcp_tasklet_func(unsigned long data)
5422 + unsigned long flags;
5423 + struct list_head *q, *n;
5424 + struct tcp_sock *tp;
5425 +- struct sock *sk;
5426 ++ struct sock *sk, *meta_sk;
5427 +
5428 + local_irq_save(flags);
5429 + list_splice_init(&tsq->head, &list);
5430 +@@ -738,15 +750,25 @@ static void tcp_tasklet_func(unsigned long data)
5431 + list_del(&tp->tsq_node);
5432 +
5433 + sk = (struct sock *)tp;
5434 +- bh_lock_sock(sk);
5435 ++ meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
5436 ++ bh_lock_sock(meta_sk);
5437 +
5438 +- if (!sock_owned_by_user(sk)) {
5439 ++ if (!sock_owned_by_user(meta_sk)) {
5440 + tcp_tsq_handler(sk);
5441 ++ if (mptcp(tp))
5442 ++ tcp_tsq_handler(meta_sk);
5443 + } else {
5444 ++ if (mptcp(tp) && sk->sk_state == TCP_CLOSE)
5445 ++ goto exit;
5446 ++
5447 + /* defer the work to tcp_release_cb() */
5448 + set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
5449 ++
5450 ++ if (mptcp(tp))
5451 ++ mptcp_tsq_flags(sk);
5452 + }
5453 +- bh_unlock_sock(sk);
5454 ++exit:
5455 ++ bh_unlock_sock(meta_sk);
5456 +
5457 + clear_bit(TSQ_QUEUED, &tp->tsq_flags);
5458 + sk_free(sk);
5459 +@@ -756,7 +778,10 @@ static void tcp_tasklet_func(unsigned long data)
5460 + #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
5461 + (1UL << TCP_WRITE_TIMER_DEFERRED) | \
5462 + (1UL << TCP_DELACK_TIMER_DEFERRED) | \
5463 +- (1UL << TCP_MTU_REDUCED_DEFERRED))
5464 ++ (1UL << TCP_MTU_REDUCED_DEFERRED) | \
5465 ++ (1UL << MPTCP_PATH_MANAGER) | \
5466 ++ (1UL << MPTCP_SUB_DEFERRED))
5467 ++
5468 + /**
5469 + * tcp_release_cb - tcp release_sock() callback
5470 + * @sk: socket
5471 +@@ -803,6 +828,13 @@ void tcp_release_cb(struct sock *sk)
5472 + sk->sk_prot->mtu_reduced(sk);
5473 + __sock_put(sk);
5474 + }
5475 ++ if (flags & (1UL << MPTCP_PATH_MANAGER)) {
5476 ++ if (tcp_sk(sk)->mpcb->pm_ops->release_sock)
5477 ++ tcp_sk(sk)->mpcb->pm_ops->release_sock(sk);
5478 ++ __sock_put(sk);
5479 ++ }
5480 ++ if (flags & (1UL << MPTCP_SUB_DEFERRED))
5481 ++ mptcp_tsq_sub_deferred(sk);
5482 + }
5483 + EXPORT_SYMBOL(tcp_release_cb);
5484 +
5485 +@@ -862,8 +894,8 @@ void tcp_wfree(struct sk_buff *skb)
5486 + * We are working here with either a clone of the original
5487 + * SKB, or a fresh unique copy made by the retransmit engine.
5488 + */
5489 +-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5490 +- gfp_t gfp_mask)
5491 ++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5492 ++ gfp_t gfp_mask)
5493 + {
5494 + const struct inet_connection_sock *icsk = inet_csk(sk);
5495 + struct inet_sock *inet;
5496 +@@ -933,7 +965,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5497 + */
5498 + th->window = htons(min(tp->rcv_wnd, 65535U));
5499 + } else {
5500 +- th->window = htons(tcp_select_window(sk));
5501 ++ th->window = htons(tp->ops->select_window(sk));
5502 + }
5503 + th->check = 0;
5504 + th->urg_ptr = 0;
5505 +@@ -949,7 +981,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5506 + }
5507 + }
5508 +
5509 +- tcp_options_write((__be32 *)(th + 1), tp, &opts);
5510 ++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
5511 + if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
5512 + TCP_ECN_send(sk, skb, tcp_header_size);
5513 +
5514 +@@ -988,7 +1020,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5515 + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
5516 + * otherwise socket can stall.
5517 + */
5518 +-static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
5519 ++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
5520 + {
5521 + struct tcp_sock *tp = tcp_sk(sk);
5522 +
5523 +@@ -1001,15 +1033,16 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
5524 + }
5525 +
5526 + /* Initialize TSO segments for a packet. */
5527 +-static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
5528 +- unsigned int mss_now)
5529 ++void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
5530 ++ unsigned int mss_now)
5531 + {
5532 + struct skb_shared_info *shinfo = skb_shinfo(skb);
5533 +
5534 + /* Make sure we own this skb before messing gso_size/gso_segs */
5535 + WARN_ON_ONCE(skb_cloned(skb));
5536 +
5537 +- if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
5538 ++ if (skb->len <= mss_now || (is_meta_sk(sk) && !mptcp_sk_can_gso(sk)) ||
5539 ++ (!is_meta_sk(sk) && !sk_can_gso(sk)) || skb->ip_summed == CHECKSUM_NONE) {
5540 + /* Avoid the costly divide in the normal
5541 + * non-TSO case.
5542 + */
5543 +@@ -1041,7 +1074,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
5544 + /* Pcount in the middle of the write queue got changed, we need to do various
5545 + * tweaks to fix counters
5546 + */
5547 +-static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
5548 ++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
5549 + {
5550 + struct tcp_sock *tp = tcp_sk(sk);
5551 +
5552 +@@ -1164,7 +1197,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
5553 + * eventually). The difference is that pulled data not copied, but
5554 + * immediately discarded.
5555 + */
5556 +-static void __pskb_trim_head(struct sk_buff *skb, int len)
5557 ++void __pskb_trim_head(struct sk_buff *skb, int len)
5558 + {
5559 + struct skb_shared_info *shinfo;
5560 + int i, k, eat;
5561 +@@ -1205,6 +1238,9 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
5562 + /* Remove acked data from a packet in the transmit queue. */
5563 + int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
5564 + {
5565 ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk) && mptcp_is_data_seq(skb))
5566 ++ return mptcp_trim_head(sk, skb, len);
5567 ++
5568 + if (skb_unclone(skb, GFP_ATOMIC))
5569 + return -ENOMEM;
5570 +
5571 +@@ -1222,6 +1258,15 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
5572 + if (tcp_skb_pcount(skb) > 1)
5573 + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
5574 +
5575 ++#ifdef CONFIG_MPTCP
5576 ++ /* Some data got acked - we assume that the seq-number reached the dest.
5577 ++ * Anyway, our MPTCP-option has been trimmed above - we lost it here.
5578 ++ * Only remove the SEQ if the call does not come from a meta retransmit.
5579 ++ */
5580 ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
5581 ++ TCP_SKB_CB(skb)->mptcp_flags &= ~MPTCPHDR_SEQ;
5582 ++#endif
5583 ++
5584 + return 0;
5585 + }
5586 +
5587 +@@ -1379,6 +1424,7 @@ unsigned int tcp_current_mss(struct sock *sk)
5588 +
5589 + return mss_now;
5590 + }
5591 ++EXPORT_SYMBOL(tcp_current_mss);
5592 +
5593 + /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
5594 + * As additional protections, we do not touch cwnd in retransmission phases,
5595 +@@ -1446,8 +1492,8 @@ static bool tcp_minshall_check(const struct tcp_sock *tp)
5596 + * But we can avoid doing the divide again given we already have
5597 + * skb_pcount = skb->len / mss_now
5598 + */
5599 +-static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
5600 +- const struct sk_buff *skb)
5601 ++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
5602 ++ const struct sk_buff *skb)
5603 + {
5604 + if (skb->len < tcp_skb_pcount(skb) * mss_now)
5605 + tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
5606 +@@ -1468,11 +1514,11 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
5607 + (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
5608 + }
5609 + /* Returns the portion of skb which can be sent right away */
5610 +-static unsigned int tcp_mss_split_point(const struct sock *sk,
5611 +- const struct sk_buff *skb,
5612 +- unsigned int mss_now,
5613 +- unsigned int max_segs,
5614 +- int nonagle)
5615 ++unsigned int tcp_mss_split_point(const struct sock *sk,
5616 ++ const struct sk_buff *skb,
5617 ++ unsigned int mss_now,
5618 ++ unsigned int max_segs,
5619 ++ int nonagle)
5620 + {
5621 + const struct tcp_sock *tp = tcp_sk(sk);
5622 + u32 partial, needed, window, max_len;
5623 +@@ -1502,13 +1548,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
5624 + /* Can at least one segment of SKB be sent right now, according to the
5625 + * congestion window rules? If so, return how many segments are allowed.
5626 + */
5627 +-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
5628 +- const struct sk_buff *skb)
5629 ++unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
5630 ++ const struct sk_buff *skb)
5631 + {
5632 + u32 in_flight, cwnd;
5633 +
5634 + /* Don't be strict about the congestion window for the final FIN. */
5635 +- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
5636 ++ if (skb &&
5637 ++ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
5638 + tcp_skb_pcount(skb) == 1)
5639 + return 1;
5640 +
5641 +@@ -1524,8 +1571,8 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
5642 + * This must be invoked the first time we consider transmitting
5643 + * SKB onto the wire.
5644 + */
5645 +-static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
5646 +- unsigned int mss_now)
5647 ++int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
5648 ++ unsigned int mss_now)
5649 + {
5650 + int tso_segs = tcp_skb_pcount(skb);
5651 +
5652 +@@ -1540,8 +1587,8 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
5653 + /* Return true if the Nagle test allows this packet to be
5654 + * sent now.
5655 + */
5656 +-static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
5657 +- unsigned int cur_mss, int nonagle)
5658 ++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
5659 ++ unsigned int cur_mss, int nonagle)
5660 + {
5661 + /* Nagle rule does not apply to frames, which sit in the middle of the
5662 + * write_queue (they have no chances to get new data).
5663 +@@ -1553,7 +1600,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
5664 + return true;
5665 +
5666 + /* Don't use the nagle rule for urgent data (or for the final FIN). */
5667 +- if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
5668 ++ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
5669 ++ mptcp_is_data_fin(skb))
5670 + return true;
5671 +
5672 + if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
5673 +@@ -1563,9 +1611,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
5674 + }
5675 +
5676 + /* Does at least the first segment of SKB fit into the send window? */
5677 +-static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
5678 +- const struct sk_buff *skb,
5679 +- unsigned int cur_mss)
5680 ++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
5681 ++ unsigned int cur_mss)
5682 + {
5683 + u32 end_seq = TCP_SKB_CB(skb)->end_seq;
5684 +
5685 +@@ -1676,7 +1723,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
5686 + u32 send_win, cong_win, limit, in_flight;
5687 + int win_divisor;
5688 +
5689 +- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
5690 ++ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb))
5691 + goto send_now;
5692 +
5693 + if (icsk->icsk_ca_state != TCP_CA_Open)
5694 +@@ -1888,7 +1935,7 @@ static int tcp_mtu_probe(struct sock *sk)
5695 + * Returns true, if no segments are in flight and we have queued segments,
5696 + * but cannot send anything now because of SWS or another problem.
5697 + */
5698 +-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5699 ++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5700 + int push_one, gfp_t gfp)
5701 + {
5702 + struct tcp_sock *tp = tcp_sk(sk);
5703 +@@ -1900,7 +1947,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5704 +
5705 + sent_pkts = 0;
5706 +
5707 +- if (!push_one) {
5708 ++	/* pmtu not yet supported with MPTCP. Should be possible by exiting
5709 ++	 * the loop inside tcp_mtu_probe early, making sure that only a
5710 ++	 * single DSS-mapping gets probed.
5711 ++ */
5712 ++ if (!push_one && !mptcp(tp)) {
5713 + /* Do MTU probing. */
5714 + result = tcp_mtu_probe(sk);
5715 + if (!result) {
5716 +@@ -2099,7 +2150,8 @@ void tcp_send_loss_probe(struct sock *sk)
5717 + int err = -1;
5718 +
5719 + if (tcp_send_head(sk) != NULL) {
5720 +- err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
5721 ++ err = tp->ops->write_xmit(sk, mss, TCP_NAGLE_OFF, 2,
5722 ++ GFP_ATOMIC);
5723 + goto rearm_timer;
5724 + }
5725 +
5726 +@@ -2159,8 +2211,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
5727 + if (unlikely(sk->sk_state == TCP_CLOSE))
5728 + return;
5729 +
5730 +- if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
5731 +- sk_gfp_atomic(sk, GFP_ATOMIC)))
5732 ++ if (tcp_sk(sk)->ops->write_xmit(sk, cur_mss, nonagle, 0,
5733 ++ sk_gfp_atomic(sk, GFP_ATOMIC)))
5734 + tcp_check_probe_timer(sk);
5735 + }
5736 +
5737 +@@ -2173,7 +2225,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
5738 +
5739 + BUG_ON(!skb || skb->len < mss_now);
5740 +
5741 +- tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
5742 ++ tcp_sk(sk)->ops->write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1,
5743 ++ sk->sk_allocation);
5744 + }
5745 +
5746 + /* This function returns the amount that we can raise the
5747 +@@ -2386,6 +2439,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
5748 + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
5749 + return;
5750 +
5751 ++ /* Currently not supported for MPTCP - but it should be possible */
5752 ++ if (mptcp(tp))
5753 ++ return;
5754 ++
5755 + tcp_for_write_queue_from_safe(skb, tmp, sk) {
5756 + if (!tcp_can_collapse(sk, skb))
5757 + break;
5758 +@@ -2843,7 +2900,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
5759 +
5760 + /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
5761 + th->window = htons(min(req->rcv_wnd, 65535U));
5762 +- tcp_options_write((__be32 *)(th + 1), tp, &opts);
5763 ++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
5764 + th->doff = (tcp_header_size >> 2);
5765 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
5766 +
5767 +@@ -2897,13 +2954,13 @@ static void tcp_connect_init(struct sock *sk)
5768 + (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
5769 + tp->window_clamp = tcp_full_space(sk);
5770 +
5771 +- tcp_select_initial_window(tcp_full_space(sk),
5772 +- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
5773 +- &tp->rcv_wnd,
5774 +- &tp->window_clamp,
5775 +- sysctl_tcp_window_scaling,
5776 +- &rcv_wscale,
5777 +- dst_metric(dst, RTAX_INITRWND));
5778 ++ tp->ops->select_initial_window(tcp_full_space(sk),
5779 ++ tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
5780 ++ &tp->rcv_wnd,
5781 ++ &tp->window_clamp,
5782 ++ sysctl_tcp_window_scaling,
5783 ++ &rcv_wscale,
5784 ++ dst_metric(dst, RTAX_INITRWND), sk);
5785 +
5786 + tp->rx_opt.rcv_wscale = rcv_wscale;
5787 + tp->rcv_ssthresh = tp->rcv_wnd;
5788 +@@ -2927,6 +2984,36 @@ static void tcp_connect_init(struct sock *sk)
5789 + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
5790 + inet_csk(sk)->icsk_retransmits = 0;
5791 + tcp_clear_retrans(tp);
5792 ++
5793 ++#ifdef CONFIG_MPTCP
5794 ++ if (sysctl_mptcp_enabled && mptcp_doit(sk)) {
5795 ++ if (is_master_tp(tp)) {
5796 ++ tp->request_mptcp = 1;
5797 ++ mptcp_connect_init(sk);
5798 ++ } else if (tp->mptcp) {
5799 ++ struct inet_sock *inet = inet_sk(sk);
5800 ++
5801 ++ tp->mptcp->snt_isn = tp->write_seq;
5802 ++ tp->mptcp->init_rcv_wnd = tp->rcv_wnd;
5803 ++
5804 ++ /* Set nonce for new subflows */
5805 ++ if (sk->sk_family == AF_INET)
5806 ++ tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(
5807 ++ inet->inet_saddr,
5808 ++ inet->inet_daddr,
5809 ++ inet->inet_sport,
5810 ++ inet->inet_dport);
5811 ++#if IS_ENABLED(CONFIG_IPV6)
5812 ++ else
5813 ++ tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(
5814 ++ inet6_sk(sk)->saddr.s6_addr32,
5815 ++ sk->sk_v6_daddr.s6_addr32,
5816 ++ inet->inet_sport,
5817 ++ inet->inet_dport);
5818 ++#endif
5819 ++ }
5820 ++ }
5821 ++#endif
5822 + }
5823 +
5824 + static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
5825 +@@ -3176,6 +3263,7 @@ void tcp_send_ack(struct sock *sk)
5826 + TCP_SKB_CB(buff)->when = tcp_time_stamp;
5827 + tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
5828 + }
5829 ++EXPORT_SYMBOL(tcp_send_ack);
5830 +
5831 + /* This routine sends a packet with an out of date sequence
5832 + * number. It assumes the other end will try to ack it.
5833 +@@ -3188,7 +3276,7 @@ void tcp_send_ack(struct sock *sk)
5834 + * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
5835 + * out-of-date with SND.UNA-1 to probe window.
5836 + */
5837 +-static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
5838 ++int tcp_xmit_probe_skb(struct sock *sk, int urgent)
5839 + {
5840 + struct tcp_sock *tp = tcp_sk(sk);
5841 + struct sk_buff *skb;
5842 +@@ -3270,7 +3358,7 @@ void tcp_send_probe0(struct sock *sk)
5843 + struct tcp_sock *tp = tcp_sk(sk);
5844 + int err;
5845 +
5846 +- err = tcp_write_wakeup(sk);
5847 ++ err = tp->ops->write_wakeup(sk);
5848 +
5849 + if (tp->packets_out || !tcp_send_head(sk)) {
5850 + /* Cancel probe timer, if it is not required. */
5851 +@@ -3301,3 +3389,18 @@ void tcp_send_probe0(struct sock *sk)
5852 + TCP_RTO_MAX);
5853 + }
5854 + }
5855 ++
5856 ++int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
5857 ++{
5858 ++ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
5859 ++ struct flowi fl;
5860 ++ int res;
5861 ++
5862 ++ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
5863 ++ if (!res) {
5864 ++ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
5865 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
5866 ++ }
5867 ++ return res;
5868 ++}
5869 ++EXPORT_SYMBOL(tcp_rtx_synack);
5870 +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
5871 +index 286227abed10..966b873cbf3e 100644
5872 +--- a/net/ipv4/tcp_timer.c
5873 ++++ b/net/ipv4/tcp_timer.c
5874 +@@ -20,6 +20,7 @@
5875 +
5876 + #include <linux/module.h>
5877 + #include <linux/gfp.h>
5878 ++#include <net/mptcp.h>
5879 + #include <net/tcp.h>
5880 +
5881 + int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
5882 +@@ -32,7 +33,7 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
5883 + int sysctl_tcp_orphan_retries __read_mostly;
5884 + int sysctl_tcp_thin_linear_timeouts __read_mostly;
5885 +
5886 +-static void tcp_write_err(struct sock *sk)
5887 ++void tcp_write_err(struct sock *sk)
5888 + {
5889 + sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
5890 + sk->sk_error_report(sk);
5891 +@@ -74,7 +75,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
5892 + (!tp->snd_wnd && !tp->packets_out))
5893 + do_reset = 1;
5894 + if (do_reset)
5895 +- tcp_send_active_reset(sk, GFP_ATOMIC);
5896 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
5897 + tcp_done(sk);
5898 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
5899 + return 1;
5900 +@@ -124,10 +125,8 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
5901 + * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
5902 + * syn_set flag is set.
5903 + */
5904 +-static bool retransmits_timed_out(struct sock *sk,
5905 +- unsigned int boundary,
5906 +- unsigned int timeout,
5907 +- bool syn_set)
5908 ++bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
5909 ++ unsigned int timeout, bool syn_set)
5910 + {
5911 + unsigned int linear_backoff_thresh, start_ts;
5912 + unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
5913 +@@ -153,7 +152,7 @@ static bool retransmits_timed_out(struct sock *sk,
5914 + }
5915 +
5916 + /* A write timeout has occurred. Process the after effects. */
5917 +-static int tcp_write_timeout(struct sock *sk)
5918 ++int tcp_write_timeout(struct sock *sk)
5919 + {
5920 + struct inet_connection_sock *icsk = inet_csk(sk);
5921 + struct tcp_sock *tp = tcp_sk(sk);
5922 +@@ -171,6 +170,10 @@ static int tcp_write_timeout(struct sock *sk)
5923 + }
5924 + retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
5925 + syn_set = true;
5926 ++ /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */
5927 ++ if (tcp_sk(sk)->request_mptcp &&
5928 ++ icsk->icsk_retransmits >= mptcp_sysctl_syn_retries())
5929 ++ tcp_sk(sk)->request_mptcp = 0;
5930 + } else {
5931 + if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
5932 + /* Black hole detection */
5933 +@@ -251,18 +254,22 @@ out:
5934 + static void tcp_delack_timer(unsigned long data)
5935 + {
5936 + struct sock *sk = (struct sock *)data;
5937 ++ struct tcp_sock *tp = tcp_sk(sk);
5938 ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
5939 +
5940 +- bh_lock_sock(sk);
5941 +- if (!sock_owned_by_user(sk)) {
5942 ++ bh_lock_sock(meta_sk);
5943 ++ if (!sock_owned_by_user(meta_sk)) {
5944 + tcp_delack_timer_handler(sk);
5945 + } else {
5946 + inet_csk(sk)->icsk_ack.blocked = 1;
5947 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
5948 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED);
5949 +		/* delegate our work to tcp_release_cb() */
5950 + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
5951 + sock_hold(sk);
5952 ++ if (mptcp(tp))
5953 ++ mptcp_tsq_flags(sk);
5954 + }
5955 +- bh_unlock_sock(sk);
5956 ++ bh_unlock_sock(meta_sk);
5957 + sock_put(sk);
5958 + }
5959 +
5960 +@@ -479,6 +486,10 @@ out_reset_timer:
5961 + __sk_dst_reset(sk);
5962 +
5963 + out:;
5964 ++ if (mptcp(tp)) {
5965 ++ mptcp_reinject_data(sk, 1);
5966 ++ mptcp_set_rto(sk);
5967 ++ }
5968 + }
5969 +
5970 + void tcp_write_timer_handler(struct sock *sk)
5971 +@@ -505,7 +516,7 @@ void tcp_write_timer_handler(struct sock *sk)
5972 + break;
5973 + case ICSK_TIME_RETRANS:
5974 + icsk->icsk_pending = 0;
5975 +- tcp_retransmit_timer(sk);
5976 ++ tcp_sk(sk)->ops->retransmit_timer(sk);
5977 + break;
5978 + case ICSK_TIME_PROBE0:
5979 + icsk->icsk_pending = 0;
5980 +@@ -520,16 +531,19 @@ out:
5981 + static void tcp_write_timer(unsigned long data)
5982 + {
5983 + struct sock *sk = (struct sock *)data;
5984 ++ struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;
5985 +
5986 +- bh_lock_sock(sk);
5987 +- if (!sock_owned_by_user(sk)) {
5988 ++ bh_lock_sock(meta_sk);
5989 ++ if (!sock_owned_by_user(meta_sk)) {
5990 + tcp_write_timer_handler(sk);
5991 + } else {
5992 +		/* delegate our work to tcp_release_cb() */
5993 + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
5994 + sock_hold(sk);
5995 ++ if (mptcp(tcp_sk(sk)))
5996 ++ mptcp_tsq_flags(sk);
5997 + }
5998 +- bh_unlock_sock(sk);
5999 ++ bh_unlock_sock(meta_sk);
6000 + sock_put(sk);
6001 + }
6002 +
6003 +@@ -566,11 +580,12 @@ static void tcp_keepalive_timer (unsigned long data)
6004 + struct sock *sk = (struct sock *) data;
6005 + struct inet_connection_sock *icsk = inet_csk(sk);
6006 + struct tcp_sock *tp = tcp_sk(sk);
6007 ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
6008 + u32 elapsed;
6009 +
6010 + /* Only process if socket is not in use. */
6011 +- bh_lock_sock(sk);
6012 +- if (sock_owned_by_user(sk)) {
6013 ++ bh_lock_sock(meta_sk);
6014 ++ if (sock_owned_by_user(meta_sk)) {
6015 + /* Try again later. */
6016 + inet_csk_reset_keepalive_timer (sk, HZ/20);
6017 + goto out;
6018 +@@ -581,16 +596,38 @@ static void tcp_keepalive_timer (unsigned long data)
6019 + goto out;
6020 + }
6021 +
6022 ++ if (tp->send_mp_fclose) {
6023 ++ /* MUST do this before tcp_write_timeout, because retrans_stamp
6024 ++ * may have been set to 0 in another part while we are
6025 ++ * retransmitting MP_FASTCLOSE. Then, we would crash, because
6026 ++ * retransmits_timed_out accesses the meta-write-queue.
6027 ++ *
6028 ++ * We make sure that the timestamp is != 0.
6029 ++ */
6030 ++ if (!tp->retrans_stamp)
6031 ++ tp->retrans_stamp = tcp_time_stamp ? : 1;
6032 ++
6033 ++ if (tcp_write_timeout(sk))
6034 ++ goto out;
6035 ++
6036 ++ tcp_send_ack(sk);
6037 ++ icsk->icsk_retransmits++;
6038 ++
6039 ++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
6040 ++ elapsed = icsk->icsk_rto;
6041 ++ goto resched;
6042 ++ }
6043 ++
6044 + if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
6045 + if (tp->linger2 >= 0) {
6046 + const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
6047 +
6048 + if (tmo > 0) {
6049 +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
6050 ++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
6051 + goto out;
6052 + }
6053 + }
6054 +- tcp_send_active_reset(sk, GFP_ATOMIC);
6055 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
6056 + goto death;
6057 + }
6058 +
6059 +@@ -614,11 +651,11 @@ static void tcp_keepalive_timer (unsigned long data)
6060 + icsk->icsk_probes_out > 0) ||
6061 + (icsk->icsk_user_timeout == 0 &&
6062 + icsk->icsk_probes_out >= keepalive_probes(tp))) {
6063 +- tcp_send_active_reset(sk, GFP_ATOMIC);
6064 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
6065 + tcp_write_err(sk);
6066 + goto out;
6067 + }
6068 +- if (tcp_write_wakeup(sk) <= 0) {
6069 ++ if (tp->ops->write_wakeup(sk) <= 0) {
6070 + icsk->icsk_probes_out++;
6071 + elapsed = keepalive_intvl_when(tp);
6072 + } else {
6073 +@@ -642,7 +679,7 @@ death:
6074 + tcp_done(sk);
6075 +
6076 + out:
6077 +- bh_unlock_sock(sk);
6078 ++ bh_unlock_sock(meta_sk);
6079 + sock_put(sk);
6080 + }
6081 +
6082 +diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
6083 +index 5667b3003af9..7139c2973fd2 100644
6084 +--- a/net/ipv6/addrconf.c
6085 ++++ b/net/ipv6/addrconf.c
6086 +@@ -760,6 +760,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
6087 +
6088 + kfree_rcu(ifp, rcu);
6089 + }
6090 ++EXPORT_SYMBOL(inet6_ifa_finish_destroy);
6091 +
6092 + static void
6093 + ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
6094 +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
6095 +index 7cb4392690dd..7057afbca4df 100644
6096 +--- a/net/ipv6/af_inet6.c
6097 ++++ b/net/ipv6/af_inet6.c
6098 +@@ -97,8 +97,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
6099 + return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
6100 + }
6101 +
6102 +-static int inet6_create(struct net *net, struct socket *sock, int protocol,
6103 +- int kern)
6104 ++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern)
6105 + {
6106 + struct inet_sock *inet;
6107 + struct ipv6_pinfo *np;
6108 +diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
6109 +index a245e5ddffbd..99c892b8992d 100644
6110 +--- a/net/ipv6/inet6_connection_sock.c
6111 ++++ b/net/ipv6/inet6_connection_sock.c
6112 +@@ -96,8 +96,8 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
6113 + /*
6114 + * request_sock (formerly open request) hash tables.
6115 + */
6116 +-static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
6117 +- const u32 rnd, const u32 synq_hsize)
6118 ++u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
6119 ++ const u32 rnd, const u32 synq_hsize)
6120 + {
6121 + u32 c;
6122 +
6123 +diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
6124 +index edb58aff4ae7..ea4d9fda0927 100644
6125 +--- a/net/ipv6/ipv6_sockglue.c
6126 ++++ b/net/ipv6/ipv6_sockglue.c
6127 +@@ -48,6 +48,8 @@
6128 + #include <net/addrconf.h>
6129 + #include <net/inet_common.h>
6130 + #include <net/tcp.h>
6131 ++#include <net/mptcp.h>
6132 ++#include <net/mptcp_v4.h>
6133 + #include <net/udp.h>
6134 + #include <net/udplite.h>
6135 + #include <net/xfrm.h>
6136 +@@ -196,7 +198,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
6137 + sock_prot_inuse_add(net, &tcp_prot, 1);
6138 + local_bh_enable();
6139 + sk->sk_prot = &tcp_prot;
6140 +- icsk->icsk_af_ops = &ipv4_specific;
6141 ++#ifdef CONFIG_MPTCP
6142 ++ if (is_mptcp_enabled(sk))
6143 ++ icsk->icsk_af_ops = &mptcp_v4_specific;
6144 ++ else
6145 ++#endif
6146 ++ icsk->icsk_af_ops = &ipv4_specific;
6147 + sk->sk_socket->ops = &inet_stream_ops;
6148 + sk->sk_family = PF_INET;
6149 + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
6150 +diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
6151 +index a822b880689b..b2b38869d795 100644
6152 +--- a/net/ipv6/syncookies.c
6153 ++++ b/net/ipv6/syncookies.c
6154 +@@ -181,13 +181,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
6155 +
6156 + /* check for timestamp cookie support */
6157 + memset(&tcp_opt, 0, sizeof(tcp_opt));
6158 +- tcp_parse_options(skb, &tcp_opt, 0, NULL);
6159 ++ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
6160 +
6161 + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
6162 + goto out;
6163 +
6164 + ret = NULL;
6165 +- req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
6166 ++ req = inet_reqsk_alloc(&tcp6_request_sock_ops);
6167 + if (!req)
6168 + goto out;
6169 +
6170 +@@ -255,10 +255,10 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
6171 + }
6172 +
6173 + req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
6174 +- tcp_select_initial_window(tcp_full_space(sk), req->mss,
6175 +- &req->rcv_wnd, &req->window_clamp,
6176 +- ireq->wscale_ok, &rcv_wscale,
6177 +- dst_metric(dst, RTAX_INITRWND));
6178 ++ tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
6179 ++ &req->rcv_wnd, &req->window_clamp,
6180 ++ ireq->wscale_ok, &rcv_wscale,
6181 ++ dst_metric(dst, RTAX_INITRWND), sk);
6182 +
6183 + ireq->rcv_wscale = rcv_wscale;
6184 +
6185 +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
6186 +index 229239ad96b1..fda94d71666e 100644
6187 +--- a/net/ipv6/tcp_ipv6.c
6188 ++++ b/net/ipv6/tcp_ipv6.c
6189 +@@ -63,6 +63,8 @@
6190 + #include <net/inet_common.h>
6191 + #include <net/secure_seq.h>
6192 + #include <net/tcp_memcontrol.h>
6193 ++#include <net/mptcp.h>
6194 ++#include <net/mptcp_v6.h>
6195 + #include <net/busy_poll.h>
6196 +
6197 + #include <linux/proc_fs.h>
6198 +@@ -71,12 +73,6 @@
6199 + #include <linux/crypto.h>
6200 + #include <linux/scatterlist.h>
6201 +
6202 +-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
6203 +-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
6204 +- struct request_sock *req);
6205 +-
6206 +-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
6207 +-
6208 + static const struct inet_connection_sock_af_ops ipv6_mapped;
6209 + static const struct inet_connection_sock_af_ops ipv6_specific;
6210 + #ifdef CONFIG_TCP_MD5SIG
6211 +@@ -90,7 +86,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
6212 + }
6213 + #endif
6214 +
6215 +-static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
6216 ++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
6217 + {
6218 + struct dst_entry *dst = skb_dst(skb);
6219 + const struct rt6_info *rt = (const struct rt6_info *)dst;
6220 +@@ -102,10 +98,11 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
6221 + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
6222 + }
6223 +
6224 +-static void tcp_v6_hash(struct sock *sk)
6225 ++void tcp_v6_hash(struct sock *sk)
6226 + {
6227 + if (sk->sk_state != TCP_CLOSE) {
6228 +- if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
6229 ++ if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped ||
6230 ++ inet_csk(sk)->icsk_af_ops == &mptcp_v6_mapped) {
6231 + tcp_prot.hash(sk);
6232 + return;
6233 + }
6234 +@@ -115,7 +112,7 @@ static void tcp_v6_hash(struct sock *sk)
6235 + }
6236 + }
6237 +
6238 +-static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
6239 ++__u32 tcp_v6_init_sequence(const struct sk_buff *skb)
6240 + {
6241 + return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
6242 + ipv6_hdr(skb)->saddr.s6_addr32,
6243 +@@ -123,7 +120,7 @@ static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
6244 + tcp_hdr(skb)->source);
6245 + }
6246 +
6247 +-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6248 ++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6249 + int addr_len)
6250 + {
6251 + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
6252 +@@ -215,7 +212,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6253 + sin.sin_port = usin->sin6_port;
6254 + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
6255 +
6256 +- icsk->icsk_af_ops = &ipv6_mapped;
6257 ++#ifdef CONFIG_MPTCP
6258 ++ if (is_mptcp_enabled(sk))
6259 ++ icsk->icsk_af_ops = &mptcp_v6_mapped;
6260 ++ else
6261 ++#endif
6262 ++ icsk->icsk_af_ops = &ipv6_mapped;
6263 + sk->sk_backlog_rcv = tcp_v4_do_rcv;
6264 + #ifdef CONFIG_TCP_MD5SIG
6265 + tp->af_specific = &tcp_sock_ipv6_mapped_specific;
6266 +@@ -225,7 +227,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6267 +
6268 + if (err) {
6269 + icsk->icsk_ext_hdr_len = exthdrlen;
6270 +- icsk->icsk_af_ops = &ipv6_specific;
6271 ++#ifdef CONFIG_MPTCP
6272 ++ if (is_mptcp_enabled(sk))
6273 ++ icsk->icsk_af_ops = &mptcp_v6_specific;
6274 ++ else
6275 ++#endif
6276 ++ icsk->icsk_af_ops = &ipv6_specific;
6277 + sk->sk_backlog_rcv = tcp_v6_do_rcv;
6278 + #ifdef CONFIG_TCP_MD5SIG
6279 + tp->af_specific = &tcp_sock_ipv6_specific;
6280 +@@ -337,7 +344,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6281 + const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
6282 + const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
6283 + struct ipv6_pinfo *np;
6284 +- struct sock *sk;
6285 ++ struct sock *sk, *meta_sk;
6286 + int err;
6287 + struct tcp_sock *tp;
6288 + struct request_sock *fastopen;
6289 +@@ -358,8 +365,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6290 + return;
6291 + }
6292 +
6293 +- bh_lock_sock(sk);
6294 +- if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
6295 ++ tp = tcp_sk(sk);
6296 ++ if (mptcp(tp))
6297 ++ meta_sk = mptcp_meta_sk(sk);
6298 ++ else
6299 ++ meta_sk = sk;
6300 ++
6301 ++ bh_lock_sock(meta_sk);
6302 ++ if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG)
6303 + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
6304 +
6305 + if (sk->sk_state == TCP_CLOSE)
6306 +@@ -370,7 +383,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6307 + goto out;
6308 + }
6309 +
6310 +- tp = tcp_sk(sk);
6311 + seq = ntohl(th->seq);
6312 + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
6313 + fastopen = tp->fastopen_rsk;
6314 +@@ -403,11 +415,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6315 + goto out;
6316 +
6317 + tp->mtu_info = ntohl(info);
6318 +- if (!sock_owned_by_user(sk))
6319 ++ if (!sock_owned_by_user(meta_sk))
6320 + tcp_v6_mtu_reduced(sk);
6321 +- else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
6322 ++ else {
6323 ++ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
6324 + &tp->tsq_flags))
6325 +- sock_hold(sk);
6326 ++ sock_hold(sk);
6327 ++ if (mptcp(tp))
6328 ++ mptcp_tsq_flags(sk);
6329 ++ }
6330 + goto out;
6331 + }
6332 +
6333 +@@ -417,7 +433,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6334 + switch (sk->sk_state) {
6335 + struct request_sock *req, **prev;
6336 + case TCP_LISTEN:
6337 +- if (sock_owned_by_user(sk))
6338 ++ if (sock_owned_by_user(meta_sk))
6339 + goto out;
6340 +
6341 + req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
6342 +@@ -447,7 +463,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6343 + if (fastopen && fastopen->sk == NULL)
6344 + break;
6345 +
6346 +- if (!sock_owned_by_user(sk)) {
6347 ++ if (!sock_owned_by_user(meta_sk)) {
6348 + sk->sk_err = err;
6349 + sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
6350 +
6351 +@@ -457,26 +473,27 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6352 + goto out;
6353 + }
6354 +
6355 +- if (!sock_owned_by_user(sk) && np->recverr) {
6356 ++ if (!sock_owned_by_user(meta_sk) && np->recverr) {
6357 + sk->sk_err = err;
6358 + sk->sk_error_report(sk);
6359 + } else
6360 + sk->sk_err_soft = err;
6361 +
6362 + out:
6363 +- bh_unlock_sock(sk);
6364 ++ bh_unlock_sock(meta_sk);
6365 + sock_put(sk);
6366 + }
6367 +
6368 +
6369 +-static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
6370 +- struct flowi6 *fl6,
6371 +- struct request_sock *req,
6372 +- u16 queue_mapping,
6373 +- struct tcp_fastopen_cookie *foc)
6374 ++int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
6375 ++ struct flowi *fl,
6376 ++ struct request_sock *req,
6377 ++ u16 queue_mapping,
6378 ++ struct tcp_fastopen_cookie *foc)
6379 + {
6380 + struct inet_request_sock *ireq = inet_rsk(req);
6381 + struct ipv6_pinfo *np = inet6_sk(sk);
6382 ++ struct flowi6 *fl6 = &fl->u.ip6;
6383 + struct sk_buff *skb;
6384 + int err = -ENOMEM;
6385 +
6386 +@@ -497,18 +514,21 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
6387 + skb_set_queue_mapping(skb, queue_mapping);
6388 + err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
6389 + err = net_xmit_eval(err);
6390 ++ if (!tcp_rsk(req)->snt_synack && !err)
6391 ++ tcp_rsk(req)->snt_synack = tcp_time_stamp;
6392 + }
6393 +
6394 + done:
6395 + return err;
6396 + }
6397 +
6398 +-static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
6399 ++int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
6400 + {
6401 +- struct flowi6 fl6;
6402 ++ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
6403 ++ struct flowi fl;
6404 + int res;
6405 +
6406 +- res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL);
6407 ++ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
6408 + if (!res) {
6409 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
6410 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
6411 +@@ -516,7 +536,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
6412 + return res;
6413 + }
6414 +
6415 +-static void tcp_v6_reqsk_destructor(struct request_sock *req)
6416 ++void tcp_v6_reqsk_destructor(struct request_sock *req)
6417 + {
6418 + kfree_skb(inet_rsk(req)->pktopts);
6419 + }
6420 +@@ -718,27 +738,74 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
6421 + }
6422 + #endif
6423 +
6424 ++static int tcp_v6_init_req(struct request_sock *req, struct sock *sk,
6425 ++ struct sk_buff *skb)
6426 ++{
6427 ++ struct inet_request_sock *ireq = inet_rsk(req);
6428 ++ struct ipv6_pinfo *np = inet6_sk(sk);
6429 ++
6430 ++ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
6431 ++ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
6432 ++
6433 ++ ireq->ir_iif = sk->sk_bound_dev_if;
6434 ++ ireq->ir_mark = inet_request_mark(sk, skb);
6435 ++
6436 ++ /* So that link locals have meaning */
6437 ++ if (!sk->sk_bound_dev_if &&
6438 ++ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
6439 ++ ireq->ir_iif = inet6_iif(skb);
6440 ++
6441 ++ if (!TCP_SKB_CB(skb)->when &&
6442 ++ (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo ||
6443 ++ np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
6444 ++ np->rxopt.bits.rxohlim || np->repflow)) {
6445 ++ atomic_inc(&skb->users);
6446 ++ ireq->pktopts = skb;
6447 ++ }
6448 ++
6449 ++ return 0;
6450 ++}
6451 ++
6452 ++static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl,
6453 ++ const struct request_sock *req,
6454 ++ bool *strict)
6455 ++{
6456 ++ if (strict)
6457 ++ *strict = true;
6458 ++ return inet6_csk_route_req(sk, &fl->u.ip6, req);
6459 ++}
6460 ++
6461 + struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
6462 + .family = AF_INET6,
6463 + .obj_size = sizeof(struct tcp6_request_sock),
6464 +- .rtx_syn_ack = tcp_v6_rtx_synack,
6465 ++ .rtx_syn_ack = tcp_rtx_synack,
6466 + .send_ack = tcp_v6_reqsk_send_ack,
6467 + .destructor = tcp_v6_reqsk_destructor,
6468 + .send_reset = tcp_v6_send_reset,
6469 + .syn_ack_timeout = tcp_syn_ack_timeout,
6470 + };
6471 +
6472 ++const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
6473 ++ .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
6474 ++ sizeof(struct ipv6hdr),
6475 + #ifdef CONFIG_TCP_MD5SIG
6476 +-static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
6477 + .md5_lookup = tcp_v6_reqsk_md5_lookup,
6478 + .calc_md5_hash = tcp_v6_md5_hash_skb,
6479 +-};
6480 + #endif
6481 ++ .init_req = tcp_v6_init_req,
6482 ++#ifdef CONFIG_SYN_COOKIES
6483 ++ .cookie_init_seq = cookie_v6_init_sequence,
6484 ++#endif
6485 ++ .route_req = tcp_v6_route_req,
6486 ++ .init_seq = tcp_v6_init_sequence,
6487 ++ .send_synack = tcp_v6_send_synack,
6488 ++ .queue_hash_add = inet6_csk_reqsk_queue_hash_add,
6489 ++};
6490 +
6491 +-static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6492 +- u32 tsval, u32 tsecr, int oif,
6493 +- struct tcp_md5sig_key *key, int rst, u8 tclass,
6494 +- u32 label)
6495 ++static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack,
6496 ++ u32 data_ack, u32 win, u32 tsval, u32 tsecr,
6497 ++ int oif, struct tcp_md5sig_key *key, int rst,
6498 ++ u8 tclass, u32 label, int mptcp)
6499 + {
6500 + const struct tcphdr *th = tcp_hdr(skb);
6501 + struct tcphdr *t1;
6502 +@@ -756,7 +823,10 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6503 + if (key)
6504 + tot_len += TCPOLEN_MD5SIG_ALIGNED;
6505 + #endif
6506 +-
6507 ++#ifdef CONFIG_MPTCP
6508 ++ if (mptcp)
6509 ++ tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
6510 ++#endif
6511 + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
6512 + GFP_ATOMIC);
6513 + if (buff == NULL)
6514 +@@ -794,6 +864,17 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6515 + tcp_v6_md5_hash_hdr((__u8 *)topt, key,
6516 + &ipv6_hdr(skb)->saddr,
6517 + &ipv6_hdr(skb)->daddr, t1);
6518 ++ topt += 4;
6519 ++ }
6520 ++#endif
6521 ++#ifdef CONFIG_MPTCP
6522 ++ if (mptcp) {
6523 ++ /* Construction of 32-bit data_ack */
6524 ++ *topt++ = htonl((TCPOPT_MPTCP << 24) |
6525 ++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
6526 ++ (0x20 << 8) |
6527 ++ (0x01));
6528 ++ *topt++ = htonl(data_ack);
6529 + }
6530 + #endif
6531 +
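The 32-bit word written just above is the MPTCP option header for a bare 32-bit DATA_ACK: the TCP option kind (TCPOPT_MPTCP) in the top byte, the option length in the second byte, the DSS subtype in the upper nibble of the third byte, and the 'A' flag (32-bit data ACK present) in the bottom byte, followed by the data_ack value itself. A minimal userspace sketch of the same packing, with the kernel macros replaced by values assumed from RFC 6824 (option kind 30, DSS header plus 32-bit data ACK totalling 8 bytes), not taken from the patched headers:

/* Illustration only: build and decompose the DSS header word that
 * tcp_v6_send_response() emits for a pure data ACK.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_MPTCP       30  /* assumed: IANA TCP option kind for MPTCP */
#define MPTCP_SUB_LEN_DSS   4  /* assumed: DSS base header length */
#define MPTCP_SUB_LEN_ACK   4  /* assumed: 32-bit data ACK length */

int main(void)
{
	uint32_t data_ack = 0x11223344;         /* hypothetical data-level ACK */
	uint32_t word = htonl((TCPOPT_MPTCP << 24) |
			      ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
			      (0x20 << 8) |     /* subtype 2 (DSS) in the upper nibble */
			      (0x01));          /* 'A' flag: 32-bit data ACK present */
	const uint8_t *b = (const uint8_t *)&word;

	printf("kind=%u len=%u subtype=%u flags=0x%02x data_ack=0x%08x\n",
	       b[0], b[1], b[2] >> 4, b[3], data_ack);
	return 0;
}

This also explains the "topt += 4" added in the MD5 branch above: the MD5 hash has to be skipped before the two MPTCP words are appended to the option area.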
6532 +@@ -834,7 +915,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6533 + kfree_skb(buff);
6534 + }
6535 +
6536 +-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
6537 ++void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
6538 + {
6539 + const struct tcphdr *th = tcp_hdr(skb);
6540 + u32 seq = 0, ack_seq = 0;
6541 +@@ -891,7 +972,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
6542 + (th->doff << 2);
6543 +
6544 + oif = sk ? sk->sk_bound_dev_if : 0;
6545 +- tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
6546 ++ tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, 0, oif, key, 1, 0, 0, 0);
6547 +
6548 + #ifdef CONFIG_TCP_MD5SIG
6549 + release_sk1:
6550 +@@ -902,45 +983,52 @@ release_sk1:
6551 + #endif
6552 + }
6553 +
6554 +-static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
6555 ++static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
6556 + u32 win, u32 tsval, u32 tsecr, int oif,
6557 + struct tcp_md5sig_key *key, u8 tclass,
6558 +- u32 label)
6559 ++ u32 label, int mptcp)
6560 + {
6561 +- tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, oif, key, 0, tclass,
6562 +- label);
6563 ++ tcp_v6_send_response(skb, seq, ack, data_ack, win, tsval, tsecr, oif,
6564 ++ key, 0, tclass, label, mptcp);
6565 + }
6566 +
6567 + static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
6568 + {
6569 + struct inet_timewait_sock *tw = inet_twsk(sk);
6570 + struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
6571 ++ u32 data_ack = 0;
6572 ++ int mptcp = 0;
6573 +
6574 ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
6575 ++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
6576 ++ mptcp = 1;
6577 ++ }
6578 + tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
6579 ++ data_ack,
6580 + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
6581 + tcp_time_stamp + tcptw->tw_ts_offset,
6582 + tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
6583 +- tw->tw_tclass, (tw->tw_flowlabel << 12));
6584 ++ tw->tw_tclass, (tw->tw_flowlabel << 12), mptcp);
6585 +
6586 + inet_twsk_put(tw);
6587 + }
6588 +
6589 +-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
6590 +- struct request_sock *req)
6591 ++void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
6592 ++ struct request_sock *req)
6593 + {
6594 + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
6595 + * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
6596 + */
6597 + tcp_v6_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
6598 + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
6599 +- tcp_rsk(req)->rcv_nxt,
6600 ++ tcp_rsk(req)->rcv_nxt, 0,
6601 + req->rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
6602 + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
6603 +- 0, 0);
6604 ++ 0, 0, 0);
6605 + }
6606 +
6607 +
6608 +-static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6609 ++struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6610 + {
6611 + struct request_sock *req, **prev;
6612 + const struct tcphdr *th = tcp_hdr(skb);
6613 +@@ -959,7 +1047,13 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6614 +
6615 + if (nsk) {
6616 + if (nsk->sk_state != TCP_TIME_WAIT) {
6617 ++ /* Don't lock again the meta-sk. It has been locked
6618 ++ * before mptcp_v6_do_rcv.
6619 ++ */
6620 ++ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk))
6621 ++ bh_lock_sock(mptcp_meta_sk(nsk));
6622 + bh_lock_sock(nsk);
6623 ++
6624 + return nsk;
6625 + }
6626 + inet_twsk_put(inet_twsk(nsk));
6627 +@@ -973,161 +1067,25 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6628 + return sk;
6629 + }
6630 +
6631 +-/* FIXME: this is substantially similar to the ipv4 code.
6632 +- * Can some kind of merge be done? -- erics
6633 +- */
6634 +-static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
6635 ++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
6636 + {
6637 +- struct tcp_options_received tmp_opt;
6638 +- struct request_sock *req;
6639 +- struct inet_request_sock *ireq;
6640 +- struct ipv6_pinfo *np = inet6_sk(sk);
6641 +- struct tcp_sock *tp = tcp_sk(sk);
6642 +- __u32 isn = TCP_SKB_CB(skb)->when;
6643 +- struct dst_entry *dst = NULL;
6644 +- struct tcp_fastopen_cookie foc = { .len = -1 };
6645 +- bool want_cookie = false, fastopen;
6646 +- struct flowi6 fl6;
6647 +- int err;
6648 +-
6649 + if (skb->protocol == htons(ETH_P_IP))
6650 + return tcp_v4_conn_request(sk, skb);
6651 +
6652 + if (!ipv6_unicast_destination(skb))
6653 + goto drop;
6654 +
6655 +- if ((sysctl_tcp_syncookies == 2 ||
6656 +- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6657 +- want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6");
6658 +- if (!want_cookie)
6659 +- goto drop;
6660 +- }
6661 +-
6662 +- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
6663 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6664 +- goto drop;
6665 +- }
6666 +-
6667 +- req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
6668 +- if (req == NULL)
6669 +- goto drop;
6670 +-
6671 +-#ifdef CONFIG_TCP_MD5SIG
6672 +- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
6673 +-#endif
6674 +-
6675 +- tcp_clear_options(&tmp_opt);
6676 +- tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
6677 +- tmp_opt.user_mss = tp->rx_opt.user_mss;
6678 +- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
6679 +-
6680 +- if (want_cookie && !tmp_opt.saw_tstamp)
6681 +- tcp_clear_options(&tmp_opt);
6682 ++ return tcp_conn_request(&tcp6_request_sock_ops,
6683 ++ &tcp_request_sock_ipv6_ops, sk, skb);
6684 +
6685 +- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
6686 +- tcp_openreq_init(req, &tmp_opt, skb);
6687 +-
6688 +- ireq = inet_rsk(req);
6689 +- ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
6690 +- ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
6691 +- if (!want_cookie || tmp_opt.tstamp_ok)
6692 +- TCP_ECN_create_request(req, skb, sock_net(sk));
6693 +-
6694 +- ireq->ir_iif = sk->sk_bound_dev_if;
6695 +- ireq->ir_mark = inet_request_mark(sk, skb);
6696 +-
6697 +- /* So that link locals have meaning */
6698 +- if (!sk->sk_bound_dev_if &&
6699 +- ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
6700 +- ireq->ir_iif = inet6_iif(skb);
6701 +-
6702 +- if (!isn) {
6703 +- if (ipv6_opt_accepted(sk, skb) ||
6704 +- np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
6705 +- np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim ||
6706 +- np->repflow) {
6707 +- atomic_inc(&skb->users);
6708 +- ireq->pktopts = skb;
6709 +- }
6710 +-
6711 +- if (want_cookie) {
6712 +- isn = cookie_v6_init_sequence(sk, skb, &req->mss);
6713 +- req->cookie_ts = tmp_opt.tstamp_ok;
6714 +- goto have_isn;
6715 +- }
6716 +-
6717 +- /* VJ's idea. We save last timestamp seen
6718 +- * from the destination in peer table, when entering
6719 +- * state TIME-WAIT, and check against it before
6720 +- * accepting new connection request.
6721 +- *
6722 +- * If "isn" is not zero, this request hit alive
6723 +- * timewait bucket, so that all the necessary checks
6724 +- * are made in the function processing timewait state.
6725 +- */
6726 +- if (tmp_opt.saw_tstamp &&
6727 +- tcp_death_row.sysctl_tw_recycle &&
6728 +- (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) {
6729 +- if (!tcp_peer_is_proven(req, dst, true)) {
6730 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
6731 +- goto drop_and_release;
6732 +- }
6733 +- }
6734 +- /* Kill the following clause, if you dislike this way. */
6735 +- else if (!sysctl_tcp_syncookies &&
6736 +- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6737 +- (sysctl_max_syn_backlog >> 2)) &&
6738 +- !tcp_peer_is_proven(req, dst, false)) {
6739 +- /* Without syncookies last quarter of
6740 +- * backlog is filled with destinations,
6741 +- * proven to be alive.
6742 +- * It means that we continue to communicate
6743 +- * to destinations, already remembered
6744 +- * to the moment of synflood.
6745 +- */
6746 +- LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n",
6747 +- &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source));
6748 +- goto drop_and_release;
6749 +- }
6750 +-
6751 +- isn = tcp_v6_init_sequence(skb);
6752 +- }
6753 +-have_isn:
6754 +-
6755 +- if (security_inet_conn_request(sk, skb, req))
6756 +- goto drop_and_release;
6757 +-
6758 +- if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL)
6759 +- goto drop_and_free;
6760 +-
6761 +- tcp_rsk(req)->snt_isn = isn;
6762 +- tcp_rsk(req)->snt_synack = tcp_time_stamp;
6763 +- tcp_openreq_init_rwin(req, sk, dst);
6764 +- fastopen = !want_cookie &&
6765 +- tcp_try_fastopen(sk, skb, req, &foc, dst);
6766 +- err = tcp_v6_send_synack(sk, dst, &fl6, req,
6767 +- skb_get_queue_mapping(skb), &foc);
6768 +- if (!fastopen) {
6769 +- if (err || want_cookie)
6770 +- goto drop_and_free;
6771 +-
6772 +- tcp_rsk(req)->listener = NULL;
6773 +- inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
6774 +- }
6775 +- return 0;
6776 +-
6777 +-drop_and_release:
6778 +- dst_release(dst);
6779 +-drop_and_free:
6780 +- reqsk_free(req);
6781 + drop:
6782 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
6783 + return 0; /* don't send reset */
6784 + }
6785 +
6786 +-static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6787 +- struct request_sock *req,
6788 +- struct dst_entry *dst)
6789 ++struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6790 ++ struct request_sock *req,
6791 ++ struct dst_entry *dst)
6792 + {
6793 + struct inet_request_sock *ireq;
6794 + struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
6795 +@@ -1165,7 +1123,12 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6796 +
6797 + newsk->sk_v6_rcv_saddr = newnp->saddr;
6798 +
6799 +- inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
6800 ++#ifdef CONFIG_MPTCP
6801 ++ if (is_mptcp_enabled(newsk))
6802 ++ inet_csk(newsk)->icsk_af_ops = &mptcp_v6_mapped;
6803 ++ else
6804 ++#endif
6805 ++ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
6806 + newsk->sk_backlog_rcv = tcp_v4_do_rcv;
6807 + #ifdef CONFIG_TCP_MD5SIG
6808 + newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
6809 +@@ -1329,7 +1292,7 @@ out:
6810 + * This is because we cannot sleep with the original spinlock
6811 + * held.
6812 + */
6813 +-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
6814 ++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
6815 + {
6816 + struct ipv6_pinfo *np = inet6_sk(sk);
6817 + struct tcp_sock *tp;
6818 +@@ -1351,6 +1314,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
6819 + goto discard;
6820 + #endif
6821 +
6822 ++ if (is_meta_sk(sk))
6823 ++ return mptcp_v6_do_rcv(sk, skb);
6824 ++
6825 + if (sk_filter(sk, skb))
6826 + goto discard;
6827 +
6828 +@@ -1472,7 +1438,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
6829 + {
6830 + const struct tcphdr *th;
6831 + const struct ipv6hdr *hdr;
6832 +- struct sock *sk;
6833 ++ struct sock *sk, *meta_sk = NULL;
6834 + int ret;
6835 + struct net *net = dev_net(skb->dev);
6836 +
6837 +@@ -1503,18 +1469,43 @@ static int tcp_v6_rcv(struct sk_buff *skb)
6838 + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
6839 + skb->len - th->doff*4);
6840 + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
6841 ++#ifdef CONFIG_MPTCP
6842 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
6843 ++ TCP_SKB_CB(skb)->dss_off = 0;
6844 ++#endif
6845 + TCP_SKB_CB(skb)->when = 0;
6846 + TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
6847 + TCP_SKB_CB(skb)->sacked = 0;
6848 +
6849 + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
6850 +- if (!sk)
6851 +- goto no_tcp_socket;
6852 +
6853 + process:
6854 +- if (sk->sk_state == TCP_TIME_WAIT)
6855 ++ if (sk && sk->sk_state == TCP_TIME_WAIT)
6856 + goto do_time_wait;
6857 +
6858 ++#ifdef CONFIG_MPTCP
6859 ++ if (!sk && th->syn && !th->ack) {
6860 ++ int ret = mptcp_lookup_join(skb, NULL);
6861 ++
6862 ++ if (ret < 0) {
6863 ++ tcp_v6_send_reset(NULL, skb);
6864 ++ goto discard_it;
6865 ++ } else if (ret > 0) {
6866 ++ return 0;
6867 ++ }
6868 ++ }
6869 ++
6870 ++ /* Is there a pending request sock for this segment ? */
6871 ++ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
6872 ++ if (sk)
6873 ++ sock_put(sk);
6874 ++ return 0;
6875 ++ }
6876 ++#endif
6877 ++
6878 ++ if (!sk)
6879 ++ goto no_tcp_socket;
6880 ++
6881 + if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
6882 + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
6883 + goto discard_and_relse;
6884 +@@ -1529,11 +1520,21 @@ process:
6885 + sk_mark_napi_id(sk, skb);
6886 + skb->dev = NULL;
6887 +
6888 +- bh_lock_sock_nested(sk);
6889 ++ if (mptcp(tcp_sk(sk))) {
6890 ++ meta_sk = mptcp_meta_sk(sk);
6891 ++
6892 ++ bh_lock_sock_nested(meta_sk);
6893 ++ if (sock_owned_by_user(meta_sk))
6894 ++ skb->sk = sk;
6895 ++ } else {
6896 ++ meta_sk = sk;
6897 ++ bh_lock_sock_nested(sk);
6898 ++ }
6899 ++
6900 + ret = 0;
6901 +- if (!sock_owned_by_user(sk)) {
6902 ++ if (!sock_owned_by_user(meta_sk)) {
6903 + #ifdef CONFIG_NET_DMA
6904 +- struct tcp_sock *tp = tcp_sk(sk);
6905 ++ struct tcp_sock *tp = tcp_sk(meta_sk);
6906 + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
6907 + tp->ucopy.dma_chan = net_dma_find_channel();
6908 + if (tp->ucopy.dma_chan)
6909 +@@ -1541,16 +1542,17 @@ process:
6910 + else
6911 + #endif
6912 + {
6913 +- if (!tcp_prequeue(sk, skb))
6914 ++ if (!tcp_prequeue(meta_sk, skb))
6915 + ret = tcp_v6_do_rcv(sk, skb);
6916 + }
6917 +- } else if (unlikely(sk_add_backlog(sk, skb,
6918 +- sk->sk_rcvbuf + sk->sk_sndbuf))) {
6919 +- bh_unlock_sock(sk);
6920 ++ } else if (unlikely(sk_add_backlog(meta_sk, skb,
6921 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
6922 ++ bh_unlock_sock(meta_sk);
6923 + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6924 + goto discard_and_relse;
6925 + }
6926 +- bh_unlock_sock(sk);
6927 ++
6928 ++ bh_unlock_sock(meta_sk);
6929 +
6930 + sock_put(sk);
6931 + return ret ? -1 : 0;
6932 +@@ -1607,6 +1609,18 @@ do_time_wait:
6933 + sk = sk2;
6934 + goto process;
6935 + }
6936 ++#ifdef CONFIG_MPTCP
6937 ++ if (th->syn && !th->ack) {
6938 ++ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
6939 ++
6940 ++ if (ret < 0) {
6941 ++ tcp_v6_send_reset(NULL, skb);
6942 ++ goto discard_it;
6943 ++ } else if (ret > 0) {
6944 ++ return 0;
6945 ++ }
6946 ++ }
6947 ++#endif
6948 + /* Fall through to ACK */
6949 + }
6950 + case TCP_TW_ACK:
6951 +@@ -1657,7 +1671,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
6952 + }
6953 + }
6954 +
6955 +-static struct timewait_sock_ops tcp6_timewait_sock_ops = {
6956 ++struct timewait_sock_ops tcp6_timewait_sock_ops = {
6957 + .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
6958 + .twsk_unique = tcp_twsk_unique,
6959 + .twsk_destructor = tcp_twsk_destructor,
6960 +@@ -1730,7 +1744,12 @@ static int tcp_v6_init_sock(struct sock *sk)
6961 +
6962 + tcp_init_sock(sk);
6963 +
6964 +- icsk->icsk_af_ops = &ipv6_specific;
6965 ++#ifdef CONFIG_MPTCP
6966 ++ if (is_mptcp_enabled(sk))
6967 ++ icsk->icsk_af_ops = &mptcp_v6_specific;
6968 ++ else
6969 ++#endif
6970 ++ icsk->icsk_af_ops = &ipv6_specific;
6971 +
6972 + #ifdef CONFIG_TCP_MD5SIG
6973 + tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
6974 +@@ -1739,7 +1758,7 @@ static int tcp_v6_init_sock(struct sock *sk)
6975 + return 0;
6976 + }
6977 +
6978 +-static void tcp_v6_destroy_sock(struct sock *sk)
6979 ++void tcp_v6_destroy_sock(struct sock *sk)
6980 + {
6981 + tcp_v4_destroy_sock(sk);
6982 + inet6_destroy_sock(sk);
6983 +@@ -1924,12 +1943,28 @@ void tcp6_proc_exit(struct net *net)
6984 + static void tcp_v6_clear_sk(struct sock *sk, int size)
6985 + {
6986 + struct inet_sock *inet = inet_sk(sk);
6987 ++#ifdef CONFIG_MPTCP
6988 ++ struct tcp_sock *tp = tcp_sk(sk);
6989 ++ /* size_tk_table goes from the end of tk_table to the end of sk */
6990 ++ int size_tk_table = size - offsetof(struct tcp_sock, tk_table) -
6991 ++ sizeof(tp->tk_table);
6992 ++#endif
6993 +
6994 + /* we do not want to clear pinet6 field, because of RCU lookups */
6995 + sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6));
6996 +
6997 + size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
6998 ++
6999 ++#ifdef CONFIG_MPTCP
7000 ++ /* We zero out only from pinet6 to tk_table */
7001 ++ size -= size_tk_table + sizeof(tp->tk_table);
7002 ++#endif
7003 + memset(&inet->pinet6 + 1, 0, size);
7004 ++
7005 ++#ifdef CONFIG_MPTCP
7006 ++ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size_tk_table);
7007 ++#endif
7008 ++
7009 + }
7010 +
7011 + struct proto tcpv6_prot = {
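The tcp_v6_clear_sk() hunk above zeroes the socket in two passes so that both pinet6 and, under MPTCP, the tk_table member survive the clear: the first memset() covers everything from the field after pinet6 up to tk_table, the second everything after tk_table to the end of the structure. A self-contained sketch of the same offsetof() arithmetic on a made-up structure (struct demo and its fields are hypothetical stand-ins, not kernel types):

/* Two-pass clear that preserves two embedded members, mirroring the
 * size/size_tk_table arithmetic in tcp_v6_clear_sk().
 */
#include <stdio.h>
#include <string.h>
#include <stddef.h>

struct demo {
	int	head;		/* part before the preserved pointer */
	void	*pinet6;	/* must survive (RCU lookups) */
	char	middle[16];	/* cleared by the first memset */
	long	tk_table;	/* must survive under MPTCP */
	char	tail[16];	/* cleared by the second memset */
};

static void demo_clear(struct demo *d, size_t size)
{
	/* bytes from the end of tk_table to the end of the structure */
	size_t size_tail = size - offsetof(struct demo, tk_table)
				- sizeof(d->tk_table);
	/* first pass: from just after pinet6 up to tk_table */
	size_t first = size - offsetof(struct demo, pinet6) - sizeof(d->pinet6)
			    - size_tail - sizeof(d->tk_table);

	memset(&d->pinet6 + 1, 0, first);
	/* second pass: everything after tk_table */
	memset((char *)&d->tk_table + sizeof(d->tk_table), 0, size_tail);
}

int main(void)
{
	struct demo d = { 1, (void *)&d, "mid", 42, "tail" };

	demo_clear(&d, sizeof(d));
	printf("pinet6=%p tk_table=%ld middle[0]=%d tail[0]=%d\n",
	       d.pinet6, d.tk_table, d.middle[0], d.tail[0]);
	return 0;
}

After demo_clear() runs, middle[] and tail[] are zeroed while pinet6 and tk_table keep their values, which is the invariant the kernel hunk preserves for RCU lookups (pinet6) and, presumably, the MPTCP token-hash linkage (tk_table).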
7012 +diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
7013 +new file mode 100644
7014 +index 000000000000..cdfc03adabf8
7015 +--- /dev/null
7016 ++++ b/net/mptcp/Kconfig
7017 +@@ -0,0 +1,115 @@
7018 ++#
7019 ++# MPTCP configuration
7020 ++#
7021 ++config MPTCP
7022 ++ bool "MPTCP protocol"
7023 ++ depends on (IPV6=y || IPV6=n)
7024 ++ ---help---
7025 ++ This replaces the normal TCP stack with a Multipath TCP stack,
7026 ++ able to use several paths at once.
7027 ++
7028 ++menuconfig MPTCP_PM_ADVANCED
7029 ++ bool "MPTCP: advanced path-manager control"
7030 ++ depends on MPTCP=y
7031 ++ ---help---
7032 ++	 Support for selection of different path-managers. Choose 'Y' here;
7033 ++	 otherwise no new MPTCP subflows will be actively created.
7034 ++
7035 ++if MPTCP_PM_ADVANCED
7036 ++
7037 ++config MPTCP_FULLMESH
7038 ++ tristate "MPTCP Full-Mesh Path-Manager"
7039 ++ depends on MPTCP=y
7040 ++ ---help---
7041 ++ This path-management module will create a full-mesh among all IP-addresses.
7042 ++
7043 ++config MPTCP_NDIFFPORTS
7044 ++ tristate "MPTCP ndiff-ports"
7045 ++ depends on MPTCP=y
7046 ++ ---help---
7047 ++ This path-management module will create multiple subflows between the same
7048 ++ pair of IP-addresses, modifying the source-port. You can set the number
7049 ++ of subflows via the mptcp_ndiffports-sysctl.
7050 ++
7051 ++config MPTCP_BINDER
7052 ++ tristate "MPTCP Binder"
7053 ++ depends on (MPTCP=y)
7054 ++ ---help---
7055 ++	 This path-management module works like ndiffports, and adds a sysctl
7056 ++	 option to set the gateway (and/or the path) for each additional subflow
7057 ++	 via Loose Source Routing (IPv4 only).
7058 ++
7059 ++choice
7060 ++ prompt "Default MPTCP Path-Manager"
7061 ++ default DEFAULT
7062 ++ help
7063 ++ Select the Path-Manager of your choice
7064 ++
7065 ++ config DEFAULT_FULLMESH
7066 ++ bool "Full mesh" if MPTCP_FULLMESH=y
7067 ++
7068 ++ config DEFAULT_NDIFFPORTS
7069 ++ bool "ndiff-ports" if MPTCP_NDIFFPORTS=y
7070 ++
7071 ++ config DEFAULT_BINDER
7072 ++ bool "binder" if MPTCP_BINDER=y
7073 ++
7074 ++ config DEFAULT_DUMMY
7075 ++ bool "Default"
7076 ++
7077 ++endchoice
7078 ++
7079 ++endif
7080 ++
7081 ++config DEFAULT_MPTCP_PM
7082 ++ string
7083 ++ default "default" if DEFAULT_DUMMY
7084 ++ default "fullmesh" if DEFAULT_FULLMESH
7085 ++ default "ndiffports" if DEFAULT_NDIFFPORTS
7086 ++ default "binder" if DEFAULT_BINDER
7087 ++ default "default"
7088 ++
7089 ++menuconfig MPTCP_SCHED_ADVANCED
7090 ++ bool "MPTCP: advanced scheduler control"
7091 ++ depends on MPTCP=y
7092 ++ ---help---
7093 ++	 Support for selection of different schedulers. Choose 'Y' here
7094 ++	 if you want to use a scheduler other than the default one.
7095 ++
7096 ++if MPTCP_SCHED_ADVANCED
7097 ++
7098 ++config MPTCP_ROUNDROBIN
7099 ++ tristate "MPTCP Round-Robin"
7100 ++ depends on (MPTCP=y)
7101 ++ ---help---
7102 ++	 This is a very simple round-robin scheduler. It probably has poor
7103 ++	 performance, but it might be interesting for researchers.
7104 ++
7105 ++choice
7106 ++ prompt "Default MPTCP Scheduler"
7107 ++ default DEFAULT
7108 ++ help
7109 ++ Select the Scheduler of your choice
7110 ++
7111 ++ config DEFAULT_SCHEDULER
7112 ++ bool "Default"
7113 ++ ---help---
7114 ++ This is the default scheduler, sending first on the subflow
7115 ++ with the lowest RTT.
7116 ++
7117 ++ config DEFAULT_ROUNDROBIN
7118 ++ bool "Round-Robin" if MPTCP_ROUNDROBIN=y
7119 ++ ---help---
7120 ++	 This is the round-robin scheduler, sending in a round-robin
7121 ++	 fashion.
7122 ++
7123 ++endchoice
7124 ++endif
7125 ++
7126 ++config DEFAULT_MPTCP_SCHED
7127 ++ string
7128 ++ depends on (MPTCP=y)
7129 ++ default "default" if DEFAULT_SCHEDULER
7130 ++ default "roundrobin" if DEFAULT_ROUNDROBIN
7131 ++ default "default"
7132 ++
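The Kconfig choices above only select which registered module is used by default; CONFIG_DEFAULT_MPTCP_PM and CONFIG_DEFAULT_MPTCP_SCHED are plain strings that are presumably matched against each module's .name. A path manager registers an mptcp_pm_ops with the MPTCP core, following the pattern of the binder module added later in this patch. A minimal sketch, assuming the callback signatures visible in that module; the "noop" names are hypothetical:

/* Minimal no-op MPTCP path-manager module (sketch, not part of the patch). */
#include <linux/module.h>
#include <net/mptcp.h>

static void noop_new_session(const struct sock *meta_sk)
{
	/* A real path manager would queue work here to create extra subflows. */
}

static int noop_get_local_id(sa_family_t family, union inet_addr *addr,
			     struct net *net, bool *low_prio)
{
	return 0;	/* always advertise address-id 0 */
}

static struct mptcp_pm_ops noop_pm __read_mostly = {
	.new_session	= noop_new_session,
	.get_local_id	= noop_get_local_id,
	.name		= "noop",
	.owner		= THIS_MODULE,
};

static int __init noop_pm_register(void)
{
	return mptcp_register_path_manager(&noop_pm);
}

static void __exit noop_pm_unregister(void)
{
	mptcp_unregister_path_manager(&noop_pm);
}

module_init(noop_pm_register);
module_exit(noop_pm_unregister);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("No-op MPTCP path-manager sketch");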
7133 +diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
7134 +new file mode 100644
7135 +index 000000000000..35561a7012e3
7136 +--- /dev/null
7137 ++++ b/net/mptcp/Makefile
7138 +@@ -0,0 +1,20 @@
7139 ++#
7140 ++## Makefile for MultiPath TCP support code.
7141 ++#
7142 ++#
7143 ++
7144 ++obj-$(CONFIG_MPTCP) += mptcp.o
7145 ++
7146 ++mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_ofo_queue.o mptcp_pm.o \
7147 ++ mptcp_output.o mptcp_input.o mptcp_sched.o
7148 ++
7149 ++obj-$(CONFIG_TCP_CONG_COUPLED) += mptcp_coupled.o
7150 ++obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
7151 ++obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o
7152 ++obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o
7153 ++obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o
7154 ++obj-$(CONFIG_MPTCP_BINDER) += mptcp_binder.o
7155 ++obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o
7156 ++
7157 ++mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o
7158 ++
7159 +diff --git a/net/mptcp/mptcp_binder.c b/net/mptcp/mptcp_binder.c
7160 +new file mode 100644
7161 +index 000000000000..95d8da560715
7162 +--- /dev/null
7163 ++++ b/net/mptcp/mptcp_binder.c
7164 +@@ -0,0 +1,487 @@
7165 ++#include <linux/module.h>
7166 ++
7167 ++#include <net/mptcp.h>
7168 ++#include <net/mptcp_v4.h>
7169 ++
7170 ++#include <linux/route.h>
7171 ++#include <linux/inet.h>
7172 ++#include <linux/mroute.h>
7173 ++#include <linux/spinlock_types.h>
7174 ++#include <net/inet_ecn.h>
7175 ++#include <net/route.h>
7176 ++#include <net/xfrm.h>
7177 ++#include <net/compat.h>
7178 ++#include <linux/slab.h>
7179 ++
7180 ++#define MPTCP_GW_MAX_LISTS 10
7181 ++#define MPTCP_GW_LIST_MAX_LEN 6
7182 ++#define MPTCP_GW_SYSCTL_MAX_LEN (15 * MPTCP_GW_LIST_MAX_LEN * \
7183 ++ MPTCP_GW_MAX_LISTS)
7184 ++
7185 ++struct mptcp_gw_list {
7186 ++ struct in_addr list[MPTCP_GW_MAX_LISTS][MPTCP_GW_LIST_MAX_LEN];
7187 ++ u8 len[MPTCP_GW_MAX_LISTS];
7188 ++};
7189 ++
7190 ++struct binder_priv {
7191 ++ /* Worker struct for subflow establishment */
7192 ++ struct work_struct subflow_work;
7193 ++
7194 ++ struct mptcp_cb *mpcb;
7195 ++
7196 ++ /* Prevent multiple sub-sockets concurrently iterating over sockets */
7197 ++ spinlock_t *flow_lock;
7198 ++};
7199 ++
7200 ++static struct mptcp_gw_list *mptcp_gws;
7201 ++static rwlock_t mptcp_gws_lock;
7202 ++
7203 ++static int mptcp_binder_ndiffports __read_mostly = 1;
7204 ++
7205 ++static char sysctl_mptcp_binder_gateways[MPTCP_GW_SYSCTL_MAX_LEN] __read_mostly;
7206 ++
7207 ++static int mptcp_get_avail_list_ipv4(struct sock *sk)
7208 ++{
7209 ++ int i, j, list_taken, opt_ret, opt_len;
7210 ++ unsigned char *opt_ptr, *opt_end_ptr, opt[MAX_IPOPTLEN];
7211 ++
7212 ++ for (i = 0; i < MPTCP_GW_MAX_LISTS; ++i) {
7213 ++ if (mptcp_gws->len[i] == 0)
7214 ++ goto error;
7215 ++
7216 ++ mptcp_debug("mptcp_get_avail_list_ipv4: List %i\n", i);
7217 ++ list_taken = 0;
7218 ++
7219 ++ /* Loop through all sub-sockets in this connection */
7220 ++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk) {
7221 ++ mptcp_debug("mptcp_get_avail_list_ipv4: Next sock\n");
7222 ++
7223 ++ /* Reset length and options buffer, then retrieve
7224 ++ * from socket
7225 ++ */
7226 ++ opt_len = MAX_IPOPTLEN;
7227 ++ memset(opt, 0, MAX_IPOPTLEN);
7228 ++ opt_ret = ip_getsockopt(sk, IPPROTO_IP,
7229 ++ IP_OPTIONS, opt, &opt_len);
7230 ++ if (opt_ret < 0) {
7231 ++ mptcp_debug(KERN_ERR "%s: MPTCP subsocket getsockopt() IP_OPTIONS failed, error %d\n",
7232 ++ __func__, opt_ret);
7233 ++ goto error;
7234 ++ }
7235 ++
7236 ++ /* If socket has no options, it has no stake in this list */
7237 ++ if (opt_len <= 0)
7238 ++ continue;
7239 ++
7240 ++ /* Iterate options buffer */
7241 ++ for (opt_ptr = &opt[0]; opt_ptr < &opt[opt_len]; opt_ptr++) {
7242 ++ if (*opt_ptr == IPOPT_LSRR) {
7243 ++ mptcp_debug("mptcp_get_avail_list_ipv4: LSRR options found\n");
7244 ++ goto sock_lsrr;
7245 ++ }
7246 ++ }
7247 ++ continue;
7248 ++
7249 ++sock_lsrr:
7250 ++ /* Pointer to the 2nd to last address */
7251 ++ opt_end_ptr = opt_ptr+(*(opt_ptr+1))-4;
7252 ++
7253 ++ /* Addresses start 3 bytes after type offset */
7254 ++ opt_ptr += 3;
7255 ++ j = 0;
7256 ++
7257 ++ /* Different length lists cannot be the same */
7258 ++ if ((opt_end_ptr-opt_ptr)/4 != mptcp_gws->len[i])
7259 ++ continue;
7260 ++
7261 ++ /* Iterate if we are still inside options list
7262 ++ * and sysctl list
7263 ++ */
7264 ++ while (opt_ptr < opt_end_ptr && j < mptcp_gws->len[i]) {
7265 ++ /* If there is a different address, this list must
7266 ++ * not be set on this socket
7267 ++ */
7268 ++ if (memcmp(&mptcp_gws->list[i][j], opt_ptr, 4))
7269 ++ break;
7270 ++
7271 ++ /* Jump 4 bytes to next address */
7272 ++ opt_ptr += 4;
7273 ++ j++;
7274 ++ }
7275 ++
7276 ++ /* Reached the end without a differing address, lists
7277 ++ * are therefore identical.
7278 ++ */
7279 ++ if (j == mptcp_gws->len[i]) {
7280 ++ mptcp_debug("mptcp_get_avail_list_ipv4: List already used\n");
7281 ++ list_taken = 1;
7282 ++ break;
7283 ++ }
7284 ++ }
7285 ++
7286 ++ /* Free list found if not taken by a socket */
7287 ++ if (!list_taken) {
7288 ++ mptcp_debug("mptcp_get_avail_list_ipv4: List free\n");
7289 ++ break;
7290 ++ }
7291 ++ }
7292 ++
7293 ++ if (i >= MPTCP_GW_MAX_LISTS)
7294 ++ goto error;
7295 ++
7296 ++ return i;
7297 ++error:
7298 ++ return -1;
7299 ++}
7300 ++
7301 ++/* The list of addresses is parsed each time a new connection is opened,
7302 ++ * to make sure it's up to date. In case of error, all the lists are
7303 ++ * marked as unavailable and the subflow's fingerprint is set to 0.
7304 ++ */
7305 ++static void mptcp_v4_add_lsrr(struct sock *sk, struct in_addr addr)
7306 ++{
7307 ++ int i, j, ret;
7308 ++ unsigned char opt[MAX_IPOPTLEN] = {0};
7309 ++ struct tcp_sock *tp = tcp_sk(sk);
7310 ++ struct binder_priv *fmp = (struct binder_priv *)&tp->mpcb->mptcp_pm[0];
7311 ++
7312 ++ /* Read lock: multiple sockets can read LSRR addresses at the same
7313 ++ * time, but writes are done in mutual exclusion.
7314 ++ * Spin lock: must search for free list for one socket at a time, or
7315 ++ * multiple sockets could take the same list.
7316 ++ */
7317 ++ read_lock(&mptcp_gws_lock);
7318 ++ spin_lock(fmp->flow_lock);
7319 ++
7320 ++ i = mptcp_get_avail_list_ipv4(sk);
7321 ++
7322 ++ /* Execution enters here only if a free path is found.
7323 ++ */
7324 ++ if (i >= 0) {
7325 ++ opt[0] = IPOPT_NOP;
7326 ++ opt[1] = IPOPT_LSRR;
7327 ++ opt[2] = sizeof(mptcp_gws->list[i][0].s_addr) *
7328 ++ (mptcp_gws->len[i] + 1) + 3;
7329 ++ opt[3] = IPOPT_MINOFF;
7330 ++ for (j = 0; j < mptcp_gws->len[i]; ++j)
7331 ++ memcpy(opt + 4 +
7332 ++ (j * sizeof(mptcp_gws->list[i][0].s_addr)),
7333 ++ &mptcp_gws->list[i][j].s_addr,
7334 ++ sizeof(mptcp_gws->list[i][0].s_addr));
7335 ++ /* Final destination must be part of IP_OPTIONS parameter. */
7336 ++ memcpy(opt + 4 + (j * sizeof(addr.s_addr)), &addr.s_addr,
7337 ++ sizeof(addr.s_addr));
7338 ++
7339 ++ /* setsockopt must be inside the lock, otherwise another
7340 ++ * subflow could fail to see that we have taken a list.
7341 ++ */
7342 ++ ret = ip_setsockopt(sk, IPPROTO_IP, IP_OPTIONS, opt,
7343 ++ 4 + sizeof(mptcp_gws->list[i][0].s_addr)
7344 ++ * (mptcp_gws->len[i] + 1));
7345 ++
7346 ++ if (ret < 0) {
7347 ++ mptcp_debug(KERN_ERR "%s: MPTCP subsock setsockopt() IP_OPTIONS failed, error %d\n",
7348 ++ __func__, ret);
7349 ++ }
7350 ++ }
7351 ++
7352 ++ spin_unlock(fmp->flow_lock);
7353 ++ read_unlock(&mptcp_gws_lock);
7354 ++
7355 ++ return;
7356 ++}
7357 ++
7358 ++/* Parses gateways string for a list of paths to different
7359 ++ * gateways, and stores them for use with the Loose Source Routing (LSRR)
7360 ++ * socket option. Each list must have "," separated addresses, and the lists
7361 ++ * themselves must be separated by "-". Returns -1 if one or more of the
7362 ++ * addresses is not a valid IPv4 address.
7363 ++ */
7364 ++static int mptcp_parse_gateway_ipv4(char *gateways)
7365 ++{
7366 ++ int i, j, k, ret;
7367 ++ char *tmp_string = NULL;
7368 ++ struct in_addr tmp_addr;
7369 ++
7370 ++ tmp_string = kzalloc(16, GFP_KERNEL);
7371 ++ if (tmp_string == NULL)
7372 ++ return -ENOMEM;
7373 ++
7374 ++ write_lock(&mptcp_gws_lock);
7375 ++
7376 ++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
7377 ++
7378 ++ /* A TMP string is used since inet_pton needs a null terminated string
7379 ++ * but we do not want to modify the sysctl for obvious reasons.
7380 ++ * i will iterate over the SYSCTL string, j will iterate over the
7381 ++ * temporary string where each IP is copied into, k will iterate over
7382 ++ * the IPs in each list.
7383 ++ */
7384 ++ for (i = j = k = 0;
7385 ++ i < MPTCP_GW_SYSCTL_MAX_LEN && k < MPTCP_GW_MAX_LISTS;
7386 ++ ++i) {
7387 ++ if (gateways[i] == '-' || gateways[i] == ',' || gateways[i] == '\0') {
7388 ++ /* If the temp IP is empty and the current list is
7389 ++ * empty, we are done.
7390 ++ */
7391 ++ if (j == 0 && mptcp_gws->len[k] == 0)
7392 ++ break;
7393 ++
7394 ++ /* Terminate the temp IP string, then if it is
7395 ++ * non-empty parse the IP and copy it.
7396 ++ */
7397 ++ tmp_string[j] = '\0';
7398 ++ if (j > 0) {
7399 ++ mptcp_debug("mptcp_parse_gateway_list tmp: %s i: %d\n", tmp_string, i);
7400 ++
7401 ++ ret = in4_pton(tmp_string, strlen(tmp_string),
7402 ++ (u8 *)&tmp_addr.s_addr, '\0',
7403 ++ NULL);
7404 ++
7405 ++ if (ret) {
7406 ++ mptcp_debug("mptcp_parse_gateway_list ret: %d s_addr: %pI4\n",
7407 ++ ret,
7408 ++ &tmp_addr.s_addr);
7409 ++ memcpy(&mptcp_gws->list[k][mptcp_gws->len[k]].s_addr,
7410 ++ &tmp_addr.s_addr,
7411 ++ sizeof(tmp_addr.s_addr));
7412 ++ mptcp_gws->len[k]++;
7413 ++ j = 0;
7414 ++ tmp_string[j] = '\0';
7415 ++ /* Since we can't impose a limit to
7416 ++ * what the user can input, make sure
7417 ++ * there are not too many IPs in the
7418 ++ * SYSCTL string.
7419 ++ */
7420 ++ if (mptcp_gws->len[k] > MPTCP_GW_LIST_MAX_LEN) {
7421 ++ mptcp_debug("mptcp_parse_gateway_list too many members in list %i: max %i\n",
7422 ++ k,
7423 ++ MPTCP_GW_LIST_MAX_LEN);
7424 ++ goto error;
7425 ++ }
7426 ++ } else {
7427 ++ goto error;
7428 ++ }
7429 ++ }
7430 ++
7431 ++ if (gateways[i] == '-' || gateways[i] == '\0')
7432 ++ ++k;
7433 ++ } else {
7434 ++ tmp_string[j] = gateways[i];
7435 ++ ++j;
7436 ++ }
7437 ++ }
7438 ++
7439 ++ /* Number of flows is number of gateway lists plus master flow */
7440 ++ mptcp_binder_ndiffports = k+1;
7441 ++
7442 ++ write_unlock(&mptcp_gws_lock);
7443 ++ kfree(tmp_string);
7444 ++
7445 ++ return 0;
7446 ++
7447 ++error:
7448 ++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
7449 ++ memset(gateways, 0, sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN);
7450 ++ write_unlock(&mptcp_gws_lock);
7451 ++ kfree(tmp_string);
7452 ++ return -1;
7453 ++}
7454 ++
7455 ++/**
7456 ++ * Create all new subflows, by doing calls to mptcp_initX_subsockets
7457 ++ * Create all new subflows by calling mptcp_initX_subsockets.
7458 ++ *
7459 ++ * This function uses a goto next_subflow to release the lock between
7460 ++ * new subflows, giving other processes a chance to do some work on the
7461 ++ * socket and potentially finish the communication.
7462 ++static void create_subflow_worker(struct work_struct *work)
7463 ++{
7464 ++ const struct binder_priv *pm_priv = container_of(work,
7465 ++ struct binder_priv,
7466 ++ subflow_work);
7467 ++ struct mptcp_cb *mpcb = pm_priv->mpcb;
7468 ++ struct sock *meta_sk = mpcb->meta_sk;
7469 ++ int iter = 0;
7470 ++
7471 ++next_subflow:
7472 ++ if (iter) {
7473 ++ release_sock(meta_sk);
7474 ++ mutex_unlock(&mpcb->mpcb_mutex);
7475 ++
7476 ++ cond_resched();
7477 ++ }
7478 ++ mutex_lock(&mpcb->mpcb_mutex);
7479 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
7480 ++
7481 ++ iter++;
7482 ++
7483 ++ if (sock_flag(meta_sk, SOCK_DEAD))
7484 ++ goto exit;
7485 ++
7486 ++ if (mpcb->master_sk &&
7487 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
7488 ++ goto exit;
7489 ++
7490 ++ if (mptcp_binder_ndiffports > iter &&
7491 ++ mptcp_binder_ndiffports > mpcb->cnt_subflows) {
7492 ++ struct mptcp_loc4 loc;
7493 ++ struct mptcp_rem4 rem;
7494 ++
7495 ++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
7496 ++ loc.loc4_id = 0;
7497 ++ loc.low_prio = 0;
7498 ++
7499 ++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
7500 ++ rem.port = inet_sk(meta_sk)->inet_dport;
7501 ++ rem.rem4_id = 0; /* Default 0 */
7502 ++
7503 ++ mptcp_init4_subsockets(meta_sk, &loc, &rem);
7504 ++
7505 ++ goto next_subflow;
7506 ++ }
7507 ++
7508 ++exit:
7509 ++ release_sock(meta_sk);
7510 ++ mutex_unlock(&mpcb->mpcb_mutex);
7511 ++ sock_put(meta_sk);
7512 ++}
7513 ++
7514 ++static void binder_new_session(const struct sock *meta_sk)
7515 ++{
7516 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
7517 ++ struct binder_priv *fmp = (struct binder_priv *)&mpcb->mptcp_pm[0];
7518 ++ static DEFINE_SPINLOCK(flow_lock);
7519 ++
7520 ++#if IS_ENABLED(CONFIG_IPV6)
7521 ++ if (meta_sk->sk_family == AF_INET6 &&
7522 ++ !mptcp_v6_is_v4_mapped(meta_sk)) {
7523 ++ mptcp_fallback_default(mpcb);
7524 ++ return;
7525 ++ }
7526 ++#endif
7527 ++
7528 ++ /* Initialize workqueue-struct */
7529 ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
7530 ++ fmp->mpcb = mpcb;
7531 ++
7532 ++ fmp->flow_lock = &flow_lock;
7533 ++}
7534 ++
7535 ++static void binder_create_subflows(struct sock *meta_sk)
7536 ++{
7537 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
7538 ++ struct binder_priv *pm_priv = (struct binder_priv *)&mpcb->mptcp_pm[0];
7539 ++
7540 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
7541 ++ mpcb->send_infinite_mapping ||
7542 ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
7543 ++ return;
7544 ++
7545 ++ if (!work_pending(&pm_priv->subflow_work)) {
7546 ++ sock_hold(meta_sk);
7547 ++ queue_work(mptcp_wq, &pm_priv->subflow_work);
7548 ++ }
7549 ++}
7550 ++
7551 ++static int binder_get_local_id(sa_family_t family, union inet_addr *addr,
7552 ++ struct net *net, bool *low_prio)
7553 ++{
7554 ++ return 0;
7555 ++}
7556 ++
7557 ++/* Callback function, executed when the sysctl net.mptcp.mptcp_binder_gateways
7558 ++ * is updated. Inspired by proc_tcp_congestion_control().
7559 ++ */
7560 ++static int proc_mptcp_gateways(ctl_table *ctl, int write,
7561 ++ void __user *buffer, size_t *lenp,
7562 ++ loff_t *ppos)
7563 ++{
7564 ++ int ret;
7565 ++ ctl_table tbl = {
7566 ++ .maxlen = MPTCP_GW_SYSCTL_MAX_LEN,
7567 ++ };
7568 ++
7569 ++ if (write) {
7570 ++ tbl.data = kzalloc(MPTCP_GW_SYSCTL_MAX_LEN, GFP_KERNEL);
7571 ++ if (tbl.data == NULL)
7572 ++ return -1;
7573 ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
7574 ++ if (ret == 0) {
7575 ++ ret = mptcp_parse_gateway_ipv4(tbl.data);
7576 ++ memcpy(ctl->data, tbl.data, MPTCP_GW_SYSCTL_MAX_LEN);
7577 ++ }
7578 ++ kfree(tbl.data);
7579 ++ } else {
7580 ++ ret = proc_dostring(ctl, write, buffer, lenp, ppos);
7581 ++ }
7582 ++
7583 ++
7584 ++ return ret;
7585 ++}
7586 ++
7587 ++static struct mptcp_pm_ops binder __read_mostly = {
7588 ++ .new_session = binder_new_session,
7589 ++ .fully_established = binder_create_subflows,
7590 ++ .get_local_id = binder_get_local_id,
7591 ++ .init_subsocket_v4 = mptcp_v4_add_lsrr,
7592 ++ .name = "binder",
7593 ++ .owner = THIS_MODULE,
7594 ++};
7595 ++
7596 ++static struct ctl_table binder_table[] = {
7597 ++ {
7598 ++ .procname = "mptcp_binder_gateways",
7599 ++ .data = &sysctl_mptcp_binder_gateways,
7600 ++ .maxlen = sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN,
7601 ++ .mode = 0644,
7602 ++ .proc_handler = &proc_mptcp_gateways
7603 ++ },
7604 ++ { }
7605 ++};
7606 ++
7607 ++struct ctl_table_header *mptcp_sysctl_binder;
7608 ++
7609 ++/* General initialization of MPTCP_PM */
7610 ++static int __init binder_register(void)
7611 ++{
7612 ++ mptcp_gws = kzalloc(sizeof(*mptcp_gws), GFP_KERNEL);
7613 ++ if (!mptcp_gws)
7614 ++ return -ENOMEM;
7615 ++
7616 ++ rwlock_init(&mptcp_gws_lock);
7617 ++
7618 ++ BUILD_BUG_ON(sizeof(struct binder_priv) > MPTCP_PM_SIZE);
7619 ++
7620 ++ mptcp_sysctl_binder = register_net_sysctl(&init_net, "net/mptcp",
7621 ++ binder_table);
7622 ++ if (!mptcp_sysctl_binder)
7623 ++ goto sysctl_fail;
7624 ++
7625 ++ if (mptcp_register_path_manager(&binder))
7626 ++ goto pm_failed;
7627 ++
7628 ++ return 0;
7629 ++
7630 ++pm_failed:
7631 ++ unregister_net_sysctl_table(mptcp_sysctl_binder);
7632 ++sysctl_fail:
7633 ++ kfree(mptcp_gws);
7634 ++
7635 ++ return -1;
7636 ++}
7637 ++
7638 ++static void binder_unregister(void)
7639 ++{
7640 ++ mptcp_unregister_path_manager(&binder);
7641 ++ unregister_net_sysctl_table(mptcp_sysctl_binder);
7642 ++ kfree(mptcp_gws);
7643 ++}
7644 ++
7645 ++module_init(binder_register);
7646 ++module_exit(binder_unregister);
7647 ++
7648 ++MODULE_AUTHOR("Luca Boccassi, Duncan Eastoe, Christoph Paasch (ndiffports)");
7649 ++MODULE_LICENSE("GPL");
7650 ++MODULE_DESCRIPTION("BINDER MPTCP");
7651 ++MODULE_VERSION("0.1");
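Two details of the binder module above are worth spelling out: mptcp_parse_gateway_ipv4() expects the mptcp_binder_gateways sysctl string to contain comma-separated addresses inside each gateway list, with the lists themselves separated by '-', and mptcp_v4_add_lsrr() turns one list into an IP_OPTIONS buffer laid out as NOP, LSRR type, length (4 * (gateways + 1) + 3), minimum pointer offset, the gateway addresses, and finally the destination. A userspace sketch that builds such a buffer for one hypothetical list; the addresses, the example sysctl string and the locally defined IPOPT_* values are illustrative assumptions:

/* Build the IP_OPTIONS buffer for one LSRR gateway list, as in
 * mptcp_v4_add_lsrr().  Example sysctl string with two lists:
 *     "10.0.0.1,10.0.0.2-192.168.1.1"
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define IPOPT_NOP	1	/* assumed on-the-wire option values */
#define IPOPT_LSRR	131
#define IPOPT_MINOFF	4

int main(void)
{
	const char *gateways[] = { "10.0.0.1", "10.0.0.2" };	/* first list */
	const char *dest = "203.0.113.7";		/* final destination */
	unsigned char opt[40] = { 0 };
	struct in_addr a;
	int i, n = 2;

	opt[0] = IPOPT_NOP;
	opt[1] = IPOPT_LSRR;
	opt[2] = 4 * (n + 1) + 3;	/* gateways plus the final destination */
	opt[3] = IPOPT_MINOFF;

	for (i = 0; i < n; i++) {
		inet_pton(AF_INET, gateways[i], &a);
		memcpy(opt + 4 + 4 * i, &a.s_addr, 4);
	}
	inet_pton(AF_INET, dest, &a);	/* destination closes the source route */
	memcpy(opt + 4 + 4 * n, &a.s_addr, 4);

	for (i = 0; i < 4 + 4 * (n + 1); i++)
		printf("%02x%s", opt[i], (i % 4 == 3) ? "\n" : " ");
	return 0;
}

With the example string the parser would fill two gateway lists and set mptcp_binder_ndiffports to 3 (the two LSRR paths plus the master subflow), matching the k + 1 computation at the end of mptcp_parse_gateway_ipv4().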
7652 +diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c
7653 +new file mode 100644
7654 +index 000000000000..5d761164eb85
7655 +--- /dev/null
7656 ++++ b/net/mptcp/mptcp_coupled.c
7657 +@@ -0,0 +1,270 @@
7658 ++/*
7659 ++ * MPTCP implementation - Linked Increase congestion control Algorithm (LIA)
7660 ++ *
7661 ++ * Initial Design & Implementation:
7662 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
7663 ++ *
7664 ++ * Current Maintainer & Author:
7665 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
7666 ++ *
7667 ++ * Additional authors:
7668 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
7669 ++ * Gregory Detal <gregory.detal@×××××××××.be>
7670 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
7671 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
7672 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
7673 ++ * Andreas Ripke <ripke@××××××.eu>
7674 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
7675 ++ * Octavian Purdila <octavian.purdila@×××××.com>
7676 ++ * John Ronan <jronan@××××.org>
7677 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
7678 ++ * Brandon Heller <brandonh@××××××××.edu>
7679 ++ *
7680 ++ *
7681 ++ * This program is free software; you can redistribute it and/or
7682 ++ * modify it under the terms of the GNU General Public License
7683 ++ * as published by the Free Software Foundation; either version
7684 ++ * 2 of the License, or (at your option) any later version.
7685 ++ */
7686 ++#include <net/tcp.h>
7687 ++#include <net/mptcp.h>
7688 ++
7689 ++#include <linux/module.h>
7690 ++
7691 ++/* Scaling is done in the numerator with alpha_scale_num and in the denominator
7692 ++ * with alpha_scale_den.
7693 ++ *
7694 ++ * To downscale, we just need to use alpha_scale.
7695 ++ *
7696 ++ * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2)
7697 ++ */
7698 ++static int alpha_scale_den = 10;
7699 ++static int alpha_scale_num = 32;
7700 ++static int alpha_scale = 12;
7701 ++
7702 ++struct mptcp_ccc {
7703 ++ u64 alpha;
7704 ++ bool forced_update;
7705 ++};
7706 ++
7707 ++static inline int mptcp_ccc_sk_can_send(const struct sock *sk)
7708 ++{
7709 ++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
7710 ++}
7711 ++
7712 ++static inline u64 mptcp_get_alpha(const struct sock *meta_sk)
7713 ++{
7714 ++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha;
7715 ++}
7716 ++
7717 ++static inline void mptcp_set_alpha(const struct sock *meta_sk, u64 alpha)
7718 ++{
7719 ++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha = alpha;
7720 ++}
7721 ++
7722 ++static inline u64 mptcp_ccc_scale(u32 val, int scale)
7723 ++{
7724 ++ return (u64) val << scale;
7725 ++}
7726 ++
7727 ++static inline bool mptcp_get_forced(const struct sock *meta_sk)
7728 ++{
7729 ++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update;
7730 ++}
7731 ++
7732 ++static inline void mptcp_set_forced(const struct sock *meta_sk, bool force)
7733 ++{
7734 ++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update = force;
7735 ++}
7736 ++
7737 ++static void mptcp_ccc_recalc_alpha(const struct sock *sk)
7738 ++{
7739 ++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
7740 ++ const struct sock *sub_sk;
7741 ++ int best_cwnd = 0, best_rtt = 0, can_send = 0;
7742 ++ u64 max_numerator = 0, sum_denominator = 0, alpha = 1;
7743 ++
7744 ++ if (!mpcb)
7745 ++ return;
7746 ++
7747 ++ /* Only one subflow left - fall back to normal reno-behavior
7748 ++ * (set alpha to 1)
7749 ++ */
7750 ++ if (mpcb->cnt_established <= 1)
7751 ++ goto exit;
7752 ++
7753 ++ /* Do regular alpha-calculation for multiple subflows */
7754 ++
7755 ++ /* Find the max numerator of the alpha-calculation */
7756 ++ mptcp_for_each_sk(mpcb, sub_sk) {
7757 ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
7758 ++ u64 tmp;
7759 ++
7760 ++ if (!mptcp_ccc_sk_can_send(sub_sk))
7761 ++ continue;
7762 ++
7763 ++ can_send++;
7764 ++
7765 ++ /* We need to look for the path that provides the max value.
7766 ++ * Integer overflow is not possible here, because
7767 ++ * tmp is a u64.
7768 ++ */
7769 ++ tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd,
7770 ++ alpha_scale_num), (u64)sub_tp->srtt_us * sub_tp->srtt_us);
7771 ++
7772 ++ if (tmp >= max_numerator) {
7773 ++ max_numerator = tmp;
7774 ++ best_cwnd = sub_tp->snd_cwnd;
7775 ++ best_rtt = sub_tp->srtt_us;
7776 ++ }
7777 ++ }
7778 ++
7779 ++ /* No subflow is able to send - we don't care anymore */
7780 ++ if (unlikely(!can_send))
7781 ++ goto exit;
7782 ++
7783 ++ /* Calculate the denominator */
7784 ++ mptcp_for_each_sk(mpcb, sub_sk) {
7785 ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
7786 ++
7787 ++ if (!mptcp_ccc_sk_can_send(sub_sk))
7788 ++ continue;
7789 ++
7790 ++ sum_denominator += div_u64(
7791 ++ mptcp_ccc_scale(sub_tp->snd_cwnd,
7792 ++ alpha_scale_den) * best_rtt,
7793 ++ sub_tp->srtt_us);
7794 ++ }
7795 ++ sum_denominator *= sum_denominator;
7796 ++ if (unlikely(!sum_denominator)) {
7797 ++ pr_err("%s: sum_denominator == 0, cnt_established:%d\n",
7798 ++ __func__, mpcb->cnt_established);
7799 ++ mptcp_for_each_sk(mpcb, sub_sk) {
7800 ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
7801 ++ pr_err("%s: pi:%d, state:%d\n, rtt:%u, cwnd: %u",
7802 ++ __func__, sub_tp->mptcp->path_index,
7803 ++ sub_sk->sk_state, sub_tp->srtt_us,
7804 ++ sub_tp->snd_cwnd);
7805 ++ }
7806 ++ }
7807 ++
7808 ++ alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator);
7809 ++
7810 ++ if (unlikely(!alpha))
7811 ++ alpha = 1;
7812 ++
7813 ++exit:
7814 ++ mptcp_set_alpha(mptcp_meta_sk(sk), alpha);
7815 ++}
7816 ++
7817 ++static void mptcp_ccc_init(struct sock *sk)
7818 ++{
7819 ++ if (mptcp(tcp_sk(sk))) {
7820 ++ mptcp_set_forced(mptcp_meta_sk(sk), 0);
7821 ++ mptcp_set_alpha(mptcp_meta_sk(sk), 1);
7822 ++ }
7823 ++ /* If we are not doing MPTCP, behave like Reno: return */
7824 ++}
7825 ++
7826 ++static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event)
7827 ++{
7828 ++ if (event == CA_EVENT_LOSS)
7829 ++ mptcp_ccc_recalc_alpha(sk);
7830 ++}
7831 ++
7832 ++static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state)
7833 ++{
7834 ++ if (!mptcp(tcp_sk(sk)))
7835 ++ return;
7836 ++
7837 ++ mptcp_set_forced(mptcp_meta_sk(sk), 1);
7838 ++}
7839 ++
7840 ++static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked)
7841 ++{
7842 ++ struct tcp_sock *tp = tcp_sk(sk);
7843 ++ const struct mptcp_cb *mpcb = tp->mpcb;
7844 ++ int snd_cwnd;
7845 ++
7846 ++ if (!mptcp(tp)) {
7847 ++ tcp_reno_cong_avoid(sk, ack, acked);
7848 ++ return;
7849 ++ }
7850 ++
7851 ++ if (!tcp_is_cwnd_limited(sk))
7852 ++ return;
7853 ++
7854 ++ if (tp->snd_cwnd <= tp->snd_ssthresh) {
7855 ++ /* In "safe" area, increase. */
7856 ++ tcp_slow_start(tp, acked);
7857 ++ mptcp_ccc_recalc_alpha(sk);
7858 ++ return;
7859 ++ }
7860 ++
7861 ++ if (mptcp_get_forced(mptcp_meta_sk(sk))) {
7862 ++ mptcp_ccc_recalc_alpha(sk);
7863 ++ mptcp_set_forced(mptcp_meta_sk(sk), 0);
7864 ++ }
7865 ++
7866 ++ if (mpcb->cnt_established > 1) {
7867 ++ u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk));
7868 ++
7869 ++ /* This may happen if, at initialization time, the mpcb
7870 ++ * was not yet attached to the sock and initializing
7871 ++ * alpha therefore failed.
7872 ++ */
7873 ++ if (unlikely(!alpha))
7874 ++ alpha = 1;
7875 ++
7876 ++ snd_cwnd = (int) div_u64 ((u64) mptcp_ccc_scale(1, alpha_scale),
7877 ++ alpha);
7878 ++
7879 ++ /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd)
7880 ++ * Thus, we select here the max value.
7881 ++ */
7882 ++ if (snd_cwnd < tp->snd_cwnd)
7883 ++ snd_cwnd = tp->snd_cwnd;
7884 ++ } else {
7885 ++ snd_cwnd = tp->snd_cwnd;
7886 ++ }
7887 ++
7888 ++ if (tp->snd_cwnd_cnt >= snd_cwnd) {
7889 ++ if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
7890 ++ tp->snd_cwnd++;
7891 ++ mptcp_ccc_recalc_alpha(sk);
7892 ++ }
7893 ++
7894 ++ tp->snd_cwnd_cnt = 0;
7895 ++ } else {
7896 ++ tp->snd_cwnd_cnt++;
7897 ++ }
7898 ++}
7899 ++
7900 ++static struct tcp_congestion_ops mptcp_ccc = {
7901 ++ .init = mptcp_ccc_init,
7902 ++ .ssthresh = tcp_reno_ssthresh,
7903 ++ .cong_avoid = mptcp_ccc_cong_avoid,
7904 ++ .cwnd_event = mptcp_ccc_cwnd_event,
7905 ++ .set_state = mptcp_ccc_set_state,
7906 ++ .owner = THIS_MODULE,
7907 ++ .name = "lia",
7908 ++};
7909 ++
7910 ++static int __init mptcp_ccc_register(void)
7911 ++{
7912 ++ BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE);
7913 ++ return tcp_register_congestion_control(&mptcp_ccc);
7914 ++}
7915 ++
7916 ++static void __exit mptcp_ccc_unregister(void)
7917 ++{
7918 ++ tcp_unregister_congestion_control(&mptcp_ccc);
7919 ++}
7920 ++
7921 ++module_init(mptcp_ccc_register);
7922 ++module_exit(mptcp_ccc_unregister);
7923 ++
7924 ++MODULE_AUTHOR("Christoph Paasch, Sébastien Barré");
7925 ++MODULE_LICENSE("GPL");
7926 ++MODULE_DESCRIPTION("MPTCP LINKED INCREASE CONGESTION CONTROL ALGORITHM");
7927 ++MODULE_VERSION("0.1");
7928 +diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c
7929 +new file mode 100644
7930 +index 000000000000..28dfa0479f5e
7931 +--- /dev/null
7932 ++++ b/net/mptcp/mptcp_ctrl.c
7933 +@@ -0,0 +1,2401 @@
7934 ++/*
7935 ++ * MPTCP implementation - MPTCP-control
7936 ++ *
7937 ++ * Initial Design & Implementation:
7938 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
7939 ++ *
7940 ++ * Current Maintainer & Author:
7941 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
7942 ++ *
7943 ++ * Additional authors:
7944 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
7945 ++ * Gregory Detal <gregory.detal@×××××××××.be>
7946 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
7947 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
7948 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
7949 ++ * Andreas Ripke <ripke@××××××.eu>
7950 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
7951 ++ * Octavian Purdila <octavian.purdila@×××××.com>
7952 ++ * John Ronan <jronan@××××.org>
7953 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
7954 ++ * Brandon Heller <brandonh@××××××××.edu>
7955 ++ *
7956 ++ *
7957 ++ * This program is free software; you can redistribute it and/or
7958 ++ * modify it under the terms of the GNU General Public License
7959 ++ * as published by the Free Software Foundation; either version
7960 ++ * 2 of the License, or (at your option) any later version.
7961 ++ */
7962 ++
7963 ++#include <net/inet_common.h>
7964 ++#include <net/inet6_hashtables.h>
7965 ++#include <net/ipv6.h>
7966 ++#include <net/ip6_checksum.h>
7967 ++#include <net/mptcp.h>
7968 ++#include <net/mptcp_v4.h>
7969 ++#if IS_ENABLED(CONFIG_IPV6)
7970 ++#include <net/ip6_route.h>
7971 ++#include <net/mptcp_v6.h>
7972 ++#endif
7973 ++#include <net/sock.h>
7974 ++#include <net/tcp.h>
7975 ++#include <net/tcp_states.h>
7976 ++#include <net/transp_v6.h>
7977 ++#include <net/xfrm.h>
7978 ++
7979 ++#include <linux/cryptohash.h>
7980 ++#include <linux/kconfig.h>
7981 ++#include <linux/module.h>
7982 ++#include <linux/netpoll.h>
7983 ++#include <linux/list.h>
7984 ++#include <linux/jhash.h>
7985 ++#include <linux/tcp.h>
7986 ++#include <linux/net.h>
7987 ++#include <linux/in.h>
7988 ++#include <linux/random.h>
7989 ++#include <linux/inetdevice.h>
7990 ++#include <linux/workqueue.h>
7991 ++#include <linux/atomic.h>
7992 ++#include <linux/sysctl.h>
7993 ++
7994 ++static struct kmem_cache *mptcp_sock_cache __read_mostly;
7995 ++static struct kmem_cache *mptcp_cb_cache __read_mostly;
7996 ++static struct kmem_cache *mptcp_tw_cache __read_mostly;
7997 ++
7998 ++int sysctl_mptcp_enabled __read_mostly = 1;
7999 ++int sysctl_mptcp_checksum __read_mostly = 1;
8000 ++int sysctl_mptcp_debug __read_mostly;
8001 ++EXPORT_SYMBOL(sysctl_mptcp_debug);
8002 ++int sysctl_mptcp_syn_retries __read_mostly = 3;
8003 ++
8004 ++bool mptcp_init_failed __read_mostly;
8005 ++
8006 ++struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE;
8007 ++EXPORT_SYMBOL(mptcp_static_key);
8008 ++
8009 ++static int proc_mptcp_path_manager(ctl_table *ctl, int write,
8010 ++ void __user *buffer, size_t *lenp,
8011 ++ loff_t *ppos)
8012 ++{
8013 ++ char val[MPTCP_PM_NAME_MAX];
8014 ++ ctl_table tbl = {
8015 ++ .data = val,
8016 ++ .maxlen = MPTCP_PM_NAME_MAX,
8017 ++ };
8018 ++ int ret;
8019 ++
8020 ++ mptcp_get_default_path_manager(val);
8021 ++
8022 ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
8023 ++ if (write && ret == 0)
8024 ++ ret = mptcp_set_default_path_manager(val);
8025 ++ return ret;
8026 ++}
8027 ++
8028 ++static int proc_mptcp_scheduler(ctl_table *ctl, int write,
8029 ++ void __user *buffer, size_t *lenp,
8030 ++ loff_t *ppos)
8031 ++{
8032 ++ char val[MPTCP_SCHED_NAME_MAX];
8033 ++ ctl_table tbl = {
8034 ++ .data = val,
8035 ++ .maxlen = MPTCP_SCHED_NAME_MAX,
8036 ++ };
8037 ++ int ret;
8038 ++
8039 ++ mptcp_get_default_scheduler(val);
8040 ++
8041 ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
8042 ++ if (write && ret == 0)
8043 ++ ret = mptcp_set_default_scheduler(val);
8044 ++ return ret;
8045 ++}
8046 ++
8047 ++static struct ctl_table mptcp_table[] = {
8048 ++ {
8049 ++ .procname = "mptcp_enabled",
8050 ++ .data = &sysctl_mptcp_enabled,
8051 ++ .maxlen = sizeof(int),
8052 ++ .mode = 0644,
8053 ++ .proc_handler = &proc_dointvec
8054 ++ },
8055 ++ {
8056 ++ .procname = "mptcp_checksum",
8057 ++ .data = &sysctl_mptcp_checksum,
8058 ++ .maxlen = sizeof(int),
8059 ++ .mode = 0644,
8060 ++ .proc_handler = &proc_dointvec
8061 ++ },
8062 ++ {
8063 ++ .procname = "mptcp_debug",
8064 ++ .data = &sysctl_mptcp_debug,
8065 ++ .maxlen = sizeof(int),
8066 ++ .mode = 0644,
8067 ++ .proc_handler = &proc_dointvec
8068 ++ },
8069 ++ {
8070 ++ .procname = "mptcp_syn_retries",
8071 ++ .data = &sysctl_mptcp_syn_retries,
8072 ++ .maxlen = sizeof(int),
8073 ++ .mode = 0644,
8074 ++ .proc_handler = &proc_dointvec
8075 ++ },
8076 ++ {
8077 ++ .procname = "mptcp_path_manager",
8078 ++ .mode = 0644,
8079 ++ .maxlen = MPTCP_PM_NAME_MAX,
8080 ++ .proc_handler = proc_mptcp_path_manager,
8081 ++ },
8082 ++ {
8083 ++ .procname = "mptcp_scheduler",
8084 ++ .mode = 0644,
8085 ++ .maxlen = MPTCP_SCHED_NAME_MAX,
8086 ++ .proc_handler = proc_mptcp_scheduler,
8087 ++ },
8088 ++ { }
8089 ++};
8090 ++
8091 ++static inline u32 mptcp_hash_tk(u32 token)
8092 ++{
8093 ++ return token % MPTCP_HASH_SIZE;
8094 ++}
8095 ++
8096 ++struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
8097 ++EXPORT_SYMBOL(tk_hashtable);
8098 ++
8099 ++/* This second hashtable is needed to retrieve request socks
8100 ++ * created as a result of a join request. While the SYN contains
8101 ++ * the token, the final ack does not, so we need a separate hashtable
8102 ++ * to retrieve the mpcb.
8103 ++ */
8104 ++struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
8105 ++spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
8106 ++
8107 ++/* The following hash table is used to avoid collision of token */
8108 ++static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE];
8109 ++spinlock_t mptcp_tk_hashlock; /* hashtable protection */
8110 ++
8111 ++static bool mptcp_reqsk_find_tk(const u32 token)
8112 ++{
8113 ++ const u32 hash = mptcp_hash_tk(token);
8114 ++ const struct mptcp_request_sock *mtreqsk;
8115 ++ const struct hlist_nulls_node *node;
8116 ++
8117 ++begin:
8118 ++ hlist_nulls_for_each_entry_rcu(mtreqsk, node,
8119 ++ &mptcp_reqsk_tk_htb[hash], hash_entry) {
8120 ++ if (token == mtreqsk->mptcp_loc_token)
8121 ++ return true;
8122 ++ }
8123 ++ /* A request-socket is destroyed by RCU. So, it might have been recycled
8124 ++ * and put into another hash-table list. So, after the lookup we may
8125 ++ * end up in a different list. So, we may need to restart.
8126 ++ *
8127 ++ * See also the comment in __inet_lookup_established.
8128 ++ */
8129 ++ if (get_nulls_value(node) != hash)
8130 ++ goto begin;
8131 ++ return false;
8132 ++}
8133 ++
8134 ++static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, const u32 token)
8135 ++{
8136 ++ u32 hash = mptcp_hash_tk(token);
8137 ++
8138 ++ hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->hash_entry,
8139 ++ &mptcp_reqsk_tk_htb[hash]);
8140 ++}
8141 ++
8142 ++static void mptcp_reqsk_remove_tk(const struct request_sock *reqsk)
8143 ++{
8144 ++ rcu_read_lock();
8145 ++ spin_lock(&mptcp_tk_hashlock);
8146 ++ hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->hash_entry);
8147 ++ spin_unlock(&mptcp_tk_hashlock);
8148 ++ rcu_read_unlock();
8149 ++}
8150 ++
8151 ++void mptcp_reqsk_destructor(struct request_sock *req)
8152 ++{
8153 ++ if (!mptcp_rsk(req)->is_sub) {
8154 ++ if (in_softirq()) {
8155 ++ mptcp_reqsk_remove_tk(req);
8156 ++ } else {
8157 ++ rcu_read_lock_bh();
8158 ++ spin_lock(&mptcp_tk_hashlock);
8159 ++ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry);
8160 ++ spin_unlock(&mptcp_tk_hashlock);
8161 ++ rcu_read_unlock_bh();
8162 ++ }
8163 ++ } else {
8164 ++ mptcp_hash_request_remove(req);
8165 ++ }
8166 ++}
8167 ++
8168 ++static void __mptcp_hash_insert(struct tcp_sock *meta_tp, const u32 token)
8169 ++{
8170 ++ u32 hash = mptcp_hash_tk(token);
8171 ++ hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]);
8172 ++ meta_tp->inside_tk_table = 1;
8173 ++}
8174 ++
8175 ++static bool mptcp_find_token(u32 token)
8176 ++{
8177 ++ const u32 hash = mptcp_hash_tk(token);
8178 ++ const struct tcp_sock *meta_tp;
8179 ++ const struct hlist_nulls_node *node;
8180 ++
8181 ++begin:
8182 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) {
8183 ++ if (token == meta_tp->mptcp_loc_token)
8184 ++ return true;
8185 ++ }
8186 ++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled
8187 ++ * and put into another hash-table list. So, after the lookup we may
8188 ++ * end up in a different list. So, we may need to restart.
8189 ++ *
8190 ++ * See also the comment in __inet_lookup_established.
8191 ++ */
8192 ++ if (get_nulls_value(node) != hash)
8193 ++ goto begin;
8194 ++ return false;
8195 ++}
8196 ++
8197 ++static void mptcp_set_key_reqsk(struct request_sock *req,
8198 ++ const struct sk_buff *skb)
8199 ++{
8200 ++ const struct inet_request_sock *ireq = inet_rsk(req);
8201 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
8202 ++
8203 ++ if (skb->protocol == htons(ETH_P_IP)) {
8204 ++ mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr,
8205 ++ ip_hdr(skb)->daddr,
8206 ++ htons(ireq->ir_num),
8207 ++ ireq->ir_rmt_port);
8208 ++#if IS_ENABLED(CONFIG_IPV6)
8209 ++ } else {
8210 ++ mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32,
8211 ++ ipv6_hdr(skb)->daddr.s6_addr32,
8212 ++ htons(ireq->ir_num),
8213 ++ ireq->ir_rmt_port);
8214 ++#endif
8215 ++ }
8216 ++
8217 ++ mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
8218 ++}
8219 ++
8220 ++/* New MPTCP-connection request, prepare a new token for the meta-socket that
8221 ++ * will be created in mptcp_check_req_master(), and store the received token.
8222 ++ */
8223 ++void mptcp_reqsk_new_mptcp(struct request_sock *req,
8224 ++ const struct mptcp_options_received *mopt,
8225 ++ const struct sk_buff *skb)
8226 ++{
8227 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
8228 ++
8229 ++ inet_rsk(req)->saw_mpc = 1;
8230 ++
8231 ++ rcu_read_lock();
8232 ++ spin_lock(&mptcp_tk_hashlock);
8233 ++ do {
8234 ++ mptcp_set_key_reqsk(req, skb);
8235 ++ } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
8236 ++ mptcp_find_token(mtreq->mptcp_loc_token));
8237 ++
8238 ++ mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token);
8239 ++ spin_unlock(&mptcp_tk_hashlock);
8240 ++ rcu_read_unlock();
8241 ++ mtreq->mptcp_rem_key = mopt->mptcp_key;
8242 ++}
8243 ++
8244 ++static void mptcp_set_key_sk(const struct sock *sk)
8245 ++{
8246 ++ struct tcp_sock *tp = tcp_sk(sk);
8247 ++ const struct inet_sock *isk = inet_sk(sk);
8248 ++
8249 ++ if (sk->sk_family == AF_INET)
8250 ++ tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr,
8251 ++ isk->inet_daddr,
8252 ++ isk->inet_sport,
8253 ++ isk->inet_dport);
8254 ++#if IS_ENABLED(CONFIG_IPV6)
8255 ++ else
8256 ++ tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32,
8257 ++ sk->sk_v6_daddr.s6_addr32,
8258 ++ isk->inet_sport,
8259 ++ isk->inet_dport);
8260 ++#endif
8261 ++
8262 ++ mptcp_key_sha1(tp->mptcp_loc_key,
8263 ++ &tp->mptcp_loc_token, NULL);
8264 ++}
8265 ++
8266 ++void mptcp_connect_init(struct sock *sk)
8267 ++{
8268 ++ struct tcp_sock *tp = tcp_sk(sk);
8269 ++
8270 ++ rcu_read_lock_bh();
8271 ++ spin_lock(&mptcp_tk_hashlock);
8272 ++ do {
8273 ++ mptcp_set_key_sk(sk);
8274 ++ } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) ||
8275 ++ mptcp_find_token(tp->mptcp_loc_token));
8276 ++
8277 ++ __mptcp_hash_insert(tp, tp->mptcp_loc_token);
8278 ++ spin_unlock(&mptcp_tk_hashlock);
8279 ++ rcu_read_unlock_bh();
8280 ++}
8281 ++
8282 ++/**
8283 ++ * This function increments the refcount of the mpcb struct.
8284 ++ * It is the responsibility of the caller to decrement when releasing
8285 ++ * the structure.
8286 ++ */
8287 ++struct sock *mptcp_hash_find(const struct net *net, const u32 token)
8288 ++{
8289 ++ const u32 hash = mptcp_hash_tk(token);
8290 ++ const struct tcp_sock *meta_tp;
8291 ++ struct sock *meta_sk = NULL;
8292 ++ const struct hlist_nulls_node *node;
8293 ++
8294 ++ rcu_read_lock();
8295 ++begin:
8296 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash],
8297 ++ tk_table) {
8298 ++ meta_sk = (struct sock *)meta_tp;
8299 ++ if (token == meta_tp->mptcp_loc_token &&
8300 ++ net_eq(net, sock_net(meta_sk))) {
8301 ++ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
8302 ++ goto out;
8303 ++ if (unlikely(token != meta_tp->mptcp_loc_token ||
8304 ++ !net_eq(net, sock_net(meta_sk)))) {
8305 ++ sock_gen_put(meta_sk);
8306 ++ goto begin;
8307 ++ }
8308 ++ goto found;
8309 ++ }
8310 ++ }
8311 ++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled
8312 ++ * and put into another hash-table list. So, after the lookup we may
8313 ++ * end up in a different list. So, we may need to restart.
8314 ++ *
8315 ++ * See also the comment in __inet_lookup_established.
8316 ++ */
8317 ++ if (get_nulls_value(node) != hash)
8318 ++ goto begin;
8319 ++out:
8320 ++ meta_sk = NULL;
8321 ++found:
8322 ++ rcu_read_unlock();
8323 ++ return meta_sk;
8324 ++}
8325 ++
8326 ++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp)
8327 ++{
8328 ++ /* remove from the token hashtable */
8329 ++ rcu_read_lock_bh();
8330 ++ spin_lock(&mptcp_tk_hashlock);
8331 ++ hlist_nulls_del_init_rcu(&meta_tp->tk_table);
8332 ++ meta_tp->inside_tk_table = 0;
8333 ++ spin_unlock(&mptcp_tk_hashlock);
8334 ++ rcu_read_unlock_bh();
8335 ++}
8336 ++
8337 ++void mptcp_hash_remove(struct tcp_sock *meta_tp)
8338 ++{
8339 ++ rcu_read_lock();
8340 ++ spin_lock(&mptcp_tk_hashlock);
8341 ++ hlist_nulls_del_init_rcu(&meta_tp->tk_table);
8342 ++ meta_tp->inside_tk_table = 0;
8343 ++ spin_unlock(&mptcp_tk_hashlock);
8344 ++ rcu_read_unlock();
8345 ++}
8346 ++
8347 ++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk)
8348 ++{
8349 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
8350 ++ struct sock *sk, *rttsk = NULL, *lastsk = NULL;
8351 ++ u32 min_time = 0, last_active = 0;
8352 ++
8353 ++ mptcp_for_each_sk(meta_tp->mpcb, sk) {
8354 ++ struct tcp_sock *tp = tcp_sk(sk);
8355 ++ u32 elapsed;
8356 ++
8357 ++ if (!mptcp_sk_can_send_ack(sk) || tp->pf)
8358 ++ continue;
8359 ++
8360 ++ elapsed = keepalive_time_elapsed(tp);
8361 ++
8362 ++ /* We take the one with the lowest RTT within a reasonable
8363 ++ * (meta-RTO)-timeframe
8364 ++ */
8365 ++ if (elapsed < inet_csk(meta_sk)->icsk_rto) {
8366 ++ if (!min_time || tp->srtt_us < min_time) {
8367 ++ min_time = tp->srtt_us;
8368 ++ rttsk = sk;
8369 ++ }
8370 ++ continue;
8371 ++ }
8372 ++
8373 ++ /* Otherwise, we just take the most recent active */
8374 ++ if (!rttsk && (!last_active || elapsed < last_active)) {
8375 ++ last_active = elapsed;
8376 ++ lastsk = sk;
8377 ++ }
8378 ++ }
8379 ++
8380 ++ if (rttsk)
8381 ++ return rttsk;
8382 ++
8383 ++ return lastsk;
8384 ++}
8385 ++EXPORT_SYMBOL(mptcp_select_ack_sock);
8386 ++
8387 ++static void mptcp_sock_def_error_report(struct sock *sk)
8388 ++{
8389 ++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
8390 ++
8391 ++ if (!sock_flag(sk, SOCK_DEAD))
8392 ++ mptcp_sub_close(sk, 0);
8393 ++
8394 ++ if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd ||
8395 ++ mpcb->send_infinite_mapping) {
8396 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
8397 ++
8398 ++ meta_sk->sk_err = sk->sk_err;
8399 ++ meta_sk->sk_err_soft = sk->sk_err_soft;
8400 ++
8401 ++ if (!sock_flag(meta_sk, SOCK_DEAD))
8402 ++ meta_sk->sk_error_report(meta_sk);
8403 ++
8404 ++ tcp_done(meta_sk);
8405 ++ }
8406 ++
8407 ++ sk->sk_err = 0;
8408 ++ return;
8409 ++}
8410 ++
8411 ++static void mptcp_mpcb_put(struct mptcp_cb *mpcb)
8412 ++{
8413 ++ if (atomic_dec_and_test(&mpcb->mpcb_refcnt)) {
8414 ++ mptcp_cleanup_path_manager(mpcb);
8415 ++ mptcp_cleanup_scheduler(mpcb);
8416 ++ kmem_cache_free(mptcp_cb_cache, mpcb);
8417 ++ }
8418 ++}
8419 ++
8420 ++static void mptcp_sock_destruct(struct sock *sk)
8421 ++{
8422 ++ struct tcp_sock *tp = tcp_sk(sk);
8423 ++
8424 ++ inet_sock_destruct(sk);
8425 ++
8426 ++ if (!is_meta_sk(sk) && !tp->was_meta_sk) {
8427 ++ BUG_ON(!hlist_unhashed(&tp->mptcp->cb_list));
8428 ++
8429 ++ kmem_cache_free(mptcp_sock_cache, tp->mptcp);
8430 ++ tp->mptcp = NULL;
8431 ++
8432 ++ /* Taken when mpcb pointer was set */
8433 ++ sock_put(mptcp_meta_sk(sk));
8434 ++ mptcp_mpcb_put(tp->mpcb);
8435 ++ } else {
8436 ++ struct mptcp_cb *mpcb = tp->mpcb;
8437 ++ struct mptcp_tw *mptw;
8438 ++
8439 ++ /* The mpcb is disappearing - we can make the final
8440 ++ * update to the rcv_nxt of the time-wait-sock and remove
8441 ++ * its reference to the mpcb.
8442 ++ */
8443 ++ spin_lock_bh(&mpcb->tw_lock);
8444 ++ list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) {
8445 ++ list_del_rcu(&mptw->list);
8446 ++ mptw->in_list = 0;
8447 ++ mptcp_mpcb_put(mpcb);
8448 ++ rcu_assign_pointer(mptw->mpcb, NULL);
8449 ++ }
8450 ++ spin_unlock_bh(&mpcb->tw_lock);
8451 ++
8452 ++ mptcp_mpcb_put(mpcb);
8453 ++
8454 ++ mptcp_debug("%s destroying meta-sk\n", __func__);
8455 ++ }
8456 ++
8457 ++ WARN_ON(!static_key_false(&mptcp_static_key));
8458 ++ /* Must be the last call, because is_meta_sk() above still needs the
8459 ++ * static key
8460 ++ */
8461 ++ static_key_slow_dec(&mptcp_static_key);
8462 ++}
8463 ++
8464 ++void mptcp_destroy_sock(struct sock *sk)
8465 ++{
8466 ++ if (is_meta_sk(sk)) {
8467 ++ struct sock *sk_it, *tmpsk;
8468 ++
8469 ++ __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue);
8470 ++ mptcp_purge_ofo_queue(tcp_sk(sk));
8471 ++
8472 ++ /* We have to close all remaining subflows. Normally, they
8473 ++ * should all be about to get closed. But, if the kernel is
8474 ++ * forcing a closure (e.g., tcp_write_err), the subflows might
8475 ++ * not have been closed properly (as we are waiting for the
8476 ++ * DATA_ACK of the DATA_FIN).
8477 ++ */
8478 ++ mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) {
8479 ++			/* tcp_close has already been called - waiting for graceful
8480 ++			 * closure, or we are retransmitting fast-close on
8481 ++			 * the subflow. The reset (or timeout) will kill the
8482 ++			 * subflow.
8483 ++ */
8484 ++ if (tcp_sk(sk_it)->closing ||
8485 ++ tcp_sk(sk_it)->send_mp_fclose)
8486 ++ continue;
8487 ++
8488 ++			/* Let the delayed work run first, to prevent the time-wait state */
8489 ++ if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work))
8490 ++ continue;
8491 ++
8492 ++ mptcp_sub_close(sk_it, 0);
8493 ++ }
8494 ++
8495 ++ mptcp_delete_synack_timer(sk);
8496 ++ } else {
8497 ++ mptcp_del_sock(sk);
8498 ++ }
8499 ++}
8500 ++
8501 ++static void mptcp_set_state(struct sock *sk)
8502 ++{
8503 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
8504 ++
8505 ++ /* Meta is not yet established - wake up the application */
8506 ++ if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) &&
8507 ++ sk->sk_state == TCP_ESTABLISHED) {
8508 ++ tcp_set_state(meta_sk, TCP_ESTABLISHED);
8509 ++
8510 ++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
8511 ++ meta_sk->sk_state_change(meta_sk);
8512 ++ sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT);
8513 ++ }
8514 ++ }
8515 ++
8516 ++ if (sk->sk_state == TCP_ESTABLISHED) {
8517 ++ tcp_sk(sk)->mptcp->establish_increased = 1;
8518 ++ tcp_sk(sk)->mpcb->cnt_established++;
8519 ++ }
8520 ++}
8521 ++
8522 ++void mptcp_init_congestion_control(struct sock *sk)
8523 ++{
8524 ++ struct inet_connection_sock *icsk = inet_csk(sk);
8525 ++ struct inet_connection_sock *meta_icsk = inet_csk(mptcp_meta_sk(sk));
8526 ++ const struct tcp_congestion_ops *ca = meta_icsk->icsk_ca_ops;
8527 ++
8528 ++	/* The application didn't set the congestion control to use -
8529 ++	 * fall back to the default one.
8530 ++ */
8531 ++ if (ca == &tcp_init_congestion_ops)
8532 ++ goto use_default;
8533 ++
8534 ++ /* Use the same congestion control as set by the user. If the
8535 ++	 * module is not available, fall back to the default one.
8536 ++ */
8537 ++ if (!try_module_get(ca->owner)) {
8538 ++ pr_warn("%s: fallback to the system default CC\n", __func__);
8539 ++ goto use_default;
8540 ++ }
8541 ++
8542 ++ icsk->icsk_ca_ops = ca;
8543 ++ if (icsk->icsk_ca_ops->init)
8544 ++ icsk->icsk_ca_ops->init(sk);
8545 ++
8546 ++ return;
8547 ++
8548 ++use_default:
8549 ++ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
8550 ++ tcp_init_congestion_control(sk);
8551 ++}
8552 ++
8553 ++u32 mptcp_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned;
8554 ++u32 mptcp_seed = 0;
8555 ++
8556 ++void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn)
8557 ++{
8558 ++ u32 workspace[SHA_WORKSPACE_WORDS];
8559 ++ u32 mptcp_hashed_key[SHA_DIGEST_WORDS];
8560 ++ u8 input[64];
8561 ++ int i;
8562 ++
8563 ++ memset(workspace, 0, sizeof(workspace));
8564 ++
8565 ++ /* Initialize input with appropriate padding */
8566 ++ memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte
8567 ++ * is explicitly set too
8568 ++ */
8569 ++ memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */
8570 ++ input[8] = 0x80; /* Padding: First bit after message = 1 */
8571 ++ input[63] = 0x40; /* Padding: Length of the message = 64 bits */
8572 ++
8573 ++ sha_init(mptcp_hashed_key);
8574 ++ sha_transform(mptcp_hashed_key, input, workspace);
8575 ++
8576 ++ for (i = 0; i < 5; i++)
8577 ++ mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]);
8578 ++
8579 ++ if (token)
8580 ++ *token = mptcp_hashed_key[0];
8581 ++ if (idsn)
8582 ++ *idsn = *((u64 *)&mptcp_hashed_key[3]);
8583 ++}
8584 ++
8585 ++void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
8586 ++ u32 *hash_out)
8587 ++{
8588 ++ u32 workspace[SHA_WORKSPACE_WORDS];
8589 ++ u8 input[128]; /* 2 512-bit blocks */
8590 ++ int i;
8591 ++
8592 ++ memset(workspace, 0, sizeof(workspace));
8593 ++
8594 ++ /* Generate key xored with ipad */
8595 ++ memset(input, 0x36, 64);
8596 ++ for (i = 0; i < 8; i++)
8597 ++ input[i] ^= key_1[i];
8598 ++ for (i = 0; i < 8; i++)
8599 ++ input[i + 8] ^= key_2[i];
8600 ++
8601 ++ memcpy(&input[64], rand_1, 4);
8602 ++ memcpy(&input[68], rand_2, 4);
8603 ++ input[72] = 0x80; /* Padding: First bit after message = 1 */
8604 ++ memset(&input[73], 0, 53);
8605 ++
8606 ++ /* Padding: Length of the message = 512 + 64 bits */
8607 ++ input[126] = 0x02;
8608 ++ input[127] = 0x40;
8609 ++
8610 ++ sha_init(hash_out);
8611 ++ sha_transform(hash_out, input, workspace);
8612 ++ memset(workspace, 0, sizeof(workspace));
8613 ++
8614 ++ sha_transform(hash_out, &input[64], workspace);
8615 ++ memset(workspace, 0, sizeof(workspace));
8616 ++
8617 ++ for (i = 0; i < 5; i++)
8618 ++ hash_out[i] = cpu_to_be32(hash_out[i]);
8619 ++
8620 ++ /* Prepare second part of hmac */
8621 ++ memset(input, 0x5C, 64);
8622 ++ for (i = 0; i < 8; i++)
8623 ++ input[i] ^= key_1[i];
8624 ++ for (i = 0; i < 8; i++)
8625 ++ input[i + 8] ^= key_2[i];
8626 ++
8627 ++ memcpy(&input[64], hash_out, 20);
8628 ++ input[84] = 0x80;
8629 ++ memset(&input[85], 0, 41);
8630 ++
8631 ++ /* Padding: Length of the message = 512 + 160 bits */
8632 ++ input[126] = 0x02;
8633 ++ input[127] = 0xA0;
8634 ++
8635 ++ sha_init(hash_out);
8636 ++ sha_transform(hash_out, input, workspace);
8637 ++ memset(workspace, 0, sizeof(workspace));
8638 ++
8639 ++ sha_transform(hash_out, &input[64], workspace);
8640 ++
8641 ++ for (i = 0; i < 5; i++)
8642 ++ hash_out[i] = cpu_to_be32(hash_out[i]);
8643 ++}
8644 ++
8645 ++static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk)
8646 ++{
8647 ++ /* Socket-options handled by sk_clone_lock while creating the meta-sk.
8648 ++ * ======
8649 ++ * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT,
8650 ++ * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER,
8651 ++ * TCP_NODELAY, TCP_CORK
8652 ++ *
8653 ++ * Socket-options handled in this function here
8654 ++ * ======
8655 ++ * TCP_DEFER_ACCEPT
8656 ++ * SO_KEEPALIVE
8657 ++ *
8658 ++ * Socket-options on the todo-list
8659 ++ * ======
8660 ++ * SO_BINDTODEVICE - should probably prevent creation of new subsocks
8661 ++ * across other devices. - what about the api-draft?
8662 ++ * SO_DEBUG
8663 ++ * SO_REUSEADDR - probably we don't care about this
8664 ++ * SO_DONTROUTE, SO_BROADCAST
8665 ++ * SO_OOBINLINE
8666 ++ * SO_LINGER
8667 ++ * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM
8668 ++ * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM
8669 ++ * SO_RXQ_OVFL
8670 ++ * TCP_COOKIE_TRANSACTIONS
8671 ++ * TCP_MAXSEG
8672 ++ * TCP_THIN_* - Handled by sk_clone_lock, but we need to support this
8673 ++ * in mptcp_retransmit_timer. AND we need to check what is
8674 ++ * about the subsockets.
8675 ++ * TCP_LINGER2
8676 ++ * TCP_WINDOW_CLAMP
8677 ++ * TCP_USER_TIMEOUT
8678 ++ * TCP_MD5SIG
8679 ++ *
8680 ++ * Socket-options of no concern for the meta-socket (but for the subsocket)
8681 ++ * ======
8682 ++ * SO_PRIORITY
8683 ++ * SO_MARK
8684 ++ * TCP_CONGESTION
8685 ++ * TCP_SYNCNT
8686 ++ * TCP_QUICKACK
8687 ++ */
8688 ++
8689 ++ /* DEFER_ACCEPT should not be set on the meta, as we want to accept new subflows directly */
8690 ++ inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0;
8691 ++
8692 ++ /* Keepalives are handled entirely at the MPTCP-layer */
8693 ++ if (sock_flag(meta_sk, SOCK_KEEPOPEN)) {
8694 ++ inet_csk_reset_keepalive_timer(meta_sk,
8695 ++ keepalive_time_when(tcp_sk(meta_sk)));
8696 ++ sock_reset_flag(master_sk, SOCK_KEEPOPEN);
8697 ++ inet_csk_delete_keepalive_timer(master_sk);
8698 ++ }
8699 ++
8700 ++ /* Do not propagate subflow-errors up to the MPTCP-layer */
8701 ++ inet_sk(master_sk)->recverr = 0;
8702 ++}
8703 ++
8704 ++static void mptcp_sub_inherit_sockopts(const struct sock *meta_sk, struct sock *sub_sk)
8705 ++{
8706 ++ /* IP_TOS also goes to the subflow. */
8707 ++ if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) {
8708 ++ inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos;
8709 ++ sub_sk->sk_priority = meta_sk->sk_priority;
8710 ++ sk_dst_reset(sub_sk);
8711 ++ }
8712 ++
8713 ++ /* Inherit SO_REUSEADDR */
8714 ++ sub_sk->sk_reuse = meta_sk->sk_reuse;
8715 ++
8716 ++ /* Inherit snd/rcv-buffer locks */
8717 ++ sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
8718 ++
8719 ++ /* Nagle/Cork is forced off on the subflows. It is handled at the meta-layer */
8720 ++ tcp_sk(sub_sk)->nonagle = TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
8721 ++
8722 ++ /* Keepalives are handled entirely at the MPTCP-layer */
8723 ++ if (sock_flag(sub_sk, SOCK_KEEPOPEN)) {
8724 ++ sock_reset_flag(sub_sk, SOCK_KEEPOPEN);
8725 ++ inet_csk_delete_keepalive_timer(sub_sk);
8726 ++ }
8727 ++
8728 ++ /* Do not propagate subflow-errors up to the MPTCP-layer */
8729 ++ inet_sk(sub_sk)->recverr = 0;
8730 ++}
8731 ++
8732 ++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb)
8733 ++{
8734 ++	/* skb->sk may be NULL if we receive a packet immediately after the
8735 ++ * SYN/ACK + MP_CAPABLE.
8736 ++ */
8737 ++ struct sock *sk = skb->sk ? skb->sk : meta_sk;
8738 ++ int ret = 0;
8739 ++
8740 ++ skb->sk = NULL;
8741 ++
8742 ++ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
8743 ++ kfree_skb(skb);
8744 ++ return 0;
8745 ++ }
8746 ++
8747 ++ if (sk->sk_family == AF_INET)
8748 ++ ret = tcp_v4_do_rcv(sk, skb);
8749 ++#if IS_ENABLED(CONFIG_IPV6)
8750 ++ else
8751 ++ ret = tcp_v6_do_rcv(sk, skb);
8752 ++#endif
8753 ++
8754 ++ sock_put(sk);
8755 ++ return ret;
8756 ++}
8757 ++
8758 ++struct lock_class_key meta_key;
8759 ++struct lock_class_key meta_slock_key;
8760 ++
8761 ++static void mptcp_synack_timer_handler(unsigned long data)
8762 ++{
8763 ++ struct sock *meta_sk = (struct sock *) data;
8764 ++ struct listen_sock *lopt = inet_csk(meta_sk)->icsk_accept_queue.listen_opt;
8765 ++
8766 ++ /* Only process if socket is not in use. */
8767 ++ bh_lock_sock(meta_sk);
8768 ++
8769 ++ if (sock_owned_by_user(meta_sk)) {
8770 ++ /* Try again later. */
8771 ++ mptcp_reset_synack_timer(meta_sk, HZ/20);
8772 ++ goto out;
8773 ++ }
8774 ++
8775 ++	/* May happen if the queue got destroyed in mptcp_close */
8776 ++ if (!lopt)
8777 ++ goto out;
8778 ++
8779 ++ inet_csk_reqsk_queue_prune(meta_sk, TCP_SYNQ_INTERVAL,
8780 ++ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
8781 ++
8782 ++ if (lopt->qlen)
8783 ++ mptcp_reset_synack_timer(meta_sk, TCP_SYNQ_INTERVAL);
8784 ++
8785 ++out:
8786 ++ bh_unlock_sock(meta_sk);
8787 ++ sock_put(meta_sk);
8788 ++}
8789 ++
8790 ++static const struct tcp_sock_ops mptcp_meta_specific = {
8791 ++ .__select_window = __mptcp_select_window,
8792 ++ .select_window = mptcp_select_window,
8793 ++ .select_initial_window = mptcp_select_initial_window,
8794 ++ .init_buffer_space = mptcp_init_buffer_space,
8795 ++ .set_rto = mptcp_tcp_set_rto,
8796 ++ .should_expand_sndbuf = mptcp_should_expand_sndbuf,
8797 ++ .init_congestion_control = mptcp_init_congestion_control,
8798 ++ .send_fin = mptcp_send_fin,
8799 ++ .write_xmit = mptcp_write_xmit,
8800 ++ .send_active_reset = mptcp_send_active_reset,
8801 ++ .write_wakeup = mptcp_write_wakeup,
8802 ++ .prune_ofo_queue = mptcp_prune_ofo_queue,
8803 ++ .retransmit_timer = mptcp_retransmit_timer,
8804 ++ .time_wait = mptcp_time_wait,
8805 ++ .cleanup_rbuf = mptcp_cleanup_rbuf,
8806 ++};
8807 ++
8808 ++static const struct tcp_sock_ops mptcp_sub_specific = {
8809 ++ .__select_window = __mptcp_select_window,
8810 ++ .select_window = mptcp_select_window,
8811 ++ .select_initial_window = mptcp_select_initial_window,
8812 ++ .init_buffer_space = mptcp_init_buffer_space,
8813 ++ .set_rto = mptcp_tcp_set_rto,
8814 ++ .should_expand_sndbuf = mptcp_should_expand_sndbuf,
8815 ++ .init_congestion_control = mptcp_init_congestion_control,
8816 ++ .send_fin = tcp_send_fin,
8817 ++ .write_xmit = tcp_write_xmit,
8818 ++ .send_active_reset = tcp_send_active_reset,
8819 ++ .write_wakeup = tcp_write_wakeup,
8820 ++ .prune_ofo_queue = tcp_prune_ofo_queue,
8821 ++ .retransmit_timer = tcp_retransmit_timer,
8822 ++ .time_wait = tcp_time_wait,
8823 ++ .cleanup_rbuf = tcp_cleanup_rbuf,
8824 ++};
8825 ++
8826 ++static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, u32 window)
8827 ++{
8828 ++ struct mptcp_cb *mpcb;
8829 ++ struct sock *master_sk;
8830 ++ struct inet_connection_sock *master_icsk, *meta_icsk = inet_csk(meta_sk);
8831 ++ struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk);
8832 ++ u64 idsn;
8833 ++
8834 ++ dst_release(meta_sk->sk_rx_dst);
8835 ++ meta_sk->sk_rx_dst = NULL;
8836 ++	/* This flag tells sock_lock_init to
8837 ++	 * reclassify the lock-class of the master socket.
8838 ++ */
8839 ++ meta_tp->is_master_sk = 1;
8840 ++ master_sk = sk_clone_lock(meta_sk, GFP_ATOMIC | __GFP_ZERO);
8841 ++ meta_tp->is_master_sk = 0;
8842 ++ if (!master_sk)
8843 ++ return -ENOBUFS;
8844 ++
8845 ++ master_tp = tcp_sk(master_sk);
8846 ++ master_icsk = inet_csk(master_sk);
8847 ++
8848 ++ mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC);
8849 ++ if (!mpcb) {
8850 ++		/* sk_free (and __sk_free) requires wmem_alloc to be 1.
8851 ++ * All the rest is set to 0 thanks to __GFP_ZERO above.
8852 ++ */
8853 ++ atomic_set(&master_sk->sk_wmem_alloc, 1);
8854 ++ sk_free(master_sk);
8855 ++ return -ENOBUFS;
8856 ++ }
8857 ++
8858 ++#if IS_ENABLED(CONFIG_IPV6)
8859 ++ if (meta_icsk->icsk_af_ops == &mptcp_v6_mapped) {
8860 ++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
8861 ++
8862 ++ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
8863 ++
8864 ++ newnp = inet6_sk(master_sk);
8865 ++ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
8866 ++
8867 ++ newnp->ipv6_mc_list = NULL;
8868 ++ newnp->ipv6_ac_list = NULL;
8869 ++ newnp->ipv6_fl_list = NULL;
8870 ++ newnp->opt = NULL;
8871 ++ newnp->pktoptions = NULL;
8872 ++ (void)xchg(&newnp->rxpmtu, NULL);
8873 ++ } else if (meta_sk->sk_family == AF_INET6) {
8874 ++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
8875 ++
8876 ++ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
8877 ++
8878 ++ newnp = inet6_sk(master_sk);
8879 ++ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
8880 ++
8881 ++ newnp->hop_limit = -1;
8882 ++ newnp->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
8883 ++ newnp->mc_loop = 1;
8884 ++ newnp->pmtudisc = IPV6_PMTUDISC_WANT;
8885 ++ newnp->ipv6only = sock_net(master_sk)->ipv6.sysctl.bindv6only;
8886 ++ }
8887 ++#endif
8888 ++
8889 ++ meta_tp->mptcp = NULL;
8890 ++
8891 ++ /* Store the keys and generate the peer's token */
8892 ++ mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key;
8893 ++ mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token;
8894 ++
8895 ++ /* Generate Initial data-sequence-numbers */
8896 ++ mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn);
8897 ++ idsn = ntohll(idsn) + 1;
8898 ++ mpcb->snd_high_order[0] = idsn >> 32;
8899 ++ mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1;
8900 ++
8901 ++ meta_tp->write_seq = (u32)idsn;
8902 ++ meta_tp->snd_sml = meta_tp->write_seq;
8903 ++ meta_tp->snd_una = meta_tp->write_seq;
8904 ++ meta_tp->snd_nxt = meta_tp->write_seq;
8905 ++ meta_tp->pushed_seq = meta_tp->write_seq;
8906 ++ meta_tp->snd_up = meta_tp->write_seq;
8907 ++
8908 ++ mpcb->mptcp_rem_key = remote_key;
8909 ++ mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn);
8910 ++ idsn = ntohll(idsn) + 1;
8911 ++ mpcb->rcv_high_order[0] = idsn >> 32;
8912 ++ mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1;
8913 ++ meta_tp->copied_seq = (u32) idsn;
8914 ++ meta_tp->rcv_nxt = (u32) idsn;
8915 ++ meta_tp->rcv_wup = (u32) idsn;
8916 ++
8917 ++ meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
8918 ++ meta_tp->snd_wnd = window;
8919 ++ meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */
8920 ++
8921 ++ meta_tp->packets_out = 0;
8922 ++ meta_icsk->icsk_probes_out = 0;
8923 ++
8924 ++ /* Set mptcp-pointers */
8925 ++ master_tp->mpcb = mpcb;
8926 ++ master_tp->meta_sk = meta_sk;
8927 ++ meta_tp->mpcb = mpcb;
8928 ++ meta_tp->meta_sk = meta_sk;
8929 ++ mpcb->meta_sk = meta_sk;
8930 ++ mpcb->master_sk = master_sk;
8931 ++
8932 ++ meta_tp->was_meta_sk = 0;
8933 ++
8934 ++ /* Initialize the queues */
8935 ++ skb_queue_head_init(&mpcb->reinject_queue);
8936 ++ skb_queue_head_init(&master_tp->out_of_order_queue);
8937 ++ tcp_prequeue_init(master_tp);
8938 ++ INIT_LIST_HEAD(&master_tp->tsq_node);
8939 ++
8940 ++ master_tp->tsq_flags = 0;
8941 ++
8942 ++ mutex_init(&mpcb->mpcb_mutex);
8943 ++
8944 ++	/* Init the accept_queue structure. We support a queue of 32 pending
8945 ++	 * connections; it does not need to be huge, since we only store
8946 ++	 * pending subflow creations here.
8947 ++ */
8948 ++ if (reqsk_queue_alloc(&meta_icsk->icsk_accept_queue, 32, GFP_ATOMIC)) {
8949 ++ inet_put_port(master_sk);
8950 ++ kmem_cache_free(mptcp_cb_cache, mpcb);
8951 ++ sk_free(master_sk);
8952 ++ return -ENOMEM;
8953 ++ }
8954 ++
8955 ++ /* Redefine function-pointers as the meta-sk is now fully ready */
8956 ++ static_key_slow_inc(&mptcp_static_key);
8957 ++ meta_tp->mpc = 1;
8958 ++ meta_tp->ops = &mptcp_meta_specific;
8959 ++
8960 ++ meta_sk->sk_backlog_rcv = mptcp_backlog_rcv;
8961 ++ meta_sk->sk_destruct = mptcp_sock_destruct;
8962 ++
8963 ++ /* Meta-level retransmit timer */
8964 ++	meta_icsk->icsk_rto *= 2; /* Double the initial RTO */
8965 ++
8966 ++ tcp_init_xmit_timers(master_sk);
8967 ++ /* Has been set for sending out the SYN */
8968 ++ inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS);
8969 ++
8970 ++ if (!meta_tp->inside_tk_table) {
8971 ++		/* Add the meta_tp to the token hashtable - coming from the server side */
8972 ++ rcu_read_lock();
8973 ++ spin_lock(&mptcp_tk_hashlock);
8974 ++
8975 ++ __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token);
8976 ++
8977 ++ spin_unlock(&mptcp_tk_hashlock);
8978 ++ rcu_read_unlock();
8979 ++ }
8980 ++ master_tp->inside_tk_table = 0;
8981 ++
8982 ++ /* Init time-wait stuff */
8983 ++ INIT_LIST_HEAD(&mpcb->tw_list);
8984 ++ spin_lock_init(&mpcb->tw_lock);
8985 ++
8986 ++ INIT_HLIST_HEAD(&mpcb->callback_list);
8987 ++
8988 ++ mptcp_mpcb_inherit_sockopts(meta_sk, master_sk);
8989 ++
8990 ++ mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf;
8991 ++ mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf;
8992 ++ mpcb->orig_window_clamp = meta_tp->window_clamp;
8993 ++
8994 ++ /* The meta is directly linked - set refcnt to 1 */
8995 ++ atomic_set(&mpcb->mpcb_refcnt, 1);
8996 ++
8997 ++ mptcp_init_path_manager(mpcb);
8998 ++ mptcp_init_scheduler(mpcb);
8999 ++
9000 ++ setup_timer(&mpcb->synack_timer, mptcp_synack_timer_handler,
9001 ++ (unsigned long)meta_sk);
9002 ++
9003 ++ mptcp_debug("%s: created mpcb with token %#x\n",
9004 ++ __func__, mpcb->mptcp_loc_token);
9005 ++
9006 ++ return 0;
9007 ++}
9008 ++
9009 ++void mptcp_fallback_meta_sk(struct sock *meta_sk)
9010 ++{
9011 ++ kfree(inet_csk(meta_sk)->icsk_accept_queue.listen_opt);
9012 ++ kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb);
9013 ++}
9014 ++
9015 ++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
9016 ++ gfp_t flags)
9017 ++{
9018 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
9019 ++ struct tcp_sock *tp = tcp_sk(sk);
9020 ++
9021 ++ tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags);
9022 ++ if (!tp->mptcp)
9023 ++ return -ENOMEM;
9024 ++
9025 ++ tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb);
9026 ++ /* No more space for more subflows? */
9027 ++ if (!tp->mptcp->path_index) {
9028 ++ kmem_cache_free(mptcp_sock_cache, tp->mptcp);
9029 ++ return -EPERM;
9030 ++ }
9031 ++
9032 ++ INIT_HLIST_NODE(&tp->mptcp->cb_list);
9033 ++
9034 ++ tp->mptcp->tp = tp;
9035 ++ tp->mpcb = mpcb;
9036 ++ tp->meta_sk = meta_sk;
9037 ++
9038 ++ static_key_slow_inc(&mptcp_static_key);
9039 ++ tp->mpc = 1;
9040 ++ tp->ops = &mptcp_sub_specific;
9041 ++
9042 ++ tp->mptcp->loc_id = loc_id;
9043 ++ tp->mptcp->rem_id = rem_id;
9044 ++ if (mpcb->sched_ops->init)
9045 ++ mpcb->sched_ops->init(sk);
9046 ++
9047 ++ /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be
9048 ++ * included in mptcp_del_sock(), because the mpcb must remain alive
9049 ++ * until the last subsocket is completely destroyed.
9050 ++ */
9051 ++ sock_hold(meta_sk);
9052 ++ atomic_inc(&mpcb->mpcb_refcnt);
9053 ++
9054 ++ tp->mptcp->next = mpcb->connection_list;
9055 ++ mpcb->connection_list = tp;
9056 ++ tp->mptcp->attached = 1;
9057 ++
9058 ++ mpcb->cnt_subflows++;
9059 ++ atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc),
9060 ++ &meta_sk->sk_rmem_alloc);
9061 ++
9062 ++ mptcp_sub_inherit_sockopts(meta_sk, sk);
9063 ++ INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq);
9064 ++
9065 ++ /* As we successfully allocated the mptcp_tcp_sock, we have to
9066 ++ * change the function-pointers here (for sk_destruct to work correctly)
9067 ++ */
9068 ++ sk->sk_error_report = mptcp_sock_def_error_report;
9069 ++ sk->sk_data_ready = mptcp_data_ready;
9070 ++ sk->sk_write_space = mptcp_write_space;
9071 ++ sk->sk_state_change = mptcp_set_state;
9072 ++ sk->sk_destruct = mptcp_sock_destruct;
9073 ++
9074 ++ if (sk->sk_family == AF_INET)
9075 ++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n",
9076 ++ __func__ , mpcb->mptcp_loc_token,
9077 ++ tp->mptcp->path_index,
9078 ++ &((struct inet_sock *)tp)->inet_saddr,
9079 ++ ntohs(((struct inet_sock *)tp)->inet_sport),
9080 ++ &((struct inet_sock *)tp)->inet_daddr,
9081 ++ ntohs(((struct inet_sock *)tp)->inet_dport),
9082 ++ mpcb->cnt_subflows);
9083 ++#if IS_ENABLED(CONFIG_IPV6)
9084 ++ else
9085 ++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n",
9086 ++ __func__ , mpcb->mptcp_loc_token,
9087 ++ tp->mptcp->path_index, &inet6_sk(sk)->saddr,
9088 ++ ntohs(((struct inet_sock *)tp)->inet_sport),
9089 ++ &sk->sk_v6_daddr,
9090 ++ ntohs(((struct inet_sock *)tp)->inet_dport),
9091 ++ mpcb->cnt_subflows);
9092 ++#endif
9093 ++
9094 ++ return 0;
9095 ++}
9096 ++
9097 ++void mptcp_del_sock(struct sock *sk)
9098 ++{
9099 ++ struct tcp_sock *tp = tcp_sk(sk), *tp_prev;
9100 ++ struct mptcp_cb *mpcb;
9101 ++
9102 ++ if (!tp->mptcp || !tp->mptcp->attached)
9103 ++ return;
9104 ++
9105 ++ mpcb = tp->mpcb;
9106 ++ tp_prev = mpcb->connection_list;
9107 ++
9108 ++ mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n",
9109 ++ __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
9110 ++ sk->sk_state, is_meta_sk(sk));
9111 ++
9112 ++ if (tp_prev == tp) {
9113 ++ mpcb->connection_list = tp->mptcp->next;
9114 ++ } else {
9115 ++ for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) {
9116 ++ if (tp_prev->mptcp->next == tp) {
9117 ++ tp_prev->mptcp->next = tp->mptcp->next;
9118 ++ break;
9119 ++ }
9120 ++ }
9121 ++ }
9122 ++ mpcb->cnt_subflows--;
9123 ++ if (tp->mptcp->establish_increased)
9124 ++ mpcb->cnt_established--;
9125 ++
9126 ++ tp->mptcp->next = NULL;
9127 ++ tp->mptcp->attached = 0;
9128 ++ mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index);
9129 ++
9130 ++ if (!skb_queue_empty(&sk->sk_write_queue))
9131 ++ mptcp_reinject_data(sk, 0);
9132 ++
9133 ++ if (is_master_tp(tp))
9134 ++ mpcb->master_sk = NULL;
9135 ++ else if (tp->mptcp->pre_established)
9136 ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
9137 ++
9138 ++ rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL);
9139 ++}
9140 ++
9141 ++/* Updates the metasocket ULID/port data, based on the given sock.
9142 ++ * The argument sock must be the sock accessible to the application.
9143 ++ * In this function, we update the meta socket info, based on the changes
9144 ++ * in the application socket (bind, address allocation, ...)
9145 ++ */
9146 ++void mptcp_update_metasocket(struct sock *sk, const struct sock *meta_sk)
9147 ++{
9148 ++ if (tcp_sk(sk)->mpcb->pm_ops->new_session)
9149 ++ tcp_sk(sk)->mpcb->pm_ops->new_session(meta_sk);
9150 ++
9151 ++ tcp_sk(sk)->mptcp->send_mp_prio = tcp_sk(sk)->mptcp->low_prio;
9152 ++}
9153 ++
9154 ++/* Clean up the receive buffer for full frames taken by the user,
9155 ++ * then send an ACK if necessary. COPIED is the number of bytes
9156 ++ * tcp_recvmsg has given to the user so far, it speeds up the
9157 ++ * calculation of whether or not we must ACK for the sake of
9158 ++ * a window update.
9159 ++ */
9160 ++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied)
9161 ++{
9162 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
9163 ++ struct sock *sk;
9164 ++ __u32 rcv_window_now = 0;
9165 ++
9166 ++ if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) {
9167 ++ rcv_window_now = tcp_receive_window(meta_tp);
9168 ++
9169 ++ if (2 * rcv_window_now > meta_tp->window_clamp)
9170 ++ rcv_window_now = 0;
9171 ++ }
9172 ++
9173 ++ mptcp_for_each_sk(meta_tp->mpcb, sk) {
9174 ++ struct tcp_sock *tp = tcp_sk(sk);
9175 ++ const struct inet_connection_sock *icsk = inet_csk(sk);
9176 ++
9177 ++ if (!mptcp_sk_can_send_ack(sk))
9178 ++ continue;
9179 ++
9180 ++ if (!inet_csk_ack_scheduled(sk))
9181 ++ goto second_part;
9182 ++ /* Delayed ACKs frequently hit locked sockets during bulk
9183 ++ * receive.
9184 ++ */
9185 ++ if (icsk->icsk_ack.blocked ||
9186 ++ /* Once-per-two-segments ACK was not sent by tcp_input.c */
9187 ++ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
9188 ++ /* If this read emptied read buffer, we send ACK, if
9189 ++ * connection is not bidirectional, user drained
9190 ++ * receive buffer and there was a small segment
9191 ++ * in queue.
9192 ++ */
9193 ++ (copied > 0 &&
9194 ++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
9195 ++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
9196 ++ !icsk->icsk_ack.pingpong)) &&
9197 ++ !atomic_read(&meta_sk->sk_rmem_alloc))) {
9198 ++ tcp_send_ack(sk);
9199 ++ continue;
9200 ++ }
9201 ++
9202 ++second_part:
9203 ++ /* This here is the second part of tcp_cleanup_rbuf */
9204 ++ if (rcv_window_now) {
9205 ++ __u32 new_window = tp->ops->__select_window(sk);
9206 ++
9207 ++ /* Send ACK now, if this read freed lots of space
9208 ++ * in our buffer. Certainly, new_window is new window.
9209 ++ * We can advertise it now, if it is not less than
9210 ++ * current one.
9211 ++ * "Lots" means "at least twice" here.
9212 ++ */
9213 ++ if (new_window && new_window >= 2 * rcv_window_now)
9214 ++ tcp_send_ack(sk);
9215 ++ }
9216 ++ }
9217 ++}
9218 ++
9219 ++static int mptcp_sub_send_fin(struct sock *sk)
9220 ++{
9221 ++ struct tcp_sock *tp = tcp_sk(sk);
9222 ++ struct sk_buff *skb = tcp_write_queue_tail(sk);
9223 ++ int mss_now;
9224 ++
9225 ++ /* Optimization, tack on the FIN if we have a queue of
9226 ++ * unsent frames. But be careful about outgoing SACKS
9227 ++ * and IP options.
9228 ++ */
9229 ++ mss_now = tcp_current_mss(sk);
9230 ++
9231 ++ if (tcp_send_head(sk) != NULL) {
9232 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
9233 ++ TCP_SKB_CB(skb)->end_seq++;
9234 ++ tp->write_seq++;
9235 ++ } else {
9236 ++ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC);
9237 ++ if (!skb)
9238 ++ return 1;
9239 ++
9240 ++ /* Reserve space for headers and prepare control bits. */
9241 ++ skb_reserve(skb, MAX_TCP_HEADER);
9242 ++ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
9243 ++ tcp_init_nondata_skb(skb, tp->write_seq,
9244 ++ TCPHDR_ACK | TCPHDR_FIN);
9245 ++ tcp_queue_skb(sk, skb);
9246 ++ }
9247 ++ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
9248 ++
9249 ++ return 0;
9250 ++}
9251 ++
9252 ++void mptcp_sub_close_wq(struct work_struct *work)
9253 ++{
9254 ++ struct tcp_sock *tp = container_of(work, struct mptcp_tcp_sock, work.work)->tp;
9255 ++ struct sock *sk = (struct sock *)tp;
9256 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
9257 ++
9258 ++ mutex_lock(&tp->mpcb->mpcb_mutex);
9259 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
9260 ++
9261 ++ if (sock_flag(sk, SOCK_DEAD))
9262 ++ goto exit;
9263 ++
9264 ++ /* We come from tcp_disconnect. We are sure that meta_sk is set */
9265 ++ if (!mptcp(tp)) {
9266 ++ tp->closing = 1;
9267 ++ sock_rps_reset_flow(sk);
9268 ++ tcp_close(sk, 0);
9269 ++ goto exit;
9270 ++ }
9271 ++
9272 ++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) {
9273 ++ tp->closing = 1;
9274 ++ sock_rps_reset_flow(sk);
9275 ++ tcp_close(sk, 0);
9276 ++ } else if (tcp_close_state(sk)) {
9277 ++ sk->sk_shutdown |= SEND_SHUTDOWN;
9278 ++ tcp_send_fin(sk);
9279 ++ }
9280 ++
9281 ++exit:
9282 ++ release_sock(meta_sk);
9283 ++ mutex_unlock(&tp->mpcb->mpcb_mutex);
9284 ++ sock_put(sk);
9285 ++}
9286 ++
9287 ++void mptcp_sub_close(struct sock *sk, unsigned long delay)
9288 ++{
9289 ++ struct tcp_sock *tp = tcp_sk(sk);
9290 ++ struct delayed_work *work = &tcp_sk(sk)->mptcp->work;
9291 ++
9292 ++ /* We are already closing - e.g., call from sock_def_error_report upon
9293 ++ * tcp_disconnect in tcp_close.
9294 ++ */
9295 ++ if (tp->closing)
9296 ++ return;
9297 ++
9298 ++	/* Work already scheduled? */
9299 ++ if (work_pending(&work->work)) {
9300 ++		/* Work present - who will be first? */
9301 ++ if (jiffies + delay > work->timer.expires)
9302 ++ return;
9303 ++
9304 ++ /* Try canceling - if it fails, work will be executed soon */
9305 ++ if (!cancel_delayed_work(work))
9306 ++ return;
9307 ++ sock_put(sk);
9308 ++ }
9309 ++
9310 ++ if (!delay) {
9311 ++ unsigned char old_state = sk->sk_state;
9312 ++
9313 ++ /* If we are in user-context we can directly do the closing
9314 ++ * procedure. No need to schedule a work-queue.
9315 ++ */
9316 ++ if (!in_softirq()) {
9317 ++ if (sock_flag(sk, SOCK_DEAD))
9318 ++ return;
9319 ++
9320 ++ if (!mptcp(tp)) {
9321 ++ tp->closing = 1;
9322 ++ sock_rps_reset_flow(sk);
9323 ++ tcp_close(sk, 0);
9324 ++ return;
9325 ++ }
9326 ++
9327 ++ if (mptcp_meta_sk(sk)->sk_shutdown == SHUTDOWN_MASK ||
9328 ++ sk->sk_state == TCP_CLOSE) {
9329 ++ tp->closing = 1;
9330 ++ sock_rps_reset_flow(sk);
9331 ++ tcp_close(sk, 0);
9332 ++ } else if (tcp_close_state(sk)) {
9333 ++ sk->sk_shutdown |= SEND_SHUTDOWN;
9334 ++ tcp_send_fin(sk);
9335 ++ }
9336 ++
9337 ++ return;
9338 ++ }
9339 ++
9340 ++		/* We directly send the FIN, because it may take a long time
9341 ++		 * until the work-queue gets scheduled...
9342 ++ *
9343 ++ * If mptcp_sub_send_fin returns 1, it failed and thus we reset
9344 ++ * the old state so that tcp_close will finally send the fin
9345 ++ * in user-context.
9346 ++ */
9347 ++ if (!sk->sk_err && old_state != TCP_CLOSE &&
9348 ++ tcp_close_state(sk) && mptcp_sub_send_fin(sk)) {
9349 ++ if (old_state == TCP_ESTABLISHED)
9350 ++ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
9351 ++ sk->sk_state = old_state;
9352 ++ }
9353 ++ }
9354 ++
9355 ++ sock_hold(sk);
9356 ++ queue_delayed_work(mptcp_wq, work, delay);
9357 ++}
9358 ++
9359 ++void mptcp_sub_force_close(struct sock *sk)
9360 ++{
9361 ++	/* The tcp_done below may have freed the socket, if it is already dead.
9362 ++ * Thus, we are not allowed to access it afterwards. That's why
9363 ++ * we have to store the dead-state in this local variable.
9364 ++ */
9365 ++ int sock_is_dead = sock_flag(sk, SOCK_DEAD);
9366 ++
9367 ++ tcp_sk(sk)->mp_killed = 1;
9368 ++
9369 ++ if (sk->sk_state != TCP_CLOSE)
9370 ++ tcp_done(sk);
9371 ++
9372 ++ if (!sock_is_dead)
9373 ++ mptcp_sub_close(sk, 0);
9374 ++}
9375 ++EXPORT_SYMBOL(mptcp_sub_force_close);
9376 ++
9377 ++/* Update the mpcb send buffer, based on the contributions
9378 ++ * of each subflow
9379 ++ */
9380 ++void mptcp_update_sndbuf(const struct tcp_sock *tp)
9381 ++{
9382 ++ struct sock *meta_sk = tp->meta_sk, *sk;
9383 ++ int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf;
9384 ++
9385 ++ mptcp_for_each_sk(tp->mpcb, sk) {
9386 ++ if (!mptcp_sk_can_send(sk))
9387 ++ continue;
9388 ++
9389 ++ new_sndbuf += sk->sk_sndbuf;
9390 ++
9391 ++ if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) {
9392 ++ new_sndbuf = sysctl_tcp_wmem[2];
9393 ++ break;
9394 ++ }
9395 ++ }
9396 ++ meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf);
9397 ++
9398 ++ /* The subflow's call to sk_write_space in tcp_new_space ends up in
9399 ++ * mptcp_write_space.
9400 ++ * It has nothing to do with waking up the application.
9401 ++ * So, we do it here.
9402 ++ */
9403 ++ if (old_sndbuf != meta_sk->sk_sndbuf)
9404 ++ meta_sk->sk_write_space(meta_sk);
9405 ++}
9406 ++
9407 ++void mptcp_close(struct sock *meta_sk, long timeout)
9408 ++{
9409 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
9410 ++ struct sock *sk_it, *tmpsk;
9411 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
9412 ++ struct sk_buff *skb;
9413 ++ int data_was_unread = 0;
9414 ++ int state;
9415 ++
9416 ++ mptcp_debug("%s: Close of meta_sk with tok %#x\n",
9417 ++ __func__, mpcb->mptcp_loc_token);
9418 ++
9419 ++ mutex_lock(&mpcb->mpcb_mutex);
9420 ++ lock_sock(meta_sk);
9421 ++
9422 ++ if (meta_tp->inside_tk_table) {
9423 ++ /* Detach the mpcb from the token hashtable */
9424 ++ mptcp_hash_remove_bh(meta_tp);
9425 ++ reqsk_queue_destroy(&inet_csk(meta_sk)->icsk_accept_queue);
9426 ++ }
9427 ++
9428 ++ meta_sk->sk_shutdown = SHUTDOWN_MASK;
9429 ++ /* We need to flush the recv. buffs. We do this only on the
9430 ++ * descriptor close, not protocol-sourced closes, because the
9431 ++ * reader process may not have drained the data yet!
9432 ++ */
9433 ++ while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) {
9434 ++ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
9435 ++ tcp_hdr(skb)->fin;
9436 ++ data_was_unread += len;
9437 ++ __kfree_skb(skb);
9438 ++ }
9439 ++
9440 ++ sk_mem_reclaim(meta_sk);
9441 ++
9442 ++ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
9443 ++ if (meta_sk->sk_state == TCP_CLOSE) {
9444 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
9445 ++ if (tcp_sk(sk_it)->send_mp_fclose)
9446 ++ continue;
9447 ++ mptcp_sub_close(sk_it, 0);
9448 ++ }
9449 ++ goto adjudge_to_death;
9450 ++ }
9451 ++
9452 ++ if (data_was_unread) {
9453 ++ /* Unread data was tossed, zap the connection. */
9454 ++ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE);
9455 ++ tcp_set_state(meta_sk, TCP_CLOSE);
9456 ++ tcp_sk(meta_sk)->ops->send_active_reset(meta_sk,
9457 ++ meta_sk->sk_allocation);
9458 ++ } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) {
9459 ++ /* Check zero linger _after_ checking for unread data. */
9460 ++ meta_sk->sk_prot->disconnect(meta_sk, 0);
9461 ++ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
9462 ++ } else if (tcp_close_state(meta_sk)) {
9463 ++ mptcp_send_fin(meta_sk);
9464 ++ } else if (meta_tp->snd_una == meta_tp->write_seq) {
9465 ++ /* The DATA_FIN has been sent and acknowledged
9466 ++ * (e.g., by sk_shutdown). Close all the other subflows
9467 ++ */
9468 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
9469 ++ unsigned long delay = 0;
9470 ++ /* If we are the passive closer, don't trigger
9471 ++ * subflow-fin until the subflow has been finned
9472 ++			 * by the peer - thus we add a delay.
9473 ++ */
9474 ++ if (mpcb->passive_close &&
9475 ++ sk_it->sk_state == TCP_ESTABLISHED)
9476 ++ delay = inet_csk(sk_it)->icsk_rto << 3;
9477 ++
9478 ++ mptcp_sub_close(sk_it, delay);
9479 ++ }
9480 ++ }
9481 ++
9482 ++ sk_stream_wait_close(meta_sk, timeout);
9483 ++
9484 ++adjudge_to_death:
9485 ++ state = meta_sk->sk_state;
9486 ++ sock_hold(meta_sk);
9487 ++ sock_orphan(meta_sk);
9488 ++
9489 ++ /* socket will be freed after mptcp_close - we have to prevent
9490 ++ * access from the subflows.
9491 ++ */
9492 ++ mptcp_for_each_sk(mpcb, sk_it) {
9493 ++ /* Similar to sock_orphan, but we don't set it DEAD, because
9494 ++ * the callbacks are still set and must be called.
9495 ++ */
9496 ++ write_lock_bh(&sk_it->sk_callback_lock);
9497 ++ sk_set_socket(sk_it, NULL);
9498 ++ sk_it->sk_wq = NULL;
9499 ++ write_unlock_bh(&sk_it->sk_callback_lock);
9500 ++ }
9501 ++
9502 ++ /* It is the last release_sock in its life. It will remove backlog. */
9503 ++ release_sock(meta_sk);
9504 ++
9505 ++ /* Now socket is owned by kernel and we acquire BH lock
9506 ++ * to finish close. No need to check for user refs.
9507 ++ */
9508 ++ local_bh_disable();
9509 ++ bh_lock_sock(meta_sk);
9510 ++ WARN_ON(sock_owned_by_user(meta_sk));
9511 ++
9512 ++ percpu_counter_inc(meta_sk->sk_prot->orphan_count);
9513 ++
9514 ++ /* Have we already been destroyed by a softirq or backlog? */
9515 ++ if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE)
9516 ++ goto out;
9517 ++
9518 ++ /* This is a (useful) BSD violating of the RFC. There is a
9519 ++ * problem with TCP as specified in that the other end could
9520 ++ * keep a socket open forever with no application left this end.
9521 ++ * We use a 3 minute timeout (about the same as BSD) then kill
9522 ++ * our end. If they send after that then tough - BUT: long enough
9523 ++ * that we won't make the old 4*rto = almost no time - whoops
9524 ++ * reset mistake.
9525 ++ *
9526 ++ * Nope, it was not mistake. It is really desired behaviour
9527 ++ * f.e. on http servers, when such sockets are useless, but
9528 ++ * consume significant resources. Let's do it with special
9529 ++ * linger2 option. --ANK
9530 ++ */
9531 ++
9532 ++ if (meta_sk->sk_state == TCP_FIN_WAIT2) {
9533 ++ if (meta_tp->linger2 < 0) {
9534 ++ tcp_set_state(meta_sk, TCP_CLOSE);
9535 ++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
9536 ++ NET_INC_STATS_BH(sock_net(meta_sk),
9537 ++ LINUX_MIB_TCPABORTONLINGER);
9538 ++ } else {
9539 ++ const int tmo = tcp_fin_time(meta_sk);
9540 ++
9541 ++ if (tmo > TCP_TIMEWAIT_LEN) {
9542 ++ inet_csk_reset_keepalive_timer(meta_sk,
9543 ++ tmo - TCP_TIMEWAIT_LEN);
9544 ++ } else {
9545 ++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2,
9546 ++ tmo);
9547 ++ goto out;
9548 ++ }
9549 ++ }
9550 ++ }
9551 ++ if (meta_sk->sk_state != TCP_CLOSE) {
9552 ++ sk_mem_reclaim(meta_sk);
9553 ++ if (tcp_too_many_orphans(meta_sk, 0)) {
9554 ++ if (net_ratelimit())
9555 ++ pr_info("MPTCP: too many orphaned sockets\n");
9556 ++ tcp_set_state(meta_sk, TCP_CLOSE);
9557 ++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
9558 ++ NET_INC_STATS_BH(sock_net(meta_sk),
9559 ++ LINUX_MIB_TCPABORTONMEMORY);
9560 ++ }
9561 ++ }
9562 ++
9563 ++
9564 ++ if (meta_sk->sk_state == TCP_CLOSE)
9565 ++ inet_csk_destroy_sock(meta_sk);
9566 ++ /* Otherwise, socket is reprieved until protocol close. */
9567 ++
9568 ++out:
9569 ++ bh_unlock_sock(meta_sk);
9570 ++ local_bh_enable();
9571 ++ mutex_unlock(&mpcb->mpcb_mutex);
9572 ++ sock_put(meta_sk); /* Taken by sock_hold */
9573 ++}
9574 ++
9575 ++void mptcp_disconnect(struct sock *sk)
9576 ++{
9577 ++ struct sock *subsk, *tmpsk;
9578 ++ struct tcp_sock *tp = tcp_sk(sk);
9579 ++
9580 ++ mptcp_delete_synack_timer(sk);
9581 ++
9582 ++ __skb_queue_purge(&tp->mpcb->reinject_queue);
9583 ++
9584 ++ if (tp->inside_tk_table) {
9585 ++ mptcp_hash_remove_bh(tp);
9586 ++ reqsk_queue_destroy(&inet_csk(tp->meta_sk)->icsk_accept_queue);
9587 ++ }
9588 ++
9589 ++ local_bh_disable();
9590 ++ mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) {
9591 ++ /* The socket will get removed from the subsocket-list
9592 ++ * and made non-mptcp by setting mpc to 0.
9593 ++ *
9594 ++ * This is necessary, because tcp_disconnect assumes
9595 ++ * that the connection is completely dead afterwards.
9596 ++ * Thus we need to do a mptcp_del_sock. Due to this call
9597 ++ * we have to make it non-mptcp.
9598 ++ *
9599 ++ * We have to lock the socket, because we set mpc to 0.
9600 ++ * An incoming packet would take the subsocket's lock
9601 ++ * and go on into the receive-path.
9602 ++ * This would be a race.
9603 ++ */
9604 ++
9605 ++ bh_lock_sock(subsk);
9606 ++ mptcp_del_sock(subsk);
9607 ++ tcp_sk(subsk)->mpc = 0;
9608 ++ tcp_sk(subsk)->ops = &tcp_specific;
9609 ++ mptcp_sub_force_close(subsk);
9610 ++ bh_unlock_sock(subsk);
9611 ++ }
9612 ++ local_bh_enable();
9613 ++
9614 ++ tp->was_meta_sk = 1;
9615 ++ tp->mpc = 0;
9616 ++ tp->ops = &tcp_specific;
9617 ++}
9618 ++
9619 ++
9620 ++/* Returns 1 if we should enable MPTCP for that socket. */
9621 ++int mptcp_doit(struct sock *sk)
9622 ++{
9623 ++ /* Do not allow MPTCP enabling if the MPTCP initialization failed */
9624 ++ if (mptcp_init_failed)
9625 ++ return 0;
9626 ++
9627 ++ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
9628 ++ return 0;
9629 ++
9630 ++ /* Socket may already be established (e.g., called from tcp_recvmsg) */
9631 ++ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->request_mptcp)
9632 ++ return 1;
9633 ++
9634 ++ /* Don't do mptcp over loopback */
9635 ++ if (sk->sk_family == AF_INET &&
9636 ++ (ipv4_is_loopback(inet_sk(sk)->inet_daddr) ||
9637 ++ ipv4_is_loopback(inet_sk(sk)->inet_saddr)))
9638 ++ return 0;
9639 ++#if IS_ENABLED(CONFIG_IPV6)
9640 ++ if (sk->sk_family == AF_INET6 &&
9641 ++ (ipv6_addr_loopback(&sk->sk_v6_daddr) ||
9642 ++ ipv6_addr_loopback(&inet6_sk(sk)->saddr)))
9643 ++ return 0;
9644 ++#endif
9645 ++ if (mptcp_v6_is_v4_mapped(sk) &&
9646 ++ ipv4_is_loopback(inet_sk(sk)->inet_saddr))
9647 ++ return 0;
9648 ++
9649 ++#ifdef CONFIG_TCP_MD5SIG
9650 ++ /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */
9651 ++ if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk))
9652 ++ return 0;
9653 ++#endif
9654 ++
9655 ++ return 1;
9656 ++}
9657 ++
9658 ++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window)
9659 ++{
9660 ++ struct tcp_sock *master_tp;
9661 ++ struct sock *master_sk;
9662 ++
9663 ++ if (mptcp_alloc_mpcb(meta_sk, remote_key, window))
9664 ++ goto err_alloc_mpcb;
9665 ++
9666 ++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
9667 ++ master_tp = tcp_sk(master_sk);
9668 ++
9669 ++ if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC))
9670 ++ goto err_add_sock;
9671 ++
9672 ++ if (__inet_inherit_port(meta_sk, master_sk) < 0)
9673 ++ goto err_add_sock;
9674 ++
9675 ++ meta_sk->sk_prot->unhash(meta_sk);
9676 ++
9677 ++ if (master_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(master_sk))
9678 ++ __inet_hash_nolisten(master_sk, NULL);
9679 ++#if IS_ENABLED(CONFIG_IPV6)
9680 ++ else
9681 ++ __inet6_hash(master_sk, NULL);
9682 ++#endif
9683 ++
9684 ++ master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd;
9685 ++
9686 ++ return 0;
9687 ++
9688 ++err_add_sock:
9689 ++ mptcp_fallback_meta_sk(meta_sk);
9690 ++
9691 ++ inet_csk_prepare_forced_close(master_sk);
9692 ++ tcp_done(master_sk);
9693 ++ inet_csk_prepare_forced_close(meta_sk);
9694 ++ tcp_done(meta_sk);
9695 ++
9696 ++err_alloc_mpcb:
9697 ++ return -ENOBUFS;
9698 ++}
9699 ++
9700 ++static int __mptcp_check_req_master(struct sock *child,
9701 ++ struct request_sock *req)
9702 ++{
9703 ++ struct tcp_sock *child_tp = tcp_sk(child);
9704 ++ struct sock *meta_sk = child;
9705 ++ struct mptcp_cb *mpcb;
9706 ++ struct mptcp_request_sock *mtreq;
9707 ++
9708 ++ /* Never contained an MP_CAPABLE */
9709 ++ if (!inet_rsk(req)->mptcp_rqsk)
9710 ++ return 1;
9711 ++
9712 ++ if (!inet_rsk(req)->saw_mpc) {
9713 ++ /* Fall back to regular TCP, because we saw one SYN without
9714 ++ * MP_CAPABLE. In tcp_check_req we continue the regular path.
9715 ++ * But, the socket has been added to the reqsk_tk_htb, so we
9716 ++ * must still remove it.
9717 ++ */
9718 ++ mptcp_reqsk_remove_tk(req);
9719 ++ return 1;
9720 ++ }
9721 ++
9722 ++ /* Just set these values to pass them to mptcp_alloc_mpcb */
9723 ++ mtreq = mptcp_rsk(req);
9724 ++ child_tp->mptcp_loc_key = mtreq->mptcp_loc_key;
9725 ++ child_tp->mptcp_loc_token = mtreq->mptcp_loc_token;
9726 ++
9727 ++ if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key,
9728 ++ child_tp->snd_wnd))
9729 ++ return -ENOBUFS;
9730 ++
9731 ++ child = tcp_sk(child)->mpcb->master_sk;
9732 ++ child_tp = tcp_sk(child);
9733 ++ mpcb = child_tp->mpcb;
9734 ++
9735 ++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
9736 ++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
9737 ++
9738 ++ mpcb->dss_csum = mtreq->dss_csum;
9739 ++ mpcb->server_side = 1;
9740 ++
9741 ++ /* Will be moved to ESTABLISHED by tcp_rcv_state_process() */
9742 ++ mptcp_update_metasocket(child, meta_sk);
9743 ++
9744 ++ /* Needs to be done here additionally, because when accepting a
9745 ++ * new connection we pass by __reqsk_free and not reqsk_free.
9746 ++ */
9747 ++ mptcp_reqsk_remove_tk(req);
9748 ++
9749 ++ /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */
9750 ++ sock_put(meta_sk);
9751 ++
9752 ++ return 0;
9753 ++}
9754 ++
9755 ++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req)
9756 ++{
9757 ++ struct sock *meta_sk = child, *master_sk;
9758 ++ struct sk_buff *skb;
9759 ++ u32 new_mapping;
9760 ++ int ret;
9761 ++
9762 ++ ret = __mptcp_check_req_master(child, req);
9763 ++ if (ret)
9764 ++ return ret;
9765 ++
9766 ++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
9767 ++
9768 ++ /* We need to rewind copied_seq as it is set to IDSN + 1 and as we have
9769 ++ * pre-MPTCP data in the receive queue.
9770 ++ */
9771 ++ tcp_sk(meta_sk)->copied_seq -= tcp_sk(master_sk)->rcv_nxt -
9772 ++ tcp_rsk(req)->rcv_isn - 1;
9773 ++
9774 ++ /* Map subflow sequence number to data sequence numbers. We need to map
9775 ++ * these data to [IDSN - len - 1, IDSN[.
9776 ++ */
9777 ++ new_mapping = tcp_sk(meta_sk)->copied_seq - tcp_rsk(req)->rcv_isn - 1;
9778 ++
9779 ++ /* There should be only one skb: the SYN + data. */
9780 ++ skb_queue_walk(&meta_sk->sk_receive_queue, skb) {
9781 ++ TCP_SKB_CB(skb)->seq += new_mapping;
9782 ++ TCP_SKB_CB(skb)->end_seq += new_mapping;
9783 ++ }
9784 ++
9785 ++ /* With fastopen we change the semantics of the relative subflow
9786 ++ * sequence numbers to deal with middleboxes that could add/remove
9787 ++ * multiple bytes in the SYN. We chose to start counting at rcv_nxt - 1
9788 ++ * instead of the regular TCP ISN.
9789 ++ */
9790 ++ tcp_sk(master_sk)->mptcp->rcv_isn = tcp_sk(master_sk)->rcv_nxt - 1;
9791 ++
9792 ++ /* We need to update copied_seq of the master_sk to account for the
9793 ++ * already moved data to the meta receive queue.
9794 ++ */
9795 ++ tcp_sk(master_sk)->copied_seq = tcp_sk(master_sk)->rcv_nxt;
9796 ++
9797 ++ /* Handled by the master_sk */
9798 ++ tcp_sk(meta_sk)->fastopen_rsk = NULL;
9799 ++
9800 ++ return 0;
9801 ++}
9802 ++
9803 ++int mptcp_check_req_master(struct sock *sk, struct sock *child,
9804 ++ struct request_sock *req,
9805 ++ struct request_sock **prev)
9806 ++{
9807 ++ struct sock *meta_sk = child;
9808 ++ int ret;
9809 ++
9810 ++ ret = __mptcp_check_req_master(child, req);
9811 ++ if (ret)
9812 ++ return ret;
9813 ++
9814 ++ inet_csk_reqsk_queue_unlink(sk, req, prev);
9815 ++ inet_csk_reqsk_queue_removed(sk, req);
9816 ++ inet_csk_reqsk_queue_add(sk, req, meta_sk);
9817 ++
9818 ++ return 0;
9819 ++}
9820 ++
9821 ++struct sock *mptcp_check_req_child(struct sock *meta_sk, struct sock *child,
9822 ++ struct request_sock *req,
9823 ++ struct request_sock **prev,
9824 ++ const struct mptcp_options_received *mopt)
9825 ++{
9826 ++ struct tcp_sock *child_tp = tcp_sk(child);
9827 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
9828 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
9829 ++ u8 hash_mac_check[20];
9830 ++
9831 ++ child_tp->inside_tk_table = 0;
9832 ++
9833 ++ if (!mopt->join_ack)
9834 ++ goto teardown;
9835 ++
9836 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
9837 ++ (u8 *)&mpcb->mptcp_loc_key,
9838 ++ (u8 *)&mtreq->mptcp_rem_nonce,
9839 ++ (u8 *)&mtreq->mptcp_loc_nonce,
9840 ++ (u32 *)hash_mac_check);
9841 ++
9842 ++ if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20))
9843 ++ goto teardown;
9844 ++
9845 ++ /* Point it to the same struct socket and wq as the meta_sk */
9846 ++ sk_set_socket(child, meta_sk->sk_socket);
9847 ++ child->sk_wq = meta_sk->sk_wq;
9848 ++
9849 ++ if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) {
9850 ++ /* Has been inherited, but now child_tp->mptcp is NULL */
9851 ++ child_tp->mpc = 0;
9852 ++ child_tp->ops = &tcp_specific;
9853 ++
9854 ++ /* TODO when we support acking the third ack for new subflows,
9855 ++ * we should silently discard this third ack, by returning NULL.
9856 ++ *
9857 ++ * Maybe, at the retransmission we will have enough memory to
9858 ++ * fully add the socket to the meta-sk.
9859 ++ */
9860 ++ goto teardown;
9861 ++ }
9862 ++
9863 ++ /* The child is a clone of the meta socket; we must now reset
9864 ++ * some of the fields.
9865 ++ */
9866 ++ child_tp->mptcp->rcv_low_prio = mtreq->rcv_low_prio;
9867 ++
9868 ++ /* We should allow proper increase of the snd/rcv-buffers. Thus, we
9869 ++ * use the original values instead of the bloated up ones from the
9870 ++ * clone.
9871 ++ */
9872 ++ child->sk_sndbuf = mpcb->orig_sk_sndbuf;
9873 ++ child->sk_rcvbuf = mpcb->orig_sk_rcvbuf;
9874 ++
9875 ++ child_tp->mptcp->slave_sk = 1;
9876 ++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
9877 ++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
9878 ++ child_tp->mptcp->init_rcv_wnd = req->rcv_wnd;
9879 ++
9880 ++ child_tp->tsq_flags = 0;
9881 ++
9882 ++ /* Subflows do not use the accept queue, as they
9883 ++ * are attached immediately to the mpcb.
9884 ++ */
9885 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
9886 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
9887 ++ reqsk_free(req);
9888 ++ return child;
9889 ++
9890 ++teardown:
9891 ++ /* Drop this request - sock creation failed. */
9892 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
9893 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
9894 ++ reqsk_free(req);
9895 ++ inet_csk_prepare_forced_close(child);
9896 ++ tcp_done(child);
9897 ++ return meta_sk;
9898 ++}
9899 ++
9900 ++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw)
9901 ++{
9902 ++ struct mptcp_tw *mptw;
9903 ++ struct tcp_sock *tp = tcp_sk(sk);
9904 ++ struct mptcp_cb *mpcb = tp->mpcb;
9905 ++
9906 ++ /* A subsocket in tw can only receive data. So, if we are in
9907 ++ * infinite-receive, then we should not reply with a data-ack or act
9908 ++ * upon general MPTCP-signaling. We prevent this by simply not creating
9909 ++ * the mptcp_tw_sock.
9910 ++ */
9911 ++ if (mpcb->infinite_mapping_rcv) {
9912 ++ tw->mptcp_tw = NULL;
9913 ++ return 0;
9914 ++ }
9915 ++
9916 ++ /* Alloc MPTCP-tw-sock */
9917 ++ mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC);
9918 ++ if (!mptw)
9919 ++ return -ENOBUFS;
9920 ++
9921 ++ atomic_inc(&mpcb->mpcb_refcnt);
9922 ++
9923 ++ tw->mptcp_tw = mptw;
9924 ++ mptw->loc_key = mpcb->mptcp_loc_key;
9925 ++ mptw->meta_tw = mpcb->in_time_wait;
9926 ++ if (mptw->meta_tw) {
9927 ++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
9928 ++ if (mpcb->mptw_state != TCP_TIME_WAIT)
9929 ++ mptw->rcv_nxt++;
9930 ++ }
9931 ++ rcu_assign_pointer(mptw->mpcb, mpcb);
9932 ++
9933 ++ spin_lock(&mpcb->tw_lock);
9934 ++ list_add_rcu(&mptw->list, &tp->mpcb->tw_list);
9935 ++ mptw->in_list = 1;
9936 ++ spin_unlock(&mpcb->tw_lock);
9937 ++
9938 ++ return 0;
9939 ++}
9940 ++
9941 ++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw)
9942 ++{
9943 ++ struct mptcp_cb *mpcb;
9944 ++
9945 ++ rcu_read_lock();
9946 ++ mpcb = rcu_dereference(tw->mptcp_tw->mpcb);
9947 ++
9948 ++ /* If we are still holding a ref to the mpcb, we have to remove ourselves
9949 ++ * from the list and drop the ref properly.
9950 ++ */
9951 ++ if (mpcb && atomic_inc_not_zero(&mpcb->mpcb_refcnt)) {
9952 ++ spin_lock(&mpcb->tw_lock);
9953 ++ if (tw->mptcp_tw->in_list) {
9954 ++ list_del_rcu(&tw->mptcp_tw->list);
9955 ++ tw->mptcp_tw->in_list = 0;
9956 ++ }
9957 ++ spin_unlock(&mpcb->tw_lock);
9958 ++
9959 ++ /* Twice, because we increased it above */
9960 ++ mptcp_mpcb_put(mpcb);
9961 ++ mptcp_mpcb_put(mpcb);
9962 ++ }
9963 ++
9964 ++ rcu_read_unlock();
9965 ++
9966 ++ kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw);
9967 ++}
9968 ++
9969 ++/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a
9970 ++ * data-fin.
9971 ++ */
9972 ++void mptcp_time_wait(struct sock *sk, int state, int timeo)
9973 ++{
9974 ++ struct tcp_sock *tp = tcp_sk(sk);
9975 ++ struct mptcp_tw *mptw;
9976 ++
9977 ++ /* Used for sockets that go into tw after the meta
9978 ++ * (see mptcp_init_tw_sock())
9979 ++ */
9980 ++ tp->mpcb->in_time_wait = 1;
9981 ++ tp->mpcb->mptw_state = state;
9982 ++
9983 ++ /* Update the time-wait-sock's information */
9984 ++ rcu_read_lock_bh();
9985 ++ list_for_each_entry_rcu(mptw, &tp->mpcb->tw_list, list) {
9986 ++ mptw->meta_tw = 1;
9987 ++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(tp);
9988 ++
9989 ++ /* We want to ack a DATA_FIN, but are still in FIN_WAIT_2 -
9990 ++ * pretend as if the DATA_FIN has already reached us, so that
9991 ++ * the checks in tcp_timewait_state_process will pass when the
9992 ++ * DATA_FIN comes in.
9993 ++ */
9994 ++ if (state != TCP_TIME_WAIT)
9995 ++ mptw->rcv_nxt++;
9996 ++ }
9997 ++ rcu_read_unlock_bh();
9998 ++
9999 ++ tcp_done(sk);
10000 ++}
10001 ++
10002 ++void mptcp_tsq_flags(struct sock *sk)
10003 ++{
10004 ++ struct tcp_sock *tp = tcp_sk(sk);
10005 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
10006 ++
10007 ++ /* It will be handled as a regular deferred-call */
10008 ++ if (is_meta_sk(sk))
10009 ++ return;
10010 ++
10011 ++ if (hlist_unhashed(&tp->mptcp->cb_list)) {
10012 ++ hlist_add_head(&tp->mptcp->cb_list, &tp->mpcb->callback_list);
10013 ++ /* We need to hold it here, as the sock_hold is not assured
10014 ++ * by the release_sock as it is done in regular TCP.
10015 ++ *
10016 ++ * The subsocket may get inet_csk_destroy'd while it is inside
10017 ++ * the callback_list.
10018 ++ */
10019 ++ sock_hold(sk);
10020 ++ }
10021 ++
10022 ++ if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &tcp_sk(meta_sk)->tsq_flags))
10023 ++ sock_hold(meta_sk);
10024 ++}
10025 ++
10026 ++void mptcp_tsq_sub_deferred(struct sock *meta_sk)
10027 ++{
10028 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
10029 ++ struct mptcp_tcp_sock *mptcp;
10030 ++ struct hlist_node *tmp;
10031 ++
10032 ++ BUG_ON(!is_meta_sk(meta_sk) && !meta_tp->was_meta_sk);
10033 ++
10034 ++ __sock_put(meta_sk);
10035 ++ hlist_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) {
10036 ++ struct tcp_sock *tp = mptcp->tp;
10037 ++ struct sock *sk = (struct sock *)tp;
10038 ++
10039 ++ hlist_del_init(&mptcp->cb_list);
10040 ++ sk->sk_prot->release_cb(sk);
10041 ++ /* Final sock_put (cfr. mptcp_tsq_flags) */
10042 ++ sock_put(sk);
10043 ++ }
10044 ++}
10045 ++
10046 ++void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req,
10047 ++ struct sk_buff *skb)
10048 ++{
10049 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
10050 ++ struct mptcp_options_received mopt;
10051 ++ u8 mptcp_hash_mac[20];
10052 ++
10053 ++ mptcp_init_mp_opt(&mopt);
10054 ++ tcp_parse_mptcp_options(skb, &mopt);
10055 ++
10056 ++ mtreq = mptcp_rsk(req);
10057 ++ mtreq->mptcp_mpcb = mpcb;
10058 ++ mtreq->is_sub = 1;
10059 ++ inet_rsk(req)->mptcp_rqsk = 1;
10060 ++
10061 ++ mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
10062 ++
10063 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
10064 ++ (u8 *)&mpcb->mptcp_rem_key,
10065 ++ (u8 *)&mtreq->mptcp_loc_nonce,
10066 ++ (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac);
10067 ++ mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
10068 ++
10069 ++ mtreq->rem_id = mopt.rem_id;
10070 ++ mtreq->rcv_low_prio = mopt.low_prio;
10071 ++ inet_rsk(req)->saw_mpc = 1;
10072 ++}
10073 ++
10074 ++void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb)
10075 ++{
10076 ++ struct mptcp_options_received mopt;
10077 ++ struct mptcp_request_sock *mreq = mptcp_rsk(req);
10078 ++
10079 ++ mptcp_init_mp_opt(&mopt);
10080 ++ tcp_parse_mptcp_options(skb, &mopt);
10081 ++
10082 ++ mreq->is_sub = 0;
10083 ++ inet_rsk(req)->mptcp_rqsk = 1;
10084 ++ mreq->dss_csum = mopt.dss_csum;
10085 ++ mreq->hash_entry.pprev = NULL;
10086 ++
10087 ++ mptcp_reqsk_new_mptcp(req, &mopt, skb);
10088 ++}
10089 ++
10090 ++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb)
10091 ++{
10092 ++ struct mptcp_options_received mopt;
10093 ++ const struct tcp_sock *tp = tcp_sk(sk);
10094 ++ __u32 isn = TCP_SKB_CB(skb)->when;
10095 ++ bool want_cookie = false;
10096 ++
10097 ++ if ((sysctl_tcp_syncookies == 2 ||
10098 ++ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
10099 ++ want_cookie = tcp_syn_flood_action(sk, skb,
10100 ++ mptcp_request_sock_ops.slab_name);
10101 ++ if (!want_cookie)
10102 ++ goto drop;
10103 ++ }
10104 ++
10105 ++ mptcp_init_mp_opt(&mopt);
10106 ++ tcp_parse_mptcp_options(skb, &mopt);
10107 ++
10108 ++ if (mopt.is_mp_join)
10109 ++ return mptcp_do_join_short(skb, &mopt, sock_net(sk));
10110 ++ if (mopt.drop_me)
10111 ++ goto drop;
10112 ++
10113 ++ if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled)
10114 ++ mopt.saw_mpc = 0;
10115 ++
10116 ++ if (skb->protocol == htons(ETH_P_IP)) {
10117 ++ if (mopt.saw_mpc && !want_cookie) {
10118 ++ if (skb_rtable(skb)->rt_flags &
10119 ++ (RTCF_BROADCAST | RTCF_MULTICAST))
10120 ++ goto drop;
10121 ++
10122 ++ return tcp_conn_request(&mptcp_request_sock_ops,
10123 ++ &mptcp_request_sock_ipv4_ops,
10124 ++ sk, skb);
10125 ++ }
10126 ++
10127 ++ return tcp_v4_conn_request(sk, skb);
10128 ++#if IS_ENABLED(CONFIG_IPV6)
10129 ++ } else {
10130 ++ if (mopt.saw_mpc && !want_cookie) {
10131 ++ if (!ipv6_unicast_destination(skb))
10132 ++ goto drop;
10133 ++
10134 ++ return tcp_conn_request(&mptcp6_request_sock_ops,
10135 ++ &mptcp_request_sock_ipv6_ops,
10136 ++ sk, skb);
10137 ++ }
10138 ++
10139 ++ return tcp_v6_conn_request(sk, skb);
10140 ++#endif
10141 ++ }
10142 ++drop:
10143 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
10144 ++ return 0;
10145 ++}
10146 ++
10147 ++struct workqueue_struct *mptcp_wq;
10148 ++EXPORT_SYMBOL(mptcp_wq);
10149 ++
10150 ++/* Output /proc/net/mptcp */
10151 ++static int mptcp_pm_seq_show(struct seq_file *seq, void *v)
10152 ++{
10153 ++ struct tcp_sock *meta_tp;
10154 ++ const struct net *net = seq->private;
10155 ++ int i, n = 0;
10156 ++
10157 ++ seq_printf(seq, " sl loc_tok rem_tok v6 local_address remote_address st ns tx_queue rx_queue inode");
10158 ++ seq_putc(seq, '\n');
10159 ++
10160 ++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
10161 ++ struct hlist_nulls_node *node;
10162 ++ rcu_read_lock_bh();
10163 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node,
10164 ++ &tk_hashtable[i], tk_table) {
10165 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
10166 ++ struct sock *meta_sk = (struct sock *)meta_tp;
10167 ++ struct inet_sock *isk = inet_sk(meta_sk);
10168 ++
10169 ++ if (!mptcp(meta_tp) || !net_eq(net, sock_net(meta_sk)))
10170 ++ continue;
10171 ++
10172 ++ if (capable(CAP_NET_ADMIN)) {
10173 ++ seq_printf(seq, "%4d: %04X %04X ", n++,
10174 ++ mpcb->mptcp_loc_token,
10175 ++ mpcb->mptcp_rem_token);
10176 ++ } else {
10177 ++ seq_printf(seq, "%4d: %04X %04X ", n++, -1, -1);
10178 ++ }
10179 ++ if (meta_sk->sk_family == AF_INET ||
10180 ++ mptcp_v6_is_v4_mapped(meta_sk)) {
10181 ++ seq_printf(seq, " 0 %08X:%04X %08X:%04X ",
10182 ++ isk->inet_rcv_saddr,
10183 ++ ntohs(isk->inet_sport),
10184 ++ isk->inet_daddr,
10185 ++ ntohs(isk->inet_dport));
10186 ++#if IS_ENABLED(CONFIG_IPV6)
10187 ++ } else if (meta_sk->sk_family == AF_INET6) {
10188 ++ struct in6_addr *src = &meta_sk->sk_v6_rcv_saddr;
10189 ++ struct in6_addr *dst = &meta_sk->sk_v6_daddr;
10190 ++ seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X",
10191 ++ src->s6_addr32[0], src->s6_addr32[1],
10192 ++ src->s6_addr32[2], src->s6_addr32[3],
10193 ++ ntohs(isk->inet_sport),
10194 ++ dst->s6_addr32[0], dst->s6_addr32[1],
10195 ++ dst->s6_addr32[2], dst->s6_addr32[3],
10196 ++ ntohs(isk->inet_dport));
10197 ++#endif
10198 ++ }
10199 ++ seq_printf(seq, " %02X %02X %08X:%08X %lu",
10200 ++ meta_sk->sk_state, mpcb->cnt_subflows,
10201 ++ meta_tp->write_seq - meta_tp->snd_una,
10202 ++ max_t(int, meta_tp->rcv_nxt -
10203 ++ meta_tp->copied_seq, 0),
10204 ++ sock_i_ino(meta_sk));
10205 ++ seq_putc(seq, '\n');
10206 ++ }
10207 ++
10208 ++ rcu_read_unlock_bh();
10209 ++ }
10210 ++
10211 ++ return 0;
10212 ++}
10213 ++
10214 ++static int mptcp_pm_seq_open(struct inode *inode, struct file *file)
10215 ++{
10216 ++ return single_open_net(inode, file, mptcp_pm_seq_show);
10217 ++}
10218 ++
10219 ++static const struct file_operations mptcp_pm_seq_fops = {
10220 ++ .owner = THIS_MODULE,
10221 ++ .open = mptcp_pm_seq_open,
10222 ++ .read = seq_read,
10223 ++ .llseek = seq_lseek,
10224 ++ .release = single_release_net,
10225 ++};
10226 ++
10227 ++static int mptcp_pm_init_net(struct net *net)
10228 ++{
10229 ++ if (!proc_create("mptcp", S_IRUGO, net->proc_net, &mptcp_pm_seq_fops))
10230 ++ return -ENOMEM;
10231 ++
10232 ++ return 0;
10233 ++}
10234 ++
10235 ++static void mptcp_pm_exit_net(struct net *net)
10236 ++{
10237 ++ remove_proc_entry("mptcp", net->proc_net);
10238 ++}
10239 ++
10240 ++static struct pernet_operations mptcp_pm_proc_ops = {
10241 ++ .init = mptcp_pm_init_net,
10242 ++ .exit = mptcp_pm_exit_net,
10243 ++};
10244 ++
10245 ++/* General initialization of mptcp */
10246 ++void __init mptcp_init(void)
10247 ++{
10248 ++ int i;
10249 ++ struct ctl_table_header *mptcp_sysctl;
10250 ++
10251 ++ mptcp_sock_cache = kmem_cache_create("mptcp_sock",
10252 ++ sizeof(struct mptcp_tcp_sock),
10253 ++ 0, SLAB_HWCACHE_ALIGN,
10254 ++ NULL);
10255 ++ if (!mptcp_sock_cache)
10256 ++ goto mptcp_sock_cache_failed;
10257 ++
10258 ++ mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb),
10259 ++ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
10260 ++ NULL);
10261 ++ if (!mptcp_cb_cache)
10262 ++ goto mptcp_cb_cache_failed;
10263 ++
10264 ++ mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw),
10265 ++ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
10266 ++ NULL);
10267 ++ if (!mptcp_tw_cache)
10268 ++ goto mptcp_tw_cache_failed;
10269 ++
10270 ++ get_random_bytes(mptcp_secret, sizeof(mptcp_secret));
10271 ++
10272 ++ mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
10273 ++ if (!mptcp_wq)
10274 ++ goto alloc_workqueue_failed;
10275 ++
10276 ++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
10277 ++ INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i);
10278 ++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_htb[i],
10279 ++ i + MPTCP_REQSK_NULLS_BASE);
10280 ++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i);
10281 ++ }
10282 ++
10283 ++ spin_lock_init(&mptcp_reqsk_hlock);
10284 ++ spin_lock_init(&mptcp_tk_hashlock);
10285 ++
10286 ++ if (register_pernet_subsys(&mptcp_pm_proc_ops))
10287 ++ goto pernet_failed;
10288 ++
10289 ++#if IS_ENABLED(CONFIG_IPV6)
10290 ++ if (mptcp_pm_v6_init())
10291 ++ goto mptcp_pm_v6_failed;
10292 ++#endif
10293 ++ if (mptcp_pm_v4_init())
10294 ++ goto mptcp_pm_v4_failed;
10295 ++
10296 ++ mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table);
10297 ++ if (!mptcp_sysctl)
10298 ++ goto register_sysctl_failed;
10299 ++
10300 ++ if (mptcp_register_path_manager(&mptcp_pm_default))
10301 ++ goto register_pm_failed;
10302 ++
10303 ++ if (mptcp_register_scheduler(&mptcp_sched_default))
10304 ++ goto register_sched_failed;
10305 ++
10306 ++ pr_info("MPTCP: Stable release v0.89.0-rc");
10307 ++
10308 ++ mptcp_init_failed = false;
10309 ++
10310 ++ return;
10311 ++
10312 ++register_sched_failed:
10313 ++ mptcp_unregister_path_manager(&mptcp_pm_default);
10314 ++register_pm_failed:
10315 ++ unregister_net_sysctl_table(mptcp_sysctl);
10316 ++register_sysctl_failed:
10317 ++ mptcp_pm_v4_undo();
10318 ++mptcp_pm_v4_failed:
10319 ++#if IS_ENABLED(CONFIG_IPV6)
10320 ++ mptcp_pm_v6_undo();
10321 ++mptcp_pm_v6_failed:
10322 ++#endif
10323 ++ unregister_pernet_subsys(&mptcp_pm_proc_ops);
10324 ++pernet_failed:
10325 ++ destroy_workqueue(mptcp_wq);
10326 ++alloc_workqueue_failed:
10327 ++ kmem_cache_destroy(mptcp_tw_cache);
10328 ++mptcp_tw_cache_failed:
10329 ++ kmem_cache_destroy(mptcp_cb_cache);
10330 ++mptcp_cb_cache_failed:
10331 ++ kmem_cache_destroy(mptcp_sock_cache);
10332 ++mptcp_sock_cache_failed:
10333 ++ mptcp_init_failed = true;
10334 ++}
10335 +diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c
10336 +new file mode 100644
10337 +index 000000000000..3a54413ce25b
10338 +--- /dev/null
10339 ++++ b/net/mptcp/mptcp_fullmesh.c
10340 +@@ -0,0 +1,1722 @@
10341 ++#include <linux/module.h>
10342 ++
10343 ++#include <net/mptcp.h>
10344 ++#include <net/mptcp_v4.h>
10345 ++
10346 ++#if IS_ENABLED(CONFIG_IPV6)
10347 ++#include <net/mptcp_v6.h>
10348 ++#include <net/addrconf.h>
10349 ++#endif
10350 ++
10351 ++enum {
10352 ++ MPTCP_EVENT_ADD = 1,
10353 ++ MPTCP_EVENT_DEL,
10354 ++ MPTCP_EVENT_MOD,
10355 ++};
10356 ++
10357 ++#define MPTCP_SUBFLOW_RETRY_DELAY 1000
10358 ++
10359 ++/* Max number of local or remote addresses we can store.
10360 ++ * When changing, see the bitfield below in fullmesh_rem4/6.
10361 ++ */
10362 ++#define MPTCP_MAX_ADDR 8
10363 ++
10364 ++struct fullmesh_rem4 {
10365 ++ u8 rem4_id;
10366 ++ u8 bitfield;
10367 ++ u8 retry_bitfield;
10368 ++ __be16 port;
10369 ++ struct in_addr addr;
10370 ++};
10371 ++
10372 ++struct fullmesh_rem6 {
10373 ++ u8 rem6_id;
10374 ++ u8 bitfield;
10375 ++ u8 retry_bitfield;
10376 ++ __be16 port;
10377 ++ struct in6_addr addr;
10378 ++};
10379 ++
10380 ++struct mptcp_loc_addr {
10381 ++ struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR];
10382 ++ u8 loc4_bits;
10383 ++ u8 next_v4_index;
10384 ++
10385 ++ struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR];
10386 ++ u8 loc6_bits;
10387 ++ u8 next_v6_index;
10388 ++};
10389 ++
10390 ++struct mptcp_addr_event {
10391 ++ struct list_head list;
10392 ++ unsigned short family;
10393 ++ u8 code:7,
10394 ++ low_prio:1;
10395 ++ union inet_addr addr;
10396 ++};
10397 ++
10398 ++struct fullmesh_priv {
10399 ++ /* Worker struct for subflow establishment */
10400 ++ struct work_struct subflow_work;
10401 ++ /* Delayed worker, when the routing-tables are not yet ready. */
10402 ++ struct delayed_work subflow_retry_work;
10403 ++
10404 ++ /* Remote addresses */
10405 ++ struct fullmesh_rem4 remaddr4[MPTCP_MAX_ADDR];
10406 ++ struct fullmesh_rem6 remaddr6[MPTCP_MAX_ADDR];
10407 ++
10408 ++ struct mptcp_cb *mpcb;
10409 ++
10410 ++ u16 remove_addrs; /* Addresses to remove */
10411 ++ u8 announced_addrs_v4; /* IPv4 Addresses we did announce */
10412 ++ u8 announced_addrs_v6; /* IPv6 Addresses we did announce */
10413 ++
10414 ++ u8 add_addr; /* Are we sending an add_addr? */
10415 ++
10416 ++ u8 rem4_bits;
10417 ++ u8 rem6_bits;
10418 ++};
10419 ++
10420 ++struct mptcp_fm_ns {
10421 ++ struct mptcp_loc_addr __rcu *local;
10422 ++ spinlock_t local_lock; /* Protecting the above pointer */
10423 ++ struct list_head events;
10424 ++ struct delayed_work address_worker;
10425 ++
10426 ++ struct net *net;
10427 ++};
10428 ++
10429 ++static struct mptcp_pm_ops full_mesh __read_mostly;
10430 ++
10431 ++static void full_mesh_create_subflows(struct sock *meta_sk);
10432 ++
10433 ++static struct mptcp_fm_ns *fm_get_ns(const struct net *net)
10434 ++{
10435 ++ return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH];
10436 ++}
10437 ++
10438 ++static struct fullmesh_priv *fullmesh_get_priv(const struct mptcp_cb *mpcb)
10439 ++{
10440 ++ return (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
10441 ++}
10442 ++
10443 ++/* Find the first free index in the bitfield */
10444 ++static int __mptcp_find_free_index(u8 bitfield, u8 base)
10445 ++{
10446 ++ int i;
10447 ++
10448 ++ /* There are no free bits anyway... */
10449 ++ if (bitfield == 0xff)
10450 ++ goto exit;
10451 ++
10452 ++ i = ffs(~(bitfield >> base)) - 1;
10453 ++ if (i < 0)
10454 ++ goto exit;
10455 ++
10456 ++ /* No free bits when starting at base, try from 0 on */
10457 ++ if (i + base >= sizeof(bitfield) * 8)
10458 ++ return __mptcp_find_free_index(bitfield, 0);
10459 ++
10460 ++ return i + base;
10461 ++exit:
10462 ++ return -1;
10463 ++}
10464 ++
10465 ++static int mptcp_find_free_index(u8 bitfield)
10466 ++{
10467 ++ return __mptcp_find_free_index(bitfield, 0);
10468 ++}
10469 ++
10470 ++static void mptcp_addv4_raddr(struct mptcp_cb *mpcb,
10471 ++ const struct in_addr *addr,
10472 ++ __be16 port, u8 id)
10473 ++{
10474 ++ int i;
10475 ++ struct fullmesh_rem4 *rem4;
10476 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10477 ++
10478 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10479 ++ rem4 = &fmp->remaddr4[i];
10480 ++
10481 ++ /* Address is already in the list --- continue */
10482 ++ if (rem4->rem4_id == id &&
10483 ++ rem4->addr.s_addr == addr->s_addr && rem4->port == port)
10484 ++ return;
10485 ++
10486 ++ /* This may be the case when the peer is behind a NAT. It is
10487 ++ * trying to JOIN, thus sending the JOIN with a certain ID.
10488 ++ * However, the src_addr of the IP packet has been changed. We
10489 ++ * update the addr in the list, because this is the address as
10490 ++ * OUR BOX sees it.
10491 ++ */
10492 ++ if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) {
10493 ++ /* update the address */
10494 ++ mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n",
10495 ++ __func__, &rem4->addr.s_addr,
10496 ++ &addr->s_addr, id);
10497 ++ rem4->addr.s_addr = addr->s_addr;
10498 ++ rem4->port = port;
10499 ++ mpcb->list_rcvd = 1;
10500 ++ return;
10501 ++ }
10502 ++ }
10503 ++
10504 ++ i = mptcp_find_free_index(fmp->rem4_bits);
10505 ++ /* Do we already have the maximum number of local/remote addresses? */
10506 ++ if (i < 0) {
10507 ++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n",
10508 ++ __func__, MPTCP_MAX_ADDR, &addr->s_addr);
10509 ++ return;
10510 ++ }
10511 ++
10512 ++ rem4 = &fmp->remaddr4[i];
10513 ++
10514 ++ /* Address is not known yet, store it */
10515 ++ rem4->addr.s_addr = addr->s_addr;
10516 ++ rem4->port = port;
10517 ++ rem4->bitfield = 0;
10518 ++ rem4->retry_bitfield = 0;
10519 ++ rem4->rem4_id = id;
10520 ++ mpcb->list_rcvd = 1;
10521 ++ fmp->rem4_bits |= (1 << i);
10522 ++
10523 ++ return;
10524 ++}
10525 ++
10526 ++static void mptcp_addv6_raddr(struct mptcp_cb *mpcb,
10527 ++ const struct in6_addr *addr,
10528 ++ __be16 port, u8 id)
10529 ++{
10530 ++ int i;
10531 ++ struct fullmesh_rem6 *rem6;
10532 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10533 ++
10534 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10535 ++ rem6 = &fmp->remaddr6[i];
10536 ++
10537 ++ /* Address is already in the list --- continue */
10538 ++ if (rem6->rem6_id == id &&
10539 ++ ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port)
10540 ++ return;
10541 ++
10542 ++ /* This may be the case when the peer is behind a NAT. It is
10543 ++ * trying to JOIN, thus sending the JOIN with a certain ID.
10544 ++ * However, the src_addr of the IP packet has been changed. We
10545 ++ * update the addr in the list, because this is the address as
10546 ++ * OUR BOX sees it.
10547 ++ */
10548 ++ if (rem6->rem6_id == id) {
10549 ++ /* update the address */
10550 ++ mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n",
10551 ++ __func__, &rem6->addr, addr, id);
10552 ++ rem6->addr = *addr;
10553 ++ rem6->port = port;
10554 ++ mpcb->list_rcvd = 1;
10555 ++ return;
10556 ++ }
10557 ++ }
10558 ++
10559 ++ i = mptcp_find_free_index(fmp->rem6_bits);
10560 ++ /* Do we already have the maximum number of local/remote addresses? */
10561 ++ if (i < 0) {
10562 ++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n",
10563 ++ __func__, MPTCP_MAX_ADDR, addr);
10564 ++ return;
10565 ++ }
10566 ++
10567 ++ rem6 = &fmp->remaddr6[i];
10568 ++
10569 ++ /* Address is not known yet, store it */
10570 ++ rem6->addr = *addr;
10571 ++ rem6->port = port;
10572 ++ rem6->bitfield = 0;
10573 ++ rem6->retry_bitfield = 0;
10574 ++ rem6->rem6_id = id;
10575 ++ mpcb->list_rcvd = 1;
10576 ++ fmp->rem6_bits |= (1 << i);
10577 ++
10578 ++ return;
10579 ++}
10580 ++
10581 ++static void mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id)
10582 ++{
10583 ++ int i;
10584 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10585 ++
10586 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10587 ++ if (fmp->remaddr4[i].rem4_id == id) {
10588 ++ /* remove address from bitfield */
10589 ++ fmp->rem4_bits &= ~(1 << i);
10590 ++
10591 ++ break;
10592 ++ }
10593 ++ }
10594 ++}
10595 ++
10596 ++static void mptcp_v6_rem_raddress(const struct mptcp_cb *mpcb, u8 id)
10597 ++{
10598 ++ int i;
10599 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10600 ++
10601 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10602 ++ if (fmp->remaddr6[i].rem6_id == id) {
10603 ++ /* remove address from bitfield */
10604 ++ fmp->rem6_bits &= ~(1 << i);
10605 ++
10606 ++ break;
10607 ++ }
10608 ++ }
10609 ++}
10610 ++
10611 ++/* Sets the bitfield of the remote-address field */
10612 ++static void mptcp_v4_set_init_addr_bit(const struct mptcp_cb *mpcb,
10613 ++ const struct in_addr *addr, u8 index)
10614 ++{
10615 ++ int i;
10616 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10617 ++
10618 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10619 ++ if (fmp->remaddr4[i].addr.s_addr == addr->s_addr) {
10620 ++ fmp->remaddr4[i].bitfield |= (1 << index);
10621 ++ return;
10622 ++ }
10623 ++ }
10624 ++}
10625 ++
10626 ++/* Sets the bitfield of the remote-address field */
10627 ++static void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
10628 ++ const struct in6_addr *addr, u8 index)
10629 ++{
10630 ++ int i;
10631 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10632 ++
10633 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10634 ++ if (ipv6_addr_equal(&fmp->remaddr6[i].addr, addr)) {
10635 ++ fmp->remaddr6[i].bitfield |= (1 << index);
10636 ++ return;
10637 ++ }
10638 ++ }
10639 ++}
10640 ++
10641 ++static void mptcp_set_init_addr_bit(struct mptcp_cb *mpcb,
10642 ++ const union inet_addr *addr,
10643 ++ sa_family_t family, u8 id)
10644 ++{
10645 ++ if (family == AF_INET)
10646 ++ mptcp_v4_set_init_addr_bit(mpcb, &addr->in, id);
10647 ++ else
10648 ++ mptcp_v6_set_init_addr_bit(mpcb, &addr->in6, id);
10649 ++}
10650 ++
10651 ++static void retry_subflow_worker(struct work_struct *work)
10652 ++{
10653 ++ struct delayed_work *delayed_work = container_of(work,
10654 ++ struct delayed_work,
10655 ++ work);
10656 ++ struct fullmesh_priv *fmp = container_of(delayed_work,
10657 ++ struct fullmesh_priv,
10658 ++ subflow_retry_work);
10659 ++ struct mptcp_cb *mpcb = fmp->mpcb;
10660 ++ struct sock *meta_sk = mpcb->meta_sk;
10661 ++ struct mptcp_loc_addr *mptcp_local;
10662 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
10663 ++ int iter = 0, i;
10664 ++
10665 ++ /* We need a local (stable) copy of the address-list. Really, it is not
10666 ++ * such a big deal if the address-list is not 100% up-to-date.
10667 ++ */
10668 ++ rcu_read_lock_bh();
10669 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
10670 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
10671 ++ rcu_read_unlock_bh();
10672 ++
10673 ++ if (!mptcp_local)
10674 ++ return;
10675 ++
10676 ++next_subflow:
10677 ++ if (iter) {
10678 ++ release_sock(meta_sk);
10679 ++ mutex_unlock(&mpcb->mpcb_mutex);
10680 ++
10681 ++ cond_resched();
10682 ++ }
10683 ++ mutex_lock(&mpcb->mpcb_mutex);
10684 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
10685 ++
10686 ++ iter++;
10687 ++
10688 ++ if (sock_flag(meta_sk, SOCK_DEAD))
10689 ++ goto exit;
10690 ++
10691 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10692 ++ struct fullmesh_rem4 *rem = &fmp->remaddr4[i];
10693 ++ /* Do we need to retry establishing a subflow? */
10694 ++ if (rem->retry_bitfield) {
10695 ++ int i = mptcp_find_free_index(~rem->retry_bitfield);
10696 ++ struct mptcp_rem4 rem4;
10697 ++
10698 ++ rem->bitfield |= (1 << i);
10699 ++ rem->retry_bitfield &= ~(1 << i);
10700 ++
10701 ++ rem4.addr = rem->addr;
10702 ++ rem4.port = rem->port;
10703 ++ rem4.rem4_id = rem->rem4_id;
10704 ++
10705 ++ mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], &rem4);
10706 ++ goto next_subflow;
10707 ++ }
10708 ++ }
10709 ++
10710 ++#if IS_ENABLED(CONFIG_IPV6)
10711 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10712 ++ struct fullmesh_rem6 *rem = &fmp->remaddr6[i];
10713 ++
10714 ++ /* Do we need to retry establishing a subflow? */
10715 ++ if (rem->retry_bitfield) {
10716 ++ int i = mptcp_find_free_index(~rem->retry_bitfield);
10717 ++ struct mptcp_rem6 rem6;
10718 ++
10719 ++ rem->bitfield |= (1 << i);
10720 ++ rem->retry_bitfield &= ~(1 << i);
10721 ++
10722 ++ rem6.addr = rem->addr;
10723 ++ rem6.port = rem->port;
10724 ++ rem6.rem6_id = rem->rem6_id;
10725 ++
10726 ++ mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], &rem6);
10727 ++ goto next_subflow;
10728 ++ }
10729 ++ }
10730 ++#endif
10731 ++
10732 ++exit:
10733 ++ kfree(mptcp_local);
10734 ++ release_sock(meta_sk);
10735 ++ mutex_unlock(&mpcb->mpcb_mutex);
10736 ++ sock_put(meta_sk);
10737 ++}
10738 ++
10739 ++/**
10740 ++ * Create all new subflows by calling mptcp_initX_subsockets.
10741 ++ *
10742 ++ * This function uses a goto to next_subflow to release the lock between
10743 ++ * new subflows, giving other processes a chance to do some work on the
10744 ++ * socket and potentially finish the communication.
10745 ++ **/
10746 ++static void create_subflow_worker(struct work_struct *work)
10747 ++{
10748 ++ struct fullmesh_priv *fmp = container_of(work, struct fullmesh_priv,
10749 ++ subflow_work);
10750 ++ struct mptcp_cb *mpcb = fmp->mpcb;
10751 ++ struct sock *meta_sk = mpcb->meta_sk;
10752 ++ struct mptcp_loc_addr *mptcp_local;
10753 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
10754 ++ int iter = 0, retry = 0;
10755 ++ int i;
10756 ++
10757 ++ /* We need a local (stable) copy of the address-list. Really, it is not
10758 ++ * such a big deal if the address-list is not 100% up-to-date.
10759 ++ */
10760 ++ rcu_read_lock_bh();
10761 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
10762 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
10763 ++ rcu_read_unlock_bh();
10764 ++
10765 ++ if (!mptcp_local)
10766 ++ return;
10767 ++
10768 ++next_subflow:
10769 ++ if (iter) {
10770 ++ release_sock(meta_sk);
10771 ++ mutex_unlock(&mpcb->mpcb_mutex);
10772 ++
10773 ++ cond_resched();
10774 ++ }
10775 ++ mutex_lock(&mpcb->mpcb_mutex);
10776 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
10777 ++
10778 ++ iter++;
10779 ++
10780 ++ if (sock_flag(meta_sk, SOCK_DEAD))
10781 ++ goto exit;
10782 ++
10783 ++ if (mpcb->master_sk &&
10784 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
10785 ++ goto exit;
10786 ++
10787 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10788 ++ struct fullmesh_rem4 *rem;
10789 ++ u8 remaining_bits;
10790 ++
10791 ++ rem = &fmp->remaddr4[i];
10792 ++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits;
10793 ++
10794 ++ /* Are there still combinations to handle? */
10795 ++ if (remaining_bits) {
10796 ++ int i = mptcp_find_free_index(~remaining_bits);
10797 ++ struct mptcp_rem4 rem4;
10798 ++
10799 ++ rem->bitfield |= (1 << i);
10800 ++
10801 ++ rem4.addr = rem->addr;
10802 ++ rem4.port = rem->port;
10803 ++ rem4.rem4_id = rem->rem4_id;
10804 ++
10805 ++ /* If a route is not yet available then retry once */
10806 ++ if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i],
10807 ++ &rem4) == -ENETUNREACH)
10808 ++ retry = rem->retry_bitfield |= (1 << i);
10809 ++ goto next_subflow;
10810 ++ }
10811 ++ }
10812 ++
10813 ++#if IS_ENABLED(CONFIG_IPV6)
10814 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10815 ++ struct fullmesh_rem6 *rem;
10816 ++ u8 remaining_bits;
10817 ++
10818 ++ rem = &fmp->remaddr6[i];
10819 ++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits;
10820 ++
10821 ++ /* Are there still combinations to handle? */
10822 ++ if (remaining_bits) {
10823 ++ int i = mptcp_find_free_index(~remaining_bits);
10824 ++ struct mptcp_rem6 rem6;
10825 ++
10826 ++ rem->bitfield |= (1 << i);
10827 ++
10828 ++ rem6.addr = rem->addr;
10829 ++ rem6.port = rem->port;
10830 ++ rem6.rem6_id = rem->rem6_id;
10831 ++
10832 ++ /* If a route is not yet available then retry once */
10833 ++ if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i],
10834 ++ &rem6) == -ENETUNREACH)
10835 ++ retry = rem->retry_bitfield |= (1 << i);
10836 ++ goto next_subflow;
10837 ++ }
10838 ++ }
10839 ++#endif
10840 ++
10841 ++ if (retry && !delayed_work_pending(&fmp->subflow_retry_work)) {
10842 ++ sock_hold(meta_sk);
10843 ++ queue_delayed_work(mptcp_wq, &fmp->subflow_retry_work,
10844 ++ msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY));
10845 ++ }
10846 ++
10847 ++exit:
10848 ++ kfree(mptcp_local);
10849 ++ release_sock(meta_sk);
10850 ++ mutex_unlock(&mpcb->mpcb_mutex);
10851 ++ sock_put(meta_sk);
10852 ++}
10853 ++
10854 ++static void announce_remove_addr(u8 addr_id, struct sock *meta_sk)
10855 ++{
10856 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
10857 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10858 ++ struct sock *sk = mptcp_select_ack_sock(meta_sk);
10859 ++
10860 ++ fmp->remove_addrs |= (1 << addr_id);
10861 ++ mpcb->addr_signal = 1;
10862 ++
10863 ++ if (sk)
10864 ++ tcp_send_ack(sk);
10865 ++}
10866 ++
10867 ++static void update_addr_bitfields(struct sock *meta_sk,
10868 ++ const struct mptcp_loc_addr *mptcp_local)
10869 ++{
10870 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
10871 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10872 ++ int i;
10873 ++
10874 ++ /* The bits in announced_addrs_* always match with loc*_bits. So, a
10875 ++ * simple & operation unsets the correct bits, because these go from
10876 ++ * announced to non-announced.
10877 ++ */
10878 ++ fmp->announced_addrs_v4 &= mptcp_local->loc4_bits;
10879 ++
10880 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10881 ++ fmp->remaddr4[i].bitfield &= mptcp_local->loc4_bits;
10882 ++ fmp->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits;
10883 ++ }
10884 ++
10885 ++ fmp->announced_addrs_v6 &= mptcp_local->loc6_bits;
10886 ++
10887 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10888 ++ fmp->remaddr6[i].bitfield &= mptcp_local->loc6_bits;
10889 ++ fmp->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits;
10890 ++ }
10891 ++}
10892 ++
10893 ++static int mptcp_find_address(const struct mptcp_loc_addr *mptcp_local,
10894 ++ sa_family_t family, const union inet_addr *addr)
10895 ++{
10896 ++ int i;
10897 ++ u8 loc_bits;
10898 ++ bool found = false;
10899 ++
10900 ++ if (family == AF_INET)
10901 ++ loc_bits = mptcp_local->loc4_bits;
10902 ++ else
10903 ++ loc_bits = mptcp_local->loc6_bits;
10904 ++
10905 ++ mptcp_for_each_bit_set(loc_bits, i) {
10906 ++ if (family == AF_INET &&
10907 ++ mptcp_local->locaddr4[i].addr.s_addr == addr->in.s_addr) {
10908 ++ found = true;
10909 ++ break;
10910 ++ }
10911 ++ if (family == AF_INET6 &&
10912 ++ ipv6_addr_equal(&mptcp_local->locaddr6[i].addr,
10913 ++ &addr->in6)) {
10914 ++ found = true;
10915 ++ break;
10916 ++ }
10917 ++ }
10918 ++
10919 ++ if (!found)
10920 ++ return -1;
10921 ++
10922 ++ return i;
10923 ++}
10924 ++
10925 ++static void mptcp_address_worker(struct work_struct *work)
10926 ++{
10927 ++ const struct delayed_work *delayed_work = container_of(work,
10928 ++ struct delayed_work,
10929 ++ work);
10930 ++ struct mptcp_fm_ns *fm_ns = container_of(delayed_work,
10931 ++ struct mptcp_fm_ns,
10932 ++ address_worker);
10933 ++ struct net *net = fm_ns->net;
10934 ++ struct mptcp_addr_event *event = NULL;
10935 ++ struct mptcp_loc_addr *mptcp_local, *old;
10936 ++ int i, id = -1; /* id is used in the socket-code on a delete-event */
10937 ++ bool success; /* Used to indicate if we succeeded handling the event */
10938 ++
10939 ++next_event:
10940 ++ success = false;
10941 ++ kfree(event);
10942 ++
10943 ++ /* First, let's dequeue an event from our event-list */
10944 ++ rcu_read_lock_bh();
10945 ++ spin_lock(&fm_ns->local_lock);
10946 ++
10947 ++ event = list_first_entry_or_null(&fm_ns->events,
10948 ++ struct mptcp_addr_event, list);
10949 ++ if (!event) {
10950 ++ spin_unlock(&fm_ns->local_lock);
10951 ++ rcu_read_unlock_bh();
10952 ++ return;
10953 ++ }
10954 ++
10955 ++ list_del(&event->list);
10956 ++
10957 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
10958 ++
10959 ++ if (event->code == MPTCP_EVENT_DEL) {
10960 ++ id = mptcp_find_address(mptcp_local, event->family, &event->addr);
10961 ++
10962 ++ /* Not in the list - so we don't care */
10963 ++ if (id < 0) {
10964 ++ mptcp_debug("%s could not find id\n", __func__);
10965 ++ goto duno;
10966 ++ }
10967 ++
10968 ++ old = mptcp_local;
10969 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
10970 ++ GFP_ATOMIC);
10971 ++ if (!mptcp_local)
10972 ++ goto duno;
10973 ++
10974 ++ if (event->family == AF_INET)
10975 ++ mptcp_local->loc4_bits &= ~(1 << id);
10976 ++ else
10977 ++ mptcp_local->loc6_bits &= ~(1 << id);
10978 ++
10979 ++ rcu_assign_pointer(fm_ns->local, mptcp_local);
10980 ++ kfree(old);
10981 ++ } else {
10982 ++ int i = mptcp_find_address(mptcp_local, event->family, &event->addr);
10983 ++ int j = i;
10984 ++
10985 ++ if (j < 0) {
10986 ++ /* Not in the list, so we have to find an empty slot */
10987 ++ if (event->family == AF_INET)
10988 ++ i = __mptcp_find_free_index(mptcp_local->loc4_bits,
10989 ++ mptcp_local->next_v4_index);
10990 ++ if (event->family == AF_INET6)
10991 ++ i = __mptcp_find_free_index(mptcp_local->loc6_bits,
10992 ++ mptcp_local->next_v6_index);
10993 ++
10994 ++ if (i < 0) {
10995 ++ mptcp_debug("%s no more space\n", __func__);
10996 ++ goto duno;
10997 ++ }
10998 ++
10999 ++ /* It might have been a MOD-event. */
11000 ++ event->code = MPTCP_EVENT_ADD;
11001 ++ } else {
11002 ++ /* Let's check if anything changes */
11003 ++ if (event->family == AF_INET &&
11004 ++ event->low_prio == mptcp_local->locaddr4[i].low_prio)
11005 ++ goto duno;
11006 ++
11007 ++ if (event->family == AF_INET6 &&
11008 ++ event->low_prio == mptcp_local->locaddr6[i].low_prio)
11009 ++ goto duno;
11010 ++ }
11011 ++
11012 ++ old = mptcp_local;
11013 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
11014 ++ GFP_ATOMIC);
11015 ++ if (!mptcp_local)
11016 ++ goto duno;
11017 ++
11018 ++ if (event->family == AF_INET) {
11019 ++ mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr;
11020 ++ mptcp_local->locaddr4[i].loc4_id = i + 1;
11021 ++ mptcp_local->locaddr4[i].low_prio = event->low_prio;
11022 ++ } else {
11023 ++ mptcp_local->locaddr6[i].addr = event->addr.in6;
11024 ++ mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR;
11025 ++ mptcp_local->locaddr6[i].low_prio = event->low_prio;
11026 ++ }
11027 ++
11028 ++ if (j < 0) {
11029 ++ if (event->family == AF_INET) {
11030 ++ mptcp_local->loc4_bits |= (1 << i);
11031 ++ mptcp_local->next_v4_index = i + 1;
11032 ++ } else {
11033 ++ mptcp_local->loc6_bits |= (1 << i);
11034 ++ mptcp_local->next_v6_index = i + 1;
11035 ++ }
11036 ++ }
11037 ++
11038 ++ rcu_assign_pointer(fm_ns->local, mptcp_local);
11039 ++ kfree(old);
11040 ++ }
11041 ++ success = true;
11042 ++
11043 ++duno:
11044 ++ spin_unlock(&fm_ns->local_lock);
11045 ++ rcu_read_unlock_bh();
11046 ++
11047 ++ if (!success)
11048 ++ goto next_event;
11049 ++
11050 ++ /* Now we iterate over the MPTCP-sockets and apply the event. */
11051 ++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
11052 ++ const struct hlist_nulls_node *node;
11053 ++ struct tcp_sock *meta_tp;
11054 ++
11055 ++ rcu_read_lock_bh();
11056 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i],
11057 ++ tk_table) {
11058 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
11059 ++ struct sock *meta_sk = (struct sock *)meta_tp, *sk;
11060 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11061 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11062 ++
11063 ++ if (sock_net(meta_sk) != net)
11064 ++ continue;
11065 ++
11066 ++ if (meta_v4) {
11067 ++ /* skip IPv6 events if meta is IPv4 */
11068 ++ if (event->family == AF_INET6)
11069 ++ continue;
11070 ++ }
11071 ++ /* skip IPv4 events if IPV6_V6ONLY is set */
11072 ++ else if (event->family == AF_INET &&
11073 ++ inet6_sk(meta_sk)->ipv6only)
11074 ++ continue;
11075 ++
11076 ++ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
11077 ++ continue;
11078 ++
11079 ++ bh_lock_sock(meta_sk);
11080 ++
11081 ++ if (!mptcp(meta_tp) || !is_meta_sk(meta_sk) ||
11082 ++ mpcb->infinite_mapping_snd ||
11083 ++ mpcb->infinite_mapping_rcv ||
11084 ++ mpcb->send_infinite_mapping)
11085 ++ goto next;
11086 ++
11087 ++ /* It may be that the pm has changed in-between */
11088 ++ if (mpcb->pm_ops != &full_mesh)
11089 ++ goto next;
11090 ++
11091 ++ if (sock_owned_by_user(meta_sk)) {
11092 ++ if (!test_and_set_bit(MPTCP_PATH_MANAGER,
11093 ++ &meta_tp->tsq_flags))
11094 ++ sock_hold(meta_sk);
11095 ++
11096 ++ goto next;
11097 ++ }
11098 ++
11099 ++ if (event->code == MPTCP_EVENT_ADD) {
11100 ++ fmp->add_addr++;
11101 ++ mpcb->addr_signal = 1;
11102 ++
11103 ++ sk = mptcp_select_ack_sock(meta_sk);
11104 ++ if (sk)
11105 ++ tcp_send_ack(sk);
11106 ++
11107 ++ full_mesh_create_subflows(meta_sk);
11108 ++ }
11109 ++
11110 ++ if (event->code == MPTCP_EVENT_DEL) {
11111 ++ struct sock *sk, *tmpsk;
11112 ++ struct mptcp_loc_addr *mptcp_local;
11113 ++ bool found = false;
11114 ++
11115 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
11116 ++
11117 ++ /* In any case, we need to update our bitfields */
11118 ++ if (id >= 0)
11119 ++ update_addr_bitfields(meta_sk, mptcp_local);
11120 ++
11121 ++ /* Look for the socket and remove it */
11122 ++ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
11123 ++ if ((event->family == AF_INET6 &&
11124 ++ (sk->sk_family == AF_INET ||
11125 ++ mptcp_v6_is_v4_mapped(sk))) ||
11126 ++ (event->family == AF_INET &&
11127 ++ (sk->sk_family == AF_INET6 &&
11128 ++ !mptcp_v6_is_v4_mapped(sk))))
11129 ++ continue;
11130 ++
11131 ++ if (event->family == AF_INET &&
11132 ++ (sk->sk_family == AF_INET ||
11133 ++ mptcp_v6_is_v4_mapped(sk)) &&
11134 ++ inet_sk(sk)->inet_saddr != event->addr.in.s_addr)
11135 ++ continue;
11136 ++
11137 ++ if (event->family == AF_INET6 &&
11138 ++ sk->sk_family == AF_INET6 &&
11139 ++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6))
11140 ++ continue;
11141 ++
11142 ++ /* Reinject, so that pf = 1 and so we
11143 ++ * won't select this one as the
11144 ++ * ack-sock.
11145 ++ */
11146 ++ mptcp_reinject_data(sk, 0);
11147 ++
11148 ++ /* We announce the removal of this id */
11149 ++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, meta_sk);
11150 ++
11151 ++ mptcp_sub_force_close(sk);
11152 ++ found = true;
11153 ++ }
11154 ++
11155 ++ if (found)
11156 ++ goto next;
11157 ++
11158 ++ /* The id may have been given by the event,
11159 ++ * matching on a local address. And it may not
11160 ++ * have matched on one of the above sockets,
11161 ++ * because the client never created a subflow.
11162 ++ * So, we have to finally remove it here.
11163 ++ */
11164 ++ if (id > 0)
11165 ++ announce_remove_addr(id, meta_sk);
11166 ++ }
11167 ++
11168 ++ if (event->code == MPTCP_EVENT_MOD) {
11169 ++ struct sock *sk;
11170 ++
11171 ++ mptcp_for_each_sk(mpcb, sk) {
11172 ++ struct tcp_sock *tp = tcp_sk(sk);
11173 ++ if (event->family == AF_INET &&
11174 ++ (sk->sk_family == AF_INET ||
11175 ++ mptcp_v6_is_v4_mapped(sk)) &&
11176 ++ inet_sk(sk)->inet_saddr == event->addr.in.s_addr) {
11177 ++ if (event->low_prio != tp->mptcp->low_prio) {
11178 ++ tp->mptcp->send_mp_prio = 1;
11179 ++ tp->mptcp->low_prio = event->low_prio;
11180 ++
11181 ++ tcp_send_ack(sk);
11182 ++ }
11183 ++ }
11184 ++
11185 ++ if (event->family == AF_INET6 &&
11186 ++ sk->sk_family == AF_INET6 &&
11187 ++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) {
11188 ++ if (event->low_prio != tp->mptcp->low_prio) {
11189 ++ tp->mptcp->send_mp_prio = 1;
11190 ++ tp->mptcp->low_prio = event->low_prio;
11191 ++
11192 ++ tcp_send_ack(sk);
11193 ++ }
11194 ++ }
11195 ++ }
11196 ++ }
11197 ++next:
11198 ++ bh_unlock_sock(meta_sk);
11199 ++ sock_put(meta_sk);
11200 ++ }
11201 ++ rcu_read_unlock_bh();
11202 ++ }
11203 ++ goto next_event;
11204 ++}
11205 ++
11206 ++static struct mptcp_addr_event *lookup_similar_event(const struct net *net,
11207 ++ const struct mptcp_addr_event *event)
11208 ++{
11209 ++ struct mptcp_addr_event *eventq;
11210 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11211 ++
11212 ++ list_for_each_entry(eventq, &fm_ns->events, list) {
11213 ++ if (eventq->family != event->family)
11214 ++ continue;
11215 ++ if (event->family == AF_INET) {
11216 ++ if (eventq->addr.in.s_addr == event->addr.in.s_addr)
11217 ++ return eventq;
11218 ++ } else {
11219 ++ if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6))
11220 ++ return eventq;
11221 ++ }
11222 ++ }
11223 ++ return NULL;
11224 ++}
11225 ++
11226 ++/* We already hold the net-namespace MPTCP-lock */
11227 ++static void add_pm_event(struct net *net, const struct mptcp_addr_event *event)
11228 ++{
11229 ++ struct mptcp_addr_event *eventq = lookup_similar_event(net, event);
11230 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11231 ++
11232 ++ if (eventq) {
11233 ++ switch (event->code) {
11234 ++ case MPTCP_EVENT_DEL:
11235 ++ mptcp_debug("%s del old_code %u\n", __func__, eventq->code);
11236 ++ list_del(&eventq->list);
11237 ++ kfree(eventq);
11238 ++ break;
11239 ++ case MPTCP_EVENT_ADD:
11240 ++ mptcp_debug("%s add old_code %u\n", __func__, eventq->code);
11241 ++ eventq->low_prio = event->low_prio;
11242 ++ eventq->code = MPTCP_EVENT_ADD;
11243 ++ return;
11244 ++ case MPTCP_EVENT_MOD:
11245 ++ mptcp_debug("%s mod old_code %u\n", __func__, eventq->code);
11246 ++ eventq->low_prio = event->low_prio;
11247 ++ eventq->code = MPTCP_EVENT_MOD;
11248 ++ return;
11249 ++ }
11250 ++ }
11251 ++
11252 ++ /* OK, we have to add the new address to the wait queue */
11253 ++ eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC);
11254 ++ if (!eventq)
11255 ++ return;
11256 ++
11257 ++ list_add_tail(&eventq->list, &fm_ns->events);
11258 ++
11259 ++ /* Schedule the address-worker if it is not already pending */
11260 ++ if (!delayed_work_pending(&fm_ns->address_worker))
11261 ++ queue_delayed_work(mptcp_wq, &fm_ns->address_worker,
11262 ++ msecs_to_jiffies(500));
11263 ++}
11264 ++
11265 ++static void addr4_event_handler(const struct in_ifaddr *ifa, unsigned long event,
11266 ++ struct net *net)
11267 ++{
11268 ++ const struct net_device *netdev = ifa->ifa_dev->dev;
11269 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11270 ++ struct mptcp_addr_event mpevent;
11271 ++
11272 ++ if (ifa->ifa_scope > RT_SCOPE_LINK ||
11273 ++ ipv4_is_loopback(ifa->ifa_local))
11274 ++ return;
11275 ++
11276 ++ spin_lock_bh(&fm_ns->local_lock);
11277 ++
11278 ++ mpevent.family = AF_INET;
11279 ++ mpevent.addr.in.s_addr = ifa->ifa_local;
11280 ++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
11281 ++
11282 ++ if (event == NETDEV_DOWN || !netif_running(netdev) ||
11283 ++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
11284 ++ mpevent.code = MPTCP_EVENT_DEL;
11285 ++ else if (event == NETDEV_UP)
11286 ++ mpevent.code = MPTCP_EVENT_ADD;
11287 ++ else if (event == NETDEV_CHANGE)
11288 ++ mpevent.code = MPTCP_EVENT_MOD;
11289 ++
11290 ++ mptcp_debug("%s created event for %pI4, code %u prio %u\n", __func__,
11291 ++ &ifa->ifa_local, mpevent.code, mpevent.low_prio);
11292 ++ add_pm_event(net, &mpevent);
11293 ++
11294 ++ spin_unlock_bh(&fm_ns->local_lock);
11295 ++ return;
11296 ++}
11297 ++
11298 ++/* React on IPv4-addr add/rem-events */
11299 ++static int mptcp_pm_inetaddr_event(struct notifier_block *this,
11300 ++ unsigned long event, void *ptr)
11301 ++{
11302 ++ const struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
11303 ++ struct net *net = dev_net(ifa->ifa_dev->dev);
11304 ++
11305 ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
11306 ++ event == NETDEV_CHANGE))
11307 ++ return NOTIFY_DONE;
11308 ++
11309 ++ addr4_event_handler(ifa, event, net);
11310 ++
11311 ++ return NOTIFY_DONE;
11312 ++}
11313 ++
11314 ++static struct notifier_block mptcp_pm_inetaddr_notifier = {
11315 ++ .notifier_call = mptcp_pm_inetaddr_event,
11316 ++};
11317 ++
11318 ++#if IS_ENABLED(CONFIG_IPV6)
11319 ++
11320 ++/* IPV6-related address/interface watchers */
11321 ++struct mptcp_dad_data {
11322 ++ struct timer_list timer;
11323 ++ struct inet6_ifaddr *ifa;
11324 ++};
11325 ++
11326 ++static void dad_callback(unsigned long arg);
11327 ++static int inet6_addr_event(struct notifier_block *this,
11328 ++ unsigned long event, void *ptr);
11329 ++
11330 ++static int ipv6_is_in_dad_state(const struct inet6_ifaddr *ifa)
11331 ++{
11332 ++ return (ifa->flags & IFA_F_TENTATIVE) &&
11333 ++ ifa->state == INET6_IFADDR_STATE_DAD;
11334 ++}
11335 ++
11336 ++static void dad_init_timer(struct mptcp_dad_data *data,
11337 ++ struct inet6_ifaddr *ifa)
11338 ++{
11339 ++ data->ifa = ifa;
11340 ++ data->timer.data = (unsigned long)data;
11341 ++ data->timer.function = dad_callback;
11342 ++ if (ifa->idev->cnf.rtr_solicit_delay)
11343 ++ data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay;
11344 ++ else
11345 ++ data->timer.expires = jiffies + (HZ/10);
11346 ++}
11347 ++
11348 ++static void dad_callback(unsigned long arg)
11349 ++{
11350 ++ struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg;
11351 ++
11352 ++ if (ipv6_is_in_dad_state(data->ifa)) {
11353 ++ dad_init_timer(data, data->ifa);
11354 ++ add_timer(&data->timer);
11355 ++ } else {
11356 ++ inet6_addr_event(NULL, NETDEV_UP, data->ifa);
11357 ++ in6_ifa_put(data->ifa);
11358 ++ kfree(data);
11359 ++ }
11360 ++}
11361 ++
11362 ++static inline void dad_setup_timer(struct inet6_ifaddr *ifa)
11363 ++{
11364 ++ struct mptcp_dad_data *data;
11365 ++
11366 ++ data = kmalloc(sizeof(*data), GFP_ATOMIC);
11367 ++
11368 ++ if (!data)
11369 ++ return;
11370 ++
11371 ++ init_timer(&data->timer);
11372 ++ dad_init_timer(data, ifa);
11373 ++ add_timer(&data->timer);
11374 ++ in6_ifa_hold(ifa);
11375 ++}
11376 ++
11377 ++static void addr6_event_handler(const struct inet6_ifaddr *ifa, unsigned long event,
11378 ++ struct net *net)
11379 ++{
11380 ++ const struct net_device *netdev = ifa->idev->dev;
11381 ++ int addr_type = ipv6_addr_type(&ifa->addr);
11382 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11383 ++ struct mptcp_addr_event mpevent;
11384 ++
11385 ++ if (ifa->scope > RT_SCOPE_LINK ||
11386 ++ addr_type == IPV6_ADDR_ANY ||
11387 ++ (addr_type & IPV6_ADDR_LOOPBACK) ||
11388 ++ (addr_type & IPV6_ADDR_LINKLOCAL))
11389 ++ return;
11390 ++
11391 ++ spin_lock_bh(&fm_ns->local_lock);
11392 ++
11393 ++ mpevent.family = AF_INET6;
11394 ++ mpevent.addr.in6 = ifa->addr;
11395 ++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
11396 ++
11397 ++ if (event == NETDEV_DOWN || !netif_running(netdev) ||
11398 ++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
11399 ++ mpevent.code = MPTCP_EVENT_DEL;
11400 ++ else if (event == NETDEV_UP)
11401 ++ mpevent.code = MPTCP_EVENT_ADD;
11402 ++ else if (event == NETDEV_CHANGE)
11403 ++ mpevent.code = MPTCP_EVENT_MOD;
11404 ++
11405 ++ mptcp_debug("%s created event for %pI6, code %u prio %u\n", __func__,
11406 ++ &ifa->addr, mpevent.code, mpevent.low_prio);
11407 ++ add_pm_event(net, &mpevent);
11408 ++
11409 ++ spin_unlock_bh(&fm_ns->local_lock);
11410 ++ return;
11411 ++}
11412 ++
11413 ++/* React on IPv6-addr add/rem-events */
11414 ++static int inet6_addr_event(struct notifier_block *this, unsigned long event,
11415 ++ void *ptr)
11416 ++{
11417 ++ struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr;
11418 ++ struct net *net = dev_net(ifa6->idev->dev);
11419 ++
11420 ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
11421 ++ event == NETDEV_CHANGE))
11422 ++ return NOTIFY_DONE;
11423 ++
11424 ++ if (ipv6_is_in_dad_state(ifa6))
11425 ++ dad_setup_timer(ifa6);
11426 ++ else
11427 ++ addr6_event_handler(ifa6, event, net);
11428 ++
11429 ++ return NOTIFY_DONE;
11430 ++}
11431 ++
11432 ++static struct notifier_block inet6_addr_notifier = {
11433 ++ .notifier_call = inet6_addr_event,
11434 ++};
11435 ++
11436 ++#endif
11437 ++
11438 ++/* React on ifup/down-events */
11439 ++static int netdev_event(struct notifier_block *this, unsigned long event,
11440 ++ void *ptr)
11441 ++{
11442 ++ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
11443 ++ struct in_device *in_dev;
11444 ++#if IS_ENABLED(CONFIG_IPV6)
11445 ++ struct inet6_dev *in6_dev;
11446 ++#endif
11447 ++
11448 ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
11449 ++ event == NETDEV_CHANGE))
11450 ++ return NOTIFY_DONE;
11451 ++
11452 ++ rcu_read_lock();
11453 ++ in_dev = __in_dev_get_rtnl(dev);
11454 ++
11455 ++ if (in_dev) {
11456 ++ for_ifa(in_dev) {
11457 ++ mptcp_pm_inetaddr_event(NULL, event, ifa);
11458 ++ } endfor_ifa(in_dev);
11459 ++ }
11460 ++
11461 ++#if IS_ENABLED(CONFIG_IPV6)
11462 ++ in6_dev = __in6_dev_get(dev);
11463 ++
11464 ++ if (in6_dev) {
11465 ++ struct inet6_ifaddr *ifa6;
11466 ++ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list)
11467 ++ inet6_addr_event(NULL, event, ifa6);
11468 ++ }
11469 ++#endif
11470 ++
11471 ++ rcu_read_unlock();
11472 ++ return NOTIFY_DONE;
11473 ++}
11474 ++
11475 ++static struct notifier_block mptcp_pm_netdev_notifier = {
11476 ++ .notifier_call = netdev_event,
11477 ++};
11478 ++
11479 ++static void full_mesh_add_raddr(struct mptcp_cb *mpcb,
11480 ++ const union inet_addr *addr,
11481 ++ sa_family_t family, __be16 port, u8 id)
11482 ++{
11483 ++ if (family == AF_INET)
11484 ++ mptcp_addv4_raddr(mpcb, &addr->in, port, id);
11485 ++ else
11486 ++ mptcp_addv6_raddr(mpcb, &addr->in6, port, id);
11487 ++}
11488 ++
11489 ++static void full_mesh_new_session(const struct sock *meta_sk)
11490 ++{
11491 ++ struct mptcp_loc_addr *mptcp_local;
11492 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11493 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11494 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
11495 ++ int i, index;
11496 ++ union inet_addr saddr, daddr;
11497 ++ sa_family_t family;
11498 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11499 ++
11500 ++ /* Init local variables necessary for the rest */
11501 ++ if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk)) {
11502 ++ saddr.ip = inet_sk(meta_sk)->inet_saddr;
11503 ++ daddr.ip = inet_sk(meta_sk)->inet_daddr;
11504 ++ family = AF_INET;
11505 ++#if IS_ENABLED(CONFIG_IPV6)
11506 ++ } else {
11507 ++ saddr.in6 = inet6_sk(meta_sk)->saddr;
11508 ++ daddr.in6 = meta_sk->sk_v6_daddr;
11509 ++ family = AF_INET6;
11510 ++#endif
11511 ++ }
11512 ++
11513 ++ rcu_read_lock();
11514 ++ mptcp_local = rcu_dereference(fm_ns->local);
11515 ++
11516 ++ index = mptcp_find_address(mptcp_local, family, &saddr);
11517 ++ if (index < 0)
11518 ++ goto fallback;
11519 ++
11520 ++ full_mesh_add_raddr(mpcb, &daddr, family, 0, 0);
11521 ++ mptcp_set_init_addr_bit(mpcb, &daddr, family, index);
11522 ++
11523 ++ /* Initialize workqueue-struct */
11524 ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
11525 ++ INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker);
11526 ++ fmp->mpcb = mpcb;
11527 ++
11528 ++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
11529 ++ goto skip_ipv4;
11530 ++
11531 ++ /* Look for the address among the local addresses */
11532 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11533 ++ __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr;
11534 ++
11535 ++ /* We do not need to announce the initial subflow's address again */
11536 ++ if (family == AF_INET && saddr.ip == ifa_address)
11537 ++ continue;
11538 ++
11539 ++ fmp->add_addr++;
11540 ++ mpcb->addr_signal = 1;
11541 ++ }
11542 ++
11543 ++skip_ipv4:
11544 ++#if IS_ENABLED(CONFIG_IPV6)
11545 ++ /* skip IPv6 addresses if meta-socket is IPv4 */
11546 ++ if (meta_v4)
11547 ++ goto skip_ipv6;
11548 ++
11549 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11550 ++ const struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr;
11551 ++
11552 ++ /* We do not need to announce the initial subflow's address again */
11553 ++ if (family == AF_INET6 && ipv6_addr_equal(&saddr.in6, ifa6))
11554 ++ continue;
11555 ++
11556 ++ fmp->add_addr++;
11557 ++ mpcb->addr_signal = 1;
11558 ++ }
11559 ++
11560 ++skip_ipv6:
11561 ++#endif
11562 ++
11563 ++ rcu_read_unlock();
11564 ++
11565 ++ if (family == AF_INET)
11566 ++ fmp->announced_addrs_v4 |= (1 << index);
11567 ++ else
11568 ++ fmp->announced_addrs_v6 |= (1 << index);
11569 ++
11570 ++ for (i = fmp->add_addr; i && fmp->add_addr; i--)
11571 ++ tcp_send_ack(mpcb->master_sk);
11572 ++
11573 ++ return;
11574 ++
11575 ++fallback:
11576 ++ rcu_read_unlock();
11577 ++ mptcp_fallback_default(mpcb);
11578 ++ return;
11579 ++}
11580 ++
11581 ++static void full_mesh_create_subflows(struct sock *meta_sk)
11582 ++{
11583 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11584 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11585 ++
11586 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
11587 ++ mpcb->send_infinite_mapping ||
11588 ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
11589 ++ return;
11590 ++
11591 ++ if (mpcb->master_sk &&
11592 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
11593 ++ return;
11594 ++
11595 ++ if (!work_pending(&fmp->subflow_work)) {
11596 ++ sock_hold(meta_sk);
11597 ++ queue_work(mptcp_wq, &fmp->subflow_work);
11598 ++ }
11599 ++}
11600 ++
11601 ++/* Called upon release_sock, if the socket was owned by the user during
11602 ++ * a path-management event.
11603 ++ */
11604 ++static void full_mesh_release_sock(struct sock *meta_sk)
11605 ++{
11606 ++ struct mptcp_loc_addr *mptcp_local;
11607 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11608 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11609 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
11610 ++ struct sock *sk, *tmpsk;
11611 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11612 ++ int i;
11613 ++
11614 ++ rcu_read_lock();
11615 ++ mptcp_local = rcu_dereference(fm_ns->local);
11616 ++
11617 ++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
11618 ++ goto skip_ipv4;
11619 ++
11620 ++ /* First, detect modifications or additions */
11621 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11622 ++ struct in_addr ifa = mptcp_local->locaddr4[i].addr;
11623 ++ bool found = false;
11624 ++
11625 ++ mptcp_for_each_sk(mpcb, sk) {
11626 ++ struct tcp_sock *tp = tcp_sk(sk);
11627 ++
11628 ++ if (sk->sk_family == AF_INET6 &&
11629 ++ !mptcp_v6_is_v4_mapped(sk))
11630 ++ continue;
11631 ++
11632 ++ if (inet_sk(sk)->inet_saddr != ifa.s_addr)
11633 ++ continue;
11634 ++
11635 ++ found = true;
11636 ++
11637 ++ if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) {
11638 ++ tp->mptcp->send_mp_prio = 1;
11639 ++ tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio;
11640 ++
11641 ++ tcp_send_ack(sk);
11642 ++ }
11643 ++ }
11644 ++
11645 ++ if (!found) {
11646 ++ fmp->add_addr++;
11647 ++ mpcb->addr_signal = 1;
11648 ++
11649 ++ sk = mptcp_select_ack_sock(meta_sk);
11650 ++ if (sk)
11651 ++ tcp_send_ack(sk);
11652 ++ full_mesh_create_subflows(meta_sk);
11653 ++ }
11654 ++ }
11655 ++
11656 ++skip_ipv4:
11657 ++#if IS_ENABLED(CONFIG_IPV6)
11658 ++ /* skip IPv6 addresses if meta-socket is IPv4 */
11659 ++ if (meta_v4)
11660 ++ goto removal;
11661 ++
11662 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11663 ++ struct in6_addr ifa = mptcp_local->locaddr6[i].addr;
11664 ++ bool found = false;
11665 ++
11666 ++ mptcp_for_each_sk(mpcb, sk) {
11667 ++ struct tcp_sock *tp = tcp_sk(sk);
11668 ++
11669 ++ if (sk->sk_family == AF_INET ||
11670 ++ mptcp_v6_is_v4_mapped(sk))
11671 ++ continue;
11672 ++
11673 ++ if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa))
11674 ++ continue;
11675 ++
11676 ++ found = true;
11677 ++
11678 ++ if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) {
11679 ++ tp->mptcp->send_mp_prio = 1;
11680 ++ tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio;
11681 ++
11682 ++ tcp_send_ack(sk);
11683 ++ }
11684 ++ }
11685 ++
11686 ++ if (!found) {
11687 ++ fmp->add_addr++;
11688 ++ mpcb->addr_signal = 1;
11689 ++
11690 ++ sk = mptcp_select_ack_sock(meta_sk);
11691 ++ if (sk)
11692 ++ tcp_send_ack(sk);
11693 ++ full_mesh_create_subflows(meta_sk);
11694 ++ }
11695 ++ }
11696 ++
11697 ++removal:
11698 ++#endif
11699 ++
11700 ++ /* Now, detect address-removals */
11701 ++ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
11702 ++ bool shall_remove = true;
11703 ++
11704 ++ if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
11705 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11706 ++ if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) {
11707 ++ shall_remove = false;
11708 ++ break;
11709 ++ }
11710 ++ }
11711 ++ } else {
11712 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11713 ++ if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) {
11714 ++ shall_remove = false;
11715 ++ break;
11716 ++ }
11717 ++ }
11718 ++ }
11719 ++
11720 ++ if (shall_remove) {
11721 ++ /* Reinject, so that pf = 1 and so we
11722 ++ * won't select this one as the
11723 ++ * ack-sock.
11724 ++ */
11725 ++ mptcp_reinject_data(sk, 0);
11726 ++
11727 ++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id,
11728 ++ meta_sk);
11729 ++
11730 ++ mptcp_sub_force_close(sk);
11731 ++ }
11732 ++ }
11733 ++
11734 ++ /* Just call it optimistically. It actually cannot do any harm */
11735 ++ update_addr_bitfields(meta_sk, mptcp_local);
11736 ++
11737 ++ rcu_read_unlock();
11738 ++}
11739 ++
11740 ++static int full_mesh_get_local_id(sa_family_t family, union inet_addr *addr,
11741 ++ struct net *net, bool *low_prio)
11742 ++{
11743 ++ struct mptcp_loc_addr *mptcp_local;
11744 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11745 ++ int index, id = -1;
11746 ++
11747 ++ /* Handle the backup-flows */
11748 ++ rcu_read_lock();
11749 ++ mptcp_local = rcu_dereference(fm_ns->local);
11750 ++
11751 ++ index = mptcp_find_address(mptcp_local, family, addr);
11752 ++
11753 ++ if (index != -1) {
11754 ++ if (family == AF_INET) {
11755 ++ id = mptcp_local->locaddr4[index].loc4_id;
11756 ++ *low_prio = mptcp_local->locaddr4[index].low_prio;
11757 ++ } else {
11758 ++ id = mptcp_local->locaddr6[index].loc6_id;
11759 ++ *low_prio = mptcp_local->locaddr6[index].low_prio;
11760 ++ }
11761 ++ }
11762 ++
11763 ++
11764 ++ rcu_read_unlock();
11765 ++
11766 ++ return id;
11767 ++}
11768 ++
11769 ++static void full_mesh_addr_signal(struct sock *sk, unsigned *size,
11770 ++ struct tcp_out_options *opts,
11771 ++ struct sk_buff *skb)
11772 ++{
11773 ++ const struct tcp_sock *tp = tcp_sk(sk);
11774 ++ struct mptcp_cb *mpcb = tp->mpcb;
11775 ++ struct sock *meta_sk = mpcb->meta_sk;
11776 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11777 ++ struct mptcp_loc_addr *mptcp_local;
11778 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk));
11779 ++ int remove_addr_len;
11780 ++ u8 unannouncedv4 = 0, unannouncedv6 = 0;
11781 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11782 ++
11783 ++ mpcb->addr_signal = 0;
11784 ++
11785 ++ if (likely(!fmp->add_addr))
11786 ++ goto remove_addr;
11787 ++
11788 ++ rcu_read_lock();
11789 ++ mptcp_local = rcu_dereference(fm_ns->local);
11790 ++
11791 ++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
11792 ++ goto skip_ipv4;
11793 ++
11794 ++ /* IPv4 */
11795 ++ unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits;
11796 ++ if (unannouncedv4 &&
11797 ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) {
11798 ++ int ind = mptcp_find_free_index(~unannouncedv4);
11799 ++
11800 ++ opts->options |= OPTION_MPTCP;
11801 ++ opts->mptcp_options |= OPTION_ADD_ADDR;
11802 ++ opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id;
11803 ++ opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr;
11804 ++ opts->add_addr_v4 = 1;
11805 ++
11806 ++ if (skb) {
11807 ++ fmp->announced_addrs_v4 |= (1 << ind);
11808 ++ fmp->add_addr--;
11809 ++ }
11810 ++ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN;
11811 ++ }
11812 ++
11813 ++ if (meta_v4)
11814 ++ goto skip_ipv6;
11815 ++
11816 ++skip_ipv4:
11817 ++ /* IPv6 */
11818 ++ unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits;
11819 ++ if (unannouncedv6 &&
11820 ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) {
11821 ++ int ind = mptcp_find_free_index(~unannouncedv6);
11822 ++
11823 ++ opts->options |= OPTION_MPTCP;
11824 ++ opts->mptcp_options |= OPTION_ADD_ADDR;
11825 ++ opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id;
11826 ++ opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr;
11827 ++ opts->add_addr_v6 = 1;
11828 ++
11829 ++ if (skb) {
11830 ++ fmp->announced_addrs_v6 |= (1 << ind);
11831 ++ fmp->add_addr--;
11832 ++ }
11833 ++ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN;
11834 ++ }
11835 ++
11836 ++skip_ipv6:
11837 ++ rcu_read_unlock();
11838 ++
11839 ++ if (!unannouncedv4 && !unannouncedv6 && skb)
11840 ++ fmp->add_addr--;
11841 ++
11842 ++remove_addr:
11843 ++ if (likely(!fmp->remove_addrs))
11844 ++ goto exit;
11845 ++
11846 ++ remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs);
11847 ++ if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len)
11848 ++ goto exit;
11849 ++
11850 ++ opts->options |= OPTION_MPTCP;
11851 ++ opts->mptcp_options |= OPTION_REMOVE_ADDR;
11852 ++ opts->remove_addrs = fmp->remove_addrs;
11853 ++ *size += remove_addr_len;
11854 ++ if (skb)
11855 ++ fmp->remove_addrs = 0;
11856 ++
11857 ++exit:
11858 ++ mpcb->addr_signal = !!(fmp->add_addr || fmp->remove_addrs);
11859 ++}
11860 ++
11861 ++static void full_mesh_rem_raddr(struct mptcp_cb *mpcb, u8 rem_id)
11862 ++{
11863 ++ mptcp_v4_rem_raddress(mpcb, rem_id);
11864 ++ mptcp_v6_rem_raddress(mpcb, rem_id);
11865 ++}
11866 ++
11867 ++/* Output /proc/net/mptcp_fullmesh */
11868 ++static int mptcp_fm_seq_show(struct seq_file *seq, void *v)
11869 ++{
11870 ++ const struct net *net = seq->private;
11871 ++ struct mptcp_loc_addr *mptcp_local;
11872 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11873 ++ int i;
11874 ++
11875 ++ seq_printf(seq, "Index, Address-ID, Backup, IP-address\n");
11876 ++
11877 ++ rcu_read_lock_bh();
11878 ++ mptcp_local = rcu_dereference(fm_ns->local);
11879 ++
11880 ++ seq_printf(seq, "IPv4, next v4-index: %u\n", mptcp_local->next_v4_index);
11881 ++
11882 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11883 ++ struct mptcp_loc4 *loc4 = &mptcp_local->locaddr4[i];
11884 ++
11885 ++ seq_printf(seq, "%u, %u, %u, %pI4\n", i, loc4->loc4_id,
11886 ++ loc4->low_prio, &loc4->addr);
11887 ++ }
11888 ++
11889 ++ seq_printf(seq, "IPv6, next v6-index: %u\n", mptcp_local->next_v6_index);
11890 ++
11891 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11892 ++ struct mptcp_loc6 *loc6 = &mptcp_local->locaddr6[i];
11893 ++
11894 ++ seq_printf(seq, "%u, %u, %u, %pI6\n", i, loc6->loc6_id,
11895 ++ loc6->low_prio, &loc6->addr);
11896 ++ }
11897 ++ rcu_read_unlock_bh();
11898 ++
11899 ++ return 0;
11900 ++}
11901 ++
11902 ++static int mptcp_fm_seq_open(struct inode *inode, struct file *file)
11903 ++{
11904 ++ return single_open_net(inode, file, mptcp_fm_seq_show);
11905 ++}
11906 ++
11907 ++static const struct file_operations mptcp_fm_seq_fops = {
11908 ++ .owner = THIS_MODULE,
11909 ++ .open = mptcp_fm_seq_open,
11910 ++ .read = seq_read,
11911 ++ .llseek = seq_lseek,
11912 ++ .release = single_release_net,
11913 ++};
11914 ++
11915 ++static int mptcp_fm_init_net(struct net *net)
11916 ++{
11917 ++ struct mptcp_loc_addr *mptcp_local;
11918 ++ struct mptcp_fm_ns *fm_ns;
11919 ++ int err = 0;
11920 ++
11921 ++ fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL);
11922 ++ if (!fm_ns)
11923 ++ return -ENOBUFS;
11924 ++
11925 ++ mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL);
11926 ++ if (!mptcp_local) {
11927 ++ err = -ENOBUFS;
11928 ++ goto err_mptcp_local;
11929 ++ }
11930 ++
11931 ++ if (!proc_create("mptcp_fullmesh", S_IRUGO, net->proc_net,
11932 ++ &mptcp_fm_seq_fops)) {
11933 ++ err = -ENOMEM;
11934 ++ goto err_seq_fops;
11935 ++ }
11936 ++
11937 ++ mptcp_local->next_v4_index = 1;
11938 ++
11939 ++ rcu_assign_pointer(fm_ns->local, mptcp_local);
11940 ++ INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker);
11941 ++ INIT_LIST_HEAD(&fm_ns->events);
11942 ++ spin_lock_init(&fm_ns->local_lock);
11943 ++ fm_ns->net = net;
11944 ++ net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns;
11945 ++
11946 ++ return 0;
11947 ++err_seq_fops:
11948 ++ kfree(mptcp_local);
11949 ++err_mptcp_local:
11950 ++ kfree(fm_ns);
11951 ++ return err;
11952 ++}
11953 ++
11954 ++static void mptcp_fm_exit_net(struct net *net)
11955 ++{
11956 ++ struct mptcp_addr_event *eventq, *tmp;
11957 ++ struct mptcp_fm_ns *fm_ns;
11958 ++ struct mptcp_loc_addr *mptcp_local;
11959 ++
11960 ++ fm_ns = fm_get_ns(net);
11961 ++ cancel_delayed_work_sync(&fm_ns->address_worker);
11962 ++
11963 ++ rcu_read_lock_bh();
11964 ++
11965 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
11966 ++ kfree(mptcp_local);
11967 ++
11968 ++ spin_lock(&fm_ns->local_lock);
11969 ++ list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) {
11970 ++ list_del(&eventq->list);
11971 ++ kfree(eventq);
11972 ++ }
11973 ++ spin_unlock(&fm_ns->local_lock);
11974 ++
11975 ++ rcu_read_unlock_bh();
11976 ++
11977 ++ remove_proc_entry("mptcp_fullmesh", net->proc_net);
11978 ++
11979 ++ kfree(fm_ns);
11980 ++}
11981 ++
11982 ++static struct pernet_operations full_mesh_net_ops = {
11983 ++ .init = mptcp_fm_init_net,
11984 ++ .exit = mptcp_fm_exit_net,
11985 ++};
11986 ++
11987 ++static struct mptcp_pm_ops full_mesh __read_mostly = {
11988 ++ .new_session = full_mesh_new_session,
11989 ++ .release_sock = full_mesh_release_sock,
11990 ++ .fully_established = full_mesh_create_subflows,
11991 ++ .new_remote_address = full_mesh_create_subflows,
11992 ++ .get_local_id = full_mesh_get_local_id,
11993 ++ .addr_signal = full_mesh_addr_signal,
11994 ++ .add_raddr = full_mesh_add_raddr,
11995 ++ .rem_raddr = full_mesh_rem_raddr,
11996 ++ .name = "fullmesh",
11997 ++ .owner = THIS_MODULE,
11998 ++};
11999 ++
12000 ++/* General initialization of MPTCP_PM */
12001 ++static int __init full_mesh_register(void)
12002 ++{
12003 ++ int ret;
12004 ++
12005 ++ BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE);
12006 ++
12007 ++ ret = register_pernet_subsys(&full_mesh_net_ops);
12008 ++ if (ret)
12009 ++ goto out;
12010 ++
12011 ++ ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
12012 ++ if (ret)
12013 ++ goto err_reg_inetaddr;
12014 ++ ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier);
12015 ++ if (ret)
12016 ++ goto err_reg_netdev;
12017 ++
12018 ++#if IS_ENABLED(CONFIG_IPV6)
12019 ++ ret = register_inet6addr_notifier(&inet6_addr_notifier);
12020 ++ if (ret)
12021 ++ goto err_reg_inet6addr;
12022 ++#endif
12023 ++
12024 ++ ret = mptcp_register_path_manager(&full_mesh);
12025 ++ if (ret)
12026 ++ goto err_reg_pm;
12027 ++
12028 ++out:
12029 ++ return ret;
12030 ++
12031 ++
12032 ++err_reg_pm:
12033 ++#if IS_ENABLED(CONFIG_IPV6)
12034 ++ unregister_inet6addr_notifier(&inet6_addr_notifier);
12035 ++err_reg_inet6addr:
12036 ++#endif
12037 ++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
12038 ++err_reg_netdev:
12039 ++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
12040 ++err_reg_inetaddr:
12041 ++ unregister_pernet_subsys(&full_mesh_net_ops);
12042 ++ goto out;
12043 ++}
12044 ++
12045 ++static void full_mesh_unregister(void)
12046 ++{
12047 ++#if IS_ENABLED(CONFIG_IPV6)
12048 ++ unregister_inet6addr_notifier(&inet6_addr_notifier);
12049 ++#endif
12050 ++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
12051 ++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
12052 ++ unregister_pernet_subsys(&full_mesh_net_ops);
12053 ++ mptcp_unregister_path_manager(&full_mesh);
12054 ++}
12055 ++
12056 ++module_init(full_mesh_register);
12057 ++module_exit(full_mesh_unregister);
12058 ++
12059 ++MODULE_AUTHOR("Christoph Paasch");
12060 ++MODULE_LICENSE("GPL");
12061 ++MODULE_DESCRIPTION("Full-Mesh MPTCP");
12062 ++MODULE_VERSION("0.88");
12063 +diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c
12064 +new file mode 100644
12065 +index 000000000000..43704ccb639e
12066 +--- /dev/null
12067 ++++ b/net/mptcp/mptcp_input.c
12068 +@@ -0,0 +1,2405 @@
12069 ++/*
12070 ++ * MPTCP implementation - Sending side
12071 ++ *
12072 ++ * Initial Design & Implementation:
12073 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
12074 ++ *
12075 ++ * Current Maintainer & Author:
12076 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
12077 ++ *
12078 ++ * Additional authors:
12079 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
12080 ++ * Gregory Detal <gregory.detal@×××××××××.be>
12081 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
12082 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
12083 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
12084 ++ * Andreas Ripke <ripke@××××××.eu>
12085 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
12086 ++ * Octavian Purdila <octavian.purdila@×××××.com>
12087 ++ * John Ronan <jronan@××××.org>
12088 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
12089 ++ * Brandon Heller <brandonh@××××××××.edu>
12090 ++ *
12091 ++ *
12092 ++ * This program is free software; you can redistribute it and/or
12093 ++ * modify it under the terms of the GNU General Public License
12094 ++ * as published by the Free Software Foundation; either version
12095 ++ * 2 of the License, or (at your option) any later version.
12096 ++ */
12097 ++
12098 ++#include <asm/unaligned.h>
12099 ++
12100 ++#include <net/mptcp.h>
12101 ++#include <net/mptcp_v4.h>
12102 ++#include <net/mptcp_v6.h>
12103 ++
12104 ++#include <linux/kconfig.h>
12105 ++
12106 ++/* is seq1 < seq2 ? */
12107 ++static inline bool before64(const u64 seq1, const u64 seq2)
12108 ++{
12109 ++ return (s64)(seq1 - seq2) < 0;
12110 ++}
12111 ++
12112 ++/* is seq1 > seq2 ? */
12113 ++#define after64(seq1, seq2) before64(seq2, seq1)
12114 ++
12115 ++static inline void mptcp_become_fully_estab(struct sock *sk)
12116 ++{
12117 ++ tcp_sk(sk)->mptcp->fully_established = 1;
12118 ++
12119 ++ if (is_master_tp(tcp_sk(sk)) &&
12120 ++ tcp_sk(sk)->mpcb->pm_ops->fully_established)
12121 ++ tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk));
12122 ++}
12123 ++
12124 ++/* Similar to tcp_tso_acked without any memory accounting */
12125 ++static inline int mptcp_tso_acked_reinject(const struct sock *meta_sk,
12126 ++ struct sk_buff *skb)
12127 ++{
12128 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12129 ++ u32 packets_acked, len;
12130 ++
12131 ++ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una));
12132 ++
12133 ++ packets_acked = tcp_skb_pcount(skb);
12134 ++
12135 ++ if (skb_unclone(skb, GFP_ATOMIC))
12136 ++ return 0;
12137 ++
12138 ++ len = meta_tp->snd_una - TCP_SKB_CB(skb)->seq;
12139 ++ __pskb_trim_head(skb, len);
12140 ++
12141 ++ TCP_SKB_CB(skb)->seq += len;
12142 ++ skb->ip_summed = CHECKSUM_PARTIAL;
12143 ++ skb->truesize -= len;
12144 ++
12145 ++ /* Any change of skb->len requires recalculation of tso factor. */
12146 ++ if (tcp_skb_pcount(skb) > 1)
12147 ++ tcp_set_skb_tso_segs(meta_sk, skb, tcp_skb_mss(skb));
12148 ++ packets_acked -= tcp_skb_pcount(skb);
12149 ++
12150 ++ if (packets_acked) {
12151 ++ BUG_ON(tcp_skb_pcount(skb) == 0);
12152 ++ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
12153 ++ }
12154 ++
12155 ++ return packets_acked;
12156 ++}
12157 ++
12158 ++/**
12159 ++ * Cleans the meta-socket retransmission queue and the reinject-queue.
12160 ++ * @meta_sk must be the meta-socket.
12161 ++ */
12162 ++static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una)
12163 ++{
12164 ++ struct sk_buff *skb, *tmp;
12165 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12166 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
12167 ++ bool acked = false;
12168 ++ u32 acked_pcount;
12169 ++
12170 ++ while ((skb = tcp_write_queue_head(meta_sk)) &&
12171 ++ skb != tcp_send_head(meta_sk)) {
12172 ++ bool fully_acked = true;
12173 ++
12174 ++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
12175 ++ if (tcp_skb_pcount(skb) == 1 ||
12176 ++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
12177 ++ break;
12178 ++
12179 ++ acked_pcount = tcp_tso_acked(meta_sk, skb);
12180 ++ if (!acked_pcount)
12181 ++ break;
12182 ++
12183 ++ fully_acked = false;
12184 ++ } else {
12185 ++ acked_pcount = tcp_skb_pcount(skb);
12186 ++ }
12187 ++
12188 ++ acked = true;
12189 ++ meta_tp->packets_out -= acked_pcount;
12190 ++ meta_tp->retrans_stamp = 0;
12191 ++
12192 ++ if (!fully_acked)
12193 ++ break;
12194 ++
12195 ++ tcp_unlink_write_queue(skb, meta_sk);
12196 ++
12197 ++ if (mptcp_is_data_fin(skb)) {
12198 ++ struct sock *sk_it;
12199 ++
12200 ++ /* DATA_FIN has been acknowledged - now we can close
12201 ++ * the subflows
12202 ++ */
12203 ++ mptcp_for_each_sk(mpcb, sk_it) {
12204 ++ unsigned long delay = 0;
12205 ++
12206 ++ /* If we are the passive closer, don't trigger
12207 ++ * subflow-fin until the subflow has been finned
12208 ++ * by the peer - thus we add a delay.
12209 ++ */
12210 ++ if (mpcb->passive_close &&
12211 ++ sk_it->sk_state == TCP_ESTABLISHED)
12212 ++ delay = inet_csk(sk_it)->icsk_rto << 3;
12213 ++
12214 ++ mptcp_sub_close(sk_it, delay);
12215 ++ }
12216 ++ }
12217 ++ sk_wmem_free_skb(meta_sk, skb);
12218 ++ }
12219 ++ /* Remove acknowledged data from the reinject queue */
12220 ++ skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) {
12221 ++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
12222 ++ if (tcp_skb_pcount(skb) == 1 ||
12223 ++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
12224 ++ break;
12225 ++
12226 ++ mptcp_tso_acked_reinject(meta_sk, skb);
12227 ++ break;
12228 ++ }
12229 ++
12230 ++ __skb_unlink(skb, &mpcb->reinject_queue);
12231 ++ __kfree_skb(skb);
12232 ++ }
12233 ++
12234 ++ if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una)))
12235 ++ meta_tp->snd_up = meta_tp->snd_una;
12236 ++
12237 ++ if (acked) {
12238 ++ tcp_rearm_rto(meta_sk);
12239 ++ /* Normally this is done in tcp_try_undo_loss - but MPTCP
12240 ++ * does not call this function.
12241 ++ */
12242 ++ inet_csk(meta_sk)->icsk_retransmits = 0;
12243 ++ }
12244 ++}
12245 ++
12246 ++/* Inspired by tcp_rcv_state_process */
12247 ++static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk,
12248 ++ const struct sk_buff *skb, u32 data_seq,
12249 ++ u16 data_len)
12250 ++{
12251 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
12252 ++ const struct tcphdr *th = tcp_hdr(skb);
12253 ++
12254 ++ /* State-machine handling if FIN has been enqueued and it has
12255 ++ * been acked (snd_una == write_seq) - it's important that this
12256 ++ * here is after sk_wmem_free_skb because otherwise
12257 ++ * sk_forward_alloc is wrong upon inet_csk_destroy_sock()
12258 ++ */
12259 ++ switch (meta_sk->sk_state) {
12260 ++ case TCP_FIN_WAIT1: {
12261 ++ struct dst_entry *dst;
12262 ++ int tmo;
12263 ++
12264 ++ if (meta_tp->snd_una != meta_tp->write_seq)
12265 ++ break;
12266 ++
12267 ++ tcp_set_state(meta_sk, TCP_FIN_WAIT2);
12268 ++ meta_sk->sk_shutdown |= SEND_SHUTDOWN;
12269 ++
12270 ++ dst = __sk_dst_get(sk);
12271 ++ if (dst)
12272 ++ dst_confirm(dst);
12273 ++
12274 ++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
12275 ++ /* Wake up lingering close() */
12276 ++ meta_sk->sk_state_change(meta_sk);
12277 ++ break;
12278 ++ }
12279 ++
12280 ++ if (meta_tp->linger2 < 0 ||
12281 ++ (data_len &&
12282 ++ after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0),
12283 ++ meta_tp->rcv_nxt))) {
12284 ++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
12285 ++ tcp_done(meta_sk);
12286 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
12287 ++ return 1;
12288 ++ }
12289 ++
12290 ++ tmo = tcp_fin_time(meta_sk);
12291 ++ if (tmo > TCP_TIMEWAIT_LEN) {
12292 ++ inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN);
12293 ++ } else if (mptcp_is_data_fin2(skb, tp) || sock_owned_by_user(meta_sk)) {
12294 ++ /* Bad case. We could lose such FIN otherwise.
12295 ++ * It is not a big problem, but it looks confusing
12296 ++ * and not so rare event. We still can lose it now,
12297 ++ * if it spins in bh_lock_sock(), but it is really
12298 ++ * marginal case.
12299 ++ */
12300 ++ inet_csk_reset_keepalive_timer(meta_sk, tmo);
12301 ++ } else {
12302 ++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
12303 ++ }
12304 ++ break;
12305 ++ }
12306 ++ case TCP_CLOSING:
12307 ++ case TCP_LAST_ACK:
12308 ++ if (meta_tp->snd_una == meta_tp->write_seq) {
12309 ++ tcp_done(meta_sk);
12310 ++ return 1;
12311 ++ }
12312 ++ break;
12313 ++ }
12314 ++
12315 ++ /* step 7: process the segment text */
12316 ++ switch (meta_sk->sk_state) {
12317 ++ case TCP_FIN_WAIT1:
12318 ++ case TCP_FIN_WAIT2:
12319 ++ /* RFC 793 says to queue data in these states,
12320 ++ * RFC 1122 says we MUST send a reset.
12321 ++ * BSD 4.4 also does reset.
12322 ++ */
12323 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
12324 ++ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
12325 ++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
12326 ++ !mptcp_is_data_fin2(skb, tp)) {
12327 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
12328 ++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
12329 ++ tcp_reset(meta_sk);
12330 ++ return 1;
12331 ++ }
12332 ++ }
12333 ++ break;
12334 ++ }
12335 ++
12336 ++ return 0;
12337 ++}
12338 ++
12339 ++/**
12340 ++ * @return:
12341 ++ * i) 1: Everything's fine.
12342 ++ * ii) -1: A reset has been sent on the subflow - csum-failure
12343 ++ * iii) 0: csum-failure but no reset sent, because it's the last subflow.
12344 ++ * Last packet should not be destroyed by the caller because it has
12345 ++ * been done here.
12346 ++ */
12347 ++static int mptcp_verif_dss_csum(struct sock *sk)
12348 ++{
12349 ++ struct tcp_sock *tp = tcp_sk(sk);
12350 ++ struct sk_buff *tmp, *tmp1, *last = NULL;
12351 ++ __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */
12352 ++ int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0;
12353 ++ int iter = 0;
12354 ++
12355 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) {
12356 ++ unsigned int csum_len;
12357 ++
12358 ++ if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq))
12359 ++ /* Mapping ends in the middle of the packet -
12360 ++ * csum only these bytes
12361 ++ */
12362 ++ csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq;
12363 ++ else
12364 ++ csum_len = tmp->len;
12365 ++
12366 ++ offset = 0;
12367 ++ if (overflowed) {
12368 ++ char first_word[4];
12369 ++ first_word[0] = 0;
12370 ++ first_word[1] = 0;
12371 ++ first_word[2] = 0;
12372 ++ first_word[3] = *(tmp->data);
12373 ++ csum_tcp = csum_partial(first_word, 4, csum_tcp);
12374 ++ offset = 1;
12375 ++ csum_len--;
12376 ++ overflowed = 0;
12377 ++ }
12378 ++
12379 ++ csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp);
12380 ++
12381 ++ /* Was the length odd? Then we have to merge the next byte
12382 ++ * correctly (see above)
12383 ++ */
12384 ++ if (csum_len != (csum_len & (~1)))
12385 ++ overflowed = 1;
12386 ++
12387 ++ if (mptcp_is_data_seq(tmp) && !dss_csum_added) {
12388 ++ __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32));
12389 ++
12390 ++ /* If a 64-bit dss is present, we increase the offset
12391 ++ * by 4 bytes, as the high-order 32 bits will be added
12392 ++ * in the final csum_partial-call.
12393 ++ */
12394 ++ u32 offset = skb_transport_offset(tmp) +
12395 ++ TCP_SKB_CB(tmp)->dss_off;
12396 ++ if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET)
12397 ++ offset += 4;
12398 ++
12399 ++ csum_tcp = skb_checksum(tmp, offset,
12400 ++ MPTCP_SUB_LEN_SEQ_CSUM,
12401 ++ csum_tcp);
12402 ++
12403 ++ csum_tcp = csum_partial(&data_seq,
12404 ++ sizeof(data_seq), csum_tcp);
12405 ++
12406 ++ dss_csum_added = 1; /* Just do it once */
12407 ++ }
12408 ++ last = tmp;
12409 ++ iter++;
12410 ++
12411 ++ if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) &&
12412 ++ !before(TCP_SKB_CB(tmp1)->seq,
12413 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12414 ++ break;
12415 ++ }
12416 ++
12417 ++ /* Now, checksum must be 0 */
12418 ++ if (unlikely(csum_fold(csum_tcp))) {
12419 ++ pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n",
12420 ++ __func__, csum_fold(csum_tcp), TCP_SKB_CB(last)->seq,
12421 ++ dss_csum_added, overflowed, iter);
12422 ++
12423 ++ tp->mptcp->send_mp_fail = 1;
12424 ++
12425 ++ /* map_data_seq is the data-seq number of the
12426 ++ * mapping we are currently checking
12427 ++ */
12428 ++ tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq;
12429 ++
12430 ++ if (tp->mpcb->cnt_subflows > 1) {
12431 ++ mptcp_send_reset(sk);
12432 ++ ans = -1;
12433 ++ } else {
12434 ++ tp->mpcb->send_infinite_mapping = 1;
12435 ++
12436 ++ /* Need to purge the rcv-queue as it's no longer valid */
12437 ++ while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
12438 ++ tp->copied_seq = TCP_SKB_CB(tmp)->end_seq;
12439 ++ kfree_skb(tmp);
12440 ++ }
12441 ++
12442 ++ ans = 0;
12443 ++ }
12444 ++ }
12445 ++
12446 ++ return ans;
12447 ++}
12448 ++
12449 ++static inline void mptcp_prepare_skb(struct sk_buff *skb,
12450 ++ const struct sock *sk)
12451 ++{
12452 ++ const struct tcp_sock *tp = tcp_sk(sk);
12453 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
12454 ++ u32 inc = 0;
12455 ++
12456 ++ /* If skb is the end of this mapping (end is always at mapping-boundary
12457 ++ * thanks to the splitting/trimming), then we need to increase
12458 ++ * data-end-seq by 1 if this is a data-fin.
12459 ++ *
12460 ++ * We need to do -1 because end_seq includes the subflow-FIN.
12461 ++ */
12462 ++ if (tp->mptcp->map_data_fin &&
12463 ++ (tcb->end_seq - (tcp_hdr(skb)->fin ? 1 : 0)) ==
12464 ++ (tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
12465 ++ inc = 1;
12466 ++
12467 ++ /* We manually set the fin-flag if it is a data-fin. For easy
12468 ++ * processing in tcp_recvmsg.
12469 ++ */
12470 ++ tcp_hdr(skb)->fin = 1;
12471 ++ } else {
12472 ++ /* We may have a subflow-fin with data but without data-fin */
12473 ++ tcp_hdr(skb)->fin = 0;
12474 ++ }
12475 ++
12476 ++ /* Adapt data-seq's to the packet itself. We kinda transform the
12477 ++ * dss-mapping to a per-packet granularity. This is necessary to
12478 ++ * correctly handle overlapping mappings coming from different
12479 ++ * subflows. Otherwise it would be a complete mess.
12480 ++ */
12481 ++ tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq;
12482 ++ tcb->end_seq = tcb->seq + skb->len + inc;
12483 ++}
12484 ++
12485 ++/**
12486 ++ * @return: 1 if the segment has been eaten and can be suppressed,
12487 ++ * otherwise 0.
12488 ++ */
12489 ++static inline int mptcp_direct_copy(const struct sk_buff *skb,
12490 ++ struct sock *meta_sk)
12491 ++{
12492 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12493 ++ int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len);
12494 ++ int eaten = 0;
12495 ++
12496 ++ __set_current_state(TASK_RUNNING);
12497 ++
12498 ++ local_bh_enable();
12499 ++ if (!skb_copy_datagram_iovec(skb, 0, meta_tp->ucopy.iov, chunk)) {
12500 ++ meta_tp->ucopy.len -= chunk;
12501 ++ meta_tp->copied_seq += chunk;
12502 ++ eaten = (chunk == skb->len);
12503 ++ tcp_rcv_space_adjust(meta_sk);
12504 ++ }
12505 ++ local_bh_disable();
12506 ++ return eaten;
12507 ++}
12508 ++
12509 ++static inline void mptcp_reset_mapping(struct tcp_sock *tp)
12510 ++{
12511 ++ tp->mptcp->map_data_len = 0;
12512 ++ tp->mptcp->map_data_seq = 0;
12513 ++ tp->mptcp->map_subseq = 0;
12514 ++ tp->mptcp->map_data_fin = 0;
12515 ++ tp->mptcp->mapping_present = 0;
12516 ++}
12517 ++
12518 ++/* The DSS-mapping received on the sk only covers the second half of the skb
12519 ++ * (cut at seq). We trim the head from the skb.
12520 ++ * Data will be freed upon kfree().
12521 ++ *
12522 ++ * Inspired by tcp_trim_head().
12523 ++ */
12524 ++static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq)
12525 ++{
12526 ++ int len = seq - TCP_SKB_CB(skb)->seq;
12527 ++ u32 new_seq = TCP_SKB_CB(skb)->seq + len;
12528 ++
12529 ++ if (len < skb_headlen(skb))
12530 ++ __skb_pull(skb, len);
12531 ++ else
12532 ++ __pskb_trim_head(skb, len - skb_headlen(skb));
12533 ++
12534 ++ TCP_SKB_CB(skb)->seq = new_seq;
12535 ++
12536 ++ skb->truesize -= len;
12537 ++ atomic_sub(len, &sk->sk_rmem_alloc);
12538 ++ sk_mem_uncharge(sk, len);
12539 ++}
12540 ++
12541 ++/* The DSS-mapping received on the sk only covers the first half of the skb
12542 ++ * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue
12543 ++ * as further packets may resolve the mapping of the second half of data.
12544 ++ *
12545 ++ * Inspired by tcp_fragment().
12546 ++ */
12547 ++static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq)
12548 ++{
12549 ++ struct sk_buff *buff;
12550 ++ int nsize;
12551 ++ int nlen, len;
12552 ++
12553 ++ len = seq - TCP_SKB_CB(skb)->seq;
12554 ++ nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len;
12555 ++ if (nsize < 0)
12556 ++ nsize = 0;
12557 ++
12558 ++ /* Get a new skb... force flag on. */
12559 ++ buff = alloc_skb(nsize, GFP_ATOMIC);
12560 ++ if (buff == NULL)
12561 ++ return -ENOMEM;
12562 ++
12563 ++ skb_reserve(buff, tcp_sk(sk)->tcp_header_len);
12564 ++ skb_reset_transport_header(buff);
12565 ++
12566 ++ tcp_hdr(buff)->fin = tcp_hdr(skb)->fin;
12567 ++ tcp_hdr(skb)->fin = 0;
12568 ++
12569 ++ * We absolutely need to call skb_set_owner_r before refreshing the
12570 ++ * truesize of buff, otherwise the moved data will be accounted twice.
12571 ++ */
12572 ++ skb_set_owner_r(buff, sk);
12573 ++ nlen = skb->len - len - nsize;
12574 ++ buff->truesize += nlen;
12575 ++ skb->truesize -= nlen;
12576 ++
12577 ++ /* Correct the sequence numbers. */
12578 ++ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
12579 ++ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
12580 ++ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
12581 ++
12582 ++ skb_split(skb, buff, len);
12583 ++
12584 ++ __skb_queue_after(&sk->sk_receive_queue, skb, buff);
12585 ++
12586 ++ return 0;
12587 ++}
12588 ++
12589 ++/* @return: 0 everything is fine. Just continue processing
12590 ++ * 1 subflow is broken, stop everything
12591 ++ * -1 this packet was broken - continue with the next one.
12592 ++ */
12593 ++static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb)
12594 ++{
12595 ++ struct tcp_sock *tp = tcp_sk(sk);
12596 ++
12597 ++ /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */
12598 ++ if (!skb->len && tcp_hdr(skb)->fin && !mptcp_is_data_fin(skb) &&
12599 ++ !tp->mpcb->infinite_mapping_rcv) {
12600 ++ /* Remove a pure subflow-fin from the queue and increase
12601 ++ * copied_seq.
12602 ++ */
12603 ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
12604 ++ __skb_unlink(skb, &sk->sk_receive_queue);
12605 ++ __kfree_skb(skb);
12606 ++ return -1;
12607 ++ }
12608 ++
12609 ++ /* If we are not yet fully established and do not know the mapping for
12610 ++ * this segment, this path has to fall back to infinite-mapping or be torn down.
12611 ++ */
12612 ++ if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
12613 ++ !tp->mptcp->mapping_present && !tp->mpcb->infinite_mapping_rcv) {
12614 ++ pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n",
12615 ++ __func__, tp->mpcb->mptcp_loc_token,
12616 ++ tp->mptcp->path_index, __builtin_return_address(0),
12617 ++ TCP_SKB_CB(skb)->seq);
12618 ++
12619 ++ if (!is_master_tp(tp)) {
12620 ++ mptcp_send_reset(sk);
12621 ++ return 1;
12622 ++ }
12623 ++
12624 ++ tp->mpcb->infinite_mapping_snd = 1;
12625 ++ tp->mpcb->infinite_mapping_rcv = 1;
12626 ++ /* We do a seamless fallback and should not send an infinite mapping. */
12627 ++ tp->mpcb->send_infinite_mapping = 0;
12628 ++ tp->mptcp->fully_established = 1;
12629 ++ }
12630 ++
12631 ++ /* Receiver-side becomes fully established when a whole rcv-window has
12632 ++ * been received without the need to fall back due to the previous
12633 ++ * condition.
12634 ++ */
12635 ++ if (!tp->mptcp->fully_established) {
12636 ++ tp->mptcp->init_rcv_wnd -= skb->len;
12637 ++ if (tp->mptcp->init_rcv_wnd < 0)
12638 ++ mptcp_become_fully_estab(sk);
12639 ++ }
12640 ++
12641 ++ return 0;
12642 ++}
12643 ++
12644 ++/* @return: 0 everything is fine. Just continue processing
12645 ++ * 1 subflow is broken, stop everything
12646 ++ * -1 this packet was broken - continue with the next one.
12647 ++ */
12648 ++static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb)
12649 ++{
12650 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
12651 ++ struct mptcp_cb *mpcb = tp->mpcb;
12652 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
12653 ++ u32 *ptr;
12654 ++ u32 data_seq, sub_seq, data_len, tcp_end_seq;
12655 ++
12656 ++ /* If we are in infinite-mapping-mode, the subflow is guaranteed to be
12657 ++ * in-order at the data-level. Thus data-seq-numbers can be inferred
12658 ++ * from what is expected at the data-level.
12659 ++ */
12660 ++ if (mpcb->infinite_mapping_rcv) {
12661 ++ tp->mptcp->map_data_seq = mptcp_get_rcv_nxt_64(meta_tp);
12662 ++ tp->mptcp->map_subseq = tcb->seq;
12663 ++ tp->mptcp->map_data_len = skb->len;
12664 ++ tp->mptcp->map_data_fin = tcp_hdr(skb)->fin;
12665 ++ tp->mptcp->mapping_present = 1;
12666 ++ return 0;
12667 ++ }
12668 ++
12669 ++ /* No mapping here? Exit - it is either already set or still on its way */
12670 ++ if (!mptcp_is_data_seq(skb)) {
12671 ++ /* Too many packets without a mapping - this subflow is broken */
12672 ++ if (!tp->mptcp->mapping_present &&
12673 ++ tp->rcv_nxt - tp->copied_seq > 65536) {
12674 ++ mptcp_send_reset(sk);
12675 ++ return 1;
12676 ++ }
12677 ++
12678 ++ return 0;
12679 ++ }
12680 ++
12681 ++ ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb);
12682 ++ ptr++;
12683 ++ sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn;
12684 ++ ptr++;
12685 ++ data_len = get_unaligned_be16(ptr);
12686 ++
12687 ++ /* If it's an empty skb with DATA_FIN, sub_seq must get fixed.
12688 ++ * The draft sets it to 0, but we really would like to have the
12689 ++ * real value, to simplify the handling later in this
12690 ++ * function.
12691 ++ */
12692 ++ if (mptcp_is_data_fin(skb) && skb->len == 0)
12693 ++ sub_seq = TCP_SKB_CB(skb)->seq;
12694 ++
12695 ++ /* If there is already a mapping - we check if it maps with the current
12696 ++ * one. If not - we reset.
12697 ++ */
12698 ++ if (tp->mptcp->mapping_present &&
12699 ++ (data_seq != (u32)tp->mptcp->map_data_seq ||
12700 ++ sub_seq != tp->mptcp->map_subseq ||
12701 ++ data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin ||
12702 ++ mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) {
12703 ++ /* Mapping in packet is different from what we want */
12704 ++ pr_err("%s Mappings do not match!\n", __func__);
12705 ++ pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n",
12706 ++ __func__, data_seq, (u32)tp->mptcp->map_data_seq,
12707 ++ sub_seq, tp->mptcp->map_subseq, data_len,
12708 ++ tp->mptcp->map_data_len, mptcp_is_data_fin(skb),
12709 ++ tp->mptcp->map_data_fin);
12710 ++ mptcp_send_reset(sk);
12711 ++ return 1;
12712 ++ }
12713 ++
12714 ++ /* If the previous check was good, the current mapping is valid and we exit. */
12715 ++ if (tp->mptcp->mapping_present)
12716 ++ return 0;
12717 ++
12718 ++ /* Mapping not yet set on this subflow - we set it here! */
12719 ++
12720 ++ if (!data_len) {
12721 ++ mpcb->infinite_mapping_rcv = 1;
12722 ++ tp->mptcp->fully_established = 1;
12723 ++ * We need to repeat mp_fail's until the sender has fallen
12724 ++ * back to infinite-mapping - here we stop repeating it.
12725 ++ */
12726 ++ tp->mptcp->send_mp_fail = 0;
12727 ++
12728 ++ /* We have to fixup data_len - it must be the same as skb->len */
12729 ++ data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0);
12730 ++ sub_seq = tcb->seq;
12731 ++
12732 ++ /* TODO kill all other subflows than this one */
12733 ++ /* data_seq and so on are set correctly */
12734 ++
12735 ++ /* At this point, the meta-ofo-queue has to be emptied,
12736 ++ * as the following data is guaranteed to be in-order at
12737 ++ * the data and subflow-level
12738 ++ */
12739 ++ mptcp_purge_ofo_queue(meta_tp);
12740 ++ }
12741 ++
12742 ++ /* We are sending mp-fail's and thus are in fallback mode.
12743 ++ * Ignore packets which do not announce the fallback and still
12744 ++ * want to provide a mapping.
12745 ++ */
12746 ++ if (tp->mptcp->send_mp_fail) {
12747 ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
12748 ++ __skb_unlink(skb, &sk->sk_receive_queue);
12749 ++ __kfree_skb(skb);
12750 ++ return -1;
12751 ++ }
12752 ++
12753 ++ /* FIN increased the mapping-length by 1 */
12754 ++ if (mptcp_is_data_fin(skb))
12755 ++ data_len--;
12756 ++
12757 ++ /* Subflow-sequences of the packet must be
12758 ++ * (at least partially) part of the DSS-mapping's
12759 ++ * subflow-sequence-space.
12760 ++ *
12761 ++ * Basically the mapping is not valid, if either of the
12762 ++ * following conditions is true:
12763 ++ *
12764 ++ * 1. It's not a data_fin and
12765 ++ * MPTCP-sub_seq >= TCP-end_seq
12766 ++ *
12767 ++ * 2. It's a data_fin and TCP-end_seq > TCP-seq and
12768 ++ * MPTCP-sub_seq >= TCP-end_seq
12769 ++ *
12770 ++ * The previous two can be merged into:
12771 ++ * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq
12772 ++ * Because if it's not a data-fin, TCP-end_seq > TCP-seq
12773 ++ *
12774 ++ * 3. It's a data_fin and skb->len == 0 and
12775 ++ * MPTCP-sub_seq > TCP-end_seq
12776 ++ *
12777 ++ * 4. It's not a data_fin and TCP-end_seq > TCP-seq and
12778 ++ * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq
12779 ++ *
12780 ++ * 5. MPTCP-sub_seq is prior to what we already copied (copied_seq)
12781 ++ */
12782 ++
12783 ++ /* subflow-fin is not part of the mapping - ignore it here ! */
12784 ++ tcp_end_seq = tcb->end_seq - tcp_hdr(skb)->fin;
12785 ++ if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) ||
12786 ++ (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) ||
12787 ++ (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq)) ||
12788 ++ before(sub_seq, tp->copied_seq)) {
12789 ++ /* Subflow-sequences of the packet differ from what is in the
12790 ++ * packet's dss-mapping. The peer is misbehaving - reset
12791 ++ */
12792 ++ pr_err("%s Packet's mapping does not map to the DSS sub_seq %u "
12793 ++ "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u"
12794 ++ "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb),
12795 ++ skb->len, data_len, tp->copied_seq);
12796 ++ mptcp_send_reset(sk);
12797 ++ return 1;
12798 ++ }
12799 ++
12800 ++ /* Did the DSS have 64-bit seqnums? */
12801 ++ if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
12802 ++ /* Wrapped around? */
12803 ++ if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
12804 ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);
12805 ++ } else {
12806 ++ /* Else, access the default high-order bits */
12807 ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq);
12808 ++ }
12809 ++ } else {
12810 ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq);
12811 ++
12812 ++ if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) {
12813 ++ /* We make sure that the data_seq is invalid.
12814 ++ * It will be dropped later.
12815 ++ */
12816 ++ tp->mptcp->map_data_seq += 0xFFFFFFFF;
12817 ++ tp->mptcp->map_data_seq += 0xFFFFFFFF;
12818 ++ }
12819 ++ }
12820 ++
12821 ++ tp->mptcp->map_data_len = data_len;
12822 ++ tp->mptcp->map_subseq = sub_seq;
12823 ++ tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
12824 ++ tp->mptcp->mapping_present = 1;
12825 ++
12826 ++ return 0;
12827 ++}
12828 ++
12829 ++/* Similar to tcp_sequence(...) */
12830 ++static inline bool mptcp_sequence(const struct tcp_sock *meta_tp,
12831 ++ u64 data_seq, u64 end_data_seq)
12832 ++{
12833 ++ const struct mptcp_cb *mpcb = meta_tp->mpcb;
12834 ++ u64 rcv_wup64;
12835 ++
12836 ++ /* Wrap-around? */
12837 ++ if (meta_tp->rcv_wup > meta_tp->rcv_nxt) {
12838 ++ rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) |
12839 ++ meta_tp->rcv_wup;
12840 ++ } else {
12841 ++ rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
12842 ++ meta_tp->rcv_wup);
12843 ++ }
12844 ++
12845 ++ return !before64(end_data_seq, rcv_wup64) &&
12846 ++ !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp));
12847 ++}
12848 ++
12849 ++/* @return: 0 everything is fine. Just continue processing
12850 ++ * -1 this packet was broken - continue with the next one.
12851 ++ */
12852 ++static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb)
12853 ++{
12854 ++ struct tcp_sock *tp = tcp_sk(sk);
12855 ++ struct sk_buff *tmp, *tmp1;
12856 ++ u32 tcp_end_seq;
12857 ++
12858 ++ if (!tp->mptcp->mapping_present)
12859 ++ return 0;
12860 ++
12861 ++ /* either, the new skb gave us the mapping and the first segment
12862 ++ * in the sub-rcv-queue has to be trimmed ...
12863 ++ */
12864 ++ tmp = skb_peek(&sk->sk_receive_queue);
12865 ++ if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) &&
12866 ++ after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq))
12867 ++ mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq);
12868 ++
12869 ++ /* ... or the new skb (tail) has to be split at the end. */
12870 ++ tcp_end_seq = TCP_SKB_CB(skb)->end_seq - (tcp_hdr(skb)->fin ? 1 : 0);
12871 ++ if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
12872 ++ u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len;
12873 ++ if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */
12874 ++ /* TODO : maybe handle this here better.
12875 ++ * We now just force meta-retransmission.
12876 ++ */
12877 ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
12878 ++ __skb_unlink(skb, &sk->sk_receive_queue);
12879 ++ __kfree_skb(skb);
12880 ++ return -1;
12881 ++ }
12882 ++ }
12883 ++
12884 ++ /* Now, remove old sk_buff's from the receive-queue.
12885 ++ * This may happen if the mapping has been lost for these segments and
12886 ++ * the next mapping has already been received.
12887 ++ */
12888 ++ if (before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) {
12889 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12890 ++ if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq))
12891 ++ break;
12892 ++
12893 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12894 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12895 ++
12896 ++ /* Impossible that we could free skb here, because its
12897 ++ * mapping is known to be valid from previous checks
12898 ++ */
12899 ++ __kfree_skb(tmp1);
12900 ++ }
12901 ++ }
12902 ++
12903 ++ return 0;
12904 ++}
12905 ++
12906 ++/* @return: 0 everything is fine. Just continue processing
12907 ++ * 1 subflow is broken, stop everything
12908 ++ * -1 this mapping has been put in the meta-receive-queue
12909 ++ * -2 this mapping has been eaten by the application
12910 ++ */
12911 ++static int mptcp_queue_skb(struct sock *sk)
12912 ++{
12913 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
12914 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
12915 ++ struct mptcp_cb *mpcb = tp->mpcb;
12916 ++ struct sk_buff *tmp, *tmp1;
12917 ++ u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp);
12918 ++ bool data_queued = false;
12919 ++
12920 ++ /* Have we not yet received the full mapping? */
12921 ++ if (!tp->mptcp->mapping_present ||
12922 ++ before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12923 ++ return 0;
12924 ++
12925 ++ /* Is this an overlapping mapping? rcv_nxt >= end_data_seq
12926 ++ * OR
12927 ++ * This mapping is out of window
12928 ++ */
12929 ++ if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) ||
12930 ++ !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq,
12931 ++ tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) {
12932 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12933 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12934 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12935 ++ __kfree_skb(tmp1);
12936 ++
12937 ++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
12938 ++ !before(TCP_SKB_CB(tmp)->seq,
12939 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12940 ++ break;
12941 ++ }
12942 ++
12943 ++ mptcp_reset_mapping(tp);
12944 ++
12945 ++ return -1;
12946 ++ }
12947 ++
12948 ++ /* Record it, because we want to send our data_fin on the same path */
12949 ++ if (tp->mptcp->map_data_fin) {
12950 ++ mpcb->dfin_path_index = tp->mptcp->path_index;
12951 ++ mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN);
12952 ++ }
12953 ++
12954 ++ /* Verify the checksum */
12955 ++ if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) {
12956 ++ int ret = mptcp_verif_dss_csum(sk);
12957 ++
12958 ++ if (ret <= 0) {
12959 ++ mptcp_reset_mapping(tp);
12960 ++ return 1;
12961 ++ }
12962 ++ }
12963 ++
12964 ++ if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) {
12965 ++ /* Segments have to go to the meta-ofo-queue */
12966 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12967 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12968 ++ mptcp_prepare_skb(tmp1, sk);
12969 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12970 ++ /* MUST be done here, because fragstolen may be true later.
12971 ++ * Then, kfree_skb_partial will not account the memory.
12972 ++ */
12973 ++ skb_orphan(tmp1);
12974 ++
12975 ++ if (!mpcb->in_time_wait) /* In time-wait, do not receive data */
12976 ++ mptcp_add_meta_ofo_queue(meta_sk, tmp1, sk);
12977 ++ else
12978 ++ __kfree_skb(tmp1);
12979 ++
12980 ++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
12981 ++ !before(TCP_SKB_CB(tmp)->seq,
12982 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12983 ++ break;
12984 ++ }
12985 ++ tcp_enter_quickack_mode(sk);
12986 ++ } else {
12987 ++ /* Ready for the meta-rcv-queue */
12988 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12989 ++ int eaten = 0;
12990 ++ bool copied_early = false;
12991 ++ bool fragstolen = false;
12992 ++ u32 old_rcv_nxt = meta_tp->rcv_nxt;
12993 ++
12994 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12995 ++ mptcp_prepare_skb(tmp1, sk);
12996 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12997 ++ /* MUST be done here, because fragstolen may be true.
12998 ++ * Then, kfree_skb_partial will not account the memory.
12999 ++ */
13000 ++ skb_orphan(tmp1);
13001 ++
13002 ++ /* This segment has already been received */
13003 ++ if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) {
13004 ++ __kfree_skb(tmp1);
13005 ++ goto next;
13006 ++ }
13007 ++
13008 ++#ifdef CONFIG_NET_DMA
13009 ++ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
13010 ++ meta_tp->ucopy.task == current &&
13011 ++ meta_tp->copied_seq == meta_tp->rcv_nxt &&
13012 ++ tmp1->len <= meta_tp->ucopy.len &&
13013 ++ sock_owned_by_user(meta_sk) &&
13014 ++ tcp_dma_try_early_copy(meta_sk, tmp1, 0)) {
13015 ++ copied_early = true;
13016 ++ eaten = 1;
13017 ++ }
13018 ++#endif
13019 ++
13020 ++ /* Is direct copy possible ? */
13021 ++ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
13022 ++ meta_tp->ucopy.task == current &&
13023 ++ meta_tp->copied_seq == meta_tp->rcv_nxt &&
13024 ++ meta_tp->ucopy.len && sock_owned_by_user(meta_sk) &&
13025 ++ !copied_early)
13026 ++ eaten = mptcp_direct_copy(tmp1, meta_sk);
13027 ++
13028 ++ if (mpcb->in_time_wait) /* In time-wait, do not receive data */
13029 ++ eaten = 1;
13030 ++
13031 ++ if (!eaten)
13032 ++ eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen);
13033 ++
13034 ++ meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq;
13035 ++ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
13036 ++
13037 ++#ifdef CONFIG_NET_DMA
13038 ++ if (copied_early)
13039 ++ meta_tp->cleanup_rbuf(meta_sk, tmp1->len);
13040 ++#endif
13041 ++
13042 ++ if (tcp_hdr(tmp1)->fin && !mpcb->in_time_wait)
13043 ++ mptcp_fin(meta_sk);
13044 ++
13045 ++ /* Check if this fills a gap in the ofo queue */
13046 ++ if (!skb_queue_empty(&meta_tp->out_of_order_queue))
13047 ++ mptcp_ofo_queue(meta_sk);
13048 ++
13049 ++#ifdef CONFIG_NET_DMA
13050 ++ if (copied_early)
13051 ++ __skb_queue_tail(&meta_sk->sk_async_wait_queue,
13052 ++ tmp1);
13053 ++ else
13054 ++#endif
13055 ++ if (eaten)
13056 ++ kfree_skb_partial(tmp1, fragstolen);
13057 ++
13058 ++ data_queued = true;
13059 ++next:
13060 ++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
13061 ++ !before(TCP_SKB_CB(tmp)->seq,
13062 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
13063 ++ break;
13064 ++ }
13065 ++ }
13066 ++
13067 ++ inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp;
13068 ++ mptcp_reset_mapping(tp);
13069 ++
13070 ++ return data_queued ? -1 : -2;
13071 ++}
13072 ++
13073 ++void mptcp_data_ready(struct sock *sk)
13074 ++{
13075 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
13076 ++ struct sk_buff *skb, *tmp;
13077 ++ int queued = 0;
13078 ++
13079 ++ /* restart before the check, because mptcp_fin might have changed the
13080 ++ * state.
13081 ++ */
13082 ++restart:
13083 ++ /* If the meta cannot receive data, there is no point in pushing data.
13084 ++ * If we are in time-wait, we may still be waiting for the final FIN.
13085 ++ * So, we should proceed with the processing.
13086 ++ */
13087 ++ if (!mptcp_sk_can_recv(meta_sk) && !tcp_sk(sk)->mpcb->in_time_wait) {
13088 ++ skb_queue_purge(&sk->sk_receive_queue);
13089 ++ tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt;
13090 ++ goto exit;
13091 ++ }
13092 ++
13093 ++ /* Iterate over all segments, detect their mapping (if we don't have
13094 ++ * one yet), validate them and push everything one level higher.
13095 ++ */
13096 ++ skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
13097 ++ int ret;
13098 ++ /* Pre-validation - e.g., early fallback */
13099 ++ ret = mptcp_prevalidate_skb(sk, skb);
13100 ++ if (ret < 0)
13101 ++ goto restart;
13102 ++ else if (ret > 0)
13103 ++ break;
13104 ++
13105 ++ /* Set the current mapping */
13106 ++ ret = mptcp_detect_mapping(sk, skb);
13107 ++ if (ret < 0)
13108 ++ goto restart;
13109 ++ else if (ret > 0)
13110 ++ break;
13111 ++
13112 ++ /* Validation */
13113 ++ if (mptcp_validate_mapping(sk, skb) < 0)
13114 ++ goto restart;
13115 ++
13116 ++ /* Push a level higher */
13117 ++ ret = mptcp_queue_skb(sk);
13118 ++ if (ret < 0) {
13119 ++ if (ret == -1)
13120 ++ queued = ret;
13121 ++ goto restart;
13122 ++ } else if (ret == 0) {
13123 ++ continue;
13124 ++ } else { /* ret == 1 */
13125 ++ break;
13126 ++ }
13127 ++ }
13128 ++
13129 ++exit:
13130 ++ if (tcp_sk(sk)->close_it) {
13131 ++ tcp_send_ack(sk);
13132 ++ tcp_sk(sk)->ops->time_wait(sk, TCP_TIME_WAIT, 0);
13133 ++ }
13134 ++
13135 ++ if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD))
13136 ++ meta_sk->sk_data_ready(meta_sk);
13137 ++}
13138 ++
13139 ++
13140 ++int mptcp_check_req(struct sk_buff *skb, struct net *net)
13141 ++{
13142 ++ const struct tcphdr *th = tcp_hdr(skb);
13143 ++ struct sock *meta_sk = NULL;
13144 ++
13145 ++ /* MPTCP structures not initialized */
13146 ++ if (mptcp_init_failed)
13147 ++ return 0;
13148 ++
13149 ++ if (skb->protocol == htons(ETH_P_IP))
13150 ++ meta_sk = mptcp_v4_search_req(th->source, ip_hdr(skb)->saddr,
13151 ++ ip_hdr(skb)->daddr, net);
13152 ++#if IS_ENABLED(CONFIG_IPV6)
13153 ++ else /* IPv6 */
13154 ++ meta_sk = mptcp_v6_search_req(th->source, &ipv6_hdr(skb)->saddr,
13155 ++ &ipv6_hdr(skb)->daddr, net);
13156 ++#endif /* CONFIG_IPV6 */
13157 ++
13158 ++ if (!meta_sk)
13159 ++ return 0;
13160 ++
13161 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
13162 ++
13163 ++ bh_lock_sock_nested(meta_sk);
13164 ++ if (sock_owned_by_user(meta_sk)) {
13165 ++ skb->sk = meta_sk;
13166 ++ if (unlikely(sk_add_backlog(meta_sk, skb,
13167 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
13168 ++ bh_unlock_sock(meta_sk);
13169 ++ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
13170 ++ sock_put(meta_sk); /* Taken by mptcp_search_req */
13171 ++ kfree_skb(skb);
13172 ++ return 1;
13173 ++ }
13174 ++ } else if (skb->protocol == htons(ETH_P_IP)) {
13175 ++ tcp_v4_do_rcv(meta_sk, skb);
13176 ++#if IS_ENABLED(CONFIG_IPV6)
13177 ++ } else { /* IPv6 */
13178 ++ tcp_v6_do_rcv(meta_sk, skb);
13179 ++#endif /* CONFIG_IPV6 */
13180 ++ }
13181 ++ bh_unlock_sock(meta_sk);
13182 ++ sock_put(meta_sk); /* Taken by mptcp_vX_search_req */
13183 ++ return 1;
13184 ++}
13185 ++
13186 ++struct mp_join *mptcp_find_join(const struct sk_buff *skb)
13187 ++{
13188 ++ const struct tcphdr *th = tcp_hdr(skb);
13189 ++ unsigned char *ptr;
13190 ++ int length = (th->doff * 4) - sizeof(struct tcphdr);
13191 ++
13192 ++ /* Jump through the options to check whether JOIN is there */
13193 ++ ptr = (unsigned char *)(th + 1);
13194 ++ while (length > 0) {
13195 ++ int opcode = *ptr++;
13196 ++ int opsize;
13197 ++
13198 ++ switch (opcode) {
13199 ++ case TCPOPT_EOL:
13200 ++ return NULL;
13201 ++ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
13202 ++ length--;
13203 ++ continue;
13204 ++ default:
13205 ++ opsize = *ptr++;
13206 ++ if (opsize < 2) /* "silly options" */
13207 ++ return NULL;
13208 ++ if (opsize > length)
13209 ++ return NULL; /* don't parse partial options */
13210 ++ if (opcode == TCPOPT_MPTCP &&
13211 ++ ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) {
13212 ++ return (struct mp_join *)(ptr - 2);
13213 ++ }
13214 ++ ptr += opsize - 2;
13215 ++ length -= opsize;
13216 ++ }
13217 ++ }
13218 ++ return NULL;
13219 ++}
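
As a companion to mptcp_find_join() above, here is a self-contained user-space rendition of the same TCP option-walking pattern (EOL/NOP handling, length sanity checks, no partial options). The buffer contents and the kind-30 payload bytes below are made up for the demonstration; only the walking logic mirrors the function above:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static const uint8_t *find_tcp_option(const uint8_t *opts, int length,
				       uint8_t kind)
{
	const uint8_t *ptr = opts;

	while (length > 0) {
		uint8_t opcode = *ptr++;
		uint8_t opsize;

		if (opcode == 0)		/* TCPOPT_EOL */
			return NULL;
		if (opcode == 1) {		/* TCPOPT_NOP */
			length--;
			continue;
		}
		opsize = *ptr++;
		if (opsize < 2)			/* "silly options" */
			return NULL;
		if (opsize > length)
			return NULL;		/* don't parse partial options */
		if (opcode == kind)
			return ptr - 2;		/* points at the kind byte */
		ptr += opsize - 2;
		length -= opsize;
	}
	return NULL;
}

int main(void)
{
	/* NOP, NOP, MSS (kind 2, len 4, 1460), then a kind-30 option with a
	 * hypothetical 2-byte payload.
	 */
	const uint8_t opts[] = { 1, 1, 2, 4, 0x05, 0xb4, 30, 4, 0x10, 0x00 };
	const uint8_t *opt = find_tcp_option(opts, sizeof(opts), 30);

	printf("kind-30 option at offset %td\n", opt ? opt - opts : -1);
	return 0;
}
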
13220 ++
13221 ++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw)
13222 ++{
13223 ++ const struct mptcp_cb *mpcb;
13224 ++ struct sock *meta_sk;
13225 ++ u32 token;
13226 ++ bool meta_v4;
13227 ++ struct mp_join *join_opt = mptcp_find_join(skb);
13228 ++ if (!join_opt)
13229 ++ return 0;
13230 ++
13231 ++ /* MPTCP structures were not initialized, so return error */
13232 ++ if (mptcp_init_failed)
13233 ++ return -1;
13234 ++
13235 ++ token = join_opt->u.syn.token;
13236 ++ meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token);
13237 ++ if (!meta_sk) {
13238 ++ mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
13239 ++ return -1;
13240 ++ }
13241 ++
13242 ++ meta_v4 = meta_sk->sk_family == AF_INET;
13243 ++ if (meta_v4) {
13244 ++ if (skb->protocol == htons(ETH_P_IPV6)) {
13245 ++ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
13246 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13247 ++ return -1;
13248 ++ }
13249 ++ } else if (skb->protocol == htons(ETH_P_IP) &&
13250 ++ inet6_sk(meta_sk)->ipv6only) {
13251 ++ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
13252 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13253 ++ return -1;
13254 ++ }
13255 ++
13256 ++ mpcb = tcp_sk(meta_sk)->mpcb;
13257 ++ if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) {
13258 ++ /* We are in fallback-mode on the reception-side -
13259 ++ * no new subflows!
13260 ++ */
13261 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13262 ++ return -1;
13263 ++ }
13264 ++
13265 ++ /* Coming from time-wait-sock processing in tcp_v4_rcv.
13266 ++ * We have to deschedule it before continuing, because otherwise
13267 ++ * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req.
13268 ++ */
13269 ++ if (tw) {
13270 ++ inet_twsk_deschedule(tw, &tcp_death_row);
13271 ++ inet_twsk_put(tw);
13272 ++ }
13273 ++
13274 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
13275 ++ /* OK, this is a new syn/join, let's create a new open request and
13276 ++ * send syn+ack
13277 ++ */
13278 ++ bh_lock_sock_nested(meta_sk);
13279 ++ if (sock_owned_by_user(meta_sk)) {
13280 ++ skb->sk = meta_sk;
13281 ++ if (unlikely(sk_add_backlog(meta_sk, skb,
13282 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
13283 ++ bh_unlock_sock(meta_sk);
13284 ++ NET_INC_STATS_BH(sock_net(meta_sk),
13285 ++ LINUX_MIB_TCPBACKLOGDROP);
13286 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13287 ++ kfree_skb(skb);
13288 ++ return 1;
13289 ++ }
13290 ++ } else if (skb->protocol == htons(ETH_P_IP)) {
13291 ++ tcp_v4_do_rcv(meta_sk, skb);
13292 ++#if IS_ENABLED(CONFIG_IPV6)
13293 ++ } else {
13294 ++ tcp_v6_do_rcv(meta_sk, skb);
13295 ++#endif /* CONFIG_IPV6 */
13296 ++ }
13297 ++ bh_unlock_sock(meta_sk);
13298 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13299 ++ return 1;
13300 ++}
13301 ++
13302 ++int mptcp_do_join_short(struct sk_buff *skb,
13303 ++ const struct mptcp_options_received *mopt,
13304 ++ struct net *net)
13305 ++{
13306 ++ struct sock *meta_sk;
13307 ++ u32 token;
13308 ++ bool meta_v4;
13309 ++
13310 ++ token = mopt->mptcp_rem_token;
13311 ++ meta_sk = mptcp_hash_find(net, token);
13312 ++ if (!meta_sk) {
13313 ++ mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
13314 ++ return -1;
13315 ++ }
13316 ++
13317 ++ meta_v4 = meta_sk->sk_family == AF_INET;
13318 ++ if (meta_v4) {
13319 ++ if (skb->protocol == htons(ETH_P_IPV6)) {
13320 ++ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
13321 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13322 ++ return -1;
13323 ++ }
13324 ++ } else if (skb->protocol == htons(ETH_P_IP) &&
13325 ++ inet6_sk(meta_sk)->ipv6only) {
13326 ++ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
13327 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13328 ++ return -1;
13329 ++ }
13330 ++
13331 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
13332 ++
13333 ++ /* OK, this is a new syn/join, let's create a new open request and
13334 ++ * send syn+ack
13335 ++ */
13336 ++ bh_lock_sock(meta_sk);
13337 ++
13338 ++ /* This check is also done in mptcp_vX_do_rcv. But there we cannot
13339 ++ * call tcp_vX_send_reset, because we already hold two socket locks
13340 ++ * (the listener and the meta from above).
13341 ++ *
13342 ++ * And the send-reset will try to take yet another one (ip_send_reply).
13343 ++ * Thus, we propagate the reset up to tcp_rcv_state_process.
13344 ++ */
13345 ++ if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv ||
13346 ++ tcp_sk(meta_sk)->mpcb->send_infinite_mapping ||
13347 ++ meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) {
13348 ++ bh_unlock_sock(meta_sk);
13349 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13350 ++ return -1;
13351 ++ }
13352 ++
13353 ++ if (sock_owned_by_user(meta_sk)) {
13354 ++ skb->sk = meta_sk;
13355 ++ if (unlikely(sk_add_backlog(meta_sk, skb,
13356 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf)))
13357 ++ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
13358 ++ else
13359 ++ /* Must make sure that upper layers won't free the
13360 ++ * skb if it is added to the backlog-queue.
13361 ++ */
13362 ++ skb_get(skb);
13363 ++ } else {
13364 ++ /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as
13365 ++ * the skb will finally be freed by tcp_v4_do_rcv (where we are
13366 ++ * coming from)
13367 ++ */
13368 ++ skb_get(skb);
13369 ++ if (skb->protocol == htons(ETH_P_IP)) {
13370 ++ tcp_v4_do_rcv(meta_sk, skb);
13371 ++#if IS_ENABLED(CONFIG_IPV6)
13372 ++ } else { /* IPv6 */
13373 ++ tcp_v6_do_rcv(meta_sk, skb);
13374 ++#endif /* CONFIG_IPV6 */
13375 ++ }
13376 ++ }
13377 ++
13378 ++ bh_unlock_sock(meta_sk);
13379 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13380 ++ return 0;
13381 ++}
13382 ++
13383 ++/**
13384 ++ * Equivalent of tcp_fin() for MPTCP
13385 ++ * Can be called only once the FIN is validly part of the
13386 ++ * data seqnum space, i.e. not earlier, while there are still holes.
13387 ++ */
13388 ++void mptcp_fin(struct sock *meta_sk)
13389 ++{
13390 ++ struct sock *sk = NULL, *sk_it;
13391 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
13392 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
13393 ++
13394 ++ mptcp_for_each_sk(mpcb, sk_it) {
13395 ++ if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) {
13396 ++ sk = sk_it;
13397 ++ break;
13398 ++ }
13399 ++ }
13400 ++
13401 ++ if (!sk || sk->sk_state == TCP_CLOSE)
13402 ++ sk = mptcp_select_ack_sock(meta_sk);
13403 ++
13404 ++ inet_csk_schedule_ack(sk);
13405 ++
13406 ++ meta_sk->sk_shutdown |= RCV_SHUTDOWN;
13407 ++ sock_set_flag(meta_sk, SOCK_DONE);
13408 ++
13409 ++ switch (meta_sk->sk_state) {
13410 ++ case TCP_SYN_RECV:
13411 ++ case TCP_ESTABLISHED:
13412 ++ /* Move to CLOSE_WAIT */
13413 ++ tcp_set_state(meta_sk, TCP_CLOSE_WAIT);
13414 ++ inet_csk(sk)->icsk_ack.pingpong = 1;
13415 ++ break;
13416 ++
13417 ++ case TCP_CLOSE_WAIT:
13418 ++ case TCP_CLOSING:
13419 ++ /* Received a retransmission of the FIN, do
13420 ++ * nothing.
13421 ++ */
13422 ++ break;
13423 ++ case TCP_LAST_ACK:
13424 ++ /* RFC793: Remain in the LAST-ACK state. */
13425 ++ break;
13426 ++
13427 ++ case TCP_FIN_WAIT1:
13428 ++ /* This case occurs when a simultaneous close
13429 ++ * happens, we must ack the received FIN and
13430 ++ * enter the CLOSING state.
13431 ++ */
13432 ++ tcp_send_ack(sk);
13433 ++ tcp_set_state(meta_sk, TCP_CLOSING);
13434 ++ break;
13435 ++ case TCP_FIN_WAIT2:
13436 ++ /* Received a FIN -- send ACK and enter TIME_WAIT. */
13437 ++ tcp_send_ack(sk);
13438 ++ meta_tp->ops->time_wait(meta_sk, TCP_TIME_WAIT, 0);
13439 ++ break;
13440 ++ default:
13441 ++ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
13442 ++ * cases we should never reach this piece of code.
13443 ++ */
13444 ++ pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__,
13445 ++ meta_sk->sk_state);
13446 ++ break;
13447 ++ }
13448 ++
13449 ++ /* It _is_ possible that we have something out-of-order _after_ the FIN.
13450 ++ * We should probably reset in this case. For now, drop them.
13451 ++ */
13452 ++ mptcp_purge_ofo_queue(meta_tp);
13453 ++ sk_mem_reclaim(meta_sk);
13454 ++
13455 ++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
13456 ++ meta_sk->sk_state_change(meta_sk);
13457 ++
13458 ++ /* Do not send POLL_HUP for half duplex close. */
13459 ++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK ||
13460 ++ meta_sk->sk_state == TCP_CLOSE)
13461 ++ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP);
13462 ++ else
13463 ++ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN);
13464 ++ }
13465 ++
13466 ++ return;
13467 ++}
13468 ++
13469 ++static void mptcp_xmit_retransmit_queue(struct sock *meta_sk)
13470 ++{
13471 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
13472 ++ struct sk_buff *skb;
13473 ++
13474 ++ if (!meta_tp->packets_out)
13475 ++ return;
13476 ++
13477 ++ tcp_for_write_queue(skb, meta_sk) {
13478 ++ if (skb == tcp_send_head(meta_sk))
13479 ++ break;
13480 ++
13481 ++ if (mptcp_retransmit_skb(meta_sk, skb))
13482 ++ return;
13483 ++
13484 ++ if (skb == tcp_write_queue_head(meta_sk))
13485 ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
13486 ++ inet_csk(meta_sk)->icsk_rto,
13487 ++ TCP_RTO_MAX);
13488 ++ }
13489 ++}
13490 ++
13491 ++/* Handle the DATA_ACK */
13492 ++static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb)
13493 ++{
13494 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
13495 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
13496 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
13497 ++ u32 prior_snd_una = meta_tp->snd_una;
13498 ++ int prior_packets;
13499 ++ u32 nwin, data_ack, data_seq;
13500 ++ u16 data_len = 0;
13501 ++
13502 ++ /* A valid packet came in - subflow is operational again */
13503 ++ tp->pf = 0;
13504 ++
13505 ++ /* Even if there is no data-ack, we stop retransmitting,
13506 ++ * except if this is a SYN/ACK; then it is just a retransmission.
13507 ++ */
13508 ++ if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) {
13509 ++ tp->mptcp->pre_established = 0;
13510 ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
13511 ++ }
13512 ++
13513 ++ /* If we are in infinite mapping mode, rx_opt.data_ack has been
13514 ++ * set by mptcp_clean_rtx_infinite.
13515 ++ */
13516 ++ if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd)
13517 ++ goto exit;
13518 ++
13519 ++ data_ack = tp->mptcp->rx_opt.data_ack;
13520 ++
13521 ++ if (unlikely(!tp->mptcp->fully_established) &&
13522 ++ tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)
13523 ++ /* As soon as a subflow-data-ack (not acking syn, thus snt_isn + 1)
13524 ++ * includes a data-ack, we are fully established
13525 ++ */
13526 ++ mptcp_become_fully_estab(sk);
13527 ++
13528 ++ /* Get the data_seq */
13529 ++ if (mptcp_is_data_seq(skb)) {
13530 ++ data_seq = tp->mptcp->rx_opt.data_seq;
13531 ++ data_len = tp->mptcp->rx_opt.data_len;
13532 ++ } else {
13533 ++ data_seq = meta_tp->snd_wl1;
13534 ++ }
13535 ++
13536 ++ /* If the ack is older than previous acks
13537 ++ * then we can probably ignore it.
13538 ++ */
13539 ++ if (before(data_ack, prior_snd_una))
13540 ++ goto exit;
13541 ++
13542 ++ /* If the ack includes data we haven't sent yet, discard
13543 ++ * this segment (RFC793 Section 3.9).
13544 ++ */
13545 ++ if (after(data_ack, meta_tp->snd_nxt))
13546 ++ goto exit;
13547 ++
13548 ++ /*** Now, update the window - inspired by tcp_ack_update_window ***/
13549 ++ nwin = ntohs(tcp_hdr(skb)->window);
13550 ++
13551 ++ if (likely(!tcp_hdr(skb)->syn))
13552 ++ nwin <<= tp->rx_opt.snd_wscale;
13553 ++
13554 ++ if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) {
13555 ++ tcp_update_wl(meta_tp, data_seq);
13556 ++
13557 ++ /* Draft v09, Section 3.3.5:
13558 ++ * [...] It should only update its local receive window values
13559 ++ * when the largest sequence number allowed (i.e. DATA_ACK +
13560 ++ * receive window) increases. [...]
13561 ++ */
13562 ++ if (meta_tp->snd_wnd != nwin &&
13563 ++ !before(data_ack + nwin, tcp_wnd_end(meta_tp))) {
13564 ++ meta_tp->snd_wnd = nwin;
13565 ++
13566 ++ if (nwin > meta_tp->max_window)
13567 ++ meta_tp->max_window = nwin;
13568 ++ }
13569 ++ }
13570 ++ /*** Done, update the window ***/
13571 ++
13572 ++ /* We passed data and got it acked, remove any soft error
13573 ++ * log. Something worked...
13574 ++ */
13575 ++ sk->sk_err_soft = 0;
13576 ++ inet_csk(meta_sk)->icsk_probes_out = 0;
13577 ++ meta_tp->rcv_tstamp = tcp_time_stamp;
13578 ++ prior_packets = meta_tp->packets_out;
13579 ++ if (!prior_packets)
13580 ++ goto no_queue;
13581 ++
13582 ++ meta_tp->snd_una = data_ack;
13583 ++
13584 ++ mptcp_clean_rtx_queue(meta_sk, prior_snd_una);
13585 ++
13586 ++ /* We are in loss-state, and something got acked, retransmit the whole
13587 ++ * queue now!
13588 ++ */
13589 ++ if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss &&
13590 ++ after(data_ack, prior_snd_una)) {
13591 ++ mptcp_xmit_retransmit_queue(meta_sk);
13592 ++ inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open;
13593 ++ }
13594 ++
13595 ++ /* Simplified version of tcp_new_space, because the snd-buffer
13596 ++ * is handled by all the subflows.
13597 ++ */
13598 ++ if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) {
13599 ++ sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK);
13600 ++ if (meta_sk->sk_socket &&
13601 ++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
13602 ++ meta_sk->sk_write_space(meta_sk);
13603 ++ }
13604 ++
13605 ++ if (meta_sk->sk_state != TCP_ESTABLISHED &&
13606 ++ mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len))
13607 ++ return;
13608 ++
13609 ++exit:
13610 ++ mptcp_push_pending_frames(meta_sk);
13611 ++
13612 ++ return;
13613 ++
13614 ++no_queue:
13615 ++ if (tcp_send_head(meta_sk))
13616 ++ tcp_ack_probe(meta_sk);
13617 ++
13618 ++ mptcp_push_pending_frames(meta_sk);
13619 ++
13620 ++ return;
13621 ++}
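
The window handling above enforces the rule quoted from the MPTCP draft. As a rough stand-alone illustration (not kernel code; before() and the window-edge helper are re-modelled locally, and only the edge check is shown), the advertisement is taken over only when DATA_ACK plus the advertised window pushes the right edge (snd_una + snd_wnd) forward:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

static uint32_t wnd_end(uint32_t snd_una, uint32_t snd_wnd)
{
	return snd_una + snd_wnd;	/* right edge of the current window */
}

static bool may_raise_window(uint32_t snd_una, uint32_t snd_wnd,
			     uint32_t data_ack, uint32_t nwin)
{
	/* mirrors: !before(data_ack + nwin, tcp_wnd_end(meta_tp)) */
	return !before(data_ack + nwin, wnd_end(snd_una, snd_wnd));
}

int main(void)
{
	uint32_t snd_una = 1000, snd_wnd = 4000;

	/* New edge 1500 + 4000 = 5500 >= 5000: take the new window (1). */
	printf("%d\n", may_raise_window(snd_una, snd_wnd, 1500, 4000));
	/* New edge 1500 + 3000 = 4500 < 5000: keep the old one (0). */
	printf("%d\n", may_raise_window(snd_una, snd_wnd, 1500, 3000));
	return 0;
}
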
13622 ++
13623 ++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk)
13624 ++{
13625 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk));
13626 ++
13627 ++ if (!tp->mpcb->infinite_mapping_snd)
13628 ++ return;
13629 ++
13630 ++ /* The difference between both write_seq's represents the offset between
13631 ++ * data-sequence and subflow-sequence. As we are infinite, this must
13632 ++ * match.
13633 ++ *
13634 ++ * Thus, from this difference we can infer the meta snd_una.
13635 ++ */
13636 ++ tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt +
13637 ++ tp->snd_una;
13638 ++
13639 ++ mptcp_data_ack(sk, skb);
13640 ++}
13641 ++
13642 ++/**** static functions used by mptcp_parse_options */
13643 ++
13644 ++static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id)
13645 ++{
13646 ++ struct sock *sk_it, *tmpsk;
13647 ++
13648 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
13649 ++ if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) {
13650 ++ mptcp_reinject_data(sk_it, 0);
13651 ++ sk_it->sk_err = ECONNRESET;
13652 ++ if (tcp_need_reset(sk_it->sk_state))
13653 ++ tcp_sk(sk_it)->ops->send_active_reset(sk_it,
13654 ++ GFP_ATOMIC);
13655 ++ mptcp_sub_force_close(sk_it);
13656 ++ }
13657 ++ }
13658 ++}
13659 ++
13660 ++void mptcp_parse_options(const uint8_t *ptr, int opsize,
13661 ++ struct mptcp_options_received *mopt,
13662 ++ const struct sk_buff *skb)
13663 ++{
13664 ++ const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr;
13665 ++
13666 ++ /* If the socket is mp-capable we would have a mopt. */
13667 ++ if (!mopt)
13668 ++ return;
13669 ++
13670 ++ switch (mp_opt->sub) {
13671 ++ case MPTCP_SUB_CAPABLE:
13672 ++ {
13673 ++ const struct mp_capable *mpcapable = (struct mp_capable *)ptr;
13674 ++
13675 ++ if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN &&
13676 ++ opsize != MPTCP_SUB_LEN_CAPABLE_ACK) {
13677 ++ mptcp_debug("%s: mp_capable: bad option size %d\n",
13678 ++ __func__, opsize);
13679 ++ break;
13680 ++ }
13681 ++
13682 ++ if (!sysctl_mptcp_enabled)
13683 ++ break;
13684 ++
13685 ++ /* We only support MPTCP version 0 */
13686 ++ if (mpcapable->ver != 0)
13687 ++ break;
13688 ++
13689 ++ /* MPTCP-RFC 6824:
13690 ++ * "If receiving a message with the 'B' flag set to 1, and this
13691 ++ * is not understood, then this SYN MUST be silently ignored;
13692 ++ */
13693 ++ if (mpcapable->b) {
13694 ++ mopt->drop_me = 1;
13695 ++ break;
13696 ++ }
13697 ++
13698 ++ /* MPTCP-RFC 6824:
13699 ++ * "An implementation that only supports this method MUST set
13700 ++ * bit "H" to 1, and bits "C" through "G" to 0."
13701 ++ */
13702 ++ if (!mpcapable->h)
13703 ++ break;
13704 ++
13705 ++ mopt->saw_mpc = 1;
13706 ++ mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a;
13707 ++
13708 ++ if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN)
13709 ++ mopt->mptcp_key = mpcapable->sender_key;
13710 ++
13711 ++ break;
13712 ++ }
13713 ++ case MPTCP_SUB_JOIN:
13714 ++ {
13715 ++ const struct mp_join *mpjoin = (struct mp_join *)ptr;
13716 ++
13717 ++ if (opsize != MPTCP_SUB_LEN_JOIN_SYN &&
13718 ++ opsize != MPTCP_SUB_LEN_JOIN_SYNACK &&
13719 ++ opsize != MPTCP_SUB_LEN_JOIN_ACK) {
13720 ++ mptcp_debug("%s: mp_join: bad option size %d\n",
13721 ++ __func__, opsize);
13722 ++ break;
13723 ++ }
13724 ++
13725 ++ /* saw_mpc must be set, because in tcp_check_req we assume that
13726 ++ * it is set to support falling back to reg. TCP if a rexmitted
13727 ++ * SYN has no MP_CAPABLE or MP_JOIN
13728 ++ */
13729 ++ switch (opsize) {
13730 ++ case MPTCP_SUB_LEN_JOIN_SYN:
13731 ++ mopt->is_mp_join = 1;
13732 ++ mopt->saw_mpc = 1;
13733 ++ mopt->low_prio = mpjoin->b;
13734 ++ mopt->rem_id = mpjoin->addr_id;
13735 ++ mopt->mptcp_rem_token = mpjoin->u.syn.token;
13736 ++ mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce;
13737 ++ break;
13738 ++ case MPTCP_SUB_LEN_JOIN_SYNACK:
13739 ++ mopt->saw_mpc = 1;
13740 ++ mopt->low_prio = mpjoin->b;
13741 ++ mopt->rem_id = mpjoin->addr_id;
13742 ++ mopt->mptcp_recv_tmac = mpjoin->u.synack.mac;
13743 ++ mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce;
13744 ++ break;
13745 ++ case MPTCP_SUB_LEN_JOIN_ACK:
13746 ++ mopt->saw_mpc = 1;
13747 ++ mopt->join_ack = 1;
13748 ++ memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20);
13749 ++ break;
13750 ++ }
13751 ++ break;
13752 ++ }
13753 ++ case MPTCP_SUB_DSS:
13754 ++ {
13755 ++ const struct mp_dss *mdss = (struct mp_dss *)ptr;
13756 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
13757 ++
13758 ++ /* We check opsize for the csum and non-csum case. We do this,
13759 ++ * because the draft says that the csum SHOULD be ignored if
13760 ++ * it has not been negotiated in the MP_CAPABLE but still is
13761 ++ * present in the data.
13762 ++ *
13763 ++ * It will get ignored later in mptcp_queue_skb.
13764 ++ */
13765 ++ if (opsize != mptcp_sub_len_dss(mdss, 0) &&
13766 ++ opsize != mptcp_sub_len_dss(mdss, 1)) {
13767 ++ mptcp_debug("%s: mp_dss: bad option size %d\n",
13768 ++ __func__, opsize);
13769 ++ break;
13770 ++ }
13771 ++
13772 ++ ptr += 4;
13773 ++
13774 ++ if (mdss->A) {
13775 ++ tcb->mptcp_flags |= MPTCPHDR_ACK;
13776 ++
13777 ++ if (mdss->a) {
13778 ++ mopt->data_ack = (u32) get_unaligned_be64(ptr);
13779 ++ ptr += MPTCP_SUB_LEN_ACK_64;
13780 ++ } else {
13781 ++ mopt->data_ack = get_unaligned_be32(ptr);
13782 ++ ptr += MPTCP_SUB_LEN_ACK;
13783 ++ }
13784 ++ }
13785 ++
13786 ++ tcb->dss_off = (ptr - skb_transport_header(skb));
13787 ++
13788 ++ if (mdss->M) {
13789 ++ if (mdss->m) {
13790 ++ u64 data_seq64 = get_unaligned_be64(ptr);
13791 ++
13792 ++ tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET;
13793 ++ mopt->data_seq = (u32) data_seq64;
13794 ++
13795 ++ ptr += 12; /* 64-bit dseq + subseq */
13796 ++ } else {
13797 ++ mopt->data_seq = get_unaligned_be32(ptr);
13798 ++ ptr += 8; /* 32-bit dseq + subseq */
13799 ++ }
13800 ++ mopt->data_len = get_unaligned_be16(ptr);
13801 ++
13802 ++ tcb->mptcp_flags |= MPTCPHDR_SEQ;
13803 ++
13804 ++ /* Is a check-sum present? */
13805 ++ if (opsize == mptcp_sub_len_dss(mdss, 1))
13806 ++ tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM;
13807 ++
13808 ++ /* DATA_FIN only possible with DSS-mapping */
13809 ++ if (mdss->F)
13810 ++ tcb->mptcp_flags |= MPTCPHDR_FIN;
13811 ++ }
13812 ++
13813 ++ break;
13814 ++ }
13815 ++ case MPTCP_SUB_ADD_ADDR:
13816 ++ {
13817 ++#if IS_ENABLED(CONFIG_IPV6)
13818 ++ const struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
13819 ++
13820 ++ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
13821 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
13822 ++ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
13823 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) {
13824 ++#else
13825 ++ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
13826 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) {
13827 ++#endif /* CONFIG_IPV6 */
13828 ++ mptcp_debug("%s: mp_add_addr: bad option size %d\n",
13829 ++ __func__, opsize);
13830 ++ break;
13831 ++ }
13832 ++
13833 ++ /* We have to manually parse the options if we got two of them. */
13834 ++ if (mopt->saw_add_addr) {
13835 ++ mopt->more_add_addr = 1;
13836 ++ break;
13837 ++ }
13838 ++ mopt->saw_add_addr = 1;
13839 ++ mopt->add_addr_ptr = ptr;
13840 ++ break;
13841 ++ }
13842 ++ case MPTCP_SUB_REMOVE_ADDR:
13843 ++ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
13844 ++ mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
13845 ++ __func__, opsize);
13846 ++ break;
13847 ++ }
13848 ++
13849 ++ if (mopt->saw_rem_addr) {
13850 ++ mopt->more_rem_addr = 1;
13851 ++ break;
13852 ++ }
13853 ++ mopt->saw_rem_addr = 1;
13854 ++ mopt->rem_addr_ptr = ptr;
13855 ++ break;
13856 ++ case MPTCP_SUB_PRIO:
13857 ++ {
13858 ++ const struct mp_prio *mpprio = (struct mp_prio *)ptr;
13859 ++
13860 ++ if (opsize != MPTCP_SUB_LEN_PRIO &&
13861 ++ opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
13862 ++ mptcp_debug("%s: mp_prio: bad option size %d\n",
13863 ++ __func__, opsize);
13864 ++ break;
13865 ++ }
13866 ++
13867 ++ mopt->saw_low_prio = 1;
13868 ++ mopt->low_prio = mpprio->b;
13869 ++
13870 ++ if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
13871 ++ mopt->saw_low_prio = 2;
13872 ++ mopt->prio_addr_id = mpprio->addr_id;
13873 ++ }
13874 ++ break;
13875 ++ }
13876 ++ case MPTCP_SUB_FAIL:
13877 ++ if (opsize != MPTCP_SUB_LEN_FAIL) {
13878 ++ mptcp_debug("%s: mp_fail: bad option size %d\n",
13879 ++ __func__, opsize);
13880 ++ break;
13881 ++ }
13882 ++ mopt->mp_fail = 1;
13883 ++ break;
13884 ++ case MPTCP_SUB_FCLOSE:
13885 ++ if (opsize != MPTCP_SUB_LEN_FCLOSE) {
13886 ++ mptcp_debug("%s: mp_fclose: bad option size %d\n",
13887 ++ __func__, opsize);
13888 ++ break;
13889 ++ }
13890 ++
13891 ++ mopt->mp_fclose = 1;
13892 ++ mopt->mptcp_key = ((struct mp_fclose *)ptr)->key;
13893 ++
13894 ++ break;
13895 ++ default:
13896 ++ mptcp_debug("%s: Received unkown subtype: %d\n",
13897 ++ __func__, mp_opt->sub);
13898 ++ break;
13899 ++ }
13900 ++}
13901 ++
13902 ++/** Parse only MPTCP options */
13903 ++void tcp_parse_mptcp_options(const struct sk_buff *skb,
13904 ++ struct mptcp_options_received *mopt)
13905 ++{
13906 ++ const struct tcphdr *th = tcp_hdr(skb);
13907 ++ int length = (th->doff * 4) - sizeof(struct tcphdr);
13908 ++ const unsigned char *ptr = (const unsigned char *)(th + 1);
13909 ++
13910 ++ while (length > 0) {
13911 ++ int opcode = *ptr++;
13912 ++ int opsize;
13913 ++
13914 ++ switch (opcode) {
13915 ++ case TCPOPT_EOL:
13916 ++ return;
13917 ++ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
13918 ++ length--;
13919 ++ continue;
13920 ++ default:
13921 ++ opsize = *ptr++;
13922 ++ if (opsize < 2) /* "silly options" */
13923 ++ return;
13924 ++ if (opsize > length)
13925 ++ return; /* don't parse partial options */
13926 ++ if (opcode == TCPOPT_MPTCP)
13927 ++ mptcp_parse_options(ptr - 2, opsize, mopt, skb);
13928 ++ }
13929 ++ ptr += opsize - 2;
13930 ++ length -= opsize;
13931 ++ }
13932 ++}
13933 ++
13934 ++int mptcp_check_rtt(const struct tcp_sock *tp, int time)
13935 ++{
13936 ++ struct mptcp_cb *mpcb = tp->mpcb;
13937 ++ struct sock *sk;
13938 ++ u32 rtt_max = 0;
13939 ++
13940 ++ /* In MPTCP, we take the max delay across all flows,
13941 ++ * in order to take into account meta-reordering buffers.
13942 ++ */
13943 ++ mptcp_for_each_sk(mpcb, sk) {
13944 ++ if (!mptcp_sk_can_recv(sk))
13945 ++ continue;
13946 ++
13947 ++ if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt)
13948 ++ rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt;
13949 ++ }
13950 ++ if (time < (rtt_max >> 3) || !rtt_max)
13951 ++ return 1;
13952 ++
13953 ++ return 0;
13954 ++}
13955 ++
13956 ++static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk)
13957 ++{
13958 ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
13959 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
13960 ++ __be16 port = 0;
13961 ++ union inet_addr addr;
13962 ++ sa_family_t family;
13963 ++
13964 ++ if (mpadd->ipver == 4) {
13965 ++ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2)
13966 ++ port = mpadd->u.v4.port;
13967 ++ family = AF_INET;
13968 ++ addr.in = mpadd->u.v4.addr;
13969 ++#if IS_ENABLED(CONFIG_IPV6)
13970 ++ } else if (mpadd->ipver == 6) {
13971 ++ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2)
13972 ++ port = mpadd->u.v6.port;
13973 ++ family = AF_INET6;
13974 ++ addr.in6 = mpadd->u.v6.addr;
13975 ++#endif /* CONFIG_IPV6 */
13976 ++ } else {
13977 ++ return;
13978 ++ }
13979 ++
13980 ++ if (mpcb->pm_ops->add_raddr)
13981 ++ mpcb->pm_ops->add_raddr(mpcb, &addr, family, port, mpadd->addr_id);
13982 ++}
13983 ++
13984 ++static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk)
13985 ++{
13986 ++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
13987 ++ int i;
13988 ++ u8 rem_id;
13989 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
13990 ++
13991 ++ for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) {
13992 ++ rem_id = (&mprem->addrs_id)[i];
13993 ++
13994 ++ if (mpcb->pm_ops->rem_raddr)
13995 ++ mpcb->pm_ops->rem_raddr(mpcb, rem_id);
13996 ++ mptcp_send_reset_rem_id(mpcb, rem_id);
13997 ++ }
13998 ++}
13999 ++
14000 ++static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk)
14001 ++{
14002 ++ struct tcphdr *th = tcp_hdr(skb);
14003 ++ unsigned char *ptr;
14004 ++ int length = (th->doff * 4) - sizeof(struct tcphdr);
14005 ++
14006 ++ /* Jump through the options to check whether ADD_ADDR is there */
14007 ++ ptr = (unsigned char *)(th + 1);
14008 ++ while (length > 0) {
14009 ++ int opcode = *ptr++;
14010 ++ int opsize;
14011 ++
14012 ++ switch (opcode) {
14013 ++ case TCPOPT_EOL:
14014 ++ return;
14015 ++ case TCPOPT_NOP:
14016 ++ length--;
14017 ++ continue;
14018 ++ default:
14019 ++ opsize = *ptr++;
14020 ++ if (opsize < 2)
14021 ++ return;
14022 ++ if (opsize > length)
14023 ++ return; /* don't parse partial options */
14024 ++ if (opcode == TCPOPT_MPTCP &&
14025 ++ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) {
14026 ++#if IS_ENABLED(CONFIG_IPV6)
14027 ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
14028 ++ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
14029 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
14030 ++ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
14031 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2))
14032 ++#else
14033 ++ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
14034 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2)
14035 ++#endif /* CONFIG_IPV6 */
14036 ++ goto cont;
14037 ++
14038 ++ mptcp_handle_add_addr(ptr, sk);
14039 ++ }
14040 ++ if (opcode == TCPOPT_MPTCP &&
14041 ++ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) {
14042 ++ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0)
14043 ++ goto cont;
14044 ++
14045 ++ mptcp_handle_rem_addr(ptr, sk);
14046 ++ }
14047 ++cont:
14048 ++ ptr += opsize - 2;
14049 ++ length -= opsize;
14050 ++ }
14051 ++ }
14052 ++ return;
14053 ++}
14054 ++
14055 ++static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th)
14056 ++{
14057 ++ struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
14058 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
14059 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
14060 ++
14061 ++ if (unlikely(mptcp->rx_opt.mp_fail)) {
14062 ++ mptcp->rx_opt.mp_fail = 0;
14063 ++
14064 ++ if (!th->rst && !mpcb->infinite_mapping_snd) {
14065 ++ struct sock *sk_it;
14066 ++
14067 ++ mpcb->send_infinite_mapping = 1;
14068 ++ /* We resend everything that has not been acknowledged */
14069 ++ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
14070 ++
14071 ++ /* We artificially restart the whole send-queue. Thus,
14072 ++ * it is as if no packets are in flight
14073 ++ */
14074 ++ tcp_sk(meta_sk)->packets_out = 0;
14075 ++
14076 ++ /* If the snd_nxt already wrapped around, we have to
14077 ++ * undo the wrapping, as we are restarting from snd_una
14078 ++ * on.
14079 ++ */
14080 ++ if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) {
14081 ++ mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2;
14082 ++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
14083 ++ }
14084 ++ tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una;
14085 ++
14086 ++ /* Trigger a sending on the meta. */
14087 ++ mptcp_push_pending_frames(meta_sk);
14088 ++
14089 ++ mptcp_for_each_sk(mpcb, sk_it) {
14090 ++ if (sk != sk_it)
14091 ++ mptcp_sub_force_close(sk_it);
14092 ++ }
14093 ++ }
14094 ++
14095 ++ return 0;
14096 ++ }
14097 ++
14098 ++ if (unlikely(mptcp->rx_opt.mp_fclose)) {
14099 ++ struct sock *sk_it, *tmpsk;
14100 ++
14101 ++ mptcp->rx_opt.mp_fclose = 0;
14102 ++ if (mptcp->rx_opt.mptcp_key != mpcb->mptcp_loc_key)
14103 ++ return 0;
14104 ++
14105 ++ if (tcp_need_reset(sk->sk_state))
14106 ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
14107 ++
14108 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk)
14109 ++ mptcp_sub_force_close(sk_it);
14110 ++
14111 ++ tcp_reset(meta_sk);
14112 ++
14113 ++ return 1;
14114 ++ }
14115 ++
14116 ++ return 0;
14117 ++}
14118 ++
14119 ++static inline void mptcp_path_array_check(struct sock *meta_sk)
14120 ++{
14121 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
14122 ++
14123 ++ if (unlikely(mpcb->list_rcvd)) {
14124 ++ mpcb->list_rcvd = 0;
14125 ++ if (mpcb->pm_ops->new_remote_address)
14126 ++ mpcb->pm_ops->new_remote_address(meta_sk);
14127 ++ }
14128 ++}
14129 ++
14130 ++int mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
14131 ++ const struct sk_buff *skb)
14132 ++{
14133 ++ struct tcp_sock *tp = tcp_sk(sk);
14134 ++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
14135 ++
14136 ++ if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd)
14137 ++ return 0;
14138 ++
14139 ++ if (mptcp_mp_fail_rcvd(sk, th))
14140 ++ return 1;
14141 ++
14142 ++ /* RFC 6824, Section 3.3:
14143 ++ * If a checksum is not present when its use has been negotiated, the
14144 ++ * receiver MUST close the subflow with a RST as it is considered broken.
14145 ++ */
14146 ++ if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum &&
14147 ++ !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) {
14148 ++ if (tcp_need_reset(sk->sk_state))
14149 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
14150 ++
14151 ++ mptcp_sub_force_close(sk);
14152 ++ return 1;
14153 ++ }
14154 ++
14155 ++ /* We have to acknowledge retransmissions of the third
14156 ++ * ack.
14157 ++ */
14158 ++ if (mopt->join_ack) {
14159 ++ tcp_send_delayed_ack(sk);
14160 ++ mopt->join_ack = 0;
14161 ++ }
14162 ++
14163 ++ if (mopt->saw_add_addr || mopt->saw_rem_addr) {
14164 ++ if (mopt->more_add_addr || mopt->more_rem_addr) {
14165 ++ mptcp_parse_addropt(skb, sk);
14166 ++ } else {
14167 ++ if (mopt->saw_add_addr)
14168 ++ mptcp_handle_add_addr(mopt->add_addr_ptr, sk);
14169 ++ if (mopt->saw_rem_addr)
14170 ++ mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk);
14171 ++ }
14172 ++
14173 ++ mopt->more_add_addr = 0;
14174 ++ mopt->saw_add_addr = 0;
14175 ++ mopt->more_rem_addr = 0;
14176 ++ mopt->saw_rem_addr = 0;
14177 ++ }
14178 ++ if (mopt->saw_low_prio) {
14179 ++ if (mopt->saw_low_prio == 1) {
14180 ++ tp->mptcp->rcv_low_prio = mopt->low_prio;
14181 ++ } else {
14182 ++ struct sock *sk_it;
14183 ++ mptcp_for_each_sk(tp->mpcb, sk_it) {
14184 ++ struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp;
14185 ++ if (mptcp->rem_id == mopt->prio_addr_id)
14186 ++ mptcp->rcv_low_prio = mopt->low_prio;
14187 ++ }
14188 ++ }
14189 ++ mopt->saw_low_prio = 0;
14190 ++ }
14191 ++
14192 ++ mptcp_data_ack(sk, skb);
14193 ++
14194 ++ mptcp_path_array_check(mptcp_meta_sk(sk));
14195 ++ /* Socket may have been mp_killed by a REMOVE_ADDR */
14196 ++ if (tp->mp_killed)
14197 ++ return 1;
14198 ++
14199 ++ return 0;
14200 ++}
14201 ++
14202 ++/* In case of fastopen, some data can already be in the write queue.
14203 ++ * We need to update the sequence number of the segments as they
14204 ++ * were initially TCP sequence numbers.
14205 ++ */
14206 ++static void mptcp_rcv_synsent_fastopen(struct sock *meta_sk)
14207 ++{
14208 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14209 ++ struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk);
14210 ++ struct sk_buff *skb;
14211 ++ u32 new_mapping = meta_tp->write_seq - master_tp->snd_una;
14212 ++
14213 ++ /* There should only be one skb in write queue: the data not
14214 ++ * acknowledged in the SYN+ACK. In this case, we need to map
14215 ++ * this data to data sequence numbers.
14216 ++ */
14217 ++ skb_queue_walk(&meta_sk->sk_write_queue, skb) {
14218 ++ /* If the server only acknowledges partially the data sent in
14219 ++ * the SYN, we need to trim the acknowledged part because
14220 ++ * we don't want to retransmit this already received data.
14221 ++ * When we reach this point, tcp_ack() has already cleaned up
14222 ++ * fully acked segments. However, tcp trims partially acked
14223 ++ * segments only when retransmitting. Since MPTCP comes into
14224 ++ * play only now, we will fake an initial transmit, and
14225 ++ * retransmit_skb() will not be called. The following fragment
14226 ++ * comes from __tcp_retransmit_skb().
14227 ++ */
14228 ++ if (before(TCP_SKB_CB(skb)->seq, master_tp->snd_una)) {
14229 ++ BUG_ON(before(TCP_SKB_CB(skb)->end_seq,
14230 ++ master_tp->snd_una));
14231 ++ /* tcp_trim_head can only return ENOMEM if the skb is
14232 ++ * cloned, which is not the case here (see
14233 ++ * tcp_send_syn_data).
14234 ++ */
14235 ++ BUG_ON(tcp_trim_head(meta_sk, skb, master_tp->snd_una -
14236 ++ TCP_SKB_CB(skb)->seq));
14237 ++ }
14238 ++
14239 ++ TCP_SKB_CB(skb)->seq += new_mapping;
14240 ++ TCP_SKB_CB(skb)->end_seq += new_mapping;
14241 ++ }
14242 ++
14243 ++ /* We can advance write_seq by the number of bytes unacknowledged
14244 ++ * and that were mapped in the previous loop.
14245 ++ */
14246 ++ meta_tp->write_seq += master_tp->write_seq - master_tp->snd_una;
14247 ++
14248 ++ /* The packets from the master_sk will be appended to it later.
14249 ++ * Until that time, its write queue is empty, and
14250 ++ * write_seq must align with snd_una.
14251 ++ */
14252 ++ master_tp->snd_nxt = master_tp->write_seq = master_tp->snd_una;
14253 ++ master_tp->packets_out = 0;
14254 ++
14255 ++ /* Although these data have already been sent over the subsk,
14256 ++ * they have never been sent over the meta_sk, so we rewind
14257 ++ * the send_head so that tcp considers it as an initial send
14258 ++ * (instead of a retransmit).
14259 ++ */
14260 ++ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
14261 ++}
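
To make the remapping above concrete, here is a small stand-alone sketch (not from the patch; it models only the sequence bookkeeping, not the actual payload trimming done by tcp_trim_head, and all names and numbers are made up) of how a TFO segment numbered in subflow space is cut back to its unacknowledged part and shifted into data-sequence space:

#include <stdint.h>
#include <stdio.h>

struct seg {
	uint32_t seq;
	uint32_t end_seq;
};

static void remap_tfo_segment(struct seg *s, uint32_t master_snd_una,
			      uint32_t meta_write_seq)
{
	uint32_t new_mapping = meta_write_seq - master_snd_una;

	/* Drop the part the server already acknowledged in the SYN/ACK
	 * (the kernel additionally trims the skb payload itself).
	 */
	if ((int32_t)(s->seq - master_snd_una) < 0)
		s->seq = master_snd_una;

	/* Renumber into the meta (data-sequence) space. */
	s->seq += new_mapping;
	s->end_seq += new_mapping;
}

int main(void)
{
	/* 100 bytes sent on the SYN; the server acked the first 40 of them. */
	struct seg s = { .seq = 1001, .end_seq = 1101 };
	uint32_t master_snd_una = 1041;	/* subflow-level ack point */
	uint32_t meta_write_seq = 1;	/* meta sequence space starts here */

	remap_tfo_segment(&s, master_snd_una, meta_write_seq);
	printf("data-seq range: %u-%u\n",
	       (unsigned)s.seq, (unsigned)s.end_seq);	/* 1-61 */
	return 0;
}
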
14262 ++
14263 ++/* The skptr is needed, because if we become MPTCP-capable, we have to switch
14264 ++ * from meta-socket to master-socket.
14265 ++ *
14266 ++ * @return: 1 - we want to reset this connection
14267 ++ * 2 - we want to discard the received syn/ack
14268 ++ * 0 - everything is fine - continue
14269 ++ */
14270 ++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
14271 ++ const struct sk_buff *skb,
14272 ++ const struct mptcp_options_received *mopt)
14273 ++{
14274 ++ struct tcp_sock *tp = tcp_sk(sk);
14275 ++
14276 ++ if (mptcp(tp)) {
14277 ++ u8 hash_mac_check[20];
14278 ++ struct mptcp_cb *mpcb = tp->mpcb;
14279 ++
14280 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
14281 ++ (u8 *)&mpcb->mptcp_loc_key,
14282 ++ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
14283 ++ (u8 *)&tp->mptcp->mptcp_loc_nonce,
14284 ++ (u32 *)hash_mac_check);
14285 ++ if (memcmp(hash_mac_check,
14286 ++ (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) {
14287 ++ mptcp_sub_force_close(sk);
14288 ++ return 1;
14289 ++ }
14290 ++
14291 ++ /* Set this flag in order to postpone data sending
14292 ++ * until the 4th ack arrives.
14293 ++ */
14294 ++ tp->mptcp->pre_established = 1;
14295 ++ tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio;
14296 ++
14297 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
14298 ++ (u8 *)&mpcb->mptcp_rem_key,
14299 ++ (u8 *)&tp->mptcp->mptcp_loc_nonce,
14300 ++ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
14301 ++ (u32 *)&tp->mptcp->sender_mac[0]);
14302 ++
14303 ++ } else if (mopt->saw_mpc) {
14304 ++ struct sock *meta_sk = sk;
14305 ++
14306 ++ if (mptcp_create_master_sk(sk, mopt->mptcp_key,
14307 ++ ntohs(tcp_hdr(skb)->window)))
14308 ++ return 2;
14309 ++
14310 ++ sk = tcp_sk(sk)->mpcb->master_sk;
14311 ++ *skptr = sk;
14312 ++ tp = tcp_sk(sk);
14313 ++
14314 ++ /* If fastopen was used data might be in the send queue. We
14315 ++ * need to update their sequence number to MPTCP-level seqno.
14316 ++ * Note that it can happen in rare cases that fastopen_req is
14317 ++ * NULL and syn_data is 0 but fastopen indeed occurred and
14318 ++ * data has been queued in the write queue (but not sent).
14319 ++ * Example of such rare cases: connect is non-blocking and
14320 ++ * TFO is configured to work without cookies.
14321 ++ */
14322 ++ if (!skb_queue_empty(&meta_sk->sk_write_queue))
14323 ++ mptcp_rcv_synsent_fastopen(meta_sk);
14324 ++
14325 ++ /* -1, because the SYN consumed 1 byte. In case of TFO, we
14326 ++ * start the subflow-sequence number as if the data of the SYN
14327 ++ * is not part of any mapping.
14328 ++ */
14329 ++ tp->mptcp->snt_isn = tp->snd_una - 1;
14330 ++ tp->mpcb->dss_csum = mopt->dss_csum;
14331 ++ tp->mptcp->include_mpc = 1;
14332 ++
14333 ++ /* Ensure that fastopen is handled at the meta-level. */
14334 ++ tp->fastopen_req = NULL;
14335 ++
14336 ++ sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket);
14337 ++ sk->sk_wq = mptcp_meta_sk(sk)->sk_wq;
14338 ++
14339 ++ /* hold in sk_clone_lock due to initialization to 2 */
14340 ++ sock_put(sk);
14341 ++ } else {
14342 ++ tp->request_mptcp = 0;
14343 ++
14344 ++ if (tp->inside_tk_table)
14345 ++ mptcp_hash_remove(tp);
14346 ++ }
14347 ++
14348 ++ if (mptcp(tp))
14349 ++ tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq;
14350 ++
14351 ++ return 0;
14352 ++}
14353 ++
14354 ++bool mptcp_should_expand_sndbuf(const struct sock *sk)
14355 ++{
14356 ++ const struct sock *sk_it;
14357 ++ const struct sock *meta_sk = mptcp_meta_sk(sk);
14358 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14359 ++ int cnt_backups = 0;
14360 ++ int backup_available = 0;
14361 ++
14362 ++ /* We circumvent this check in tcp_check_space, because we want to
14363 ++ * always call sk_write_space. So, we reproduce the check here.
14364 ++ */
14365 ++ if (!meta_sk->sk_socket ||
14366 ++ !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
14367 ++ return false;
14368 ++
14369 ++ /* If the user specified a specific send buffer setting, do
14370 ++ * not modify it.
14371 ++ */
14372 ++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
14373 ++ return false;
14374 ++
14375 ++ /* If we are under global TCP memory pressure, do not expand. */
14376 ++ if (sk_under_memory_pressure(meta_sk))
14377 ++ return false;
14378 ++
14379 ++ /* If we are under soft global TCP memory pressure, do not expand. */
14380 ++ if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0))
14381 ++ return false;
14382 ++
14383 ++
14384 ++ /* For MPTCP we look for a subsocket that could send data.
14385 ++ * If we found one, then we update the send-buffer.
14386 ++ */
14387 ++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
14388 ++ struct tcp_sock *tp_it = tcp_sk(sk_it);
14389 ++
14390 ++ if (!mptcp_sk_can_send(sk_it))
14391 ++ continue;
14392 ++
14393 ++ /* Backup-flows have to be counted - if there is no other
14394 ++ * subflow we take the backup-flow into account.
14395 ++ */
14396 ++ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio)
14397 ++ cnt_backups++;
14398 ++
14399 ++ if (tp_it->packets_out < tp_it->snd_cwnd) {
14400 ++ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) {
14401 ++ backup_available = 1;
14402 ++ continue;
14403 ++ }
14404 ++ return true;
14405 ++ }
14406 ++ }
14407 ++
14408 ++ /* Backup-flow is available for sending - update send-buffer */
14409 ++ if (meta_tp->mpcb->cnt_established == cnt_backups && backup_available)
14410 ++ return true;
14411 ++ return false;
14412 ++}
14413 ++
14414 ++void mptcp_init_buffer_space(struct sock *sk)
14415 ++{
14416 ++ struct tcp_sock *tp = tcp_sk(sk);
14417 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
14418 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14419 ++ int space;
14420 ++
14421 ++ tcp_init_buffer_space(sk);
14422 ++
14423 ++ if (is_master_tp(tp)) {
14424 ++ meta_tp->rcvq_space.space = meta_tp->rcv_wnd;
14425 ++ meta_tp->rcvq_space.time = tcp_time_stamp;
14426 ++ meta_tp->rcvq_space.seq = meta_tp->copied_seq;
14427 ++
14428 ++ /* If there is only one subflow, we just use regular TCP
14429 ++ * autotuning. User-locks are handled already by
14430 ++ * tcp_init_buffer_space
14431 ++ */
14432 ++ meta_tp->window_clamp = tp->window_clamp;
14433 ++ meta_tp->rcv_ssthresh = tp->rcv_ssthresh;
14434 ++ meta_sk->sk_rcvbuf = sk->sk_rcvbuf;
14435 ++ meta_sk->sk_sndbuf = sk->sk_sndbuf;
14436 ++
14437 ++ return;
14438 ++ }
14439 ++
14440 ++ if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK)
14441 ++ goto snd_buf;
14442 ++
14443 ++ /* Adding a new subflow to the rcv-buffer space. We make a simple
14444 ++ * addition, to give some space to allow traffic on the new subflow.
14445 ++ * Autotuning will increase it further later on.
14446 ++ */
14447 ++ space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]);
14448 ++ if (space > meta_sk->sk_rcvbuf) {
14449 ++ meta_tp->window_clamp += tp->window_clamp;
14450 ++ meta_tp->rcv_ssthresh += tp->rcv_ssthresh;
14451 ++ meta_sk->sk_rcvbuf = space;
14452 ++ }
14453 ++
14454 ++snd_buf:
14455 ++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
14456 ++ return;
14457 ++
14458 ++ /* Adding a new subflow to the send-buffer space. We make a simple
14459 ++ * addition, to give some space to allow traffic on the new subflow.
14460 ++ * Autotuning will increase it further later on.
14461 ++ */
14462 ++ space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, sysctl_tcp_wmem[2]);
14463 ++ if (space > meta_sk->sk_sndbuf) {
14464 ++ meta_sk->sk_sndbuf = space;
14465 ++ meta_sk->sk_write_space(meta_sk);
14466 ++ }
14467 ++}
14468 ++
14469 ++void mptcp_tcp_set_rto(struct sock *sk)
14470 ++{
14471 ++ tcp_set_rto(sk);
14472 ++ mptcp_set_rto(sk);
14473 ++}
14474 +diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c
14475 +new file mode 100644
14476 +index 000000000000..1183d1305d35
14477 +--- /dev/null
14478 ++++ b/net/mptcp/mptcp_ipv4.c
14479 +@@ -0,0 +1,483 @@
14480 ++/*
14481 ++ * MPTCP implementation - IPv4-specific functions
14482 ++ *
14483 ++ * Initial Design & Implementation:
14484 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
14485 ++ *
14486 ++ * Current Maintainer:
14487 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
14488 ++ *
14489 ++ * Additional authors:
14490 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
14491 ++ * Gregory Detal <gregory.detal@×××××××××.be>
14492 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
14493 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
14494 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
14495 ++ * Andreas Ripke <ripke@××××××.eu>
14496 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
14497 ++ * Octavian Purdila <octavian.purdila@×××××.com>
14498 ++ * John Ronan <jronan@××××.org>
14499 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
14500 ++ * Brandon Heller <brandonh@××××××××.edu>
14501 ++ *
14502 ++ *
14503 ++ * This program is free software; you can redistribute it and/or
14504 ++ * modify it under the terms of the GNU General Public License
14505 ++ * as published by the Free Software Foundation; either version
14506 ++ * 2 of the License, or (at your option) any later version.
14507 ++ */
14508 ++
14509 ++#include <linux/export.h>
14510 ++#include <linux/ip.h>
14511 ++#include <linux/list.h>
14512 ++#include <linux/skbuff.h>
14513 ++#include <linux/spinlock.h>
14514 ++#include <linux/tcp.h>
14515 ++
14516 ++#include <net/inet_common.h>
14517 ++#include <net/inet_connection_sock.h>
14518 ++#include <net/mptcp.h>
14519 ++#include <net/mptcp_v4.h>
14520 ++#include <net/request_sock.h>
14521 ++#include <net/tcp.h>
14522 ++
14523 ++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
14524 ++{
14525 ++ u32 hash[MD5_DIGEST_WORDS];
14526 ++
14527 ++ hash[0] = (__force u32)saddr;
14528 ++ hash[1] = (__force u32)daddr;
14529 ++ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
14530 ++ hash[3] = mptcp_seed++;
14531 ++
14532 ++ md5_transform(hash, mptcp_secret);
14533 ++
14534 ++ return hash[0];
14535 ++}
14536 ++
14537 ++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
14538 ++{
14539 ++ u32 hash[MD5_DIGEST_WORDS];
14540 ++
14541 ++ hash[0] = (__force u32)saddr;
14542 ++ hash[1] = (__force u32)daddr;
14543 ++ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
14544 ++ hash[3] = mptcp_seed++;
14545 ++
14546 ++ md5_transform(hash, mptcp_secret);
14547 ++
14548 ++ return *((u64 *)hash);
14549 ++}
14550 ++
14551 ++
14552 ++static void mptcp_v4_reqsk_destructor(struct request_sock *req)
14553 ++{
14554 ++ mptcp_reqsk_destructor(req);
14555 ++
14556 ++ tcp_v4_reqsk_destructor(req);
14557 ++}
14558 ++
14559 ++static int mptcp_v4_init_req(struct request_sock *req, struct sock *sk,
14560 ++ struct sk_buff *skb)
14561 ++{
14562 ++ tcp_request_sock_ipv4_ops.init_req(req, sk, skb);
14563 ++ mptcp_reqsk_init(req, skb);
14564 ++
14565 ++ return 0;
14566 ++}
14567 ++
14568 ++static int mptcp_v4_join_init_req(struct request_sock *req, struct sock *sk,
14569 ++ struct sk_buff *skb)
14570 ++{
14571 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
14572 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
14573 ++ union inet_addr addr;
14574 ++ int loc_id;
14575 ++ bool low_prio = false;
14576 ++
14577 ++ /* We need to do this as early as possible, because if we fail later
14578 ++ * (e.g., in get_local_id), reqsk_free tries to remove the
14579 ++ * request-socket from the htb in mptcp_hash_request_remove, as pprev
14580 ++ * may be different from NULL.
14581 ++ */
14582 ++ mtreq->hash_entry.pprev = NULL;
14583 ++
14584 ++ tcp_request_sock_ipv4_ops.init_req(req, sk, skb);
14585 ++
14586 ++ mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(ip_hdr(skb)->saddr,
14587 ++ ip_hdr(skb)->daddr,
14588 ++ tcp_hdr(skb)->source,
14589 ++ tcp_hdr(skb)->dest);
14590 ++ addr.ip = inet_rsk(req)->ir_loc_addr;
14591 ++ loc_id = mpcb->pm_ops->get_local_id(AF_INET, &addr, sock_net(sk), &low_prio);
14592 ++ if (loc_id == -1)
14593 ++ return -1;
14594 ++ mtreq->loc_id = loc_id;
14595 ++ mtreq->low_prio = low_prio;
14596 ++
14597 ++ mptcp_join_reqsk_init(mpcb, req, skb);
14598 ++
14599 ++ return 0;
14600 ++}
14601 ++
14602 ++/* Similar to tcp_request_sock_ops */
14603 ++struct request_sock_ops mptcp_request_sock_ops __read_mostly = {
14604 ++ .family = PF_INET,
14605 ++ .obj_size = sizeof(struct mptcp_request_sock),
14606 ++ .rtx_syn_ack = tcp_rtx_synack,
14607 ++ .send_ack = tcp_v4_reqsk_send_ack,
14608 ++ .destructor = mptcp_v4_reqsk_destructor,
14609 ++ .send_reset = tcp_v4_send_reset,
14610 ++ .syn_ack_timeout = tcp_syn_ack_timeout,
14611 ++};
14612 ++
14613 ++static void mptcp_v4_reqsk_queue_hash_add(struct sock *meta_sk,
14614 ++ struct request_sock *req,
14615 ++ const unsigned long timeout)
14616 ++{
14617 ++ const u32 h1 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
14618 ++ inet_rsk(req)->ir_rmt_port,
14619 ++ 0, MPTCP_HASH_SIZE);
14620 ++ /* We cannot call inet_csk_reqsk_queue_hash_add(), because we do not
14621 ++ * want to reset the keepalive-timer (responsible for retransmitting
14622 ++ * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
14623 ++ * overload the keepalive timer. Also, it's not a big deal, because the
14624 ++ * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
14625 ++ * if the third ACK gets lost, the client will handle the retransmission
14626 ++	 * anyway. If our SYN/ACK gets lost, the client will retransmit the
14627 ++ * SYN.
14628 ++ */
14629 ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
14630 ++ struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
14631 ++ const u32 h2 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
14632 ++ inet_rsk(req)->ir_rmt_port,
14633 ++ lopt->hash_rnd, lopt->nr_table_entries);
14634 ++
14635 ++ reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
14636 ++ if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0)
14637 ++ mptcp_reset_synack_timer(meta_sk, timeout);
14638 ++
14639 ++ rcu_read_lock();
14640 ++ spin_lock(&mptcp_reqsk_hlock);
14641 ++ hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]);
14642 ++ spin_unlock(&mptcp_reqsk_hlock);
14643 ++ rcu_read_unlock();
14644 ++}
14645 ++
14646 ++/* Similar to tcp_v4_conn_request */
14647 ++static int mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb)
14648 ++{
14649 ++ return tcp_conn_request(&mptcp_request_sock_ops,
14650 ++ &mptcp_join_request_sock_ipv4_ops,
14651 ++ meta_sk, skb);
14652 ++}
14653 ++
14654 ++/* We only process join requests here (either the SYN or the final ACK). */
14655 ++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
14656 ++{
14657 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
14658 ++ struct sock *child, *rsk = NULL;
14659 ++ int ret;
14660 ++
14661 ++ if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
14662 ++ struct tcphdr *th = tcp_hdr(skb);
14663 ++ const struct iphdr *iph = ip_hdr(skb);
14664 ++ struct sock *sk;
14665 ++
14666 ++ sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
14667 ++ iph->saddr, th->source, iph->daddr,
14668 ++ th->dest, inet_iif(skb));
14669 ++
14670 ++ if (!sk) {
14671 ++ kfree_skb(skb);
14672 ++ return 0;
14673 ++ }
14674 ++ if (is_meta_sk(sk)) {
14675 ++			WARN("%s Did not find a sub-sk - but found the meta!\n", __func__);
14676 ++ kfree_skb(skb);
14677 ++ sock_put(sk);
14678 ++ return 0;
14679 ++ }
14680 ++
14681 ++ if (sk->sk_state == TCP_TIME_WAIT) {
14682 ++ inet_twsk_put(inet_twsk(sk));
14683 ++ kfree_skb(skb);
14684 ++ return 0;
14685 ++ }
14686 ++
14687 ++ ret = tcp_v4_do_rcv(sk, skb);
14688 ++ sock_put(sk);
14689 ++
14690 ++ return ret;
14691 ++ }
14692 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
14693 ++
14694 ++ /* Has been removed from the tk-table. Thus, no new subflows.
14695 ++ *
14696 ++ * Check for close-state is necessary, because we may have been closed
14697 ++ * without passing by mptcp_close().
14698 ++ *
14699 ++ * When falling back, no new subflows are allowed either.
14700 ++ */
14701 ++ if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
14702 ++ mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
14703 ++ goto reset_and_discard;
14704 ++
14705 ++ child = tcp_v4_hnd_req(meta_sk, skb);
14706 ++
14707 ++ if (!child)
14708 ++ goto discard;
14709 ++
14710 ++ if (child != meta_sk) {
14711 ++ sock_rps_save_rxhash(child, skb);
14712 ++ /* We don't call tcp_child_process here, because we hold
14713 ++ * already the meta-sk-lock and are sure that it is not owned
14714 ++ * by the user.
14715 ++ */
14716 ++ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
14717 ++ bh_unlock_sock(child);
14718 ++ sock_put(child);
14719 ++ if (ret) {
14720 ++ rsk = child;
14721 ++ goto reset_and_discard;
14722 ++ }
14723 ++ } else {
14724 ++ if (tcp_hdr(skb)->syn) {
14725 ++ mptcp_v4_join_request(meta_sk, skb);
14726 ++ goto discard;
14727 ++ }
14728 ++ goto reset_and_discard;
14729 ++ }
14730 ++ return 0;
14731 ++
14732 ++reset_and_discard:
14733 ++ if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) {
14734 ++ const struct tcphdr *th = tcp_hdr(skb);
14735 ++ const struct iphdr *iph = ip_hdr(skb);
14736 ++ struct request_sock **prev, *req;
14737 ++ /* If we end up here, it means we should not have matched on the
14738 ++ * request-socket. But, because the request-sock queue is only
14739 ++ * destroyed in mptcp_close, the socket may actually already be
14740 ++ * in close-state (e.g., through shutdown()) while still having
14741 ++ * pending request sockets.
14742 ++ */
14743 ++ req = inet_csk_search_req(meta_sk, &prev, th->source,
14744 ++ iph->saddr, iph->daddr);
14745 ++ if (req) {
14746 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
14747 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue,
14748 ++ req);
14749 ++ reqsk_free(req);
14750 ++ }
14751 ++ }
14752 ++
14753 ++ tcp_v4_send_reset(rsk, skb);
14754 ++discard:
14755 ++ kfree_skb(skb);
14756 ++ return 0;
14757 ++}
14758 ++
14759 ++/* After this, the ref count of the meta_sk associated with the request_sock
14760 ++ * is incremented. Thus it is the responsibility of the caller
14761 ++ * to call sock_put() when the reference is not needed anymore.
14762 ++ */
14763 ++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
14764 ++ const __be32 laddr, const struct net *net)
14765 ++{
14766 ++ const struct mptcp_request_sock *mtreq;
14767 ++ struct sock *meta_sk = NULL;
14768 ++ const struct hlist_nulls_node *node;
14769 ++ const u32 hash = inet_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE);
14770 ++
14771 ++ rcu_read_lock();
14772 ++begin:
14773 ++ hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash],
14774 ++ hash_entry) {
14775 ++ struct inet_request_sock *ireq = inet_rsk(rev_mptcp_rsk(mtreq));
14776 ++ meta_sk = mtreq->mptcp_mpcb->meta_sk;
14777 ++
14778 ++ if (ireq->ir_rmt_port == rport &&
14779 ++ ireq->ir_rmt_addr == raddr &&
14780 ++ ireq->ir_loc_addr == laddr &&
14781 ++ rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET &&
14782 ++ net_eq(net, sock_net(meta_sk)))
14783 ++ goto found;
14784 ++ meta_sk = NULL;
14785 ++ }
14786 ++ /* A request-socket is destroyed by RCU. So, it might have been recycled
14787 ++ * and put into another hash-table list. So, after the lookup we may
14788 ++ * end up in a different list. So, we may need to restart.
14789 ++ *
14790 ++ * See also the comment in __inet_lookup_established.
14791 ++ */
14792 ++ if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE)
14793 ++ goto begin;
14794 ++
14795 ++found:
14796 ++ if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
14797 ++ meta_sk = NULL;
14798 ++ rcu_read_unlock();
14799 ++
14800 ++ return meta_sk;
14801 ++}
14802 ++
14803 ++/* Create a new IPv4 subflow.
14804 ++ *
14805 ++ * We are in user-context and the meta-sock lock is held.
14806 ++ */
14807 ++int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
14808 ++ struct mptcp_rem4 *rem)
14809 ++{
14810 ++ struct tcp_sock *tp;
14811 ++ struct sock *sk;
14812 ++ struct sockaddr_in loc_in, rem_in;
14813 ++ struct socket sock;
14814 ++ int ret;
14815 ++
14816 ++ /** First, create and prepare the new socket */
14817 ++
14818 ++ sock.type = meta_sk->sk_socket->type;
14819 ++ sock.state = SS_UNCONNECTED;
14820 ++ sock.wq = meta_sk->sk_socket->wq;
14821 ++ sock.file = meta_sk->sk_socket->file;
14822 ++ sock.ops = NULL;
14823 ++
14824 ++ ret = inet_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
14825 ++ if (unlikely(ret < 0)) {
14826 ++ mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret);
14827 ++ return ret;
14828 ++ }
14829 ++
14830 ++ sk = sock.sk;
14831 ++ tp = tcp_sk(sk);
14832 ++
14833 ++ /* All subsockets need the MPTCP-lock-class */
14834 ++ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
14835 ++ lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
14836 ++
14837 ++ if (mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL))
14838 ++ goto error;
14839 ++
14840 ++ tp->mptcp->slave_sk = 1;
14841 ++ tp->mptcp->low_prio = loc->low_prio;
14842 ++
14843 ++ /* Initializing the timer for an MPTCP subflow */
14844 ++ setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
14845 ++
14846 ++ /** Then, connect the socket to the peer */
14847 ++ loc_in.sin_family = AF_INET;
14848 ++ rem_in.sin_family = AF_INET;
14849 ++ loc_in.sin_port = 0;
14850 ++ if (rem->port)
14851 ++ rem_in.sin_port = rem->port;
14852 ++ else
14853 ++ rem_in.sin_port = inet_sk(meta_sk)->inet_dport;
14854 ++ loc_in.sin_addr = loc->addr;
14855 ++ rem_in.sin_addr = rem->addr;
14856 ++
14857 ++ ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in));
14858 ++ if (ret < 0) {
14859 ++ mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
14860 ++ __func__, ret);
14861 ++ goto error;
14862 ++ }
14863 ++
14864 ++ mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d\n",
14865 ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
14866 ++ tp->mptcp->path_index, &loc_in.sin_addr,
14867 ++ ntohs(loc_in.sin_port), &rem_in.sin_addr,
14868 ++ ntohs(rem_in.sin_port));
14869 ++
14870 ++ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4)
14871 ++ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr);
14872 ++
14873 ++ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
14874 ++ sizeof(struct sockaddr_in), O_NONBLOCK);
14875 ++ if (ret < 0 && ret != -EINPROGRESS) {
14876 ++ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
14877 ++ __func__, ret);
14878 ++ goto error;
14879 ++ }
14880 ++
14881 ++ sk_set_socket(sk, meta_sk->sk_socket);
14882 ++ sk->sk_wq = meta_sk->sk_wq;
14883 ++
14884 ++ return 0;
14885 ++
14886 ++error:
14887 ++ /* May happen if mptcp_add_sock fails first */
14888 ++ if (!mptcp(tp)) {
14889 ++ tcp_close(sk, 0);
14890 ++ } else {
14891 ++ local_bh_disable();
14892 ++ mptcp_sub_force_close(sk);
14893 ++ local_bh_enable();
14894 ++ }
14895 ++ return ret;
14896 ++}
14897 ++EXPORT_SYMBOL(mptcp_init4_subsockets);
14898 ++
14899 ++const struct inet_connection_sock_af_ops mptcp_v4_specific = {
14900 ++ .queue_xmit = ip_queue_xmit,
14901 ++ .send_check = tcp_v4_send_check,
14902 ++ .rebuild_header = inet_sk_rebuild_header,
14903 ++ .sk_rx_dst_set = inet_sk_rx_dst_set,
14904 ++ .conn_request = mptcp_conn_request,
14905 ++ .syn_recv_sock = tcp_v4_syn_recv_sock,
14906 ++ .net_header_len = sizeof(struct iphdr),
14907 ++ .setsockopt = ip_setsockopt,
14908 ++ .getsockopt = ip_getsockopt,
14909 ++ .addr2sockaddr = inet_csk_addr2sockaddr,
14910 ++ .sockaddr_len = sizeof(struct sockaddr_in),
14911 ++ .bind_conflict = inet_csk_bind_conflict,
14912 ++#ifdef CONFIG_COMPAT
14913 ++ .compat_setsockopt = compat_ip_setsockopt,
14914 ++ .compat_getsockopt = compat_ip_getsockopt,
14915 ++#endif
14916 ++};
14917 ++
14918 ++struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
14919 ++struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
14920 ++
14921 ++/* General initialization of IPv4 for MPTCP */
14922 ++int mptcp_pm_v4_init(void)
14923 ++{
14924 ++ int ret = 0;
14925 ++ struct request_sock_ops *ops = &mptcp_request_sock_ops;
14926 ++
14927 ++ mptcp_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
14928 ++ mptcp_request_sock_ipv4_ops.init_req = mptcp_v4_init_req;
14929 ++
14930 ++ mptcp_join_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
14931 ++ mptcp_join_request_sock_ipv4_ops.init_req = mptcp_v4_join_init_req;
14932 ++ mptcp_join_request_sock_ipv4_ops.queue_hash_add = mptcp_v4_reqsk_queue_hash_add;
14933 ++
14934 ++ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP");
14935 ++ if (ops->slab_name == NULL) {
14936 ++ ret = -ENOMEM;
14937 ++ goto out;
14938 ++ }
14939 ++
14940 ++ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
14941 ++ SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
14942 ++ NULL);
14943 ++
14944 ++ if (ops->slab == NULL) {
14945 ++ ret = -ENOMEM;
14946 ++ goto err_reqsk_create;
14947 ++ }
14948 ++
14949 ++out:
14950 ++ return ret;
14951 ++
14952 ++err_reqsk_create:
14953 ++ kfree(ops->slab_name);
14954 ++ ops->slab_name = NULL;
14955 ++ goto out;
14956 ++}
14957 ++
14958 ++void mptcp_pm_v4_undo(void)
14959 ++{
14960 ++ kmem_cache_destroy(mptcp_request_sock_ops.slab);
14961 ++ kfree(mptcp_request_sock_ops.slab_name);
14962 ++}
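
The file above ends with mptcp_init4_subsockets(), the entry point a path manager uses to open an additional IPv4 subflow on an established MPTCP connection. As a reading aid, a minimal caller looks roughly like the sketch below; it is distilled from the ndiffports worker further down in this patch, and the function name and address-id values are illustrative, not taken from the patch:

    /* Sketch: open one extra IPv4 subflow from a path-manager context.
     * Assumes the meta-sk lock is held in user context, as the comment
     * above mptcp_init4_subsockets() requires.
     */
    static void example_open_extra_v4_subflow(struct sock *meta_sk)
    {
            struct mptcp_loc4 loc;
            struct mptcp_rem4 rem;

            loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr; /* local address to bind */
            loc.loc4_id = 0;                                /* illustrative address id */
            loc.low_prio = 0;                               /* regular-priority subflow */

            rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr; /* peer address */
            rem.port = inet_sk(meta_sk)->inet_dport;        /* reuse the peer's port */
            rem.rem4_id = 0;                                /* id 0: peer's initial address */

            mptcp_init4_subsockets(meta_sk, &loc, &rem);    /* sends the SYN carrying MP_JOIN */
    }

The bind()/connect() pair inside mptcp_init4_subsockets() then performs the join handshake on the new socket while the meta-socket keeps carrying the data stream.
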
14963 +diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c
14964 +new file mode 100644
14965 +index 000000000000..1036973aa855
14966 +--- /dev/null
14967 ++++ b/net/mptcp/mptcp_ipv6.c
14968 +@@ -0,0 +1,518 @@
14969 ++/*
14970 ++ * MPTCP implementation - IPv6-specific functions
14971 ++ *
14972 ++ * Initial Design & Implementation:
14973 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
14974 ++ *
14975 ++ * Current Maintainer:
14976 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
14977 ++ *
14978 ++ * Additional authors:
14979 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
14980 ++ * Gregory Detal <gregory.detal@×××××××××.be>
14981 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
14982 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
14983 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
14984 ++ * Andreas Ripke <ripke@××××××.eu>
14985 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
14986 ++ * Octavian Purdila <octavian.purdila@×××××.com>
14987 ++ * John Ronan <jronan@××××.org>
14988 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
14989 ++ * Brandon Heller <brandonh@××××××××.edu>
14990 ++ *
14991 ++ *
14992 ++ * This program is free software; you can redistribute it and/or
14993 ++ * modify it under the terms of the GNU General Public License
14994 ++ * as published by the Free Software Foundation; either version
14995 ++ * 2 of the License, or (at your option) any later version.
14996 ++ */
14997 ++
14998 ++#include <linux/export.h>
14999 ++#include <linux/in6.h>
15000 ++#include <linux/kernel.h>
15001 ++
15002 ++#include <net/addrconf.h>
15003 ++#include <net/flow.h>
15004 ++#include <net/inet6_connection_sock.h>
15005 ++#include <net/inet6_hashtables.h>
15006 ++#include <net/inet_common.h>
15007 ++#include <net/ipv6.h>
15008 ++#include <net/ip6_checksum.h>
15009 ++#include <net/ip6_route.h>
15010 ++#include <net/mptcp.h>
15011 ++#include <net/mptcp_v6.h>
15012 ++#include <net/tcp.h>
15013 ++#include <net/transp_v6.h>
15014 ++
15015 ++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
15016 ++ __be16 sport, __be16 dport)
15017 ++{
15018 ++ u32 secret[MD5_MESSAGE_BYTES / 4];
15019 ++ u32 hash[MD5_DIGEST_WORDS];
15020 ++ u32 i;
15021 ++
15022 ++ memcpy(hash, saddr, 16);
15023 ++ for (i = 0; i < 4; i++)
15024 ++ secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
15025 ++ secret[4] = mptcp_secret[4] +
15026 ++ (((__force u16)sport << 16) + (__force u16)dport);
15027 ++ secret[5] = mptcp_seed++;
15028 ++ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
15029 ++ secret[i] = mptcp_secret[i];
15030 ++
15031 ++ md5_transform(hash, secret);
15032 ++
15033 ++ return hash[0];
15034 ++}
15035 ++
15036 ++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
15037 ++ __be16 sport, __be16 dport)
15038 ++{
15039 ++ u32 secret[MD5_MESSAGE_BYTES / 4];
15040 ++ u32 hash[MD5_DIGEST_WORDS];
15041 ++ u32 i;
15042 ++
15043 ++ memcpy(hash, saddr, 16);
15044 ++ for (i = 0; i < 4; i++)
15045 ++ secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
15046 ++ secret[4] = mptcp_secret[4] +
15047 ++ (((__force u16)sport << 16) + (__force u16)dport);
15048 ++ secret[5] = mptcp_seed++;
15049 ++ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
15050 ++ secret[i] = mptcp_secret[i];
15051 ++
15052 ++ md5_transform(hash, secret);
15053 ++
15054 ++ return *((u64 *)hash);
15055 ++}
15056 ++
15057 ++static void mptcp_v6_reqsk_destructor(struct request_sock *req)
15058 ++{
15059 ++ mptcp_reqsk_destructor(req);
15060 ++
15061 ++ tcp_v6_reqsk_destructor(req);
15062 ++}
15063 ++
15064 ++static int mptcp_v6_init_req(struct request_sock *req, struct sock *sk,
15065 ++ struct sk_buff *skb)
15066 ++{
15067 ++ tcp_request_sock_ipv6_ops.init_req(req, sk, skb);
15068 ++ mptcp_reqsk_init(req, skb);
15069 ++
15070 ++ return 0;
15071 ++}
15072 ++
15073 ++static int mptcp_v6_join_init_req(struct request_sock *req, struct sock *sk,
15074 ++ struct sk_buff *skb)
15075 ++{
15076 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
15077 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
15078 ++ union inet_addr addr;
15079 ++ int loc_id;
15080 ++ bool low_prio = false;
15081 ++
15082 ++	/* We need to do this as early as possible, because if we fail later
15083 ++	 * (e.g., in get_local_id), reqsk_free tries to remove the
15084 ++	 * request-socket from the htb in mptcp_hash_request_remove whenever
15085 ++	 * pprev is different from NULL.
15086 ++	 */
15087 ++ mtreq->hash_entry.pprev = NULL;
15088 ++
15089 ++ tcp_request_sock_ipv6_ops.init_req(req, sk, skb);
15090 ++
15091 ++ mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->saddr.s6_addr32,
15092 ++ ipv6_hdr(skb)->daddr.s6_addr32,
15093 ++ tcp_hdr(skb)->source,
15094 ++ tcp_hdr(skb)->dest);
15095 ++ addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
15096 ++ loc_id = mpcb->pm_ops->get_local_id(AF_INET6, &addr, sock_net(sk), &low_prio);
15097 ++ if (loc_id == -1)
15098 ++ return -1;
15099 ++ mtreq->loc_id = loc_id;
15100 ++ mtreq->low_prio = low_prio;
15101 ++
15102 ++ mptcp_join_reqsk_init(mpcb, req, skb);
15103 ++
15104 ++ return 0;
15105 ++}
15106 ++
15107 ++/* Similar to tcp6_request_sock_ops */
15108 ++struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
15109 ++ .family = AF_INET6,
15110 ++ .obj_size = sizeof(struct mptcp_request_sock),
15111 ++ .rtx_syn_ack = tcp_v6_rtx_synack,
15112 ++ .send_ack = tcp_v6_reqsk_send_ack,
15113 ++ .destructor = mptcp_v6_reqsk_destructor,
15114 ++ .send_reset = tcp_v6_send_reset,
15115 ++ .syn_ack_timeout = tcp_syn_ack_timeout,
15116 ++};
15117 ++
15118 ++static void mptcp_v6_reqsk_queue_hash_add(struct sock *meta_sk,
15119 ++ struct request_sock *req,
15120 ++ const unsigned long timeout)
15121 ++{
15122 ++ const u32 h1 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
15123 ++ inet_rsk(req)->ir_rmt_port,
15124 ++ 0, MPTCP_HASH_SIZE);
15125 ++ /* We cannot call inet6_csk_reqsk_queue_hash_add(), because we do not
15126 ++ * want to reset the keepalive-timer (responsible for retransmitting
15127 ++ * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
15128 ++ * overload the keepalive timer. Also, it's not a big deal, because the
15129 ++ * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
15130 ++ * if the third ACK gets lost, the client will handle the retransmission
15131 ++	 * anyway. If our SYN/ACK gets lost, the client will retransmit the
15132 ++ * SYN.
15133 ++ */
15134 ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
15135 ++ struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
15136 ++ const u32 h2 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
15137 ++ inet_rsk(req)->ir_rmt_port,
15138 ++ lopt->hash_rnd, lopt->nr_table_entries);
15139 ++
15140 ++ reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
15141 ++ if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0)
15142 ++ mptcp_reset_synack_timer(meta_sk, timeout);
15143 ++
15144 ++ rcu_read_lock();
15145 ++ spin_lock(&mptcp_reqsk_hlock);
15146 ++ hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]);
15147 ++ spin_unlock(&mptcp_reqsk_hlock);
15148 ++ rcu_read_unlock();
15149 ++}
15150 ++
15151 ++static int mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
15152 ++{
15153 ++ return tcp_conn_request(&mptcp6_request_sock_ops,
15154 ++ &mptcp_join_request_sock_ipv6_ops,
15155 ++ meta_sk, skb);
15156 ++}
15157 ++
15158 ++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
15159 ++{
15160 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
15161 ++ struct sock *child, *rsk = NULL;
15162 ++ int ret;
15163 ++
15164 ++ if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
15165 ++ struct tcphdr *th = tcp_hdr(skb);
15166 ++ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
15167 ++ struct sock *sk;
15168 ++
15169 ++ sk = __inet6_lookup_established(sock_net(meta_sk),
15170 ++ &tcp_hashinfo,
15171 ++ &ip6h->saddr, th->source,
15172 ++ &ip6h->daddr, ntohs(th->dest),
15173 ++ inet6_iif(skb));
15174 ++
15175 ++ if (!sk) {
15176 ++ kfree_skb(skb);
15177 ++ return 0;
15178 ++ }
15179 ++ if (is_meta_sk(sk)) {
15180 ++ WARN("%s Did not find a sub-sk!\n", __func__);
15181 ++ kfree_skb(skb);
15182 ++ sock_put(sk);
15183 ++ return 0;
15184 ++ }
15185 ++
15186 ++ if (sk->sk_state == TCP_TIME_WAIT) {
15187 ++ inet_twsk_put(inet_twsk(sk));
15188 ++ kfree_skb(skb);
15189 ++ return 0;
15190 ++ }
15191 ++
15192 ++ ret = tcp_v6_do_rcv(sk, skb);
15193 ++ sock_put(sk);
15194 ++
15195 ++ return ret;
15196 ++ }
15197 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
15198 ++
15199 ++ /* Has been removed from the tk-table. Thus, no new subflows.
15200 ++ *
15201 ++ * Check for close-state is necessary, because we may have been closed
15202 ++ * without passing by mptcp_close().
15203 ++ *
15204 ++ * When falling back, no new subflows are allowed either.
15205 ++ */
15206 ++ if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
15207 ++ mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
15208 ++ goto reset_and_discard;
15209 ++
15210 ++ child = tcp_v6_hnd_req(meta_sk, skb);
15211 ++
15212 ++ if (!child)
15213 ++ goto discard;
15214 ++
15215 ++ if (child != meta_sk) {
15216 ++ sock_rps_save_rxhash(child, skb);
15217 ++ /* We don't call tcp_child_process here, because we hold
15218 ++ * already the meta-sk-lock and are sure that it is not owned
15219 ++ * by the user.
15220 ++ */
15221 ++ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
15222 ++ bh_unlock_sock(child);
15223 ++ sock_put(child);
15224 ++ if (ret) {
15225 ++ rsk = child;
15226 ++ goto reset_and_discard;
15227 ++ }
15228 ++ } else {
15229 ++ if (tcp_hdr(skb)->syn) {
15230 ++ mptcp_v6_join_request(meta_sk, skb);
15231 ++ goto discard;
15232 ++ }
15233 ++ goto reset_and_discard;
15234 ++ }
15235 ++ return 0;
15236 ++
15237 ++reset_and_discard:
15238 ++ if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) {
15239 ++ const struct tcphdr *th = tcp_hdr(skb);
15240 ++ struct request_sock **prev, *req;
15241 ++ /* If we end up here, it means we should not have matched on the
15242 ++ * request-socket. But, because the request-sock queue is only
15243 ++ * destroyed in mptcp_close, the socket may actually already be
15244 ++ * in close-state (e.g., through shutdown()) while still having
15245 ++ * pending request sockets.
15246 ++ */
15247 ++ req = inet6_csk_search_req(meta_sk, &prev, th->source,
15248 ++ &ipv6_hdr(skb)->saddr,
15249 ++ &ipv6_hdr(skb)->daddr, inet6_iif(skb));
15250 ++ if (req) {
15251 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
15252 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue,
15253 ++ req);
15254 ++ reqsk_free(req);
15255 ++ }
15256 ++ }
15257 ++
15258 ++ tcp_v6_send_reset(rsk, skb);
15259 ++discard:
15260 ++ kfree_skb(skb);
15261 ++ return 0;
15262 ++}
15263 ++
15264 ++/* After this, the ref count of the meta_sk associated with the request_sock
15265 ++ * is incremented. Thus it is the responsibility of the caller
15266 ++ * to call sock_put() when the reference is not needed anymore.
15267 ++ */
15268 ++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
15269 ++ const struct in6_addr *laddr, const struct net *net)
15270 ++{
15271 ++ const struct mptcp_request_sock *mtreq;
15272 ++ struct sock *meta_sk = NULL;
15273 ++ const struct hlist_nulls_node *node;
15274 ++ const u32 hash = inet6_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE);
15275 ++
15276 ++ rcu_read_lock();
15277 ++begin:
15278 ++ hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash],
15279 ++ hash_entry) {
15280 ++ struct inet_request_sock *treq = inet_rsk(rev_mptcp_rsk(mtreq));
15281 ++ meta_sk = mtreq->mptcp_mpcb->meta_sk;
15282 ++
15283 ++ if (inet_rsk(rev_mptcp_rsk(mtreq))->ir_rmt_port == rport &&
15284 ++ rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET6 &&
15285 ++ ipv6_addr_equal(&treq->ir_v6_rmt_addr, raddr) &&
15286 ++ ipv6_addr_equal(&treq->ir_v6_loc_addr, laddr) &&
15287 ++ net_eq(net, sock_net(meta_sk)))
15288 ++ goto found;
15289 ++ meta_sk = NULL;
15290 ++ }
15291 ++ /* A request-socket is destroyed by RCU. So, it might have been recycled
15292 ++ * and put into another hash-table list. So, after the lookup we may
15293 ++ * end up in a different list. So, we may need to restart.
15294 ++ *
15295 ++ * See also the comment in __inet_lookup_established.
15296 ++ */
15297 ++ if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE)
15298 ++ goto begin;
15299 ++
15300 ++found:
15301 ++ if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
15302 ++ meta_sk = NULL;
15303 ++ rcu_read_unlock();
15304 ++
15305 ++ return meta_sk;
15306 ++}
15307 ++
15308 ++/* Create a new IPv6 subflow.
15309 ++ *
15310 ++ * We are in user-context and the meta-sock lock is held.
15311 ++ */
15312 ++int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
15313 ++ struct mptcp_rem6 *rem)
15314 ++{
15315 ++ struct tcp_sock *tp;
15316 ++ struct sock *sk;
15317 ++ struct sockaddr_in6 loc_in, rem_in;
15318 ++ struct socket sock;
15319 ++ int ret;
15320 ++
15321 ++ /** First, create and prepare the new socket */
15322 ++
15323 ++ sock.type = meta_sk->sk_socket->type;
15324 ++ sock.state = SS_UNCONNECTED;
15325 ++ sock.wq = meta_sk->sk_socket->wq;
15326 ++ sock.file = meta_sk->sk_socket->file;
15327 ++ sock.ops = NULL;
15328 ++
15329 ++ ret = inet6_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
15330 ++ if (unlikely(ret < 0)) {
15331 ++ mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret);
15332 ++ return ret;
15333 ++ }
15334 ++
15335 ++ sk = sock.sk;
15336 ++ tp = tcp_sk(sk);
15337 ++
15338 ++ /* All subsockets need the MPTCP-lock-class */
15339 ++ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
15340 ++ lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
15341 ++
15342 ++ if (mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL))
15343 ++ goto error;
15344 ++
15345 ++ tp->mptcp->slave_sk = 1;
15346 ++ tp->mptcp->low_prio = loc->low_prio;
15347 ++
15348 ++ /* Initializing the timer for an MPTCP subflow */
15349 ++ setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
15350 ++
15351 ++ /** Then, connect the socket to the peer */
15352 ++ loc_in.sin6_family = AF_INET6;
15353 ++ rem_in.sin6_family = AF_INET6;
15354 ++ loc_in.sin6_port = 0;
15355 ++ if (rem->port)
15356 ++ rem_in.sin6_port = rem->port;
15357 ++ else
15358 ++ rem_in.sin6_port = inet_sk(meta_sk)->inet_dport;
15359 ++ loc_in.sin6_addr = loc->addr;
15360 ++ rem_in.sin6_addr = rem->addr;
15361 ++
15362 ++ ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in6));
15363 ++ if (ret < 0) {
15364 ++		mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
15365 ++ __func__, ret);
15366 ++ goto error;
15367 ++ }
15368 ++
15369 ++ mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d\n",
15370 ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
15371 ++ tp->mptcp->path_index, &loc_in.sin6_addr,
15372 ++ ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
15373 ++ ntohs(rem_in.sin6_port));
15374 ++
15375 ++ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6)
15376 ++ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr);
15377 ++
15378 ++ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
15379 ++ sizeof(struct sockaddr_in6), O_NONBLOCK);
15380 ++ if (ret < 0 && ret != -EINPROGRESS) {
15381 ++ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
15382 ++ __func__, ret);
15383 ++ goto error;
15384 ++ }
15385 ++
15386 ++ sk_set_socket(sk, meta_sk->sk_socket);
15387 ++ sk->sk_wq = meta_sk->sk_wq;
15388 ++
15389 ++ return 0;
15390 ++
15391 ++error:
15392 ++ /* May happen if mptcp_add_sock fails first */
15393 ++ if (!mptcp(tp)) {
15394 ++ tcp_close(sk, 0);
15395 ++ } else {
15396 ++ local_bh_disable();
15397 ++ mptcp_sub_force_close(sk);
15398 ++ local_bh_enable();
15399 ++ }
15400 ++ return ret;
15401 ++}
15402 ++EXPORT_SYMBOL(mptcp_init6_subsockets);
15403 ++
15404 ++const struct inet_connection_sock_af_ops mptcp_v6_specific = {
15405 ++ .queue_xmit = inet6_csk_xmit,
15406 ++ .send_check = tcp_v6_send_check,
15407 ++ .rebuild_header = inet6_sk_rebuild_header,
15408 ++ .sk_rx_dst_set = inet6_sk_rx_dst_set,
15409 ++ .conn_request = mptcp_conn_request,
15410 ++ .syn_recv_sock = tcp_v6_syn_recv_sock,
15411 ++ .net_header_len = sizeof(struct ipv6hdr),
15412 ++ .net_frag_header_len = sizeof(struct frag_hdr),
15413 ++ .setsockopt = ipv6_setsockopt,
15414 ++ .getsockopt = ipv6_getsockopt,
15415 ++ .addr2sockaddr = inet6_csk_addr2sockaddr,
15416 ++ .sockaddr_len = sizeof(struct sockaddr_in6),
15417 ++ .bind_conflict = inet6_csk_bind_conflict,
15418 ++#ifdef CONFIG_COMPAT
15419 ++ .compat_setsockopt = compat_ipv6_setsockopt,
15420 ++ .compat_getsockopt = compat_ipv6_getsockopt,
15421 ++#endif
15422 ++};
15423 ++
15424 ++const struct inet_connection_sock_af_ops mptcp_v6_mapped = {
15425 ++ .queue_xmit = ip_queue_xmit,
15426 ++ .send_check = tcp_v4_send_check,
15427 ++ .rebuild_header = inet_sk_rebuild_header,
15428 ++ .sk_rx_dst_set = inet_sk_rx_dst_set,
15429 ++ .conn_request = mptcp_conn_request,
15430 ++ .syn_recv_sock = tcp_v6_syn_recv_sock,
15431 ++ .net_header_len = sizeof(struct iphdr),
15432 ++ .setsockopt = ipv6_setsockopt,
15433 ++ .getsockopt = ipv6_getsockopt,
15434 ++ .addr2sockaddr = inet6_csk_addr2sockaddr,
15435 ++ .sockaddr_len = sizeof(struct sockaddr_in6),
15436 ++ .bind_conflict = inet6_csk_bind_conflict,
15437 ++#ifdef CONFIG_COMPAT
15438 ++ .compat_setsockopt = compat_ipv6_setsockopt,
15439 ++ .compat_getsockopt = compat_ipv6_getsockopt,
15440 ++#endif
15441 ++};
15442 ++
15443 ++struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
15444 ++struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
15445 ++
15446 ++int mptcp_pm_v6_init(void)
15447 ++{
15448 ++ int ret = 0;
15449 ++ struct request_sock_ops *ops = &mptcp6_request_sock_ops;
15450 ++
15451 ++ mptcp_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
15452 ++ mptcp_request_sock_ipv6_ops.init_req = mptcp_v6_init_req;
15453 ++
15454 ++ mptcp_join_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
15455 ++ mptcp_join_request_sock_ipv6_ops.init_req = mptcp_v6_join_init_req;
15456 ++ mptcp_join_request_sock_ipv6_ops.queue_hash_add = mptcp_v6_reqsk_queue_hash_add;
15457 ++
15458 ++ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6");
15459 ++ if (ops->slab_name == NULL) {
15460 ++ ret = -ENOMEM;
15461 ++ goto out;
15462 ++ }
15463 ++
15464 ++ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
15465 ++ SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
15466 ++ NULL);
15467 ++
15468 ++ if (ops->slab == NULL) {
15469 ++ ret = -ENOMEM;
15470 ++ goto err_reqsk_create;
15471 ++ }
15472 ++
15473 ++out:
15474 ++ return ret;
15475 ++
15476 ++err_reqsk_create:
15477 ++ kfree(ops->slab_name);
15478 ++ ops->slab_name = NULL;
15479 ++ goto out;
15480 ++}
15481 ++
15482 ++void mptcp_pm_v6_undo(void)
15483 ++{
15484 ++ kmem_cache_destroy(mptcp6_request_sock_ops.slab);
15485 ++ kfree(mptcp6_request_sock_ops.slab_name);
15486 ++}
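
Two inet_connection_sock_af_ops tables close this file: mptcp_v6_specific drives real IPv6 subflows, while mptcp_v6_mapped keeps the IPv4 transmit path (ip_queue_xmit, tcp_v4_send_check) for AF_INET6 sockets that actually speak IPv4 through v4-mapped addresses. The helper that distinguishes the two cases, mptcp_v6_is_v4_mapped(), is used by the ndiffports module below but is not part of this hunk; conceptually it only needs to inspect the peer address, along the lines of this illustrative sketch (not the patch's actual implementation):

    /* Sketch only: detecting a v4-mapped IPv6 socket. The real
     * mptcp_v6_is_v4_mapped() is defined elsewhere in this patch set.
     */
    static inline bool example_is_v4_mapped(const struct sock *sk)
    {
            return sk->sk_family == AF_INET6 &&
                   ipv6_addr_v4mapped(&sk->sk_v6_daddr);
    }
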
15487 +diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c
15488 +new file mode 100644
15489 +index 000000000000..6f5087983175
15490 +--- /dev/null
15491 ++++ b/net/mptcp/mptcp_ndiffports.c
15492 +@@ -0,0 +1,161 @@
15493 ++#include <linux/module.h>
15494 ++
15495 ++#include <net/mptcp.h>
15496 ++#include <net/mptcp_v4.h>
15497 ++
15498 ++#if IS_ENABLED(CONFIG_IPV6)
15499 ++#include <net/mptcp_v6.h>
15500 ++#endif
15501 ++
15502 ++struct ndiffports_priv {
15503 ++ /* Worker struct for subflow establishment */
15504 ++ struct work_struct subflow_work;
15505 ++
15506 ++ struct mptcp_cb *mpcb;
15507 ++};
15508 ++
15509 ++static int num_subflows __read_mostly = 2;
15510 ++module_param(num_subflows, int, 0644);
15511 ++MODULE_PARM_DESC(num_subflows, "choose the number of subflows per MPTCP connection");
15512 ++
15513 ++/**
15514 ++ * Create all new subflows by calling mptcp_initX_subsockets().
15515 ++ *
15516 ++ * This function uses a goto next_subflow to allow releasing the lock between
15517 ++ * new subflows, giving other processes a chance to do some work on the
15518 ++ * socket and potentially finish the communication.
15519 ++ **/
15520 ++static void create_subflow_worker(struct work_struct *work)
15521 ++{
15522 ++ const struct ndiffports_priv *pm_priv = container_of(work,
15523 ++ struct ndiffports_priv,
15524 ++ subflow_work);
15525 ++ struct mptcp_cb *mpcb = pm_priv->mpcb;
15526 ++ struct sock *meta_sk = mpcb->meta_sk;
15527 ++ int iter = 0;
15528 ++
15529 ++next_subflow:
15530 ++ if (iter) {
15531 ++ release_sock(meta_sk);
15532 ++ mutex_unlock(&mpcb->mpcb_mutex);
15533 ++
15534 ++ cond_resched();
15535 ++ }
15536 ++ mutex_lock(&mpcb->mpcb_mutex);
15537 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
15538 ++
15539 ++ iter++;
15540 ++
15541 ++ if (sock_flag(meta_sk, SOCK_DEAD))
15542 ++ goto exit;
15543 ++
15544 ++ if (mpcb->master_sk &&
15545 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
15546 ++ goto exit;
15547 ++
15548 ++ if (num_subflows > iter && num_subflows > mpcb->cnt_subflows) {
15549 ++ if (meta_sk->sk_family == AF_INET ||
15550 ++ mptcp_v6_is_v4_mapped(meta_sk)) {
15551 ++ struct mptcp_loc4 loc;
15552 ++ struct mptcp_rem4 rem;
15553 ++
15554 ++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
15555 ++ loc.loc4_id = 0;
15556 ++ loc.low_prio = 0;
15557 ++
15558 ++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
15559 ++ rem.port = inet_sk(meta_sk)->inet_dport;
15560 ++ rem.rem4_id = 0; /* Default 0 */
15561 ++
15562 ++ mptcp_init4_subsockets(meta_sk, &loc, &rem);
15563 ++ } else {
15564 ++#if IS_ENABLED(CONFIG_IPV6)
15565 ++ struct mptcp_loc6 loc;
15566 ++ struct mptcp_rem6 rem;
15567 ++
15568 ++ loc.addr = inet6_sk(meta_sk)->saddr;
15569 ++ loc.loc6_id = 0;
15570 ++ loc.low_prio = 0;
15571 ++
15572 ++ rem.addr = meta_sk->sk_v6_daddr;
15573 ++ rem.port = inet_sk(meta_sk)->inet_dport;
15574 ++ rem.rem6_id = 0; /* Default 0 */
15575 ++
15576 ++ mptcp_init6_subsockets(meta_sk, &loc, &rem);
15577 ++#endif
15578 ++ }
15579 ++ goto next_subflow;
15580 ++ }
15581 ++
15582 ++exit:
15583 ++ release_sock(meta_sk);
15584 ++ mutex_unlock(&mpcb->mpcb_mutex);
15585 ++ sock_put(meta_sk);
15586 ++}
15587 ++
15588 ++static void ndiffports_new_session(const struct sock *meta_sk)
15589 ++{
15590 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
15591 ++ struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
15592 ++
15593 ++ /* Initialize workqueue-struct */
15594 ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
15595 ++ fmp->mpcb = mpcb;
15596 ++}
15597 ++
15598 ++static void ndiffports_create_subflows(struct sock *meta_sk)
15599 ++{
15600 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
15601 ++ struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
15602 ++
15603 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
15604 ++ mpcb->send_infinite_mapping ||
15605 ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
15606 ++ return;
15607 ++
15608 ++ if (!work_pending(&pm_priv->subflow_work)) {
15609 ++ sock_hold(meta_sk);
15610 ++ queue_work(mptcp_wq, &pm_priv->subflow_work);
15611 ++ }
15612 ++}
15613 ++
15614 ++static int ndiffports_get_local_id(sa_family_t family, union inet_addr *addr,
15615 ++ struct net *net, bool *low_prio)
15616 ++{
15617 ++ return 0;
15618 ++}
15619 ++
15620 ++static struct mptcp_pm_ops ndiffports __read_mostly = {
15621 ++ .new_session = ndiffports_new_session,
15622 ++ .fully_established = ndiffports_create_subflows,
15623 ++ .get_local_id = ndiffports_get_local_id,
15624 ++ .name = "ndiffports",
15625 ++ .owner = THIS_MODULE,
15626 ++};
15627 ++
15628 ++/* General initialization of MPTCP_PM */
15629 ++static int __init ndiffports_register(void)
15630 ++{
15631 ++ BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE);
15632 ++
15633 ++ if (mptcp_register_path_manager(&ndiffports))
15634 ++ goto exit;
15635 ++
15636 ++ return 0;
15637 ++
15638 ++exit:
15639 ++ return -1;
15640 ++}
15641 ++
15642 ++static void ndiffports_unregister(void)
15643 ++{
15644 ++ mptcp_unregister_path_manager(&ndiffports);
15645 ++}
15646 ++
15647 ++module_init(ndiffports_register);
15648 ++module_exit(ndiffports_unregister);
15649 ++
15650 ++MODULE_AUTHOR("Christoph Paasch");
15651 ++MODULE_LICENSE("GPL");
15652 ++MODULE_DESCRIPTION("NDIFF-PORTS MPTCP");
15653 ++MODULE_VERSION("0.88");
15654 +diff --git a/net/mptcp/mptcp_ofo_queue.c b/net/mptcp/mptcp_ofo_queue.c
15655 +new file mode 100644
15656 +index 000000000000..ec4e98622637
15657 +--- /dev/null
15658 ++++ b/net/mptcp/mptcp_ofo_queue.c
15659 +@@ -0,0 +1,295 @@
15660 ++/*
15661 ++ * MPTCP implementation - Fast algorithm for MPTCP meta-reordering
15662 ++ *
15663 ++ * Initial Design & Implementation:
15664 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
15665 ++ *
15666 ++ * Current Maintainer & Author:
15667 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
15668 ++ *
15669 ++ * Additional authors:
15670 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
15671 ++ * Gregory Detal <gregory.detal@×××××××××.be>
15672 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
15673 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
15674 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
15675 ++ * Andreas Ripke <ripke@××××××.eu>
15676 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
15677 ++ * Octavian Purdila <octavian.purdila@×××××.com>
15678 ++ * John Ronan <jronan@××××.org>
15679 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
15680 ++ * Brandon Heller <brandonh@××××××××.edu>
15681 ++ *
15682 ++ * This program is free software; you can redistribute it and/or
15683 ++ * modify it under the terms of the GNU General Public License
15684 ++ * as published by the Free Software Foundation; either version
15685 ++ * 2 of the License, or (at your option) any later version.
15686 ++ */
15687 ++
15688 ++#include <linux/skbuff.h>
15689 ++#include <linux/slab.h>
15690 ++#include <net/tcp.h>
15691 ++#include <net/mptcp.h>
15692 ++
15693 ++void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
15694 ++ const struct sk_buff *skb)
15695 ++{
15696 ++ struct tcp_sock *tp;
15697 ++
15698 ++ mptcp_for_each_tp(mpcb, tp) {
15699 ++ if (tp->mptcp->shortcut_ofoqueue == skb) {
15700 ++ tp->mptcp->shortcut_ofoqueue = NULL;
15701 ++ return;
15702 ++ }
15703 ++ }
15704 ++}
15705 ++
15706 ++/* Does 'skb' fit after 'here' in the queue 'head'?
15707 ++ * If yes, we queue it and return 1.
15708 ++ */
15709 ++static int mptcp_ofo_queue_after(struct sk_buff_head *head,
15710 ++ struct sk_buff *skb, struct sk_buff *here,
15711 ++ const struct tcp_sock *tp)
15712 ++{
15713 ++ struct sock *meta_sk = tp->meta_sk;
15714 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
15715 ++ u32 seq = TCP_SKB_CB(skb)->seq;
15716 ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
15717 ++
15718 ++ /* We want to queue skb after here, thus seq >= end_seq */
15719 ++ if (before(seq, TCP_SKB_CB(here)->end_seq))
15720 ++ return 0;
15721 ++
15722 ++ if (seq == TCP_SKB_CB(here)->end_seq) {
15723 ++ bool fragstolen = false;
15724 ++
15725 ++ if (!tcp_try_coalesce(meta_sk, here, skb, &fragstolen)) {
15726 ++ __skb_queue_after(&meta_tp->out_of_order_queue, here, skb);
15727 ++ return 1;
15728 ++ } else {
15729 ++ kfree_skb_partial(skb, fragstolen);
15730 ++ return -1;
15731 ++ }
15732 ++ }
15733 ++
15734 ++ /* If here is the last one, we can always queue it */
15735 ++ if (skb_queue_is_last(head, here)) {
15736 ++ __skb_queue_after(head, here, skb);
15737 ++ return 1;
15738 ++ } else {
15739 ++ struct sk_buff *skb1 = skb_queue_next(head, here);
15740 ++		/* It's not the last one, but does it fit between 'here' and
15741 ++		 * the one after 'here'? That is, is end_seq <= after_here->seq?
15742 ++		 */
15743 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) {
15744 ++ __skb_queue_after(head, here, skb);
15745 ++ return 1;
15746 ++ }
15747 ++ }
15748 ++
15749 ++ return 0;
15750 ++}
15751 ++
15752 ++static void try_shortcut(struct sk_buff *shortcut, struct sk_buff *skb,
15753 ++ struct sk_buff_head *head, struct tcp_sock *tp)
15754 ++{
15755 ++ struct sock *meta_sk = tp->meta_sk;
15756 ++ struct tcp_sock *tp_it, *meta_tp = tcp_sk(meta_sk);
15757 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
15758 ++ struct sk_buff *skb1, *best_shortcut = NULL;
15759 ++ u32 seq = TCP_SKB_CB(skb)->seq;
15760 ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
15761 ++ u32 distance = 0xffffffff;
15762 ++
15763 ++ /* First, check the tp's shortcut */
15764 ++ if (!shortcut) {
15765 ++ if (skb_queue_empty(head)) {
15766 ++ __skb_queue_head(head, skb);
15767 ++ goto end;
15768 ++ }
15769 ++ } else {
15770 ++ int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
15771 ++		/* Is the tp's shortcut a hit? If yes, we insert. */
15772 ++
15773 ++ if (ret) {
15774 ++ skb = (ret > 0) ? skb : NULL;
15775 ++ goto end;
15776 ++ }
15777 ++ }
15778 ++
15779 ++ /* Check the shortcuts of the other subsockets. */
15780 ++ mptcp_for_each_tp(mpcb, tp_it) {
15781 ++ shortcut = tp_it->mptcp->shortcut_ofoqueue;
15782 ++ /* Can we queue it here? If yes, do so! */
15783 ++ if (shortcut) {
15784 ++ int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
15785 ++
15786 ++ if (ret) {
15787 ++ skb = (ret > 0) ? skb : NULL;
15788 ++ goto end;
15789 ++ }
15790 ++ }
15791 ++
15792 ++ /* Could not queue it, check if we are close.
15793 ++ * We are looking for a shortcut, close enough to seq to
15794 ++ * set skb1 prematurely and thus improve the subsequent lookup,
15795 ++ * which tries to find a skb1 so that skb1->seq <= seq.
15796 ++ *
15797 ++		 * So, here we only take shortcuts whose shortcut->seq > seq,
15798 ++ * and minimize the distance between shortcut->seq and seq and
15799 ++ * set best_shortcut to this one with the minimal distance.
15800 ++ *
15801 ++ * That way, the subsequent while-loop is shortest.
15802 ++ */
15803 ++ if (shortcut && after(TCP_SKB_CB(shortcut)->seq, seq)) {
15804 ++ /* Are we closer than the current best shortcut? */
15805 ++ if ((u32)(TCP_SKB_CB(shortcut)->seq - seq) < distance) {
15806 ++ distance = (u32)(TCP_SKB_CB(shortcut)->seq - seq);
15807 ++ best_shortcut = shortcut;
15808 ++ }
15809 ++ }
15810 ++ }
15811 ++
15812 ++ if (best_shortcut)
15813 ++ skb1 = best_shortcut;
15814 ++ else
15815 ++ skb1 = skb_peek_tail(head);
15816 ++
15817 ++ if (seq == TCP_SKB_CB(skb1)->end_seq) {
15818 ++ bool fragstolen = false;
15819 ++
15820 ++ if (!tcp_try_coalesce(meta_sk, skb1, skb, &fragstolen)) {
15821 ++ __skb_queue_after(&meta_tp->out_of_order_queue, skb1, skb);
15822 ++ } else {
15823 ++ kfree_skb_partial(skb, fragstolen);
15824 ++ skb = NULL;
15825 ++ }
15826 ++
15827 ++ goto end;
15828 ++ }
15829 ++
15830 ++ /* Find the insertion point, starting from best_shortcut if available.
15831 ++ *
15832 ++ * Inspired from tcp_data_queue_ofo.
15833 ++ */
15834 ++ while (1) {
15835 ++ /* skb1->seq <= seq */
15836 ++ if (!after(TCP_SKB_CB(skb1)->seq, seq))
15837 ++ break;
15838 ++ if (skb_queue_is_first(head, skb1)) {
15839 ++ skb1 = NULL;
15840 ++ break;
15841 ++ }
15842 ++ skb1 = skb_queue_prev(head, skb1);
15843 ++ }
15844 ++
15845 ++	/* Does skb overlap the previous one? */
15846 ++ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
15847 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
15848 ++ /* All the bits are present. */
15849 ++ __kfree_skb(skb);
15850 ++ skb = NULL;
15851 ++ goto end;
15852 ++ }
15853 ++ if (seq == TCP_SKB_CB(skb1)->seq) {
15854 ++ if (skb_queue_is_first(head, skb1))
15855 ++ skb1 = NULL;
15856 ++ else
15857 ++ skb1 = skb_queue_prev(head, skb1);
15858 ++ }
15859 ++ }
15860 ++ if (!skb1)
15861 ++ __skb_queue_head(head, skb);
15862 ++ else
15863 ++ __skb_queue_after(head, skb1, skb);
15864 ++
15865 ++	/* And clean segments covered entirely by the new one. */
15866 ++ while (!skb_queue_is_last(head, skb)) {
15867 ++ skb1 = skb_queue_next(head, skb);
15868 ++
15869 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
15870 ++ break;
15871 ++
15872 ++ __skb_unlink(skb1, head);
15873 ++ mptcp_remove_shortcuts(mpcb, skb1);
15874 ++ __kfree_skb(skb1);
15875 ++ }
15876 ++
15877 ++end:
15878 ++ if (skb) {
15879 ++ skb_set_owner_r(skb, meta_sk);
15880 ++ tp->mptcp->shortcut_ofoqueue = skb;
15881 ++ }
15882 ++
15883 ++ return;
15884 ++}
15885 ++
15886 ++/**
15887 ++ * @sk: the subflow that received this skb.
15888 ++ */
15889 ++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
15890 ++ struct sock *sk)
15891 ++{
15892 ++ struct tcp_sock *tp = tcp_sk(sk);
15893 ++
15894 ++ try_shortcut(tp->mptcp->shortcut_ofoqueue, skb,
15895 ++ &tcp_sk(meta_sk)->out_of_order_queue, tp);
15896 ++}
15897 ++
15898 ++bool mptcp_prune_ofo_queue(struct sock *sk)
15899 ++{
15900 ++ struct tcp_sock *tp = tcp_sk(sk);
15901 ++ bool res = false;
15902 ++
15903 ++ if (!skb_queue_empty(&tp->out_of_order_queue)) {
15904 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
15905 ++ mptcp_purge_ofo_queue(tp);
15906 ++
15907 ++ /* No sack at the mptcp-level */
15908 ++ sk_mem_reclaim(sk);
15909 ++ res = true;
15910 ++ }
15911 ++
15912 ++ return res;
15913 ++}
15914 ++
15915 ++void mptcp_ofo_queue(struct sock *meta_sk)
15916 ++{
15917 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
15918 ++ struct sk_buff *skb;
15919 ++
15920 ++ while ((skb = skb_peek(&meta_tp->out_of_order_queue)) != NULL) {
15921 ++ u32 old_rcv_nxt = meta_tp->rcv_nxt;
15922 ++ if (after(TCP_SKB_CB(skb)->seq, meta_tp->rcv_nxt))
15923 ++ break;
15924 ++
15925 ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->rcv_nxt)) {
15926 ++ __skb_unlink(skb, &meta_tp->out_of_order_queue);
15927 ++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
15928 ++ __kfree_skb(skb);
15929 ++ continue;
15930 ++ }
15931 ++
15932 ++ __skb_unlink(skb, &meta_tp->out_of_order_queue);
15933 ++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
15934 ++
15935 ++ __skb_queue_tail(&meta_sk->sk_receive_queue, skb);
15936 ++ meta_tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
15937 ++ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
15938 ++
15939 ++ if (tcp_hdr(skb)->fin)
15940 ++ mptcp_fin(meta_sk);
15941 ++ }
15942 ++}
15943 ++
15944 ++void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp)
15945 ++{
15946 ++ struct sk_buff_head *head = &meta_tp->out_of_order_queue;
15947 ++ struct sk_buff *skb, *tmp;
15948 ++
15949 ++ skb_queue_walk_safe(head, skb, tmp) {
15950 ++ __skb_unlink(skb, head);
15951 ++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
15952 ++ kfree_skb(skb);
15953 ++ }
15954 ++}
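
A note on the sequence-number arithmetic used throughout this file: mptcp_ofo_queue_after(), try_shortcut() and mptcp_ofo_queue() rely on the kernel's before()/after() helpers for 32-bit (data-)sequence numbers, which stay correct across wraparound because they reduce to a signed subtraction:

    /* The comparison used above, as defined in include/net/tcp.h: */
    static inline bool before(__u32 seq1, __u32 seq2)
    {
            return (__s32)(seq1 - seq2) < 0;
    }
    #define after(seq2, seq1)       before(seq1, seq2)

    /* Example: before(0xfffffff0, 0x10) is true, because
     * (__s32)(0xfffffff0 - 0x10) == -32, so a segment queued just before
     * the sequence space wraps still sorts ahead of one queued just after.
     */
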
15955 +diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c
15956 +new file mode 100644
15957 +index 000000000000..53f5c43bb488
15958 +--- /dev/null
15959 ++++ b/net/mptcp/mptcp_olia.c
15960 +@@ -0,0 +1,311 @@
15961 ++/*
15962 ++ * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL:
15963 ++ *
15964 ++ * Algorithm design:
15965 ++ * Ramin Khalili <ramin.khalili@××××.ch>
15966 ++ * Nicolas Gast <nicolas.gast@××××.ch>
15967 ++ * Jean-Yves Le Boudec <jean-yves.leboudec@××××.ch>
15968 ++ *
15969 ++ * Implementation:
15970 ++ * Ramin Khalili <ramin.khalili@××××.ch>
15971 ++ *
15972 ++ * Ported to the official MPTCP-kernel:
15973 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
15974 ++ *
15975 ++ * This program is free software; you can redistribute it and/or
15976 ++ * modify it under the terms of the GNU General Public License
15977 ++ * as published by the Free Software Foundation; either version
15978 ++ * 2 of the License, or (at your option) any later version.
15979 ++ */
15980 ++
15981 ++
15982 ++#include <net/tcp.h>
15983 ++#include <net/mptcp.h>
15984 ++
15985 ++#include <linux/module.h>
15986 ++
15987 ++static int scale = 10;
15988 ++
15989 ++struct mptcp_olia {
15990 ++ u32 mptcp_loss1;
15991 ++ u32 mptcp_loss2;
15992 ++ u32 mptcp_loss3;
15993 ++ int epsilon_num;
15994 ++ u32 epsilon_den;
15995 ++ int mptcp_snd_cwnd_cnt;
15996 ++};
15997 ++
15998 ++static inline int mptcp_olia_sk_can_send(const struct sock *sk)
15999 ++{
16000 ++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
16001 ++}
16002 ++
16003 ++static inline u64 mptcp_olia_scale(u64 val, int scale)
16004 ++{
16005 ++ return (u64) val << scale;
16006 ++}
16007 ++
16008 ++/* Take care of the artificial inflation of cwnd (see RFC 5681)
16009 ++ * during the fast-retransmit phase.
16010 ++ */
16011 ++static u32 mptcp_get_crt_cwnd(struct sock *sk)
16012 ++{
16013 ++ const struct inet_connection_sock *icsk = inet_csk(sk);
16014 ++
16015 ++ if (icsk->icsk_ca_state == TCP_CA_Recovery)
16016 ++ return tcp_sk(sk)->snd_ssthresh;
16017 ++ else
16018 ++ return tcp_sk(sk)->snd_cwnd;
16019 ++}
16020 ++
16021 ++/* return the denominator of the first term of the increase formula */
16022 ++static u64 mptcp_get_rate(const struct mptcp_cb *mpcb , u32 path_rtt)
16023 ++{
16024 ++ struct sock *sk;
16025 ++ u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */
16026 ++
16027 ++ mptcp_for_each_sk(mpcb, sk) {
16028 ++ struct tcp_sock *tp = tcp_sk(sk);
16029 ++ u64 scaled_num;
16030 ++ u32 tmp_cwnd;
16031 ++
16032 ++ if (!mptcp_olia_sk_can_send(sk))
16033 ++ continue;
16034 ++
16035 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16036 ++ scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt;
16037 ++ rate += div_u64(scaled_num , tp->srtt_us);
16038 ++ }
16039 ++ rate *= rate;
16040 ++ return rate;
16041 ++}
16042 ++
16043 ++/* find the maximum cwnd, used to find set M */
16044 ++static u32 mptcp_get_max_cwnd(const struct mptcp_cb *mpcb)
16045 ++{
16046 ++ struct sock *sk;
16047 ++ u32 best_cwnd = 0;
16048 ++
16049 ++ mptcp_for_each_sk(mpcb, sk) {
16050 ++ u32 tmp_cwnd;
16051 ++
16052 ++ if (!mptcp_olia_sk_can_send(sk))
16053 ++ continue;
16054 ++
16055 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16056 ++ if (tmp_cwnd > best_cwnd)
16057 ++ best_cwnd = tmp_cwnd;
16058 ++ }
16059 ++ return best_cwnd;
16060 ++}
16061 ++
16062 ++static void mptcp_get_epsilon(const struct mptcp_cb *mpcb)
16063 ++{
16064 ++ struct mptcp_olia *ca;
16065 ++ struct tcp_sock *tp;
16066 ++ struct sock *sk;
16067 ++ u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1;
16068 ++ u32 max_cwnd = 1, best_cwnd = 1, tmp_cwnd;
16069 ++ u8 M = 0, B_not_M = 0;
16070 ++
16071 ++ /* TODO - integrate this in the following loop - we just want to iterate once */
16072 ++
16073 ++ max_cwnd = mptcp_get_max_cwnd(mpcb);
16074 ++
16075 ++ /* find the best path */
16076 ++ mptcp_for_each_sk(mpcb, sk) {
16077 ++ tp = tcp_sk(sk);
16078 ++ ca = inet_csk_ca(sk);
16079 ++
16080 ++ if (!mptcp_olia_sk_can_send(sk))
16081 ++ continue;
16082 ++
16083 ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
16084 ++ /* TODO - check here and rename variables */
16085 ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
16086 ++ ca->mptcp_loss2 - ca->mptcp_loss1);
16087 ++
16088 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16089 ++ if ((u64)tmp_int * best_rtt >= (u64)best_int * tmp_rtt) {
16090 ++ best_rtt = tmp_rtt;
16091 ++ best_int = tmp_int;
16092 ++ best_cwnd = tmp_cwnd;
16093 ++ }
16094 ++ }
16095 ++
16096 ++ /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */
16097 ++ /* find the size of M and B_not_M */
16098 ++ mptcp_for_each_sk(mpcb, sk) {
16099 ++ tp = tcp_sk(sk);
16100 ++ ca = inet_csk_ca(sk);
16101 ++
16102 ++ if (!mptcp_olia_sk_can_send(sk))
16103 ++ continue;
16104 ++
16105 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16106 ++ if (tmp_cwnd == max_cwnd) {
16107 ++ M++;
16108 ++ } else {
16109 ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
16110 ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
16111 ++ ca->mptcp_loss2 - ca->mptcp_loss1);
16112 ++
16113 ++ if ((u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt)
16114 ++ B_not_M++;
16115 ++ }
16116 ++ }
16117 ++
16118 ++ /* check if the path is in M or B_not_M and set the value of epsilon accordingly */
16119 ++ mptcp_for_each_sk(mpcb, sk) {
16120 ++ tp = tcp_sk(sk);
16121 ++ ca = inet_csk_ca(sk);
16122 ++
16123 ++ if (!mptcp_olia_sk_can_send(sk))
16124 ++ continue;
16125 ++
16126 ++ if (B_not_M == 0) {
16127 ++ ca->epsilon_num = 0;
16128 ++ ca->epsilon_den = 1;
16129 ++ } else {
16130 ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
16131 ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
16132 ++ ca->mptcp_loss2 - ca->mptcp_loss1);
16133 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16134 ++
16135 ++ if (tmp_cwnd < max_cwnd &&
16136 ++ (u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) {
16137 ++ ca->epsilon_num = 1;
16138 ++ ca->epsilon_den = mpcb->cnt_established * B_not_M;
16139 ++ } else if (tmp_cwnd == max_cwnd) {
16140 ++ ca->epsilon_num = -1;
16141 ++ ca->epsilon_den = mpcb->cnt_established * M;
16142 ++ } else {
16143 ++ ca->epsilon_num = 0;
16144 ++ ca->epsilon_den = 1;
16145 ++ }
16146 ++ }
16147 ++ }
16148 ++}
16149 ++
16150 ++/* setting the initial values */
16151 ++static void mptcp_olia_init(struct sock *sk)
16152 ++{
16153 ++ const struct tcp_sock *tp = tcp_sk(sk);
16154 ++ struct mptcp_olia *ca = inet_csk_ca(sk);
16155 ++
16156 ++ if (mptcp(tp)) {
16157 ++ ca->mptcp_loss1 = tp->snd_una;
16158 ++ ca->mptcp_loss2 = tp->snd_una;
16159 ++ ca->mptcp_loss3 = tp->snd_una;
16160 ++ ca->mptcp_snd_cwnd_cnt = 0;
16161 ++ ca->epsilon_num = 0;
16162 ++ ca->epsilon_den = 1;
16163 ++ }
16164 ++}
16165 ++
16166 ++/* updating inter-loss distance and ssthresh */
16167 ++static void mptcp_olia_set_state(struct sock *sk, u8 new_state)
16168 ++{
16169 ++ if (!mptcp(tcp_sk(sk)))
16170 ++ return;
16171 ++
16172 ++ if (new_state == TCP_CA_Loss ||
16173 ++ new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) {
16174 ++ struct mptcp_olia *ca = inet_csk_ca(sk);
16175 ++
16176 ++ if (ca->mptcp_loss3 != ca->mptcp_loss2 &&
16177 ++ !inet_csk(sk)->icsk_retransmits) {
16178 ++ ca->mptcp_loss1 = ca->mptcp_loss2;
16179 ++ ca->mptcp_loss2 = ca->mptcp_loss3;
16180 ++ }
16181 ++ }
16182 ++}
16183 ++
16184 ++/* main algorithm */
16185 ++static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked)
16186 ++{
16187 ++ struct tcp_sock *tp = tcp_sk(sk);
16188 ++ struct mptcp_olia *ca = inet_csk_ca(sk);
16189 ++ const struct mptcp_cb *mpcb = tp->mpcb;
16190 ++
16191 ++ u64 inc_num, inc_den, rate, cwnd_scaled;
16192 ++
16193 ++ if (!mptcp(tp)) {
16194 ++ tcp_reno_cong_avoid(sk, ack, acked);
16195 ++ return;
16196 ++ }
16197 ++
16198 ++ ca->mptcp_loss3 = tp->snd_una;
16199 ++
16200 ++ if (!tcp_is_cwnd_limited(sk))
16201 ++ return;
16202 ++
16203 ++ /* slow start if it is in the safe area */
16204 ++ if (tp->snd_cwnd <= tp->snd_ssthresh) {
16205 ++ tcp_slow_start(tp, acked);
16206 ++ return;
16207 ++ }
16208 ++
16209 ++ mptcp_get_epsilon(mpcb);
16210 ++ rate = mptcp_get_rate(mpcb, tp->srtt_us);
16211 ++ cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale);
16212 ++ inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1;
16213 ++
16214 ++	/* calculate the increase term; scaling is used to reduce the rounding effect */
16215 ++ if (ca->epsilon_num == -1) {
16216 ++ if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) {
16217 ++ inc_num = rate - ca->epsilon_den *
16218 ++ cwnd_scaled * cwnd_scaled;
16219 ++ ca->mptcp_snd_cwnd_cnt -= div64_u64(
16220 ++ mptcp_olia_scale(inc_num , scale) , inc_den);
16221 ++ } else {
16222 ++ inc_num = ca->epsilon_den *
16223 ++ cwnd_scaled * cwnd_scaled - rate;
16224 ++ ca->mptcp_snd_cwnd_cnt += div64_u64(
16225 ++ mptcp_olia_scale(inc_num , scale) , inc_den);
16226 ++ }
16227 ++ } else {
16228 ++ inc_num = ca->epsilon_num * rate +
16229 ++ ca->epsilon_den * cwnd_scaled * cwnd_scaled;
16230 ++ ca->mptcp_snd_cwnd_cnt += div64_u64(
16231 ++ mptcp_olia_scale(inc_num , scale) , inc_den);
16232 ++ }
16233 ++
16234 ++
16235 ++ if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) {
16236 ++ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
16237 ++ tp->snd_cwnd++;
16238 ++ ca->mptcp_snd_cwnd_cnt = 0;
16239 ++ } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) {
16240 ++ tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1);
16241 ++ ca->mptcp_snd_cwnd_cnt = 0;
16242 ++ }
16243 ++}
16244 ++
16245 ++static struct tcp_congestion_ops mptcp_olia = {
16246 ++ .init = mptcp_olia_init,
16247 ++ .ssthresh = tcp_reno_ssthresh,
16248 ++ .cong_avoid = mptcp_olia_cong_avoid,
16249 ++ .set_state = mptcp_olia_set_state,
16250 ++ .owner = THIS_MODULE,
16251 ++ .name = "olia",
16252 ++};
16253 ++
16254 ++static int __init mptcp_olia_register(void)
16255 ++{
16256 ++ BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE);
16257 ++ return tcp_register_congestion_control(&mptcp_olia);
16258 ++}
16259 ++
16260 ++static void __exit mptcp_olia_unregister(void)
16261 ++{
16262 ++ tcp_unregister_congestion_control(&mptcp_olia);
16263 ++}
16264 ++
16265 ++module_init(mptcp_olia_register);
16266 ++module_exit(mptcp_olia_unregister);
16267 ++
16268 ++MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec");
16269 ++MODULE_LICENSE("GPL");
16270 ++MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
16271 ++MODULE_VERSION("0.1");
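
Outside the fixed-point arithmetic, the congestion-avoidance step above amounts to the OLIA per-ACK increase: on subflow r the window grows by roughly (w_r/rtt_r^2) / (sum_p w_p/rtt_p)^2 + epsilon_r/w_r, where epsilon_r mirrors the epsilon_num/epsilon_den pair chosen in mptcp_get_epsilon(). A minimal user-space sketch in floating point (struct name, the simplified rate term and the sample numbers are illustrative assumptions, not values taken from this patch):

#include <stdio.h>

struct olia_subflow {
	double cwnd;	/* congestion window in segments */
	double rtt;	/* smoothed RTT in seconds */
	double epsilon;	/* +1/(n*|B\M|), -1/(n*|M|) or 0, cf. mptcp_get_epsilon() */
};

/* Per-ACK window increase on subflow r, in segments (assumed formula) */
static double olia_increase(const struct olia_subflow *sub, int n, int r)
{
	double sum = 0.0;
	int p;

	for (p = 0; p < n; p++)
		sum += sub[p].cwnd / sub[p].rtt;

	return (sub[r].cwnd / (sub[r].rtt * sub[r].rtt)) / (sum * sum) +
	       sub[r].epsilon / sub[r].cwnd;
}

int main(void)
{
	struct olia_subflow sub[2] = {
		{ .cwnd = 10.0, .rtt = 0.050, .epsilon = 0.0 },
		{ .cwnd = 4.0,  .rtt = 0.020, .epsilon = 0.5 },	/* best path, small cwnd */
	};
	int r;

	for (r = 0; r < 2; r++)
		printf("subflow %d: +%.4f segments per ACK\n",
		       r, olia_increase(sub, 2, r));
	return 0;
}

In the kernel code this increment is accumulated in mptcp_snd_cwnd_cnt in fixed point, and snd_cwnd only moves by one full segment once the accumulator crosses +/-((1 << scale) - 1).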
16272 +diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c
16273 +new file mode 100644
16274 +index 000000000000..400ea254c078
16275 +--- /dev/null
16276 ++++ b/net/mptcp/mptcp_output.c
16277 +@@ -0,0 +1,1743 @@
16278 ++/*
16279 ++ * MPTCP implementation - Sending side
16280 ++ *
16281 ++ * Initial Design & Implementation:
16282 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
16283 ++ *
16284 ++ * Current Maintainer & Author:
16285 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
16286 ++ *
16287 ++ * Additional authors:
16288 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
16289 ++ * Gregory Detal <gregory.detal@×××××××××.be>
16290 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
16291 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
16292 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
16293 ++ * Andreas Ripke <ripke@××××××.eu>
16294 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
16295 ++ * Octavian Purdila <octavian.purdila@×××××.com>
16296 ++ * John Ronan <jronan@××××.org>
16297 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
16298 ++ * Brandon Heller <brandonh@××××××××.edu>
16299 ++ *
16300 ++ *
16301 ++ * This program is free software; you can redistribute it and/or
16302 ++ * modify it under the terms of the GNU General Public License
16303 ++ * as published by the Free Software Foundation; either version
16304 ++ * 2 of the License, or (at your option) any later version.
16305 ++ */
16306 ++
16307 ++#include <linux/kconfig.h>
16308 ++#include <linux/skbuff.h>
16309 ++#include <linux/tcp.h>
16310 ++
16311 ++#include <net/mptcp.h>
16312 ++#include <net/mptcp_v4.h>
16313 ++#include <net/mptcp_v6.h>
16314 ++#include <net/sock.h>
16315 ++
16316 ++static const int mptcp_dss_len = MPTCP_SUB_LEN_DSS_ALIGN +
16317 ++ MPTCP_SUB_LEN_ACK_ALIGN +
16318 ++ MPTCP_SUB_LEN_SEQ_ALIGN;
16319 ++
16320 ++static inline int mptcp_sub_len_remove_addr(u16 bitfield)
16321 ++{
16322 ++ unsigned int c;
16323 ++ for (c = 0; bitfield; c++)
16324 ++ bitfield &= bitfield - 1;
16325 ++ return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1;
16326 ++}
16327 ++
16328 ++int mptcp_sub_len_remove_addr_align(u16 bitfield)
16329 ++{
16330 ++ return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4);
16331 ++}
16332 ++EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align);
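
To make the length computation above concrete: each set bit in the bitfield is one address id carried in the REMOVE_ADDR option, counted with the classic clear-lowest-set-bit loop, and the result is padded to 32-bit TCP option alignment. A standalone sketch (the base length of 4 - kind, length, subtype plus the first id, per the RFC 6824 option layout - is an assumption here, not a value quoted from this patch):

#include <stdio.h>

#define SUB_LEN_REMOVE_ADDR	4		/* assumed: kind, len, subtype, first id */
#define ALIGN4(x)		(((x) + 3u) & ~3u)

static unsigned int remove_addr_len(unsigned short bitfield)
{
	unsigned int c;

	/* clear the lowest set bit until none remain (popcount) */
	for (c = 0; bitfield; c++)
		bitfield &= bitfield - 1;

	return SUB_LEN_REMOVE_ADDR + c - 1;
}

int main(void)
{
	unsigned short ids = 0x05;	/* address ids 0 and 2 to be removed */
	unsigned int len = remove_addr_len(ids);

	printf("option length %u, aligned to %u bytes\n", len, ALIGN4(len));
	return 0;
}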
16333 ++
16334 ++/* get the data-seq and end-data-seq and store them again in the
16335 ++ * tcp_skb_cb
16336 ++ */
16337 ++static int mptcp_reconstruct_mapping(struct sk_buff *skb)
16338 ++{
16339 ++ const struct mp_dss *mpdss = (struct mp_dss *)TCP_SKB_CB(skb)->dss;
16340 ++ u32 *p32;
16341 ++ u16 *p16;
16342 ++
16343 ++ if (!mpdss->M)
16344 ++ return 1;
16345 ++
16346 ++ /* Move the pointer to the data-seq */
16347 ++ p32 = (u32 *)mpdss;
16348 ++ p32++;
16349 ++ if (mpdss->A) {
16350 ++ p32++;
16351 ++ if (mpdss->a)
16352 ++ p32++;
16353 ++ }
16354 ++
16355 ++ TCP_SKB_CB(skb)->seq = ntohl(*p32);
16356 ++
16357 ++ /* Get the data_len to calculate the end_data_seq */
16358 ++ p32++;
16359 ++ p32++;
16360 ++ p16 = (u16 *)p32;
16361 ++ TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq;
16362 ++
16363 ++ return 0;
16364 ++}
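
The pointer walk above follows the DSS option layout of RFC 6824, Section 3.3: one header word, an optional data ACK (one or two words depending on the A/a flags), the data sequence number, the subflow sequence number, and finally the 16-bit data-level length. A type-level sketch of the common case handled here (4-octet data ACK and 4-octet data sequence number; the struct and field names are illustrative, not taken from the patch):

#include <stdint.h>

struct dss_words_sketch {
	uint32_t header;	/* kind, length, subtype and the F|m|M|a|A flags */
	uint32_t data_ack;	/* present when A is set; one more word when a is set */
	uint32_t data_seq;	/* copied into TCP_SKB_CB(skb)->seq */
	uint32_t subflow_seq;	/* stepped over by the parser */
	uint16_t data_len;	/* added to data_seq to recover end_seq */
	uint16_t checksum;	/* only present when DSS checksums are in use */
};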
16365 ++
16366 ++static void mptcp_find_and_set_pathmask(const struct sock *meta_sk, struct sk_buff *skb)
16367 ++{
16368 ++ struct sk_buff *skb_it;
16369 ++
16370 ++ skb_it = tcp_write_queue_head(meta_sk);
16371 ++
16372 ++ tcp_for_write_queue_from(skb_it, meta_sk) {
16373 ++ if (skb_it == tcp_send_head(meta_sk))
16374 ++ break;
16375 ++
16376 ++ if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) {
16377 ++ TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask;
16378 ++ break;
16379 ++ }
16380 ++ }
16381 ++}
16382 ++
16383 ++/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are
16384 ++ * coming from the meta-retransmit-timer
16385 ++ */
16386 ++static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk,
16387 ++ struct sock *sk, int clone_it)
16388 ++{
16389 ++ struct sk_buff *skb, *skb1;
16390 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16391 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
16392 ++ u32 seq, end_seq;
16393 ++
16394 ++ if (clone_it) {
16395 ++ /* pskb_copy is necessary here, because the TCP/IP-headers
16396 ++ * will be changed when it's going to be reinjected on another
16397 ++ * subflow.
16398 ++ */
16399 ++ skb = pskb_copy_for_clone(orig_skb, GFP_ATOMIC);
16400 ++ } else {
16401 ++ __skb_unlink(orig_skb, &sk->sk_write_queue);
16402 ++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
16403 ++ sk->sk_wmem_queued -= orig_skb->truesize;
16404 ++ sk_mem_uncharge(sk, orig_skb->truesize);
16405 ++ skb = orig_skb;
16406 ++ }
16407 ++ if (unlikely(!skb))
16408 ++ return;
16409 ++
16410 ++ if (sk && mptcp_reconstruct_mapping(skb)) {
16411 ++ __kfree_skb(skb);
16412 ++ return;
16413 ++ }
16414 ++
16415 ++ skb->sk = meta_sk;
16416 ++
16417 ++	/* If it has already reached the destination, we don't have to reinject it */
16418 ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
16419 ++ __kfree_skb(skb);
16420 ++ return;
16421 ++ }
16422 ++
16423 ++ /* Only reinject segments that are fully covered by the mapping */
16424 ++ if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) !=
16425 ++ TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
16426 ++ u32 seq = TCP_SKB_CB(skb)->seq;
16427 ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
16428 ++
16429 ++ __kfree_skb(skb);
16430 ++
16431 ++ /* Ok, now we have to look for the full mapping in the meta
16432 ++ * send-queue :S
16433 ++ */
16434 ++ tcp_for_write_queue(skb, meta_sk) {
16435 ++ /* Not yet at the mapping? */
16436 ++ if (before(TCP_SKB_CB(skb)->seq, seq))
16437 ++ continue;
16438 ++ /* We have passed by the mapping */
16439 ++ if (after(TCP_SKB_CB(skb)->end_seq, end_seq))
16440 ++ return;
16441 ++
16442 ++ __mptcp_reinject_data(skb, meta_sk, NULL, 1);
16443 ++ }
16444 ++ return;
16445 ++ }
16446 ++
16447 ++ /* Segment goes back to the MPTCP-layer. So, we need to zero the
16448 ++ * path_mask/dss.
16449 ++ */
16450 ++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
16451 ++
16452 ++ /* We need to find out the path-mask from the meta-write-queue
16453 ++ * to properly select a subflow.
16454 ++ */
16455 ++ mptcp_find_and_set_pathmask(meta_sk, skb);
16456 ++
16457 ++ /* If it's empty, just add */
16458 ++ if (skb_queue_empty(&mpcb->reinject_queue)) {
16459 ++ skb_queue_head(&mpcb->reinject_queue, skb);
16460 ++ return;
16461 ++ }
16462 ++
16463 ++	/* Find the place to insert skb - or we may even 'drop' it, as the
16464 ++	 * data is already covered by other skbs in the reinject-queue.
16465 ++ *
16466 ++ * This is inspired by code from tcp_data_queue.
16467 ++ */
16468 ++
16469 ++ skb1 = skb_peek_tail(&mpcb->reinject_queue);
16470 ++ seq = TCP_SKB_CB(skb)->seq;
16471 ++ while (1) {
16472 ++ if (!after(TCP_SKB_CB(skb1)->seq, seq))
16473 ++ break;
16474 ++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) {
16475 ++ skb1 = NULL;
16476 ++ break;
16477 ++ }
16478 ++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
16479 ++ }
16480 ++
16481 ++	/* Does skb overlap the previous one? */
16482 ++ end_seq = TCP_SKB_CB(skb)->end_seq;
16483 ++ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
16484 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
16485 ++ /* All the bits are present. Don't reinject */
16486 ++ __kfree_skb(skb);
16487 ++ return;
16488 ++ }
16489 ++ if (seq == TCP_SKB_CB(skb1)->seq) {
16490 ++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1))
16491 ++ skb1 = NULL;
16492 ++ else
16493 ++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
16494 ++ }
16495 ++ }
16496 ++ if (!skb1)
16497 ++ __skb_queue_head(&mpcb->reinject_queue, skb);
16498 ++ else
16499 ++ __skb_queue_after(&mpcb->reinject_queue, skb1, skb);
16500 ++
16501 ++ /* And clean segments covered by new one as whole. */
16502 ++ while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) {
16503 ++ skb1 = skb_queue_next(&mpcb->reinject_queue, skb);
16504 ++
16505 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
16506 ++ break;
16507 ++
16508 ++ __skb_unlink(skb1, &mpcb->reinject_queue);
16509 ++ __kfree_skb(skb1);
16510 ++ }
16511 ++ return;
16512 ++}
16513 ++
16514 ++/* Inserts data into the reinject queue */
16515 ++void mptcp_reinject_data(struct sock *sk, int clone_it)
16516 ++{
16517 ++ struct sk_buff *skb_it, *tmp;
16518 ++ struct tcp_sock *tp = tcp_sk(sk);
16519 ++ struct sock *meta_sk = tp->meta_sk;
16520 ++
16521 ++ /* It has already been closed - there is really no point in reinjecting */
16522 ++ if (meta_sk->sk_state == TCP_CLOSE)
16523 ++ return;
16524 ++
16525 ++ skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) {
16526 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it);
16527 ++		/* Subflow SYNs and FINs are not reinjected.
16528 ++		 *
16529 ++		 * Neither are empty subflow-FINs carrying a data-fin;
16530 ++		 * those are reinjected below (without the subflow-FIN flag).
16531 ++		 */
16532 ++ if (tcb->tcp_flags & TCPHDR_SYN ||
16533 ++ (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) ||
16534 ++ (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len))
16535 ++ continue;
16536 ++
16537 ++ __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it);
16538 ++ }
16539 ++
16540 ++ skb_it = tcp_write_queue_tail(meta_sk);
16541 ++ /* If sk has sent the empty data-fin, we have to reinject it too. */
16542 ++ if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 &&
16543 ++ TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) {
16544 ++ __mptcp_reinject_data(skb_it, meta_sk, NULL, 1);
16545 ++ }
16546 ++
16547 ++ mptcp_push_pending_frames(meta_sk);
16548 ++
16549 ++ tp->pf = 1;
16550 ++}
16551 ++EXPORT_SYMBOL(mptcp_reinject_data);
16552 ++
16553 ++static void mptcp_combine_dfin(const struct sk_buff *skb, const struct sock *meta_sk,
16554 ++ struct sock *subsk)
16555 ++{
16556 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16557 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
16558 ++ struct sock *sk_it;
16559 ++ int all_empty = 1, all_acked;
16560 ++
16561 ++ /* In infinite mapping we always try to combine */
16562 ++ if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) {
16563 ++ subsk->sk_shutdown |= SEND_SHUTDOWN;
16564 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
16565 ++ return;
16566 ++ }
16567 ++
16568 ++	/* Don't combine if the peer didn't combine - otherwise we end up in
16569 ++	 * TIME_WAIT, even if our app is smart enough to avoid it.
16570 ++ */
16571 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
16572 ++ if (!mpcb->dfin_combined)
16573 ++ return;
16574 ++ }
16575 ++
16576 ++ /* If no other subflow has data to send, we can combine */
16577 ++ mptcp_for_each_sk(mpcb, sk_it) {
16578 ++ if (!mptcp_sk_can_send(sk_it))
16579 ++ continue;
16580 ++
16581 ++ if (!tcp_write_queue_empty(sk_it))
16582 ++ all_empty = 0;
16583 ++ }
16584 ++
16585 ++ /* If all data has been DATA_ACKed, we can combine.
16586 ++ * -1, because the data_fin consumed one byte
16587 ++ */
16588 ++ all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1));
16589 ++
16590 ++ if ((all_empty || all_acked) && tcp_close_state(subsk)) {
16591 ++ subsk->sk_shutdown |= SEND_SHUTDOWN;
16592 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
16593 ++ }
16594 ++}
16595 ++
16596 ++static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb,
16597 ++ __be32 *ptr)
16598 ++{
16599 ++ const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
16600 ++ __be32 *start = ptr;
16601 ++ __u16 data_len;
16602 ++
16603 ++ *ptr++ = htonl(tcb->seq); /* data_seq */
16604 ++
16605 ++ /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
16606 ++ if (mptcp_is_data_fin(skb) && skb->len == 0)
16607 ++ *ptr++ = 0; /* subseq */
16608 ++ else
16609 ++ *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */
16610 ++
16611 ++ if (tcb->mptcp_flags & MPTCPHDR_INF)
16612 ++ data_len = 0;
16613 ++ else
16614 ++ data_len = tcb->end_seq - tcb->seq;
16615 ++
16616 ++ if (tp->mpcb->dss_csum && data_len) {
16617 ++ __be16 *p16 = (__be16 *)ptr;
16618 ++ __be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb);
16619 ++ __wsum csum;
16620 ++
16621 ++ *ptr = htonl(((data_len) << 16) |
16622 ++ (TCPOPT_EOL << 8) |
16623 ++ (TCPOPT_EOL));
16624 ++ csum = csum_partial(ptr - 2, 12, skb->csum);
16625 ++ p16++;
16626 ++ *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
16627 ++ } else {
16628 ++ *ptr++ = htonl(((data_len) << 16) |
16629 ++ (TCPOPT_NOP << 8) |
16630 ++ (TCPOPT_NOP));
16631 ++ }
16632 ++
16633 ++ return ptr - start;
16634 ++}
16635 ++
16636 ++static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb,
16637 ++ __be32 *ptr)
16638 ++{
16639 ++ struct mp_dss *mdss = (struct mp_dss *)ptr;
16640 ++ __be32 *start = ptr;
16641 ++
16642 ++ mdss->kind = TCPOPT_MPTCP;
16643 ++ mdss->sub = MPTCP_SUB_DSS;
16644 ++ mdss->rsv1 = 0;
16645 ++ mdss->rsv2 = 0;
16646 ++ mdss->F = mptcp_is_data_fin(skb) ? 1 : 0;
16647 ++ mdss->m = 0;
16648 ++ mdss->M = mptcp_is_data_seq(skb) ? 1 : 0;
16649 ++ mdss->a = 0;
16650 ++ mdss->A = 1;
16651 ++ mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
16652 ++ ptr++;
16653 ++
16654 ++ *ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt);
16655 ++
16656 ++ return ptr - start;
16657 ++}
16658 ++
16659 ++/* RFC6824 states that once a particular subflow mapping has been sent
16660 ++ * out it must never be changed. However, packets may be split while
16661 ++ * they are in the retransmission queue (due to SACK or ACKs) and that
16662 ++ * arguably means that we would change the mapping (e.g. it splits it,
16663 ++ * or sends out a subset of the initial mapping).
16664 ++ *
16665 ++ * Furthermore, the skb checksum is not always preserved across splits
16666 ++ * (e.g. mptcp_fragment) which would mean that we need to recompute
16667 ++ * the DSS checksum in this case.
16668 ++ *
16669 ++ * To avoid this we save the initial DSS mapping which allows us to
16670 ++ * send the same DSS mapping even for fragmented retransmits.
16671 ++ */
16672 ++static void mptcp_save_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb)
16673 ++{
16674 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
16675 ++ __be32 *ptr = (__be32 *)tcb->dss;
16676 ++
16677 ++ tcb->mptcp_flags |= MPTCPHDR_SEQ;
16678 ++
16679 ++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
16680 ++ ptr += mptcp_write_dss_mapping(tp, skb, ptr);
16681 ++}
16682 ++
16683 ++/* Write the saved DSS mapping to the header */
16684 ++static int mptcp_write_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb,
16685 ++ __be32 *ptr)
16686 ++{
16687 ++ __be32 *start = ptr;
16688 ++
16689 ++ memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len);
16690 ++
16691 ++ /* update the data_ack */
16692 ++ start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt);
16693 ++
16694 ++ /* dss is in a union with inet_skb_parm and
16695 ++ * the IP layer expects zeroed IPCB fields.
16696 ++ */
16697 ++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
16698 ++
16699 ++ return mptcp_dss_len/sizeof(*ptr);
16700 ++}
16701 ++
16702 ++static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject)
16703 ++{
16704 ++ struct tcp_sock *tp = tcp_sk(sk);
16705 ++ const struct sock *meta_sk = mptcp_meta_sk(sk);
16706 ++ const struct mptcp_cb *mpcb = tp->mpcb;
16707 ++ struct tcp_skb_cb *tcb;
16708 ++ struct sk_buff *subskb = NULL;
16709 ++
16710 ++ if (!reinject)
16711 ++ TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ?
16712 ++ MPTCPHDR_SEQ64_INDEX : 0);
16713 ++
16714 ++ subskb = pskb_copy_for_clone(skb, GFP_ATOMIC);
16715 ++ if (!subskb)
16716 ++ return false;
16717 ++
16718 ++	/* At the subflow-level we need to call tcp_init_tso_segs again. We
16719 ++	 * force this by setting gso_segs to 0. It has been set to 1 prior to
16720 ++ * the call to mptcp_skb_entail.
16721 ++ */
16722 ++ skb_shinfo(subskb)->gso_segs = 0;
16723 ++
16724 ++ TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index);
16725 ++
16726 ++ if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) &&
16727 ++ skb->ip_summed == CHECKSUM_PARTIAL) {
16728 ++ subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0);
16729 ++ subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE;
16730 ++ }
16731 ++
16732 ++ tcb = TCP_SKB_CB(subskb);
16733 ++
16734 ++ if (tp->mpcb->send_infinite_mapping &&
16735 ++ !tp->mpcb->infinite_mapping_snd &&
16736 ++ !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) {
16737 ++ tp->mptcp->fully_established = 1;
16738 ++ tp->mpcb->infinite_mapping_snd = 1;
16739 ++ tp->mptcp->infinite_cutoff_seq = tp->write_seq;
16740 ++ tcb->mptcp_flags |= MPTCPHDR_INF;
16741 ++ }
16742 ++
16743 ++ if (mptcp_is_data_fin(subskb))
16744 ++ mptcp_combine_dfin(subskb, meta_sk, sk);
16745 ++
16746 ++ mptcp_save_dss_data_seq(tp, subskb);
16747 ++
16748 ++ tcb->seq = tp->write_seq;
16749 ++ tcb->sacked = 0; /* reset the sacked field: from the point of view
16750 ++ * of this subflow, we are sending a brand new
16751 ++ * segment
16752 ++ */
16753 ++ /* Take into account seg len */
16754 ++ tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0);
16755 ++ tcb->end_seq = tp->write_seq;
16756 ++
16757 ++	/* If it's a non-payload DATA_FIN (also no subflow-fin), the
16758 ++	 * segment is not part of the subflow but exists only at the meta-level.
16759 ++ */
16760 ++ if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) {
16761 ++ tcp_add_write_queue_tail(sk, subskb);
16762 ++ sk->sk_wmem_queued += subskb->truesize;
16763 ++ sk_mem_charge(sk, subskb->truesize);
16764 ++ } else {
16765 ++ int err;
16766 ++
16767 ++ /* Necessary to initialize for tcp_transmit_skb. mss of 1, as
16768 ++ * skb->len = 0 will force tso_segs to 1.
16769 ++ */
16770 ++ tcp_init_tso_segs(sk, subskb, 1);
16771 ++		/* Empty data-fins are sent immediately on the subflow */
16772 ++ TCP_SKB_CB(subskb)->when = tcp_time_stamp;
16773 ++ err = tcp_transmit_skb(sk, subskb, 1, GFP_ATOMIC);
16774 ++
16775 ++ /* It has not been queued, we can free it now. */
16776 ++ kfree_skb(subskb);
16777 ++
16778 ++ if (err)
16779 ++ return false;
16780 ++ }
16781 ++
16782 ++ if (!tp->mptcp->fully_established) {
16783 ++ tp->mptcp->second_packet = 1;
16784 ++ tp->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq;
16785 ++ }
16786 ++
16787 ++ return true;
16788 ++}
16789 ++
16790 ++/* Fragment an skb and update the mptcp meta-data. Due to reinject, we
16791 ++ * might need to undo some operations done by tcp_fragment.
16792 ++ */
16793 ++static int mptcp_fragment(struct sock *meta_sk, struct sk_buff *skb, u32 len,
16794 ++ gfp_t gfp, int reinject)
16795 ++{
16796 ++ int ret, diff, old_factor;
16797 ++ struct sk_buff *buff;
16798 ++ u8 flags;
16799 ++
16800 ++ if (skb_headlen(skb) < len)
16801 ++ diff = skb->len - len;
16802 ++ else
16803 ++ diff = skb->data_len;
16804 ++ old_factor = tcp_skb_pcount(skb);
16805 ++
16806 ++ /* The mss_now in tcp_fragment is used to set the tso_segs of the skb.
16807 ++ * At the MPTCP-level we do not care about the absolute value. All we
16808 ++ * care about is that it is set to 1 for accurate packets_out
16809 ++ * accounting.
16810 ++ */
16811 ++ ret = tcp_fragment(meta_sk, skb, len, UINT_MAX, gfp);
16812 ++ if (ret)
16813 ++ return ret;
16814 ++
16815 ++ buff = skb->next;
16816 ++
16817 ++ flags = TCP_SKB_CB(skb)->mptcp_flags;
16818 ++ TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
16819 ++ TCP_SKB_CB(buff)->mptcp_flags = flags;
16820 ++ TCP_SKB_CB(buff)->path_mask = TCP_SKB_CB(skb)->path_mask;
16821 ++
16822 ++ /* If reinject == 1, the buff will be added to the reinject
16823 ++ * queue, which is currently not part of memory accounting. So
16824 ++ * undo the changes done by tcp_fragment and update the
16825 ++ * reinject queue. Also, undo changes to the packet counters.
16826 ++ */
16827 ++ if (reinject == 1) {
16828 ++ int undo = buff->truesize - diff;
16829 ++ meta_sk->sk_wmem_queued -= undo;
16830 ++ sk_mem_uncharge(meta_sk, undo);
16831 ++
16832 ++ tcp_sk(meta_sk)->mpcb->reinject_queue.qlen++;
16833 ++ meta_sk->sk_write_queue.qlen--;
16834 ++
16835 ++ if (!before(tcp_sk(meta_sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
16836 ++ undo = old_factor - tcp_skb_pcount(skb) -
16837 ++ tcp_skb_pcount(buff);
16838 ++ if (undo)
16839 ++ tcp_adjust_pcount(meta_sk, skb, -undo);
16840 ++ }
16841 ++ }
16842 ++
16843 ++ return 0;
16844 ++}
16845 ++
16846 ++/* Inspired by tcp_write_wakeup */
16847 ++int mptcp_write_wakeup(struct sock *meta_sk)
16848 ++{
16849 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16850 ++ struct sk_buff *skb;
16851 ++ struct sock *sk_it;
16852 ++ int ans = 0;
16853 ++
16854 ++ if (meta_sk->sk_state == TCP_CLOSE)
16855 ++ return -1;
16856 ++
16857 ++ skb = tcp_send_head(meta_sk);
16858 ++ if (skb &&
16859 ++ before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) {
16860 ++ unsigned int mss;
16861 ++ unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq;
16862 ++ struct sock *subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, true);
16863 ++ struct tcp_sock *subtp;
16864 ++ if (!subsk)
16865 ++ goto window_probe;
16866 ++ subtp = tcp_sk(subsk);
16867 ++ mss = tcp_current_mss(subsk);
16868 ++
16869 ++ seg_size = min(tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq,
16870 ++ tcp_wnd_end(subtp) - subtp->write_seq);
16871 ++
16872 ++ if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
16873 ++ meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
16874 ++
16875 ++		/* We are probing the opening of a window,
16876 ++		 * but the window size is != 0 - this must have been
16877 ++		 * a result of SWS avoidance (sender).
16878 ++ */
16879 ++ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
16880 ++ skb->len > mss) {
16881 ++ seg_size = min(seg_size, mss);
16882 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
16883 ++ if (mptcp_fragment(meta_sk, skb, seg_size,
16884 ++ GFP_ATOMIC, 0))
16885 ++ return -1;
16886 ++ } else if (!tcp_skb_pcount(skb)) {
16887 ++ /* see mptcp_write_xmit on why we use UINT_MAX */
16888 ++ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX);
16889 ++ }
16890 ++
16891 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
16892 ++ if (!mptcp_skb_entail(subsk, skb, 0))
16893 ++ return -1;
16894 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
16895 ++
16896 ++ mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq -
16897 ++ TCP_SKB_CB(skb)->seq);
16898 ++ tcp_event_new_data_sent(meta_sk, skb);
16899 ++
16900 ++ __tcp_push_pending_frames(subsk, mss, TCP_NAGLE_PUSH);
16901 ++
16902 ++ return 0;
16903 ++ } else {
16904 ++window_probe:
16905 ++ if (between(meta_tp->snd_up, meta_tp->snd_una + 1,
16906 ++ meta_tp->snd_una + 0xFFFF)) {
16907 ++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
16908 ++ if (mptcp_sk_can_send_ack(sk_it))
16909 ++ tcp_xmit_probe_skb(sk_it, 1);
16910 ++ }
16911 ++ }
16912 ++
16913 ++ /* At least one of the tcp_xmit_probe_skb's has to succeed */
16914 ++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
16915 ++ int ret;
16916 ++
16917 ++ if (!mptcp_sk_can_send_ack(sk_it))
16918 ++ continue;
16919 ++
16920 ++ ret = tcp_xmit_probe_skb(sk_it, 0);
16921 ++ if (unlikely(ret > 0))
16922 ++ ans = ret;
16923 ++ }
16924 ++ return ans;
16925 ++ }
16926 ++}
16927 ++
16928 ++bool mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle,
16929 ++ int push_one, gfp_t gfp)
16930 ++{
16931 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp;
16932 ++ struct sock *subsk = NULL;
16933 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
16934 ++ struct sk_buff *skb;
16935 ++ unsigned int sent_pkts;
16936 ++ int reinject = 0;
16937 ++ unsigned int sublimit;
16938 ++
16939 ++ sent_pkts = 0;
16940 ++
16941 ++ while ((skb = mpcb->sched_ops->next_segment(meta_sk, &reinject, &subsk,
16942 ++ &sublimit))) {
16943 ++ unsigned int limit;
16944 ++
16945 ++ subtp = tcp_sk(subsk);
16946 ++ mss_now = tcp_current_mss(subsk);
16947 ++
16948 ++ if (reinject == 1) {
16949 ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
16950 ++ /* Segment already reached the peer, take the next one */
16951 ++ __skb_unlink(skb, &mpcb->reinject_queue);
16952 ++ __kfree_skb(skb);
16953 ++ continue;
16954 ++ }
16955 ++ }
16956 ++
16957 ++ /* If the segment was cloned (e.g. a meta retransmission),
16958 ++ * the header must be expanded/copied so that there is no
16959 ++ * corruption of TSO information.
16960 ++ */
16961 ++ if (skb_unclone(skb, GFP_ATOMIC))
16962 ++ break;
16963 ++
16964 ++ if (unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now)))
16965 ++ break;
16966 ++
16967 ++ /* Force tso_segs to 1 by using UINT_MAX.
16968 ++ * We actually don't care about the exact number of segments
16969 ++		 * emitted on the subflow. We just need to set tso_segs, because
16970 ++ * we still need an accurate packets_out count in
16971 ++ * tcp_event_new_data_sent.
16972 ++ */
16973 ++ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX);
16974 ++
16975 ++		/* Check for nagle, regardless of tso_segs. If the segment is
16976 ++ * actually larger than mss_now (TSO segment), then
16977 ++ * tcp_nagle_check will have partial == false and always trigger
16978 ++ * the transmission.
16979 ++ * tcp_write_xmit has a TSO-level nagle check which is not
16980 ++ * subject to the MPTCP-level. It is based on the properties of
16981 ++ * the subflow, not the MPTCP-level.
16982 ++ */
16983 ++ if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now,
16984 ++ (tcp_skb_is_last(meta_sk, skb) ?
16985 ++ nonagle : TCP_NAGLE_PUSH))))
16986 ++ break;
16987 ++
16988 ++ limit = mss_now;
16989 ++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in
16990 ++ * tcp_write_xmit. Otherwise split-point would return 0.
16991 ++ */
16992 ++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
16993 ++ /* We limit the size of the skb so that it fits into the
16994 ++ * window. Call tcp_mss_split_point to avoid duplicating
16995 ++ * code.
16996 ++ * We really only care about fitting the skb into the
16997 ++ * window. That's why we use UINT_MAX. If the skb does
16998 ++ * not fit into the cwnd_quota or the NIC's max-segs
16999 ++ * limitation, it will be split by the subflow's
17000 ++ * tcp_write_xmit which does the appropriate call to
17001 ++ * tcp_mss_split_point.
17002 ++ */
17003 ++ limit = tcp_mss_split_point(meta_sk, skb, mss_now,
17004 ++ UINT_MAX / mss_now,
17005 ++ nonagle);
17006 ++
17007 ++ if (sublimit)
17008 ++ limit = min(limit, sublimit);
17009 ++
17010 ++ if (skb->len > limit &&
17011 ++ unlikely(mptcp_fragment(meta_sk, skb, limit, gfp, reinject)))
17012 ++ break;
17013 ++
17014 ++ if (!mptcp_skb_entail(subsk, skb, reinject))
17015 ++ break;
17016 ++ /* Nagle is handled at the MPTCP-layer, so
17017 ++ * always push on the subflow
17018 ++ */
17019 ++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
17020 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
17021 ++
17022 ++ if (!reinject) {
17023 ++ mptcp_check_sndseq_wrap(meta_tp,
17024 ++ TCP_SKB_CB(skb)->end_seq -
17025 ++ TCP_SKB_CB(skb)->seq);
17026 ++ tcp_event_new_data_sent(meta_sk, skb);
17027 ++ }
17028 ++
17029 ++ tcp_minshall_update(meta_tp, mss_now, skb);
17030 ++ sent_pkts += tcp_skb_pcount(skb);
17031 ++
17032 ++ if (reinject > 0) {
17033 ++ __skb_unlink(skb, &mpcb->reinject_queue);
17034 ++ kfree_skb(skb);
17035 ++ }
17036 ++
17037 ++ if (push_one)
17038 ++ break;
17039 ++ }
17040 ++
17041 ++ return !meta_tp->packets_out && tcp_send_head(meta_sk);
17042 ++}
17043 ++
17044 ++void mptcp_write_space(struct sock *sk)
17045 ++{
17046 ++ mptcp_push_pending_frames(mptcp_meta_sk(sk));
17047 ++}
17048 ++
17049 ++u32 __mptcp_select_window(struct sock *sk)
17050 ++{
17051 ++ struct inet_connection_sock *icsk = inet_csk(sk);
17052 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
17053 ++ int mss, free_space, full_space, window;
17054 ++
17055 ++ /* MSS for the peer's data. Previous versions used mss_clamp
17056 ++ * here. I don't know if the value based on our guesses
17057 ++ * of peer's MSS is better for the performance. It's more correct
17058 ++ * but may be worse for the performance because of rcv_mss
17059 ++ * fluctuations. --SAW 1998/11/1
17060 ++ */
17061 ++ mss = icsk->icsk_ack.rcv_mss;
17062 ++ free_space = tcp_space(sk);
17063 ++ full_space = min_t(int, meta_tp->window_clamp,
17064 ++ tcp_full_space(sk));
17065 ++
17066 ++ if (mss > full_space)
17067 ++ mss = full_space;
17068 ++
17069 ++ if (free_space < (full_space >> 1)) {
17070 ++ icsk->icsk_ack.quick = 0;
17071 ++
17072 ++ if (tcp_memory_pressure)
17073 ++ /* TODO this has to be adapted when we support different
17074 ++ * MSS's among the subflows.
17075 ++ */
17076 ++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh,
17077 ++ 4U * meta_tp->advmss);
17078 ++
17079 ++ if (free_space < mss)
17080 ++ return 0;
17081 ++ }
17082 ++
17083 ++ if (free_space > meta_tp->rcv_ssthresh)
17084 ++ free_space = meta_tp->rcv_ssthresh;
17085 ++
17086 ++ /* Don't do rounding if we are using window scaling, since the
17087 ++ * scaled window will not line up with the MSS boundary anyway.
17088 ++ */
17089 ++ window = meta_tp->rcv_wnd;
17090 ++ if (tp->rx_opt.rcv_wscale) {
17091 ++ window = free_space;
17092 ++
17093 ++ /* Advertise enough space so that it won't get scaled away.
17094 ++		 * Important case: prevent zero window announcement if
17095 ++ * 1<<rcv_wscale > mss.
17096 ++ */
17097 ++ if (((window >> tp->rx_opt.rcv_wscale) << tp->
17098 ++ rx_opt.rcv_wscale) != window)
17099 ++ window = (((window >> tp->rx_opt.rcv_wscale) + 1)
17100 ++ << tp->rx_opt.rcv_wscale);
17101 ++ } else {
17102 ++ /* Get the largest window that is a nice multiple of mss.
17103 ++ * Window clamp already applied above.
17104 ++ * If our current window offering is within 1 mss of the
17105 ++ * free space we just keep it. This prevents the divide
17106 ++ * and multiply from happening most of the time.
17107 ++ * We also don't do any window rounding when the free space
17108 ++ * is too small.
17109 ++ */
17110 ++ if (window <= free_space - mss || window > free_space)
17111 ++ window = (free_space / mss) * mss;
17112 ++ else if (mss == full_space &&
17113 ++ free_space > window + (full_space >> 1))
17114 ++ window = free_space;
17115 ++ }
17116 ++
17117 ++ return window;
17118 ++}
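
The rounding in the scaled-window branch above is easy to check by hand: with rcv_wscale = 7 anything below 128 would be advertised as zero, so a non-aligned free space is rounded up to the next multiple of 1 << rcv_wscale. A small sketch of just that rounding step (the function name and sample values are made up for illustration):

#include <stdio.h>

static unsigned int round_up_to_wscale(unsigned int window, unsigned int wscale)
{
	/* mirror the check above: round up when the low bits
	 * would otherwise be scaled away
	 */
	if (((window >> wscale) << wscale) != window)
		window = ((window >> wscale) + 1) << wscale;
	return window;
}

int main(void)
{
	printf("%u\n", round_up_to_wscale(100, 7));	/* 128 instead of 0 */
	printf("%u\n", round_up_to_wscale(256, 7));	/* already aligned: 256 */
	return 0;
}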
17119 ++
17120 ++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
17121 ++ unsigned *remaining)
17122 ++{
17123 ++ const struct tcp_sock *tp = tcp_sk(sk);
17124 ++
17125 ++ opts->options |= OPTION_MPTCP;
17126 ++ if (is_master_tp(tp)) {
17127 ++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN;
17128 ++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
17129 ++ opts->mp_capable.sender_key = tp->mptcp_loc_key;
17130 ++ opts->dss_csum = !!sysctl_mptcp_checksum;
17131 ++ } else {
17132 ++ const struct mptcp_cb *mpcb = tp->mpcb;
17133 ++
17134 ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN;
17135 ++ *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN;
17136 ++ opts->mp_join_syns.token = mpcb->mptcp_rem_token;
17137 ++ opts->mp_join_syns.low_prio = tp->mptcp->low_prio;
17138 ++ opts->addr_id = tp->mptcp->loc_id;
17139 ++ opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce;
17140 ++ }
17141 ++}
17142 ++
17143 ++void mptcp_synack_options(struct request_sock *req,
17144 ++ struct tcp_out_options *opts, unsigned *remaining)
17145 ++{
17146 ++ struct mptcp_request_sock *mtreq;
17147 ++ mtreq = mptcp_rsk(req);
17148 ++
17149 ++ opts->options |= OPTION_MPTCP;
17150 ++ /* MPCB not yet set - thus it's a new MPTCP-session */
17151 ++ if (!mtreq->is_sub) {
17152 ++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK;
17153 ++ opts->mp_capable.sender_key = mtreq->mptcp_loc_key;
17154 ++ opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum;
17155 ++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
17156 ++ } else {
17157 ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
17158 ++ opts->mp_join_syns.sender_truncated_mac =
17159 ++ mtreq->mptcp_hash_tmac;
17160 ++ opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce;
17161 ++ opts->mp_join_syns.low_prio = mtreq->low_prio;
17162 ++ opts->addr_id = mtreq->loc_id;
17163 ++ *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN;
17164 ++ }
17165 ++}
17166 ++
17167 ++void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
17168 ++ struct tcp_out_options *opts, unsigned *size)
17169 ++{
17170 ++ struct tcp_sock *tp = tcp_sk(sk);
17171 ++ struct mptcp_cb *mpcb = tp->mpcb;
17172 ++ const struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
17173 ++
17174 ++ /* We are coming from tcp_current_mss with the meta_sk as an argument.
17175 ++ * It does not make sense to check for the options, because when the
17176 ++ * segment gets sent, another subflow will be chosen.
17177 ++ */
17178 ++ if (!skb && is_meta_sk(sk))
17179 ++ return;
17180 ++
17181 ++ /* In fallback mp_fail-mode, we have to repeat it until the fallback
17182 ++ * has been done by the sender
17183 ++ */
17184 ++ if (unlikely(tp->mptcp->send_mp_fail)) {
17185 ++ opts->options |= OPTION_MPTCP;
17186 ++ opts->mptcp_options |= OPTION_MP_FAIL;
17187 ++ *size += MPTCP_SUB_LEN_FAIL;
17188 ++ return;
17189 ++ }
17190 ++
17191 ++ if (unlikely(tp->send_mp_fclose)) {
17192 ++ opts->options |= OPTION_MPTCP;
17193 ++ opts->mptcp_options |= OPTION_MP_FCLOSE;
17194 ++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
17195 ++ *size += MPTCP_SUB_LEN_FCLOSE_ALIGN;
17196 ++ return;
17197 ++ }
17198 ++
17199 ++ /* 1. If we are the sender of the infinite-mapping, we need the
17200 ++ * MPTCPHDR_INF-flag, because a retransmission of the
17201 ++	 *    infinite-announcement still needs the mptcp-option.
17202 ++ *
17203 ++ * We need infinite_cutoff_seq, because retransmissions from before
17204 ++ * the infinite-cutoff-moment still need the MPTCP-signalling to stay
17205 ++ * consistent.
17206 ++ *
17207 ++ * 2. If we are the receiver of the infinite-mapping, we always skip
17208 ++ * mptcp-options, because acknowledgments from before the
17209 ++ * infinite-mapping point have already been sent out.
17210 ++ *
17211 ++ * I know, the whole infinite-mapping stuff is ugly...
17212 ++ *
17213 ++ * TODO: Handle wrapped data-sequence numbers
17214 ++ * (even if it's very unlikely)
17215 ++ */
17216 ++ if (unlikely(mpcb->infinite_mapping_snd) &&
17217 ++ ((mpcb->send_infinite_mapping && tcb &&
17218 ++ mptcp_is_data_seq(skb) &&
17219 ++ !(tcb->mptcp_flags & MPTCPHDR_INF) &&
17220 ++ !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) ||
17221 ++ !mpcb->send_infinite_mapping))
17222 ++ return;
17223 ++
17224 ++ if (unlikely(tp->mptcp->include_mpc)) {
17225 ++ opts->options |= OPTION_MPTCP;
17226 ++ opts->mptcp_options |= OPTION_MP_CAPABLE |
17227 ++ OPTION_TYPE_ACK;
17228 ++ *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN;
17229 ++ opts->mp_capable.sender_key = mpcb->mptcp_loc_key;
17230 ++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
17231 ++ opts->dss_csum = mpcb->dss_csum;
17232 ++
17233 ++ if (skb)
17234 ++ tp->mptcp->include_mpc = 0;
17235 ++ }
17236 ++ if (unlikely(tp->mptcp->pre_established)) {
17237 ++ opts->options |= OPTION_MPTCP;
17238 ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK;
17239 ++ *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN;
17240 ++ }
17241 ++
17242 ++ if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) {
17243 ++ opts->options |= OPTION_MPTCP;
17244 ++ opts->mptcp_options |= OPTION_DATA_ACK;
17245 ++ /* If !skb, we come from tcp_current_mss and thus we always
17246 ++ * assume that the DSS-option will be set for the data-packet.
17247 ++ */
17248 ++ if (skb && !mptcp_is_data_seq(skb)) {
17249 ++ *size += MPTCP_SUB_LEN_ACK_ALIGN;
17250 ++ } else {
17251 ++			/* It doesn't matter whether the csum is included or not:
17252 ++			 * it will be either 10 or 12, and thus aligned = 12.
17253 ++ */
17254 ++ *size += MPTCP_SUB_LEN_ACK_ALIGN +
17255 ++ MPTCP_SUB_LEN_SEQ_ALIGN;
17256 ++ }
17257 ++
17258 ++ *size += MPTCP_SUB_LEN_DSS_ALIGN;
17259 ++ }
17260 ++
17261 ++ if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal)
17262 ++ mpcb->pm_ops->addr_signal(sk, size, opts, skb);
17263 ++
17264 ++ if (unlikely(tp->mptcp->send_mp_prio) &&
17265 ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) {
17266 ++ opts->options |= OPTION_MPTCP;
17267 ++ opts->mptcp_options |= OPTION_MP_PRIO;
17268 ++ if (skb)
17269 ++ tp->mptcp->send_mp_prio = 0;
17270 ++ *size += MPTCP_SUB_LEN_PRIO_ALIGN;
17271 ++ }
17272 ++
17273 ++ return;
17274 ++}
17275 ++
17276 ++u16 mptcp_select_window(struct sock *sk)
17277 ++{
17278 ++ u16 new_win = tcp_select_window(sk);
17279 ++ struct tcp_sock *tp = tcp_sk(sk);
17280 ++ struct tcp_sock *meta_tp = mptcp_meta_tp(tp);
17281 ++
17282 ++ meta_tp->rcv_wnd = tp->rcv_wnd;
17283 ++ meta_tp->rcv_wup = meta_tp->rcv_nxt;
17284 ++
17285 ++ return new_win;
17286 ++}
17287 ++
17288 ++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
17289 ++ const struct tcp_out_options *opts,
17290 ++ struct sk_buff *skb)
17291 ++{
17292 ++ if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) {
17293 ++ struct mp_capable *mpc = (struct mp_capable *)ptr;
17294 ++
17295 ++ mpc->kind = TCPOPT_MPTCP;
17296 ++
17297 ++ if ((OPTION_TYPE_SYN & opts->mptcp_options) ||
17298 ++ (OPTION_TYPE_SYNACK & opts->mptcp_options)) {
17299 ++ mpc->sender_key = opts->mp_capable.sender_key;
17300 ++ mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
17301 ++ ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
17302 ++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
17303 ++ mpc->sender_key = opts->mp_capable.sender_key;
17304 ++ mpc->receiver_key = opts->mp_capable.receiver_key;
17305 ++ mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK;
17306 ++ ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2;
17307 ++ }
17308 ++
17309 ++ mpc->sub = MPTCP_SUB_CAPABLE;
17310 ++ mpc->ver = 0;
17311 ++ mpc->a = opts->dss_csum;
17312 ++ mpc->b = 0;
17313 ++ mpc->rsv = 0;
17314 ++ mpc->h = 1;
17315 ++ }
17316 ++
17317 ++ if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) {
17318 ++ struct mp_join *mpj = (struct mp_join *)ptr;
17319 ++
17320 ++ mpj->kind = TCPOPT_MPTCP;
17321 ++ mpj->sub = MPTCP_SUB_JOIN;
17322 ++ mpj->rsv = 0;
17323 ++
17324 ++ if (OPTION_TYPE_SYN & opts->mptcp_options) {
17325 ++ mpj->len = MPTCP_SUB_LEN_JOIN_SYN;
17326 ++ mpj->u.syn.token = opts->mp_join_syns.token;
17327 ++ mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce;
17328 ++ mpj->b = opts->mp_join_syns.low_prio;
17329 ++ mpj->addr_id = opts->addr_id;
17330 ++ ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2;
17331 ++ } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) {
17332 ++ mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK;
17333 ++ mpj->u.synack.mac =
17334 ++ opts->mp_join_syns.sender_truncated_mac;
17335 ++ mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce;
17336 ++ mpj->b = opts->mp_join_syns.low_prio;
17337 ++ mpj->addr_id = opts->addr_id;
17338 ++ ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2;
17339 ++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
17340 ++ mpj->len = MPTCP_SUB_LEN_JOIN_ACK;
17341 ++ mpj->addr_id = 0; /* addr_id is rsv (RFC 6824, p. 21) */
17342 ++ memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20);
17343 ++ ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2;
17344 ++ }
17345 ++ }
17346 ++ if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) {
17347 ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
17348 ++
17349 ++ mpadd->kind = TCPOPT_MPTCP;
17350 ++ if (opts->add_addr_v4) {
17351 ++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4;
17352 ++ mpadd->sub = MPTCP_SUB_ADD_ADDR;
17353 ++ mpadd->ipver = 4;
17354 ++ mpadd->addr_id = opts->add_addr4.addr_id;
17355 ++ mpadd->u.v4.addr = opts->add_addr4.addr;
17356 ++ ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2;
17357 ++ } else if (opts->add_addr_v6) {
17358 ++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6;
17359 ++ mpadd->sub = MPTCP_SUB_ADD_ADDR;
17360 ++ mpadd->ipver = 6;
17361 ++ mpadd->addr_id = opts->add_addr6.addr_id;
17362 ++ memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr,
17363 ++ sizeof(mpadd->u.v6.addr));
17364 ++ ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2;
17365 ++ }
17366 ++ }
17367 ++ if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) {
17368 ++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
17369 ++ u8 *addrs_id;
17370 ++ int id, len, len_align;
17371 ++
17372 ++ len = mptcp_sub_len_remove_addr(opts->remove_addrs);
17373 ++ len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs);
17374 ++
17375 ++ mprem->kind = TCPOPT_MPTCP;
17376 ++ mprem->len = len;
17377 ++ mprem->sub = MPTCP_SUB_REMOVE_ADDR;
17378 ++ mprem->rsv = 0;
17379 ++ addrs_id = &mprem->addrs_id;
17380 ++
17381 ++ mptcp_for_each_bit_set(opts->remove_addrs, id)
17382 ++ *(addrs_id++) = id;
17383 ++
17384 ++ /* Fill the rest with NOP's */
17385 ++ if (len_align > len) {
17386 ++ int i;
17387 ++ for (i = 0; i < len_align - len; i++)
17388 ++ *(addrs_id++) = TCPOPT_NOP;
17389 ++ }
17390 ++
17391 ++ ptr += len_align >> 2;
17392 ++ }
17393 ++ if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) {
17394 ++ struct mp_fail *mpfail = (struct mp_fail *)ptr;
17395 ++
17396 ++ mpfail->kind = TCPOPT_MPTCP;
17397 ++ mpfail->len = MPTCP_SUB_LEN_FAIL;
17398 ++ mpfail->sub = MPTCP_SUB_FAIL;
17399 ++ mpfail->rsv1 = 0;
17400 ++ mpfail->rsv2 = 0;
17401 ++ mpfail->data_seq = htonll(tp->mpcb->csum_cutoff_seq);
17402 ++
17403 ++ ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2;
17404 ++ }
17405 ++ if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) {
17406 ++ struct mp_fclose *mpfclose = (struct mp_fclose *)ptr;
17407 ++
17408 ++ mpfclose->kind = TCPOPT_MPTCP;
17409 ++ mpfclose->len = MPTCP_SUB_LEN_FCLOSE;
17410 ++ mpfclose->sub = MPTCP_SUB_FCLOSE;
17411 ++ mpfclose->rsv1 = 0;
17412 ++ mpfclose->rsv2 = 0;
17413 ++ mpfclose->key = opts->mp_capable.receiver_key;
17414 ++
17415 ++ ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2;
17416 ++ }
17417 ++
17418 ++ if (OPTION_DATA_ACK & opts->mptcp_options) {
17419 ++ if (!mptcp_is_data_seq(skb))
17420 ++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
17421 ++ else
17422 ++ ptr += mptcp_write_dss_data_seq(tp, skb, ptr);
17423 ++ }
17424 ++ if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) {
17425 ++ struct mp_prio *mpprio = (struct mp_prio *)ptr;
17426 ++
17427 ++ mpprio->kind = TCPOPT_MPTCP;
17428 ++ mpprio->len = MPTCP_SUB_LEN_PRIO;
17429 ++ mpprio->sub = MPTCP_SUB_PRIO;
17430 ++ mpprio->rsv = 0;
17431 ++ mpprio->b = tp->mptcp->low_prio;
17432 ++ mpprio->addr_id = TCPOPT_NOP;
17433 ++
17434 ++ ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2;
17435 ++ }
17436 ++}
17437 ++
17438 ++/* Sends the datafin */
17439 ++void mptcp_send_fin(struct sock *meta_sk)
17440 ++{
17441 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17442 ++ struct sk_buff *skb = tcp_write_queue_tail(meta_sk);
17443 ++ int mss_now;
17444 ++
17445 ++ if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
17446 ++ meta_tp->mpcb->passive_close = 1;
17447 ++
17448 ++ /* Optimization, tack on the FIN if we have a queue of
17449 ++ * unsent frames. But be careful about outgoing SACKS
17450 ++ * and IP options.
17451 ++ */
17452 ++ mss_now = mptcp_current_mss(meta_sk);
17453 ++
17454 ++ if (tcp_send_head(meta_sk) != NULL) {
17455 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
17456 ++ TCP_SKB_CB(skb)->end_seq++;
17457 ++ meta_tp->write_seq++;
17458 ++ } else {
17459 ++ /* Socket is locked, keep trying until memory is available. */
17460 ++ for (;;) {
17461 ++ skb = alloc_skb_fclone(MAX_TCP_HEADER,
17462 ++ meta_sk->sk_allocation);
17463 ++ if (skb)
17464 ++ break;
17465 ++ yield();
17466 ++ }
17467 ++ /* Reserve space for headers and prepare control bits. */
17468 ++ skb_reserve(skb, MAX_TCP_HEADER);
17469 ++
17470 ++ tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK);
17471 ++ TCP_SKB_CB(skb)->end_seq++;
17472 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
17473 ++ tcp_queue_skb(meta_sk, skb);
17474 ++ }
17475 ++ __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF);
17476 ++}
17477 ++
17478 ++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority)
17479 ++{
17480 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17481 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
17482 ++ struct sock *sk = NULL, *sk_it = NULL, *tmpsk;
17483 ++
17484 ++ if (!mpcb->cnt_subflows)
17485 ++ return;
17486 ++
17487 ++ WARN_ON(meta_tp->send_mp_fclose);
17488 ++
17489 ++ /* First - select a socket */
17490 ++ sk = mptcp_select_ack_sock(meta_sk);
17491 ++
17492 ++ /* May happen if no subflow is in an appropriate state */
17493 ++ if (!sk)
17494 ++ return;
17495 ++
17496 ++ /* We are in infinite mode - just send a reset */
17497 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) {
17498 ++ sk->sk_err = ECONNRESET;
17499 ++ if (tcp_need_reset(sk->sk_state))
17500 ++ tcp_send_active_reset(sk, priority);
17501 ++ mptcp_sub_force_close(sk);
17502 ++ return;
17503 ++ }
17504 ++
17505 ++
17506 ++ tcp_sk(sk)->send_mp_fclose = 1;
17507 ++	/* Reset all other subflows */
17508 ++
17509 ++ /* tcp_done must be handled with bh disabled */
17510 ++ if (!in_serving_softirq())
17511 ++ local_bh_disable();
17512 ++
17513 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
17514 ++ if (tcp_sk(sk_it)->send_mp_fclose)
17515 ++ continue;
17516 ++
17517 ++ sk_it->sk_err = ECONNRESET;
17518 ++ if (tcp_need_reset(sk_it->sk_state))
17519 ++ tcp_send_active_reset(sk_it, GFP_ATOMIC);
17520 ++ mptcp_sub_force_close(sk_it);
17521 ++ }
17522 ++
17523 ++ if (!in_serving_softirq())
17524 ++ local_bh_enable();
17525 ++
17526 ++ tcp_send_ack(sk);
17527 ++ inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto);
17528 ++
17529 ++ meta_tp->send_mp_fclose = 1;
17530 ++}
17531 ++
17532 ++static void mptcp_ack_retransmit_timer(struct sock *sk)
17533 ++{
17534 ++ struct sk_buff *skb;
17535 ++ struct tcp_sock *tp = tcp_sk(sk);
17536 ++ struct inet_connection_sock *icsk = inet_csk(sk);
17537 ++
17538 ++ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
17539 ++ goto out; /* Routing failure or similar */
17540 ++
17541 ++ if (!tp->retrans_stamp)
17542 ++ tp->retrans_stamp = tcp_time_stamp ? : 1;
17543 ++
17544 ++ if (tcp_write_timeout(sk)) {
17545 ++ tp->mptcp->pre_established = 0;
17546 ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
17547 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
17548 ++ goto out;
17549 ++ }
17550 ++
17551 ++ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
17552 ++ if (skb == NULL) {
17553 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
17554 ++ jiffies + icsk->icsk_rto);
17555 ++ return;
17556 ++ }
17557 ++
17558 ++ /* Reserve space for headers and prepare control bits */
17559 ++ skb_reserve(skb, MAX_TCP_HEADER);
17560 ++ tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK);
17561 ++
17562 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
17563 ++ if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) {
17564 ++ /* Retransmission failed because of local congestion,
17565 ++ * do not backoff.
17566 ++ */
17567 ++ if (!icsk->icsk_retransmits)
17568 ++ icsk->icsk_retransmits = 1;
17569 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
17570 ++ jiffies + icsk->icsk_rto);
17571 ++ return;
17572 ++ }
17573 ++
17574 ++
17575 ++ icsk->icsk_retransmits++;
17576 ++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
17577 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
17578 ++ jiffies + icsk->icsk_rto);
17579 ++ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
17580 ++ __sk_dst_reset(sk);
17581 ++
17582 ++out:;
17583 ++}
17584 ++
17585 ++void mptcp_ack_handler(unsigned long data)
17586 ++{
17587 ++ struct sock *sk = (struct sock *)data;
17588 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
17589 ++
17590 ++ bh_lock_sock(meta_sk);
17591 ++ if (sock_owned_by_user(meta_sk)) {
17592 ++ /* Try again later */
17593 ++ sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer,
17594 ++ jiffies + (HZ / 20));
17595 ++ goto out_unlock;
17596 ++ }
17597 ++
17598 ++ if (sk->sk_state == TCP_CLOSE)
17599 ++ goto out_unlock;
17600 ++ if (!tcp_sk(sk)->mptcp->pre_established)
17601 ++ goto out_unlock;
17602 ++
17603 ++ mptcp_ack_retransmit_timer(sk);
17604 ++
17605 ++ sk_mem_reclaim(sk);
17606 ++
17607 ++out_unlock:
17608 ++ bh_unlock_sock(meta_sk);
17609 ++ sock_put(sk);
17610 ++}
17611 ++
17612 ++/* Similar to tcp_retransmit_skb
17613 ++ *
17614 ++ * The diff is that we handle the retransmission-stats (retrans_stamp) at the
17615 ++ * meta-level.
17616 ++ */
17617 ++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb)
17618 ++{
17619 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17620 ++ struct sock *subsk;
17621 ++ unsigned int limit, mss_now;
17622 ++ int err = -1;
17623 ++
17624 ++	/* Do not send more than we queued. 1/4 is reserved for possible
17625 ++ * copying overhead: fragmentation, tunneling, mangling etc.
17626 ++ *
17627 ++ * This is a meta-retransmission thus we check on the meta-socket.
17628 ++ */
17629 ++ if (atomic_read(&meta_sk->sk_wmem_alloc) >
17630 ++ min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) {
17631 ++ return -EAGAIN;
17632 ++ }
17633 ++
17634 ++ /* We need to make sure that the retransmitted segment can be sent on a
17635 ++ * subflow right now. If it is too big, it needs to be fragmented.
17636 ++ */
17637 ++ subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, false);
17638 ++ if (!subsk) {
17639 ++ /* We want to increase icsk_retransmits, thus return 0, so that
17640 ++ * mptcp_retransmit_timer enters the desired branch.
17641 ++ */
17642 ++ err = 0;
17643 ++ goto failed;
17644 ++ }
17645 ++ mss_now = tcp_current_mss(subsk);
17646 ++
17647 ++ /* If the segment was cloned (e.g. a meta retransmission), the header
17648 ++ * must be expanded/copied so that there is no corruption of TSO
17649 ++ * information.
17650 ++ */
17651 ++ if (skb_unclone(skb, GFP_ATOMIC)) {
17652 ++ err = -ENOMEM;
17653 ++ goto failed;
17654 ++ }
17655 ++
17656 ++ /* Must have been set by mptcp_write_xmit before */
17657 ++ BUG_ON(!tcp_skb_pcount(skb));
17658 ++
17659 ++ limit = mss_now;
17660 ++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in
17661 ++ * tcp_write_xmit. Otherwise split-point would return 0.
17662 ++ */
17663 ++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
17664 ++ limit = tcp_mss_split_point(meta_sk, skb, mss_now,
17665 ++ UINT_MAX / mss_now,
17666 ++ TCP_NAGLE_OFF);
17667 ++
17668 ++ if (skb->len > limit &&
17669 ++ unlikely(mptcp_fragment(meta_sk, skb, limit,
17670 ++ GFP_ATOMIC, 0)))
17671 ++ goto failed;
17672 ++
17673 ++ if (!mptcp_skb_entail(subsk, skb, -1))
17674 ++ goto failed;
17675 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
17676 ++
17677 ++ /* Update global TCP statistics. */
17678 ++ TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS);
17679 ++
17680 ++ /* Diff to tcp_retransmit_skb */
17681 ++
17682 ++ /* Save stamp of the first retransmit. */
17683 ++ if (!meta_tp->retrans_stamp)
17684 ++ meta_tp->retrans_stamp = TCP_SKB_CB(skb)->when;
17685 ++
17686 ++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
17687 ++
17688 ++ return 0;
17689 ++
17690 ++failed:
17691 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPRETRANSFAIL);
17692 ++ return err;
17693 ++}
17694 ++
17695 ++/* Similar to tcp_retransmit_timer
17696 ++ *
17697 ++ * The diff is that we have to handle retransmissions of the FAST_CLOSE-message
17698 ++ * and that we don't have an srtt estimation at the meta-level.
17699 ++ */
17700 ++void mptcp_retransmit_timer(struct sock *meta_sk)
17701 ++{
17702 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17703 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
17704 ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
17705 ++ int err;
17706 ++
17707 ++ /* In fallback, retransmission is handled at the subflow-level */
17708 ++ if (!meta_tp->packets_out || mpcb->infinite_mapping_snd ||
17709 ++ mpcb->send_infinite_mapping)
17710 ++ return;
17711 ++
17712 ++ WARN_ON(tcp_write_queue_empty(meta_sk));
17713 ++
17714 ++ if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) &&
17715 ++ !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
17716 ++ /* Receiver dastardly shrinks window. Our retransmits
17717 ++ * become zero probes, but we should not timeout this
17718 ++ * connection. If the socket is an orphan, time it out,
17719 ++ * we cannot allow such beasts to hang infinitely.
17720 ++ */
17721 ++ struct inet_sock *meta_inet = inet_sk(meta_sk);
17722 ++ if (meta_sk->sk_family == AF_INET) {
17723 ++ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
17724 ++ &meta_inet->inet_daddr,
17725 ++ ntohs(meta_inet->inet_dport),
17726 ++ meta_inet->inet_num, meta_tp->snd_una,
17727 ++ meta_tp->snd_nxt);
17728 ++ }
17729 ++#if IS_ENABLED(CONFIG_IPV6)
17730 ++ else if (meta_sk->sk_family == AF_INET6) {
17731 ++ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
17732 ++ &meta_sk->sk_v6_daddr,
17733 ++ ntohs(meta_inet->inet_dport),
17734 ++ meta_inet->inet_num, meta_tp->snd_una,
17735 ++ meta_tp->snd_nxt);
17736 ++ }
17737 ++#endif
17738 ++ if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) {
17739 ++ tcp_write_err(meta_sk);
17740 ++ return;
17741 ++ }
17742 ++
17743 ++ mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
17744 ++ goto out_reset_timer;
17745 ++ }
17746 ++
17747 ++ if (tcp_write_timeout(meta_sk))
17748 ++ return;
17749 ++
17750 ++ if (meta_icsk->icsk_retransmits == 0)
17751 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS);
17752 ++
17753 ++ meta_icsk->icsk_ca_state = TCP_CA_Loss;
17754 ++
17755 ++ err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
17756 ++ if (err > 0) {
17757 ++ /* Retransmission failed because of local congestion,
17758 ++ * do not backoff.
17759 ++ */
17760 ++ if (!meta_icsk->icsk_retransmits)
17761 ++ meta_icsk->icsk_retransmits = 1;
17762 ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
17763 ++ min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
17764 ++ TCP_RTO_MAX);
17765 ++ return;
17766 ++ }
17767 ++
17768 ++ /* Increase the timeout each time we retransmit. Note that
17769 ++ * we do not increase the rtt estimate. rto is initialized
17770 ++ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
17771 ++ * that doubling rto each time is the least we can get away with.
17772 ++ * In KA9Q, Karn uses this for the first few times, and then
17773 ++ * goes to quadratic. netBSD doubles, but only goes up to *64,
17774 ++ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
17775 ++ * defined in the protocol as the maximum possible RTT. I guess
17776 ++ * we'll have to use something other than TCP to talk to the
17777 ++ * University of Mars.
17778 ++ *
17779 ++ * PAWS allows us longer timeouts and large windows, so once
17780 ++ * implemented ftp to mars will work nicely. We will have to fix
17781 ++ * the 120 second clamps though!
17782 ++ */
17783 ++ meta_icsk->icsk_backoff++;
17784 ++ meta_icsk->icsk_retransmits++;
17785 ++
17786 ++out_reset_timer:
17787 ++ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
17788 ++ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
17789 ++ * might be increased if the stream oscillates between thin and thick,
17790 ++ * thus the old value might already be too high compared to the value
17791 ++ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
17792 ++ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
17793 ++	 * exponential backoff behaviour, to avoid continuing to hammer
17794 ++ * linear-timeout retransmissions into a black hole
17795 ++ */
17796 ++ if (meta_sk->sk_state == TCP_ESTABLISHED &&
17797 ++ (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
17798 ++ tcp_stream_is_thin(meta_tp) &&
17799 ++ meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
17800 ++ meta_icsk->icsk_backoff = 0;
17801 ++ /* We cannot do the same as in tcp_write_timer because the
17802 ++ * srtt is not set here.
17803 ++ */
17804 ++ mptcp_set_rto(meta_sk);
17805 ++ } else {
17806 ++ /* Use normal (exponential) backoff */
17807 ++ meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX);
17808 ++ }
17809 ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX);
17810 ++
17811 ++ return;
17812 ++}
17813 ++
17814 ++/* Adjust the values to the MPTCP level for the initial window of new subflows */
17815 ++void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
17816 ++ __u32 *window_clamp, int wscale_ok,
17817 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
17818 ++ const struct sock *sk)
17819 ++{
17820 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
17821 ++
17822 ++ *window_clamp = mpcb->orig_window_clamp;
17823 ++ __space = tcp_win_from_space(mpcb->orig_sk_rcvbuf);
17824 ++
17825 ++ tcp_select_initial_window(__space, mss, rcv_wnd, window_clamp,
17826 ++ wscale_ok, rcv_wscale, init_rcv_wnd, sk);
17827 ++}
17828 ++
17829 ++static inline u64 mptcp_calc_rate(const struct sock *meta_sk, unsigned int mss,
17830 ++ unsigned int (*mss_cb)(struct sock *sk))
17831 ++{
17832 ++ struct sock *sk;
17833 ++ u64 rate = 0;
17834 ++
17835 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
17836 ++ struct tcp_sock *tp = tcp_sk(sk);
17837 ++ int this_mss;
17838 ++ u64 this_rate;
17839 ++
17840 ++ if (!mptcp_sk_can_send(sk))
17841 ++ continue;
17842 ++
17843 ++		/* Do not consider subflows without an RTT estimate yet,
17844 ++		 * otherwise this_rate >>> rate.
17845 ++ */
17846 ++ if (unlikely(!tp->srtt_us))
17847 ++ continue;
17848 ++
17849 ++ this_mss = mss_cb(sk);
17850 ++
17851 ++	 * be split into two (or more) when pushed on this subflow. If
17852 ++ * be splitted in two (or more) when pushed on this subflow. If
17853 ++ * you consider that mss = 1428 and this_mss = 1420 then two
17854 ++ * segments will be generated: a 1420-byte and 8-byte segment.
17855 ++ * The latter will introduce a large overhead as for a single
17856 ++ * data segment 2 slots will be used in the congestion window.
17857 ++	 * This roughly halves the potential throughput of this
17858 ++	 * subflow: 1428 bytes will be sent where 2840 could have been
17859 ++	 * sent if mss == 1420, a reduction by a factor of 2840 / 1428.
17860 ++ *
17861 ++	 * The following algorithm takes this overhead into account
17862 ++ * when computing the potential throughput that MPTCP can
17863 ++ * achieve when generating mss-byte segments.
17864 ++ *
17865 ++	 * The formula is the following:
17866 ++ * \sum_{\forall sub} ratio * \frac{mss * cwnd_sub}{rtt_sub}
17867 ++ * Where ratio is computed as follows:
17868 ++ * \frac{mss}{\ceil{mss / mss_sub} * mss_sub}
17869 ++ *
17870 ++ * ratio gives the reduction factor of the theoretical
17871 ++ * throughput a subflow can achieve if MPTCP uses a specific
17872 ++ * MSS value.
17873 ++ */
17874 ++ this_rate = div64_u64((u64)mss * mss * (USEC_PER_SEC << 3) *
17875 ++ max(tp->snd_cwnd, tp->packets_out),
17876 ++ (u64)tp->srtt_us *
17877 ++ DIV_ROUND_UP(mss, this_mss) * this_mss);
17878 ++ rate += this_rate;
17879 ++ }
17880 ++
17881 ++ return rate;
17882 ++}
17883 ++
17884 ++static unsigned int __mptcp_current_mss(const struct sock *meta_sk,
17885 ++ unsigned int (*mss_cb)(struct sock *sk))
17886 ++{
17887 ++ unsigned int mss = 0;
17888 ++ u64 rate = 0;
17889 ++ struct sock *sk;
17890 ++
17891 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
17892 ++ int this_mss;
17893 ++ u64 this_rate;
17894 ++
17895 ++ if (!mptcp_sk_can_send(sk))
17896 ++ continue;
17897 ++
17898 ++ this_mss = mss_cb(sk);
17899 ++
17900 ++ /* Same mss values will produce the same throughput. */
17901 ++ if (this_mss == mss)
17902 ++ continue;
17903 ++
17904 ++ /* See whether using this mss value can theoretically improve
17905 ++ * the performances.
17906 ++ */
17907 ++ this_rate = mptcp_calc_rate(meta_sk, this_mss, mss_cb);
17908 ++ if (this_rate >= rate) {
17909 ++ mss = this_mss;
17910 ++ rate = this_rate;
17911 ++ }
17912 ++ }
17913 ++
17914 ++ return mss;
17915 ++}
17916 ++
17917 ++unsigned int mptcp_current_mss(struct sock *meta_sk)
17918 ++{
17919 ++ unsigned int mss = __mptcp_current_mss(meta_sk, tcp_current_mss);
17920 ++
17921 ++ /* If no subflow is available, we take a default-mss from the
17922 ++ * meta-socket.
17923 ++ */
17924 ++ return !mss ? tcp_current_mss(meta_sk) : mss;
17925 ++}
17926 ++
17927 ++static unsigned int mptcp_select_size_mss(struct sock *sk)
17928 ++{
17929 ++ return tcp_sk(sk)->mss_cache;
17930 ++}
17931 ++
17932 ++int mptcp_select_size(const struct sock *meta_sk, bool sg)
17933 ++{
17934 ++ unsigned int mss = __mptcp_current_mss(meta_sk, mptcp_select_size_mss);
17935 ++
17936 ++ if (sg) {
17937 ++ if (mptcp_sk_can_gso(meta_sk)) {
17938 ++ mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
17939 ++ } else {
17940 ++ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
17941 ++
17942 ++ if (mss >= pgbreak &&
17943 ++ mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
17944 ++ mss = pgbreak;
17945 ++ }
17946 ++ }
17947 ++
17948 ++ return !mss ? tcp_sk(meta_sk)->mss_cache : mss;
17949 ++}
17950 ++
17951 ++int mptcp_check_snd_buf(const struct tcp_sock *tp)
17952 ++{
17953 ++ const struct sock *sk;
17954 ++ u32 rtt_max = tp->srtt_us;
17955 ++ u64 bw_est;
17956 ++
17957 ++ if (!tp->srtt_us)
17958 ++ return tp->reordering + 1;
17959 ++
17960 ++ mptcp_for_each_sk(tp->mpcb, sk) {
17961 ++ if (!mptcp_sk_can_send(sk))
17962 ++ continue;
17963 ++
17964 ++ if (rtt_max < tcp_sk(sk)->srtt_us)
17965 ++ rtt_max = tcp_sk(sk)->srtt_us;
17966 ++ }
17967 ++
17968 ++ bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16,
17969 ++ (u64)tp->srtt_us);
17970 ++
17971 ++ return max_t(unsigned int, (u32)(bw_est >> 16),
17972 ++ tp->reordering + 1);
17973 ++}
17974 ++
17975 ++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
17976 ++ int large_allowed)
17977 ++{
17978 ++ struct sock *sk;
17979 ++ u32 xmit_size_goal = 0;
17980 ++
17981 ++ if (large_allowed && mptcp_sk_can_gso(meta_sk)) {
17982 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
17983 ++ int this_size_goal;
17984 ++
17985 ++ if (!mptcp_sk_can_send(sk))
17986 ++ continue;
17987 ++
17988 ++ this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1);
17989 ++ if (this_size_goal > xmit_size_goal)
17990 ++ xmit_size_goal = this_size_goal;
17991 ++ }
17992 ++ }
17993 ++
17994 ++ return max(xmit_size_goal, mss_now);
17995 ++}
17996 ++
17997 ++/* Similar to tcp_trim_head - but we correctly copy the DSS-option */
17998 ++int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
17999 ++{
18000 ++ if (skb_cloned(skb)) {
18001 ++ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
18002 ++ return -ENOMEM;
18003 ++ }
18004 ++
18005 ++ __pskb_trim_head(skb, len);
18006 ++
18007 ++ TCP_SKB_CB(skb)->seq += len;
18008 ++ skb->ip_summed = CHECKSUM_PARTIAL;
18009 ++
18010 ++ skb->truesize -= len;
18011 ++ sk->sk_wmem_queued -= len;
18012 ++ sk_mem_uncharge(sk, len);
18013 ++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
18014 ++
18015 ++ /* Any change of skb->len requires recalculation of tso factor. */
18016 ++ if (tcp_skb_pcount(skb) > 1)
18017 ++ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
18018 ++
18019 ++ return 0;
18020 ++}
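
To make the throughput ratio used by mptcp_calc_rate() above concrete, here is a short worked example with the same numbers as its comment (illustrative only):

    \text{ratio} = \frac{mss}{\lceil mss / mss_{sub} \rceil \cdot mss_{sub}}, \qquad
    \text{rate} = \sum_{\forall sub} \text{ratio} \cdot \frac{mss \cdot cwnd_{sub}}{rtt_{sub}}

For mss = 1428 and mss_sub = 1420, \lceil 1428 / 1420 \rceil = 2, so

    \text{ratio} = \frac{1428}{2 \cdot 1420} = \frac{1428}{2840} \approx 0.50

i.e. generating 1428-byte meta-segments roughly halves what this subflow could carry, while mss == mss_sub gives ratio = 1 and no penalty. In the code, the (USEC_PER_SEC << 3) factor cancels the 3-bit scaling of srtt_us, so this_rate stays in integer arithmetic.
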
18021 +diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c
18022 +new file mode 100644
18023 +index 000000000000..9542f950729f
18024 +--- /dev/null
18025 ++++ b/net/mptcp/mptcp_pm.c
18026 +@@ -0,0 +1,169 @@
18027 ++/*
18028 ++ * MPTCP implementation - MPTCP-subflow-management
18029 ++ *
18030 ++ * Initial Design & Implementation:
18031 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
18032 ++ *
18033 ++ * Current Maintainer & Author:
18034 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
18035 ++ *
18036 ++ * Additional authors:
18037 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
18038 ++ * Gregory Detal <gregory.detal@×××××××××.be>
18039 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
18040 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
18041 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
18042 ++ * Andreas Ripke <ripke@××××××.eu>
18043 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
18044 ++ * Octavian Purdila <octavian.purdila@×××××.com>
18045 ++ * John Ronan <jronan@××××.org>
18046 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
18047 ++ * Brandon Heller <brandonh@××××××××.edu>
18048 ++ *
18049 ++ *
18050 ++ * This program is free software; you can redistribute it and/or
18051 ++ * modify it under the terms of the GNU General Public License
18052 ++ * as published by the Free Software Foundation; either version
18053 ++ * 2 of the License, or (at your option) any later version.
18054 ++ */
18055 ++
18056 ++
18057 ++#include <linux/module.h>
18058 ++#include <net/mptcp.h>
18059 ++
18060 ++static DEFINE_SPINLOCK(mptcp_pm_list_lock);
18061 ++static LIST_HEAD(mptcp_pm_list);
18062 ++
18063 ++static int mptcp_default_id(sa_family_t family, union inet_addr *addr,
18064 ++ struct net *net, bool *low_prio)
18065 ++{
18066 ++ return 0;
18067 ++}
18068 ++
18069 ++struct mptcp_pm_ops mptcp_pm_default = {
18070 ++ .get_local_id = mptcp_default_id, /* We do not care */
18071 ++ .name = "default",
18072 ++ .owner = THIS_MODULE,
18073 ++};
18074 ++
18075 ++static struct mptcp_pm_ops *mptcp_pm_find(const char *name)
18076 ++{
18077 ++ struct mptcp_pm_ops *e;
18078 ++
18079 ++ list_for_each_entry_rcu(e, &mptcp_pm_list, list) {
18080 ++ if (strcmp(e->name, name) == 0)
18081 ++ return e;
18082 ++ }
18083 ++
18084 ++ return NULL;
18085 ++}
18086 ++
18087 ++int mptcp_register_path_manager(struct mptcp_pm_ops *pm)
18088 ++{
18089 ++ int ret = 0;
18090 ++
18091 ++ if (!pm->get_local_id)
18092 ++ return -EINVAL;
18093 ++
18094 ++ spin_lock(&mptcp_pm_list_lock);
18095 ++ if (mptcp_pm_find(pm->name)) {
18096 ++ pr_notice("%s already registered\n", pm->name);
18097 ++ ret = -EEXIST;
18098 ++ } else {
18099 ++ list_add_tail_rcu(&pm->list, &mptcp_pm_list);
18100 ++ pr_info("%s registered\n", pm->name);
18101 ++ }
18102 ++ spin_unlock(&mptcp_pm_list_lock);
18103 ++
18104 ++ return ret;
18105 ++}
18106 ++EXPORT_SYMBOL_GPL(mptcp_register_path_manager);
18107 ++
18108 ++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm)
18109 ++{
18110 ++ spin_lock(&mptcp_pm_list_lock);
18111 ++ list_del_rcu(&pm->list);
18112 ++ spin_unlock(&mptcp_pm_list_lock);
18113 ++}
18114 ++EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager);
18115 ++
18116 ++void mptcp_get_default_path_manager(char *name)
18117 ++{
18118 ++ struct mptcp_pm_ops *pm;
18119 ++
18120 ++ BUG_ON(list_empty(&mptcp_pm_list));
18121 ++
18122 ++ rcu_read_lock();
18123 ++ pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list);
18124 ++ strncpy(name, pm->name, MPTCP_PM_NAME_MAX);
18125 ++ rcu_read_unlock();
18126 ++}
18127 ++
18128 ++int mptcp_set_default_path_manager(const char *name)
18129 ++{
18130 ++ struct mptcp_pm_ops *pm;
18131 ++ int ret = -ENOENT;
18132 ++
18133 ++ spin_lock(&mptcp_pm_list_lock);
18134 ++ pm = mptcp_pm_find(name);
18135 ++#ifdef CONFIG_MODULES
18136 ++ if (!pm && capable(CAP_NET_ADMIN)) {
18137 ++ spin_unlock(&mptcp_pm_list_lock);
18138 ++
18139 ++ request_module("mptcp_%s", name);
18140 ++ spin_lock(&mptcp_pm_list_lock);
18141 ++ pm = mptcp_pm_find(name);
18142 ++ }
18143 ++#endif
18144 ++
18145 ++ if (pm) {
18146 ++ list_move(&pm->list, &mptcp_pm_list);
18147 ++ ret = 0;
18148 ++ } else {
18149 ++ pr_info("%s is not available\n", name);
18150 ++ }
18151 ++ spin_unlock(&mptcp_pm_list_lock);
18152 ++
18153 ++ return ret;
18154 ++}
18155 ++
18156 ++void mptcp_init_path_manager(struct mptcp_cb *mpcb)
18157 ++{
18158 ++ struct mptcp_pm_ops *pm;
18159 ++
18160 ++ rcu_read_lock();
18161 ++ list_for_each_entry_rcu(pm, &mptcp_pm_list, list) {
18162 ++ if (try_module_get(pm->owner)) {
18163 ++ mpcb->pm_ops = pm;
18164 ++ break;
18165 ++ }
18166 ++ }
18167 ++ rcu_read_unlock();
18168 ++}
18169 ++
18170 ++/* Manage refcounts on socket close. */
18171 ++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb)
18172 ++{
18173 ++ module_put(mpcb->pm_ops->owner);
18174 ++}
18175 ++
18176 ++/* Fallback to the default path-manager. */
18177 ++void mptcp_fallback_default(struct mptcp_cb *mpcb)
18178 ++{
18179 ++ struct mptcp_pm_ops *pm;
18180 ++
18181 ++ mptcp_cleanup_path_manager(mpcb);
18182 ++ pm = mptcp_pm_find("default");
18183 ++
18184 ++ /* Cannot fail - it's the default module */
18185 ++ try_module_get(pm->owner);
18186 ++ mpcb->pm_ops = pm;
18187 ++}
18188 ++EXPORT_SYMBOL_GPL(mptcp_fallback_default);
18189 ++
18190 ++/* Set default value from kernel configuration at bootup */
18191 ++static int __init mptcp_path_manager_default(void)
18192 ++{
18193 ++ return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM);
18194 ++}
18195 ++late_initcall(mptcp_path_manager_default);
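
The registration interface added in mptcp_pm.c above is enough to write an out-of-tree path manager. The sketch below is a hypothetical module (the "noop" name, the trivial policy and all identifiers prefixed noop_ are invented for illustration; the ops fields and the register/unregister calls are the ones defined above):

/* Minimal sketch of a custom MPTCP path manager: it never announces extra
 * addresses and always reports local address-id 0, mirroring the built-in
 * "default" manager above.  Illustration only.
 */
#include <linux/module.h>
#include <net/mptcp.h>

static int noop_get_local_id(sa_family_t family, union inet_addr *addr,
			     struct net *net, bool *low_prio)
{
	return 0;	/* same trivial policy as mptcp_default_id() */
}

static struct mptcp_pm_ops noop_pm = {
	.get_local_id	= noop_get_local_id,
	.name		= "noop",
	.owner		= THIS_MODULE,
};

static int __init noop_pm_init(void)
{
	return mptcp_register_path_manager(&noop_pm);
}

static void __exit noop_pm_exit(void)
{
	mptcp_unregister_path_manager(&noop_pm);
}

module_init(noop_pm_init);
module_exit(noop_pm_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("No-op MPTCP path manager (example)");

Built as mptcp_noop.ko, such a module could then be autoloaded through the request_module("mptcp_%s", name) fallback in mptcp_set_default_path_manager() above when "noop" is requested.
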
18196 +diff --git a/net/mptcp/mptcp_rr.c b/net/mptcp/mptcp_rr.c
18197 +new file mode 100644
18198 +index 000000000000..93278f684069
18199 +--- /dev/null
18200 ++++ b/net/mptcp/mptcp_rr.c
18201 +@@ -0,0 +1,301 @@
18202 ++/* MPTCP round-robin scheduler. Highly inspired by tcp_cong.c */
18203 ++
18204 ++#include <linux/module.h>
18205 ++#include <net/mptcp.h>
18206 ++
18207 ++static unsigned char num_segments __read_mostly = 1;
18208 ++module_param(num_segments, byte, 0644);
18209 ++MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst");
18210 ++
18211 ++static bool cwnd_limited __read_mostly = 1;
18212 ++module_param(cwnd_limited, bool, 0644);
18213 ++MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows");
18214 ++
18215 ++struct rrsched_priv {
18216 ++ unsigned char quota;
18217 ++};
18218 ++
18219 ++static struct rrsched_priv *rrsched_get_priv(const struct tcp_sock *tp)
18220 ++{
18221 ++ return (struct rrsched_priv *)&tp->mptcp->mptcp_sched[0];
18222 ++}
18223 ++
18224 ++/* Is the sub-socket sk available to send the skb? */
18225 ++static bool mptcp_rr_is_available(const struct sock *sk, const struct sk_buff *skb,
18226 ++ bool zero_wnd_test, bool cwnd_test)
18227 ++{
18228 ++ const struct tcp_sock *tp = tcp_sk(sk);
18229 ++ unsigned int space, in_flight;
18230 ++
18231 ++ /* Set of states for which we are allowed to send data */
18232 ++ if (!mptcp_sk_can_send(sk))
18233 ++ return false;
18234 ++
18235 ++ /* We do not send data on this subflow unless it is
18236 ++ * fully established, i.e. the 4th ack has been received.
18237 ++ */
18238 ++ if (tp->mptcp->pre_established)
18239 ++ return false;
18240 ++
18241 ++ if (tp->pf)
18242 ++ return false;
18243 ++
18244 ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
18245 ++ /* If SACK is disabled, and we got a loss, TCP does not exit
18246 ++ * the loss-state until something above high_seq has been acked.
18247 ++ * (see tcp_try_undo_recovery)
18248 ++ *
18249 ++ * high_seq is the snd_nxt at the moment of the RTO. As soon
18250 ++ * as we have an RTO, we won't push data on the subflow.
18251 ++ * Thus, snd_una can never go beyond high_seq.
18252 ++ */
18253 ++ if (!tcp_is_reno(tp))
18254 ++ return false;
18255 ++ else if (tp->snd_una != tp->high_seq)
18256 ++ return false;
18257 ++ }
18258 ++
18259 ++ if (!tp->mptcp->fully_established) {
18260 ++ /* Make sure that we send in-order data */
18261 ++ if (skb && tp->mptcp->second_packet &&
18262 ++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
18263 ++ return false;
18264 ++ }
18265 ++
18266 ++ if (!cwnd_test)
18267 ++ goto zero_wnd_test;
18268 ++
18269 ++ in_flight = tcp_packets_in_flight(tp);
18270 ++ /* Not even a single spot in the cwnd */
18271 ++ if (in_flight >= tp->snd_cwnd)
18272 ++ return false;
18273 ++
18274 ++ /* Now, check if what is queued in the subflow's send-queue
18275 ++ * already fills the cwnd.
18276 ++ */
18277 ++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
18278 ++
18279 ++ if (tp->write_seq - tp->snd_nxt > space)
18280 ++ return false;
18281 ++
18282 ++zero_wnd_test:
18283 ++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
18284 ++ return false;
18285 ++
18286 ++ return true;
18287 ++}
18288 ++
18289 ++/* Are we not allowed to reinject this skb on tp? */
18290 ++static int mptcp_rr_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
18291 ++{
18292 ++ /* If the skb has already been enqueued in this sk, try to find
18293 ++ * another one.
18294 ++ */
18295 ++ return skb &&
18296 ++ /* Has the skb already been enqueued into this subsocket? */
18297 ++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
18298 ++}
18299 ++
18300 ++/* We just look for any subflow that is available */
18301 ++static struct sock *rr_get_available_subflow(struct sock *meta_sk,
18302 ++ struct sk_buff *skb,
18303 ++ bool zero_wnd_test)
18304 ++{
18305 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18306 ++ struct sock *sk, *bestsk = NULL, *backupsk = NULL;
18307 ++
18308 ++ /* Answer data_fin on same subflow!!! */
18309 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
18310 ++ skb && mptcp_is_data_fin(skb)) {
18311 ++ mptcp_for_each_sk(mpcb, sk) {
18312 ++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
18313 ++ mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
18314 ++ return sk;
18315 ++ }
18316 ++ }
18317 ++
18318 ++ /* First, find the best subflow */
18319 ++ mptcp_for_each_sk(mpcb, sk) {
18320 ++ struct tcp_sock *tp = tcp_sk(sk);
18321 ++
18322 ++ if (!mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
18323 ++ continue;
18324 ++
18325 ++ if (mptcp_rr_dont_reinject_skb(tp, skb)) {
18326 ++ backupsk = sk;
18327 ++ continue;
18328 ++ }
18329 ++
18330 ++ bestsk = sk;
18331 ++ }
18332 ++
18333 ++ if (bestsk) {
18334 ++ sk = bestsk;
18335 ++ } else if (backupsk) {
18336 ++ /* It has been sent on all subflows once - let's give it a
18337 ++ * chance again by restarting its pathmask.
18338 ++ */
18339 ++ if (skb)
18340 ++ TCP_SKB_CB(skb)->path_mask = 0;
18341 ++ sk = backupsk;
18342 ++ }
18343 ++
18344 ++ return sk;
18345 ++}
18346 ++
18347 ++/* Returns the next segment to be sent from the mptcp meta-queue.
18348 ++ * (chooses the reinject queue if any segment is waiting in it, otherwise,
18349 ++ * chooses the normal write queue).
18350 ++ * Sets *@reinject to 1 if the returned segment comes from the
18351 ++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
18352 ++ * and sets it to -1 if it is a meta-level retransmission to optimize the
18353 ++ * receive-buffer.
18354 ++ */
18355 ++static struct sk_buff *__mptcp_rr_next_segment(const struct sock *meta_sk, int *reinject)
18356 ++{
18357 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18358 ++ struct sk_buff *skb = NULL;
18359 ++
18360 ++ *reinject = 0;
18361 ++
18362 ++ /* If we are in fallback-mode, just take from the meta-send-queue */
18363 ++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
18364 ++ return tcp_send_head(meta_sk);
18365 ++
18366 ++ skb = skb_peek(&mpcb->reinject_queue);
18367 ++
18368 ++ if (skb)
18369 ++ *reinject = 1;
18370 ++ else
18371 ++ skb = tcp_send_head(meta_sk);
18372 ++ return skb;
18373 ++}
18374 ++
18375 ++static struct sk_buff *mptcp_rr_next_segment(struct sock *meta_sk,
18376 ++ int *reinject,
18377 ++ struct sock **subsk,
18378 ++ unsigned int *limit)
18379 ++{
18380 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18381 ++ struct sock *sk_it, *choose_sk = NULL;
18382 ++ struct sk_buff *skb = __mptcp_rr_next_segment(meta_sk, reinject);
18383 ++ unsigned char split = num_segments;
18384 ++ unsigned char iter = 0, full_subs = 0;
18385 ++
18386 ++ /* As we set it, we have to reset it as well. */
18387 ++ *limit = 0;
18388 ++
18389 ++ if (!skb)
18390 ++ return NULL;
18391 ++
18392 ++ if (*reinject) {
18393 ++ *subsk = rr_get_available_subflow(meta_sk, skb, false);
18394 ++ if (!*subsk)
18395 ++ return NULL;
18396 ++
18397 ++ return skb;
18398 ++ }
18399 ++
18400 ++retry:
18401 ++
18402 ++	/* First, we look for a subflow that is currently being used */
18403 ++ mptcp_for_each_sk(mpcb, sk_it) {
18404 ++ struct tcp_sock *tp_it = tcp_sk(sk_it);
18405 ++ struct rrsched_priv *rsp = rrsched_get_priv(tp_it);
18406 ++
18407 ++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
18408 ++ continue;
18409 ++
18410 ++ iter++;
18411 ++
18412 ++ /* Is this subflow currently being used? */
18413 ++ if (rsp->quota > 0 && rsp->quota < num_segments) {
18414 ++ split = num_segments - rsp->quota;
18415 ++ choose_sk = sk_it;
18416 ++ goto found;
18417 ++ }
18418 ++
18419 ++ /* Or, it's totally unused */
18420 ++ if (!rsp->quota) {
18421 ++ split = num_segments;
18422 ++ choose_sk = sk_it;
18423 ++ }
18424 ++
18425 ++ /* Or, it must then be fully used */
18426 ++ if (rsp->quota == num_segments)
18427 ++ full_subs++;
18428 ++ }
18429 ++
18430 ++ /* All considered subflows have a full quota, and we considered at
18431 ++ * least one.
18432 ++ */
18433 ++ if (iter && iter == full_subs) {
18434 ++		/* So, we restart this round by setting quota to 0 and try again
18435 ++ * to find a subflow.
18436 ++ */
18437 ++ mptcp_for_each_sk(mpcb, sk_it) {
18438 ++ struct tcp_sock *tp_it = tcp_sk(sk_it);
18439 ++ struct rrsched_priv *rsp = rrsched_get_priv(tp_it);
18440 ++
18441 ++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
18442 ++ continue;
18443 ++
18444 ++ rsp->quota = 0;
18445 ++ }
18446 ++
18447 ++ goto retry;
18448 ++ }
18449 ++
18450 ++found:
18451 ++ if (choose_sk) {
18452 ++ unsigned int mss_now;
18453 ++ struct tcp_sock *choose_tp = tcp_sk(choose_sk);
18454 ++ struct rrsched_priv *rsp = rrsched_get_priv(choose_tp);
18455 ++
18456 ++ if (!mptcp_rr_is_available(choose_sk, skb, false, true))
18457 ++ return NULL;
18458 ++
18459 ++ *subsk = choose_sk;
18460 ++ mss_now = tcp_current_mss(*subsk);
18461 ++ *limit = split * mss_now;
18462 ++
18463 ++ if (skb->len > mss_now)
18464 ++ rsp->quota += DIV_ROUND_UP(skb->len, mss_now);
18465 ++ else
18466 ++ rsp->quota++;
18467 ++
18468 ++ return skb;
18469 ++ }
18470 ++
18471 ++ return NULL;
18472 ++}
18473 ++
18474 ++static struct mptcp_sched_ops mptcp_sched_rr = {
18475 ++ .get_subflow = rr_get_available_subflow,
18476 ++ .next_segment = mptcp_rr_next_segment,
18477 ++ .name = "roundrobin",
18478 ++ .owner = THIS_MODULE,
18479 ++};
18480 ++
18481 ++static int __init rr_register(void)
18482 ++{
18483 ++ BUILD_BUG_ON(sizeof(struct rrsched_priv) > MPTCP_SCHED_SIZE);
18484 ++
18485 ++ if (mptcp_register_scheduler(&mptcp_sched_rr))
18486 ++ return -1;
18487 ++
18488 ++ return 0;
18489 ++}
18490 ++
18491 ++static void rr_unregister(void)
18492 ++{
18493 ++ mptcp_unregister_scheduler(&mptcp_sched_rr);
18494 ++}
18495 ++
18496 ++module_init(rr_register);
18497 ++module_exit(rr_unregister);
18498 ++
18499 ++MODULE_AUTHOR("Christoph Paasch");
18500 ++MODULE_LICENSE("GPL");
18501 ++MODULE_DESCRIPTION("ROUNDROBIN MPTCP");
18502 ++MODULE_VERSION("0.89");
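
The burst accounting in mptcp_rr_next_segment() above is easier to follow in isolation. The following stand-alone user-space sketch replays only the quota logic; subflow availability checks, the reinject queue and the per-skb DIV_ROUND_UP accounting are deliberately left out, and every name in it is invented for the example:

#include <stdio.h>

#define NUM_SUBFLOWS 3
static const unsigned int num_segments = 2;	/* burst length, as the module parameter above */
static unsigned int quota[NUM_SUBFLOWS];

/* Pick the subflow for the next segment, mirroring the quota bookkeeping of
 * mptcp_rr_next_segment(): prefer a subflow with a partially used burst,
 * otherwise an unused one, and reset all quotas once every burst is full.
 */
static int rr_pick(void)
{
	int i, iter, full, choose;

retry:
	iter = 0;
	full = 0;
	choose = -1;
	for (i = 0; i < NUM_SUBFLOWS; i++) {
		iter++;				/* every subflow counts as available here */
		if (quota[i] > 0 && quota[i] < num_segments)
			return i;		/* keep filling the current burst */
		if (quota[i] == 0)
			choose = i;		/* an unused subflow is a candidate */
		if (quota[i] == num_segments)
			full++;
	}
	if (iter && iter == full) {		/* every subflow used up its burst */
		for (i = 0; i < NUM_SUBFLOWS; i++)
			quota[i] = 0;
		goto retry;
	}
	return choose;
}

int main(void)
{
	int seg;

	for (seg = 0; seg < 12; seg++) {
		int sk = rr_pick();

		quota[sk]++;			/* one MSS-sized segment per pick */
		printf("segment %2d -> subflow %d\n", seg, sk);
	}
	return 0;
}

With three always-available subflows and num_segments = 2 this emits two consecutive segments per subflow and resets all quotas once every burst is exhausted, which is the round-robin pattern the module implements per connection.
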
18503 +diff --git a/net/mptcp/mptcp_sched.c b/net/mptcp/mptcp_sched.c
18504 +new file mode 100644
18505 +index 000000000000..6c7ff4eceac1
18506 +--- /dev/null
18507 ++++ b/net/mptcp/mptcp_sched.c
18508 +@@ -0,0 +1,493 @@
18509 ++/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
18510 ++
18511 ++#include <linux/module.h>
18512 ++#include <net/mptcp.h>
18513 ++
18514 ++static DEFINE_SPINLOCK(mptcp_sched_list_lock);
18515 ++static LIST_HEAD(mptcp_sched_list);
18516 ++
18517 ++struct defsched_priv {
18518 ++ u32 last_rbuf_opti;
18519 ++};
18520 ++
18521 ++static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp)
18522 ++{
18523 ++ return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0];
18524 ++}
18525 ++
18526 ++/* Is the sub-socket sk available to send the skb? */
18527 ++static bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb,
18528 ++ bool zero_wnd_test)
18529 ++{
18530 ++ const struct tcp_sock *tp = tcp_sk(sk);
18531 ++ unsigned int mss_now, space, in_flight;
18532 ++
18533 ++ /* Set of states for which we are allowed to send data */
18534 ++ if (!mptcp_sk_can_send(sk))
18535 ++ return false;
18536 ++
18537 ++ /* We do not send data on this subflow unless it is
18538 ++ * fully established, i.e. the 4th ack has been received.
18539 ++ */
18540 ++ if (tp->mptcp->pre_established)
18541 ++ return false;
18542 ++
18543 ++ if (tp->pf)
18544 ++ return false;
18545 ++
18546 ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
18547 ++ /* If SACK is disabled, and we got a loss, TCP does not exit
18548 ++ * the loss-state until something above high_seq has been acked.
18549 ++ * (see tcp_try_undo_recovery)
18550 ++ *
18551 ++ * high_seq is the snd_nxt at the moment of the RTO. As soon
18552 ++ * as we have an RTO, we won't push data on the subflow.
18553 ++ * Thus, snd_una can never go beyond high_seq.
18554 ++ */
18555 ++ if (!tcp_is_reno(tp))
18556 ++ return false;
18557 ++ else if (tp->snd_una != tp->high_seq)
18558 ++ return false;
18559 ++ }
18560 ++
18561 ++ if (!tp->mptcp->fully_established) {
18562 ++ /* Make sure that we send in-order data */
18563 ++ if (skb && tp->mptcp->second_packet &&
18564 ++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
18565 ++ return false;
18566 ++ }
18567 ++
18568 ++ /* If TSQ is already throttling us, do not send on this subflow. When
18569 ++ * TSQ gets cleared the subflow becomes eligible again.
18570 ++ */
18571 ++ if (test_bit(TSQ_THROTTLED, &tp->tsq_flags))
18572 ++ return false;
18573 ++
18574 ++ in_flight = tcp_packets_in_flight(tp);
18575 ++ /* Not even a single spot in the cwnd */
18576 ++ if (in_flight >= tp->snd_cwnd)
18577 ++ return false;
18578 ++
18579 ++ /* Now, check if what is queued in the subflow's send-queue
18580 ++ * already fills the cwnd.
18581 ++ */
18582 ++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
18583 ++
18584 ++ if (tp->write_seq - tp->snd_nxt > space)
18585 ++ return false;
18586 ++
18587 ++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
18588 ++ return false;
18589 ++
18590 ++ mss_now = tcp_current_mss(sk);
18591 ++
18592 ++ /* Don't send on this subflow if we bypass the allowed send-window at
18593 ++	 * the per-subflow level. Similar to tcp_snd_wnd_test, but with a manually
18594 ++	 * calculated end_seq (because at this point end_seq is still at
18595 ++ * the meta-level).
18596 ++ */
18597 ++ if (skb && !zero_wnd_test &&
18598 ++ after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
18599 ++ return false;
18600 ++
18601 ++ return true;
18602 ++}
18603 ++
18604 ++/* Are we not allowed to reinject this skb on tp? */
18605 ++static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
18606 ++{
18607 ++ /* If the skb has already been enqueued in this sk, try to find
18608 ++ * another one.
18609 ++ */
18610 ++ return skb &&
18611 ++ /* Has the skb already been enqueued into this subsocket? */
18612 ++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
18613 ++}
18614 ++
18615 ++/* This is the scheduler. This function decides on which flow to send
18616 ++ * a given MSS. The flow is selected based on the shortest RTT. If all
18617 ++ * subflows are found to be busy, or if all paths have full congestion
18618 ++ * windows, we simply return NULL.
18619 ++ *
18620 ++ * Additionally, this function is aware of the backup-subflows.
18621 ++ */
18622 ++static struct sock *get_available_subflow(struct sock *meta_sk,
18623 ++ struct sk_buff *skb,
18624 ++ bool zero_wnd_test)
18625 ++{
18626 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18627 ++ struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL;
18628 ++ u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff;
18629 ++ int cnt_backups = 0;
18630 ++
18631 ++ /* if there is only one subflow, bypass the scheduling function */
18632 ++ if (mpcb->cnt_subflows == 1) {
18633 ++ bestsk = (struct sock *)mpcb->connection_list;
18634 ++ if (!mptcp_is_available(bestsk, skb, zero_wnd_test))
18635 ++ bestsk = NULL;
18636 ++ return bestsk;
18637 ++ }
18638 ++
18639 ++ /* Answer data_fin on same subflow!!! */
18640 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
18641 ++ skb && mptcp_is_data_fin(skb)) {
18642 ++ mptcp_for_each_sk(mpcb, sk) {
18643 ++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
18644 ++ mptcp_is_available(sk, skb, zero_wnd_test))
18645 ++ return sk;
18646 ++ }
18647 ++ }
18648 ++
18649 ++ /* First, find the best subflow */
18650 ++ mptcp_for_each_sk(mpcb, sk) {
18651 ++ struct tcp_sock *tp = tcp_sk(sk);
18652 ++
18653 ++ if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio)
18654 ++ cnt_backups++;
18655 ++
18656 ++ if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
18657 ++ tp->srtt_us < lowprio_min_time_to_peer) {
18658 ++ if (!mptcp_is_available(sk, skb, zero_wnd_test))
18659 ++ continue;
18660 ++
18661 ++ if (mptcp_dont_reinject_skb(tp, skb)) {
18662 ++ backupsk = sk;
18663 ++ continue;
18664 ++ }
18665 ++
18666 ++ lowprio_min_time_to_peer = tp->srtt_us;
18667 ++ lowpriosk = sk;
18668 ++ } else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
18669 ++ tp->srtt_us < min_time_to_peer) {
18670 ++ if (!mptcp_is_available(sk, skb, zero_wnd_test))
18671 ++ continue;
18672 ++
18673 ++ if (mptcp_dont_reinject_skb(tp, skb)) {
18674 ++ backupsk = sk;
18675 ++ continue;
18676 ++ }
18677 ++
18678 ++ min_time_to_peer = tp->srtt_us;
18679 ++ bestsk = sk;
18680 ++ }
18681 ++ }
18682 ++
18683 ++ if (mpcb->cnt_established == cnt_backups && lowpriosk) {
18684 ++ sk = lowpriosk;
18685 ++ } else if (bestsk) {
18686 ++ sk = bestsk;
18687 ++ } else if (backupsk) {
18688 ++ /* It has been sent on all subflows once - let's give it a
18689 ++ * chance again by restarting its pathmask.
18690 ++ */
18691 ++ if (skb)
18692 ++ TCP_SKB_CB(skb)->path_mask = 0;
18693 ++ sk = backupsk;
18694 ++ }
18695 ++
18696 ++ return sk;
18697 ++}
18698 ++
18699 ++static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal)
18700 ++{
18701 ++ struct sock *meta_sk;
18702 ++ const struct tcp_sock *tp = tcp_sk(sk);
18703 ++ struct tcp_sock *tp_it;
18704 ++ struct sk_buff *skb_head;
18705 ++ struct defsched_priv *dsp = defsched_get_priv(tp);
18706 ++
18707 ++ if (tp->mpcb->cnt_subflows == 1)
18708 ++ return NULL;
18709 ++
18710 ++ meta_sk = mptcp_meta_sk(sk);
18711 ++ skb_head = tcp_write_queue_head(meta_sk);
18712 ++
18713 ++ if (!skb_head || skb_head == tcp_send_head(meta_sk))
18714 ++ return NULL;
18715 ++
18716 ++	/* If penalization is optional (coming from mptcp_next_segment()) and
18717 ++	 * we are not send-buffer-limited, we do not penalize. The retransmission
18718 ++ * is just an optimization to fix the idle-time due to the delay before
18719 ++ * we wake up the application.
18720 ++ */
18721 ++ if (!penal && sk_stream_memory_free(meta_sk))
18722 ++ goto retrans;
18723 ++
18724 ++ /* Only penalize again after an RTT has elapsed */
18725 ++ if (tcp_time_stamp - dsp->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3))
18726 ++ goto retrans;
18727 ++
18728 ++	/* Halve the cwnd of the slow flow */
18729 ++ mptcp_for_each_tp(tp->mpcb, tp_it) {
18730 ++ if (tp_it != tp &&
18731 ++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
18732 ++ if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
18733 ++ tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);
18734 ++ if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH)
18735 ++ tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);
18736 ++
18737 ++ dsp->last_rbuf_opti = tcp_time_stamp;
18738 ++ }
18739 ++ break;
18740 ++ }
18741 ++ }
18742 ++
18743 ++retrans:
18744 ++
18745 ++ /* Segment not yet injected into this path? Take it!!! */
18746 ++ if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
18747 ++ bool do_retrans = false;
18748 ++ mptcp_for_each_tp(tp->mpcb, tp_it) {
18749 ++ if (tp_it != tp &&
18750 ++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
18751 ++ if (tp_it->snd_cwnd <= 4) {
18752 ++ do_retrans = true;
18753 ++ break;
18754 ++ }
18755 ++
18756 ++ if (4 * tp->srtt_us >= tp_it->srtt_us) {
18757 ++ do_retrans = false;
18758 ++ break;
18759 ++ } else {
18760 ++ do_retrans = true;
18761 ++ }
18762 ++ }
18763 ++ }
18764 ++
18765 ++ if (do_retrans && mptcp_is_available(sk, skb_head, false))
18766 ++ return skb_head;
18767 ++ }
18768 ++ return NULL;
18769 ++}
18770 ++
18771 ++/* Returns the next segment to be sent from the mptcp meta-queue.
18772 ++ * (chooses the reinject queue if any segment is waiting in it, otherwise,
18773 ++ * chooses the normal write queue).
18774 ++ * Sets *@reinject to 1 if the returned segment comes from the
18775 ++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
18776 ++ * and sets it to -1 if it is a meta-level retransmission to optimize the
18777 ++ * receive-buffer.
18778 ++ */
18779 ++static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject)
18780 ++{
18781 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18782 ++ struct sk_buff *skb = NULL;
18783 ++
18784 ++ *reinject = 0;
18785 ++
18786 ++ /* If we are in fallback-mode, just take from the meta-send-queue */
18787 ++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
18788 ++ return tcp_send_head(meta_sk);
18789 ++
18790 ++ skb = skb_peek(&mpcb->reinject_queue);
18791 ++
18792 ++ if (skb) {
18793 ++ *reinject = 1;
18794 ++ } else {
18795 ++ skb = tcp_send_head(meta_sk);
18796 ++
18797 ++ if (!skb && meta_sk->sk_socket &&
18798 ++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
18799 ++ sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
18800 ++ struct sock *subsk = get_available_subflow(meta_sk, NULL,
18801 ++ false);
18802 ++ if (!subsk)
18803 ++ return NULL;
18804 ++
18805 ++ skb = mptcp_rcv_buf_optimization(subsk, 0);
18806 ++ if (skb)
18807 ++ *reinject = -1;
18808 ++ }
18809 ++ }
18810 ++ return skb;
18811 ++}
18812 ++
18813 ++static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
18814 ++ int *reinject,
18815 ++ struct sock **subsk,
18816 ++ unsigned int *limit)
18817 ++{
18818 ++ struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
18819 ++ unsigned int mss_now;
18820 ++ struct tcp_sock *subtp;
18821 ++ u16 gso_max_segs;
18822 ++ u32 max_len, max_segs, window, needed;
18823 ++
18824 ++ /* As we set it, we have to reset it as well. */
18825 ++ *limit = 0;
18826 ++
18827 ++ if (!skb)
18828 ++ return NULL;
18829 ++
18830 ++ *subsk = get_available_subflow(meta_sk, skb, false);
18831 ++ if (!*subsk)
18832 ++ return NULL;
18833 ++
18834 ++ subtp = tcp_sk(*subsk);
18835 ++ mss_now = tcp_current_mss(*subsk);
18836 ++
18837 ++ if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) {
18838 ++ skb = mptcp_rcv_buf_optimization(*subsk, 1);
18839 ++ if (skb)
18840 ++ *reinject = -1;
18841 ++ else
18842 ++ return NULL;
18843 ++ }
18844 ++
18845 ++	/* No splitting required, as we will only send a single segment */
18846 ++ if (skb->len <= mss_now)
18847 ++ return skb;
18848 ++
18849 ++ /* The following is similar to tcp_mss_split_point, but
18850 ++ * we do not care about nagle, because we will anyways
18851 ++	 * we do not care about Nagle, because we will use
18852 ++	 * TCP_NAGLE_PUSH anyway, which overrides this.
18853 ++ * So, we first limit according to the cwnd/gso-size and then according
18854 ++ * to the subflow's window.
18855 ++ */
18856 ++
18857 ++ gso_max_segs = (*subsk)->sk_gso_max_segs;
18858 ++ if (!gso_max_segs) /* No gso supported on the subflow's NIC */
18859 ++ gso_max_segs = 1;
18860 ++ max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs);
18861 ++ if (!max_segs)
18862 ++ return NULL;
18863 ++
18864 ++ max_len = mss_now * max_segs;
18865 ++ window = tcp_wnd_end(subtp) - subtp->write_seq;
18866 ++
18867 ++ needed = min(skb->len, window);
18868 ++ if (max_len <= skb->len)
18869 ++		/* Take max_len, which is derived from the cwnd/gso-size */
18870 ++ *limit = max_len;
18871 ++ else
18872 ++ /* Or, take the window */
18873 ++ *limit = needed;
18874 ++
18875 ++ return skb;
18876 ++}
18877 ++
18878 ++static void defsched_init(struct sock *sk)
18879 ++{
18880 ++ struct defsched_priv *dsp = defsched_get_priv(tcp_sk(sk));
18881 ++
18882 ++ dsp->last_rbuf_opti = tcp_time_stamp;
18883 ++}
18884 ++
18885 ++struct mptcp_sched_ops mptcp_sched_default = {
18886 ++ .get_subflow = get_available_subflow,
18887 ++ .next_segment = mptcp_next_segment,
18888 ++ .init = defsched_init,
18889 ++ .name = "default",
18890 ++ .owner = THIS_MODULE,
18891 ++};
18892 ++
18893 ++static struct mptcp_sched_ops *mptcp_sched_find(const char *name)
18894 ++{
18895 ++ struct mptcp_sched_ops *e;
18896 ++
18897 ++ list_for_each_entry_rcu(e, &mptcp_sched_list, list) {
18898 ++ if (strcmp(e->name, name) == 0)
18899 ++ return e;
18900 ++ }
18901 ++
18902 ++ return NULL;
18903 ++}
18904 ++
18905 ++int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
18906 ++{
18907 ++ int ret = 0;
18908 ++
18909 ++ if (!sched->get_subflow || !sched->next_segment)
18910 ++ return -EINVAL;
18911 ++
18912 ++ spin_lock(&mptcp_sched_list_lock);
18913 ++ if (mptcp_sched_find(sched->name)) {
18914 ++ pr_notice("%s already registered\n", sched->name);
18915 ++ ret = -EEXIST;
18916 ++ } else {
18917 ++ list_add_tail_rcu(&sched->list, &mptcp_sched_list);
18918 ++ pr_info("%s registered\n", sched->name);
18919 ++ }
18920 ++ spin_unlock(&mptcp_sched_list_lock);
18921 ++
18922 ++ return ret;
18923 ++}
18924 ++EXPORT_SYMBOL_GPL(mptcp_register_scheduler);
18925 ++
18926 ++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
18927 ++{
18928 ++ spin_lock(&mptcp_sched_list_lock);
18929 ++ list_del_rcu(&sched->list);
18930 ++ spin_unlock(&mptcp_sched_list_lock);
18931 ++}
18932 ++EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler);
18933 ++
18934 ++void mptcp_get_default_scheduler(char *name)
18935 ++{
18936 ++ struct mptcp_sched_ops *sched;
18937 ++
18938 ++ BUG_ON(list_empty(&mptcp_sched_list));
18939 ++
18940 ++ rcu_read_lock();
18941 ++ sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list);
18942 ++ strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX);
18943 ++ rcu_read_unlock();
18944 ++}
18945 ++
18946 ++int mptcp_set_default_scheduler(const char *name)
18947 ++{
18948 ++ struct mptcp_sched_ops *sched;
18949 ++ int ret = -ENOENT;
18950 ++
18951 ++ spin_lock(&mptcp_sched_list_lock);
18952 ++ sched = mptcp_sched_find(name);
18953 ++#ifdef CONFIG_MODULES
18954 ++ if (!sched && capable(CAP_NET_ADMIN)) {
18955 ++ spin_unlock(&mptcp_sched_list_lock);
18956 ++
18957 ++ request_module("mptcp_%s", name);
18958 ++ spin_lock(&mptcp_sched_list_lock);
18959 ++ sched = mptcp_sched_find(name);
18960 ++ }
18961 ++#endif
18962 ++
18963 ++ if (sched) {
18964 ++ list_move(&sched->list, &mptcp_sched_list);
18965 ++ ret = 0;
18966 ++ } else {
18967 ++ pr_info("%s is not available\n", name);
18968 ++ }
18969 ++ spin_unlock(&mptcp_sched_list_lock);
18970 ++
18971 ++ return ret;
18972 ++}
18973 ++
18974 ++void mptcp_init_scheduler(struct mptcp_cb *mpcb)
18975 ++{
18976 ++ struct mptcp_sched_ops *sched;
18977 ++
18978 ++ rcu_read_lock();
18979 ++ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
18980 ++ if (try_module_get(sched->owner)) {
18981 ++ mpcb->sched_ops = sched;
18982 ++ break;
18983 ++ }
18984 ++ }
18985 ++ rcu_read_unlock();
18986 ++}
18987 ++
18988 ++/* Manage refcounts on socket close. */
18989 ++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb)
18990 ++{
18991 ++ module_put(mpcb->sched_ops->owner);
18992 ++}
18993 ++
18994 ++/* Set default value from kernel configuration at bootup */
18995 ++static int __init mptcp_scheduler_default(void)
18996 ++{
18997 ++ BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE);
18998 ++
18999 ++ return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED);
19000 ++}
19001 ++late_initcall(mptcp_scheduler_default);
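
For symmetry with the path-manager example earlier, a hypothetical minimal scheduler built on the mptcp_sched_ops interface registered above is sketched below. It picks the first sendable subflow and ignores the reinject queue, the window tests and the TSQ throttling that the default scheduler handles, so it illustrates the plumbing rather than a usable policy; every identifier prefixed first_ is invented:

/* "First sendable subflow" scheduler sketch, illustration only. */
#include <linux/module.h>
#include <net/mptcp.h>

static struct sock *first_get_subflow(struct sock *meta_sk,
				      struct sk_buff *skb,
				      bool zero_wnd_test)
{
	struct sock *sk;

	mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
		if (mptcp_sk_can_send(sk))
			return sk;
	}
	return NULL;
}

static struct sk_buff *first_next_segment(struct sock *meta_sk, int *reinject,
					  struct sock **subsk,
					  unsigned int *limit)
{
	struct sk_buff *skb = tcp_send_head(meta_sk);

	*reinject = 0;
	*limit = 0;		/* reset, as the schedulers above do */

	if (!skb)
		return NULL;

	*subsk = first_get_subflow(meta_sk, skb, false);
	return *subsk ? skb : NULL;
}

static struct mptcp_sched_ops first_sched = {
	.get_subflow	= first_get_subflow,
	.next_segment	= first_next_segment,
	.name		= "firstflow",
	.owner		= THIS_MODULE,
};

static int __init first_sched_register(void)
{
	return mptcp_register_scheduler(&first_sched);
}

static void __exit first_sched_unregister(void)
{
	mptcp_unregister_scheduler(&first_sched);
}

module_init(first_sched_register);
module_exit(first_sched_unregister);
MODULE_LICENSE("GPL");

A scheduler that keeps per-subflow state in tp->mptcp->mptcp_sched[] would additionally add a BUILD_BUG_ON(sizeof(...) > MPTCP_SCHED_SIZE) check, as the round-robin and default schedulers above do.
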
19002 +diff --git a/net/mptcp/mptcp_wvegas.c b/net/mptcp/mptcp_wvegas.c
19003 +new file mode 100644
19004 +index 000000000000..29ca1d868d17
19005 +--- /dev/null
19006 ++++ b/net/mptcp/mptcp_wvegas.c
19007 +@@ -0,0 +1,268 @@
19008 ++/*
19009 ++ * MPTCP implementation - WEIGHTED VEGAS
19010 ++ *
19011 ++ * Algorithm design:
19012 ++ * Yu Cao <cyAnalyst@×××.com>
19013 ++ * Mingwei Xu <xmw@××××××××××××××××××××××.cn>
19014 ++ * Xiaoming Fu <fu@××××××××××××××××××.de>
19015 ++ *
19016 ++ * Implementation:
19017 ++ * Yu Cao <cyAnalyst@×××.com>
19018 ++ * Enhuan Dong <deh13@××××××××××××××××××.cn>
19019 ++ *
19020 ++ * Ported to the official MPTCP-kernel:
19021 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
19022 ++ *
19023 ++ * This program is free software; you can redistribute it and/or
19024 ++ * modify it under the terms of the GNU General Public License
19025 ++ * as published by the Free Software Foundation; either version
19026 ++ * 2 of the License, or (at your option) any later version.
19027 ++ */
19028 ++
19029 ++#include <linux/skbuff.h>
19030 ++#include <net/tcp.h>
19031 ++#include <net/mptcp.h>
19032 ++#include <linux/module.h>
19033 ++#include <linux/tcp.h>
19034 ++
19035 ++static int initial_alpha = 2;
19036 ++static int total_alpha = 10;
19037 ++static int gamma = 1;
19038 ++
19039 ++module_param(initial_alpha, int, 0644);
19040 ++MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows");
19041 ++module_param(total_alpha, int, 0644);
19042 ++MODULE_PARM_DESC(total_alpha, "total alpha for all subflows");
19043 ++module_param(gamma, int, 0644);
19044 ++MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
19045 ++
19046 ++#define MPTCP_WVEGAS_SCALE 16
19047 ++
19048 ++/* wVegas variables */
19049 ++struct wvegas {
19050 ++ u32 beg_snd_nxt; /* right edge during last RTT */
19051 ++	u8  doing_wvegas_now; /* if true, do wvegas for this RTT */
19052 ++
19053 ++ u16 cnt_rtt; /* # of RTTs measured within last RTT */
19054 ++ u32 sampled_rtt; /* cumulative RTTs measured within last RTT (in usec) */
19055 ++ u32 base_rtt; /* the min of all wVegas RTT measurements seen (in usec) */
19056 ++
19057 ++ u64 instant_rate; /* cwnd / srtt_us, unit: pkts/us * 2^16 */
19058 ++ u64 weight; /* the ratio of subflow's rate to the total rate, * 2^16 */
19059 ++ int alpha; /* alpha for each subflows */
19060 ++
19061 ++	u32 queue_delay; /* queue delay */
19062 ++};
19063 ++
19064 ++
19065 ++static inline u64 mptcp_wvegas_scale(u32 val, int scale)
19066 ++{
19067 ++ return (u64) val << scale;
19068 ++}
19069 ++
19070 ++static void wvegas_enable(const struct sock *sk)
19071 ++{
19072 ++ const struct tcp_sock *tp = tcp_sk(sk);
19073 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19074 ++
19075 ++ wvegas->doing_wvegas_now = 1;
19076 ++
19077 ++ wvegas->beg_snd_nxt = tp->snd_nxt;
19078 ++
19079 ++ wvegas->cnt_rtt = 0;
19080 ++ wvegas->sampled_rtt = 0;
19081 ++
19082 ++ wvegas->instant_rate = 0;
19083 ++ wvegas->alpha = initial_alpha;
19084 ++ wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE);
19085 ++
19086 ++ wvegas->queue_delay = 0;
19087 ++}
19088 ++
19089 ++static inline void wvegas_disable(const struct sock *sk)
19090 ++{
19091 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19092 ++
19093 ++ wvegas->doing_wvegas_now = 0;
19094 ++}
19095 ++
19096 ++static void mptcp_wvegas_init(struct sock *sk)
19097 ++{
19098 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19099 ++
19100 ++ wvegas->base_rtt = 0x7fffffff;
19101 ++ wvegas_enable(sk);
19102 ++}
19103 ++
19104 ++static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us)
19105 ++{
19106 ++ return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us);
19107 ++}
19108 ++
19109 ++static void mptcp_wvegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
19110 ++{
19111 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19112 ++ u32 vrtt;
19113 ++
19114 ++ if (rtt_us < 0)
19115 ++ return;
19116 ++
19117 ++ vrtt = rtt_us + 1;
19118 ++
19119 ++ if (vrtt < wvegas->base_rtt)
19120 ++ wvegas->base_rtt = vrtt;
19121 ++
19122 ++ wvegas->sampled_rtt += vrtt;
19123 ++ wvegas->cnt_rtt++;
19124 ++}
19125 ++
19126 ++static void mptcp_wvegas_state(struct sock *sk, u8 ca_state)
19127 ++{
19128 ++ if (ca_state == TCP_CA_Open)
19129 ++ wvegas_enable(sk);
19130 ++ else
19131 ++ wvegas_disable(sk);
19132 ++}
19133 ++
19134 ++static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
19135 ++{
19136 ++ if (event == CA_EVENT_CWND_RESTART) {
19137 ++ mptcp_wvegas_init(sk);
19138 ++ } else if (event == CA_EVENT_LOSS) {
19139 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19140 ++ wvegas->instant_rate = 0;
19141 ++ }
19142 ++}
19143 ++
19144 ++static inline u32 mptcp_wvegas_ssthresh(const struct tcp_sock *tp)
19145 ++{
19146 ++ return min(tp->snd_ssthresh, tp->snd_cwnd - 1);
19147 ++}
19148 ++
19149 ++static u64 mptcp_wvegas_weight(const struct mptcp_cb *mpcb, const struct sock *sk)
19150 ++{
19151 ++ u64 total_rate = 0;
19152 ++ struct sock *sub_sk;
19153 ++ const struct wvegas *wvegas = inet_csk_ca(sk);
19154 ++
19155 ++ if (!mpcb)
19156 ++ return wvegas->weight;
19157 ++
19158 ++
19159 ++ mptcp_for_each_sk(mpcb, sub_sk) {
19160 ++ struct wvegas *sub_wvegas = inet_csk_ca(sub_sk);
19161 ++
19162 ++		/* sampled_rtt is initialized to 0 */
19163 ++ if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0))
19164 ++ total_rate += sub_wvegas->instant_rate;
19165 ++ }
19166 ++
19167 ++ if (total_rate && wvegas->instant_rate)
19168 ++ return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate);
19169 ++ else
19170 ++ return wvegas->weight;
19171 ++}
19172 ++
19173 ++static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
19174 ++{
19175 ++ struct tcp_sock *tp = tcp_sk(sk);
19176 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19177 ++
19178 ++ if (!wvegas->doing_wvegas_now) {
19179 ++ tcp_reno_cong_avoid(sk, ack, acked);
19180 ++ return;
19181 ++ }
19182 ++
19183 ++ if (after(ack, wvegas->beg_snd_nxt)) {
19184 ++ wvegas->beg_snd_nxt = tp->snd_nxt;
19185 ++
19186 ++ if (wvegas->cnt_rtt <= 2) {
19187 ++ tcp_reno_cong_avoid(sk, ack, acked);
19188 ++ } else {
19189 ++ u32 rtt, diff, q_delay;
19190 ++ u64 target_cwnd;
19191 ++
19192 ++ rtt = wvegas->sampled_rtt / wvegas->cnt_rtt;
19193 ++ target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt);
19194 ++
19195 ++ diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt);
19196 ++
19197 ++ if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
19198 ++ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
19199 ++ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
19200 ++
19201 ++ } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
19202 ++ tcp_slow_start(tp, acked);
19203 ++ } else {
19204 ++ if (diff >= wvegas->alpha) {
19205 ++ wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt);
19206 ++ wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk);
19207 ++ wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE));
19208 ++ }
19209 ++ if (diff > wvegas->alpha) {
19210 ++ tp->snd_cwnd--;
19211 ++ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
19212 ++ } else if (diff < wvegas->alpha) {
19213 ++ tp->snd_cwnd++;
19214 ++ }
19215 ++
19216 ++				/* Try to drain the link queue if needed */
19217 ++ q_delay = rtt - wvegas->base_rtt;
19218 ++ if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay))
19219 ++ wvegas->queue_delay = q_delay;
19220 ++
19221 ++ if (q_delay >= 2 * wvegas->queue_delay) {
19222 ++ u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt);
19223 ++ tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE;
19224 ++ wvegas->queue_delay = 0;
19225 ++ }
19226 ++ }
19227 ++
19228 ++ if (tp->snd_cwnd < 2)
19229 ++ tp->snd_cwnd = 2;
19230 ++ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
19231 ++ tp->snd_cwnd = tp->snd_cwnd_clamp;
19232 ++
19233 ++ tp->snd_ssthresh = tcp_current_ssthresh(sk);
19234 ++ }
19235 ++
19236 ++ wvegas->cnt_rtt = 0;
19237 ++ wvegas->sampled_rtt = 0;
19238 ++ }
19239 ++ /* Use normal slow start */
19240 ++ else if (tp->snd_cwnd <= tp->snd_ssthresh)
19241 ++ tcp_slow_start(tp, acked);
19242 ++}
19243 ++
19244 ++
19245 ++static struct tcp_congestion_ops mptcp_wvegas __read_mostly = {
19246 ++ .init = mptcp_wvegas_init,
19247 ++ .ssthresh = tcp_reno_ssthresh,
19248 ++ .cong_avoid = mptcp_wvegas_cong_avoid,
19249 ++ .pkts_acked = mptcp_wvegas_pkts_acked,
19250 ++ .set_state = mptcp_wvegas_state,
19251 ++ .cwnd_event = mptcp_wvegas_cwnd_event,
19252 ++
19253 ++ .owner = THIS_MODULE,
19254 ++ .name = "wvegas",
19255 ++};
19256 ++
19257 ++static int __init mptcp_wvegas_register(void)
19258 ++{
19259 ++ BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE);
19260 ++ tcp_register_congestion_control(&mptcp_wvegas);
19261 ++ return 0;
19262 ++}
19263 ++
19264 ++static void __exit mptcp_wvegas_unregister(void)
19265 ++{
19266 ++ tcp_unregister_congestion_control(&mptcp_wvegas);
19267 ++}
19268 ++
19269 ++module_init(mptcp_wvegas_register);
19270 ++module_exit(mptcp_wvegas_unregister);
19271 ++
19272 ++MODULE_AUTHOR("Yu Cao, Enhuan Dong");
19273 ++MODULE_LICENSE("GPL");
19274 ++MODULE_DESCRIPTION("MPTCP wVegas");
19275 ++MODULE_VERSION("0.1");
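
To close, a worked example of the wVegas control law implemented in mptcp_wvegas_cong_avoid() above; the numbers are invented, the formulas are the ones in the code:

    \mathit{diff} = cwnd \cdot \frac{rtt - base\_rtt}{rtt}, \qquad
    \alpha = \max\left(2,\ weight \cdot total\_alpha\right)

Suppose cwnd = 10, base_rtt = 40 ms, the averaged rtt of the last round is 50 ms, and the subflow carries half of the connection's rate (weight = 1/2, hence alpha = 5 with the default total_alpha = 10). Then

    \mathit{diff} = 10 \cdot \frac{50 - 40}{50} = 2 < \alpha

so, in congestion avoidance, the window grows by one packet. If queueing pushes rtt to 100 ms, diff = 10 * 60/100 = 6 > alpha and the window shrinks by one while snd_ssthresh is pulled down through mptcp_wvegas_ssthresh(); independently, the queue_delay branch backs the window off once the measured queueing delay reaches twice the smallest value seen so far.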