Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:3.16 commit in: /
Date: Fri, 26 Sep 2014 19:40:22
Message-Id: 1411760417.d9d386b72f6c05e68b48912cc93da59331852155.mpagano@gentoo
1 commit: d9d386b72f6c05e68b48912cc93da59331852155
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Fri Sep 26 19:40:17 2014 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Fri Sep 26 19:40:17 2014 +0000
6 URL: http://sources.gentoo.org/gitweb/?p=proj/linux-patches.git;a=commit;h=d9d386b7
7
8 Add multipath-tcp patch. Fix distro config.
9
10 ---
11 0000_README | 4 +
12 2500_multipath-tcp-v3.16-872d7f6c6f4e.patch | 19230 ++++++++++++++++++++++++++
13 4567_distro-Gentoo-Kconfig.patch | 19 +-
14 3 files changed, 19243 insertions(+), 10 deletions(-)
15
16 diff --git a/0000_README b/0000_README
17 index 706e53e..d92e6b7 100644
18 --- a/0000_README
19 +++ b/0000_README
20 @@ -58,6 +58,10 @@ Patch: 2400_kcopy-patch-for-infiniband-driver.patch
21 From: Alexey Shvetsov <alexxy@g.o>
22 Desc: Zero copy for infiniband psm userspace driver
23
24 +Patch: 2500_multipath-tcp-v3.16-872d7f6c6f4e.patch
25 +From: http://multipath-tcp.org/
26 +Desc: Patch for simultaneous use of several IP addresses/interfaces in TCP for better resource utilization, better throughput, and smoother reaction to failures.
27 +
28 Patch: 2700_ThinkPad-30-brightness-control-fix.patch
29 From: Seth Forshee <seth.forshee@×××××××××.com>
30 Desc: ACPI: Disable Windows 8 compatibility for some Lenovo ThinkPads
31
32 diff --git a/2500_multipath-tcp-v3.16-872d7f6c6f4e.patch b/2500_multipath-tcp-v3.16-872d7f6c6f4e.patch
33 new file mode 100644
34 index 0000000..3000da3
35 --- /dev/null
36 +++ b/2500_multipath-tcp-v3.16-872d7f6c6f4e.patch
37 @@ -0,0 +1,19230 @@
38 +diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
39 +index 768a0fb67dd6..5a46d91a8df9 100644
40 +--- a/drivers/infiniband/hw/cxgb4/cm.c
41 ++++ b/drivers/infiniband/hw/cxgb4/cm.c
42 +@@ -3432,7 +3432,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
43 + */
44 + memset(&tmp_opt, 0, sizeof(tmp_opt));
45 + tcp_clear_options(&tmp_opt);
46 +- tcp_parse_options(skb, &tmp_opt, 0, NULL);
47 ++ tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL);
48 +
49 + req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req));
50 + memset(req, 0, sizeof(*req));
51 +diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
52 +index 2faef339d8f2..d86c853ffaad 100644
53 +--- a/include/linux/ipv6.h
54 ++++ b/include/linux/ipv6.h
55 +@@ -256,16 +256,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
56 + return inet_sk(__sk)->pinet6;
57 + }
58 +
59 +-static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops)
60 +-{
61 +- struct request_sock *req = reqsk_alloc(ops);
62 +-
63 +- if (req)
64 +- inet_rsk(req)->pktopts = NULL;
65 +-
66 +- return req;
67 +-}
68 +-
69 + static inline struct raw6_sock *raw6_sk(const struct sock *sk)
70 + {
71 + return (struct raw6_sock *)sk;
72 +@@ -309,12 +299,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
73 + return NULL;
74 + }
75 +
76 +-static inline struct inet6_request_sock *
77 +- inet6_rsk(const struct request_sock *rsk)
78 +-{
79 +- return NULL;
80 +-}
81 +-
82 + static inline struct raw6_sock *raw6_sk(const struct sock *sk)
83 + {
84 + return NULL;
85 +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
86 +index ec89301ada41..99ea4b0e3693 100644
87 +--- a/include/linux/skbuff.h
88 ++++ b/include/linux/skbuff.h
89 +@@ -2784,8 +2784,10 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
90 + bool zero_okay,
91 + __sum16 check)
92 + {
93 +- if (skb_csum_unnecessary(skb) || (zero_okay && !check)) {
94 +- skb->csum_valid = 1;
95 ++ if (skb_csum_unnecessary(skb)) {
96 ++ return false;
97 ++ } else if (zero_okay && !check) {
98 ++ skb->ip_summed = CHECKSUM_UNNECESSARY;
99 + return false;
100 + }
101 +
102 +diff --git a/include/linux/tcp.h b/include/linux/tcp.h
103 +index a0513210798f..7bc2e078d6ca 100644
104 +--- a/include/linux/tcp.h
105 ++++ b/include/linux/tcp.h
106 +@@ -53,7 +53,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
107 + /* TCP Fast Open */
108 + #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */
109 + #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */
110 +-#define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */
111 ++#define TCP_FASTOPEN_COOKIE_SIZE 4 /* the size employed by this impl. */
112 +
113 + /* TCP Fast Open Cookie as stored in memory */
114 + struct tcp_fastopen_cookie {
115 +@@ -72,6 +72,51 @@ struct tcp_sack_block {
116 + u32 end_seq;
117 + };
118 +
119 ++struct tcp_out_options {
120 ++ u16 options; /* bit field of OPTION_* */
121 ++ u8 ws; /* window scale, 0 to disable */
122 ++ u8 num_sack_blocks;/* number of SACK blocks to include */
123 ++ u8 hash_size; /* bytes in hash_location */
124 ++ u16 mss; /* 0 to disable */
125 ++ __u8 *hash_location; /* temporary pointer, overloaded */
126 ++ __u32 tsval, tsecr; /* need to include OPTION_TS */
127 ++ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
128 ++#ifdef CONFIG_MPTCP
129 ++ u16 mptcp_options; /* bit field of MPTCP related OPTION_* */
130 ++ u8 dss_csum:1,
131 ++ add_addr_v4:1,
132 ++ add_addr_v6:1; /* dss-checksum required? */
133 ++
134 ++ union {
135 ++ struct {
136 ++ __u64 sender_key; /* sender's key for mptcp */
137 ++ __u64 receiver_key; /* receiver's key for mptcp */
138 ++ } mp_capable;
139 ++
140 ++ struct {
141 ++ __u64 sender_truncated_mac;
142 ++ __u32 sender_nonce;
143 ++ /* random number of the sender */
144 ++ __u32 token; /* token for mptcp */
145 ++ u8 low_prio:1;
146 ++ } mp_join_syns;
147 ++ };
148 ++
149 ++ struct {
150 ++ struct in_addr addr;
151 ++ u8 addr_id;
152 ++ } add_addr4;
153 ++
154 ++ struct {
155 ++ struct in6_addr addr;
156 ++ u8 addr_id;
157 ++ } add_addr6;
158 ++
159 ++ u16 remove_addrs; /* list of address id */
160 ++ u8 addr_id; /* address id (mp_join or add_address) */
161 ++#endif /* CONFIG_MPTCP */
162 ++};
163 ++
164 + /*These are used to set the sack_ok field in struct tcp_options_received */
165 + #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */
166 + #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/
167 +@@ -95,6 +140,9 @@ struct tcp_options_received {
168 + u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
169 + };
170 +
171 ++struct mptcp_cb;
172 ++struct mptcp_tcp_sock;
173 ++
174 + static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
175 + {
176 + rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
177 +@@ -111,10 +159,7 @@ struct tcp_request_sock_ops;
178 +
179 + struct tcp_request_sock {
180 + struct inet_request_sock req;
181 +-#ifdef CONFIG_TCP_MD5SIG
182 +- /* Only used by TCP MD5 Signature so far. */
183 + const struct tcp_request_sock_ops *af_specific;
184 +-#endif
185 + struct sock *listener; /* needed for TFO */
186 + u32 rcv_isn;
187 + u32 snt_isn;
188 +@@ -130,6 +175,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
189 + return (struct tcp_request_sock *)req;
190 + }
191 +
192 ++struct tcp_md5sig_key;
193 ++
194 + struct tcp_sock {
195 + /* inet_connection_sock has to be the first member of tcp_sock */
196 + struct inet_connection_sock inet_conn;
197 +@@ -326,6 +373,37 @@ struct tcp_sock {
198 + * socket. Used to retransmit SYNACKs etc.
199 + */
200 + struct request_sock *fastopen_rsk;
201 ++
202 ++ /* MPTCP/TCP-specific callbacks */
203 ++ const struct tcp_sock_ops *ops;
204 ++
205 ++ struct mptcp_cb *mpcb;
206 ++ struct sock *meta_sk;
207 ++ /* We keep these flags even if CONFIG_MPTCP is not checked, because
208 ++ * it allows checking MPTCP capability just by checking the mpc flag,
209 ++ * rather than adding ifdefs everywhere.
210 ++ */
211 ++ u16 mpc:1, /* Other end is multipath capable */
212 ++ inside_tk_table:1, /* Is the tcp_sock inside the token-table? */
213 ++ send_mp_fclose:1,
214 ++ request_mptcp:1, /* Did we send out an MP_CAPABLE?
215 ++ * (this speeds up mptcp_doit() in tcp_recvmsg)
216 ++ */
217 ++ mptcp_enabled:1, /* Is MPTCP enabled from the application? */
218 ++ pf:1, /* Potentially Failed state: when this flag is set, we
219 ++ * stop using the subflow
220 ++ */
221 ++ mp_killed:1, /* Killed with a tcp_done in mptcp? */
222 ++ was_meta_sk:1, /* This was a meta sk (in case of reuse) */
223 ++ is_master_sk:1,
224 ++ close_it:1, /* Must close socket in mptcp_data_ready? */
225 ++ closing:1;
226 ++ struct mptcp_tcp_sock *mptcp;
227 ++#ifdef CONFIG_MPTCP
228 ++ struct hlist_nulls_node tk_table;
229 ++ u32 mptcp_loc_token;
230 ++ u64 mptcp_loc_key;
231 ++#endif /* CONFIG_MPTCP */
232 + };
233 +
234 + enum tsq_flags {
235 +@@ -337,6 +415,8 @@ enum tsq_flags {
236 + TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
237 + * tcp_v{4|6}_mtu_reduced()
238 + */
239 ++ MPTCP_PATH_MANAGER, /* MPTCP deferred creation of new subflows */
240 ++ MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */
241 + };
242 +
243 + static inline struct tcp_sock *tcp_sk(const struct sock *sk)
244 +@@ -355,6 +435,7 @@ struct tcp_timewait_sock {
245 + #ifdef CONFIG_TCP_MD5SIG
246 + struct tcp_md5sig_key *tw_md5_key;
247 + #endif
248 ++ struct mptcp_tw *mptcp_tw;
249 + };
250 +
251 + static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
252 +diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
253 +index 74af137304be..83f63033897a 100644
254 +--- a/include/net/inet6_connection_sock.h
255 ++++ b/include/net/inet6_connection_sock.h
256 +@@ -27,6 +27,8 @@ int inet6_csk_bind_conflict(const struct sock *sk,
257 +
258 + struct dst_entry *inet6_csk_route_req(struct sock *sk, struct flowi6 *fl6,
259 + const struct request_sock *req);
260 ++u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
261 ++ const u32 rnd, const u32 synq_hsize);
262 +
263 + struct request_sock *inet6_csk_search_req(const struct sock *sk,
264 + struct request_sock ***prevp,
265 +diff --git a/include/net/inet_common.h b/include/net/inet_common.h
266 +index fe7994c48b75..780f229f46a8 100644
267 +--- a/include/net/inet_common.h
268 ++++ b/include/net/inet_common.h
269 +@@ -1,6 +1,8 @@
270 + #ifndef _INET_COMMON_H
271 + #define _INET_COMMON_H
272 +
273 ++#include <net/sock.h>
274 ++
275 + extern const struct proto_ops inet_stream_ops;
276 + extern const struct proto_ops inet_dgram_ops;
277 +
278 +@@ -13,6 +15,8 @@ struct sock;
279 + struct sockaddr;
280 + struct socket;
281 +
282 ++int inet_create(struct net *net, struct socket *sock, int protocol, int kern);
283 ++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern);
284 + int inet_release(struct socket *sock);
285 + int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
286 + int addr_len, int flags);
287 +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
288 +index 7a4313887568..f62159e39839 100644
289 +--- a/include/net/inet_connection_sock.h
290 ++++ b/include/net/inet_connection_sock.h
291 +@@ -30,6 +30,7 @@
292 +
293 + struct inet_bind_bucket;
294 + struct tcp_congestion_ops;
295 ++struct tcp_options_received;
296 +
297 + /*
298 + * Pointers to address related TCP functions
299 +@@ -243,6 +244,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
300 +
301 + struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
302 +
303 ++u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
304 ++ const u32 synq_hsize);
305 ++
306 + struct request_sock *inet_csk_search_req(const struct sock *sk,
307 + struct request_sock ***prevp,
308 + const __be16 rport,
309 +diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
310 +index b1edf17bec01..6a32d8d6b85e 100644
311 +--- a/include/net/inet_sock.h
312 ++++ b/include/net/inet_sock.h
313 +@@ -86,10 +86,14 @@ struct inet_request_sock {
314 + wscale_ok : 1,
315 + ecn_ok : 1,
316 + acked : 1,
317 +- no_srccheck: 1;
318 ++ no_srccheck: 1,
319 ++ mptcp_rqsk : 1,
320 ++ saw_mpc : 1;
321 + kmemcheck_bitfield_end(flags);
322 +- struct ip_options_rcu *opt;
323 +- struct sk_buff *pktopts;
324 ++ union {
325 ++ struct ip_options_rcu *opt;
326 ++ struct sk_buff *pktopts;
327 ++ };
328 + u32 ir_mark;
329 + };
330 +
331 +diff --git a/include/net/mptcp.h b/include/net/mptcp.h
332 +new file mode 100644
333 +index 000000000000..712780fc39e4
334 +--- /dev/null
335 ++++ b/include/net/mptcp.h
336 +@@ -0,0 +1,1439 @@
337 ++/*
338 ++ * MPTCP implementation
339 ++ *
340 ++ * Initial Design & Implementation:
341 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
342 ++ *
343 ++ * Current Maintainer & Author:
344 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
345 ++ *
346 ++ * Additional authors:
347 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
348 ++ * Gregory Detal <gregory.detal@×××××××××.be>
349 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
350 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
351 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
352 ++ * Andreas Ripke <ripke@××××××.eu>
353 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
354 ++ * Octavian Purdila <octavian.purdila@×××××.com>
355 ++ * John Ronan <jronan@××××.org>
356 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
357 ++ * Brandon Heller <brandonh@××××××××.edu>
358 ++ *
359 ++ *
360 ++ * This program is free software; you can redistribute it and/or
361 ++ * modify it under the terms of the GNU General Public License
362 ++ * as published by the Free Software Foundation; either version
363 ++ * 2 of the License, or (at your option) any later version.
364 ++ */
365 ++
366 ++#ifndef _MPTCP_H
367 ++#define _MPTCP_H
368 ++
369 ++#include <linux/inetdevice.h>
370 ++#include <linux/ipv6.h>
371 ++#include <linux/list.h>
372 ++#include <linux/net.h>
373 ++#include <linux/netpoll.h>
374 ++#include <linux/skbuff.h>
375 ++#include <linux/socket.h>
376 ++#include <linux/tcp.h>
377 ++#include <linux/kernel.h>
378 ++
379 ++#include <asm/byteorder.h>
380 ++#include <asm/unaligned.h>
381 ++#include <crypto/hash.h>
382 ++#include <net/tcp.h>
383 ++
384 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
385 ++ #define ntohll(x) be64_to_cpu(x)
386 ++ #define htonll(x) cpu_to_be64(x)
387 ++#elif defined(__BIG_ENDIAN_BITFIELD)
388 ++ #define ntohll(x) (x)
389 ++ #define htonll(x) (x)
390 ++#endif
391 ++
392 ++struct mptcp_loc4 {
393 ++ u8 loc4_id;
394 ++ u8 low_prio:1;
395 ++ struct in_addr addr;
396 ++};
397 ++
398 ++struct mptcp_rem4 {
399 ++ u8 rem4_id;
400 ++ __be16 port;
401 ++ struct in_addr addr;
402 ++};
403 ++
404 ++struct mptcp_loc6 {
405 ++ u8 loc6_id;
406 ++ u8 low_prio:1;
407 ++ struct in6_addr addr;
408 ++};
409 ++
410 ++struct mptcp_rem6 {
411 ++ u8 rem6_id;
412 ++ __be16 port;
413 ++ struct in6_addr addr;
414 ++};
415 ++
416 ++struct mptcp_request_sock {
417 ++ struct tcp_request_sock req;
418 ++ /* hlist-nulls entry to the hash-table. Depending on whether this is
419 ++ * a new MPTCP connection or an additional subflow, the request-socket
420 ++ * is either in the mptcp_reqsk_tk_htb or mptcp_reqsk_htb.
421 ++ */
422 ++ struct hlist_nulls_node hash_entry;
423 ++
424 ++ union {
425 ++ struct {
426 ++ /* Only on initial subflows */
427 ++ u64 mptcp_loc_key;
428 ++ u64 mptcp_rem_key;
429 ++ u32 mptcp_loc_token;
430 ++ };
431 ++
432 ++ struct {
433 ++ /* Only on additional subflows */
434 ++ struct mptcp_cb *mptcp_mpcb;
435 ++ u32 mptcp_rem_nonce;
436 ++ u32 mptcp_loc_nonce;
437 ++ u64 mptcp_hash_tmac;
438 ++ };
439 ++ };
440 ++
441 ++ u8 loc_id;
442 ++ u8 rem_id; /* Address-id in the MP_JOIN */
443 ++ u8 dss_csum:1,
444 ++ is_sub:1, /* Is this a new subflow? */
445 ++ low_prio:1, /* Interface set to low-prio? */
446 ++ rcv_low_prio:1;
447 ++};
448 ++
449 ++struct mptcp_options_received {
450 ++ u16 saw_mpc:1,
451 ++ dss_csum:1,
452 ++ drop_me:1,
453 ++
454 ++ is_mp_join:1,
455 ++ join_ack:1,
456 ++
457 ++ saw_low_prio:2, /* 0x1 - low-prio set for this subflow
458 ++ * 0x2 - low-prio set for another subflow
459 ++ */
460 ++ low_prio:1,
461 ++
462 ++ saw_add_addr:2, /* Saw at least one add_addr option:
463 ++ * 0x1: IPv4 - 0x2: IPv6
464 ++ */
465 ++ more_add_addr:1, /* Saw one more add-addr. */
466 ++
467 ++ saw_rem_addr:1, /* Saw at least one rem_addr option */
468 ++ more_rem_addr:1, /* Saw one more rem-addr. */
469 ++
470 ++ mp_fail:1,
471 ++ mp_fclose:1;
472 ++ u8 rem_id; /* Address-id in the MP_JOIN */
473 ++ u8 prio_addr_id; /* Address-id in the MP_PRIO */
474 ++
475 ++ const unsigned char *add_addr_ptr; /* Pointer to add-address option */
476 ++ const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */
477 ++
478 ++ u32 data_ack;
479 ++ u32 data_seq;
480 ++ u16 data_len;
481 ++
482 ++ u32 mptcp_rem_token;/* Remote token */
483 ++
484 ++ /* Key inside the option (from mp_capable or fast_close) */
485 ++ u64 mptcp_key;
486 ++
487 ++ u32 mptcp_recv_nonce;
488 ++ u64 mptcp_recv_tmac;
489 ++ u8 mptcp_recv_mac[20];
490 ++};
491 ++
492 ++struct mptcp_tcp_sock {
493 ++ struct tcp_sock *next; /* Next subflow socket */
494 ++ struct hlist_node cb_list;
495 ++ struct mptcp_options_received rx_opt;
496 ++
497 ++ /* Those three fields record the current mapping */
498 ++ u64 map_data_seq;
499 ++ u32 map_subseq;
500 ++ u16 map_data_len;
501 ++ u16 slave_sk:1,
502 ++ fully_established:1,
503 ++ establish_increased:1,
504 ++ second_packet:1,
505 ++ attached:1,
506 ++ send_mp_fail:1,
507 ++ include_mpc:1,
508 ++ mapping_present:1,
509 ++ map_data_fin:1,
510 ++ low_prio:1, /* use this socket as backup */
511 ++ rcv_low_prio:1, /* Peer sent low-prio option to us */
512 ++ send_mp_prio:1, /* Trigger to send mp_prio on this socket */
513 ++ pre_established:1; /* State between sending 3rd ACK and
514 ++ * receiving the fourth ack of new subflows.
515 ++ */
516 ++
517 ++ /* isn: needed to translate abs to relative subflow seqnums */
518 ++ u32 snt_isn;
519 ++ u32 rcv_isn;
520 ++ u8 path_index;
521 ++ u8 loc_id;
522 ++ u8 rem_id;
523 ++
524 ++#define MPTCP_SCHED_SIZE 4
525 ++ u8 mptcp_sched[MPTCP_SCHED_SIZE] __aligned(8);
526 ++
527 ++ struct sk_buff *shortcut_ofoqueue; /* Shortcut to the current modified
528 ++ * skb in the ofo-queue.
529 ++ */
530 ++
531 ++ int init_rcv_wnd;
532 ++ u32 infinite_cutoff_seq;
533 ++ struct delayed_work work;
534 ++ u32 mptcp_loc_nonce;
535 ++ struct tcp_sock *tp; /* Where is my daddy? */
536 ++ u32 last_end_data_seq;
537 ++
538 ++ /* MP_JOIN subflow: timer for retransmitting the 3rd ack */
539 ++ struct timer_list mptcp_ack_timer;
540 ++
541 ++ /* HMAC of the third ack */
542 ++ char sender_mac[20];
543 ++};
544 ++
545 ++struct mptcp_tw {
546 ++ struct list_head list;
547 ++ u64 loc_key;
548 ++ u64 rcv_nxt;
549 ++ struct mptcp_cb __rcu *mpcb;
550 ++ u8 meta_tw:1,
551 ++ in_list:1;
552 ++};
553 ++
554 ++#define MPTCP_PM_NAME_MAX 16
555 ++struct mptcp_pm_ops {
556 ++ struct list_head list;
557 ++
558 ++ /* Signal the creation of a new MPTCP-session. */
559 ++ void (*new_session)(const struct sock *meta_sk);
560 ++ void (*release_sock)(struct sock *meta_sk);
561 ++ void (*fully_established)(struct sock *meta_sk);
562 ++ void (*new_remote_address)(struct sock *meta_sk);
563 ++ int (*get_local_id)(sa_family_t family, union inet_addr *addr,
564 ++ struct net *net, bool *low_prio);
565 ++ void (*addr_signal)(struct sock *sk, unsigned *size,
566 ++ struct tcp_out_options *opts, struct sk_buff *skb);
567 ++ void (*add_raddr)(struct mptcp_cb *mpcb, const union inet_addr *addr,
568 ++ sa_family_t family, __be16 port, u8 id);
569 ++ void (*rem_raddr)(struct mptcp_cb *mpcb, u8 rem_id);
570 ++ void (*init_subsocket_v4)(struct sock *sk, struct in_addr addr);
571 ++ void (*init_subsocket_v6)(struct sock *sk, struct in6_addr addr);
572 ++
573 ++ char name[MPTCP_PM_NAME_MAX];
574 ++ struct module *owner;
575 ++};
576 ++
577 ++#define MPTCP_SCHED_NAME_MAX 16
578 ++struct mptcp_sched_ops {
579 ++ struct list_head list;
580 ++
581 ++ struct sock * (*get_subflow)(struct sock *meta_sk,
582 ++ struct sk_buff *skb,
583 ++ bool zero_wnd_test);
584 ++ struct sk_buff * (*next_segment)(struct sock *meta_sk,
585 ++ int *reinject,
586 ++ struct sock **subsk,
587 ++ unsigned int *limit);
588 ++ void (*init)(struct sock *sk);
589 ++
590 ++ char name[MPTCP_SCHED_NAME_MAX];
591 ++ struct module *owner;
592 ++};
593 ++
594 ++struct mptcp_cb {
595 ++ /* list of sockets in this multipath connection */
596 ++ struct tcp_sock *connection_list;
597 ++ /* list of sockets that need a call to release_cb */
598 ++ struct hlist_head callback_list;
599 ++
600 ++ /* High-order bits of 64-bit sequence numbers */
601 ++ u32 snd_high_order[2];
602 ++ u32 rcv_high_order[2];
603 ++
604 ++ u16 send_infinite_mapping:1,
605 ++ in_time_wait:1,
606 ++ list_rcvd:1, /* XXX TO REMOVE */
607 ++ addr_signal:1, /* Path-manager wants us to call addr_signal */
608 ++ dss_csum:1,
609 ++ server_side:1,
610 ++ infinite_mapping_rcv:1,
611 ++ infinite_mapping_snd:1,
612 ++ dfin_combined:1, /* Was the DFIN combined with subflow-fin? */
613 ++ passive_close:1,
614 ++ snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */
615 ++ rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */
616 ++
617 ++ /* socket count in this connection */
618 ++ u8 cnt_subflows;
619 ++ u8 cnt_established;
620 ++
621 ++ struct mptcp_sched_ops *sched_ops;
622 ++
623 ++ struct sk_buff_head reinject_queue;
624 ++ /* First cache-line boundary is here minus 8 bytes. But from the
625 ++ * reinject-queue only the next and prev pointers are regularly
626 ++ * accessed. Thus, the whole data-path is on a single cache-line.
627 ++ */
628 ++
629 ++ u64 csum_cutoff_seq;
630 ++
631 ++ /***** Start of fields, used for connection closure */
632 ++ spinlock_t tw_lock;
633 ++ unsigned char mptw_state;
634 ++ u8 dfin_path_index;
635 ++
636 ++ struct list_head tw_list;
637 ++
638 ++ /***** Start of fields, used for subflow establishment and closure */
639 ++ atomic_t mpcb_refcnt;
640 ++
641 ++ /* Mutex needed, because otherwise mptcp_close will complain that the
642 ++ * socket is owned by the user.
643 ++ * E.g., mptcp_sub_close_wq is taking the meta-lock.
644 ++ */
645 ++ struct mutex mpcb_mutex;
646 ++
647 ++ /***** Start of fields, used for subflow establishment */
648 ++ struct sock *meta_sk;
649 ++
650 ++ /* Master socket, also part of the connection_list, this
651 ++ * socket is the one that the application sees.
652 ++ */
653 ++ struct sock *master_sk;
654 ++
655 ++ __u64 mptcp_loc_key;
656 ++ __u64 mptcp_rem_key;
657 ++ __u32 mptcp_loc_token;
658 ++ __u32 mptcp_rem_token;
659 ++
660 ++#define MPTCP_PM_SIZE 608
661 ++ u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8);
662 ++ struct mptcp_pm_ops *pm_ops;
663 ++
664 ++ u32 path_index_bits;
665 ++ /* Next pi to pick up in case a new path becomes available */
666 ++ u8 next_path_index;
667 ++
668 ++ /* Original snd/rcvbuf of the initial subflow.
669 ++ * Used for the new subflows on the server-side to allow correct
670 ++ * autotuning
671 ++ */
672 ++ int orig_sk_rcvbuf;
673 ++ int orig_sk_sndbuf;
674 ++ u32 orig_window_clamp;
675 ++
676 ++ /* Timer for retransmitting SYN/ACK+MP_JOIN */
677 ++ struct timer_list synack_timer;
678 ++};
679 ++
680 ++#define MPTCP_SUB_CAPABLE 0
681 ++#define MPTCP_SUB_LEN_CAPABLE_SYN 12
682 ++#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12
683 ++#define MPTCP_SUB_LEN_CAPABLE_ACK 20
684 ++#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20
685 ++
686 ++#define MPTCP_SUB_JOIN 1
687 ++#define MPTCP_SUB_LEN_JOIN_SYN 12
688 ++#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12
689 ++#define MPTCP_SUB_LEN_JOIN_SYNACK 16
690 ++#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16
691 ++#define MPTCP_SUB_LEN_JOIN_ACK 24
692 ++#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24
693 ++
694 ++#define MPTCP_SUB_DSS 2
695 ++#define MPTCP_SUB_LEN_DSS 4
696 ++#define MPTCP_SUB_LEN_DSS_ALIGN 4
697 ++
698 ++/* Lengths for seq and ack are the ones without the generic MPTCP-option header,
699 ++ * as they are part of the DSS-option.
700 ++ * To get the total length, just add the different options together.
701 ++ */
702 ++#define MPTCP_SUB_LEN_SEQ 10
703 ++#define MPTCP_SUB_LEN_SEQ_CSUM 12
704 ++#define MPTCP_SUB_LEN_SEQ_ALIGN 12
705 ++
706 ++#define MPTCP_SUB_LEN_SEQ_64 14
707 ++#define MPTCP_SUB_LEN_SEQ_CSUM_64 16
708 ++#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16
709 ++
710 ++#define MPTCP_SUB_LEN_ACK 4
711 ++#define MPTCP_SUB_LEN_ACK_ALIGN 4
712 ++
713 ++#define MPTCP_SUB_LEN_ACK_64 8
714 ++#define MPTCP_SUB_LEN_ACK_64_ALIGN 8
715 ++
716 ++/* This is the "default" option-length we will send out most often.
717 ++ * MPTCP DSS-header
718 ++ * 32-bit data sequence number
719 ++ * 32-bit data ack
720 ++ *
721 ++ * It is necessary to calculate the effective MSS we will be using when
722 ++ * sending data.
723 ++ */
724 ++#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \
725 ++ MPTCP_SUB_LEN_SEQ_ALIGN + \
726 ++ MPTCP_SUB_LEN_ACK_ALIGN)
727 ++
728 ++#define MPTCP_SUB_ADD_ADDR 3
729 ++#define MPTCP_SUB_LEN_ADD_ADDR4 8
730 ++#define MPTCP_SUB_LEN_ADD_ADDR6 20
731 ++#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8
732 ++#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20
733 ++
734 ++#define MPTCP_SUB_REMOVE_ADDR 4
735 ++#define MPTCP_SUB_LEN_REMOVE_ADDR 4
736 ++
737 ++#define MPTCP_SUB_PRIO 5
738 ++#define MPTCP_SUB_LEN_PRIO 3
739 ++#define MPTCP_SUB_LEN_PRIO_ADDR 4
740 ++#define MPTCP_SUB_LEN_PRIO_ALIGN 4
741 ++
742 ++#define MPTCP_SUB_FAIL 6
743 ++#define MPTCP_SUB_LEN_FAIL 12
744 ++#define MPTCP_SUB_LEN_FAIL_ALIGN 12
745 ++
746 ++#define MPTCP_SUB_FCLOSE 7
747 ++#define MPTCP_SUB_LEN_FCLOSE 12
748 ++#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12
749 ++
750 ++
751 ++#define OPTION_MPTCP (1 << 5)
752 ++
753 ++#ifdef CONFIG_MPTCP
754 ++
755 ++/* Used for checking if the mptcp initialization has been successful */
756 ++extern bool mptcp_init_failed;
757 ++
758 ++/* MPTCP options */
759 ++#define OPTION_TYPE_SYN (1 << 0)
760 ++#define OPTION_TYPE_SYNACK (1 << 1)
761 ++#define OPTION_TYPE_ACK (1 << 2)
762 ++#define OPTION_MP_CAPABLE (1 << 3)
763 ++#define OPTION_DATA_ACK (1 << 4)
764 ++#define OPTION_ADD_ADDR (1 << 5)
765 ++#define OPTION_MP_JOIN (1 << 6)
766 ++#define OPTION_MP_FAIL (1 << 7)
767 ++#define OPTION_MP_FCLOSE (1 << 8)
768 ++#define OPTION_REMOVE_ADDR (1 << 9)
769 ++#define OPTION_MP_PRIO (1 << 10)
770 ++
771 ++/* MPTCP flags: both TX and RX */
772 ++#define MPTCPHDR_SEQ 0x01 /* DSS.M option is present */
773 ++#define MPTCPHDR_FIN 0x02 /* DSS.F option is present */
774 ++#define MPTCPHDR_SEQ64_INDEX 0x04 /* index of seq in mpcb->snd_high_order */
775 ++/* MPTCP flags: RX only */
776 ++#define MPTCPHDR_ACK 0x08
777 ++#define MPTCPHDR_SEQ64_SET 0x10 /* Did we receive a 64-bit seq number? */
778 ++#define MPTCPHDR_SEQ64_OFO 0x20 /* Is it not in our circular array? */
779 ++#define MPTCPHDR_DSS_CSUM 0x40
780 ++#define MPTCPHDR_JOIN 0x80
781 ++/* MPTCP flags: TX only */
782 ++#define MPTCPHDR_INF 0x08
783 ++
784 ++struct mptcp_option {
785 ++ __u8 kind;
786 ++ __u8 len;
787 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
788 ++ __u8 ver:4,
789 ++ sub:4;
790 ++#elif defined(__BIG_ENDIAN_BITFIELD)
791 ++ __u8 sub:4,
792 ++ ver:4;
793 ++#else
794 ++#error "Adjust your <asm/byteorder.h> defines"
795 ++#endif
796 ++};
797 ++
798 ++struct mp_capable {
799 ++ __u8 kind;
800 ++ __u8 len;
801 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
802 ++ __u8 ver:4,
803 ++ sub:4;
804 ++ __u8 h:1,
805 ++ rsv:5,
806 ++ b:1,
807 ++ a:1;
808 ++#elif defined(__BIG_ENDIAN_BITFIELD)
809 ++ __u8 sub:4,
810 ++ ver:4;
811 ++ __u8 a:1,
812 ++ b:1,
813 ++ rsv:5,
814 ++ h:1;
815 ++#else
816 ++#error "Adjust your <asm/byteorder.h> defines"
817 ++#endif
818 ++ __u64 sender_key;
819 ++ __u64 receiver_key;
820 ++} __attribute__((__packed__));
821 ++
822 ++struct mp_join {
823 ++ __u8 kind;
824 ++ __u8 len;
825 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
826 ++ __u8 b:1,
827 ++ rsv:3,
828 ++ sub:4;
829 ++#elif defined(__BIG_ENDIAN_BITFIELD)
830 ++ __u8 sub:4,
831 ++ rsv:3,
832 ++ b:1;
833 ++#else
834 ++#error "Adjust your <asm/byteorder.h> defines"
835 ++#endif
836 ++ __u8 addr_id;
837 ++ union {
838 ++ struct {
839 ++ u32 token;
840 ++ u32 nonce;
841 ++ } syn;
842 ++ struct {
843 ++ __u64 mac;
844 ++ u32 nonce;
845 ++ } synack;
846 ++ struct {
847 ++ __u8 mac[20];
848 ++ } ack;
849 ++ } u;
850 ++} __attribute__((__packed__));
851 ++
852 ++struct mp_dss {
853 ++ __u8 kind;
854 ++ __u8 len;
855 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
856 ++ __u16 rsv1:4,
857 ++ sub:4,
858 ++ A:1,
859 ++ a:1,
860 ++ M:1,
861 ++ m:1,
862 ++ F:1,
863 ++ rsv2:3;
864 ++#elif defined(__BIG_ENDIAN_BITFIELD)
865 ++ __u16 sub:4,
866 ++ rsv1:4,
867 ++ rsv2:3,
868 ++ F:1,
869 ++ m:1,
870 ++ M:1,
871 ++ a:1,
872 ++ A:1;
873 ++#else
874 ++#error "Adjust your <asm/byteorder.h> defines"
875 ++#endif
876 ++};
877 ++
878 ++struct mp_add_addr {
879 ++ __u8 kind;
880 ++ __u8 len;
881 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
882 ++ __u8 ipver:4,
883 ++ sub:4;
884 ++#elif defined(__BIG_ENDIAN_BITFIELD)
885 ++ __u8 sub:4,
886 ++ ipver:4;
887 ++#else
888 ++#error "Adjust your <asm/byteorder.h> defines"
889 ++#endif
890 ++ __u8 addr_id;
891 ++ union {
892 ++ struct {
893 ++ struct in_addr addr;
894 ++ __be16 port;
895 ++ } v4;
896 ++ struct {
897 ++ struct in6_addr addr;
898 ++ __be16 port;
899 ++ } v6;
900 ++ } u;
901 ++} __attribute__((__packed__));
902 ++
903 ++struct mp_remove_addr {
904 ++ __u8 kind;
905 ++ __u8 len;
906 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
907 ++ __u8 rsv:4,
908 ++ sub:4;
909 ++#elif defined(__BIG_ENDIAN_BITFIELD)
910 ++ __u8 sub:4,
911 ++ rsv:4;
912 ++#else
913 ++#error "Adjust your <asm/byteorder.h> defines"
914 ++#endif
915 ++ /* list of addr_id */
916 ++ __u8 addrs_id;
917 ++};
918 ++
919 ++struct mp_fail {
920 ++ __u8 kind;
921 ++ __u8 len;
922 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
923 ++ __u16 rsv1:4,
924 ++ sub:4,
925 ++ rsv2:8;
926 ++#elif defined(__BIG_ENDIAN_BITFIELD)
927 ++ __u16 sub:4,
928 ++ rsv1:4,
929 ++ rsv2:8;
930 ++#else
931 ++#error "Adjust your <asm/byteorder.h> defines"
932 ++#endif
933 ++ __be64 data_seq;
934 ++} __attribute__((__packed__));
935 ++
936 ++struct mp_fclose {
937 ++ __u8 kind;
938 ++ __u8 len;
939 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
940 ++ __u16 rsv1:4,
941 ++ sub:4,
942 ++ rsv2:8;
943 ++#elif defined(__BIG_ENDIAN_BITFIELD)
944 ++ __u16 sub:4,
945 ++ rsv1:4,
946 ++ rsv2:8;
947 ++#else
948 ++#error "Adjust your <asm/byteorder.h> defines"
949 ++#endif
950 ++ __u64 key;
951 ++} __attribute__((__packed__));
952 ++
953 ++struct mp_prio {
954 ++ __u8 kind;
955 ++ __u8 len;
956 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
957 ++ __u8 b:1,
958 ++ rsv:3,
959 ++ sub:4;
960 ++#elif defined(__BIG_ENDIAN_BITFIELD)
961 ++ __u8 sub:4,
962 ++ rsv:3,
963 ++ b:1;
964 ++#else
965 ++#error "Adjust your <asm/byteorder.h> defines"
966 ++#endif
967 ++ __u8 addr_id;
968 ++} __attribute__((__packed__));
969 ++
970 ++static inline int mptcp_sub_len_dss(const struct mp_dss *m, const int csum)
971 ++{
972 ++ return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2);
973 ++}
974 ++
975 ++#define MPTCP_APP 2
976 ++
977 ++extern int sysctl_mptcp_enabled;
978 ++extern int sysctl_mptcp_checksum;
979 ++extern int sysctl_mptcp_debug;
980 ++extern int sysctl_mptcp_syn_retries;
981 ++
982 ++extern struct workqueue_struct *mptcp_wq;
983 ++
984 ++#define mptcp_debug(fmt, args...) \
985 ++ do { \
986 ++ if (unlikely(sysctl_mptcp_debug)) \
987 ++ pr_err(__FILE__ ": " fmt, ##args); \
988 ++ } while (0)
989 ++
990 ++/* Iterates over all subflows */
991 ++#define mptcp_for_each_tp(mpcb, tp) \
992 ++ for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next)
993 ++
994 ++#define mptcp_for_each_sk(mpcb, sk) \
995 ++ for ((sk) = (struct sock *)(mpcb)->connection_list; \
996 ++ sk; \
997 ++ sk = (struct sock *)tcp_sk(sk)->mptcp->next)
998 ++
999 ++#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) \
1000 ++ for (__sk = (struct sock *)(__mpcb)->connection_list, \
1001 ++ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \
1002 ++ __sk; \
1003 ++ __sk = __temp, \
1004 ++ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL)
1005 ++
1006 ++/* Iterates over all bits set to 1 in a bitset */
1007 ++#define mptcp_for_each_bit_set(b, i) \
1008 ++ for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1)
1009 ++
1010 ++#define mptcp_for_each_bit_unset(b, i) \
1011 ++ mptcp_for_each_bit_set(~b, i)
1012 ++
1013 ++extern struct lock_class_key meta_key;
1014 ++extern struct lock_class_key meta_slock_key;
1015 ++extern u32 mptcp_secret[MD5_MESSAGE_BYTES / 4];
1016 ++
1017 ++/* This is needed to ensure that two subsequent key/nonce generations result in
1018 ++ * different keys/nonces if the IPs and ports are the same.
1019 ++ */
1020 ++extern u32 mptcp_seed;
1021 ++
1022 ++#define MPTCP_HASH_SIZE 1024
1023 ++
1024 ++extern struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
1025 ++
1026 ++/* This second hashtable is needed to retrieve request socks
1027 ++ * created as a result of a join request. While the SYN contains
1028 ++ * the token, the final ack does not, so we need a separate hashtable
1029 ++ * to retrieve the mpcb.
1030 ++ */
1031 ++extern struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
1032 ++extern spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
1033 ++
1034 ++/* Lock, protecting the two hash-tables that hold the token. Namely,
1035 ++ * mptcp_reqsk_tk_htb and tk_hashtable
1036 ++ */
1037 ++extern spinlock_t mptcp_tk_hashlock; /* hashtable protection */
1038 ++
1039 ++/* Request-sockets can be hashed in the tk_htb for collision-detection or in
1040 ++ * the regular htb for join-connections. We need to define different NULLS
1041 ++ * values so that we can correctly detect a request-socket that has been
1042 ++ * recycled. See also c25eb3bfb9729.
1043 ++ */
1044 ++#define MPTCP_REQSK_NULLS_BASE (1U << 29)
1045 ++
1046 ++
1047 ++void mptcp_data_ready(struct sock *sk);
1048 ++void mptcp_write_space(struct sock *sk);
1049 ++
1050 ++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
1051 ++ struct sock *sk);
1052 ++void mptcp_ofo_queue(struct sock *meta_sk);
1053 ++void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp);
1054 ++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied);
1055 ++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
1056 ++ gfp_t flags);
1057 ++void mptcp_del_sock(struct sock *sk);
1058 ++void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk);
1059 ++void mptcp_reinject_data(struct sock *orig_sk, int clone_it);
1060 ++void mptcp_update_sndbuf(const struct tcp_sock *tp);
1061 ++void mptcp_send_fin(struct sock *meta_sk);
1062 ++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority);
1063 ++bool mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1064 ++ int push_one, gfp_t gfp);
1065 ++void tcp_parse_mptcp_options(const struct sk_buff *skb,
1066 ++ struct mptcp_options_received *mopt);
1067 ++void mptcp_parse_options(const uint8_t *ptr, int opsize,
1068 ++ struct mptcp_options_received *mopt,
1069 ++ const struct sk_buff *skb);
1070 ++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
1071 ++ unsigned *remaining);
1072 ++void mptcp_synack_options(struct request_sock *req,
1073 ++ struct tcp_out_options *opts,
1074 ++ unsigned *remaining);
1075 ++void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
1076 ++ struct tcp_out_options *opts, unsigned *size);
1077 ++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
1078 ++ const struct tcp_out_options *opts,
1079 ++ struct sk_buff *skb);
1080 ++void mptcp_close(struct sock *meta_sk, long timeout);
1081 ++int mptcp_doit(struct sock *sk);
1082 ++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window);
1083 ++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req);
1084 ++int mptcp_check_req_master(struct sock *sk, struct sock *child,
1085 ++ struct request_sock *req,
1086 ++ struct request_sock **prev);
1087 ++struct sock *mptcp_check_req_child(struct sock *sk, struct sock *child,
1088 ++ struct request_sock *req,
1089 ++ struct request_sock **prev,
1090 ++ const struct mptcp_options_received *mopt);
1091 ++u32 __mptcp_select_window(struct sock *sk);
1092 ++void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
1093 ++ __u32 *window_clamp, int wscale_ok,
1094 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
1095 ++ const struct sock *sk);
1096 ++unsigned int mptcp_current_mss(struct sock *meta_sk);
1097 ++int mptcp_select_size(const struct sock *meta_sk, bool sg);
1098 ++void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn);
1099 ++void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
1100 ++ u32 *hash_out);
1101 ++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk);
1102 ++void mptcp_fin(struct sock *meta_sk);
1103 ++void mptcp_retransmit_timer(struct sock *meta_sk);
1104 ++int mptcp_write_wakeup(struct sock *meta_sk);
1105 ++void mptcp_sub_close_wq(struct work_struct *work);
1106 ++void mptcp_sub_close(struct sock *sk, unsigned long delay);
1107 ++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk);
1108 ++void mptcp_fallback_meta_sk(struct sock *meta_sk);
1109 ++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb);
1110 ++void mptcp_ack_handler(unsigned long);
1111 ++int mptcp_check_rtt(const struct tcp_sock *tp, int time);
1112 ++int mptcp_check_snd_buf(const struct tcp_sock *tp);
1113 ++int mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
1114 ++ const struct sk_buff *skb);
1115 ++void __init mptcp_init(void);
1116 ++int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len);
1117 ++void mptcp_destroy_sock(struct sock *sk);
1118 ++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
1119 ++ const struct sk_buff *skb,
1120 ++ const struct mptcp_options_received *mopt);
1121 ++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
1122 ++ int large_allowed);
1123 ++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw);
1124 ++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw);
1125 ++void mptcp_time_wait(struct sock *sk, int state, int timeo);
1126 ++void mptcp_disconnect(struct sock *sk);
1127 ++bool mptcp_should_expand_sndbuf(const struct sock *sk);
1128 ++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb);
1129 ++void mptcp_tsq_flags(struct sock *sk);
1130 ++void mptcp_tsq_sub_deferred(struct sock *meta_sk);
1131 ++struct mp_join *mptcp_find_join(const struct sk_buff *skb);
1132 ++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp);
1133 ++void mptcp_hash_remove(struct tcp_sock *meta_tp);
1134 ++struct sock *mptcp_hash_find(const struct net *net, const u32 token);
1135 ++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw);
1136 ++int mptcp_do_join_short(struct sk_buff *skb,
1137 ++ const struct mptcp_options_received *mopt,
1138 ++ struct net *net);
1139 ++void mptcp_reqsk_destructor(struct request_sock *req);
1140 ++void mptcp_reqsk_new_mptcp(struct request_sock *req,
1141 ++ const struct mptcp_options_received *mopt,
1142 ++ const struct sk_buff *skb);
1143 ++int mptcp_check_req(struct sk_buff *skb, struct net *net);
1144 ++void mptcp_connect_init(struct sock *sk);
1145 ++void mptcp_sub_force_close(struct sock *sk);
1146 ++int mptcp_sub_len_remove_addr_align(u16 bitfield);
1147 ++void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
1148 ++ const struct sk_buff *skb);
1149 ++void mptcp_init_buffer_space(struct sock *sk);
1150 ++void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req,
1151 ++ struct sk_buff *skb);
1152 ++void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb);
1153 ++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb);
1154 ++void mptcp_init_congestion_control(struct sock *sk);
1155 ++
1156 ++/* MPTCP-path-manager registration/initialization functions */
1157 ++int mptcp_register_path_manager(struct mptcp_pm_ops *pm);
1158 ++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm);
1159 ++void mptcp_init_path_manager(struct mptcp_cb *mpcb);
1160 ++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb);
1161 ++void mptcp_fallback_default(struct mptcp_cb *mpcb);
1162 ++void mptcp_get_default_path_manager(char *name);
1163 ++int mptcp_set_default_path_manager(const char *name);
1164 ++extern struct mptcp_pm_ops mptcp_pm_default;
1165 ++
1166 ++/* MPTCP-scheduler registration/initialization functions */
1167 ++int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
1168 ++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
1169 ++void mptcp_init_scheduler(struct mptcp_cb *mpcb);
1170 ++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb);
1171 ++void mptcp_get_default_scheduler(char *name);
1172 ++int mptcp_set_default_scheduler(const char *name);
1173 ++extern struct mptcp_sched_ops mptcp_sched_default;
1174 ++
1175 ++static inline void mptcp_reset_synack_timer(struct sock *meta_sk,
1176 ++ unsigned long len)
1177 ++{
1178 ++ sk_reset_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer,
1179 ++ jiffies + len);
1180 ++}
1181 ++
1182 ++static inline void mptcp_delete_synack_timer(struct sock *meta_sk)
1183 ++{
1184 ++ sk_stop_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer);
1185 ++}
1186 ++
1187 ++static inline bool is_mptcp_enabled(const struct sock *sk)
1188 ++{
1189 ++ if (!sysctl_mptcp_enabled || mptcp_init_failed)
1190 ++ return false;
1191 ++
1192 ++ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
1193 ++ return false;
1194 ++
1195 ++ return true;
1196 ++}
1197 ++
1198 ++static inline int mptcp_pi_to_flag(int pi)
1199 ++{
1200 ++ return 1 << (pi - 1);
1201 ++}
1202 ++
1203 ++static inline
1204 ++struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req)
1205 ++{
1206 ++ return (struct mptcp_request_sock *)req;
1207 ++}
1208 ++
1209 ++static inline
1210 ++struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req)
1211 ++{
1212 ++ return (struct request_sock *)req;
1213 ++}
1214 ++
1215 ++static inline bool mptcp_can_sendpage(struct sock *sk)
1216 ++{
1217 ++ struct sock *sk_it;
1218 ++
1219 ++ if (tcp_sk(sk)->mpcb->dss_csum)
1220 ++ return false;
1221 ++
1222 ++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) {
1223 ++ if (!(sk_it->sk_route_caps & NETIF_F_SG) ||
1224 ++ !(sk_it->sk_route_caps & NETIF_F_ALL_CSUM))
1225 ++ return false;
1226 ++ }
1227 ++
1228 ++ return true;
1229 ++}
1230 ++
1231 ++static inline void mptcp_push_pending_frames(struct sock *meta_sk)
1232 ++{
1233 ++ /* We check packets out and send-head here. TCP only checks the
1234 ++ * send-head. But, MPTCP also checks packets_out, as this is an
1235 ++ * indication that we might want to do opportunistic reinjection.
1236 ++ */
1237 ++ if (tcp_sk(meta_sk)->packets_out || tcp_send_head(meta_sk)) {
1238 ++ struct tcp_sock *tp = tcp_sk(meta_sk);
1239 ++
1240 ++ /* We don't care about the MSS, because it will be set in
1241 ++ * mptcp_write_xmit.
1242 ++ */
1243 ++ __tcp_push_pending_frames(meta_sk, 0, tp->nonagle);
1244 ++ }
1245 ++}
1246 ++
1247 ++static inline void mptcp_send_reset(struct sock *sk)
1248 ++{
1249 ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
1250 ++ mptcp_sub_force_close(sk);
1251 ++}
1252 ++
1253 ++static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
1254 ++{
1255 ++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ;
1256 ++}
1257 ++
1258 ++static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
1259 ++{
1260 ++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN;
1261 ++}
1262 ++
1263 ++/* Is it a data-fin while in infinite mapping mode?
1264 ++ * In infinite mode, a subflow-fin is in fact a data-fin.
1265 ++ */
1266 ++static inline bool mptcp_is_data_fin2(const struct sk_buff *skb,
1267 ++ const struct tcp_sock *tp)
1268 ++{
1269 ++ return mptcp_is_data_fin(skb) ||
1270 ++ (tp->mpcb->infinite_mapping_rcv && tcp_hdr(skb)->fin);
1271 ++}
1272 ++
1273 ++static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
1274 ++{
1275 ++ u64 data_seq_high = (u32)(data_seq >> 32);
1276 ++
1277 ++ if (mpcb->rcv_high_order[0] == data_seq_high)
1278 ++ return 0;
1279 ++ else if (mpcb->rcv_high_order[1] == data_seq_high)
1280 ++ return MPTCPHDR_SEQ64_INDEX;
1281 ++ else
1282 ++ return MPTCPHDR_SEQ64_OFO;
1283 ++}
1284 ++
1285 ++/* Sets the data_seq and returns pointer to the in-skb field of the data_seq.
1286 ++ * If the packet has a 64-bit dseq, the pointer points to the last 32 bits.
1287 ++ */
1288 ++static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb,
1289 ++ u32 *data_seq,
1290 ++ struct mptcp_cb *mpcb)
1291 ++{
1292 ++ __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
1293 ++
1294 ++ if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
1295 ++ u64 data_seq64 = get_unaligned_be64(ptr);
1296 ++
1297 ++ if (mpcb)
1298 ++ TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);
1299 ++
1300 ++ *data_seq = (u32)data_seq64;
1301 ++ ptr++;
1302 ++ } else {
1303 ++ *data_seq = get_unaligned_be32(ptr);
1304 ++ }
1305 ++
1306 ++ return ptr;
1307 ++}
1308 ++
1309 ++static inline struct sock *mptcp_meta_sk(const struct sock *sk)
1310 ++{
1311 ++ return tcp_sk(sk)->meta_sk;
1312 ++}
1313 ++
1314 ++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
1315 ++{
1316 ++ return tcp_sk(tp->meta_sk);
1317 ++}
1318 ++
1319 ++static inline int is_meta_tp(const struct tcp_sock *tp)
1320 ++{
1321 ++ return tp->mpcb && mptcp_meta_tp(tp) == tp;
1322 ++}
1323 ++
1324 ++static inline int is_meta_sk(const struct sock *sk)
1325 ++{
1326 ++ return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP &&
1327 ++ mptcp(tcp_sk(sk)) && mptcp_meta_sk(sk) == sk;
1328 ++}
1329 ++
1330 ++static inline int is_master_tp(const struct tcp_sock *tp)
1331 ++{
1332 ++ return !mptcp(tp) || (!tp->mptcp->slave_sk && !is_meta_tp(tp));
1333 ++}
1334 ++
1335 ++static inline void mptcp_hash_request_remove(struct request_sock *req)
1336 ++{
1337 ++ int in_softirq = 0;
1338 ++
1339 ++ if (hlist_nulls_unhashed(&mptcp_rsk(req)->hash_entry))
1340 ++ return;
1341 ++
1342 ++ if (in_softirq()) {
1343 ++ spin_lock(&mptcp_reqsk_hlock);
1344 ++ in_softirq = 1;
1345 ++ } else {
1346 ++ spin_lock_bh(&mptcp_reqsk_hlock);
1347 ++ }
1348 ++
1349 ++ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry);
1350 ++
1351 ++ if (in_softirq)
1352 ++ spin_unlock(&mptcp_reqsk_hlock);
1353 ++ else
1354 ++ spin_unlock_bh(&mptcp_reqsk_hlock);
1355 ++}
1356 ++
1357 ++static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt)
1358 ++{
1359 ++ mopt->saw_mpc = 0;
1360 ++ mopt->dss_csum = 0;
1361 ++ mopt->drop_me = 0;
1362 ++
1363 ++ mopt->is_mp_join = 0;
1364 ++ mopt->join_ack = 0;
1365 ++
1366 ++ mopt->saw_low_prio = 0;
1367 ++ mopt->low_prio = 0;
1368 ++
1369 ++ mopt->saw_add_addr = 0;
1370 ++ mopt->more_add_addr = 0;
1371 ++
1372 ++ mopt->saw_rem_addr = 0;
1373 ++ mopt->more_rem_addr = 0;
1374 ++
1375 ++ mopt->mp_fail = 0;
1376 ++ mopt->mp_fclose = 0;
1377 ++}
1378 ++
1379 ++static inline void mptcp_reset_mopt(struct tcp_sock *tp)
1380 ++{
1381 ++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
1382 ++
1383 ++ mopt->saw_low_prio = 0;
1384 ++ mopt->saw_add_addr = 0;
1385 ++ mopt->more_add_addr = 0;
1386 ++ mopt->saw_rem_addr = 0;
1387 ++ mopt->more_rem_addr = 0;
1388 ++ mopt->join_ack = 0;
1389 ++ mopt->mp_fail = 0;
1390 ++ mopt->mp_fclose = 0;
1391 ++}
1392 ++
1393 ++static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb,
1394 ++ const struct mptcp_cb *mpcb)
1395 ++{
1396 ++ return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
1397 ++ MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
1398 ++}
1399 ++
1400 ++static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index,
1401 ++ u32 data_seq_32)
1402 ++{
1403 ++ return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
1404 ++}
1405 ++
1406 ++static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
1407 ++{
1408 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
1409 ++ return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
1410 ++ meta_tp->rcv_nxt);
1411 ++}
1412 ++
1413 ++static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
1414 ++{
1415 ++ if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
1416 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
1417 ++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
1418 ++ mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
1419 ++ }
1420 ++}
1421 ++
1422 ++static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp,
1423 ++ u32 old_rcv_nxt)
1424 ++{
1425 ++ if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
1426 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
1427 ++ mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
1428 ++ mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
1429 ++ }
1430 ++}
1431 ++
1432 ++static inline int mptcp_sk_can_send(const struct sock *sk)
1433 ++{
1434 ++ return tcp_passive_fastopen(sk) ||
1435 ++ ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1436 ++ !tcp_sk(sk)->mptcp->pre_established);
1437 ++}
1438 ++
1439 ++static inline int mptcp_sk_can_recv(const struct sock *sk)
1440 ++{
1441 ++ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2);
1442 ++}
1443 ++
1444 ++static inline int mptcp_sk_can_send_ack(const struct sock *sk)
1445 ++{
1446 ++ return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV |
1447 ++ TCPF_CLOSE | TCPF_LISTEN)) &&
1448 ++ !tcp_sk(sk)->mptcp->pre_established;
1449 ++}
1450 ++
1451 ++/* Only support GSO if all subflows support it */
1452 ++static inline bool mptcp_sk_can_gso(const struct sock *meta_sk)
1453 ++{
1454 ++ struct sock *sk;
1455 ++
1456 ++ if (tcp_sk(meta_sk)->mpcb->dss_csum)
1457 ++ return false;
1458 ++
1459 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
1460 ++ if (!mptcp_sk_can_send(sk))
1461 ++ continue;
1462 ++ if (!sk_can_gso(sk))
1463 ++ return false;
1464 ++ }
1465 ++ return true;
1466 ++}
1467 ++
1468 ++static inline bool mptcp_can_sg(const struct sock *meta_sk)
1469 ++{
1470 ++ struct sock *sk;
1471 ++
1472 ++ if (tcp_sk(meta_sk)->mpcb->dss_csum)
1473 ++ return false;
1474 ++
1475 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
1476 ++ if (!mptcp_sk_can_send(sk))
1477 ++ continue;
1478 ++ if (!(sk->sk_route_caps & NETIF_F_SG))
1479 ++ return false;
1480 ++ }
1481 ++ return true;
1482 ++}
1483 ++
1484 ++static inline void mptcp_set_rto(struct sock *sk)
1485 ++{
1486 ++ struct tcp_sock *tp = tcp_sk(sk);
1487 ++ struct sock *sk_it;
1488 ++ struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk));
1489 ++ __u32 max_rto = 0;
1490 ++
1491 ++ /* We are in recovery-phase on the MPTCP-level. Do not update the
1492 ++ * RTO, because this would kill exponential backoff.
1493 ++ */
1494 ++ if (micsk->icsk_retransmits)
1495 ++ return;
1496 ++
1497 ++ mptcp_for_each_sk(tp->mpcb, sk_it) {
1498 ++ if (mptcp_sk_can_send(sk_it) &&
1499 ++ inet_csk(sk_it)->icsk_rto > max_rto)
1500 ++ max_rto = inet_csk(sk_it)->icsk_rto;
1501 ++ }
1502 ++ if (max_rto) {
1503 ++ micsk->icsk_rto = max_rto << 1;
1504 ++
1505 ++ /* A successful rto-measurement - reset backoff counter */
1506 ++ micsk->icsk_backoff = 0;
1507 ++ }
1508 ++}
1509 ++
1510 ++static inline int mptcp_sysctl_syn_retries(void)
1511 ++{
1512 ++ return sysctl_mptcp_syn_retries;
1513 ++}
1514 ++
1515 ++static inline void mptcp_sub_close_passive(struct sock *sk)
1516 ++{
1517 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
1518 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk);
1519 ++
1520 ++ /* Only close if the app did a send-shutdown (passive close), and we
1521 ++ * received the data-ack of the data-fin.
1522 ++ */
1523 ++ if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq)
1524 ++ mptcp_sub_close(sk, 0);
1525 ++}
1526 ++
1527 ++static inline bool mptcp_fallback_infinite(struct sock *sk, int flag)
1528 ++{
1529 ++ struct tcp_sock *tp = tcp_sk(sk);
1530 ++
1531 ++ /* If data has been acknowledged on the meta-level, fully_established
1532 ++ * will have been set before and thus we will not fall back to infinite
1533 ++ * mapping.
1534 ++ */
1535 ++ if (likely(tp->mptcp->fully_established))
1536 ++ return false;
1537 ++
1538 ++ if (!(flag & MPTCP_FLAG_DATA_ACKED))
1539 ++ return false;
1540 ++
1541 ++ /* Don't fallback twice ;) */
1542 ++ if (tp->mpcb->infinite_mapping_snd)
1543 ++ return false;
1544 ++
1545 ++ pr_err("%s %#x will fallback - pi %d, src %pI4 dst %pI4 from %pS\n",
1546 ++ __func__, tp->mpcb->mptcp_loc_token, tp->mptcp->path_index,
1547 ++ &inet_sk(sk)->inet_saddr, &inet_sk(sk)->inet_daddr,
1548 ++ __builtin_return_address(0));
1549 ++ if (!is_master_tp(tp))
1550 ++ return true;
1551 ++
1552 ++ tp->mpcb->infinite_mapping_snd = 1;
1553 ++ tp->mpcb->infinite_mapping_rcv = 1;
1554 ++ tp->mptcp->fully_established = 1;
1555 ++
1556 ++ return false;
1557 ++}
1558 ++
1559 ++/* Find the first index whose bit in the bit-field == 0 */
1560 ++static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb)
1561 ++{
1562 ++ u8 base = mpcb->next_path_index;
1563 ++ int i;
1564 ++
1565 ++ /* Start at 1, because 0 is reserved for the meta-sk */
1566 ++ mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) {
1567 ++ if (i + base < 1)
1568 ++ continue;
1569 ++ if (i + base >= sizeof(mpcb->path_index_bits) * 8)
1570 ++ break;
1571 ++ i += base;
1572 ++ mpcb->path_index_bits |= (1 << i);
1573 ++ mpcb->next_path_index = i + 1;
1574 ++ return i;
1575 ++ }
1576 ++ mptcp_for_each_bit_unset(mpcb->path_index_bits, i) {
1577 ++ if (i >= sizeof(mpcb->path_index_bits) * 8)
1578 ++ break;
1579 ++ if (i < 1)
1580 ++ continue;
1581 ++ mpcb->path_index_bits |= (1 << i);
1582 ++ mpcb->next_path_index = i + 1;
1583 ++ return i;
1584 ++ }
1585 ++
1586 ++ return 0;
1587 ++}
1588 ++
1589 ++static inline bool mptcp_v6_is_v4_mapped(const struct sock *sk)
1590 ++{
1591 ++ return sk->sk_family == AF_INET6 &&
1592 ++ ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED;
1593 ++}
1594 ++
1595 ++/* TCP and MPTCP mpc flag-depending functions */
1596 ++u16 mptcp_select_window(struct sock *sk);
1597 ++void mptcp_init_buffer_space(struct sock *sk);
1598 ++void mptcp_tcp_set_rto(struct sock *sk);
1599 ++
1600 ++/* TCP and MPTCP flag-depending functions */
1601 ++bool mptcp_prune_ofo_queue(struct sock *sk);
1602 ++
1603 ++#else /* CONFIG_MPTCP */
1604 ++#define mptcp_debug(fmt, args...) \
1605 ++ do { \
1606 ++ } while (0)
1607 ++
1608 ++/* Without MPTCP, we just do one iteration
1609 ++ * over the only socket available. This assumes that
1610 ++ * the sk/tp arg is the socket in that case.
1611 ++ */
1612 ++#define mptcp_for_each_sk(mpcb, sk)
1613 ++#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp)
1614 ++
1615 ++static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
1616 ++{
1617 ++ return false;
1618 ++}
1619 ++static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
1620 ++{
1621 ++ return false;
1622 ++}
1623 ++static inline struct sock *mptcp_meta_sk(const struct sock *sk)
1624 ++{
1625 ++ return NULL;
1626 ++}
1627 ++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
1628 ++{
1629 ++ return NULL;
1630 ++}
1631 ++static inline int is_meta_sk(const struct sock *sk)
1632 ++{
1633 ++ return 0;
1634 ++}
1635 ++static inline int is_master_tp(const struct tcp_sock *tp)
1636 ++{
1637 ++ return 0;
1638 ++}
1639 ++static inline void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) {}
1640 ++static inline void mptcp_del_sock(const struct sock *sk) {}
1641 ++static inline void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk) {}
1642 ++static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {}
1643 ++static inline void mptcp_update_sndbuf(const struct tcp_sock *tp) {}
1644 ++static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb,
1645 ++ const struct sock *sk) {}
1646 ++static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {}
1647 ++static inline void mptcp_set_rto(const struct sock *sk) {}
1648 ++static inline void mptcp_send_fin(const struct sock *meta_sk) {}
1649 ++static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize,
1650 ++ const struct mptcp_options_received *mopt,
1651 ++ const struct sk_buff *skb) {}
1652 ++static inline void mptcp_syn_options(const struct sock *sk,
1653 ++ struct tcp_out_options *opts,
1654 ++ unsigned *remaining) {}
1655 ++static inline void mptcp_synack_options(struct request_sock *req,
1656 ++ struct tcp_out_options *opts,
1657 ++ unsigned *remaining) {}
1658 ++
1659 ++static inline void mptcp_established_options(struct sock *sk,
1660 ++ struct sk_buff *skb,
1661 ++ struct tcp_out_options *opts,
1662 ++ unsigned *size) {}
1663 ++static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
1664 ++ const struct tcp_out_options *opts,
1665 ++ struct sk_buff *skb) {}
1666 ++static inline void mptcp_close(struct sock *meta_sk, long timeout) {}
1667 ++static inline int mptcp_doit(struct sock *sk)
1668 ++{
1669 ++ return 0;
1670 ++}
1671 ++static inline int mptcp_check_req_fastopen(struct sock *child,
1672 ++ struct request_sock *req)
1673 ++{
1674 ++ return 1;
1675 ++}
1676 ++static inline int mptcp_check_req_master(const struct sock *sk,
1677 ++ const struct sock *child,
1678 ++ struct request_sock *req,
1679 ++ struct request_sock **prev)
1680 ++{
1681 ++ return 1;
1682 ++}
1683 ++static inline struct sock *mptcp_check_req_child(struct sock *sk,
1684 ++ struct sock *child,
1685 ++ struct request_sock *req,
1686 ++ struct request_sock **prev,
1687 ++ const struct mptcp_options_received *mopt)
1688 ++{
1689 ++ return NULL;
1690 ++}
1691 ++static inline unsigned int mptcp_current_mss(struct sock *meta_sk)
1692 ++{
1693 ++ return 0;
1694 ++}
1695 ++static inline int mptcp_select_size(const struct sock *meta_sk, bool sg)
1696 ++{
1697 ++ return 0;
1698 ++}
1699 ++static inline void mptcp_sub_close_passive(struct sock *sk) {}
1700 ++static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag)
1701 ++{
1702 ++ return false;
1703 ++}
1704 ++static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {}
1705 ++static inline int mptcp_check_rtt(const struct tcp_sock *tp, int time)
1706 ++{
1707 ++ return 0;
1708 ++}
1709 ++static inline int mptcp_check_snd_buf(const struct tcp_sock *tp)
1710 ++{
1711 ++ return 0;
1712 ++}
1713 ++static inline int mptcp_sysctl_syn_retries(void)
1714 ++{
1715 ++ return 0;
1716 ++}
1717 ++static inline void mptcp_send_reset(const struct sock *sk) {}
1718 ++static inline int mptcp_handle_options(struct sock *sk,
1719 ++ const struct tcphdr *th,
1720 ++ struct sk_buff *skb)
1721 ++{
1722 ++ return 0;
1723 ++}
1724 ++static inline void mptcp_reset_mopt(struct tcp_sock *tp) {}
1725 ++static inline void __init mptcp_init(void) {}
1726 ++static inline int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1727 ++{
1728 ++ return 0;
1729 ++}
1730 ++static inline bool mptcp_sk_can_gso(const struct sock *sk)
1731 ++{
1732 ++ return false;
1733 ++}
1734 ++static inline bool mptcp_can_sg(const struct sock *meta_sk)
1735 ++{
1736 ++ return false;
1737 ++}
1738 ++static inline unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk,
1739 ++ u32 mss_now, int large_allowed)
1740 ++{
1741 ++ return 0;
1742 ++}
1743 ++static inline void mptcp_destroy_sock(struct sock *sk) {}
1744 ++static inline int mptcp_rcv_synsent_state_process(struct sock *sk,
1745 ++ struct sock **skptr,
1746 ++ struct sk_buff *skb,
1747 ++ const struct mptcp_options_received *mopt)
1748 ++{
1749 ++ return 0;
1750 ++}
1751 ++static inline bool mptcp_can_sendpage(struct sock *sk)
1752 ++{
1753 ++ return false;
1754 ++}
1755 ++static inline int mptcp_init_tw_sock(struct sock *sk,
1756 ++ struct tcp_timewait_sock *tw)
1757 ++{
1758 ++ return 0;
1759 ++}
1760 ++static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {}
1761 ++static inline void mptcp_disconnect(struct sock *sk) {}
1762 ++static inline void mptcp_tsq_flags(struct sock *sk) {}
1763 ++static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {}
1764 ++static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {}
1765 ++static inline void mptcp_hash_remove(struct tcp_sock *meta_tp) {}
1766 ++static inline void mptcp_reqsk_new_mptcp(struct request_sock *req,
1767 ++ const struct tcp_options_received *rx_opt,
1768 ++ const struct mptcp_options_received *mopt,
1769 ++ const struct sk_buff *skb) {}
1770 ++static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
1771 ++ const struct sk_buff *skb) {}
1772 ++static inline void mptcp_delete_synack_timer(struct sock *meta_sk) {}
1773 ++#endif /* CONFIG_MPTCP */
1774 ++
1775 ++#endif /* _MPTCP_H */
1776 +diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h
1777 +new file mode 100644
1778 +index 000000000000..93ad97c77c5a
1779 +--- /dev/null
1780 ++++ b/include/net/mptcp_v4.h
1781 +@@ -0,0 +1,67 @@
1782 ++/*
1783 ++ * MPTCP implementation
1784 ++ *
1785 ++ * Initial Design & Implementation:
1786 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
1787 ++ *
1788 ++ * Current Maintainer & Author:
1789 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
1790 ++ *
1791 ++ * Additional authors:
1792 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1793 ++ * Gregory Detal <gregory.detal@×××××××××.be>
1794 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
1795 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
1796 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
1797 ++ * Andreas Ripke <ripke@××××××.eu>
1798 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
1799 ++ * Octavian Purdila <octavian.purdila@×××××.com>
1800 ++ * John Ronan <jronan@××××.org>
1801 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
1802 ++ * Brandon Heller <brandonh@××××××××.edu>
1803 ++ *
1804 ++ *
1805 ++ * This program is free software; you can redistribute it and/or
1806 ++ * modify it under the terms of the GNU General Public License
1807 ++ * as published by the Free Software Foundation; either version
1808 ++ * 2 of the License, or (at your option) any later version.
1809 ++ */
1810 ++
1811 ++#ifndef MPTCP_V4_H_
1812 ++#define MPTCP_V4_H_
1813 ++
1814 ++
1815 ++#include <linux/in.h>
1816 ++#include <linux/skbuff.h>
1817 ++#include <net/mptcp.h>
1818 ++#include <net/request_sock.h>
1819 ++#include <net/sock.h>
1820 ++
1821 ++extern struct request_sock_ops mptcp_request_sock_ops;
1822 ++extern const struct inet_connection_sock_af_ops mptcp_v4_specific;
1823 ++extern struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
1824 ++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
1825 ++
1826 ++#ifdef CONFIG_MPTCP
1827 ++
1828 ++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
1829 ++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
1830 ++ const __be32 laddr, const struct net *net);
1831 ++int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
1832 ++ struct mptcp_rem4 *rem);
1833 ++int mptcp_pm_v4_init(void);
1834 ++void mptcp_pm_v4_undo(void);
1835 ++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
1836 ++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
1837 ++
1838 ++#else
1839 ++
1840 ++static inline int mptcp_v4_do_rcv(const struct sock *meta_sk,
1841 ++ const struct sk_buff *skb)
1842 ++{
1843 ++ return 0;
1844 ++}
1845 ++
1846 ++#endif /* CONFIG_MPTCP */
1847 ++
1848 ++#endif /* MPTCP_V4_H_ */
1849 +diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h
1850 +new file mode 100644
1851 +index 000000000000..49a4f30ccd4d
1852 +--- /dev/null
1853 ++++ b/include/net/mptcp_v6.h
1854 +@@ -0,0 +1,69 @@
1855 ++/*
1856 ++ * MPTCP implementation
1857 ++ *
1858 ++ * Initial Design & Implementation:
1859 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
1860 ++ *
1861 ++ * Current Maintainer & Author:
1862 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1863 ++ *
1864 ++ * Additional authors:
1865 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1866 ++ * Gregory Detal <gregory.detal@×××××××××.be>
1867 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
1868 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
1869 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
1870 ++ * Andreas Ripke <ripke@××××××.eu>
1871 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
1872 ++ * Octavian Purdila <octavian.purdila@×××××.com>
1873 ++ * John Ronan <jronan@××××.org>
1874 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
1875 ++ * Brandon Heller <brandonh@××××××××.edu>
1876 ++ *
1877 ++ *
1878 ++ * This program is free software; you can redistribute it and/or
1879 ++ * modify it under the terms of the GNU General Public License
1880 ++ * as published by the Free Software Foundation; either version
1881 ++ * 2 of the License, or (at your option) any later version.
1882 ++ */
1883 ++
1884 ++#ifndef _MPTCP_V6_H
1885 ++#define _MPTCP_V6_H
1886 ++
1887 ++#include <linux/in6.h>
1888 ++#include <net/if_inet6.h>
1889 ++
1890 ++#include <net/mptcp.h>
1891 ++
1892 ++
1893 ++#ifdef CONFIG_MPTCP
1894 ++extern const struct inet_connection_sock_af_ops mptcp_v6_mapped;
1895 ++extern const struct inet_connection_sock_af_ops mptcp_v6_specific;
1896 ++extern struct request_sock_ops mptcp6_request_sock_ops;
1897 ++extern struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
1898 ++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
1899 ++
1900 ++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
1901 ++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
1902 ++ const struct in6_addr *laddr, const struct net *net);
1903 ++int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
1904 ++ struct mptcp_rem6 *rem);
1905 ++int mptcp_pm_v6_init(void);
1906 ++void mptcp_pm_v6_undo(void);
1907 ++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
1908 ++ __be16 sport, __be16 dport);
1909 ++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
1910 ++ __be16 sport, __be16 dport);
1911 ++
1912 ++#else /* CONFIG_MPTCP */
1913 ++
1914 ++#define mptcp_v6_mapped ipv6_mapped
1915 ++
1916 ++static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
1917 ++{
1918 ++ return 0;
1919 ++}
1920 ++
1921 ++#endif /* CONFIG_MPTCP */
1922 ++
1923 ++#endif /* _MPTCP_V6_H */
1924 +diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
1925 +index 361d26077196..bae95a11c531 100644
1926 +--- a/include/net/net_namespace.h
1927 ++++ b/include/net/net_namespace.h
1928 +@@ -16,6 +16,7 @@
1929 + #include <net/netns/packet.h>
1930 + #include <net/netns/ipv4.h>
1931 + #include <net/netns/ipv6.h>
1932 ++#include <net/netns/mptcp.h>
1933 + #include <net/netns/ieee802154_6lowpan.h>
1934 + #include <net/netns/sctp.h>
1935 + #include <net/netns/dccp.h>
1936 +@@ -92,6 +93,9 @@ struct net {
1937 + #if IS_ENABLED(CONFIG_IPV6)
1938 + struct netns_ipv6 ipv6;
1939 + #endif
1940 ++#if IS_ENABLED(CONFIG_MPTCP)
1941 ++ struct netns_mptcp mptcp;
1942 ++#endif
1943 + #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
1944 + struct netns_ieee802154_lowpan ieee802154_lowpan;
1945 + #endif
1946 +diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h
1947 +new file mode 100644
1948 +index 000000000000..bad418b04cc8
1949 +--- /dev/null
1950 ++++ b/include/net/netns/mptcp.h
1951 +@@ -0,0 +1,44 @@
1952 ++/*
1953 ++ * MPTCP implementation - MPTCP namespace
1954 ++ *
1955 ++ * Initial Design & Implementation:
1956 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
1957 ++ *
1958 ++ * Current Maintainer:
1959 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
1960 ++ *
1961 ++ * Additional authors:
1962 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1963 ++ * Gregory Detal <gregory.detal@×××××××××.be>
1964 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
1965 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
1966 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
1967 ++ * Andreas Ripke <ripke@××××××.eu>
1968 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
1969 ++ * Octavian Purdila <octavian.purdila@×××××.com>
1970 ++ * John Ronan <jronan@××××.org>
1971 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
1972 ++ * Brandon Heller <brandonh@××××××××.edu>
1973 ++ *
1974 ++ *
1975 ++ * This program is free software; you can redistribute it and/or
1976 ++ * modify it under the terms of the GNU General Public License
1977 ++ * as published by the Free Software Foundation; either version
1978 ++ * 2 of the License, or (at your option) any later version.
1979 ++ */
1980 ++
1981 ++#ifndef __NETNS_MPTCP_H__
1982 ++#define __NETNS_MPTCP_H__
1983 ++
1984 ++#include <linux/compiler.h>
1985 ++
1986 ++enum {
1987 ++ MPTCP_PM_FULLMESH = 0,
1988 ++ MPTCP_PM_MAX
1989 ++};
1990 ++
1991 ++struct netns_mptcp {
1992 ++ void *path_managers[MPTCP_PM_MAX];
1993 ++};
1994 ++
1995 ++#endif /* __NETNS_MPTCP_H__ */
1996 +diff --git a/include/net/request_sock.h b/include/net/request_sock.h
1997 +index 7f830ff67f08..e79e87a8e1a6 100644
1998 +--- a/include/net/request_sock.h
1999 ++++ b/include/net/request_sock.h
2000 +@@ -164,7 +164,7 @@ struct request_sock_queue {
2001 + };
2002 +
2003 + int reqsk_queue_alloc(struct request_sock_queue *queue,
2004 +- unsigned int nr_table_entries);
2005 ++ unsigned int nr_table_entries, gfp_t flags);
2006 +
2007 + void __reqsk_queue_destroy(struct request_sock_queue *queue);
2008 + void reqsk_queue_destroy(struct request_sock_queue *queue);
2009 +diff --git a/include/net/sock.h b/include/net/sock.h
2010 +index 156350745700..0e23cae8861f 100644
2011 +--- a/include/net/sock.h
2012 ++++ b/include/net/sock.h
2013 +@@ -901,6 +901,16 @@ void sk_clear_memalloc(struct sock *sk);
2014 +
2015 + int sk_wait_data(struct sock *sk, long *timeo);
2016 +
2017 ++/* START - needed for MPTCP */
2018 ++struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family);
2019 ++void sock_lock_init(struct sock *sk);
2020 ++
2021 ++extern struct lock_class_key af_callback_keys[AF_MAX];
2022 ++extern char *const af_family_clock_key_strings[AF_MAX+1];
2023 ++
2024 ++#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
2025 ++/* END - needed for MPTCP */
2026 ++
2027 + struct request_sock_ops;
2028 + struct timewait_sock_ops;
2029 + struct inet_hashinfo;
2030 +diff --git a/include/net/tcp.h b/include/net/tcp.h
2031 +index 7286db80e8b8..ff92e74cd684 100644
2032 +--- a/include/net/tcp.h
2033 ++++ b/include/net/tcp.h
2034 +@@ -177,6 +177,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
2035 + #define TCPOPT_SACK 5 /* SACK Block */
2036 + #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
2037 + #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
2038 ++#define TCPOPT_MPTCP 30
2039 + #define TCPOPT_EXP 254 /* Experimental */
2040 + /* Magic number to be after the option value for sharing TCP
2041 + * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
2042 +@@ -229,6 +230,27 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
2043 + #define TFO_SERVER_WO_SOCKOPT1 0x400
2044 + #define TFO_SERVER_WO_SOCKOPT2 0x800
2045 +
2046 ++/* Flags from tcp_input.c for tcp_ack */
2047 ++#define FLAG_DATA 0x01 /* Incoming frame contained data. */
2048 ++#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
2049 ++#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
2050 ++#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
2051 ++#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
2052 ++#define FLAG_DATA_SACKED 0x20 /* New SACK. */
2053 ++#define FLAG_ECE 0x40 /* ECE in this ACK */
2054 ++#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
2055 ++#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
2056 ++#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
2057 ++#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
2058 ++#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
2059 ++#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
2060 ++#define MPTCP_FLAG_DATA_ACKED 0x8000
2061 ++
2062 ++#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
2063 ++#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
2064 ++#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
2065 ++#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
2066 ++
2067 + extern struct inet_timewait_death_row tcp_death_row;
2068 +
2069 + /* sysctl variables for tcp */
2070 +@@ -344,6 +366,107 @@ extern struct proto tcp_prot;
2071 + #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
2072 + #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
2073 +
2074 ++/**** START - Exports needed for MPTCP ****/
2075 ++extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
2076 ++extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
2077 ++
2078 ++struct mptcp_options_received;
2079 ++
2080 ++void tcp_enter_quickack_mode(struct sock *sk);
2081 ++int tcp_close_state(struct sock *sk);
2082 ++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
2083 ++ const struct sk_buff *skb);
2084 ++int tcp_xmit_probe_skb(struct sock *sk, int urgent);
2085 ++void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb);
2086 ++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
2087 ++ gfp_t gfp_mask);
2088 ++unsigned int tcp_mss_split_point(const struct sock *sk,
2089 ++ const struct sk_buff *skb,
2090 ++ unsigned int mss_now,
2091 ++ unsigned int max_segs,
2092 ++ int nonagle);
2093 ++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
2094 ++ unsigned int cur_mss, int nonagle);
2095 ++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
2096 ++ unsigned int cur_mss);
2097 ++unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb);
2098 ++int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
2099 ++ unsigned int mss_now);
2100 ++void __pskb_trim_head(struct sk_buff *skb, int len);
2101 ++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
2102 ++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
2103 ++void tcp_reset(struct sock *sk);
2104 ++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
2105 ++ const u32 ack_seq, const u32 nwin);
2106 ++bool tcp_urg_mode(const struct tcp_sock *tp);
2107 ++void tcp_ack_probe(struct sock *sk);
2108 ++void tcp_rearm_rto(struct sock *sk);
2109 ++int tcp_write_timeout(struct sock *sk);
2110 ++bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
2111 ++ unsigned int timeout, bool syn_set);
2112 ++void tcp_write_err(struct sock *sk);
2113 ++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr);
2114 ++void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
2115 ++ unsigned int mss_now);
2116 ++
2117 ++int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req);
2118 ++void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
2119 ++ struct request_sock *req);
2120 ++__u32 tcp_v4_init_sequence(const struct sk_buff *skb);
2121 ++int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
2122 ++ struct flowi *fl,
2123 ++ struct request_sock *req,
2124 ++ u16 queue_mapping,
2125 ++ struct tcp_fastopen_cookie *foc);
2126 ++void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb);
2127 ++struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb);
2128 ++struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb);
2129 ++void tcp_v4_reqsk_destructor(struct request_sock *req);
2130 ++
2131 ++int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req);
2132 ++void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
2133 ++ struct request_sock *req);
2134 ++__u32 tcp_v6_init_sequence(const struct sk_buff *skb);
2135 ++int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
2136 ++ struct flowi *fl, struct request_sock *req,
2137 ++ u16 queue_mapping, struct tcp_fastopen_cookie *foc);
2138 ++void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
2139 ++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
2140 ++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
2141 ++void tcp_v6_destroy_sock(struct sock *sk);
2142 ++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
2143 ++void tcp_v6_hash(struct sock *sk);
2144 ++struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb);
2144 ++struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb);
2145 ++struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
2146 ++ struct request_sock *req,
2147 ++ struct dst_entry *dst);
2148 ++void tcp_v6_reqsk_destructor(struct request_sock *req);
2149 ++
2150 ++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
2151 ++ int large_allowed);
2152 ++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb);
2153 ++
2154 ++void skb_clone_fraglist(struct sk_buff *skb);
2155 ++void copy_skb_header(struct sk_buff *new, const struct sk_buff *old);
2156 ++
2157 ++void inet_twsk_free(struct inet_timewait_sock *tw);
2158 ++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb);
2159 ++/* These states need RST on ABORT according to RFC793 */
2160 ++static inline bool tcp_need_reset(int state)
2161 ++{
2162 ++ return (1 << state) &
2163 ++ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2164 ++ TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2165 ++}
2166 ++
2167 ++bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
2168 ++ int hlen);
2169 ++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
2170 ++ bool *fragstolen);
2171 ++bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to,
2172 ++ struct sk_buff *from, bool *fragstolen);
2173 ++/**** END - Exports needed for MPTCP ****/
2174 ++
2175 + void tcp_tasklet_init(void);
2176 +
2177 + void tcp_v4_err(struct sk_buff *skb, u32);
2178 +@@ -440,6 +563,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
2179 + size_t len, int nonblock, int flags, int *addr_len);
2180 + void tcp_parse_options(const struct sk_buff *skb,
2181 + struct tcp_options_received *opt_rx,
2182 ++ struct mptcp_options_received *mopt_rx,
2183 + int estab, struct tcp_fastopen_cookie *foc);
2184 + const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
2185 +
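Aside (not part of the patch): the extra mptcp_options_received argument threaded through tcp_parse_options() above is where MPTCP option data ends up. The corresponding tcp_input.c hunk is not shown in this excerpt; purely as an illustration of the idea, and with a helper name and pointer convention that are assumptions, a parser loop could dispatch kind-30 options like this:

#include <net/mptcp.h>
#include <net/tcp.h>

/* Illustrative only: walk a TCP option block and hand TCPOPT_MPTCP
 * (kind 30) options to the mptcp_parse_options() helper declared
 * (and stubbed for !CONFIG_MPTCP) in net/mptcp.h. The real parser's
 * pointer convention may differ. */
static void mptcp_scan_options_sketch(const unsigned char *opt, int optlen,
                                      struct mptcp_options_received *mopt,
                                      const struct sk_buff *skb)
{
        while (optlen > 1) {
                int kind = opt[0];
                int size = opt[1];

                if (kind == TCPOPT_EOL)
                        break;
                if (kind == TCPOPT_NOP) {
                        opt++;
                        optlen--;
                        continue;
                }
                if (size < 2 || size > optlen)
                        break;
                if (kind == TCPOPT_MPTCP)
                        mptcp_parse_options(opt, size, mopt, skb);
                opt += size;
                optlen -= size;
        }
}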
2186 +@@ -493,14 +617,8 @@ static inline u32 tcp_cookie_time(void)
2187 +
2188 + u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
2189 + u16 *mssp);
2190 +-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss);
2191 +-#else
2192 +-static inline __u32 cookie_v4_init_sequence(struct sock *sk,
2193 +- struct sk_buff *skb,
2194 +- __u16 *mss)
2195 +-{
2196 +- return 0;
2197 +-}
2198 ++__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
2199 ++ __u16 *mss);
2200 + #endif
2201 +
2202 + __u32 cookie_init_timestamp(struct request_sock *req);
2203 +@@ -516,13 +634,6 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
2204 + const struct tcphdr *th, u16 *mssp);
2205 + __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb,
2206 + __u16 *mss);
2207 +-#else
2208 +-static inline __u32 cookie_v6_init_sequence(struct sock *sk,
2209 +- struct sk_buff *skb,
2210 +- __u16 *mss)
2211 +-{
2212 +- return 0;
2213 +-}
2214 + #endif
2215 + /* tcp_output.c */
2216 +
2217 +@@ -551,10 +662,17 @@ void tcp_send_delayed_ack(struct sock *sk);
2218 + void tcp_send_loss_probe(struct sock *sk);
2219 + bool tcp_schedule_loss_probe(struct sock *sk);
2220 +
2221 ++u16 tcp_select_window(struct sock *sk);
2222 ++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2223 ++ int push_one, gfp_t gfp);
2224 ++
2225 + /* tcp_input.c */
2226 + void tcp_resume_early_retransmit(struct sock *sk);
2227 + void tcp_rearm_rto(struct sock *sk);
2228 + void tcp_reset(struct sock *sk);
2229 ++void tcp_set_rto(struct sock *sk);
2230 ++bool tcp_should_expand_sndbuf(const struct sock *sk);
2231 ++bool tcp_prune_ofo_queue(struct sock *sk);
2232 +
2233 + /* tcp_timer.c */
2234 + void tcp_init_xmit_timers(struct sock *);
2235 +@@ -703,14 +821,27 @@ void tcp_send_window_probe(struct sock *sk);
2236 + */
2237 + struct tcp_skb_cb {
2238 + union {
2239 +- struct inet_skb_parm h4;
2240 ++ union {
2241 ++ struct inet_skb_parm h4;
2242 + #if IS_ENABLED(CONFIG_IPV6)
2243 +- struct inet6_skb_parm h6;
2244 ++ struct inet6_skb_parm h6;
2245 + #endif
2246 +- } header; /* For incoming frames */
2247 ++ } header; /* For incoming frames */
2248 ++#ifdef CONFIG_MPTCP
2249 ++ union { /* For MPTCP outgoing frames */
2250 ++ __u32 path_mask; /* paths that tried to send this skb */
2251 ++ __u32 dss[6]; /* DSS options */
2252 ++ };
2253 ++#endif
2254 ++ };
2255 + __u32 seq; /* Starting sequence number */
2256 + __u32 end_seq; /* SEQ + FIN + SYN + datalen */
2257 + __u32 when; /* used to compute rtt's */
2258 ++#ifdef CONFIG_MPTCP
2259 ++ __u8 mptcp_flags; /* flags for the MPTCP layer */
2260 ++ __u8 dss_off; /* Number of 4-byte words until
2261 ++ * seq-number */
2262 ++#endif
2263 + __u8 tcp_flags; /* TCP header flags. (tcp[13]) */
2264 +
2265 + __u8 sacked; /* State flags for SACK/FACK. */
2266 +@@ -1075,7 +1206,8 @@ u32 tcp_default_init_rwnd(u32 mss);
2267 + /* Determine a window scaling and initial window to offer. */
2268 + void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
2269 + __u32 *window_clamp, int wscale_ok,
2270 +- __u8 *rcv_wscale, __u32 init_rcv_wnd);
2271 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
2272 ++ const struct sock *sk);
2273 +
2274 + static inline int tcp_win_from_space(int space)
2275 + {
2276 +@@ -1084,15 +1216,34 @@ static inline int tcp_win_from_space(int space)
2277 + space - (space>>sysctl_tcp_adv_win_scale);
2278 + }
2279 +
2280 ++#ifdef CONFIG_MPTCP
2281 ++extern struct static_key mptcp_static_key;
2282 ++static inline bool mptcp(const struct tcp_sock *tp)
2283 ++{
2284 ++ return static_key_false(&mptcp_static_key) && tp->mpc;
2285 ++}
2286 ++#else
2287 ++static inline bool mptcp(const struct tcp_sock *tp)
2288 ++{
2289 ++ return 0;
2290 ++}
2291 ++#endif
2292 ++
2293 + /* Note: caller must be prepared to deal with negative returns */
2294 + static inline int tcp_space(const struct sock *sk)
2295 + {
2296 ++ if (mptcp(tcp_sk(sk)))
2297 ++ sk = tcp_sk(sk)->meta_sk;
2298 ++
2299 + return tcp_win_from_space(sk->sk_rcvbuf -
2300 + atomic_read(&sk->sk_rmem_alloc));
2301 + }
2302 +
2303 + static inline int tcp_full_space(const struct sock *sk)
2304 + {
2305 ++ if (mptcp(tcp_sk(sk)))
2306 ++ sk = tcp_sk(sk)->meta_sk;
2307 ++
2308 + return tcp_win_from_space(sk->sk_rcvbuf);
2309 + }
2310 +
2311 +@@ -1115,6 +1266,8 @@ static inline void tcp_openreq_init(struct request_sock *req,
2312 + ireq->wscale_ok = rx_opt->wscale_ok;
2313 + ireq->acked = 0;
2314 + ireq->ecn_ok = 0;
2315 ++ ireq->mptcp_rqsk = 0;
2316 ++ ireq->saw_mpc = 0;
2317 + ireq->ir_rmt_port = tcp_hdr(skb)->source;
2318 + ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
2319 + }
2320 +@@ -1585,6 +1738,11 @@ int tcp4_proc_init(void);
2321 + void tcp4_proc_exit(void);
2322 + #endif
2323 +
2324 ++int tcp_rtx_synack(struct sock *sk, struct request_sock *req);
2325 ++int tcp_conn_request(struct request_sock_ops *rsk_ops,
2326 ++ const struct tcp_request_sock_ops *af_ops,
2327 ++ struct sock *sk, struct sk_buff *skb);
2328 ++
2329 + /* TCP af-specific functions */
2330 + struct tcp_sock_af_ops {
2331 + #ifdef CONFIG_TCP_MD5SIG
2332 +@@ -1601,7 +1759,32 @@ struct tcp_sock_af_ops {
2333 + #endif
2334 + };
2335 +
2336 ++/* TCP/MPTCP-specific functions */
2337 ++struct tcp_sock_ops {
2338 ++ u32 (*__select_window)(struct sock *sk);
2339 ++ u16 (*select_window)(struct sock *sk);
2340 ++ void (*select_initial_window)(int __space, __u32 mss, __u32 *rcv_wnd,
2341 ++ __u32 *window_clamp, int wscale_ok,
2342 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
2343 ++ const struct sock *sk);
2344 ++ void (*init_buffer_space)(struct sock *sk);
2345 ++ void (*set_rto)(struct sock *sk);
2346 ++ bool (*should_expand_sndbuf)(const struct sock *sk);
2347 ++ void (*send_fin)(struct sock *sk);
2348 ++ bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle,
2349 ++ int push_one, gfp_t gfp);
2350 ++ void (*send_active_reset)(struct sock *sk, gfp_t priority);
2351 ++ int (*write_wakeup)(struct sock *sk);
2352 ++ bool (*prune_ofo_queue)(struct sock *sk);
2353 ++ void (*retransmit_timer)(struct sock *sk);
2354 ++ void (*time_wait)(struct sock *sk, int state, int timeo);
2355 ++ void (*cleanup_rbuf)(struct sock *sk, int copied);
2356 ++ void (*init_congestion_control)(struct sock *sk);
2357 ++};
2358 ++extern const struct tcp_sock_ops tcp_specific;
2359 ++
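Aside (not part of the patch): tcp_sock_ops above is the patch's central indirection point. Plain TCP sockets get the tcp_specific table (defined in the net/ipv4/tcp.c hunk further down), while an MPTCP meta-socket can install its own table, so shared code calls through tp->ops instead of naming the TCP function directly. A short sketch of that pattern, with an illustrative helper name:

#include <net/tcp.h>

/* Sketch of the dispatch the later tcp.c hunks switch to: callers stop
 * hard-coding tcp_cleanup_rbuf()/tcp_send_fin() and go through the
 * per-socket ops table (tcp_cleanup_rbuf/tcp_send_fin for plain TCP,
 * MPTCP-aware variants on a meta-socket). */
static void close_write_side_sketch(struct sock *sk, int copied)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tp->ops->cleanup_rbuf(sk, copied);
        if (tcp_close_state(sk))
                tp->ops->send_fin(sk);
}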
2360 + struct tcp_request_sock_ops {
2361 ++ u16 mss_clamp;
2362 + #ifdef CONFIG_TCP_MD5SIG
2363 + struct tcp_md5sig_key *(*md5_lookup) (struct sock *sk,
2364 + struct request_sock *req);
2365 +@@ -1611,8 +1794,39 @@ struct tcp_request_sock_ops {
2366 + const struct request_sock *req,
2367 + const struct sk_buff *skb);
2368 + #endif
2369 ++ int (*init_req)(struct request_sock *req, struct sock *sk,
2370 ++ struct sk_buff *skb);
2371 ++#ifdef CONFIG_SYN_COOKIES
2372 ++ __u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb,
2373 ++ __u16 *mss);
2374 ++#endif
2375 ++ struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl,
2376 ++ const struct request_sock *req,
2377 ++ bool *strict);
2378 ++ __u32 (*init_seq)(const struct sk_buff *skb);
2379 ++ int (*send_synack)(struct sock *sk, struct dst_entry *dst,
2380 ++ struct flowi *fl, struct request_sock *req,
2381 ++ u16 queue_mapping, struct tcp_fastopen_cookie *foc);
2382 ++ void (*queue_hash_add)(struct sock *sk, struct request_sock *req,
2383 ++ const unsigned long timeout);
2384 + };
2385 +
2386 ++#ifdef CONFIG_SYN_COOKIES
2387 ++static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
2388 ++ struct sock *sk, struct sk_buff *skb,
2389 ++ __u16 *mss)
2390 ++{
2391 ++ return ops->cookie_init_seq(sk, skb, mss);
2392 ++}
2393 ++#else
2394 ++static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
2395 ++ struct sock *sk, struct sk_buff *skb,
2396 ++ __u16 *mss)
2397 ++{
2398 ++ return 0;
2399 ++}
2400 ++#endif
2401 ++
2402 + int tcpv4_offload_init(void);
2403 +
2404 + void tcp_v4_init(void);
2405 +diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
2406 +index 9cf2394f0bcf..c2634b6ed854 100644
2407 +--- a/include/uapi/linux/if.h
2408 ++++ b/include/uapi/linux/if.h
2409 +@@ -109,6 +109,9 @@ enum net_device_flags {
2410 + #define IFF_DORMANT IFF_DORMANT
2411 + #define IFF_ECHO IFF_ECHO
2412 +
2413 ++#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */
2414 ++#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */
2415 ++
2416 + #define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
2417 + IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
2418 +
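Aside (not part of the patch): the two new interface flags are per-device knobs, IFF_NOMULTIPATH to keep a device out of MPTCP entirely and IFF_MPBACKUP to use it only as a backup path. A hedged sketch of how a path manager might consult them; the helper below is illustrative, not from the patch:

#include <linux/netdevice.h>

/* Illustrative helper: decide whether a device may carry MPTCP
 * subflows, and whether it should be treated as backup-only. */
static bool mptcp_dev_usable(const struct net_device *dev, bool *backup)
{
        if (!(dev->flags & IFF_UP) || (dev->flags & IFF_NOMULTIPATH))
                return false;           /* administratively excluded */

        *backup = !!(dev->flags & IFF_MPBACKUP);
        return true;
}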
2419 +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
2420 +index 3b9718328d8b..487475681d84 100644
2421 +--- a/include/uapi/linux/tcp.h
2422 ++++ b/include/uapi/linux/tcp.h
2423 +@@ -112,6 +112,7 @@ enum {
2424 + #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
2425 + #define TCP_TIMESTAMP 24
2426 + #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */
2427 ++#define MPTCP_ENABLED 26
2428 +
2429 + struct tcp_repair_opt {
2430 + __u32 opt_code;
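Aside (not part of the patch): MPTCP_ENABLED is the new per-socket switch, and the do_tcp_setsockopt() hunk near the end of this excerpt only accepts it while the socket is still in TCP_CLOSE or TCP_LISTEN. A minimal user-space sketch of flipping it, defining the value 26 locally because stock libc headers do not know the name:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

#ifndef MPTCP_ENABLED
#define MPTCP_ENABLED 26        /* from the patched include/uapi/linux/tcp.h */
#endif

int main(void)
{
        int one = 1;
        int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

        if (fd < 0)
                return 1;
        /* Must happen before connect()/listen(); once the connection is
         * established the kernel rejects the option with EPERM. */
        if (setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &one, sizeof(one)))
                perror("setsockopt(MPTCP_ENABLED)");
        return 0;
}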
2431 +diff --git a/net/Kconfig b/net/Kconfig
2432 +index d92afe4204d9..96b58593ad5e 100644
2433 +--- a/net/Kconfig
2434 ++++ b/net/Kconfig
2435 +@@ -79,6 +79,7 @@ if INET
2436 + source "net/ipv4/Kconfig"
2437 + source "net/ipv6/Kconfig"
2438 + source "net/netlabel/Kconfig"
2439 ++source "net/mptcp/Kconfig"
2440 +
2441 + endif # if INET
2442 +
2443 +diff --git a/net/Makefile b/net/Makefile
2444 +index cbbbe6d657ca..244bac1435b1 100644
2445 +--- a/net/Makefile
2446 ++++ b/net/Makefile
2447 +@@ -20,6 +20,7 @@ obj-$(CONFIG_INET) += ipv4/
2448 + obj-$(CONFIG_XFRM) += xfrm/
2449 + obj-$(CONFIG_UNIX) += unix/
2450 + obj-$(CONFIG_NET) += ipv6/
2451 ++obj-$(CONFIG_MPTCP) += mptcp/
2452 + obj-$(CONFIG_PACKET) += packet/
2453 + obj-$(CONFIG_NET_KEY) += key/
2454 + obj-$(CONFIG_BRIDGE) += bridge/
2455 +diff --git a/net/core/dev.c b/net/core/dev.c
2456 +index 367a586d0c8a..215d2757fbf6 100644
2457 +--- a/net/core/dev.c
2458 ++++ b/net/core/dev.c
2459 +@@ -5420,7 +5420,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
2460 +
2461 + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2462 + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2463 +- IFF_AUTOMEDIA)) |
2464 ++ IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) |
2465 + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2466 + IFF_ALLMULTI));
2467 +
2468 +diff --git a/net/core/request_sock.c b/net/core/request_sock.c
2469 +index 467f326126e0..909dfa13f499 100644
2470 +--- a/net/core/request_sock.c
2471 ++++ b/net/core/request_sock.c
2472 +@@ -38,7 +38,8 @@ int sysctl_max_syn_backlog = 256;
2473 + EXPORT_SYMBOL(sysctl_max_syn_backlog);
2474 +
2475 + int reqsk_queue_alloc(struct request_sock_queue *queue,
2476 +- unsigned int nr_table_entries)
2477 ++ unsigned int nr_table_entries,
2478 ++ gfp_t flags)
2479 + {
2480 + size_t lopt_size = sizeof(struct listen_sock);
2481 + struct listen_sock *lopt;
2482 +@@ -48,9 +49,11 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
2483 + nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
2484 + lopt_size += nr_table_entries * sizeof(struct request_sock *);
2485 + if (lopt_size > PAGE_SIZE)
2486 +- lopt = vzalloc(lopt_size);
2487 ++ lopt = __vmalloc(lopt_size,
2488 ++ flags | __GFP_HIGHMEM | __GFP_ZERO,
2489 ++ PAGE_KERNEL);
2490 + else
2491 +- lopt = kzalloc(lopt_size, GFP_KERNEL);
2492 ++ lopt = kzalloc(lopt_size, flags);
2493 + if (lopt == NULL)
2494 + return -ENOMEM;
2495 +
2496 +diff --git a/net/core/skbuff.c b/net/core/skbuff.c
2497 +index c1a33033cbe2..8abc5d60fbe3 100644
2498 +--- a/net/core/skbuff.c
2499 ++++ b/net/core/skbuff.c
2500 +@@ -472,7 +472,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb)
2501 + skb_drop_list(&skb_shinfo(skb)->frag_list);
2502 + }
2503 +
2504 +-static void skb_clone_fraglist(struct sk_buff *skb)
2505 ++void skb_clone_fraglist(struct sk_buff *skb)
2506 + {
2507 + struct sk_buff *list;
2508 +
2509 +@@ -897,7 +897,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
2510 + skb->inner_mac_header += off;
2511 + }
2512 +
2513 +-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
2514 ++void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
2515 + {
2516 + __copy_skb_header(new, old);
2517 +
2518 +diff --git a/net/core/sock.c b/net/core/sock.c
2519 +index 026e01f70274..359295523177 100644
2520 +--- a/net/core/sock.c
2521 ++++ b/net/core/sock.c
2522 +@@ -136,6 +136,11 @@
2523 +
2524 + #include <trace/events/sock.h>
2525 +
2526 ++#ifdef CONFIG_MPTCP
2527 ++#include <net/mptcp.h>
2528 ++#include <net/inet_common.h>
2529 ++#endif
2530 ++
2531 + #ifdef CONFIG_INET
2532 + #include <net/tcp.h>
2533 + #endif
2534 +@@ -280,7 +285,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
2535 + "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
2536 + "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
2537 + };
2538 +-static const char *const af_family_clock_key_strings[AF_MAX+1] = {
2539 ++char *const af_family_clock_key_strings[AF_MAX+1] = {
2540 + "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
2541 + "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
2542 + "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
2543 +@@ -301,7 +306,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
2544 + * sk_callback_lock locking rules are per-address-family,
2545 + * so split the lock classes by using a per-AF key:
2546 + */
2547 +-static struct lock_class_key af_callback_keys[AF_MAX];
2548 ++struct lock_class_key af_callback_keys[AF_MAX];
2549 +
2550 + /* Take into consideration the size of the struct sk_buff overhead in the
2551 + * determination of these values, since that is non-constant across
2552 +@@ -422,8 +427,6 @@ static void sock_warn_obsolete_bsdism(const char *name)
2553 + }
2554 + }
2555 +
2556 +-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
2557 +-
2558 + static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
2559 + {
2560 + if (sk->sk_flags & flags) {
2561 +@@ -1253,8 +1256,25 @@ lenout:
2562 + *
2563 + * (We also register the sk_lock with the lock validator.)
2564 + */
2565 +-static inline void sock_lock_init(struct sock *sk)
2566 +-{
2567 ++void sock_lock_init(struct sock *sk)
2568 ++{
2569 ++#ifdef CONFIG_MPTCP
2570 ++ /* Reclassify the lock-class for subflows */
2571 ++ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
2572 ++ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->is_master_sk) {
2573 ++ sock_lock_init_class_and_name(sk, "slock-AF_INET-MPTCP",
2574 ++ &meta_slock_key,
2575 ++ "sk_lock-AF_INET-MPTCP",
2576 ++ &meta_key);
2577 ++
2578 ++ /* We don't yet have the MPTCP control block attached,
2579 ++ * so we still need inet_sock_destruct as the destructor.
2580 ++ */
2581 ++ sk->sk_destruct = inet_sock_destruct;
2582 ++ return;
2583 ++ }
2584 ++#endif
2585 ++
2586 + sock_lock_init_class_and_name(sk,
2587 + af_family_slock_key_strings[sk->sk_family],
2588 + af_family_slock_keys + sk->sk_family,
2589 +@@ -1301,7 +1321,7 @@ void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
2590 + }
2591 + EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
2592 +
2593 +-static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2594 ++struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2595 + int family)
2596 + {
2597 + struct sock *sk;
2598 +diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
2599 +index 4db3c2a1679c..04cb17d4b0ce 100644
2600 +--- a/net/dccp/ipv6.c
2601 ++++ b/net/dccp/ipv6.c
2602 +@@ -386,7 +386,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
2603 + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
2604 + goto drop;
2605 +
2606 +- req = inet6_reqsk_alloc(&dccp6_request_sock_ops);
2607 ++ req = inet_reqsk_alloc(&dccp6_request_sock_ops);
2608 + if (req == NULL)
2609 + goto drop;
2610 +
2611 +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
2612 +index 05c57f0fcabe..630434db0085 100644
2613 +--- a/net/ipv4/Kconfig
2614 ++++ b/net/ipv4/Kconfig
2615 +@@ -556,6 +556,30 @@ config TCP_CONG_ILLINOIS
2616 + For further details see:
2617 + http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
2618 +
2619 ++config TCP_CONG_COUPLED
2620 ++ tristate "MPTCP COUPLED CONGESTION CONTROL"
2621 ++ depends on MPTCP
2622 ++ default n
2623 ++ ---help---
2624 ++ MultiPath TCP Coupled Congestion Control
2625 ++ To enable it, just put 'coupled' in tcp_congestion_control
2626 ++
2627 ++config TCP_CONG_OLIA
2628 ++ tristate "MPTCP Opportunistic Linked Increase"
2629 ++ depends on MPTCP
2630 ++ default n
2631 ++ ---help---
2632 ++ MultiPath TCP Opportunistic Linked Increase Congestion Control
2633 ++ To enable it, just put 'olia' in tcp_congestion_control
2634 ++
2635 ++config TCP_CONG_WVEGAS
2636 ++ tristate "MPTCP WVEGAS CONGESTION CONTROL"
2637 ++ depends on MPTCP
2638 ++ default n
2639 ++ ---help---
2640 ++ wVegas congestion control for MPTCP
2641 ++ To enable it, just put 'wvegas' in tcp_congestion_control
2642 ++
2643 + choice
2644 + prompt "Default TCP congestion control"
2645 + default DEFAULT_CUBIC
2646 +@@ -584,6 +608,15 @@ choice
2647 + config DEFAULT_WESTWOOD
2648 + bool "Westwood" if TCP_CONG_WESTWOOD=y
2649 +
2650 ++ config DEFAULT_COUPLED
2651 ++ bool "Coupled" if TCP_CONG_COUPLED=y
2652 ++
2653 ++ config DEFAULT_OLIA
2654 ++ bool "Olia" if TCP_CONG_OLIA=y
2655 ++
2656 ++ config DEFAULT_WVEGAS
2657 ++ bool "Wvegas" if TCP_CONG_WVEGAS=y
2658 ++
2659 + config DEFAULT_RENO
2660 + bool "Reno"
2661 +
2662 +@@ -605,6 +638,8 @@ config DEFAULT_TCP_CONG
2663 + default "vegas" if DEFAULT_VEGAS
2664 + default "westwood" if DEFAULT_WESTWOOD
2665 + default "veno" if DEFAULT_VENO
2666 ++ default "coupled" if DEFAULT_COUPLED
2667 ++ default "wvegas" if DEFAULT_WVEGAS
2668 + default "reno" if DEFAULT_RENO
2669 + default "cubic"
2670 +
2671 +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
2672 +index d156b3c5f363..4afd6d8d9028 100644
2673 +--- a/net/ipv4/af_inet.c
2674 ++++ b/net/ipv4/af_inet.c
2675 +@@ -104,6 +104,7 @@
2676 + #include <net/ip_fib.h>
2677 + #include <net/inet_connection_sock.h>
2678 + #include <net/tcp.h>
2679 ++#include <net/mptcp.h>
2680 + #include <net/udp.h>
2681 + #include <net/udplite.h>
2682 + #include <net/ping.h>
2683 +@@ -246,8 +247,7 @@ EXPORT_SYMBOL(inet_listen);
2684 + * Create an inet socket.
2685 + */
2686 +
2687 +-static int inet_create(struct net *net, struct socket *sock, int protocol,
2688 +- int kern)
2689 ++int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
2690 + {
2691 + struct sock *sk;
2692 + struct inet_protosw *answer;
2693 +@@ -676,6 +676,23 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
2694 + lock_sock(sk2);
2695 +
2696 + sock_rps_record_flow(sk2);
2697 ++
2698 ++ if (sk2->sk_protocol == IPPROTO_TCP && mptcp(tcp_sk(sk2))) {
2699 ++ struct sock *sk_it = sk2;
2700 ++
2701 ++ mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it)
2702 ++ sock_rps_record_flow(sk_it);
2703 ++
2704 ++ if (tcp_sk(sk2)->mpcb->master_sk) {
2705 ++ sk_it = tcp_sk(sk2)->mpcb->master_sk;
2706 ++
2707 ++ write_lock_bh(&sk_it->sk_callback_lock);
2708 ++ sk_it->sk_wq = newsock->wq;
2709 ++ sk_it->sk_socket = newsock;
2710 ++ write_unlock_bh(&sk_it->sk_callback_lock);
2711 ++ }
2712 ++ }
2713 ++
2714 + WARN_ON(!((1 << sk2->sk_state) &
2715 + (TCPF_ESTABLISHED | TCPF_SYN_RECV |
2716 + TCPF_CLOSE_WAIT | TCPF_CLOSE)));
2717 +@@ -1763,6 +1780,9 @@ static int __init inet_init(void)
2718 +
2719 + ip_init();
2720 +
2721 ++ /* We must initialize MPTCP before TCP. */
2722 ++ mptcp_init();
2723 ++
2724 + tcp_v4_init();
2725 +
2726 + /* Setup TCP slab cache for open requests. */
2727 +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
2728 +index 14d02ea905b6..7d734d8af19b 100644
2729 +--- a/net/ipv4/inet_connection_sock.c
2730 ++++ b/net/ipv4/inet_connection_sock.c
2731 +@@ -23,6 +23,7 @@
2732 + #include <net/route.h>
2733 + #include <net/tcp_states.h>
2734 + #include <net/xfrm.h>
2735 ++#include <net/mptcp.h>
2736 +
2737 + #ifdef INET_CSK_DEBUG
2738 + const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
2739 +@@ -465,8 +466,8 @@ no_route:
2740 + }
2741 + EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
2742 +
2743 +-static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
2744 +- const u32 rnd, const u32 synq_hsize)
2745 ++u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
2746 ++ const u32 synq_hsize)
2747 + {
2748 + return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
2749 + }
2750 +@@ -647,7 +648,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
2751 +
2752 + lopt->clock_hand = i;
2753 +
2754 +- if (lopt->qlen)
2755 ++ if (lopt->qlen && !is_meta_sk(parent))
2756 + inet_csk_reset_keepalive_timer(parent, interval);
2757 + }
2758 + EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
2759 +@@ -664,7 +665,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
2760 + const struct request_sock *req,
2761 + const gfp_t priority)
2762 + {
2763 +- struct sock *newsk = sk_clone_lock(sk, priority);
2764 ++ struct sock *newsk;
2765 ++
2766 ++ newsk = sk_clone_lock(sk, priority);
2767 +
2768 + if (newsk != NULL) {
2769 + struct inet_connection_sock *newicsk = inet_csk(newsk);
2770 +@@ -743,7 +746,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
2771 + {
2772 + struct inet_sock *inet = inet_sk(sk);
2773 + struct inet_connection_sock *icsk = inet_csk(sk);
2774 +- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
2775 ++ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries,
2776 ++ GFP_KERNEL);
2777 +
2778 + if (rc != 0)
2779 + return rc;
2780 +@@ -801,9 +805,14 @@ void inet_csk_listen_stop(struct sock *sk)
2781 +
2782 + while ((req = acc_req) != NULL) {
2783 + struct sock *child = req->sk;
2784 ++ bool mutex_taken = false;
2785 +
2786 + acc_req = req->dl_next;
2787 +
2788 ++ if (is_meta_sk(child)) {
2789 ++ mutex_lock(&tcp_sk(child)->mpcb->mpcb_mutex);
2790 ++ mutex_taken = true;
2791 ++ }
2792 + local_bh_disable();
2793 + bh_lock_sock(child);
2794 + WARN_ON(sock_owned_by_user(child));
2795 +@@ -832,6 +841,8 @@ void inet_csk_listen_stop(struct sock *sk)
2796 +
2797 + bh_unlock_sock(child);
2798 + local_bh_enable();
2799 ++ if (mutex_taken)
2800 ++ mutex_unlock(&tcp_sk(child)->mpcb->mpcb_mutex);
2801 + sock_put(child);
2802 +
2803 + sk_acceptq_removed(sk);
2804 +diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
2805 +index c86624b36a62..0ff3fe004d62 100644
2806 +--- a/net/ipv4/syncookies.c
2807 ++++ b/net/ipv4/syncookies.c
2808 +@@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
2809 + }
2810 + EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
2811 +
2812 +-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
2813 ++__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
2814 ++ __u16 *mssp)
2815 + {
2816 + const struct iphdr *iph = ip_hdr(skb);
2817 + const struct tcphdr *th = tcp_hdr(skb);
2818 +@@ -284,7 +285,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
2819 +
2820 + /* check for timestamp cookie support */
2821 + memset(&tcp_opt, 0, sizeof(tcp_opt));
2822 +- tcp_parse_options(skb, &tcp_opt, 0, NULL);
2823 ++ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
2824 +
2825 + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
2826 + goto out;
2827 +@@ -355,10 +356,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
2828 + /* Try to redo what tcp_v4_send_synack did. */
2829 + req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
2830 +
2831 +- tcp_select_initial_window(tcp_full_space(sk), req->mss,
2832 +- &req->rcv_wnd, &req->window_clamp,
2833 +- ireq->wscale_ok, &rcv_wscale,
2834 +- dst_metric(&rt->dst, RTAX_INITRWND));
2835 ++ tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
2836 ++ &req->rcv_wnd, &req->window_clamp,
2837 ++ ireq->wscale_ok, &rcv_wscale,
2838 ++ dst_metric(&rt->dst, RTAX_INITRWND), sk);
2839 +
2840 + ireq->rcv_wscale = rcv_wscale;
2841 +
2842 +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
2843 +index 9d2118e5fbc7..2cb89f886d45 100644
2844 +--- a/net/ipv4/tcp.c
2845 ++++ b/net/ipv4/tcp.c
2846 +@@ -271,6 +271,7 @@
2847 +
2848 + #include <net/icmp.h>
2849 + #include <net/inet_common.h>
2850 ++#include <net/mptcp.h>
2851 + #include <net/tcp.h>
2852 + #include <net/xfrm.h>
2853 + #include <net/ip.h>
2854 +@@ -371,6 +372,24 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
2855 + return period;
2856 + }
2857 +
2858 ++const struct tcp_sock_ops tcp_specific = {
2859 ++ .__select_window = __tcp_select_window,
2860 ++ .select_window = tcp_select_window,
2861 ++ .select_initial_window = tcp_select_initial_window,
2862 ++ .init_buffer_space = tcp_init_buffer_space,
2863 ++ .set_rto = tcp_set_rto,
2864 ++ .should_expand_sndbuf = tcp_should_expand_sndbuf,
2865 ++ .init_congestion_control = tcp_init_congestion_control,
2866 ++ .send_fin = tcp_send_fin,
2867 ++ .write_xmit = tcp_write_xmit,
2868 ++ .send_active_reset = tcp_send_active_reset,
2869 ++ .write_wakeup = tcp_write_wakeup,
2870 ++ .prune_ofo_queue = tcp_prune_ofo_queue,
2871 ++ .retransmit_timer = tcp_retransmit_timer,
2872 ++ .time_wait = tcp_time_wait,
2873 ++ .cleanup_rbuf = tcp_cleanup_rbuf,
2874 ++};
2875 ++
2876 + /* Address-family independent initialization for a tcp_sock.
2877 + *
2878 + * NOTE: A lot of things set to zero explicitly by call to
2879 +@@ -419,6 +438,8 @@ void tcp_init_sock(struct sock *sk)
2880 + sk->sk_sndbuf = sysctl_tcp_wmem[1];
2881 + sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2882 +
2883 ++ tp->ops = &tcp_specific;
2884 ++
2885 + local_bh_disable();
2886 + sock_update_memcg(sk);
2887 + sk_sockets_allocated_inc(sk);
2888 +@@ -726,6 +747,14 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
2889 + int ret;
2890 +
2891 + sock_rps_record_flow(sk);
2892 ++
2893 ++#ifdef CONFIG_MPTCP
2894 ++ if (mptcp(tcp_sk(sk))) {
2895 ++ struct sock *sk_it;
2896 ++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it)
2897 ++ sock_rps_record_flow(sk_it);
2898 ++ }
2899 ++#endif
2900 + /*
2901 + * We can't seek on a socket input
2902 + */
2903 +@@ -821,8 +850,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
2904 + return NULL;
2905 + }
2906 +
2907 +-static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
2908 +- int large_allowed)
2909 ++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed)
2910 + {
2911 + struct tcp_sock *tp = tcp_sk(sk);
2912 + u32 xmit_size_goal, old_size_goal;
2913 +@@ -872,8 +900,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
2914 + {
2915 + int mss_now;
2916 +
2917 +- mss_now = tcp_current_mss(sk);
2918 +- *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2919 ++ if (mptcp(tcp_sk(sk))) {
2920 ++ mss_now = mptcp_current_mss(sk);
2921 ++ *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2922 ++ } else {
2923 ++ mss_now = tcp_current_mss(sk);
2924 ++ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2925 ++ }
2926 +
2927 + return mss_now;
2928 + }
2929 +@@ -892,11 +925,32 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
2930 + * is fully established.
2931 + */
2932 + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
2933 +- !tcp_passive_fastopen(sk)) {
2934 ++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
2935 ++ tp->mpcb->master_sk : sk)) {
2936 + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
2937 + goto out_err;
2938 + }
2939 +
2940 ++ if (mptcp(tp)) {
2941 ++ struct sock *sk_it = sk;
2942 ++
2943 ++ /* We must check this with the socket lock held because we iterate
2944 ++ * over the subflows.
2945 ++ */
2946 ++ if (!mptcp_can_sendpage(sk)) {
2947 ++ ssize_t ret;
2948 ++
2949 ++ release_sock(sk);
2950 ++ ret = sock_no_sendpage(sk->sk_socket, page, offset,
2951 ++ size, flags);
2952 ++ lock_sock(sk);
2953 ++ return ret;
2954 ++ }
2955 ++
2956 ++ mptcp_for_each_sk(tp->mpcb, sk_it)
2957 ++ sock_rps_record_flow(sk_it);
2958 ++ }
2959 ++
2960 + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2961 +
2962 + mss_now = tcp_send_mss(sk, &size_goal, flags);
2963 +@@ -1001,8 +1055,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
2964 + {
2965 + ssize_t res;
2966 +
2967 +- if (!(sk->sk_route_caps & NETIF_F_SG) ||
2968 +- !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
2969 ++ /* If MPTCP is enabled, we check it later after establishment */
2970 ++ if (!mptcp(tcp_sk(sk)) && (!(sk->sk_route_caps & NETIF_F_SG) ||
2971 ++ !(sk->sk_route_caps & NETIF_F_ALL_CSUM)))
2972 + return sock_no_sendpage(sk->sk_socket, page, offset, size,
2973 + flags);
2974 +
2975 +@@ -1018,6 +1073,9 @@ static inline int select_size(const struct sock *sk, bool sg)
2976 + const struct tcp_sock *tp = tcp_sk(sk);
2977 + int tmp = tp->mss_cache;
2978 +
2979 ++ if (mptcp(tp))
2980 ++ return mptcp_select_size(sk, sg);
2981 ++
2982 + if (sg) {
2983 + if (sk_can_gso(sk)) {
2984 + /* Small frames wont use a full page:
2985 +@@ -1100,11 +1158,18 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
2986 + * is fully established.
2987 + */
2988 + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
2989 +- !tcp_passive_fastopen(sk)) {
2990 ++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
2991 ++ tp->mpcb->master_sk : sk)) {
2992 + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
2993 + goto do_error;
2994 + }
2995 +
2996 ++ if (mptcp(tp)) {
2997 ++ struct sock *sk_it = sk;
2998 ++ mptcp_for_each_sk(tp->mpcb, sk_it)
2999 ++ sock_rps_record_flow(sk_it);
3000 ++ }
3001 ++
3002 + if (unlikely(tp->repair)) {
3003 + if (tp->repair_queue == TCP_RECV_QUEUE) {
3004 + copied = tcp_send_rcvq(sk, msg, size);
3005 +@@ -1132,7 +1197,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3006 + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
3007 + goto out_err;
3008 +
3009 +- sg = !!(sk->sk_route_caps & NETIF_F_SG);
3010 ++ if (mptcp(tp))
3011 ++ sg = mptcp_can_sg(sk);
3012 ++ else
3013 ++ sg = !!(sk->sk_route_caps & NETIF_F_SG);
3014 +
3015 + while (--iovlen >= 0) {
3016 + size_t seglen = iov->iov_len;
3017 +@@ -1183,8 +1251,15 @@ new_segment:
3018 +
3019 + /*
3020 + * Check whether we can use HW checksum.
3021 ++ *
3022 ++ * If dss-csum is enabled, we do not do hw-csum.
3023 ++ * In case of non-mptcp we check the
3024 ++ * device-capabilities.
3025 ++ * In case of mptcp, hw-csum's will be handled
3026 ++ * later in mptcp_write_xmit.
3027 + */
3028 +- if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
3029 ++ if (((mptcp(tp) && !tp->mpcb->dss_csum) || !mptcp(tp)) &&
3030 ++ (mptcp(tp) || sk->sk_route_caps & NETIF_F_ALL_CSUM))
3031 + skb->ip_summed = CHECKSUM_PARTIAL;
3032 +
3033 + skb_entail(sk, skb);
3034 +@@ -1422,7 +1497,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
3035 +
3036 + /* Optimize, __tcp_select_window() is not cheap. */
3037 + if (2*rcv_window_now <= tp->window_clamp) {
3038 +- __u32 new_window = __tcp_select_window(sk);
3039 ++ __u32 new_window = tp->ops->__select_window(sk);
3040 +
3041 + /* Send ACK now, if this read freed lots of space
3042 + * in our buffer. Certainly, new_window is new window.
3043 +@@ -1587,7 +1662,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
3044 + /* Clean up data we have read: This will do ACK frames. */
3045 + if (copied > 0) {
3046 + tcp_recv_skb(sk, seq, &offset);
3047 +- tcp_cleanup_rbuf(sk, copied);
3048 ++ tp->ops->cleanup_rbuf(sk, copied);
3049 + }
3050 + return copied;
3051 + }
3052 +@@ -1623,6 +1698,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3053 +
3054 + lock_sock(sk);
3055 +
3056 ++#ifdef CONFIG_MPTCP
3057 ++ if (mptcp(tp)) {
3058 ++ struct sock *sk_it;
3059 ++ mptcp_for_each_sk(tp->mpcb, sk_it)
3060 ++ sock_rps_record_flow(sk_it);
3061 ++ }
3062 ++#endif
3063 ++
3064 + err = -ENOTCONN;
3065 + if (sk->sk_state == TCP_LISTEN)
3066 + goto out;
3067 +@@ -1761,7 +1844,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3068 + }
3069 + }
3070 +
3071 +- tcp_cleanup_rbuf(sk, copied);
3072 ++ tp->ops->cleanup_rbuf(sk, copied);
3073 +
3074 + if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
3075 + /* Install new reader */
3076 +@@ -1813,7 +1896,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3077 + if (tp->rcv_wnd == 0 &&
3078 + !skb_queue_empty(&sk->sk_async_wait_queue)) {
3079 + tcp_service_net_dma(sk, true);
3080 +- tcp_cleanup_rbuf(sk, copied);
3081 ++ tp->ops->cleanup_rbuf(sk, copied);
3082 + } else
3083 + dma_async_issue_pending(tp->ucopy.dma_chan);
3084 + }
3085 +@@ -1993,7 +2076,7 @@ skip_copy:
3086 + */
3087 +
3088 + /* Clean up data we have read: This will do ACK frames. */
3089 +- tcp_cleanup_rbuf(sk, copied);
3090 ++ tp->ops->cleanup_rbuf(sk, copied);
3091 +
3092 + release_sock(sk);
3093 + return copied;
3094 +@@ -2070,7 +2153,7 @@ static const unsigned char new_state[16] = {
3095 + /* TCP_CLOSING */ TCP_CLOSING,
3096 + };
3097 +
3098 +-static int tcp_close_state(struct sock *sk)
3099 ++int tcp_close_state(struct sock *sk)
3100 + {
3101 + int next = (int)new_state[sk->sk_state];
3102 + int ns = next & TCP_STATE_MASK;
3103 +@@ -2100,7 +2183,7 @@ void tcp_shutdown(struct sock *sk, int how)
3104 + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
3105 + /* Clear out any half completed packets. FIN if needed. */
3106 + if (tcp_close_state(sk))
3107 +- tcp_send_fin(sk);
3108 ++ tcp_sk(sk)->ops->send_fin(sk);
3109 + }
3110 + }
3111 + EXPORT_SYMBOL(tcp_shutdown);
3112 +@@ -2125,6 +2208,11 @@ void tcp_close(struct sock *sk, long timeout)
3113 + int data_was_unread = 0;
3114 + int state;
3115 +
3116 ++ if (is_meta_sk(sk)) {
3117 ++ mptcp_close(sk, timeout);
3118 ++ return;
3119 ++ }
3120 ++
3121 + lock_sock(sk);
3122 + sk->sk_shutdown = SHUTDOWN_MASK;
3123 +
3124 +@@ -2167,7 +2255,7 @@ void tcp_close(struct sock *sk, long timeout)
3125 + /* Unread data was tossed, zap the connection. */
3126 + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
3127 + tcp_set_state(sk, TCP_CLOSE);
3128 +- tcp_send_active_reset(sk, sk->sk_allocation);
3129 ++ tcp_sk(sk)->ops->send_active_reset(sk, sk->sk_allocation);
3130 + } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
3131 + /* Check zero linger _after_ checking for unread data. */
3132 + sk->sk_prot->disconnect(sk, 0);
3133 +@@ -2247,7 +2335,7 @@ adjudge_to_death:
3134 + struct tcp_sock *tp = tcp_sk(sk);
3135 + if (tp->linger2 < 0) {
3136 + tcp_set_state(sk, TCP_CLOSE);
3137 +- tcp_send_active_reset(sk, GFP_ATOMIC);
3138 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
3139 + NET_INC_STATS_BH(sock_net(sk),
3140 + LINUX_MIB_TCPABORTONLINGER);
3141 + } else {
3142 +@@ -2257,7 +2345,8 @@ adjudge_to_death:
3143 + inet_csk_reset_keepalive_timer(sk,
3144 + tmo - TCP_TIMEWAIT_LEN);
3145 + } else {
3146 +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
3147 ++ tcp_sk(sk)->ops->time_wait(sk, TCP_FIN_WAIT2,
3148 ++ tmo);
3149 + goto out;
3150 + }
3151 + }
3152 +@@ -2266,7 +2355,7 @@ adjudge_to_death:
3153 + sk_mem_reclaim(sk);
3154 + if (tcp_check_oom(sk, 0)) {
3155 + tcp_set_state(sk, TCP_CLOSE);
3156 +- tcp_send_active_reset(sk, GFP_ATOMIC);
3157 ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
3158 + NET_INC_STATS_BH(sock_net(sk),
3159 + LINUX_MIB_TCPABORTONMEMORY);
3160 + }
3161 +@@ -2291,15 +2380,6 @@ out:
3162 + }
3163 + EXPORT_SYMBOL(tcp_close);
3164 +
3165 +-/* These states need RST on ABORT according to RFC793 */
3166 +-
3167 +-static inline bool tcp_need_reset(int state)
3168 +-{
3169 +- return (1 << state) &
3170 +- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
3171 +- TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
3172 +-}
3173 +-
3174 + int tcp_disconnect(struct sock *sk, int flags)
3175 + {
3176 + struct inet_sock *inet = inet_sk(sk);
3177 +@@ -2322,7 +2402,7 @@ int tcp_disconnect(struct sock *sk, int flags)
3178 + /* The last check adjusts for discrepancy of Linux wrt. RFC
3179 + * states
3180 + */
3181 +- tcp_send_active_reset(sk, gfp_any());
3182 ++ tp->ops->send_active_reset(sk, gfp_any());
3183 + sk->sk_err = ECONNRESET;
3184 + } else if (old_state == TCP_SYN_SENT)
3185 + sk->sk_err = ECONNRESET;
3186 +@@ -2340,6 +2420,13 @@ int tcp_disconnect(struct sock *sk, int flags)
3187 + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
3188 + inet_reset_saddr(sk);
3189 +
3190 ++ if (is_meta_sk(sk)) {
3191 ++ mptcp_disconnect(sk);
3192 ++ } else {
3193 ++ if (tp->inside_tk_table)
3194 ++ mptcp_hash_remove_bh(tp);
3195 ++ }
3196 ++
3197 + sk->sk_shutdown = 0;
3198 + sock_reset_flag(sk, SOCK_DONE);
3199 + tp->srtt_us = 0;
3200 +@@ -2632,6 +2719,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3201 + break;
3202 +
3203 + case TCP_DEFER_ACCEPT:
3204 ++ /* An established MPTCP-connection (mptcp(tp) only returns true
3205 ++ * if the socket is established) should not use DEFER on new
3206 ++ * subflows.
3207 ++ */
3208 ++ if (mptcp(tp))
3209 ++ break;
3210 + /* Translate value in seconds to number of retransmits */
3211 + icsk->icsk_accept_queue.rskq_defer_accept =
3212 + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3213 +@@ -2659,7 +2752,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3214 + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3215 + inet_csk_ack_scheduled(sk)) {
3216 + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
3217 +- tcp_cleanup_rbuf(sk, 1);
3218 ++ tp->ops->cleanup_rbuf(sk, 1);
3219 + if (!(val & 1))
3220 + icsk->icsk_ack.pingpong = 1;
3221 + }
3222 +@@ -2699,6 +2792,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3223 + tp->notsent_lowat = val;
3224 + sk->sk_write_space(sk);
3225 + break;
3226 ++#ifdef CONFIG_MPTCP
3227 ++ case MPTCP_ENABLED:
3228 ++ if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_LISTEN) {
3229 ++ if (val)
3230 ++ tp->mptcp_enabled = 1;
3231 ++ else
3232 ++ tp->mptcp_enabled = 0;
3233 ++ } else {
3234 ++ err = -EPERM;
3235 ++ }
3236 ++ break;
3237 ++#endif
3238 + default:
3239 + err = -ENOPROTOOPT;
3240 + break;
3241 +@@ -2931,6 +3036,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
3242 + case TCP_NOTSENT_LOWAT:
3243 + val = tp->notsent_lowat;
3244 + break;
3245 ++#ifdef CONFIG_MPTCP
3246 ++ case MPTCP_ENABLED:
3247 ++ val = tp->mptcp_enabled;
3248 ++ break;
3249 ++#endif
3250 + default:
3251 + return -ENOPROTOOPT;
3252 + }
3253 +@@ -3120,8 +3230,11 @@ void tcp_done(struct sock *sk)
3254 + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3255 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3256 +
3257 ++ WARN_ON(sk->sk_state == TCP_CLOSE);
3258 + tcp_set_state(sk, TCP_CLOSE);
3259 ++
3260 + tcp_clear_xmit_timers(sk);
3261 ++
3262 + if (req != NULL)
3263 + reqsk_fastopen_remove(sk, req, false);
3264 +
3265 +diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
3266 +index 9771563ab564..5c230d96c4c1 100644
3267 +--- a/net/ipv4/tcp_fastopen.c
3268 ++++ b/net/ipv4/tcp_fastopen.c
3269 +@@ -7,6 +7,7 @@
3270 + #include <linux/rculist.h>
3271 + #include <net/inetpeer.h>
3272 + #include <net/tcp.h>
3273 ++#include <net/mptcp.h>
3274 +
3275 + int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
3276 +
3277 +@@ -133,7 +134,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
3278 + {
3279 + struct tcp_sock *tp;
3280 + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
3281 +- struct sock *child;
3282 ++ struct sock *child, *meta_sk;
3283 +
3284 + req->num_retrans = 0;
3285 + req->num_timeout = 0;
3286 +@@ -176,13 +177,6 @@ static bool tcp_fastopen_create_child(struct sock *sk,
3287 + /* Add the child socket directly into the accept queue */
3288 + inet_csk_reqsk_queue_add(sk, req, child);
3289 +
3290 +- /* Now finish processing the fastopen child socket. */
3291 +- inet_csk(child)->icsk_af_ops->rebuild_header(child);
3292 +- tcp_init_congestion_control(child);
3293 +- tcp_mtup_init(child);
3294 +- tcp_init_metrics(child);
3295 +- tcp_init_buffer_space(child);
3296 +-
3297 + /* Queue the data carried in the SYN packet. We need to first
3298 + * bump skb's refcnt because the caller will attempt to free it.
3299 + *
3300 +@@ -199,8 +193,24 @@ static bool tcp_fastopen_create_child(struct sock *sk,
3301 + tp->syn_data_acked = 1;
3302 + }
3303 + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3304 ++
3305 ++ meta_sk = child;
3306 ++ if (!mptcp_check_req_fastopen(meta_sk, req)) {
3307 ++ child = tcp_sk(meta_sk)->mpcb->master_sk;
3308 ++ tp = tcp_sk(child);
3309 ++ }
3310 ++
3311 ++ /* Now finish processing the fastopen child socket. */
3312 ++ inet_csk(child)->icsk_af_ops->rebuild_header(child);
3313 ++ tp->ops->init_congestion_control(child);
3314 ++ tcp_mtup_init(child);
3315 ++ tcp_init_metrics(child);
3316 ++ tp->ops->init_buffer_space(child);
3317 ++
3318 + sk->sk_data_ready(sk);
3319 +- bh_unlock_sock(child);
3320 ++ if (mptcp(tcp_sk(child)))
3321 ++ bh_unlock_sock(child);
3322 ++ bh_unlock_sock(meta_sk);
3323 + sock_put(child);
3324 + WARN_ON(req->sk == NULL);
3325 + return true;
3326 +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
3327 +index 40639c288dc2..3273bb69f387 100644
3328 +--- a/net/ipv4/tcp_input.c
3329 ++++ b/net/ipv4/tcp_input.c
3330 +@@ -74,6 +74,9 @@
3331 + #include <linux/ipsec.h>
3332 + #include <asm/unaligned.h>
3333 + #include <net/netdma.h>
3334 ++#include <net/mptcp.h>
3335 ++#include <net/mptcp_v4.h>
3336 ++#include <net/mptcp_v6.h>
3337 +
3338 + int sysctl_tcp_timestamps __read_mostly = 1;
3339 + int sysctl_tcp_window_scaling __read_mostly = 1;
3340 +@@ -99,25 +102,6 @@ int sysctl_tcp_thin_dupack __read_mostly;
3341 + int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
3342 + int sysctl_tcp_early_retrans __read_mostly = 3;
3343 +
3344 +-#define FLAG_DATA 0x01 /* Incoming frame contained data. */
3345 +-#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
3346 +-#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
3347 +-#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
3348 +-#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
3349 +-#define FLAG_DATA_SACKED 0x20 /* New SACK. */
3350 +-#define FLAG_ECE 0x40 /* ECE in this ACK */
3351 +-#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
3352 +-#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
3353 +-#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
3354 +-#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
3355 +-#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
3356 +-#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
3357 +-
3358 +-#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
3359 +-#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
3360 +-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
3361 +-#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
3362 +-
3363 + #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
3364 + #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
3365 +
3366 +@@ -181,7 +165,7 @@ static void tcp_incr_quickack(struct sock *sk)
3367 + icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
3368 + }
3369 +
3370 +-static void tcp_enter_quickack_mode(struct sock *sk)
3371 ++void tcp_enter_quickack_mode(struct sock *sk)
3372 + {
3373 + struct inet_connection_sock *icsk = inet_csk(sk);
3374 + tcp_incr_quickack(sk);
3375 +@@ -283,8 +267,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
3376 + per_mss = roundup_pow_of_two(per_mss) +
3377 + SKB_DATA_ALIGN(sizeof(struct sk_buff));
3378 +
3379 +- nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
3380 +- nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
3381 ++ if (mptcp(tp)) {
3382 ++ nr_segs = mptcp_check_snd_buf(tp);
3383 ++ } else {
3384 ++ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
3385 ++ nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
3386 ++ }
3387 +
3388 + /* Fast Recovery (RFC 5681 3.2) :
3389 + * Cubic needs 1.7 factor, rounded to 2 to include
3390 +@@ -292,8 +280,16 @@ static void tcp_sndbuf_expand(struct sock *sk)
3391 + */
3392 + sndmem = 2 * nr_segs * per_mss;
3393 +
3394 +- if (sk->sk_sndbuf < sndmem)
3395 ++ /* MPTCP: after this sndmem is the new contribution of the
3396 ++ * current subflow to the aggregated sndbuf */
3397 ++ if (sk->sk_sndbuf < sndmem) {
3398 ++ int old_sndbuf = sk->sk_sndbuf;
3399 + sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
3400 ++ /* MPTCP: ok, the subflow sndbuf has grown, reflect
3401 ++ * this in the aggregate buffer. */
3402 ++ if (mptcp(tp) && old_sndbuf != sk->sk_sndbuf)
3403 ++ mptcp_update_sndbuf(tp);
3404 ++ }
3405 + }
3406 +
3407 + /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
3408 +@@ -342,10 +338,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
3409 + static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
3410 + {
3411 + struct tcp_sock *tp = tcp_sk(sk);
3412 ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
3413 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
3414 +
3415 + /* Check #1 */
3416 +- if (tp->rcv_ssthresh < tp->window_clamp &&
3417 +- (int)tp->rcv_ssthresh < tcp_space(sk) &&
3418 ++ if (meta_tp->rcv_ssthresh < meta_tp->window_clamp &&
3419 ++ (int)meta_tp->rcv_ssthresh < tcp_space(sk) &&
3420 + !sk_under_memory_pressure(sk)) {
3421 + int incr;
3422 +
3423 +@@ -353,14 +351,14 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
3424 + * will fit to rcvbuf in future.
3425 + */
3426 + if (tcp_win_from_space(skb->truesize) <= skb->len)
3427 +- incr = 2 * tp->advmss;
3428 ++ incr = 2 * meta_tp->advmss;
3429 + else
3430 +- incr = __tcp_grow_window(sk, skb);
3431 ++ incr = __tcp_grow_window(meta_sk, skb);
3432 +
3433 + if (incr) {
3434 + incr = max_t(int, incr, 2 * skb->len);
3435 +- tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
3436 +- tp->window_clamp);
3437 ++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr,
3438 ++ meta_tp->window_clamp);
3439 + inet_csk(sk)->icsk_ack.quick |= 1;
3440 + }
3441 + }
3442 +@@ -543,7 +541,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
3443 + int copied;
3444 +
3445 + time = tcp_time_stamp - tp->rcvq_space.time;
3446 +- if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
3447 ++ if (mptcp(tp)) {
3448 ++ if (mptcp_check_rtt(tp, time))
3449 ++ return;
3450 ++ } else if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
3451 + return;
3452 +
3453 + /* Number of bytes copied to user in last RTT */
3454 +@@ -761,7 +762,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
3455 + /* Calculate rto without backoff. This is the second half of Van Jacobson's
3456 + * routine referred to above.
3457 + */
3458 +-static void tcp_set_rto(struct sock *sk)
3459 ++void tcp_set_rto(struct sock *sk)
3460 + {
3461 + const struct tcp_sock *tp = tcp_sk(sk);
3462 + /* Old crap is replaced with new one. 8)
3463 +@@ -1376,7 +1377,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
3464 + int len;
3465 + int in_sack;
3466 +
3467 +- if (!sk_can_gso(sk))
3468 ++ /* For MPTCP we cannot shift skb-data and remove one skb from the
3469 ++ * send-queue, because this will make us lose the DSS-option (which
3470 ++ * is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing.
3471 ++ */
3472 ++ if (!sk_can_gso(sk) || mptcp(tp))
3473 + goto fallback;
3474 +
3475 + /* Normally R but no L won't result in plain S */
3476 +@@ -2915,7 +2920,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
3477 + return false;
3478 +
3479 + tcp_rtt_estimator(sk, seq_rtt_us);
3480 +- tcp_set_rto(sk);
3481 ++ tp->ops->set_rto(sk);
3482 +
3483 + /* RFC6298: only reset backoff on valid RTT measurement. */
3484 + inet_csk(sk)->icsk_backoff = 0;
3485 +@@ -3000,7 +3005,7 @@ void tcp_resume_early_retransmit(struct sock *sk)
3486 + }
3487 +
3488 + /* If we get here, the whole TSO packet has not been acked. */
3489 +-static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3490 ++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3491 + {
3492 + struct tcp_sock *tp = tcp_sk(sk);
3493 + u32 packets_acked;
3494 +@@ -3095,6 +3100,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3495 + */
3496 + if (!(scb->tcp_flags & TCPHDR_SYN)) {
3497 + flag |= FLAG_DATA_ACKED;
3498 ++ if (mptcp(tp) && mptcp_is_data_seq(skb))
3499 ++ flag |= MPTCP_FLAG_DATA_ACKED;
3500 + } else {
3501 + flag |= FLAG_SYN_ACKED;
3502 + tp->retrans_stamp = 0;
3503 +@@ -3189,7 +3196,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3504 + return flag;
3505 + }
3506 +
3507 +-static void tcp_ack_probe(struct sock *sk)
3508 ++void tcp_ack_probe(struct sock *sk)
3509 + {
3510 + const struct tcp_sock *tp = tcp_sk(sk);
3511 + struct inet_connection_sock *icsk = inet_csk(sk);
3512 +@@ -3236,9 +3243,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3513 + /* Check that window update is acceptable.
3514 + * The function assumes that snd_una<=ack<=snd_next.
3515 + */
3516 +-static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3517 +- const u32 ack, const u32 ack_seq,
3518 +- const u32 nwin)
3519 ++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
3520 ++ const u32 ack_seq, const u32 nwin)
3521 + {
3522 + return after(ack, tp->snd_una) ||
3523 + after(ack_seq, tp->snd_wl1) ||
3524 +@@ -3357,7 +3363,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3525 + }
3526 +
3527 + /* This routine deals with incoming acks, but not outgoing ones. */
3528 +-static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3529 ++static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3530 + {
3531 + struct inet_connection_sock *icsk = inet_csk(sk);
3532 + struct tcp_sock *tp = tcp_sk(sk);
3533 +@@ -3449,6 +3455,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3534 + sack_rtt_us);
3535 + acked -= tp->packets_out;
3536 +
3537 ++ if (mptcp(tp)) {
3538 ++ if (mptcp_fallback_infinite(sk, flag)) {
3539 ++ pr_err("%s resetting flow\n", __func__);
3540 ++ mptcp_send_reset(sk);
3541 ++ goto invalid_ack;
3542 ++ }
3543 ++
3544 ++ mptcp_clean_rtx_infinite(skb, sk);
3545 ++ }
3546 ++
3547 + /* Advance cwnd if state allows */
3548 + if (tcp_may_raise_cwnd(sk, flag))
3549 + tcp_cong_avoid(sk, ack, acked);
3550 +@@ -3512,8 +3528,9 @@ old_ack:
3551 + * the fast version below fails.
3552 + */
3553 + void tcp_parse_options(const struct sk_buff *skb,
3554 +- struct tcp_options_received *opt_rx, int estab,
3555 +- struct tcp_fastopen_cookie *foc)
3556 ++ struct tcp_options_received *opt_rx,
3557 ++ struct mptcp_options_received *mopt,
3558 ++ int estab, struct tcp_fastopen_cookie *foc)
3559 + {
3560 + const unsigned char *ptr;
3561 + const struct tcphdr *th = tcp_hdr(skb);
3562 +@@ -3596,6 +3613,9 @@ void tcp_parse_options(const struct sk_buff *skb,
3563 + */
3564 + break;
3565 + #endif
3566 ++ case TCPOPT_MPTCP:
3567 ++ mptcp_parse_options(ptr - 2, opsize, mopt, skb);
3568 ++ break;
3569 + case TCPOPT_EXP:
3570 + /* Fast Open option shares code 254 using a
3571 + * 16 bits magic number. It's valid only in
3572 +@@ -3657,8 +3677,8 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
3573 + if (tcp_parse_aligned_timestamp(tp, th))
3574 + return true;
3575 + }
3576 +-
3577 +- tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3578 ++ tcp_parse_options(skb, &tp->rx_opt, mptcp(tp) ? &tp->mptcp->rx_opt : NULL,
3579 ++ 1, NULL);
3580 + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3581 + tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3582 +
3583 +@@ -3831,6 +3851,8 @@ static void tcp_fin(struct sock *sk)
3584 + dst = __sk_dst_get(sk);
3585 + if (!dst || !dst_metric(dst, RTAX_QUICKACK))
3586 + inet_csk(sk)->icsk_ack.pingpong = 1;
3587 ++ if (mptcp(tp))
3588 ++ mptcp_sub_close_passive(sk);
3589 + break;
3590 +
3591 + case TCP_CLOSE_WAIT:
3592 +@@ -3852,9 +3874,16 @@ static void tcp_fin(struct sock *sk)
3593 + tcp_set_state(sk, TCP_CLOSING);
3594 + break;
3595 + case TCP_FIN_WAIT2:
3596 ++ if (mptcp(tp)) {
3597 ++ /* The socket will get closed by mptcp_data_ready.
3598 ++ * We first have to process all data-sequences.
3599 ++ */
3600 ++ tp->close_it = 1;
3601 ++ break;
3602 ++ }
3603 + /* Received a FIN -- send ACK and enter TIME_WAIT. */
3604 + tcp_send_ack(sk);
3605 +- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
3606 ++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
3607 + break;
3608 + default:
3609 + /* Only TCP_LISTEN and TCP_CLOSE are left, in these
3610 +@@ -3876,6 +3905,10 @@ static void tcp_fin(struct sock *sk)
3611 + if (!sock_flag(sk, SOCK_DEAD)) {
3612 + sk->sk_state_change(sk);
3613 +
3614 ++ /* Don't wake up MPTCP-subflows */
3615 ++ if (mptcp(tp))
3616 ++ return;
3617 ++
3618 + /* Do not send POLL_HUP for half duplex close. */
3619 + if (sk->sk_shutdown == SHUTDOWN_MASK ||
3620 + sk->sk_state == TCP_CLOSE)
3621 +@@ -4073,7 +4106,11 @@ static void tcp_ofo_queue(struct sock *sk)
3622 + tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
3623 + }
3624 +
3625 +- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
3626 ++ /* In case of MPTCP, the segment may be empty if it's a
3627 ++ * non-data DATA_FIN. (see beginning of tcp_data_queue)
3628 ++ */
3629 ++ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) &&
3630 ++ !(mptcp(tp) && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)) {
3631 + SOCK_DEBUG(sk, "ofo packet was already received\n");
3632 + __skb_unlink(skb, &tp->out_of_order_queue);
3633 + __kfree_skb(skb);
3634 +@@ -4091,12 +4128,14 @@ static void tcp_ofo_queue(struct sock *sk)
3635 + }
3636 + }
3637 +
3638 +-static bool tcp_prune_ofo_queue(struct sock *sk);
3639 + static int tcp_prune_queue(struct sock *sk);
3640 +
3641 + static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3642 + unsigned int size)
3643 + {
3644 ++ if (mptcp(tcp_sk(sk)))
3645 ++ sk = mptcp_meta_sk(sk);
3646 ++
3647 + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
3648 + !sk_rmem_schedule(sk, skb, size)) {
3649 +
3650 +@@ -4104,7 +4143,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3651 + return -1;
3652 +
3653 + if (!sk_rmem_schedule(sk, skb, size)) {
3654 +- if (!tcp_prune_ofo_queue(sk))
3655 ++ if (!tcp_sk(sk)->ops->prune_ofo_queue(sk))
3656 + return -1;
3657 +
3658 + if (!sk_rmem_schedule(sk, skb, size))
3659 +@@ -4127,15 +4166,16 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3660 + * Better try to coalesce them right now to avoid future collapses.
3661 + * Returns true if caller should free @from instead of queueing it
3662 + */
3663 +-static bool tcp_try_coalesce(struct sock *sk,
3664 +- struct sk_buff *to,
3665 +- struct sk_buff *from,
3666 +- bool *fragstolen)
3667 ++bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from,
3668 ++ bool *fragstolen)
3669 + {
3670 + int delta;
3671 +
3672 + *fragstolen = false;
3673 +
3674 ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
3675 ++ return false;
3676 ++
3677 + if (tcp_hdr(from)->fin)
3678 + return false;
3679 +
3680 +@@ -4225,7 +4265,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
3681 +
3682 + /* Do skb overlap to previous one? */
3683 + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
3684 +- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
3685 ++ /* MPTCP allows non-data data-fin to be in the ofo-queue */
3686 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) &&
3687 ++ !(mptcp(tp) && end_seq == seq)) {
3688 + /* All the bits are present. Drop. */
3689 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
3690 + __kfree_skb(skb);
3691 +@@ -4263,6 +4305,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
3692 + end_seq);
3693 + break;
3694 + }
3695 ++ /* MPTCP allows non-data data-fin to be in the ofo-queue */
3696 ++ if (mptcp(tp) && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq)
3697 ++ continue;
3698 + __skb_unlink(skb1, &tp->out_of_order_queue);
3699 + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
3700 + TCP_SKB_CB(skb1)->end_seq);
3701 +@@ -4280,8 +4325,8 @@ end:
3702 + }
3703 + }
3704 +
3705 +-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
3706 +- bool *fragstolen)
3707 ++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
3708 ++ bool *fragstolen)
3709 + {
3710 + int eaten;
3711 + struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
3712 +@@ -4343,7 +4388,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
3713 + int eaten = -1;
3714 + bool fragstolen = false;
3715 +
3716 +- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
3717 ++ /* If no data is present, but a data_fin is in the options, we still
3718 ++ * have to call mptcp_queue_skb later on. */
3719 ++ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
3720 ++ !(mptcp(tp) && mptcp_is_data_fin(skb)))
3721 + goto drop;
3722 +
3723 + skb_dst_drop(skb);
3724 +@@ -4389,7 +4437,7 @@ queue_and_out:
3725 + eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
3726 + }
3727 + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3728 +- if (skb->len)
3729 ++ if (skb->len || mptcp_is_data_fin(skb))
3730 + tcp_event_data_recv(sk, skb);
3731 + if (th->fin)
3732 + tcp_fin(sk);
3733 +@@ -4411,7 +4459,11 @@ queue_and_out:
3734 +
3735 + if (eaten > 0)
3736 + kfree_skb_partial(skb, fragstolen);
3737 +- if (!sock_flag(sk, SOCK_DEAD))
3738 ++ if (!sock_flag(sk, SOCK_DEAD) || mptcp(tp))
3739 ++ /* MPTCP: we always have to call data_ready, because
3740 ++ * we may be about to receive a data-fin, which still
3741 ++ * must get queued.
3742 ++ */
3743 + sk->sk_data_ready(sk);
3744 + return;
3745 + }
3746 +@@ -4463,6 +4515,8 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
3747 + next = skb_queue_next(list, skb);
3748 +
3749 + __skb_unlink(skb, list);
3750 ++ if (mptcp(tcp_sk(sk)))
3751 ++ mptcp_remove_shortcuts(tcp_sk(sk)->mpcb, skb);
3752 + __kfree_skb(skb);
3753 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
3754 +
3755 +@@ -4630,7 +4684,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
3756 + * Purge the out-of-order queue.
3757 + * Return true if queue was pruned.
3758 + */
3759 +-static bool tcp_prune_ofo_queue(struct sock *sk)
3760 ++bool tcp_prune_ofo_queue(struct sock *sk)
3761 + {
3762 + struct tcp_sock *tp = tcp_sk(sk);
3763 + bool res = false;
3764 +@@ -4686,7 +4740,7 @@ static int tcp_prune_queue(struct sock *sk)
3765 + /* Collapsing did not help, destructive actions follow.
3766 + * This must not ever occur. */
3767 +
3768 +- tcp_prune_ofo_queue(sk);
3769 ++ tp->ops->prune_ofo_queue(sk);
3770 +
3771 + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
3772 + return 0;
3773 +@@ -4702,7 +4756,29 @@ static int tcp_prune_queue(struct sock *sk)
3774 + return -1;
3775 + }
3776 +
3777 +-static bool tcp_should_expand_sndbuf(const struct sock *sk)
3778 ++/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
3779 ++ * As additional protections, we do not touch cwnd in retransmission phases,
3780 ++ * and if application hit its sndbuf limit recently.
3781 ++ */
3782 ++void tcp_cwnd_application_limited(struct sock *sk)
3783 ++{
3784 ++ struct tcp_sock *tp = tcp_sk(sk);
3785 ++
3786 ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
3787 ++ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
3788 ++ /* Limited by application or receiver window. */
3789 ++ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
3790 ++ u32 win_used = max(tp->snd_cwnd_used, init_win);
3791 ++ if (win_used < tp->snd_cwnd) {
3792 ++ tp->snd_ssthresh = tcp_current_ssthresh(sk);
3793 ++ tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
3794 ++ }
3795 ++ tp->snd_cwnd_used = 0;
3796 ++ }
3797 ++ tp->snd_cwnd_stamp = tcp_time_stamp;
3798 ++}
3799 ++
3800 ++bool tcp_should_expand_sndbuf(const struct sock *sk)
3801 + {
3802 + const struct tcp_sock *tp = tcp_sk(sk);
3803 +
3804 +@@ -4737,7 +4813,7 @@ static void tcp_new_space(struct sock *sk)
3805 + {
3806 + struct tcp_sock *tp = tcp_sk(sk);
3807 +
3808 +- if (tcp_should_expand_sndbuf(sk)) {
3809 ++ if (tp->ops->should_expand_sndbuf(sk)) {
3810 + tcp_sndbuf_expand(sk);
3811 + tp->snd_cwnd_stamp = tcp_time_stamp;
3812 + }
3813 +@@ -4749,8 +4825,9 @@ static void tcp_check_space(struct sock *sk)
3814 + {
3815 + if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
3816 + sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
3817 +- if (sk->sk_socket &&
3818 +- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
3819 ++ if (mptcp(tcp_sk(sk)) ||
3820 ++ (sk->sk_socket &&
3821 ++ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)))
3822 + tcp_new_space(sk);
3823 + }
3824 + }
3825 +@@ -4773,7 +4850,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
3826 + /* ... and right edge of window advances far enough.
3827 + * (tcp_recvmsg() will send ACK otherwise). Or...
3828 + */
3829 +- __tcp_select_window(sk) >= tp->rcv_wnd) ||
3830 ++ tp->ops->__select_window(sk) >= tp->rcv_wnd) ||
3831 + /* We ACK each frame or... */
3832 + tcp_in_quickack_mode(sk) ||
3833 + /* We have out of order data. */
3834 +@@ -4875,6 +4952,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
3835 + {
3836 + struct tcp_sock *tp = tcp_sk(sk);
3837 +
3838 ++ /* MPTCP urgent data is not yet supported */
3839 ++ if (mptcp(tp))
3840 ++ return;
3841 ++
3842 + /* Check if we get a new urgent pointer - normally not. */
3843 + if (th->urg)
3844 + tcp_check_urg(sk, th);
3845 +@@ -4942,8 +5023,7 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
3846 + }
3847 +
3848 + #ifdef CONFIG_NET_DMA
3849 +-static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
3850 +- int hlen)
3851 ++bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
3852 + {
3853 + struct tcp_sock *tp = tcp_sk(sk);
3854 + int chunk = skb->len - hlen;
3855 +@@ -5052,9 +5132,15 @@ syn_challenge:
3856 + goto discard;
3857 + }
3858 +
3859 ++ /* If valid: post process the received MPTCP options. */
3860 ++ if (mptcp(tp) && mptcp_handle_options(sk, th, skb))
3861 ++ goto discard;
3862 ++
3863 + return true;
3864 +
3865 + discard:
3866 ++ if (mptcp(tp))
3867 ++ mptcp_reset_mopt(tp);
3868 + __kfree_skb(skb);
3869 + return false;
3870 + }
3871 +@@ -5106,6 +5192,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3872 +
3873 + tp->rx_opt.saw_tstamp = 0;
3874 +
3875 ++ /* MPTCP: force slowpath. */
3876 ++ if (mptcp(tp))
3877 ++ goto slow_path;
3878 ++
3879 + /* pred_flags is 0xS?10 << 16 + snd_wnd
3880 + * if header_prediction is to be made
3881 + * 'S' will always be tp->tcp_header_len >> 2
3882 +@@ -5205,7 +5295,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3883 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
3884 + }
3885 + if (copied_early)
3886 +- tcp_cleanup_rbuf(sk, skb->len);
3887 ++ tp->ops->cleanup_rbuf(sk, skb->len);
3888 + }
3889 + if (!eaten) {
3890 + if (tcp_checksum_complete_user(sk, skb))
3891 +@@ -5313,14 +5403,14 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
3892 +
3893 + tcp_init_metrics(sk);
3894 +
3895 +- tcp_init_congestion_control(sk);
3896 ++ tp->ops->init_congestion_control(sk);
3897 +
3898 + /* Prevent spurious tcp_cwnd_restart() on first data
3899 + * packet.
3900 + */
3901 + tp->lsndtime = tcp_time_stamp;
3902 +
3903 +- tcp_init_buffer_space(sk);
3904 ++ tp->ops->init_buffer_space(sk);
3905 +
3906 + if (sock_flag(sk, SOCK_KEEPOPEN))
3907 + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
3908 +@@ -5350,7 +5440,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
3909 + /* Get original SYNACK MSS value if user MSS sets mss_clamp */
3910 + tcp_clear_options(&opt);
3911 + opt.user_mss = opt.mss_clamp = 0;
3912 +- tcp_parse_options(synack, &opt, 0, NULL);
3913 ++ tcp_parse_options(synack, &opt, NULL, 0, NULL);
3914 + mss = opt.mss_clamp;
3915 + }
3916 +
3917 +@@ -5365,7 +5455,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
3918 +
3919 + tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
3920 +
3921 +- if (data) { /* Retransmit unacked data in SYN */
3922 ++ /* In mptcp case, we do not rely on "retransmit", but instead on
3923 ++ * "transmit", because if fastopen data is not acked, the retransmission
3924 ++ * becomes the first MPTCP data (see mptcp_rcv_synsent_fastopen).
3925 ++ */
3926 ++ if (data && !mptcp(tp)) { /* Retransmit unacked data in SYN */
3927 + tcp_for_write_queue_from(data, sk) {
3928 + if (data == tcp_send_head(sk) ||
3929 + __tcp_retransmit_skb(sk, data))
3930 +@@ -5388,8 +5482,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3931 + struct tcp_sock *tp = tcp_sk(sk);
3932 + struct tcp_fastopen_cookie foc = { .len = -1 };
3933 + int saved_clamp = tp->rx_opt.mss_clamp;
3934 ++ struct mptcp_options_received mopt;
3935 ++ mptcp_init_mp_opt(&mopt);
3936 +
3937 +- tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
3938 ++ tcp_parse_options(skb, &tp->rx_opt,
3939 ++ mptcp(tp) ? &tp->mptcp->rx_opt : &mopt, 0, &foc);
3940 + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3941 + tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3942 +
3943 +@@ -5448,6 +5545,30 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3944 + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
3945 + tcp_ack(sk, skb, FLAG_SLOWPATH);
3946 +
3947 ++ if (tp->request_mptcp || mptcp(tp)) {
3948 ++ int ret;
3949 ++ ret = mptcp_rcv_synsent_state_process(sk, &sk,
3950 ++ skb, &mopt);
3951 ++
3952 ++ /* May have changed if we support MPTCP */
3953 ++ tp = tcp_sk(sk);
3954 ++ icsk = inet_csk(sk);
3955 ++
3956 ++ if (ret == 1)
3957 ++ goto reset_and_undo;
3958 ++ if (ret == 2)
3959 ++ goto discard;
3960 ++ }
3961 ++
3962 ++ if (mptcp(tp) && !is_master_tp(tp)) {
3963 ++ /* Timer for repeating the ACK until an answer
3964 ++ * arrives. Used only when establishing an additional
3965 ++ * subflow inside of an MPTCP connection.
3966 ++ */
3967 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
3968 ++ jiffies + icsk->icsk_rto);
3969 ++ }
3970 ++
3971 + /* Ok.. it's good. Set up sequence numbers and
3972 + * move to established.
3973 + */
3974 +@@ -5474,6 +5595,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3975 + tp->tcp_header_len = sizeof(struct tcphdr);
3976 + }
3977 +
3978 ++ if (mptcp(tp)) {
3979 ++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
3980 ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
3981 ++ }
3982 ++
3983 + if (tcp_is_sack(tp) && sysctl_tcp_fack)
3984 + tcp_enable_fack(tp);
3985 +
3986 +@@ -5494,9 +5620,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3987 + tcp_rcv_fastopen_synack(sk, skb, &foc))
3988 + return -1;
3989 +
3990 +- if (sk->sk_write_pending ||
3991 ++ /* With MPTCP we cannot send data on the third ack due to the
3992 ++ * lack of option-space to combine with an MP_CAPABLE.
3993 ++ */
3994 ++ if (!mptcp(tp) && (sk->sk_write_pending ||
3995 + icsk->icsk_accept_queue.rskq_defer_accept ||
3996 +- icsk->icsk_ack.pingpong) {
3997 ++ icsk->icsk_ack.pingpong)) {
3998 + /* Save one ACK. Data will be ready after
3999 + * several ticks, if write_pending is set.
4000 + *
4001 +@@ -5536,6 +5665,7 @@ discard:
4002 + tcp_paws_reject(&tp->rx_opt, 0))
4003 + goto discard_and_undo;
4004 +
4005 ++ /* TODO - check this here for MPTCP */
4006 + if (th->syn) {
4007 + /* We see SYN without ACK. It is attempt of
4008 + * simultaneous connect with crossed SYNs.
4009 +@@ -5552,6 +5682,11 @@ discard:
4010 + tp->tcp_header_len = sizeof(struct tcphdr);
4011 + }
4012 +
4013 ++ if (mptcp(tp)) {
4014 ++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
4015 ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
4016 ++ }
4017 ++
4018 + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
4019 + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
4020 +
4021 +@@ -5610,6 +5745,7 @@ reset_and_undo:
4022 +
4023 + int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4024 + const struct tcphdr *th, unsigned int len)
4025 ++ __releases(&sk->sk_lock.slock)
4026 + {
4027 + struct tcp_sock *tp = tcp_sk(sk);
4028 + struct inet_connection_sock *icsk = inet_csk(sk);
4029 +@@ -5661,6 +5797,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4030 +
4031 + case TCP_SYN_SENT:
4032 + queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4033 ++ if (is_meta_sk(sk)) {
4034 ++ sk = tcp_sk(sk)->mpcb->master_sk;
4035 ++ tp = tcp_sk(sk);
4036 ++
4037 ++ /* Need to call it here, because it will announce new
4038 ++ * addresses, which can only be done after the third ack
4039 ++ * of the 3-way handshake.
4040 ++ */
4041 ++ mptcp_update_metasocket(sk, tp->meta_sk);
4042 ++ }
4043 + if (queued >= 0)
4044 + return queued;
4045 +
4046 +@@ -5668,6 +5814,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4047 + tcp_urg(sk, skb, th);
4048 + __kfree_skb(skb);
4049 + tcp_data_snd_check(sk);
4050 ++ if (mptcp(tp) && is_master_tp(tp))
4051 ++ bh_unlock_sock(sk);
4052 + return 0;
4053 + }
4054 +
4055 +@@ -5706,11 +5854,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4056 + synack_stamp = tp->lsndtime;
4057 + /* Make sure socket is routed, for correct metrics. */
4058 + icsk->icsk_af_ops->rebuild_header(sk);
4059 +- tcp_init_congestion_control(sk);
4060 ++ tp->ops->init_congestion_control(sk);
4061 +
4062 + tcp_mtup_init(sk);
4063 + tp->copied_seq = tp->rcv_nxt;
4064 +- tcp_init_buffer_space(sk);
4065 ++ tp->ops->init_buffer_space(sk);
4066 + }
4067 + smp_mb();
4068 + tcp_set_state(sk, TCP_ESTABLISHED);
4069 +@@ -5730,6 +5878,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4070 +
4071 + if (tp->rx_opt.tstamp_ok)
4072 + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
4073 ++ if (mptcp(tp))
4074 ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
4075 +
4076 + if (req) {
4077 + /* Re-arm the timer because data may have been sent out.
4078 +@@ -5751,6 +5901,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4079 +
4080 + tcp_initialize_rcv_mss(sk);
4081 + tcp_fast_path_on(tp);
4082 ++ /* Send an ACK when establishing a new
4083 ++ * MPTCP subflow, i.e. using an MP_JOIN
4084 ++ * subtype.
4085 ++ */
4086 ++ if (mptcp(tp) && !is_master_tp(tp))
4087 ++ tcp_send_ack(sk);
4088 + break;
4089 +
4090 + case TCP_FIN_WAIT1: {
4091 +@@ -5802,7 +5958,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4092 + tmo = tcp_fin_time(sk);
4093 + if (tmo > TCP_TIMEWAIT_LEN) {
4094 + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
4095 +- } else if (th->fin || sock_owned_by_user(sk)) {
4096 ++ } else if (th->fin || mptcp_is_data_fin(skb) ||
4097 ++ sock_owned_by_user(sk)) {
4098 + /* Bad case. We could lose such FIN otherwise.
4099 + * It is not a big problem, but it looks confusing
4100 + * and not so rare event. We still can lose it now,
4101 +@@ -5811,7 +5968,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4102 + */
4103 + inet_csk_reset_keepalive_timer(sk, tmo);
4104 + } else {
4105 +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
4106 ++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
4107 + goto discard;
4108 + }
4109 + break;
4110 +@@ -5819,7 +5976,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4111 +
4112 + case TCP_CLOSING:
4113 + if (tp->snd_una == tp->write_seq) {
4114 +- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
4115 ++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
4116 + goto discard;
4117 + }
4118 + break;
4119 +@@ -5831,6 +5988,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4120 + goto discard;
4121 + }
4122 + break;
4123 ++ case TCP_CLOSE:
4124 ++ if (tp->mp_killed)
4125 ++ goto discard;
4126 + }
4127 +
4128 + /* step 6: check the URG bit */
4129 +@@ -5851,7 +6011,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4130 + */
4131 + if (sk->sk_shutdown & RCV_SHUTDOWN) {
4132 + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4133 +- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
4134 ++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
4135 ++ !mptcp(tp)) {
4136 ++ /* In case of mptcp, the reset is handled by
4137 ++ * mptcp_rcv_state_process
4138 ++ */
4139 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
4140 + tcp_reset(sk);
4141 + return 1;
4142 +@@ -5877,3 +6041,154 @@ discard:
4143 + return 0;
4144 + }
4145 + EXPORT_SYMBOL(tcp_rcv_state_process);
4146 ++
4147 ++static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
4148 ++{
4149 ++ struct inet_request_sock *ireq = inet_rsk(req);
4150 ++
4151 ++ if (family == AF_INET)
4152 ++ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
4153 ++ &ireq->ir_rmt_addr, port);
4154 ++#if IS_ENABLED(CONFIG_IPV6)
4155 ++ else if (family == AF_INET6)
4156 ++ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
4157 ++ &ireq->ir_v6_rmt_addr, port);
4158 ++#endif
4159 ++}
4160 ++
4161 ++int tcp_conn_request(struct request_sock_ops *rsk_ops,
4162 ++ const struct tcp_request_sock_ops *af_ops,
4163 ++ struct sock *sk, struct sk_buff *skb)
4164 ++{
4165 ++ struct tcp_options_received tmp_opt;
4166 ++ struct request_sock *req;
4167 ++ struct tcp_sock *tp = tcp_sk(sk);
4168 ++ struct dst_entry *dst = NULL;
4169 ++ __u32 isn = TCP_SKB_CB(skb)->when;
4170 ++ bool want_cookie = false, fastopen;
4171 ++ struct flowi fl;
4172 ++ struct tcp_fastopen_cookie foc = { .len = -1 };
4173 ++ int err;
4174 ++
4175 ++
4176 ++ /* TW buckets are converted to open requests without
4177 ++ * limitations, they conserve resources and peer is
4178 ++ * evidently real one.
4179 ++ */
4180 ++ if ((sysctl_tcp_syncookies == 2 ||
4181 ++ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
4182 ++ want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
4183 ++ if (!want_cookie)
4184 ++ goto drop;
4185 ++ }
4186 ++
4187 ++
4188 ++ /* Accept backlog is full. If we have already queued enough
4189 ++ * of warm entries in syn queue, drop request. It is better than
4190 ++ * clogging syn queue with openreqs with exponentially increasing
4191 ++ * timeout.
4192 ++ */
4193 ++ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
4194 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
4195 ++ goto drop;
4196 ++ }
4197 ++
4198 ++ req = inet_reqsk_alloc(rsk_ops);
4199 ++ if (!req)
4200 ++ goto drop;
4201 ++
4202 ++ tcp_rsk(req)->af_specific = af_ops;
4203 ++
4204 ++ tcp_clear_options(&tmp_opt);
4205 ++ tmp_opt.mss_clamp = af_ops->mss_clamp;
4206 ++ tmp_opt.user_mss = tp->rx_opt.user_mss;
4207 ++ tcp_parse_options(skb, &tmp_opt, NULL, 0, want_cookie ? NULL : &foc);
4208 ++
4209 ++ if (want_cookie && !tmp_opt.saw_tstamp)
4210 ++ tcp_clear_options(&tmp_opt);
4211 ++
4212 ++ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
4213 ++ tcp_openreq_init(req, &tmp_opt, skb);
4214 ++
4215 ++ if (af_ops->init_req(req, sk, skb))
4216 ++ goto drop_and_free;
4217 ++
4218 ++ if (security_inet_conn_request(sk, skb, req))
4219 ++ goto drop_and_free;
4220 ++
4221 ++ if (!want_cookie || tmp_opt.tstamp_ok)
4222 ++ TCP_ECN_create_request(req, skb, sock_net(sk));
4223 ++
4224 ++ if (want_cookie) {
4225 ++ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
4226 ++ req->cookie_ts = tmp_opt.tstamp_ok;
4227 ++ } else if (!isn) {
4228 ++ /* VJ's idea. We save last timestamp seen
4229 ++ * from the destination in peer table, when entering
4230 ++ * state TIME-WAIT, and check against it before
4231 ++ * accepting new connection request.
4232 ++ *
4233 ++ * If "isn" is not zero, this request hit alive
4234 ++ * timewait bucket, so that all the necessary checks
4235 ++ * are made in the function processing timewait state.
4236 ++ */
4237 ++ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
4238 ++ bool strict;
4239 ++
4240 ++ dst = af_ops->route_req(sk, &fl, req, &strict);
4241 ++ if (dst && strict &&
4242 ++ !tcp_peer_is_proven(req, dst, true)) {
4243 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
4244 ++ goto drop_and_release;
4245 ++ }
4246 ++ }
4247 ++ /* Kill the following clause, if you dislike this way. */
4248 ++ else if (!sysctl_tcp_syncookies &&
4249 ++ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
4250 ++ (sysctl_max_syn_backlog >> 2)) &&
4251 ++ !tcp_peer_is_proven(req, dst, false)) {
4252 ++ /* Without syncookies last quarter of
4253 ++ * backlog is filled with destinations,
4254 ++ * proven to be alive.
4255 ++ * It means that we continue to communicate
4256 ++ * to destinations, already remembered
4257 ++ * to the moment of synflood.
4258 ++ */
4259 ++ pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
4260 ++ rsk_ops->family);
4261 ++ goto drop_and_release;
4262 ++ }
4263 ++
4264 ++ isn = af_ops->init_seq(skb);
4265 ++ }
4266 ++ if (!dst) {
4267 ++ dst = af_ops->route_req(sk, &fl, req, NULL);
4268 ++ if (!dst)
4269 ++ goto drop_and_free;
4270 ++ }
4271 ++
4272 ++ tcp_rsk(req)->snt_isn = isn;
4273 ++ tcp_openreq_init_rwin(req, sk, dst);
4274 ++ fastopen = !want_cookie &&
4275 ++ tcp_try_fastopen(sk, skb, req, &foc, dst);
4276 ++ err = af_ops->send_synack(sk, dst, &fl, req,
4277 ++ skb_get_queue_mapping(skb), &foc);
4278 ++ if (!fastopen) {
4279 ++ if (err || want_cookie)
4280 ++ goto drop_and_free;
4281 ++
4282 ++ tcp_rsk(req)->listener = NULL;
4283 ++ af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
4284 ++ }
4285 ++
4286 ++ return 0;
4287 ++
4288 ++drop_and_release:
4289 ++ dst_release(dst);
4290 ++drop_and_free:
4291 ++ reqsk_free(req);
4292 ++drop:
4293 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
4294 ++ return 0;
4295 ++}
4296 ++EXPORT_SYMBOL(tcp_conn_request);
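
tcp_parse_options() now hands any kind-30 option to mptcp_parse_options() together with the new struct mptcp_options_received, while callers without MPTCP state (request handling, SYN-ACK MSS probing) simply pass NULL for that argument. For orientation, a standalone sketch of how the subtype is pulled out of such an option; the subtype numbers follow RFC 6824, and the real parsing is done by mptcp_parse_options() in the MPTCP core, which this file only calls:

#include <stdint.h>

#define TCPOPT_MPTCP	30	/* option kind reserved for MPTCP (RFC 6824) */

/* RFC 6824 subtypes; only the first few are listed here. */
enum { MP_CAPABLE = 0x0, MP_JOIN = 0x1, MP_DSS = 0x2 };

/* opt points at the option's kind byte, i.e. the same "ptr - 2" that the
 * patched tcp_parse_options() passes to mptcp_parse_options(), and opsize is
 * the full option length. */
int mptcp_subtype(const uint8_t *opt, unsigned int opsize)
{
	if (opt[0] != TCPOPT_MPTCP || opsize < 3)
		return -1;
	return opt[2] >> 4;	/* subtype lives in the upper nibble of byte 2 */
}

mptcp_parse_options() then fills the struct mptcp_options_received that tcp_fast_parse_options() and tcp_rcv_synsent_state_process() above hand in.
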
4297 +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
4298 +index 77cccda1ad0c..c77017f600f1 100644
4299 +--- a/net/ipv4/tcp_ipv4.c
4300 ++++ b/net/ipv4/tcp_ipv4.c
4301 +@@ -67,6 +67,8 @@
4302 + #include <net/icmp.h>
4303 + #include <net/inet_hashtables.h>
4304 + #include <net/tcp.h>
4305 ++#include <net/mptcp.h>
4306 ++#include <net/mptcp_v4.h>
4307 + #include <net/transp_v6.h>
4308 + #include <net/ipv6.h>
4309 + #include <net/inet_common.h>
4310 +@@ -99,7 +101,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
4311 + struct inet_hashinfo tcp_hashinfo;
4312 + EXPORT_SYMBOL(tcp_hashinfo);
4313 +
4314 +-static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
4315 ++__u32 tcp_v4_init_sequence(const struct sk_buff *skb)
4316 + {
4317 + return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
4318 + ip_hdr(skb)->saddr,
4319 +@@ -334,7 +336,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4320 + struct inet_sock *inet;
4321 + const int type = icmp_hdr(icmp_skb)->type;
4322 + const int code = icmp_hdr(icmp_skb)->code;
4323 +- struct sock *sk;
4324 ++ struct sock *sk, *meta_sk;
4325 + struct sk_buff *skb;
4326 + struct request_sock *fastopen;
4327 + __u32 seq, snd_una;
4328 +@@ -358,13 +360,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4329 + return;
4330 + }
4331 +
4332 +- bh_lock_sock(sk);
4333 ++ tp = tcp_sk(sk);
4334 ++ if (mptcp(tp))
4335 ++ meta_sk = mptcp_meta_sk(sk);
4336 ++ else
4337 ++ meta_sk = sk;
4338 ++
4339 ++ bh_lock_sock(meta_sk);
4340 + /* If too many ICMPs get dropped on busy
4341 + * servers this needs to be solved differently.
4342 + * We do take care of PMTU discovery (RFC1191) special case :
4343 + * we can receive locally generated ICMP messages while socket is held.
4344 + */
4345 +- if (sock_owned_by_user(sk)) {
4346 ++ if (sock_owned_by_user(meta_sk)) {
4347 + if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
4348 + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
4349 + }
4350 +@@ -377,7 +385,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4351 + }
4352 +
4353 + icsk = inet_csk(sk);
4354 +- tp = tcp_sk(sk);
4355 + seq = ntohl(th->seq);
4356 + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
4357 + fastopen = tp->fastopen_rsk;
4358 +@@ -411,11 +418,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4359 + goto out;
4360 +
4361 + tp->mtu_info = info;
4362 +- if (!sock_owned_by_user(sk)) {
4363 ++ if (!sock_owned_by_user(meta_sk)) {
4364 + tcp_v4_mtu_reduced(sk);
4365 + } else {
4366 + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
4367 + sock_hold(sk);
4368 ++ if (mptcp(tp))
4369 ++ mptcp_tsq_flags(sk);
4370 + }
4371 + goto out;
4372 + }
4373 +@@ -429,7 +438,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4374 + !icsk->icsk_backoff || fastopen)
4375 + break;
4376 +
4377 +- if (sock_owned_by_user(sk))
4378 ++ if (sock_owned_by_user(meta_sk))
4379 + break;
4380 +
4381 + icsk->icsk_backoff--;
4382 +@@ -463,7 +472,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4383 + switch (sk->sk_state) {
4384 + struct request_sock *req, **prev;
4385 + case TCP_LISTEN:
4386 +- if (sock_owned_by_user(sk))
4387 ++ if (sock_owned_by_user(meta_sk))
4388 + goto out;
4389 +
4390 + req = inet_csk_search_req(sk, &prev, th->dest,
4391 +@@ -499,7 +508,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4392 + if (fastopen && fastopen->sk == NULL)
4393 + break;
4394 +
4395 +- if (!sock_owned_by_user(sk)) {
4396 ++ if (!sock_owned_by_user(meta_sk)) {
4397 + sk->sk_err = err;
4398 +
4399 + sk->sk_error_report(sk);
4400 +@@ -528,7 +537,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4401 + */
4402 +
4403 + inet = inet_sk(sk);
4404 +- if (!sock_owned_by_user(sk) && inet->recverr) {
4405 ++ if (!sock_owned_by_user(meta_sk) && inet->recverr) {
4406 + sk->sk_err = err;
4407 + sk->sk_error_report(sk);
4408 + } else { /* Only an error on timeout */
4409 +@@ -536,7 +545,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4410 + }
4411 +
4412 + out:
4413 +- bh_unlock_sock(sk);
4414 ++ bh_unlock_sock(meta_sk);
4415 + sock_put(sk);
4416 + }
4417 +
4418 +@@ -578,7 +587,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
4419 + * Exception: precedence violation. We do not implement it in any case.
4420 + */
4421 +
4422 +-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
4423 ++void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
4424 + {
4425 + const struct tcphdr *th = tcp_hdr(skb);
4426 + struct {
4427 +@@ -702,10 +711,10 @@ release_sk1:
4428 + outside socket context is ugly, certainly. What can I do?
4429 + */
4430 +
4431 +-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
4432 ++static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
4433 + u32 win, u32 tsval, u32 tsecr, int oif,
4434 + struct tcp_md5sig_key *key,
4435 +- int reply_flags, u8 tos)
4436 ++ int reply_flags, u8 tos, int mptcp)
4437 + {
4438 + const struct tcphdr *th = tcp_hdr(skb);
4439 + struct {
4440 +@@ -714,6 +723,10 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
4441 + #ifdef CONFIG_TCP_MD5SIG
4442 + + (TCPOLEN_MD5SIG_ALIGNED >> 2)
4443 + #endif
4444 ++#ifdef CONFIG_MPTCP
4445 ++ + ((MPTCP_SUB_LEN_DSS >> 2) +
4446 ++ (MPTCP_SUB_LEN_ACK >> 2))
4447 ++#endif
4448 + ];
4449 + } rep;
4450 + struct ip_reply_arg arg;
4451 +@@ -758,6 +771,21 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
4452 + ip_hdr(skb)->daddr, &rep.th);
4453 + }
4454 + #endif
4455 ++#ifdef CONFIG_MPTCP
4456 ++ if (mptcp) {
4457 ++ int offset = (tsecr) ? 3 : 0;
4458 ++ /* Construction of 32-bit data_ack */
4459 ++ rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
4460 ++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
4461 ++ (0x20 << 8) |
4462 ++ (0x01));
4463 ++ rep.opt[offset] = htonl(data_ack);
4464 ++
4465 ++ arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
4466 ++ rep.th.doff = arg.iov[0].iov_len / 4;
4467 ++ }
4468 ++#endif /* CONFIG_MPTCP */
4469 ++
4470 + arg.flags = reply_flags;
4471 + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
4472 + ip_hdr(skb)->saddr, /* XXX */
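
The hunk above teaches tcp_v4_send_ack() to append a minimal DSS option carrying only a 32-bit Data ACK; tcp_v4_timewait_ack() below sets mptcp = 1 only when an MPTCP time-wait state exists, while the reqsk path keeps passing 0. Assuming the usual values from the MPTCP headers, TCPOPT_MPTCP == 30 and MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK == 8 (neither is shown in this hunk), the first 32-bit word works out as in this small check program:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t word = (30u   << 24)	/* kind: MPTCP option */
		      | (8u    << 16)	/* length: DSS header + 32-bit Data ACK */
		      | (0x20u <<  8)	/* subtype DSS (0x2) in the upper nibble */
		      |  0x01u;		/* flag 'A': Data ACK present, 32 bits */

	assert(word == 0x1e082001u);	/* on the wire after htonl(): 1e 08 20 01 */
	return 0;
}

The Data ACK itself, tcptw->mptcp_tw->rcv_nxt for TIME-WAIT sockets, follows in the next 32-bit word, and the option length is added to the reply's iov so that rep.th.doff covers it.
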
4473 +@@ -776,36 +804,44 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
4474 + {
4475 + struct inet_timewait_sock *tw = inet_twsk(sk);
4476 + struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
4477 ++ u32 data_ack = 0;
4478 ++ int mptcp = 0;
4479 ++
4480 ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
4481 ++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
4482 ++ mptcp = 1;
4483 ++ }
4484 +
4485 + tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
4486 ++ data_ack,
4487 + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
4488 + tcp_time_stamp + tcptw->tw_ts_offset,
4489 + tcptw->tw_ts_recent,
4490 + tw->tw_bound_dev_if,
4491 + tcp_twsk_md5_key(tcptw),
4492 + tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
4493 +- tw->tw_tos
4494 ++ tw->tw_tos, mptcp
4495 + );
4496 +
4497 + inet_twsk_put(tw);
4498 + }
4499 +
4500 +-static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
4501 +- struct request_sock *req)
4502 ++void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
4503 ++ struct request_sock *req)
4504 + {
4505 + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
4506 + * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
4507 + */
4508 + tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
4509 + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
4510 +- tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
4511 ++ tcp_rsk(req)->rcv_nxt, 0, req->rcv_wnd,
4512 + tcp_time_stamp,
4513 + req->ts_recent,
4514 + 0,
4515 + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
4516 + AF_INET),
4517 + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
4518 +- ip_hdr(skb)->tos);
4519 ++ ip_hdr(skb)->tos, 0);
4520 + }
4521 +
4522 + /*
4523 +@@ -813,10 +849,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
4524 + * This still operates on a request_sock only, not on a big
4525 + * socket.
4526 + */
4527 +-static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
4528 +- struct request_sock *req,
4529 +- u16 queue_mapping,
4530 +- struct tcp_fastopen_cookie *foc)
4531 ++int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
4532 ++ struct flowi *fl,
4533 ++ struct request_sock *req,
4534 ++ u16 queue_mapping,
4535 ++ struct tcp_fastopen_cookie *foc)
4536 + {
4537 + const struct inet_request_sock *ireq = inet_rsk(req);
4538 + struct flowi4 fl4;
4539 +@@ -844,21 +881,10 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
4540 + return err;
4541 + }
4542 +
4543 +-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
4544 +-{
4545 +- int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
4546 +-
4547 +- if (!res) {
4548 +- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
4549 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
4550 +- }
4551 +- return res;
4552 +-}
4553 +-
4554 + /*
4555 + * IPv4 request_sock destructor.
4556 + */
4557 +-static void tcp_v4_reqsk_destructor(struct request_sock *req)
4558 ++void tcp_v4_reqsk_destructor(struct request_sock *req)
4559 + {
4560 + kfree(inet_rsk(req)->opt);
4561 + }
4562 +@@ -896,7 +922,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
4563 + /*
4564 + * Save and compile IPv4 options into the request_sock if needed.
4565 + */
4566 +-static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
4567 ++struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
4568 + {
4569 + const struct ip_options *opt = &(IPCB(skb)->opt);
4570 + struct ip_options_rcu *dopt = NULL;
4571 +@@ -1237,161 +1263,71 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
4572 +
4573 + #endif
4574 +
4575 ++static int tcp_v4_init_req(struct request_sock *req, struct sock *sk,
4576 ++ struct sk_buff *skb)
4577 ++{
4578 ++ struct inet_request_sock *ireq = inet_rsk(req);
4579 ++
4580 ++ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
4581 ++ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
4582 ++ ireq->no_srccheck = inet_sk(sk)->transparent;
4583 ++ ireq->opt = tcp_v4_save_options(skb);
4584 ++ ireq->ir_mark = inet_request_mark(sk, skb);
4585 ++
4586 ++ return 0;
4587 ++}
4588 ++
4589 ++static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
4590 ++ const struct request_sock *req,
4591 ++ bool *strict)
4592 ++{
4593 ++ struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
4594 ++
4595 ++ if (strict) {
4596 ++ if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
4597 ++ *strict = true;
4598 ++ else
4599 ++ *strict = false;
4600 ++ }
4601 ++
4602 ++ return dst;
4603 ++}
4604 ++
4605 + struct request_sock_ops tcp_request_sock_ops __read_mostly = {
4606 + .family = PF_INET,
4607 + .obj_size = sizeof(struct tcp_request_sock),
4608 +- .rtx_syn_ack = tcp_v4_rtx_synack,
4609 ++ .rtx_syn_ack = tcp_rtx_synack,
4610 + .send_ack = tcp_v4_reqsk_send_ack,
4611 + .destructor = tcp_v4_reqsk_destructor,
4612 + .send_reset = tcp_v4_send_reset,
4613 + .syn_ack_timeout = tcp_syn_ack_timeout,
4614 + };
4615 +
4616 ++const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
4617 ++ .mss_clamp = TCP_MSS_DEFAULT,
4618 + #ifdef CONFIG_TCP_MD5SIG
4619 +-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
4620 + .md5_lookup = tcp_v4_reqsk_md5_lookup,
4621 + .calc_md5_hash = tcp_v4_md5_hash_skb,
4622 +-};
4623 + #endif
4624 ++ .init_req = tcp_v4_init_req,
4625 ++#ifdef CONFIG_SYN_COOKIES
4626 ++ .cookie_init_seq = cookie_v4_init_sequence,
4627 ++#endif
4628 ++ .route_req = tcp_v4_route_req,
4629 ++ .init_seq = tcp_v4_init_sequence,
4630 ++ .send_synack = tcp_v4_send_synack,
4631 ++ .queue_hash_add = inet_csk_reqsk_queue_hash_add,
4632 ++};
4633 +
4634 + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
4635 + {
4636 +- struct tcp_options_received tmp_opt;
4637 +- struct request_sock *req;
4638 +- struct inet_request_sock *ireq;
4639 +- struct tcp_sock *tp = tcp_sk(sk);
4640 +- struct dst_entry *dst = NULL;
4641 +- __be32 saddr = ip_hdr(skb)->saddr;
4642 +- __be32 daddr = ip_hdr(skb)->daddr;
4643 +- __u32 isn = TCP_SKB_CB(skb)->when;
4644 +- bool want_cookie = false, fastopen;
4645 +- struct flowi4 fl4;
4646 +- struct tcp_fastopen_cookie foc = { .len = -1 };
4647 +- int err;
4648 +-
4649 + /* Never answer to SYNs send to broadcast or multicast */
4650 + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
4651 + goto drop;
4652 +
4653 +- /* TW buckets are converted to open requests without
4654 +- * limitations, they conserve resources and peer is
4655 +- * evidently real one.
4656 +- */
4657 +- if ((sysctl_tcp_syncookies == 2 ||
4658 +- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
4659 +- want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
4660 +- if (!want_cookie)
4661 +- goto drop;
4662 +- }
4663 +-
4664 +- /* Accept backlog is full. If we have already queued enough
4665 +- * of warm entries in syn queue, drop request. It is better than
4666 +- * clogging syn queue with openreqs with exponentially increasing
4667 +- * timeout.
4668 +- */
4669 +- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
4670 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
4671 +- goto drop;
4672 +- }
4673 +-
4674 +- req = inet_reqsk_alloc(&tcp_request_sock_ops);
4675 +- if (!req)
4676 +- goto drop;
4677 +-
4678 +-#ifdef CONFIG_TCP_MD5SIG
4679 +- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
4680 +-#endif
4681 +-
4682 +- tcp_clear_options(&tmp_opt);
4683 +- tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
4684 +- tmp_opt.user_mss = tp->rx_opt.user_mss;
4685 +- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
4686 +-
4687 +- if (want_cookie && !tmp_opt.saw_tstamp)
4688 +- tcp_clear_options(&tmp_opt);
4689 +-
4690 +- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
4691 +- tcp_openreq_init(req, &tmp_opt, skb);
4692 ++ return tcp_conn_request(&tcp_request_sock_ops,
4693 ++ &tcp_request_sock_ipv4_ops, sk, skb);
4694 +
4695 +- ireq = inet_rsk(req);
4696 +- ireq->ir_loc_addr = daddr;
4697 +- ireq->ir_rmt_addr = saddr;
4698 +- ireq->no_srccheck = inet_sk(sk)->transparent;
4699 +- ireq->opt = tcp_v4_save_options(skb);
4700 +- ireq->ir_mark = inet_request_mark(sk, skb);
4701 +-
4702 +- if (security_inet_conn_request(sk, skb, req))
4703 +- goto drop_and_free;
4704 +-
4705 +- if (!want_cookie || tmp_opt.tstamp_ok)
4706 +- TCP_ECN_create_request(req, skb, sock_net(sk));
4707 +-
4708 +- if (want_cookie) {
4709 +- isn = cookie_v4_init_sequence(sk, skb, &req->mss);
4710 +- req->cookie_ts = tmp_opt.tstamp_ok;
4711 +- } else if (!isn) {
4712 +- /* VJ's idea. We save last timestamp seen
4713 +- * from the destination in peer table, when entering
4714 +- * state TIME-WAIT, and check against it before
4715 +- * accepting new connection request.
4716 +- *
4717 +- * If "isn" is not zero, this request hit alive
4718 +- * timewait bucket, so that all the necessary checks
4719 +- * are made in the function processing timewait state.
4720 +- */
4721 +- if (tmp_opt.saw_tstamp &&
4722 +- tcp_death_row.sysctl_tw_recycle &&
4723 +- (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
4724 +- fl4.daddr == saddr) {
4725 +- if (!tcp_peer_is_proven(req, dst, true)) {
4726 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
4727 +- goto drop_and_release;
4728 +- }
4729 +- }
4730 +- /* Kill the following clause, if you dislike this way. */
4731 +- else if (!sysctl_tcp_syncookies &&
4732 +- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
4733 +- (sysctl_max_syn_backlog >> 2)) &&
4734 +- !tcp_peer_is_proven(req, dst, false)) {
4735 +- /* Without syncookies last quarter of
4736 +- * backlog is filled with destinations,
4737 +- * proven to be alive.
4738 +- * It means that we continue to communicate
4739 +- * to destinations, already remembered
4740 +- * to the moment of synflood.
4741 +- */
4742 +- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
4743 +- &saddr, ntohs(tcp_hdr(skb)->source));
4744 +- goto drop_and_release;
4745 +- }
4746 +-
4747 +- isn = tcp_v4_init_sequence(skb);
4748 +- }
4749 +- if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
4750 +- goto drop_and_free;
4751 +-
4752 +- tcp_rsk(req)->snt_isn = isn;
4753 +- tcp_rsk(req)->snt_synack = tcp_time_stamp;
4754 +- tcp_openreq_init_rwin(req, sk, dst);
4755 +- fastopen = !want_cookie &&
4756 +- tcp_try_fastopen(sk, skb, req, &foc, dst);
4757 +- err = tcp_v4_send_synack(sk, dst, req,
4758 +- skb_get_queue_mapping(skb), &foc);
4759 +- if (!fastopen) {
4760 +- if (err || want_cookie)
4761 +- goto drop_and_free;
4762 +-
4763 +- tcp_rsk(req)->snt_synack = tcp_time_stamp;
4764 +- tcp_rsk(req)->listener = NULL;
4765 +- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
4766 +- }
4767 +-
4768 +- return 0;
4769 +-
4770 +-drop_and_release:
4771 +- dst_release(dst);
4772 +-drop_and_free:
4773 +- reqsk_free(req);
4774 + drop:
4775 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
4776 + return 0;
4777 +@@ -1497,7 +1433,7 @@ put_and_exit:
4778 + }
4779 + EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
4780 +
4781 +-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
4782 ++struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
4783 + {
4784 + struct tcphdr *th = tcp_hdr(skb);
4785 + const struct iphdr *iph = ip_hdr(skb);
4786 +@@ -1514,8 +1450,15 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
4787 +
4788 + if (nsk) {
4789 + if (nsk->sk_state != TCP_TIME_WAIT) {
4790 ++ /* Don't lock the meta-sk again; it has been locked
4791 ++ * before mptcp_v4_do_rcv.
4792 ++ */
4793 ++ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk))
4794 ++ bh_lock_sock(mptcp_meta_sk(nsk));
4795 + bh_lock_sock(nsk);
4796 ++
4797 + return nsk;
4798 ++
4799 + }
4800 + inet_twsk_put(inet_twsk(nsk));
4801 + return NULL;
4802 +@@ -1550,6 +1493,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
4803 + goto discard;
4804 + #endif
4805 +
4806 ++ if (is_meta_sk(sk))
4807 ++ return mptcp_v4_do_rcv(sk, skb);
4808 ++
4809 + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
4810 + struct dst_entry *dst = sk->sk_rx_dst;
4811 +
4812 +@@ -1681,7 +1627,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
4813 + } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
4814 + wake_up_interruptible_sync_poll(sk_sleep(sk),
4815 + POLLIN | POLLRDNORM | POLLRDBAND);
4816 +- if (!inet_csk_ack_scheduled(sk))
4817 ++ if (!inet_csk_ack_scheduled(sk) && !mptcp(tp))
4818 + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
4819 + (3 * tcp_rto_min(sk)) / 4,
4820 + TCP_RTO_MAX);
4821 +@@ -1698,7 +1644,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
4822 + {
4823 + const struct iphdr *iph;
4824 + const struct tcphdr *th;
4825 +- struct sock *sk;
4826 ++ struct sock *sk, *meta_sk = NULL;
4827 + int ret;
4828 + struct net *net = dev_net(skb->dev);
4829 +
4830 +@@ -1732,18 +1678,42 @@ int tcp_v4_rcv(struct sk_buff *skb)
4831 + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
4832 + skb->len - th->doff * 4);
4833 + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
4834 ++#ifdef CONFIG_MPTCP
4835 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
4836 ++ TCP_SKB_CB(skb)->dss_off = 0;
4837 ++#endif
4838 + TCP_SKB_CB(skb)->when = 0;
4839 + TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
4840 + TCP_SKB_CB(skb)->sacked = 0;
4841 +
4842 + sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
4843 +- if (!sk)
4844 +- goto no_tcp_socket;
4845 +
4846 + process:
4847 +- if (sk->sk_state == TCP_TIME_WAIT)
4848 ++ if (sk && sk->sk_state == TCP_TIME_WAIT)
4849 + goto do_time_wait;
4850 +
4851 ++#ifdef CONFIG_MPTCP
4852 ++ if (!sk && th->syn && !th->ack) {
4853 ++ int ret = mptcp_lookup_join(skb, NULL);
4854 ++
4855 ++ if (ret < 0) {
4856 ++ tcp_v4_send_reset(NULL, skb);
4857 ++ goto discard_it;
4858 ++ } else if (ret > 0) {
4859 ++ return 0;
4860 ++ }
4861 ++ }
4862 ++
4863 ++ /* Is there a pending request sock for this segment ? */
4864 ++ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
4865 ++ if (sk)
4866 ++ sock_put(sk);
4867 ++ return 0;
4868 ++ }
4869 ++#endif
4870 ++ if (!sk)
4871 ++ goto no_tcp_socket;
4872 ++
4873 + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
4874 + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
4875 + goto discard_and_relse;
4876 +@@ -1759,11 +1729,21 @@ process:
4877 + sk_mark_napi_id(sk, skb);
4878 + skb->dev = NULL;
4879 +
4880 +- bh_lock_sock_nested(sk);
4881 ++ if (mptcp(tcp_sk(sk))) {
4882 ++ meta_sk = mptcp_meta_sk(sk);
4883 ++
4884 ++ bh_lock_sock_nested(meta_sk);
4885 ++ if (sock_owned_by_user(meta_sk))
4886 ++ skb->sk = sk;
4887 ++ } else {
4888 ++ meta_sk = sk;
4889 ++ bh_lock_sock_nested(sk);
4890 ++ }
4891 ++
4892 + ret = 0;
4893 +- if (!sock_owned_by_user(sk)) {
4894 ++ if (!sock_owned_by_user(meta_sk)) {
4895 + #ifdef CONFIG_NET_DMA
4896 +- struct tcp_sock *tp = tcp_sk(sk);
4897 ++ struct tcp_sock *tp = tcp_sk(meta_sk);
4898 + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
4899 + tp->ucopy.dma_chan = net_dma_find_channel();
4900 + if (tp->ucopy.dma_chan)
4901 +@@ -1771,16 +1751,16 @@ process:
4902 + else
4903 + #endif
4904 + {
4905 +- if (!tcp_prequeue(sk, skb))
4906 ++ if (!tcp_prequeue(meta_sk, skb))
4907 + ret = tcp_v4_do_rcv(sk, skb);
4908 + }
4909 +- } else if (unlikely(sk_add_backlog(sk, skb,
4910 +- sk->sk_rcvbuf + sk->sk_sndbuf))) {
4911 +- bh_unlock_sock(sk);
4912 ++ } else if (unlikely(sk_add_backlog(meta_sk, skb,
4913 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
4914 ++ bh_unlock_sock(meta_sk);
4915 + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
4916 + goto discard_and_relse;
4917 + }
4918 +- bh_unlock_sock(sk);
4919 ++ bh_unlock_sock(meta_sk);
4920 +
4921 + sock_put(sk);
4922 +
4923 +@@ -1835,6 +1815,18 @@ do_time_wait:
4924 + sk = sk2;
4925 + goto process;
4926 + }
4927 ++#ifdef CONFIG_MPTCP
4928 ++ if (th->syn && !th->ack) {
4929 ++ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
4930 ++
4931 ++ if (ret < 0) {
4932 ++ tcp_v4_send_reset(NULL, skb);
4933 ++ goto discard_it;
4934 ++ } else if (ret > 0) {
4935 ++ return 0;
4936 ++ }
4937 ++ }
4938 ++#endif
4939 + /* Fall through to ACK */
4940 + }
4941 + case TCP_TW_ACK:
4942 +@@ -1900,7 +1892,12 @@ static int tcp_v4_init_sock(struct sock *sk)
4943 +
4944 + tcp_init_sock(sk);
4945 +
4946 +- icsk->icsk_af_ops = &ipv4_specific;
4947 ++#ifdef CONFIG_MPTCP
4948 ++ if (is_mptcp_enabled(sk))
4949 ++ icsk->icsk_af_ops = &mptcp_v4_specific;
4950 ++ else
4951 ++#endif
4952 ++ icsk->icsk_af_ops = &ipv4_specific;
4953 +
4954 + #ifdef CONFIG_TCP_MD5SIG
4955 + tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
4956 +@@ -1917,6 +1914,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
4957 +
4958 + tcp_cleanup_congestion_control(sk);
4959 +
4960 ++ if (mptcp(tp))
4961 ++ mptcp_destroy_sock(sk);
4962 ++ if (tp->inside_tk_table)
4963 ++ mptcp_hash_remove(tp);
4964 ++
4965 + /* Cleanup up the write buffer. */
4966 + tcp_write_queue_purge(sk);
4967 +
4968 +@@ -2481,6 +2483,19 @@ void tcp4_proc_exit(void)
4969 + }
4970 + #endif /* CONFIG_PROC_FS */
4971 +
4972 ++#ifdef CONFIG_MPTCP
4973 ++static void tcp_v4_clear_sk(struct sock *sk, int size)
4974 ++{
4975 ++ struct tcp_sock *tp = tcp_sk(sk);
4976 ++
4977 ++ /* we do not want to clear tk_table field, because of RCU lookups */
4978 ++ sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table));
4979 ++
4980 ++ size -= offsetof(struct tcp_sock, tk_table) + sizeof(tp->tk_table);
4981 ++ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size);
4982 ++}
4983 ++#endif
4984 ++
4985 + struct proto tcp_prot = {
4986 + .name = "TCP",
4987 + .owner = THIS_MODULE,
4988 +@@ -2528,6 +2543,9 @@ struct proto tcp_prot = {
4989 + .destroy_cgroup = tcp_destroy_cgroup,
4990 + .proto_cgroup = tcp_proto_cgroup,
4991 + #endif
4992 ++#ifdef CONFIG_MPTCP
4993 ++ .clear_sk = tcp_v4_clear_sk,
4994 ++#endif
4995 + };
4996 + EXPORT_SYMBOL(tcp_prot);
4997 +
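The tcp_ipv4.c hunks above fold the open-coded body of tcp_v4_conn_request() into a generic tcp_conn_request() driven by the tcp_request_sock_ipv4_ops table (init_req, route_req, init_seq, send_synack, queue_hash_add), so that IPv4, IPv6 and MPTCP can share one request path. The userspace sketch below illustrates that ops-table dispatch pattern in isolation; the struct and function names in it (fake_request, request_ops, ipv4_ops, ...) are invented for the example and are not part of the patch.

    #include <stdio.h>
    #include <stdint.h>

    /* Toy stand-in for a request sock. */
    struct fake_request {
            uint32_t local_addr;
            uint32_t remote_addr;
            uint32_t isn;           /* initial sequence number */
    };

    /* Family-specific steps, gathered in one table of callbacks. */
    struct request_ops {
            void (*init_req)(struct fake_request *req);
            uint32_t (*init_seq)(const struct fake_request *req);
            int (*send_synack)(const struct fake_request *req);
    };

    /* Generic path: every family-specific step goes through the ops table. */
    static int generic_conn_request(const struct request_ops *ops)
    {
            struct fake_request req = { 0, 0, 0 };

            ops->init_req(&req);
            req.isn = ops->init_seq(&req);
            return ops->send_synack(&req);
    }

    /* One concrete "family" implementation. */
    static void v4_init_req(struct fake_request *req)
    {
            req->local_addr = 0x7f000001;   /* 127.0.0.1 */
            req->remote_addr = 0x7f000002;  /* 127.0.0.2 */
    }

    static uint32_t v4_init_seq(const struct fake_request *req)
    {
            return req->local_addr ^ req->remote_addr;  /* toy ISN, not secure */
    }

    static int v4_send_synack(const struct fake_request *req)
    {
            printf("SYN/ACK with isn=%u\n", (unsigned int)req->isn);
            return 0;
    }

    static const struct request_ops ipv4_ops = {
            .init_req    = v4_init_req,
            .init_seq    = v4_init_seq,
            .send_synack = v4_send_synack,
    };

    int main(void)
    {
            /* A second table (e.g. for IPv6 or MPTCP) would reuse the same path. */
            return generic_conn_request(&ipv4_ops);
    }
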
4998 +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
4999 +index e68e0d4af6c9..ae6946857dff 100644
5000 +--- a/net/ipv4/tcp_minisocks.c
5001 ++++ b/net/ipv4/tcp_minisocks.c
5002 +@@ -18,11 +18,13 @@
5003 + * Jorge Cwik, <jorge@×××××××××××××.net>
5004 + */
5005 +
5006 ++#include <linux/kconfig.h>
5007 + #include <linux/mm.h>
5008 + #include <linux/module.h>
5009 + #include <linux/slab.h>
5010 + #include <linux/sysctl.h>
5011 + #include <linux/workqueue.h>
5012 ++#include <net/mptcp.h>
5013 + #include <net/tcp.h>
5014 + #include <net/inet_common.h>
5015 + #include <net/xfrm.h>
5016 +@@ -95,10 +97,13 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
5017 + struct tcp_options_received tmp_opt;
5018 + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
5019 + bool paws_reject = false;
5020 ++ struct mptcp_options_received mopt;
5021 +
5022 + tmp_opt.saw_tstamp = 0;
5023 + if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
5024 +- tcp_parse_options(skb, &tmp_opt, 0, NULL);
5025 ++ mptcp_init_mp_opt(&mopt);
5026 ++
5027 ++ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
5028 +
5029 + if (tmp_opt.saw_tstamp) {
5030 + tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
5031 +@@ -106,6 +111,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
5032 + tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
5033 + paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
5034 + }
5035 ++
5036 ++ if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) {
5037 ++ if (mopt.mptcp_key == tcptw->mptcp_tw->loc_key)
5038 ++ goto kill_with_rst;
5039 ++ }
5040 + }
5041 +
5042 + if (tw->tw_substate == TCP_FIN_WAIT2) {
5043 +@@ -128,6 +138,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
5044 + if (!th->ack ||
5045 + !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
5046 + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
5047 ++ /* If mptcp_is_data_fin() returns true, we are sure that
5048 ++ * mopt has been initialized - otherwise it would not
5049 ++ * be a DATA_FIN.
5050 ++ */
5051 ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw &&
5052 ++ mptcp_is_data_fin(skb) &&
5053 ++ TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
5054 ++ mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt)
5055 ++ return TCP_TW_ACK;
5056 ++
5057 + inet_twsk_put(tw);
5058 + return TCP_TW_SUCCESS;
5059 + }
5060 +@@ -290,6 +310,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
5061 + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
5062 + tcptw->tw_ts_offset = tp->tsoffset;
5063 +
5064 ++ if (mptcp(tp)) {
5065 ++ if (mptcp_init_tw_sock(sk, tcptw)) {
5066 ++ inet_twsk_free(tw);
5067 ++ goto exit;
5068 ++ }
5069 ++ } else {
5070 ++ tcptw->mptcp_tw = NULL;
5071 ++ }
5072 ++
5073 + #if IS_ENABLED(CONFIG_IPV6)
5074 + if (tw->tw_family == PF_INET6) {
5075 + struct ipv6_pinfo *np = inet6_sk(sk);
5076 +@@ -347,15 +376,18 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
5077 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
5078 + }
5079 +
5080 ++exit:
5081 + tcp_update_metrics(sk);
5082 + tcp_done(sk);
5083 + }
5084 +
5085 + void tcp_twsk_destructor(struct sock *sk)
5086 + {
5087 +-#ifdef CONFIG_TCP_MD5SIG
5088 + struct tcp_timewait_sock *twsk = tcp_twsk(sk);
5089 +
5090 ++ if (twsk->mptcp_tw)
5091 ++ mptcp_twsk_destructor(twsk);
5092 ++#ifdef CONFIG_TCP_MD5SIG
5093 + if (twsk->tw_md5_key)
5094 + kfree_rcu(twsk->tw_md5_key, rcu);
5095 + #endif
5096 +@@ -382,13 +414,14 @@ void tcp_openreq_init_rwin(struct request_sock *req,
5097 + req->window_clamp = tcp_full_space(sk);
5098 +
5099 + /* tcp_full_space because it is guaranteed to be the first packet */
5100 +- tcp_select_initial_window(tcp_full_space(sk),
5101 +- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
5102 ++ tp->ops->select_initial_window(tcp_full_space(sk),
5103 ++ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) -
5104 ++ (ireq->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0),
5105 + &req->rcv_wnd,
5106 + &req->window_clamp,
5107 + ireq->wscale_ok,
5108 + &rcv_wscale,
5109 +- dst_metric(dst, RTAX_INITRWND));
5110 ++ dst_metric(dst, RTAX_INITRWND), sk);
5111 + ireq->rcv_wscale = rcv_wscale;
5112 + }
5113 + EXPORT_SYMBOL(tcp_openreq_init_rwin);
5114 +@@ -499,6 +532,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
5115 + newtp->rx_opt.ts_recent_stamp = 0;
5116 + newtp->tcp_header_len = sizeof(struct tcphdr);
5117 + }
5118 ++ if (ireq->saw_mpc)
5119 ++ newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
5120 + newtp->tsoffset = 0;
5121 + #ifdef CONFIG_TCP_MD5SIG
5122 + newtp->md5sig_info = NULL; /*XXX*/
5123 +@@ -535,16 +570,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
5124 + bool fastopen)
5125 + {
5126 + struct tcp_options_received tmp_opt;
5127 ++ struct mptcp_options_received mopt;
5128 + struct sock *child;
5129 + const struct tcphdr *th = tcp_hdr(skb);
5130 + __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
5131 + bool paws_reject = false;
5132 +
5133 +- BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
5134 ++ BUG_ON(!mptcp(tcp_sk(sk)) && fastopen == (sk->sk_state == TCP_LISTEN));
5135 +
5136 + tmp_opt.saw_tstamp = 0;
5137 ++
5138 ++ mptcp_init_mp_opt(&mopt);
5139 ++
5140 + if (th->doff > (sizeof(struct tcphdr)>>2)) {
5141 +- tcp_parse_options(skb, &tmp_opt, 0, NULL);
5142 ++ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
5143 +
5144 + if (tmp_opt.saw_tstamp) {
5145 + tmp_opt.ts_recent = req->ts_recent;
5146 +@@ -583,7 +622,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
5147 + *
5148 + * Reset timer after retransmitting SYNACK, similar to
5149 + * the idea of fast retransmit in recovery.
5150 ++ *
5151 ++ * Fall back to TCP if MP_CAPABLE is not set.
5152 + */
5153 ++
5154 ++ if (inet_rsk(req)->saw_mpc && !mopt.saw_mpc)
5155 ++ inet_rsk(req)->saw_mpc = false;
5156 ++
5157 ++
5158 + if (!inet_rtx_syn_ack(sk, req))
5159 + req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
5160 + TCP_RTO_MAX) + jiffies;
5161 +@@ -718,9 +764,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
5162 + * socket is created, wait for troubles.
5163 + */
5164 + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
5165 ++
5166 + if (child == NULL)
5167 + goto listen_overflow;
5168 +
5169 ++ if (!is_meta_sk(sk)) {
5170 ++ int ret = mptcp_check_req_master(sk, child, req, prev);
5171 ++ if (ret < 0)
5172 ++ goto listen_overflow;
5173 ++
5174 ++ /* MPTCP-supported */
5175 ++ if (!ret)
5176 ++ return tcp_sk(child)->mpcb->master_sk;
5177 ++ } else {
5178 ++ return mptcp_check_req_child(sk, child, req, prev, &mopt);
5179 ++ }
5180 + inet_csk_reqsk_queue_unlink(sk, req, prev);
5181 + inet_csk_reqsk_queue_removed(sk, req);
5182 +
5183 +@@ -746,7 +804,17 @@ embryonic_reset:
5184 + tcp_reset(sk);
5185 + }
5186 + if (!fastopen) {
5187 +- inet_csk_reqsk_queue_drop(sk, req, prev);
5188 ++ if (is_meta_sk(sk)) {
5189 ++ /* We want to avoid stopping the keepalive-timer and so
5190 ++ * avoid ending up in inet_csk_reqsk_queue_removed ...
5191 ++ */
5192 ++ inet_csk_reqsk_queue_unlink(sk, req, prev);
5193 ++ if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0)
5194 ++ mptcp_delete_synack_timer(sk);
5195 ++ reqsk_free(req);
5196 ++ } else {
5197 ++ inet_csk_reqsk_queue_drop(sk, req, prev);
5198 ++ }
5199 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
5200 + }
5201 + return NULL;
5202 +@@ -770,8 +838,9 @@ int tcp_child_process(struct sock *parent, struct sock *child,
5203 + {
5204 + int ret = 0;
5205 + int state = child->sk_state;
5206 ++ struct sock *meta_sk = mptcp(tcp_sk(child)) ? mptcp_meta_sk(child) : child;
5207 +
5208 +- if (!sock_owned_by_user(child)) {
5209 ++ if (!sock_owned_by_user(meta_sk)) {
5210 + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
5211 + skb->len);
5212 + /* Wakeup parent, send SIGIO */
5213 +@@ -782,10 +851,14 @@ int tcp_child_process(struct sock *parent, struct sock *child,
5214 + * in main socket hash table and lock on listening
5215 + * socket does not protect us more.
5216 + */
5217 +- __sk_add_backlog(child, skb);
5218 ++ if (mptcp(tcp_sk(child)))
5219 ++ skb->sk = child;
5220 ++ __sk_add_backlog(meta_sk, skb);
5221 + }
5222 +
5223 +- bh_unlock_sock(child);
5224 ++ if (mptcp(tcp_sk(child)))
5225 ++ bh_unlock_sock(child);
5226 ++ bh_unlock_sock(meta_sk);
5227 + sock_put(child);
5228 + return ret;
5229 + }
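The hunks in tcp_ipv4.c and tcp_minisocks.c above keep applying the same rule: when a subflow belongs to an MPTCP connection, lock-ownership checks and backlog queuing are done on the meta socket returned by mptcp_meta_sk(), otherwise on the subflow socket itself. The standalone sketch below mimics only that decision; its types and helpers (toy_sock, lock_target, ...) are made up for illustration and merely approximate the kernel logic.

    #include <stdio.h>
    #include <stdbool.h>

    struct toy_sock {
            bool is_mptcp_subflow;
            struct toy_sock *meta;  /* NULL unless part of an MPTCP connection */
            int backlog;
            bool owned_by_user;
    };

    /* Pick the socket whose lock and backlog govern processing. */
    static struct toy_sock *lock_target(struct toy_sock *sk)
    {
            return (sk->is_mptcp_subflow && sk->meta) ? sk->meta : sk;
    }

    static void receive_segment(struct toy_sock *sk)
    {
            struct toy_sock *owner = lock_target(sk);

            if (!owner->owned_by_user)
                    printf("process segment directly (%s)\n",
                           owner == sk ? "plain TCP" : "MPTCP meta");
            else
                    owner->backlog++;       /* defer, as __sk_add_backlog() would */
    }

    int main(void)
    {
            struct toy_sock meta  = { .owned_by_user = true };
            struct toy_sock sub   = { .is_mptcp_subflow = true, .meta = &meta };
            struct toy_sock plain = { 0 };

            receive_segment(&plain);        /* handled immediately */
            receive_segment(&sub);          /* queued on the meta backlog */
            printf("meta backlog = %d\n", meta.backlog);
            return 0;
    }
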
5230 +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
5231 +index 179b51e6bda3..efd31b6c5784 100644
5232 +--- a/net/ipv4/tcp_output.c
5233 ++++ b/net/ipv4/tcp_output.c
5234 +@@ -36,6 +36,12 @@
5235 +
5236 + #define pr_fmt(fmt) "TCP: " fmt
5237 +
5238 ++#include <net/mptcp.h>
5239 ++#include <net/mptcp_v4.h>
5240 ++#if IS_ENABLED(CONFIG_IPV6)
5241 ++#include <net/mptcp_v6.h>
5242 ++#endif
5243 ++#include <net/ipv6.h>
5244 + #include <net/tcp.h>
5245 +
5246 + #include <linux/compiler.h>
5247 +@@ -68,11 +74,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
5248 + unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
5249 + EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
5250 +
5251 +-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5252 +- int push_one, gfp_t gfp);
5253 +-
5254 + /* Account for new data that has been sent to the network. */
5255 +-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
5256 ++void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
5257 + {
5258 + struct inet_connection_sock *icsk = inet_csk(sk);
5259 + struct tcp_sock *tp = tcp_sk(sk);
5260 +@@ -214,7 +217,7 @@ u32 tcp_default_init_rwnd(u32 mss)
5261 + void tcp_select_initial_window(int __space, __u32 mss,
5262 + __u32 *rcv_wnd, __u32 *window_clamp,
5263 + int wscale_ok, __u8 *rcv_wscale,
5264 +- __u32 init_rcv_wnd)
5265 ++ __u32 init_rcv_wnd, const struct sock *sk)
5266 + {
5267 + unsigned int space = (__space < 0 ? 0 : __space);
5268 +
5269 +@@ -269,12 +272,16 @@ EXPORT_SYMBOL(tcp_select_initial_window);
5270 + * value can be stuffed directly into th->window for an outgoing
5271 + * frame.
5272 + */
5273 +-static u16 tcp_select_window(struct sock *sk)
5274 ++u16 tcp_select_window(struct sock *sk)
5275 + {
5276 + struct tcp_sock *tp = tcp_sk(sk);
5277 + u32 old_win = tp->rcv_wnd;
5278 +- u32 cur_win = tcp_receive_window(tp);
5279 +- u32 new_win = __tcp_select_window(sk);
5280 ++ /* The window must never shrink at the meta-level. At the subflow level
5281 ++ * we have to allow it to shrink; otherwise we may announce a window too
5282 ++ * large for the current meta-level sk_rcvbuf.
5283 ++ */
5284 ++ u32 cur_win = tcp_receive_window(mptcp(tp) ? tcp_sk(mptcp_meta_sk(sk)) : tp);
5285 ++ u32 new_win = tp->ops->__select_window(sk);
5286 +
5287 + /* Never shrink the offered window */
5288 + if (new_win < cur_win) {
5289 +@@ -290,6 +297,7 @@ static u16 tcp_select_window(struct sock *sk)
5290 + LINUX_MIB_TCPWANTZEROWINDOWADV);
5291 + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
5292 + }
5293 ++
5294 + tp->rcv_wnd = new_win;
5295 + tp->rcv_wup = tp->rcv_nxt;
5296 +
5297 +@@ -374,7 +382,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
5298 + /* Constructs common control bits of non-data skb. If SYN/FIN is present,
5299 + * auto increment end seqno.
5300 + */
5301 +-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
5302 ++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
5303 + {
5304 + struct skb_shared_info *shinfo = skb_shinfo(skb);
5305 +
5306 +@@ -394,7 +402,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
5307 + TCP_SKB_CB(skb)->end_seq = seq;
5308 + }
5309 +
5310 +-static inline bool tcp_urg_mode(const struct tcp_sock *tp)
5311 ++bool tcp_urg_mode(const struct tcp_sock *tp)
5312 + {
5313 + return tp->snd_una != tp->snd_up;
5314 + }
5315 +@@ -404,17 +412,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
5316 + #define OPTION_MD5 (1 << 2)
5317 + #define OPTION_WSCALE (1 << 3)
5318 + #define OPTION_FAST_OPEN_COOKIE (1 << 8)
5319 +-
5320 +-struct tcp_out_options {
5321 +- u16 options; /* bit field of OPTION_* */
5322 +- u16 mss; /* 0 to disable */
5323 +- u8 ws; /* window scale, 0 to disable */
5324 +- u8 num_sack_blocks; /* number of SACK blocks to include */
5325 +- u8 hash_size; /* bytes in hash_location */
5326 +- __u8 *hash_location; /* temporary pointer, overloaded */
5327 +- __u32 tsval, tsecr; /* need to include OPTION_TS */
5328 +- struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
5329 +-};
5330 ++/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */
5331 +
5332 + /* Write previously computed TCP options to the packet.
5333 + *
5334 +@@ -430,7 +428,7 @@ struct tcp_out_options {
5335 + * (but it may well be that other scenarios fail similarly).
5336 + */
5337 + static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
5338 +- struct tcp_out_options *opts)
5339 ++ struct tcp_out_options *opts, struct sk_buff *skb)
5340 + {
5341 + u16 options = opts->options; /* mungable copy */
5342 +
5343 +@@ -513,6 +511,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
5344 + }
5345 + ptr += (foc->len + 3) >> 2;
5346 + }
5347 ++
5348 ++ if (unlikely(OPTION_MPTCP & opts->options))
5349 ++ mptcp_options_write(ptr, tp, opts, skb);
5350 + }
5351 +
5352 + /* Compute TCP options for SYN packets. This is not the final
5353 +@@ -564,6 +565,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
5354 + if (unlikely(!(OPTION_TS & opts->options)))
5355 + remaining -= TCPOLEN_SACKPERM_ALIGNED;
5356 + }
5357 ++ if (tp->request_mptcp || mptcp(tp))
5358 ++ mptcp_syn_options(sk, opts, &remaining);
5359 +
5360 + if (fastopen && fastopen->cookie.len >= 0) {
5361 + u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
5362 +@@ -637,6 +640,9 @@ static unsigned int tcp_synack_options(struct sock *sk,
5363 + }
5364 + }
5365 +
5366 ++ if (ireq->saw_mpc)
5367 ++ mptcp_synack_options(req, opts, &remaining);
5368 ++
5369 + return MAX_TCP_OPTION_SPACE - remaining;
5370 + }
5371 +
5372 +@@ -670,16 +676,22 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
5373 + opts->tsecr = tp->rx_opt.ts_recent;
5374 + size += TCPOLEN_TSTAMP_ALIGNED;
5375 + }
5376 ++ if (mptcp(tp))
5377 ++ mptcp_established_options(sk, skb, opts, &size);
5378 +
5379 + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
5380 + if (unlikely(eff_sacks)) {
5381 +- const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
5382 +- opts->num_sack_blocks =
5383 +- min_t(unsigned int, eff_sacks,
5384 +- (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
5385 +- TCPOLEN_SACK_PERBLOCK);
5386 +- size += TCPOLEN_SACK_BASE_ALIGNED +
5387 +- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
5388 ++ const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
5389 ++ if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
5390 ++ opts->num_sack_blocks = 0;
5391 ++ else
5392 ++ opts->num_sack_blocks =
5393 ++ min_t(unsigned int, eff_sacks,
5394 ++ (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
5395 ++ TCPOLEN_SACK_PERBLOCK);
5396 ++ if (opts->num_sack_blocks)
5397 ++ size += TCPOLEN_SACK_BASE_ALIGNED +
5398 ++ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
5399 + }
5400 +
5401 + return size;
5402 +@@ -711,8 +723,8 @@ static void tcp_tsq_handler(struct sock *sk)
5403 + if ((1 << sk->sk_state) &
5404 + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
5405 + TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
5406 +- tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
5407 +- 0, GFP_ATOMIC);
5408 ++ tcp_sk(sk)->ops->write_xmit(sk, tcp_current_mss(sk),
5409 ++ tcp_sk(sk)->nonagle, 0, GFP_ATOMIC);
5410 + }
5411 + /*
5412 + * One tasklet per cpu tries to send more skbs.
5413 +@@ -727,7 +739,7 @@ static void tcp_tasklet_func(unsigned long data)
5414 + unsigned long flags;
5415 + struct list_head *q, *n;
5416 + struct tcp_sock *tp;
5417 +- struct sock *sk;
5418 ++ struct sock *sk, *meta_sk;
5419 +
5420 + local_irq_save(flags);
5421 + list_splice_init(&tsq->head, &list);
5422 +@@ -738,15 +750,25 @@ static void tcp_tasklet_func(unsigned long data)
5423 + list_del(&tp->tsq_node);
5424 +
5425 + sk = (struct sock *)tp;
5426 +- bh_lock_sock(sk);
5427 ++ meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
5428 ++ bh_lock_sock(meta_sk);
5429 +
5430 +- if (!sock_owned_by_user(sk)) {
5431 ++ if (!sock_owned_by_user(meta_sk)) {
5432 + tcp_tsq_handler(sk);
5433 ++ if (mptcp(tp))
5434 ++ tcp_tsq_handler(meta_sk);
5435 + } else {
5436 ++ if (mptcp(tp) && sk->sk_state == TCP_CLOSE)
5437 ++ goto exit;
5438 ++
5439 + /* defer the work to tcp_release_cb() */
5440 + set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
5441 ++
5442 ++ if (mptcp(tp))
5443 ++ mptcp_tsq_flags(sk);
5444 + }
5445 +- bh_unlock_sock(sk);
5446 ++exit:
5447 ++ bh_unlock_sock(meta_sk);
5448 +
5449 + clear_bit(TSQ_QUEUED, &tp->tsq_flags);
5450 + sk_free(sk);
5451 +@@ -756,7 +778,10 @@ static void tcp_tasklet_func(unsigned long data)
5452 + #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
5453 + (1UL << TCP_WRITE_TIMER_DEFERRED) | \
5454 + (1UL << TCP_DELACK_TIMER_DEFERRED) | \
5455 +- (1UL << TCP_MTU_REDUCED_DEFERRED))
5456 ++ (1UL << TCP_MTU_REDUCED_DEFERRED) | \
5457 ++ (1UL << MPTCP_PATH_MANAGER) | \
5458 ++ (1UL << MPTCP_SUB_DEFERRED))
5459 ++
5460 + /**
5461 + * tcp_release_cb - tcp release_sock() callback
5462 + * @sk: socket
5463 +@@ -803,6 +828,13 @@ void tcp_release_cb(struct sock *sk)
5464 + sk->sk_prot->mtu_reduced(sk);
5465 + __sock_put(sk);
5466 + }
5467 ++ if (flags & (1UL << MPTCP_PATH_MANAGER)) {
5468 ++ if (tcp_sk(sk)->mpcb->pm_ops->release_sock)
5469 ++ tcp_sk(sk)->mpcb->pm_ops->release_sock(sk);
5470 ++ __sock_put(sk);
5471 ++ }
5472 ++ if (flags & (1UL << MPTCP_SUB_DEFERRED))
5473 ++ mptcp_tsq_sub_deferred(sk);
5474 + }
5475 + EXPORT_SYMBOL(tcp_release_cb);
5476 +
5477 +@@ -862,8 +894,8 @@ void tcp_wfree(struct sk_buff *skb)
5478 + * We are working here with either a clone of the original
5479 + * SKB, or a fresh unique copy made by the retransmit engine.
5480 + */
5481 +-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5482 +- gfp_t gfp_mask)
5483 ++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5484 ++ gfp_t gfp_mask)
5485 + {
5486 + const struct inet_connection_sock *icsk = inet_csk(sk);
5487 + struct inet_sock *inet;
5488 +@@ -933,7 +965,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5489 + */
5490 + th->window = htons(min(tp->rcv_wnd, 65535U));
5491 + } else {
5492 +- th->window = htons(tcp_select_window(sk));
5493 ++ th->window = htons(tp->ops->select_window(sk));
5494 + }
5495 + th->check = 0;
5496 + th->urg_ptr = 0;
5497 +@@ -949,7 +981,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5498 + }
5499 + }
5500 +
5501 +- tcp_options_write((__be32 *)(th + 1), tp, &opts);
5502 ++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
5503 + if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
5504 + TCP_ECN_send(sk, skb, tcp_header_size);
5505 +
5506 +@@ -988,7 +1020,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5507 + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
5508 + * otherwise socket can stall.
5509 + */
5510 +-static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
5511 ++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
5512 + {
5513 + struct tcp_sock *tp = tcp_sk(sk);
5514 +
5515 +@@ -1001,15 +1033,16 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
5516 + }
5517 +
5518 + /* Initialize TSO segments for a packet. */
5519 +-static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
5520 +- unsigned int mss_now)
5521 ++void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
5522 ++ unsigned int mss_now)
5523 + {
5524 + struct skb_shared_info *shinfo = skb_shinfo(skb);
5525 +
5526 + /* Make sure we own this skb before messing gso_size/gso_segs */
5527 + WARN_ON_ONCE(skb_cloned(skb));
5528 +
5529 +- if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
5530 ++ if (skb->len <= mss_now || (is_meta_sk(sk) && !mptcp_sk_can_gso(sk)) ||
5531 ++ (!is_meta_sk(sk) && !sk_can_gso(sk)) || skb->ip_summed == CHECKSUM_NONE) {
5532 + /* Avoid the costly divide in the normal
5533 + * non-TSO case.
5534 + */
5535 +@@ -1041,7 +1074,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
5536 + /* Pcount in the middle of the write queue got changed, we need to do various
5537 + * tweaks to fix counters
5538 + */
5539 +-static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
5540 ++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
5541 + {
5542 + struct tcp_sock *tp = tcp_sk(sk);
5543 +
5544 +@@ -1164,7 +1197,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
5545 + * eventually). The difference is that pulled data not copied, but
5546 + * immediately discarded.
5547 + */
5548 +-static void __pskb_trim_head(struct sk_buff *skb, int len)
5549 ++void __pskb_trim_head(struct sk_buff *skb, int len)
5550 + {
5551 + struct skb_shared_info *shinfo;
5552 + int i, k, eat;
5553 +@@ -1205,6 +1238,9 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
5554 + /* Remove acked data from a packet in the transmit queue. */
5555 + int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
5556 + {
5557 ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk) && mptcp_is_data_seq(skb))
5558 ++ return mptcp_trim_head(sk, skb, len);
5559 ++
5560 + if (skb_unclone(skb, GFP_ATOMIC))
5561 + return -ENOMEM;
5562 +
5563 +@@ -1222,6 +1258,15 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
5564 + if (tcp_skb_pcount(skb) > 1)
5565 + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
5566 +
5567 ++#ifdef CONFIG_MPTCP
5568 ++ /* Some data got acked - we assume the sequence number reached the dest.
5569 ++ * In any case, our MPTCP option has been trimmed above - we lost it here.
5570 ++ * Only remove the SEQ flag if the call does not come from a meta retransmit.
5571 ++ */
5572 ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
5573 ++ TCP_SKB_CB(skb)->mptcp_flags &= ~MPTCPHDR_SEQ;
5574 ++#endif
5575 ++
5576 + return 0;
5577 + }
5578 +
5579 +@@ -1379,6 +1424,7 @@ unsigned int tcp_current_mss(struct sock *sk)
5580 +
5581 + return mss_now;
5582 + }
5583 ++EXPORT_SYMBOL(tcp_current_mss);
5584 +
5585 + /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
5586 + * As additional protections, we do not touch cwnd in retransmission phases,
5587 +@@ -1446,8 +1492,8 @@ static bool tcp_minshall_check(const struct tcp_sock *tp)
5588 + * But we can avoid doing the divide again given we already have
5589 + * skb_pcount = skb->len / mss_now
5590 + */
5591 +-static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
5592 +- const struct sk_buff *skb)
5593 ++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
5594 ++ const struct sk_buff *skb)
5595 + {
5596 + if (skb->len < tcp_skb_pcount(skb) * mss_now)
5597 + tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
5598 +@@ -1468,11 +1514,11 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
5599 + (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
5600 + }
5601 + /* Returns the portion of skb which can be sent right away */
5602 +-static unsigned int tcp_mss_split_point(const struct sock *sk,
5603 +- const struct sk_buff *skb,
5604 +- unsigned int mss_now,
5605 +- unsigned int max_segs,
5606 +- int nonagle)
5607 ++unsigned int tcp_mss_split_point(const struct sock *sk,
5608 ++ const struct sk_buff *skb,
5609 ++ unsigned int mss_now,
5610 ++ unsigned int max_segs,
5611 ++ int nonagle)
5612 + {
5613 + const struct tcp_sock *tp = tcp_sk(sk);
5614 + u32 partial, needed, window, max_len;
5615 +@@ -1502,13 +1548,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
5616 + /* Can at least one segment of SKB be sent right now, according to the
5617 + * congestion window rules? If so, return how many segments are allowed.
5618 + */
5619 +-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
5620 +- const struct sk_buff *skb)
5621 ++unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
5622 ++ const struct sk_buff *skb)
5623 + {
5624 + u32 in_flight, cwnd;
5625 +
5626 + /* Don't be strict about the congestion window for the final FIN. */
5627 +- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
5628 ++ if (skb &&
5629 ++ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
5630 + tcp_skb_pcount(skb) == 1)
5631 + return 1;
5632 +
5633 +@@ -1524,8 +1571,8 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
5634 + * This must be invoked the first time we consider transmitting
5635 + * SKB onto the wire.
5636 + */
5637 +-static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
5638 +- unsigned int mss_now)
5639 ++int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
5640 ++ unsigned int mss_now)
5641 + {
5642 + int tso_segs = tcp_skb_pcount(skb);
5643 +
5644 +@@ -1540,8 +1587,8 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
5645 + /* Return true if the Nagle test allows this packet to be
5646 + * sent now.
5647 + */
5648 +-static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
5649 +- unsigned int cur_mss, int nonagle)
5650 ++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
5651 ++ unsigned int cur_mss, int nonagle)
5652 + {
5653 + /* Nagle rule does not apply to frames, which sit in the middle of the
5654 + * write_queue (they have no chances to get new data).
5655 +@@ -1553,7 +1600,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
5656 + return true;
5657 +
5658 + /* Don't use the nagle rule for urgent data (or for the final FIN). */
5659 +- if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
5660 ++ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
5661 ++ mptcp_is_data_fin(skb))
5662 + return true;
5663 +
5664 + if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
5665 +@@ -1563,9 +1611,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
5666 + }
5667 +
5668 + /* Does at least the first segment of SKB fit into the send window? */
5669 +-static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
5670 +- const struct sk_buff *skb,
5671 +- unsigned int cur_mss)
5672 ++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
5673 ++ unsigned int cur_mss)
5674 + {
5675 + u32 end_seq = TCP_SKB_CB(skb)->end_seq;
5676 +
5677 +@@ -1676,7 +1723,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
5678 + u32 send_win, cong_win, limit, in_flight;
5679 + int win_divisor;
5680 +
5681 +- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
5682 ++ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb))
5683 + goto send_now;
5684 +
5685 + if (icsk->icsk_ca_state != TCP_CA_Open)
5686 +@@ -1888,7 +1935,7 @@ static int tcp_mtu_probe(struct sock *sk)
5687 + * Returns true, if no segments are in flight and we have queued segments,
5688 + * but cannot send anything now because of SWS or another problem.
5689 + */
5690 +-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5691 ++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5692 + int push_one, gfp_t gfp)
5693 + {
5694 + struct tcp_sock *tp = tcp_sk(sk);
5695 +@@ -1900,7 +1947,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5696 +
5697 + sent_pkts = 0;
5698 +
5699 +- if (!push_one) {
5700 ++ /* pmtu probing is not yet supported with MPTCP. It should be possible by
5701 ++ * exiting the loop inside tcp_mtu_probe early, making sure that only a
5702 ++ * single DSS-mapping gets probed.
5703 ++ */
5704 ++ if (!push_one && !mptcp(tp)) {
5705 + /* Do MTU probing. */
5706 + result = tcp_mtu_probe(sk);
5707 + if (!result) {
5708 +@@ -2099,7 +2150,8 @@ void tcp_send_loss_probe(struct sock *sk)
5709 + int err = -1;
5710 +
5711 + if (tcp_send_head(sk) != NULL) {
5712 +- err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
5713 ++ err = tp->ops->write_xmit(sk, mss, TCP_NAGLE_OFF, 2,
5714 ++ GFP_ATOMIC);
5715 + goto rearm_timer;
5716 + }
5717 +
5718 +@@ -2159,8 +2211,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
5719 + if (unlikely(sk->sk_state == TCP_CLOSE))
5720 + return;
5721 +
5722 +- if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
5723 +- sk_gfp_atomic(sk, GFP_ATOMIC)))
5724 ++ if (tcp_sk(sk)->ops->write_xmit(sk, cur_mss, nonagle, 0,
5725 ++ sk_gfp_atomic(sk, GFP_ATOMIC)))
5726 + tcp_check_probe_timer(sk);
5727 + }
5728 +
5729 +@@ -2173,7 +2225,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
5730 +
5731 + BUG_ON(!skb || skb->len < mss_now);
5732 +
5733 +- tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
5734 ++ tcp_sk(sk)->ops->write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1,
5735 ++ sk->sk_allocation);
5736 + }
5737 +
5738 + /* This function returns the amount that we can raise the
5739 +@@ -2386,6 +2439,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
5740 + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
5741 + return;
5742 +
5743 ++ /* Currently not supported for MPTCP - but it should be possible */
5744 ++ if (mptcp(tp))
5745 ++ return;
5746 ++
5747 + tcp_for_write_queue_from_safe(skb, tmp, sk) {
5748 + if (!tcp_can_collapse(sk, skb))
5749 + break;
5750 +@@ -2843,7 +2900,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
5751 +
5752 + /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
5753 + th->window = htons(min(req->rcv_wnd, 65535U));
5754 +- tcp_options_write((__be32 *)(th + 1), tp, &opts);
5755 ++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
5756 + th->doff = (tcp_header_size >> 2);
5757 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
5758 +
5759 +@@ -2897,13 +2954,13 @@ static void tcp_connect_init(struct sock *sk)
5760 + (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
5761 + tp->window_clamp = tcp_full_space(sk);
5762 +
5763 +- tcp_select_initial_window(tcp_full_space(sk),
5764 +- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
5765 +- &tp->rcv_wnd,
5766 +- &tp->window_clamp,
5767 +- sysctl_tcp_window_scaling,
5768 +- &rcv_wscale,
5769 +- dst_metric(dst, RTAX_INITRWND));
5770 ++ tp->ops->select_initial_window(tcp_full_space(sk),
5771 ++ tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
5772 ++ &tp->rcv_wnd,
5773 ++ &tp->window_clamp,
5774 ++ sysctl_tcp_window_scaling,
5775 ++ &rcv_wscale,
5776 ++ dst_metric(dst, RTAX_INITRWND), sk);
5777 +
5778 + tp->rx_opt.rcv_wscale = rcv_wscale;
5779 + tp->rcv_ssthresh = tp->rcv_wnd;
5780 +@@ -2927,6 +2984,36 @@ static void tcp_connect_init(struct sock *sk)
5781 + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
5782 + inet_csk(sk)->icsk_retransmits = 0;
5783 + tcp_clear_retrans(tp);
5784 ++
5785 ++#ifdef CONFIG_MPTCP
5786 ++ if (sysctl_mptcp_enabled && mptcp_doit(sk)) {
5787 ++ if (is_master_tp(tp)) {
5788 ++ tp->request_mptcp = 1;
5789 ++ mptcp_connect_init(sk);
5790 ++ } else if (tp->mptcp) {
5791 ++ struct inet_sock *inet = inet_sk(sk);
5792 ++
5793 ++ tp->mptcp->snt_isn = tp->write_seq;
5794 ++ tp->mptcp->init_rcv_wnd = tp->rcv_wnd;
5795 ++
5796 ++ /* Set nonce for new subflows */
5797 ++ if (sk->sk_family == AF_INET)
5798 ++ tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(
5799 ++ inet->inet_saddr,
5800 ++ inet->inet_daddr,
5801 ++ inet->inet_sport,
5802 ++ inet->inet_dport);
5803 ++#if IS_ENABLED(CONFIG_IPV6)
5804 ++ else
5805 ++ tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(
5806 ++ inet6_sk(sk)->saddr.s6_addr32,
5807 ++ sk->sk_v6_daddr.s6_addr32,
5808 ++ inet->inet_sport,
5809 ++ inet->inet_dport);
5810 ++#endif
5811 ++ }
5812 ++ }
5813 ++#endif
5814 + }
5815 +
5816 + static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
5817 +@@ -3176,6 +3263,7 @@ void tcp_send_ack(struct sock *sk)
5818 + TCP_SKB_CB(buff)->when = tcp_time_stamp;
5819 + tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
5820 + }
5821 ++EXPORT_SYMBOL(tcp_send_ack);
5822 +
5823 + /* This routine sends a packet with an out of date sequence
5824 + * number. It assumes the other end will try to ack it.
5825 +@@ -3188,7 +3276,7 @@ void tcp_send_ack(struct sock *sk)
5826 + * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
5827 + * out-of-date with SND.UNA-1 to probe window.
5828 + */
5829 +-static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
5830 ++int tcp_xmit_probe_skb(struct sock *sk, int urgent)
5831 + {
5832 + struct tcp_sock *tp = tcp_sk(sk);
5833 + struct sk_buff *skb;
5834 +@@ -3270,7 +3358,7 @@ void tcp_send_probe0(struct sock *sk)
5835 + struct tcp_sock *tp = tcp_sk(sk);
5836 + int err;
5837 +
5838 +- err = tcp_write_wakeup(sk);
5839 ++ err = tp->ops->write_wakeup(sk);
5840 +
5841 + if (tp->packets_out || !tcp_send_head(sk)) {
5842 + /* Cancel probe timer, if it is not required. */
5843 +@@ -3301,3 +3389,18 @@ void tcp_send_probe0(struct sock *sk)
5844 + TCP_RTO_MAX);
5845 + }
5846 + }
5847 ++
5848 ++int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
5849 ++{
5850 ++ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
5851 ++ struct flowi fl;
5852 ++ int res;
5853 ++
5854 ++ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
5855 ++ if (!res) {
5856 ++ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
5857 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
5858 ++ }
5859 ++ return res;
5860 ++}
5861 ++EXPORT_SYMBOL(tcp_rtx_synack);
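One detail worth noting in the tcp_output.c hunks is the reworked SACK sizing in tcp_established_options(): since MPTCP options may consume most of the 40-byte TCP option space, the remaining room can drop below TCPOLEN_SACK_BASE_ALIGNED, so the number of SACK blocks has to be clamped to zero instead of letting the unsigned subtraction wrap around. The small userspace sketch below reproduces that computation with the usual option-length constants; it is an illustration of the arithmetic, not kernel code.

    #include <stdio.h>

    #define MAX_TCP_OPTION_SPACE            40
    #define TCPOLEN_SACK_BASE_ALIGNED       4
    #define TCPOLEN_SACK_PERBLOCK           8

    /* How many SACK blocks still fit, given the option bytes already used. */
    static unsigned int sack_blocks(unsigned int size_used, unsigned int eff_sacks)
    {
            unsigned int remaining = MAX_TCP_OPTION_SPACE - size_used;
            unsigned int blocks;

            if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
                    return 0;       /* not even room for the SACK header */

            blocks = (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK;
            return blocks < eff_sacks ? blocks : eff_sacks;
    }

    int main(void)
    {
            /* 12 bytes of timestamps used: room for 3 of the 4 wanted blocks. */
            printf("%u\n", sack_blocks(12, 4));
            /* 38 bytes used (e.g. a large MPTCP DSS option): clamp to 0. */
            printf("%u\n", sack_blocks(38, 4));
            return 0;
    }
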
5862 +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
5863 +index 286227abed10..966b873cbf3e 100644
5864 +--- a/net/ipv4/tcp_timer.c
5865 ++++ b/net/ipv4/tcp_timer.c
5866 +@@ -20,6 +20,7 @@
5867 +
5868 + #include <linux/module.h>
5869 + #include <linux/gfp.h>
5870 ++#include <net/mptcp.h>
5871 + #include <net/tcp.h>
5872 +
5873 + int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
5874 +@@ -32,7 +33,7 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
5875 + int sysctl_tcp_orphan_retries __read_mostly;
5876 + int sysctl_tcp_thin_linear_timeouts __read_mostly;
5877 +
5878 +-static void tcp_write_err(struct sock *sk)
5879 ++void tcp_write_err(struct sock *sk)
5880 + {
5881 + sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
5882 + sk->sk_error_report(sk);
5883 +@@ -74,7 +75,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
5884 + (!tp->snd_wnd && !tp->packets_out))
5885 + do_reset = 1;
5886 + if (do_reset)
5887 +- tcp_send_active_reset(sk, GFP_ATOMIC);
5888 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
5889 + tcp_done(sk);
5890 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
5891 + return 1;
5892 +@@ -124,10 +125,8 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
5893 + * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
5894 + * syn_set flag is set.
5895 + */
5896 +-static bool retransmits_timed_out(struct sock *sk,
5897 +- unsigned int boundary,
5898 +- unsigned int timeout,
5899 +- bool syn_set)
5900 ++bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
5901 ++ unsigned int timeout, bool syn_set)
5902 + {
5903 + unsigned int linear_backoff_thresh, start_ts;
5904 + unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
5905 +@@ -153,7 +152,7 @@ static bool retransmits_timed_out(struct sock *sk,
5906 + }
5907 +
5908 + /* A write timeout has occurred. Process the after effects. */
5909 +-static int tcp_write_timeout(struct sock *sk)
5910 ++int tcp_write_timeout(struct sock *sk)
5911 + {
5912 + struct inet_connection_sock *icsk = inet_csk(sk);
5913 + struct tcp_sock *tp = tcp_sk(sk);
5914 +@@ -171,6 +170,10 @@ static int tcp_write_timeout(struct sock *sk)
5915 + }
5916 + retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
5917 + syn_set = true;
5918 ++ /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */
5919 ++ if (tcp_sk(sk)->request_mptcp &&
5920 ++ icsk->icsk_retransmits >= mptcp_sysctl_syn_retries())
5921 ++ tcp_sk(sk)->request_mptcp = 0;
5922 + } else {
5923 + if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
5924 + /* Black hole detection */
5925 +@@ -251,18 +254,22 @@ out:
5926 + static void tcp_delack_timer(unsigned long data)
5927 + {
5928 + struct sock *sk = (struct sock *)data;
5929 ++ struct tcp_sock *tp = tcp_sk(sk);
5930 ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
5931 +
5932 +- bh_lock_sock(sk);
5933 +- if (!sock_owned_by_user(sk)) {
5934 ++ bh_lock_sock(meta_sk);
5935 ++ if (!sock_owned_by_user(meta_sk)) {
5936 + tcp_delack_timer_handler(sk);
5937 + } else {
5938 + inet_csk(sk)->icsk_ack.blocked = 1;
5939 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
5940 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED);
5941 + /* deleguate our work to tcp_release_cb() */
5942 + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
5943 + sock_hold(sk);
5944 ++ if (mptcp(tp))
5945 ++ mptcp_tsq_flags(sk);
5946 + }
5947 +- bh_unlock_sock(sk);
5948 ++ bh_unlock_sock(meta_sk);
5949 + sock_put(sk);
5950 + }
5951 +
5952 +@@ -479,6 +486,10 @@ out_reset_timer:
5953 + __sk_dst_reset(sk);
5954 +
5955 + out:;
5956 ++ if (mptcp(tp)) {
5957 ++ mptcp_reinject_data(sk, 1);
5958 ++ mptcp_set_rto(sk);
5959 ++ }
5960 + }
5961 +
5962 + void tcp_write_timer_handler(struct sock *sk)
5963 +@@ -505,7 +516,7 @@ void tcp_write_timer_handler(struct sock *sk)
5964 + break;
5965 + case ICSK_TIME_RETRANS:
5966 + icsk->icsk_pending = 0;
5967 +- tcp_retransmit_timer(sk);
5968 ++ tcp_sk(sk)->ops->retransmit_timer(sk);
5969 + break;
5970 + case ICSK_TIME_PROBE0:
5971 + icsk->icsk_pending = 0;
5972 +@@ -520,16 +531,19 @@ out:
5973 + static void tcp_write_timer(unsigned long data)
5974 + {
5975 + struct sock *sk = (struct sock *)data;
5976 ++ struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;
5977 +
5978 +- bh_lock_sock(sk);
5979 +- if (!sock_owned_by_user(sk)) {
5980 ++ bh_lock_sock(meta_sk);
5981 ++ if (!sock_owned_by_user(meta_sk)) {
5982 + tcp_write_timer_handler(sk);
5983 + } else {
5984 + /* deleguate our work to tcp_release_cb() */
5985 + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
5986 + sock_hold(sk);
5987 ++ if (mptcp(tcp_sk(sk)))
5988 ++ mptcp_tsq_flags(sk);
5989 + }
5990 +- bh_unlock_sock(sk);
5991 ++ bh_unlock_sock(meta_sk);
5992 + sock_put(sk);
5993 + }
5994 +
5995 +@@ -566,11 +580,12 @@ static void tcp_keepalive_timer (unsigned long data)
5996 + struct sock *sk = (struct sock *) data;
5997 + struct inet_connection_sock *icsk = inet_csk(sk);
5998 + struct tcp_sock *tp = tcp_sk(sk);
5999 ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
6000 + u32 elapsed;
6001 +
6002 + /* Only process if socket is not in use. */
6003 +- bh_lock_sock(sk);
6004 +- if (sock_owned_by_user(sk)) {
6005 ++ bh_lock_sock(meta_sk);
6006 ++ if (sock_owned_by_user(meta_sk)) {
6007 + /* Try again later. */
6008 + inet_csk_reset_keepalive_timer (sk, HZ/20);
6009 + goto out;
6010 +@@ -581,16 +596,38 @@ static void tcp_keepalive_timer (unsigned long data)
6011 + goto out;
6012 + }
6013 +
6014 ++ if (tp->send_mp_fclose) {
6015 ++ /* MUST do this before tcp_write_timeout, because retrans_stamp
6016 ++ * may have been set to 0 elsewhere while we are retransmitting
6017 ++ * MP_FASTCLOSE. We would then crash, because retransmits_timed_out
6018 ++ * accesses the meta write-queue.
6019 ++ *
6020 ++ * We make sure that the timestamp is != 0.
6021 ++ */
6022 ++ if (!tp->retrans_stamp)
6023 ++ tp->retrans_stamp = tcp_time_stamp ? : 1;
6024 ++
6025 ++ if (tcp_write_timeout(sk))
6026 ++ goto out;
6027 ++
6028 ++ tcp_send_ack(sk);
6029 ++ icsk->icsk_retransmits++;
6030 ++
6031 ++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
6032 ++ elapsed = icsk->icsk_rto;
6033 ++ goto resched;
6034 ++ }
6035 ++
6036 + if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
6037 + if (tp->linger2 >= 0) {
6038 + const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
6039 +
6040 + if (tmo > 0) {
6041 +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
6042 ++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
6043 + goto out;
6044 + }
6045 + }
6046 +- tcp_send_active_reset(sk, GFP_ATOMIC);
6047 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
6048 + goto death;
6049 + }
6050 +
6051 +@@ -614,11 +651,11 @@ static void tcp_keepalive_timer (unsigned long data)
6052 + icsk->icsk_probes_out > 0) ||
6053 + (icsk->icsk_user_timeout == 0 &&
6054 + icsk->icsk_probes_out >= keepalive_probes(tp))) {
6055 +- tcp_send_active_reset(sk, GFP_ATOMIC);
6056 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
6057 + tcp_write_err(sk);
6058 + goto out;
6059 + }
6060 +- if (tcp_write_wakeup(sk) <= 0) {
6061 ++ if (tp->ops->write_wakeup(sk) <= 0) {
6062 + icsk->icsk_probes_out++;
6063 + elapsed = keepalive_intvl_when(tp);
6064 + } else {
6065 +@@ -642,7 +679,7 @@ death:
6066 + tcp_done(sk);
6067 +
6068 + out:
6069 +- bh_unlock_sock(sk);
6070 ++ bh_unlock_sock(meta_sk);
6071 + sock_put(sk);
6072 + }
6073 +
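The keepalive-timer hunk above retransmits MP_FASTCLOSE with the usual exponential backoff: icsk_rto is doubled on every attempt and capped at TCP_RTO_MAX, mirroring the ordinary retransmit timer. The sketch below shows that backoff in isolation; the tick values are arbitrary and chosen only for the example.

    #include <stdio.h>

    #define TCP_RTO_MAX     (120 * 100)     /* 120 s at an assumed HZ of 100 */

    /* Double the RTO, but never beyond the cap. */
    static unsigned int next_rto(unsigned int rto)
    {
            unsigned int doubled = rto << 1;

            return doubled < TCP_RTO_MAX ? doubled : TCP_RTO_MAX;
    }

    int main(void)
    {
            unsigned int rto = 100;         /* 1 s initial RTO in these units */
            int attempt;

            for (attempt = 1; attempt <= 10; attempt++) {
                    rto = next_rto(rto);
                    printf("attempt %d: rto = %u ticks\n", attempt, rto);
            }
            return 0;
    }
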
6074 +diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
6075 +index 5667b3003af9..7139c2973fd2 100644
6076 +--- a/net/ipv6/addrconf.c
6077 ++++ b/net/ipv6/addrconf.c
6078 +@@ -760,6 +760,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
6079 +
6080 + kfree_rcu(ifp, rcu);
6081 + }
6082 ++EXPORT_SYMBOL(inet6_ifa_finish_destroy);
6083 +
6084 + static void
6085 + ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
6086 +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
6087 +index 7cb4392690dd..7057afbca4df 100644
6088 +--- a/net/ipv6/af_inet6.c
6089 ++++ b/net/ipv6/af_inet6.c
6090 +@@ -97,8 +97,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
6091 + return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
6092 + }
6093 +
6094 +-static int inet6_create(struct net *net, struct socket *sock, int protocol,
6095 +- int kern)
6096 ++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern)
6097 + {
6098 + struct inet_sock *inet;
6099 + struct ipv6_pinfo *np;
6100 +diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
6101 +index a245e5ddffbd..99c892b8992d 100644
6102 +--- a/net/ipv6/inet6_connection_sock.c
6103 ++++ b/net/ipv6/inet6_connection_sock.c
6104 +@@ -96,8 +96,8 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
6105 + /*
6106 + * request_sock (formerly open request) hash tables.
6107 + */
6108 +-static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
6109 +- const u32 rnd, const u32 synq_hsize)
6110 ++u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
6111 ++ const u32 rnd, const u32 synq_hsize)
6112 + {
6113 + u32 c;
6114 +
6115 +diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
6116 +index edb58aff4ae7..ea4d9fda0927 100644
6117 +--- a/net/ipv6/ipv6_sockglue.c
6118 ++++ b/net/ipv6/ipv6_sockglue.c
6119 +@@ -48,6 +48,8 @@
6120 + #include <net/addrconf.h>
6121 + #include <net/inet_common.h>
6122 + #include <net/tcp.h>
6123 ++#include <net/mptcp.h>
6124 ++#include <net/mptcp_v4.h>
6125 + #include <net/udp.h>
6126 + #include <net/udplite.h>
6127 + #include <net/xfrm.h>
6128 +@@ -196,7 +198,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
6129 + sock_prot_inuse_add(net, &tcp_prot, 1);
6130 + local_bh_enable();
6131 + sk->sk_prot = &tcp_prot;
6132 +- icsk->icsk_af_ops = &ipv4_specific;
6133 ++#ifdef CONFIG_MPTCP
6134 ++ if (is_mptcp_enabled(sk))
6135 ++ icsk->icsk_af_ops = &mptcp_v4_specific;
6136 ++ else
6137 ++#endif
6138 ++ icsk->icsk_af_ops = &ipv4_specific;
6139 + sk->sk_socket->ops = &inet_stream_ops;
6140 + sk->sk_family = PF_INET;
6141 + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
6142 +diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
6143 +index a822b880689b..b2b38869d795 100644
6144 +--- a/net/ipv6/syncookies.c
6145 ++++ b/net/ipv6/syncookies.c
6146 +@@ -181,13 +181,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
6147 +
6148 + /* check for timestamp cookie support */
6149 + memset(&tcp_opt, 0, sizeof(tcp_opt));
6150 +- tcp_parse_options(skb, &tcp_opt, 0, NULL);
6151 ++ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
6152 +
6153 + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
6154 + goto out;
6155 +
6156 + ret = NULL;
6157 +- req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
6158 ++ req = inet_reqsk_alloc(&tcp6_request_sock_ops);
6159 + if (!req)
6160 + goto out;
6161 +
6162 +@@ -255,10 +255,10 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
6163 + }
6164 +
6165 + req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
6166 +- tcp_select_initial_window(tcp_full_space(sk), req->mss,
6167 +- &req->rcv_wnd, &req->window_clamp,
6168 +- ireq->wscale_ok, &rcv_wscale,
6169 +- dst_metric(dst, RTAX_INITRWND));
6170 ++ tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
6171 ++ &req->rcv_wnd, &req->window_clamp,
6172 ++ ireq->wscale_ok, &rcv_wscale,
6173 ++ dst_metric(dst, RTAX_INITRWND), sk);
6174 +
6175 + ireq->rcv_wscale = rcv_wscale;
6176 +
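The syncookies and tcp_output.c changes route window selection through tp->ops->select_initial_window() and pass the socket along, so that an MPTCP-specific implementation can take meta-level state into account. The simplified sketch below shows the core job such a helper has: pick a window-scale shift so the receive buffer fits into the 16-bit TCP window field. It is a sketch of the idea under assumed constants, not the kernel's exact policy.

    #include <stdio.h>

    #define U16_MAX         65535u
    #define TCP_MAX_WSCALE  14

    static void select_initial_window(unsigned int space, int wscale_ok,
                                      unsigned int *rcv_wnd, unsigned int *wscale)
    {
            *wscale = 0;

            if (wscale_ok) {
                    /* Raise the scale until the space fits in 16 bits. */
                    while ((space >> *wscale) > U16_MAX && *wscale < TCP_MAX_WSCALE)
                            (*wscale)++;
            } else if (space > U16_MAX) {
                    space = U16_MAX;        /* no scaling: clamp to the field width */
            }

            *rcv_wnd = space;
    }

    int main(void)
    {
            unsigned int wnd, ws;

            select_initial_window(6 * 1024 * 1024, 1, &wnd, &ws);   /* 6 MB buffer */
            printf("rcv_wnd=%u wscale=%u advertised=%u\n", wnd, ws, wnd >> ws);
            return 0;
    }
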
6177 +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
6178 +index 229239ad96b1..fda94d71666e 100644
6179 +--- a/net/ipv6/tcp_ipv6.c
6180 ++++ b/net/ipv6/tcp_ipv6.c
6181 +@@ -63,6 +63,8 @@
6182 + #include <net/inet_common.h>
6183 + #include <net/secure_seq.h>
6184 + #include <net/tcp_memcontrol.h>
6185 ++#include <net/mptcp.h>
6186 ++#include <net/mptcp_v6.h>
6187 + #include <net/busy_poll.h>
6188 +
6189 + #include <linux/proc_fs.h>
6190 +@@ -71,12 +73,6 @@
6191 + #include <linux/crypto.h>
6192 + #include <linux/scatterlist.h>
6193 +
6194 +-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
6195 +-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
6196 +- struct request_sock *req);
6197 +-
6198 +-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
6199 +-
6200 + static const struct inet_connection_sock_af_ops ipv6_mapped;
6201 + static const struct inet_connection_sock_af_ops ipv6_specific;
6202 + #ifdef CONFIG_TCP_MD5SIG
6203 +@@ -90,7 +86,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
6204 + }
6205 + #endif
6206 +
6207 +-static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
6208 ++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
6209 + {
6210 + struct dst_entry *dst = skb_dst(skb);
6211 + const struct rt6_info *rt = (const struct rt6_info *)dst;
6212 +@@ -102,10 +98,11 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
6213 + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
6214 + }
6215 +
6216 +-static void tcp_v6_hash(struct sock *sk)
6217 ++void tcp_v6_hash(struct sock *sk)
6218 + {
6219 + if (sk->sk_state != TCP_CLOSE) {
6220 +- if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
6221 ++ if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped ||
6222 ++ inet_csk(sk)->icsk_af_ops == &mptcp_v6_mapped) {
6223 + tcp_prot.hash(sk);
6224 + return;
6225 + }
6226 +@@ -115,7 +112,7 @@ static void tcp_v6_hash(struct sock *sk)
6227 + }
6228 + }
6229 +
6230 +-static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
6231 ++__u32 tcp_v6_init_sequence(const struct sk_buff *skb)
6232 + {
6233 + return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
6234 + ipv6_hdr(skb)->saddr.s6_addr32,
6235 +@@ -123,7 +120,7 @@ static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
6236 + tcp_hdr(skb)->source);
6237 + }
6238 +
6239 +-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6240 ++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6241 + int addr_len)
6242 + {
6243 + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
6244 +@@ -215,7 +212,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6245 + sin.sin_port = usin->sin6_port;
6246 + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
6247 +
6248 +- icsk->icsk_af_ops = &ipv6_mapped;
6249 ++#ifdef CONFIG_MPTCP
6250 ++ if (is_mptcp_enabled(sk))
6251 ++ icsk->icsk_af_ops = &mptcp_v6_mapped;
6252 ++ else
6253 ++#endif
6254 ++ icsk->icsk_af_ops = &ipv6_mapped;
6255 + sk->sk_backlog_rcv = tcp_v4_do_rcv;
6256 + #ifdef CONFIG_TCP_MD5SIG
6257 + tp->af_specific = &tcp_sock_ipv6_mapped_specific;
6258 +@@ -225,7 +227,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6259 +
6260 + if (err) {
6261 + icsk->icsk_ext_hdr_len = exthdrlen;
6262 +- icsk->icsk_af_ops = &ipv6_specific;
6263 ++#ifdef CONFIG_MPTCP
6264 ++ if (is_mptcp_enabled(sk))
6265 ++ icsk->icsk_af_ops = &mptcp_v6_specific;
6266 ++ else
6267 ++#endif
6268 ++ icsk->icsk_af_ops = &ipv6_specific;
6269 + sk->sk_backlog_rcv = tcp_v6_do_rcv;
6270 + #ifdef CONFIG_TCP_MD5SIG
6271 + tp->af_specific = &tcp_sock_ipv6_specific;
6272 +@@ -337,7 +344,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6273 + const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
6274 + const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
6275 + struct ipv6_pinfo *np;
6276 +- struct sock *sk;
6277 ++ struct sock *sk, *meta_sk;
6278 + int err;
6279 + struct tcp_sock *tp;
6280 + struct request_sock *fastopen;
6281 +@@ -358,8 +365,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6282 + return;
6283 + }
6284 +
6285 +- bh_lock_sock(sk);
6286 +- if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
6287 ++ tp = tcp_sk(sk);
6288 ++ if (mptcp(tp))
6289 ++ meta_sk = mptcp_meta_sk(sk);
6290 ++ else
6291 ++ meta_sk = sk;
6292 ++
6293 ++ bh_lock_sock(meta_sk);
6294 ++ if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG)
6295 + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
6296 +
6297 + if (sk->sk_state == TCP_CLOSE)
6298 +@@ -370,7 +383,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6299 + goto out;
6300 + }
6301 +
6302 +- tp = tcp_sk(sk);
6303 + seq = ntohl(th->seq);
6304 + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
6305 + fastopen = tp->fastopen_rsk;
6306 +@@ -403,11 +415,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6307 + goto out;
6308 +
6309 + tp->mtu_info = ntohl(info);
6310 +- if (!sock_owned_by_user(sk))
6311 ++ if (!sock_owned_by_user(meta_sk))
6312 + tcp_v6_mtu_reduced(sk);
6313 +- else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
6314 ++ else {
6315 ++ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
6316 + &tp->tsq_flags))
6317 +- sock_hold(sk);
6318 ++ sock_hold(sk);
6319 ++ if (mptcp(tp))
6320 ++ mptcp_tsq_flags(sk);
6321 ++ }
6322 + goto out;
6323 + }
6324 +
6325 +@@ -417,7 +433,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6326 + switch (sk->sk_state) {
6327 + struct request_sock *req, **prev;
6328 + case TCP_LISTEN:
6329 +- if (sock_owned_by_user(sk))
6330 ++ if (sock_owned_by_user(meta_sk))
6331 + goto out;
6332 +
6333 + req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
6334 +@@ -447,7 +463,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6335 + if (fastopen && fastopen->sk == NULL)
6336 + break;
6337 +
6338 +- if (!sock_owned_by_user(sk)) {
6339 ++ if (!sock_owned_by_user(meta_sk)) {
6340 + sk->sk_err = err;
6341 + sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
6342 +
6343 +@@ -457,26 +473,27 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6344 + goto out;
6345 + }
6346 +
6347 +- if (!sock_owned_by_user(sk) && np->recverr) {
6348 ++ if (!sock_owned_by_user(meta_sk) && np->recverr) {
6349 + sk->sk_err = err;
6350 + sk->sk_error_report(sk);
6351 + } else
6352 + sk->sk_err_soft = err;
6353 +
6354 + out:
6355 +- bh_unlock_sock(sk);
6356 ++ bh_unlock_sock(meta_sk);
6357 + sock_put(sk);
6358 + }
6359 +
6360 +
6361 +-static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
6362 +- struct flowi6 *fl6,
6363 +- struct request_sock *req,
6364 +- u16 queue_mapping,
6365 +- struct tcp_fastopen_cookie *foc)
6366 ++int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
6367 ++ struct flowi *fl,
6368 ++ struct request_sock *req,
6369 ++ u16 queue_mapping,
6370 ++ struct tcp_fastopen_cookie *foc)
6371 + {
6372 + struct inet_request_sock *ireq = inet_rsk(req);
6373 + struct ipv6_pinfo *np = inet6_sk(sk);
6374 ++ struct flowi6 *fl6 = &fl->u.ip6;
6375 + struct sk_buff *skb;
6376 + int err = -ENOMEM;
6377 +
6378 +@@ -497,18 +514,21 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
6379 + skb_set_queue_mapping(skb, queue_mapping);
6380 + err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
6381 + err = net_xmit_eval(err);
6382 ++ if (!tcp_rsk(req)->snt_synack && !err)
6383 ++ tcp_rsk(req)->snt_synack = tcp_time_stamp;
6384 + }
6385 +
6386 + done:
6387 + return err;
6388 + }
6389 +
6390 +-static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
6391 ++int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
6392 + {
6393 +- struct flowi6 fl6;
6394 ++ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
6395 ++ struct flowi fl;
6396 + int res;
6397 +
6398 +- res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL);
6399 ++ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
6400 + if (!res) {
6401 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
6402 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
6403 +@@ -516,7 +536,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
6404 + return res;
6405 + }
6406 +
6407 +-static void tcp_v6_reqsk_destructor(struct request_sock *req)
6408 ++void tcp_v6_reqsk_destructor(struct request_sock *req)
6409 + {
6410 + kfree_skb(inet_rsk(req)->pktopts);
6411 + }
6412 +@@ -718,27 +738,74 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
6413 + }
6414 + #endif
6415 +
6416 ++static int tcp_v6_init_req(struct request_sock *req, struct sock *sk,
6417 ++ struct sk_buff *skb)
6418 ++{
6419 ++ struct inet_request_sock *ireq = inet_rsk(req);
6420 ++ struct ipv6_pinfo *np = inet6_sk(sk);
6421 ++
6422 ++ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
6423 ++ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
6424 ++
6425 ++ ireq->ir_iif = sk->sk_bound_dev_if;
6426 ++ ireq->ir_mark = inet_request_mark(sk, skb);
6427 ++
6428 ++ /* So that link locals have meaning */
6429 ++ if (!sk->sk_bound_dev_if &&
6430 ++ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
6431 ++ ireq->ir_iif = inet6_iif(skb);
6432 ++
6433 ++ if (!TCP_SKB_CB(skb)->when &&
6434 ++ (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo ||
6435 ++ np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
6436 ++ np->rxopt.bits.rxohlim || np->repflow)) {
6437 ++ atomic_inc(&skb->users);
6438 ++ ireq->pktopts = skb;
6439 ++ }
6440 ++
6441 ++ return 0;
6442 ++}
6443 ++
6444 ++static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl,
6445 ++ const struct request_sock *req,
6446 ++ bool *strict)
6447 ++{
6448 ++ if (strict)
6449 ++ *strict = true;
6450 ++ return inet6_csk_route_req(sk, &fl->u.ip6, req);
6451 ++}
6452 ++
6453 + struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
6454 + .family = AF_INET6,
6455 + .obj_size = sizeof(struct tcp6_request_sock),
6456 +- .rtx_syn_ack = tcp_v6_rtx_synack,
6457 ++ .rtx_syn_ack = tcp_rtx_synack,
6458 + .send_ack = tcp_v6_reqsk_send_ack,
6459 + .destructor = tcp_v6_reqsk_destructor,
6460 + .send_reset = tcp_v6_send_reset,
6461 + .syn_ack_timeout = tcp_syn_ack_timeout,
6462 + };
6463 +
6464 ++const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
6465 ++ .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
6466 ++ sizeof(struct ipv6hdr),
6467 + #ifdef CONFIG_TCP_MD5SIG
6468 +-static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
6469 + .md5_lookup = tcp_v6_reqsk_md5_lookup,
6470 + .calc_md5_hash = tcp_v6_md5_hash_skb,
6471 +-};
6472 + #endif
6473 ++ .init_req = tcp_v6_init_req,
6474 ++#ifdef CONFIG_SYN_COOKIES
6475 ++ .cookie_init_seq = cookie_v6_init_sequence,
6476 ++#endif
6477 ++ .route_req = tcp_v6_route_req,
6478 ++ .init_seq = tcp_v6_init_sequence,
6479 ++ .send_synack = tcp_v6_send_synack,
6480 ++ .queue_hash_add = inet6_csk_reqsk_queue_hash_add,
6481 ++};
6482 +
6483 +-static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6484 +- u32 tsval, u32 tsecr, int oif,
6485 +- struct tcp_md5sig_key *key, int rst, u8 tclass,
6486 +- u32 label)
6487 ++static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack,
6488 ++ u32 data_ack, u32 win, u32 tsval, u32 tsecr,
6489 ++ int oif, struct tcp_md5sig_key *key, int rst,
6490 ++ u8 tclass, u32 label, int mptcp)
6491 + {
6492 + const struct tcphdr *th = tcp_hdr(skb);
6493 + struct tcphdr *t1;
6494 +@@ -756,7 +823,10 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6495 + if (key)
6496 + tot_len += TCPOLEN_MD5SIG_ALIGNED;
6497 + #endif
6498 +-
6499 ++#ifdef CONFIG_MPTCP
6500 ++ if (mptcp)
6501 ++ tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
6502 ++#endif
6503 + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
6504 + GFP_ATOMIC);
6505 + if (buff == NULL)
6506 +@@ -794,6 +864,17 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6507 + tcp_v6_md5_hash_hdr((__u8 *)topt, key,
6508 + &ipv6_hdr(skb)->saddr,
6509 + &ipv6_hdr(skb)->daddr, t1);
6510 ++ topt += 4;
6511 ++ }
6512 ++#endif
6513 ++#ifdef CONFIG_MPTCP
6514 ++ if (mptcp) {
6515 ++ /* Construction of 32-bit data_ack */
6516 ++ *topt++ = htonl((TCPOPT_MPTCP << 24) |
6517 ++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
6518 ++ (0x20 << 8) |
6519 ++ (0x01));
6520 ++ *topt++ = htonl(data_ack);
6521 + }
6522 + #endif
6523 +
6524 +@@ -834,7 +915,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6525 + kfree_skb(buff);
6526 + }
6527 +
6528 +-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
6529 ++void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
6530 + {
6531 + const struct tcphdr *th = tcp_hdr(skb);
6532 + u32 seq = 0, ack_seq = 0;
6533 +@@ -891,7 +972,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
6534 + (th->doff << 2);
6535 +
6536 + oif = sk ? sk->sk_bound_dev_if : 0;
6537 +- tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
6538 ++ tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, 0, oif, key, 1, 0, 0, 0);
6539 +
6540 + #ifdef CONFIG_TCP_MD5SIG
6541 + release_sk1:
6542 +@@ -902,45 +983,52 @@ release_sk1:
6543 + #endif
6544 + }
6545 +
6546 +-static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
6547 ++static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
6548 + u32 win, u32 tsval, u32 tsecr, int oif,
6549 + struct tcp_md5sig_key *key, u8 tclass,
6550 +- u32 label)
6551 ++ u32 label, int mptcp)
6552 + {
6553 +- tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, oif, key, 0, tclass,
6554 +- label);
6555 ++ tcp_v6_send_response(skb, seq, ack, data_ack, win, tsval, tsecr, oif,
6556 ++ key, 0, tclass, label, mptcp);
6557 + }
6558 +
6559 + static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
6560 + {
6561 + struct inet_timewait_sock *tw = inet_twsk(sk);
6562 + struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
6563 ++ u32 data_ack = 0;
6564 ++ int mptcp = 0;
6565 +
6566 ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
6567 ++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
6568 ++ mptcp = 1;
6569 ++ }
6570 + tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
6571 ++ data_ack,
6572 + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
6573 + tcp_time_stamp + tcptw->tw_ts_offset,
6574 + tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
6575 +- tw->tw_tclass, (tw->tw_flowlabel << 12));
6576 ++ tw->tw_tclass, (tw->tw_flowlabel << 12), mptcp);
6577 +
6578 + inet_twsk_put(tw);
6579 + }
6580 +
6581 +-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
6582 +- struct request_sock *req)
6583 ++void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
6584 ++ struct request_sock *req)
6585 + {
6586 + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
6587 + * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
6588 + */
6589 + tcp_v6_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
6590 + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
6591 +- tcp_rsk(req)->rcv_nxt,
6592 ++ tcp_rsk(req)->rcv_nxt, 0,
6593 + req->rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
6594 + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
6595 +- 0, 0);
6596 ++ 0, 0, 0);
6597 + }
6598 +
6599 +
6600 +-static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6601 ++struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6602 + {
6603 + struct request_sock *req, **prev;
6604 + const struct tcphdr *th = tcp_hdr(skb);
6605 +@@ -959,7 +1047,13 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6606 +
6607 + if (nsk) {
6608 + if (nsk->sk_state != TCP_TIME_WAIT) {
6609 ++ /* Don't lock again the meta-sk. It has been locked
6610 ++ * before mptcp_v6_do_rcv.
6611 ++ */
6612 ++ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk))
6613 ++ bh_lock_sock(mptcp_meta_sk(nsk));
6614 + bh_lock_sock(nsk);
6615 ++
6616 + return nsk;
6617 + }
6618 + inet_twsk_put(inet_twsk(nsk));
6619 +@@ -973,161 +1067,25 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6620 + return sk;
6621 + }
6622 +
6623 +-/* FIXME: this is substantially similar to the ipv4 code.
6624 +- * Can some kind of merge be done? -- erics
6625 +- */
6626 +-static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
6627 ++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
6628 + {
6629 +- struct tcp_options_received tmp_opt;
6630 +- struct request_sock *req;
6631 +- struct inet_request_sock *ireq;
6632 +- struct ipv6_pinfo *np = inet6_sk(sk);
6633 +- struct tcp_sock *tp = tcp_sk(sk);
6634 +- __u32 isn = TCP_SKB_CB(skb)->when;
6635 +- struct dst_entry *dst = NULL;
6636 +- struct tcp_fastopen_cookie foc = { .len = -1 };
6637 +- bool want_cookie = false, fastopen;
6638 +- struct flowi6 fl6;
6639 +- int err;
6640 +-
6641 + if (skb->protocol == htons(ETH_P_IP))
6642 + return tcp_v4_conn_request(sk, skb);
6643 +
6644 + if (!ipv6_unicast_destination(skb))
6645 + goto drop;
6646 +
6647 +- if ((sysctl_tcp_syncookies == 2 ||
6648 +- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6649 +- want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6");
6650 +- if (!want_cookie)
6651 +- goto drop;
6652 +- }
6653 +-
6654 +- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
6655 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6656 +- goto drop;
6657 +- }
6658 +-
6659 +- req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
6660 +- if (req == NULL)
6661 +- goto drop;
6662 +-
6663 +-#ifdef CONFIG_TCP_MD5SIG
6664 +- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
6665 +-#endif
6666 +-
6667 +- tcp_clear_options(&tmp_opt);
6668 +- tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
6669 +- tmp_opt.user_mss = tp->rx_opt.user_mss;
6670 +- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
6671 +-
6672 +- if (want_cookie && !tmp_opt.saw_tstamp)
6673 +- tcp_clear_options(&tmp_opt);
6674 ++ return tcp_conn_request(&tcp6_request_sock_ops,
6675 ++ &tcp_request_sock_ipv6_ops, sk, skb);
6676 +
6677 +- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
6678 +- tcp_openreq_init(req, &tmp_opt, skb);
6679 +-
6680 +- ireq = inet_rsk(req);
6681 +- ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
6682 +- ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
6683 +- if (!want_cookie || tmp_opt.tstamp_ok)
6684 +- TCP_ECN_create_request(req, skb, sock_net(sk));
6685 +-
6686 +- ireq->ir_iif = sk->sk_bound_dev_if;
6687 +- ireq->ir_mark = inet_request_mark(sk, skb);
6688 +-
6689 +- /* So that link locals have meaning */
6690 +- if (!sk->sk_bound_dev_if &&
6691 +- ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
6692 +- ireq->ir_iif = inet6_iif(skb);
6693 +-
6694 +- if (!isn) {
6695 +- if (ipv6_opt_accepted(sk, skb) ||
6696 +- np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
6697 +- np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim ||
6698 +- np->repflow) {
6699 +- atomic_inc(&skb->users);
6700 +- ireq->pktopts = skb;
6701 +- }
6702 +-
6703 +- if (want_cookie) {
6704 +- isn = cookie_v6_init_sequence(sk, skb, &req->mss);
6705 +- req->cookie_ts = tmp_opt.tstamp_ok;
6706 +- goto have_isn;
6707 +- }
6708 +-
6709 +- /* VJ's idea. We save last timestamp seen
6710 +- * from the destination in peer table, when entering
6711 +- * state TIME-WAIT, and check against it before
6712 +- * accepting new connection request.
6713 +- *
6714 +- * If "isn" is not zero, this request hit alive
6715 +- * timewait bucket, so that all the necessary checks
6716 +- * are made in the function processing timewait state.
6717 +- */
6718 +- if (tmp_opt.saw_tstamp &&
6719 +- tcp_death_row.sysctl_tw_recycle &&
6720 +- (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) {
6721 +- if (!tcp_peer_is_proven(req, dst, true)) {
6722 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
6723 +- goto drop_and_release;
6724 +- }
6725 +- }
6726 +- /* Kill the following clause, if you dislike this way. */
6727 +- else if (!sysctl_tcp_syncookies &&
6728 +- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6729 +- (sysctl_max_syn_backlog >> 2)) &&
6730 +- !tcp_peer_is_proven(req, dst, false)) {
6731 +- /* Without syncookies last quarter of
6732 +- * backlog is filled with destinations,
6733 +- * proven to be alive.
6734 +- * It means that we continue to communicate
6735 +- * to destinations, already remembered
6736 +- * to the moment of synflood.
6737 +- */
6738 +- LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n",
6739 +- &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source));
6740 +- goto drop_and_release;
6741 +- }
6742 +-
6743 +- isn = tcp_v6_init_sequence(skb);
6744 +- }
6745 +-have_isn:
6746 +-
6747 +- if (security_inet_conn_request(sk, skb, req))
6748 +- goto drop_and_release;
6749 +-
6750 +- if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL)
6751 +- goto drop_and_free;
6752 +-
6753 +- tcp_rsk(req)->snt_isn = isn;
6754 +- tcp_rsk(req)->snt_synack = tcp_time_stamp;
6755 +- tcp_openreq_init_rwin(req, sk, dst);
6756 +- fastopen = !want_cookie &&
6757 +- tcp_try_fastopen(sk, skb, req, &foc, dst);
6758 +- err = tcp_v6_send_synack(sk, dst, &fl6, req,
6759 +- skb_get_queue_mapping(skb), &foc);
6760 +- if (!fastopen) {
6761 +- if (err || want_cookie)
6762 +- goto drop_and_free;
6763 +-
6764 +- tcp_rsk(req)->listener = NULL;
6765 +- inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
6766 +- }
6767 +- return 0;
6768 +-
6769 +-drop_and_release:
6770 +- dst_release(dst);
6771 +-drop_and_free:
6772 +- reqsk_free(req);
6773 + drop:
6774 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
6775 + return 0; /* don't send reset */
6776 + }
6777 +
6778 +-static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6779 +- struct request_sock *req,
6780 +- struct dst_entry *dst)
6781 ++struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6782 ++ struct request_sock *req,
6783 ++ struct dst_entry *dst)
6784 + {
6785 + struct inet_request_sock *ireq;
6786 + struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
6787 +@@ -1165,7 +1123,12 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6788 +
6789 + newsk->sk_v6_rcv_saddr = newnp->saddr;
6790 +
6791 +- inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
6792 ++#ifdef CONFIG_MPTCP
6793 ++ if (is_mptcp_enabled(newsk))
6794 ++ inet_csk(newsk)->icsk_af_ops = &mptcp_v6_mapped;
6795 ++ else
6796 ++#endif
6797 ++ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
6798 + newsk->sk_backlog_rcv = tcp_v4_do_rcv;
6799 + #ifdef CONFIG_TCP_MD5SIG
6800 + newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
6801 +@@ -1329,7 +1292,7 @@ out:
6802 + * This is because we cannot sleep with the original spinlock
6803 + * held.
6804 + */
6805 +-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
6806 ++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
6807 + {
6808 + struct ipv6_pinfo *np = inet6_sk(sk);
6809 + struct tcp_sock *tp;
6810 +@@ -1351,6 +1314,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
6811 + goto discard;
6812 + #endif
6813 +
6814 ++ if (is_meta_sk(sk))
6815 ++ return mptcp_v6_do_rcv(sk, skb);
6816 ++
6817 + if (sk_filter(sk, skb))
6818 + goto discard;
6819 +
6820 +@@ -1472,7 +1438,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
6821 + {
6822 + const struct tcphdr *th;
6823 + const struct ipv6hdr *hdr;
6824 +- struct sock *sk;
6825 ++ struct sock *sk, *meta_sk = NULL;
6826 + int ret;
6827 + struct net *net = dev_net(skb->dev);
6828 +
6829 +@@ -1503,18 +1469,43 @@ static int tcp_v6_rcv(struct sk_buff *skb)
6830 + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
6831 + skb->len - th->doff*4);
6832 + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
6833 ++#ifdef CONFIG_MPTCP
6834 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
6835 ++ TCP_SKB_CB(skb)->dss_off = 0;
6836 ++#endif
6837 + TCP_SKB_CB(skb)->when = 0;
6838 + TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
6839 + TCP_SKB_CB(skb)->sacked = 0;
6840 +
6841 + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
6842 +- if (!sk)
6843 +- goto no_tcp_socket;
6844 +
6845 + process:
6846 +- if (sk->sk_state == TCP_TIME_WAIT)
6847 ++ if (sk && sk->sk_state == TCP_TIME_WAIT)
6848 + goto do_time_wait;
6849 +
6850 ++#ifdef CONFIG_MPTCP
6851 ++ if (!sk && th->syn && !th->ack) {
6852 ++ int ret = mptcp_lookup_join(skb, NULL);
6853 ++
6854 ++ if (ret < 0) {
6855 ++ tcp_v6_send_reset(NULL, skb);
6856 ++ goto discard_it;
6857 ++ } else if (ret > 0) {
6858 ++ return 0;
6859 ++ }
6860 ++ }
6861 ++
6862 ++ /* Is there a pending request sock for this segment ? */
6863 ++ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
6864 ++ if (sk)
6865 ++ sock_put(sk);
6866 ++ return 0;
6867 ++ }
6868 ++#endif
6869 ++
6870 ++ if (!sk)
6871 ++ goto no_tcp_socket;
6872 ++
6873 + if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
6874 + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
6875 + goto discard_and_relse;
6876 +@@ -1529,11 +1520,21 @@ process:
6877 + sk_mark_napi_id(sk, skb);
6878 + skb->dev = NULL;
6879 +
6880 +- bh_lock_sock_nested(sk);
6881 ++ if (mptcp(tcp_sk(sk))) {
6882 ++ meta_sk = mptcp_meta_sk(sk);
6883 ++
6884 ++ bh_lock_sock_nested(meta_sk);
6885 ++ if (sock_owned_by_user(meta_sk))
6886 ++ skb->sk = sk;
6887 ++ } else {
6888 ++ meta_sk = sk;
6889 ++ bh_lock_sock_nested(sk);
6890 ++ }
6891 ++
6892 + ret = 0;
6893 +- if (!sock_owned_by_user(sk)) {
6894 ++ if (!sock_owned_by_user(meta_sk)) {
6895 + #ifdef CONFIG_NET_DMA
6896 +- struct tcp_sock *tp = tcp_sk(sk);
6897 ++ struct tcp_sock *tp = tcp_sk(meta_sk);
6898 + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
6899 + tp->ucopy.dma_chan = net_dma_find_channel();
6900 + if (tp->ucopy.dma_chan)
6901 +@@ -1541,16 +1542,17 @@ process:
6902 + else
6903 + #endif
6904 + {
6905 +- if (!tcp_prequeue(sk, skb))
6906 ++ if (!tcp_prequeue(meta_sk, skb))
6907 + ret = tcp_v6_do_rcv(sk, skb);
6908 + }
6909 +- } else if (unlikely(sk_add_backlog(sk, skb,
6910 +- sk->sk_rcvbuf + sk->sk_sndbuf))) {
6911 +- bh_unlock_sock(sk);
6912 ++ } else if (unlikely(sk_add_backlog(meta_sk, skb,
6913 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
6914 ++ bh_unlock_sock(meta_sk);
6915 + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6916 + goto discard_and_relse;
6917 + }
6918 +- bh_unlock_sock(sk);
6919 ++
6920 ++ bh_unlock_sock(meta_sk);
6921 +
6922 + sock_put(sk);
6923 + return ret ? -1 : 0;
6924 +@@ -1607,6 +1609,18 @@ do_time_wait:
6925 + sk = sk2;
6926 + goto process;
6927 + }
6928 ++#ifdef CONFIG_MPTCP
6929 ++ if (th->syn && !th->ack) {
6930 ++ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
6931 ++
6932 ++ if (ret < 0) {
6933 ++ tcp_v6_send_reset(NULL, skb);
6934 ++ goto discard_it;
6935 ++ } else if (ret > 0) {
6936 ++ return 0;
6937 ++ }
6938 ++ }
6939 ++#endif
6940 + /* Fall through to ACK */
6941 + }
6942 + case TCP_TW_ACK:
6943 +@@ -1657,7 +1671,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
6944 + }
6945 + }
6946 +
6947 +-static struct timewait_sock_ops tcp6_timewait_sock_ops = {
6948 ++struct timewait_sock_ops tcp6_timewait_sock_ops = {
6949 + .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
6950 + .twsk_unique = tcp_twsk_unique,
6951 + .twsk_destructor = tcp_twsk_destructor,
6952 +@@ -1730,7 +1744,12 @@ static int tcp_v6_init_sock(struct sock *sk)
6953 +
6954 + tcp_init_sock(sk);
6955 +
6956 +- icsk->icsk_af_ops = &ipv6_specific;
6957 ++#ifdef CONFIG_MPTCP
6958 ++ if (is_mptcp_enabled(sk))
6959 ++ icsk->icsk_af_ops = &mptcp_v6_specific;
6960 ++ else
6961 ++#endif
6962 ++ icsk->icsk_af_ops = &ipv6_specific;
6963 +
6964 + #ifdef CONFIG_TCP_MD5SIG
6965 + tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
6966 +@@ -1739,7 +1758,7 @@ static int tcp_v6_init_sock(struct sock *sk)
6967 + return 0;
6968 + }
6969 +
6970 +-static void tcp_v6_destroy_sock(struct sock *sk)
6971 ++void tcp_v6_destroy_sock(struct sock *sk)
6972 + {
6973 + tcp_v4_destroy_sock(sk);
6974 + inet6_destroy_sock(sk);
6975 +@@ -1924,12 +1943,28 @@ void tcp6_proc_exit(struct net *net)
6976 + static void tcp_v6_clear_sk(struct sock *sk, int size)
6977 + {
6978 + struct inet_sock *inet = inet_sk(sk);
6979 ++#ifdef CONFIG_MPTCP
6980 ++ struct tcp_sock *tp = tcp_sk(sk);
6981 ++ /* size_tk_table goes from the end of tk_table to the end of sk */
6982 ++ int size_tk_table = size - offsetof(struct tcp_sock, tk_table) -
6983 ++ sizeof(tp->tk_table);
6984 ++#endif
6985 +
6986 + /* we do not want to clear pinet6 field, because of RCU lookups */
6987 + sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6));
6988 +
6989 + size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
6990 ++
6991 ++#ifdef CONFIG_MPTCP
6992 ++ /* We zero out only from pinet6 to tk_table */
6993 ++ size -= size_tk_table + sizeof(tp->tk_table);
6994 ++#endif
6995 + memset(&inet->pinet6 + 1, 0, size);
6996 ++
6997 ++#ifdef CONFIG_MPTCP
6998 ++ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size_tk_table);
6999 ++#endif
7000 ++
7001 + }
7002 +
7003 + struct proto tcpv6_prot = {
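
tcp_v6_send_response() above now reserves MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK extra option bytes and, when answering for an MPTCP time-wait socket, appends a DSS sub-option carrying a 32-bit data-level ACK. The stand-alone sketch below reproduces that bit packing in user space; the numeric values of TCPOPT_MPTCP and the two length constants are assumptions (MPTCP option kind 30, 4-byte DSS base plus 4-byte data ACK), since their definitions sit in headers outside this hunk:

#include <stdio.h>
#include <stdint.h>

/* Assumed values; the patch defines these elsewhere, not in this hunk. */
#define TCPOPT_MPTCP       30
#define MPTCP_SUB_LEN_DSS   4
#define MPTCP_SUB_LEN_ACK   4

int main(void)
{
    uint32_t data_ack = 0x11223344u;   /* example data-level ACK value */

    /* kind | length | subtype (DSS = 2) << 4 | flags (0x01: data ACK present),
     * the same expression used in tcp_v6_send_response() above */
    uint32_t hdr = ((uint32_t)TCPOPT_MPTCP << 24) |
                   ((uint32_t)(MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
                   (0x20u << 8) |
                   0x01u;

    /* prints "1e082001 11223344": kind 0x1e (30), length 8, then subtype/flags */
    printf("%08x %08x\n", (unsigned)hdr, (unsigned)data_ack);
    return 0;
}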
7004 +diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
7005 +new file mode 100644
7006 +index 000000000000..cdfc03adabf8
7007 +--- /dev/null
7008 ++++ b/net/mptcp/Kconfig
7009 +@@ -0,0 +1,115 @@
7010 ++#
7011 ++# MPTCP configuration
7012 ++#
7013 ++config MPTCP
7014 ++ bool "MPTCP protocol"
7015 ++ depends on (IPV6=y || IPV6=n)
7016 ++ ---help---
7017 ++ This replaces the normal TCP stack with a Multipath TCP stack,
7018 ++ able to use several paths at once.
7019 ++
7020 ++menuconfig MPTCP_PM_ADVANCED
7021 ++ bool "MPTCP: advanced path-manager control"
7022 ++ depends on MPTCP=y
7023 ++ ---help---
7024 ++ Support for selection of different path-managers. You should choose 'Y' here,
7025 ++ because otherwise you will not actively create new MPTCP-subflows.
7026 ++
7027 ++if MPTCP_PM_ADVANCED
7028 ++
7029 ++config MPTCP_FULLMESH
7030 ++ tristate "MPTCP Full-Mesh Path-Manager"
7031 ++ depends on MPTCP=y
7032 ++ ---help---
7033 ++ This path-management module will create a full-mesh among all IP-addresses.
7034 ++
7035 ++config MPTCP_NDIFFPORTS
7036 ++ tristate "MPTCP ndiff-ports"
7037 ++ depends on MPTCP=y
7038 ++ ---help---
7039 ++ This path-management module will create multiple subflows between the same
7040 ++ pair of IP-addresses, modifying the source-port. You can set the number
7041 ++ of subflows via the mptcp_ndiffports-sysctl.
7042 ++
7043 ++config MPTCP_BINDER
7044 ++ tristate "MPTCP Binder"
7045 ++ depends on (MPTCP=y)
7046 ++ ---help---
7047 ++ This path-management module works like ndiffports, and adds the sysctl
7048 ++ option to set the gateway (and/or path to) per each additional subflow
7049 ++ via Loose Source Routing (IPv4 only).
7050 ++
7051 ++choice
7052 ++ prompt "Default MPTCP Path-Manager"
7053 ++ default DEFAULT
7054 ++ help
7055 ++ Select the Path-Manager of your choice
7056 ++
7057 ++ config DEFAULT_FULLMESH
7058 ++ bool "Full mesh" if MPTCP_FULLMESH=y
7059 ++
7060 ++ config DEFAULT_NDIFFPORTS
7061 ++ bool "ndiff-ports" if MPTCP_NDIFFPORTS=y
7062 ++
7063 ++ config DEFAULT_BINDER
7064 ++ bool "binder" if MPTCP_BINDER=y
7065 ++
7066 ++ config DEFAULT_DUMMY
7067 ++ bool "Default"
7068 ++
7069 ++endchoice
7070 ++
7071 ++endif
7072 ++
7073 ++config DEFAULT_MPTCP_PM
7074 ++ string
7075 ++ default "default" if DEFAULT_DUMMY
7076 ++ default "fullmesh" if DEFAULT_FULLMESH
7077 ++ default "ndiffports" if DEFAULT_NDIFFPORTS
7078 ++ default "binder" if DEFAULT_BINDER
7079 ++ default "default"
7080 ++
7081 ++menuconfig MPTCP_SCHED_ADVANCED
7082 ++ bool "MPTCP: advanced scheduler control"
7083 ++ depends on MPTCP=y
7084 ++ ---help---
7085 ++ Support for selection of different schedulers. You should choose 'Y' here,
7086 ++ if you want to choose a different scheduler than the default one.
7087 ++
7088 ++if MPTCP_SCHED_ADVANCED
7089 ++
7090 ++config MPTCP_ROUNDROBIN
7091 ++ tristate "MPTCP Round-Robin"
7092 ++ depends on (MPTCP=y)
7093 ++ ---help---
7094 ++ This is a very simple round-robin scheduler. Probably has bad performance
7095 ++ but might be interesting for researchers.
7096 ++
7097 ++choice
7098 ++ prompt "Default MPTCP Scheduler"
7099 ++ default DEFAULT
7100 ++ help
7101 ++ Select the Scheduler of your choice
7102 ++
7103 ++ config DEFAULT_SCHEDULER
7104 ++ bool "Default"
7105 ++ ---help---
7106 ++ This is the default scheduler, sending first on the subflow
7107 ++ with the lowest RTT.
7108 ++
7109 ++ config DEFAULT_ROUNDROBIN
7110 ++ bool "Round-Robin" if MPTCP_ROUNDROBIN=y
7111 ++ ---help---
7112 ++	  This is the round-robin scheduler, sending in a round-robin
7113 ++	  fashion.
7114 ++
7115 ++endchoice
7116 ++endif
7117 ++
7118 ++config DEFAULT_MPTCP_SCHED
7119 ++ string
7120 ++ depends on (MPTCP=y)
7121 ++ default "default" if DEFAULT_SCHEDULER
7122 ++ default "roundrobin" if DEFAULT_ROUNDROBIN
7123 ++ default "default"
7124 ++
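
The Kconfig above boils the chosen path-manager and scheduler down to plain strings (CONFIG_DEFAULT_MPTCP_PM, CONFIG_DEFAULT_MPTCP_SCHED), each with a final fallback to "default". The sketch below only illustrates the lookup-by-name-with-fallback pattern such a string usually feeds at runtime; the names in it are invented, and the patch's actual registration and selection code lives in the mptcp core files listed in the Makefile below, which are not all reproduced here:

#include <stdio.h>
#include <string.h>

/* Illustrative registry; the real registration entry point in this patch is
 * mptcp_register_path_manager(), used by mptcp_binder.c further down. */
struct pm_entry {
    const char *name;
};

static const struct pm_entry registry[] = {
    { "default" },
    { "fullmesh" },
    { "binder" },
};

static const struct pm_entry *find_pm(const char *name)
{
    size_t i;

    for (i = 0; i < sizeof(registry) / sizeof(registry[0]); i++)
        if (strcmp(registry[i].name, name) == 0)
            return &registry[i];
    return NULL;
}

int main(void)
{
    const char *configured = "ndiffports";   /* e.g. CONFIG_DEFAULT_MPTCP_PM */
    const struct pm_entry *pm = find_pm(configured);

    if (!pm)                      /* module not built or not yet loaded */
        pm = find_pm("default");  /* mirrors the Kconfig's final fallback */

    printf("using path manager: %s\n", pm->name);
    return 0;
}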
7125 +diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
7126 +new file mode 100644
7127 +index 000000000000..35561a7012e3
7128 +--- /dev/null
7129 ++++ b/net/mptcp/Makefile
7130 +@@ -0,0 +1,20 @@
7131 ++#
7132 ++## Makefile for MultiPath TCP support code.
7133 ++#
7134 ++#
7135 ++
7136 ++obj-$(CONFIG_MPTCP) += mptcp.o
7137 ++
7138 ++mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_ofo_queue.o mptcp_pm.o \
7139 ++ mptcp_output.o mptcp_input.o mptcp_sched.o
7140 ++
7141 ++obj-$(CONFIG_TCP_CONG_COUPLED) += mptcp_coupled.o
7142 ++obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
7143 ++obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o
7144 ++obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o
7145 ++obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o
7146 ++obj-$(CONFIG_MPTCP_BINDER) += mptcp_binder.o
7147 ++obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o
7148 ++
7149 ++mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o
7150 ++
7151 +diff --git a/net/mptcp/mptcp_binder.c b/net/mptcp/mptcp_binder.c
7152 +new file mode 100644
7153 +index 000000000000..95d8da560715
7154 +--- /dev/null
7155 ++++ b/net/mptcp/mptcp_binder.c
7156 +@@ -0,0 +1,487 @@
7157 ++#include <linux/module.h>
7158 ++
7159 ++#include <net/mptcp.h>
7160 ++#include <net/mptcp_v4.h>
7161 ++
7162 ++#include <linux/route.h>
7163 ++#include <linux/inet.h>
7164 ++#include <linux/mroute.h>
7165 ++#include <linux/spinlock_types.h>
7166 ++#include <net/inet_ecn.h>
7167 ++#include <net/route.h>
7168 ++#include <net/xfrm.h>
7169 ++#include <net/compat.h>
7170 ++#include <linux/slab.h>
7171 ++
7172 ++#define MPTCP_GW_MAX_LISTS 10
7173 ++#define MPTCP_GW_LIST_MAX_LEN 6
7174 ++#define MPTCP_GW_SYSCTL_MAX_LEN (15 * MPTCP_GW_LIST_MAX_LEN * \
7175 ++ MPTCP_GW_MAX_LISTS)
7176 ++
7177 ++struct mptcp_gw_list {
7178 ++ struct in_addr list[MPTCP_GW_MAX_LISTS][MPTCP_GW_LIST_MAX_LEN];
7179 ++ u8 len[MPTCP_GW_MAX_LISTS];
7180 ++};
7181 ++
7182 ++struct binder_priv {
7183 ++ /* Worker struct for subflow establishment */
7184 ++ struct work_struct subflow_work;
7185 ++
7186 ++ struct mptcp_cb *mpcb;
7187 ++
7188 ++ /* Prevent multiple sub-sockets concurrently iterating over sockets */
7189 ++ spinlock_t *flow_lock;
7190 ++};
7191 ++
7192 ++static struct mptcp_gw_list *mptcp_gws;
7193 ++static rwlock_t mptcp_gws_lock;
7194 ++
7195 ++static int mptcp_binder_ndiffports __read_mostly = 1;
7196 ++
7197 ++static char sysctl_mptcp_binder_gateways[MPTCP_GW_SYSCTL_MAX_LEN] __read_mostly;
7198 ++
7199 ++static int mptcp_get_avail_list_ipv4(struct sock *sk)
7200 ++{
7201 ++ int i, j, list_taken, opt_ret, opt_len;
7202 ++ unsigned char *opt_ptr, *opt_end_ptr, opt[MAX_IPOPTLEN];
7203 ++
7204 ++ for (i = 0; i < MPTCP_GW_MAX_LISTS; ++i) {
7205 ++ if (mptcp_gws->len[i] == 0)
7206 ++ goto error;
7207 ++
7208 ++ mptcp_debug("mptcp_get_avail_list_ipv4: List %i\n", i);
7209 ++ list_taken = 0;
7210 ++
7211 ++ /* Loop through all sub-sockets in this connection */
7212 ++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk) {
7213 ++ mptcp_debug("mptcp_get_avail_list_ipv4: Next sock\n");
7214 ++
7215 ++ /* Reset length and options buffer, then retrieve
7216 ++ * from socket
7217 ++ */
7218 ++ opt_len = MAX_IPOPTLEN;
7219 ++ memset(opt, 0, MAX_IPOPTLEN);
7220 ++ opt_ret = ip_getsockopt(sk, IPPROTO_IP,
7221 ++ IP_OPTIONS, opt, &opt_len);
7222 ++ if (opt_ret < 0) {
7223 ++ mptcp_debug(KERN_ERR "%s: MPTCP subsocket getsockopt() IP_OPTIONS failed, error %d\n",
7224 ++ __func__, opt_ret);
7225 ++ goto error;
7226 ++ }
7227 ++
7228 ++ /* If socket has no options, it has no stake in this list */
7229 ++ if (opt_len <= 0)
7230 ++ continue;
7231 ++
7232 ++ /* Iterate options buffer */
7233 ++ for (opt_ptr = &opt[0]; opt_ptr < &opt[opt_len]; opt_ptr++) {
7234 ++ if (*opt_ptr == IPOPT_LSRR) {
7235 ++ mptcp_debug("mptcp_get_avail_list_ipv4: LSRR options found\n");
7236 ++ goto sock_lsrr;
7237 ++ }
7238 ++ }
7239 ++ continue;
7240 ++
7241 ++sock_lsrr:
7242 ++ /* Pointer to the 2nd to last address */
7243 ++ opt_end_ptr = opt_ptr+(*(opt_ptr+1))-4;
7244 ++
7245 ++ /* Addresses start 3 bytes after type offset */
7246 ++ opt_ptr += 3;
7247 ++ j = 0;
7248 ++
7249 ++ /* Different length lists cannot be the same */
7250 ++ if ((opt_end_ptr-opt_ptr)/4 != mptcp_gws->len[i])
7251 ++ continue;
7252 ++
7253 ++ /* Iterate if we are still inside options list
7254 ++ * and sysctl list
7255 ++ */
7256 ++ while (opt_ptr < opt_end_ptr && j < mptcp_gws->len[i]) {
7257 ++ /* If there is a different address, this list must
7258 ++ * not be set on this socket
7259 ++ */
7260 ++ if (memcmp(&mptcp_gws->list[i][j], opt_ptr, 4))
7261 ++ break;
7262 ++
7263 ++ /* Jump 4 bytes to next address */
7264 ++ opt_ptr += 4;
7265 ++ j++;
7266 ++ }
7267 ++
7268 ++ /* Reached the end without a differing address, lists
7269 ++ * are therefore identical.
7270 ++ */
7271 ++ if (j == mptcp_gws->len[i]) {
7272 ++ mptcp_debug("mptcp_get_avail_list_ipv4: List already used\n");
7273 ++ list_taken = 1;
7274 ++ break;
7275 ++ }
7276 ++ }
7277 ++
7278 ++ /* Free list found if not taken by a socket */
7279 ++ if (!list_taken) {
7280 ++ mptcp_debug("mptcp_get_avail_list_ipv4: List free\n");
7281 ++ break;
7282 ++ }
7283 ++ }
7284 ++
7285 ++ if (i >= MPTCP_GW_MAX_LISTS)
7286 ++ goto error;
7287 ++
7288 ++ return i;
7289 ++error:
7290 ++ return -1;
7291 ++}
7292 ++
7293 ++/* The list of addresses is parsed each time a new connection is opened,
7294 ++ * to make sure it's up to date. In case of error, all the lists are
7295 ++ * marked as unavailable and the subflow's fingerprint is set to 0.
7296 ++ */
7297 ++static void mptcp_v4_add_lsrr(struct sock *sk, struct in_addr addr)
7298 ++{
7299 ++ int i, j, ret;
7300 ++ unsigned char opt[MAX_IPOPTLEN] = {0};
7301 ++ struct tcp_sock *tp = tcp_sk(sk);
7302 ++ struct binder_priv *fmp = (struct binder_priv *)&tp->mpcb->mptcp_pm[0];
7303 ++
7304 ++ /* Read lock: multiple sockets can read LSRR addresses at the same
7305 ++ * time, but writes are done in mutual exclusion.
7306 ++ * Spin lock: must search for free list for one socket at a time, or
7307 ++ * multiple sockets could take the same list.
7308 ++ */
7309 ++ read_lock(&mptcp_gws_lock);
7310 ++ spin_lock(fmp->flow_lock);
7311 ++
7312 ++ i = mptcp_get_avail_list_ipv4(sk);
7313 ++
7314 ++ /* Execution enters here only if a free path is found.
7315 ++ */
7316 ++ if (i >= 0) {
7317 ++ opt[0] = IPOPT_NOP;
7318 ++ opt[1] = IPOPT_LSRR;
7319 ++ opt[2] = sizeof(mptcp_gws->list[i][0].s_addr) *
7320 ++ (mptcp_gws->len[i] + 1) + 3;
7321 ++ opt[3] = IPOPT_MINOFF;
7322 ++ for (j = 0; j < mptcp_gws->len[i]; ++j)
7323 ++ memcpy(opt + 4 +
7324 ++ (j * sizeof(mptcp_gws->list[i][0].s_addr)),
7325 ++ &mptcp_gws->list[i][j].s_addr,
7326 ++ sizeof(mptcp_gws->list[i][0].s_addr));
7327 ++ /* Final destination must be part of IP_OPTIONS parameter. */
7328 ++ memcpy(opt + 4 + (j * sizeof(addr.s_addr)), &addr.s_addr,
7329 ++ sizeof(addr.s_addr));
7330 ++
7331 ++ /* setsockopt must be inside the lock, otherwise another
7332 ++ * subflow could fail to see that we have taken a list.
7333 ++ */
7334 ++ ret = ip_setsockopt(sk, IPPROTO_IP, IP_OPTIONS, opt,
7335 ++ 4 + sizeof(mptcp_gws->list[i][0].s_addr)
7336 ++ * (mptcp_gws->len[i] + 1));
7337 ++
7338 ++ if (ret < 0) {
7339 ++ mptcp_debug(KERN_ERR "%s: MPTCP subsock setsockopt() IP_OPTIONS failed, error %d\n",
7340 ++ __func__, ret);
7341 ++ }
7342 ++ }
7343 ++
7344 ++ spin_unlock(fmp->flow_lock);
7345 ++ read_unlock(&mptcp_gws_lock);
7346 ++
7347 ++ return;
7348 ++}
7349 ++
7350 ++/* Parses gateways string for a list of paths to different
7351 ++ * gateways, and stores them for use with the Loose Source Routing (LSRR)
7352 ++ * socket option. Each list must have "," separated addresses, and the lists
7353 ++ * themselves must be separated by "-". Returns -1 in case one or more of the
7354 ++ * addresses is not a valid ipv4/6 address.
7355 ++ */
7356 ++static int mptcp_parse_gateway_ipv4(char *gateways)
7357 ++{
7358 ++ int i, j, k, ret;
7359 ++ char *tmp_string = NULL;
7360 ++ struct in_addr tmp_addr;
7361 ++
7362 ++ tmp_string = kzalloc(16, GFP_KERNEL);
7363 ++ if (tmp_string == NULL)
7364 ++ return -ENOMEM;
7365 ++
7366 ++ write_lock(&mptcp_gws_lock);
7367 ++
7368 ++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
7369 ++
7370 ++ /* A TMP string is used since inet_pton needs a null terminated string
7371 ++ * but we do not want to modify the sysctl for obvious reasons.
7372 ++ * i will iterate over the SYSCTL string, j will iterate over the
7373 ++ * temporary string where each IP is copied into, k will iterate over
7374 ++ * the IPs in each list.
7375 ++ */
7376 ++ for (i = j = k = 0;
7377 ++ i < MPTCP_GW_SYSCTL_MAX_LEN && k < MPTCP_GW_MAX_LISTS;
7378 ++ ++i) {
7379 ++ if (gateways[i] == '-' || gateways[i] == ',' || gateways[i] == '\0') {
7380 ++ /* If the temp IP is empty and the current list is
7381 ++ * empty, we are done.
7382 ++ */
7383 ++ if (j == 0 && mptcp_gws->len[k] == 0)
7384 ++ break;
7385 ++
7386 ++ /* Terminate the temp IP string, then if it is
7387 ++ * non-empty parse the IP and copy it.
7388 ++ */
7389 ++ tmp_string[j] = '\0';
7390 ++ if (j > 0) {
7391 ++ mptcp_debug("mptcp_parse_gateway_list tmp: %s i: %d\n", tmp_string, i);
7392 ++
7393 ++ ret = in4_pton(tmp_string, strlen(tmp_string),
7394 ++ (u8 *)&tmp_addr.s_addr, '\0',
7395 ++ NULL);
7396 ++
7397 ++ if (ret) {
7398 ++ mptcp_debug("mptcp_parse_gateway_list ret: %d s_addr: %pI4\n",
7399 ++ ret,
7400 ++ &tmp_addr.s_addr);
7401 ++ memcpy(&mptcp_gws->list[k][mptcp_gws->len[k]].s_addr,
7402 ++ &tmp_addr.s_addr,
7403 ++ sizeof(tmp_addr.s_addr));
7404 ++ mptcp_gws->len[k]++;
7405 ++ j = 0;
7406 ++ tmp_string[j] = '\0';
7407 ++ /* Since we can't impose a limit to
7408 ++ * what the user can input, make sure
7409 ++ * there are not too many IPs in the
7410 ++ * SYSCTL string.
7411 ++ */
7412 ++ if (mptcp_gws->len[k] > MPTCP_GW_LIST_MAX_LEN) {
7413 ++ mptcp_debug("mptcp_parse_gateway_list too many members in list %i: max %i\n",
7414 ++ k,
7415 ++ MPTCP_GW_LIST_MAX_LEN);
7416 ++ goto error;
7417 ++ }
7418 ++ } else {
7419 ++ goto error;
7420 ++ }
7421 ++ }
7422 ++
7423 ++ if (gateways[i] == '-' || gateways[i] == '\0')
7424 ++ ++k;
7425 ++ } else {
7426 ++ tmp_string[j] = gateways[i];
7427 ++ ++j;
7428 ++ }
7429 ++ }
7430 ++
7431 ++ /* Number of flows is number of gateway lists plus master flow */
7432 ++ mptcp_binder_ndiffports = k+1;
7433 ++
7434 ++ write_unlock(&mptcp_gws_lock);
7435 ++ kfree(tmp_string);
7436 ++
7437 ++ return 0;
7438 ++
7439 ++error:
7440 ++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
7441 ++ memset(gateways, 0, sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN);
7442 ++ write_unlock(&mptcp_gws_lock);
7443 ++ kfree(tmp_string);
7444 ++ return -1;
7445 ++}
7446 ++
7447 ++/**
7448 ++ * Create all new subflows by calling mptcp_initX_subsockets
7449 ++ *
7450 ++ * This function uses a goto next_subflow, to allow releasing the lock between
7451 ++ * new subflows and giving other processes a chance to do some work on the
7452 ++ * socket and potentially finishing the communication.
7453 ++ **/
7454 ++static void create_subflow_worker(struct work_struct *work)
7455 ++{
7456 ++ const struct binder_priv *pm_priv = container_of(work,
7457 ++ struct binder_priv,
7458 ++ subflow_work);
7459 ++ struct mptcp_cb *mpcb = pm_priv->mpcb;
7460 ++ struct sock *meta_sk = mpcb->meta_sk;
7461 ++ int iter = 0;
7462 ++
7463 ++next_subflow:
7464 ++ if (iter) {
7465 ++ release_sock(meta_sk);
7466 ++ mutex_unlock(&mpcb->mpcb_mutex);
7467 ++
7468 ++ cond_resched();
7469 ++ }
7470 ++ mutex_lock(&mpcb->mpcb_mutex);
7471 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
7472 ++
7473 ++ iter++;
7474 ++
7475 ++ if (sock_flag(meta_sk, SOCK_DEAD))
7476 ++ goto exit;
7477 ++
7478 ++ if (mpcb->master_sk &&
7479 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
7480 ++ goto exit;
7481 ++
7482 ++ if (mptcp_binder_ndiffports > iter &&
7483 ++ mptcp_binder_ndiffports > mpcb->cnt_subflows) {
7484 ++ struct mptcp_loc4 loc;
7485 ++ struct mptcp_rem4 rem;
7486 ++
7487 ++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
7488 ++ loc.loc4_id = 0;
7489 ++ loc.low_prio = 0;
7490 ++
7491 ++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
7492 ++ rem.port = inet_sk(meta_sk)->inet_dport;
7493 ++ rem.rem4_id = 0; /* Default 0 */
7494 ++
7495 ++ mptcp_init4_subsockets(meta_sk, &loc, &rem);
7496 ++
7497 ++ goto next_subflow;
7498 ++ }
7499 ++
7500 ++exit:
7501 ++ release_sock(meta_sk);
7502 ++ mutex_unlock(&mpcb->mpcb_mutex);
7503 ++ sock_put(meta_sk);
7504 ++}
7505 ++
7506 ++static void binder_new_session(const struct sock *meta_sk)
7507 ++{
7508 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
7509 ++ struct binder_priv *fmp = (struct binder_priv *)&mpcb->mptcp_pm[0];
7510 ++ static DEFINE_SPINLOCK(flow_lock);
7511 ++
7512 ++#if IS_ENABLED(CONFIG_IPV6)
7513 ++ if (meta_sk->sk_family == AF_INET6 &&
7514 ++ !mptcp_v6_is_v4_mapped(meta_sk)) {
7515 ++ mptcp_fallback_default(mpcb);
7516 ++ return;
7517 ++ }
7518 ++#endif
7519 ++
7520 ++ /* Initialize workqueue-struct */
7521 ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
7522 ++ fmp->mpcb = mpcb;
7523 ++
7524 ++ fmp->flow_lock = &flow_lock;
7525 ++}
7526 ++
7527 ++static void binder_create_subflows(struct sock *meta_sk)
7528 ++{
7529 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
7530 ++ struct binder_priv *pm_priv = (struct binder_priv *)&mpcb->mptcp_pm[0];
7531 ++
7532 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
7533 ++ mpcb->send_infinite_mapping ||
7534 ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
7535 ++ return;
7536 ++
7537 ++ if (!work_pending(&pm_priv->subflow_work)) {
7538 ++ sock_hold(meta_sk);
7539 ++ queue_work(mptcp_wq, &pm_priv->subflow_work);
7540 ++ }
7541 ++}
7542 ++
7543 ++static int binder_get_local_id(sa_family_t family, union inet_addr *addr,
7544 ++ struct net *net, bool *low_prio)
7545 ++{
7546 ++ return 0;
7547 ++}
7548 ++
7549 ++/* Callback functions, executed when sysctl net.mptcp.mptcp_binder_gateways is updated.
7550 ++ * Inspired from proc_tcp_congestion_control().
7551 ++ */
7552 ++static int proc_mptcp_gateways(ctl_table *ctl, int write,
7553 ++ void __user *buffer, size_t *lenp,
7554 ++ loff_t *ppos)
7555 ++{
7556 ++ int ret;
7557 ++ ctl_table tbl = {
7558 ++ .maxlen = MPTCP_GW_SYSCTL_MAX_LEN,
7559 ++ };
7560 ++
7561 ++ if (write) {
7562 ++ tbl.data = kzalloc(MPTCP_GW_SYSCTL_MAX_LEN, GFP_KERNEL);
7563 ++ if (tbl.data == NULL)
7564 ++ return -1;
7565 ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
7566 ++ if (ret == 0) {
7567 ++ ret = mptcp_parse_gateway_ipv4(tbl.data);
7568 ++ memcpy(ctl->data, tbl.data, MPTCP_GW_SYSCTL_MAX_LEN);
7569 ++ }
7570 ++ kfree(tbl.data);
7571 ++ } else {
7572 ++ ret = proc_dostring(ctl, write, buffer, lenp, ppos);
7573 ++ }
7574 ++
7575 ++
7576 ++ return ret;
7577 ++}
7578 ++
7579 ++static struct mptcp_pm_ops binder __read_mostly = {
7580 ++ .new_session = binder_new_session,
7581 ++ .fully_established = binder_create_subflows,
7582 ++ .get_local_id = binder_get_local_id,
7583 ++ .init_subsocket_v4 = mptcp_v4_add_lsrr,
7584 ++ .name = "binder",
7585 ++ .owner = THIS_MODULE,
7586 ++};
7587 ++
7588 ++static struct ctl_table binder_table[] = {
7589 ++ {
7590 ++ .procname = "mptcp_binder_gateways",
7591 ++ .data = &sysctl_mptcp_binder_gateways,
7592 ++ .maxlen = sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN,
7593 ++ .mode = 0644,
7594 ++ .proc_handler = &proc_mptcp_gateways
7595 ++ },
7596 ++ { }
7597 ++};
7598 ++
7599 ++struct ctl_table_header *mptcp_sysctl_binder;
7600 ++
7601 ++/* General initialization of MPTCP_PM */
7602 ++static int __init binder_register(void)
7603 ++{
7604 ++ mptcp_gws = kzalloc(sizeof(*mptcp_gws), GFP_KERNEL);
7605 ++ if (!mptcp_gws)
7606 ++ return -ENOMEM;
7607 ++
7608 ++ rwlock_init(&mptcp_gws_lock);
7609 ++
7610 ++ BUILD_BUG_ON(sizeof(struct binder_priv) > MPTCP_PM_SIZE);
7611 ++
7612 ++ mptcp_sysctl_binder = register_net_sysctl(&init_net, "net/mptcp",
7613 ++ binder_table);
7614 ++ if (!mptcp_sysctl_binder)
7615 ++ goto sysctl_fail;
7616 ++
7617 ++ if (mptcp_register_path_manager(&binder))
7618 ++ goto pm_failed;
7619 ++
7620 ++ return 0;
7621 ++
7622 ++pm_failed:
7623 ++ unregister_net_sysctl_table(mptcp_sysctl_binder);
7624 ++sysctl_fail:
7625 ++ kfree(mptcp_gws);
7626 ++
7627 ++ return -1;
7628 ++}
7629 ++
7630 ++static void binder_unregister(void)
7631 ++{
7632 ++ mptcp_unregister_path_manager(&binder);
7633 ++ unregister_net_sysctl_table(mptcp_sysctl_binder);
7634 ++ kfree(mptcp_gws);
7635 ++}
7636 ++
7637 ++module_init(binder_register);
7638 ++module_exit(binder_unregister);
7639 ++
7640 ++MODULE_AUTHOR("Luca Boccassi, Duncan Eastoe, Christoph Paasch (ndiffports)");
7641 ++MODULE_LICENSE("GPL");
7642 ++MODULE_DESCRIPTION("BINDER MPTCP");
7643 ++MODULE_VERSION("0.1");
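
mptcp_v4_add_lsrr() above builds a Loose Source Routing IP_OPTIONS buffer in-kernel: a NOP pad for alignment, the LSRR type/length/pointer triple, the configured gateway hops, and the final destination as the last hop. The stand-alone user-space sketch below builds the same layout and hands it to setsockopt(IP_OPTIONS); the gateway and destination addresses are placeholders from the TEST-NET ranges, not values taken from the patch:

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPOPT_NOP
#define IPOPT_NOP 1
#endif
#ifndef IPOPT_LSRR
#define IPOPT_LSRR 131
#endif
#ifndef IPOPT_MINOFF
#define IPOPT_MINOFF 4
#endif
#ifndef MAX_IPOPTLEN
#define MAX_IPOPTLEN 40
#endif

int main(void)
{
    const char *gateways[] = { "192.0.2.1", "198.51.100.1" }; /* placeholder hops */
    const char *final_dst  = "203.0.113.7";                   /* placeholder peer */
    unsigned char opt[MAX_IPOPTLEN] = { 0 };
    struct in_addr a;
    int i, n = 2, fd;

    opt[0] = IPOPT_NOP;                                       /* pad to 4-byte multiple */
    opt[1] = IPOPT_LSRR;
    opt[2] = (unsigned char)(sizeof(a.s_addr) * (n + 1) + 3); /* type+len+ptr+addrs */
    opt[3] = IPOPT_MINOFF;

    for (i = 0; i < n; i++) {                                 /* gateway hops */
        inet_pton(AF_INET, gateways[i], &a);
        memcpy(opt + 4 + i * sizeof(a.s_addr), &a.s_addr, sizeof(a.s_addr));
    }
    inet_pton(AF_INET, final_dst, &a);                        /* destination is the last hop */
    memcpy(opt + 4 + n * sizeof(a.s_addr), &a.s_addr, sizeof(a.s_addr));

    fd = socket(AF_INET, SOCK_STREAM, 0);
    if (setsockopt(fd, IPPROTO_IP, IP_OPTIONS, opt,
                   4 + sizeof(a.s_addr) * (n + 1)) < 0)
        perror("setsockopt(IP_OPTIONS)");
    close(fd);
    return 0;
}

In the kernel path above the same buffer is applied with ip_setsockopt() while holding the gateway-list spinlock, so two subflows cannot claim the same gateway list concurrently.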
7644 +diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c
7645 +new file mode 100644
7646 +index 000000000000..5d761164eb85
7647 +--- /dev/null
7648 ++++ b/net/mptcp/mptcp_coupled.c
7649 +@@ -0,0 +1,270 @@
7650 ++/*
7651 ++ * MPTCP implementation - Linked Increase congestion control Algorithm (LIA)
7652 ++ *
7653 ++ * Initial Design & Implementation:
7654 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
7655 ++ *
7656 ++ * Current Maintainer & Author:
7657 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
7658 ++ *
7659 ++ * Additional authors:
7660 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
7661 ++ * Gregory Detal <gregory.detal@×××××××××.be>
7662 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
7663 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
7664 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
7665 ++ * Andreas Ripke <ripke@××××××.eu>
7666 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
7667 ++ * Octavian Purdila <octavian.purdila@×××××.com>
7668 ++ * John Ronan <jronan@××××.org>
7669 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
7670 ++ * Brandon Heller <brandonh@××××××××.edu>
7671 ++ *
7672 ++ *
7673 ++ * This program is free software; you can redistribute it and/or
7674 ++ * modify it under the terms of the GNU General Public License
7675 ++ * as published by the Free Software Foundation; either version
7676 ++ * 2 of the License, or (at your option) any later version.
7677 ++ */
7678 ++#include <net/tcp.h>
7679 ++#include <net/mptcp.h>
7680 ++
7681 ++#include <linux/module.h>
7682 ++
7683 ++/* Scaling is done in the numerator with alpha_scale_num and in the denominator
7684 ++ * with alpha_scale_den.
7685 ++ *
7686 ++ * To downscale, we just need to use alpha_scale.
7687 ++ *
7688 ++ * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2)
7689 ++ */
7690 ++static int alpha_scale_den = 10;
7691 ++static int alpha_scale_num = 32;
7692 ++static int alpha_scale = 12;
7693 ++
7694 ++struct mptcp_ccc {
7695 ++ u64 alpha;
7696 ++ bool forced_update;
7697 ++};
7698 ++
7699 ++static inline int mptcp_ccc_sk_can_send(const struct sock *sk)
7700 ++{
7701 ++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
7702 ++}
7703 ++
7704 ++static inline u64 mptcp_get_alpha(const struct sock *meta_sk)
7705 ++{
7706 ++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha;
7707 ++}
7708 ++
7709 ++static inline void mptcp_set_alpha(const struct sock *meta_sk, u64 alpha)
7710 ++{
7711 ++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha = alpha;
7712 ++}
7713 ++
7714 ++static inline u64 mptcp_ccc_scale(u32 val, int scale)
7715 ++{
7716 ++ return (u64) val << scale;
7717 ++}
7718 ++
7719 ++static inline bool mptcp_get_forced(const struct sock *meta_sk)
7720 ++{
7721 ++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update;
7722 ++}
7723 ++
7724 ++static inline void mptcp_set_forced(const struct sock *meta_sk, bool force)
7725 ++{
7726 ++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update = force;
7727 ++}
7728 ++
7729 ++static void mptcp_ccc_recalc_alpha(const struct sock *sk)
7730 ++{
7731 ++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
7732 ++ const struct sock *sub_sk;
7733 ++ int best_cwnd = 0, best_rtt = 0, can_send = 0;
7734 ++ u64 max_numerator = 0, sum_denominator = 0, alpha = 1;
7735 ++
7736 ++ if (!mpcb)
7737 ++ return;
7738 ++
7739 ++ /* Only one subflow left - fall back to normal reno-behavior
7740 ++ * (set alpha to 1)
7741 ++ */
7742 ++ if (mpcb->cnt_established <= 1)
7743 ++ goto exit;
7744 ++
7745 ++ /* Do regular alpha-calculation for multiple subflows */
7746 ++
7747 ++ /* Find the max numerator of the alpha-calculation */
7748 ++ mptcp_for_each_sk(mpcb, sub_sk) {
7749 ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
7750 ++ u64 tmp;
7751 ++
7752 ++ if (!mptcp_ccc_sk_can_send(sub_sk))
7753 ++ continue;
7754 ++
7755 ++ can_send++;
7756 ++
7757 ++ /* We need to look for the path, that provides the max-value.
7758 ++ * Integer-overflow is not possible here, because
7759 ++ * tmp will be in u64.
7760 ++ */
7761 ++ tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd,
7762 ++ alpha_scale_num), (u64)sub_tp->srtt_us * sub_tp->srtt_us);
7763 ++
7764 ++ if (tmp >= max_numerator) {
7765 ++ max_numerator = tmp;
7766 ++ best_cwnd = sub_tp->snd_cwnd;
7767 ++ best_rtt = sub_tp->srtt_us;
7768 ++ }
7769 ++ }
7770 ++
7771 ++ /* No subflow is able to send - we don't care anymore */
7772 ++ if (unlikely(!can_send))
7773 ++ goto exit;
7774 ++
7775 ++ /* Calculate the denominator */
7776 ++ mptcp_for_each_sk(mpcb, sub_sk) {
7777 ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
7778 ++
7779 ++ if (!mptcp_ccc_sk_can_send(sub_sk))
7780 ++ continue;
7781 ++
7782 ++ sum_denominator += div_u64(
7783 ++ mptcp_ccc_scale(sub_tp->snd_cwnd,
7784 ++ alpha_scale_den) * best_rtt,
7785 ++ sub_tp->srtt_us);
7786 ++ }
7787 ++ sum_denominator *= sum_denominator;
7788 ++ if (unlikely(!sum_denominator)) {
7789 ++ pr_err("%s: sum_denominator == 0, cnt_established:%d\n",
7790 ++ __func__, mpcb->cnt_established);
7791 ++ mptcp_for_each_sk(mpcb, sub_sk) {
7792 ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
7793 ++ pr_err("%s: pi:%d, state:%d, rtt:%u, cwnd: %u\n",
7794 ++ __func__, sub_tp->mptcp->path_index,
7795 ++ sub_sk->sk_state, sub_tp->srtt_us,
7796 ++ sub_tp->snd_cwnd);
7797 ++ }
7798 ++ }
7799 ++
7800 ++ alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator);
7801 ++
7802 ++ if (unlikely(!alpha))
7803 ++ alpha = 1;
7804 ++
7805 ++exit:
7806 ++ mptcp_set_alpha(mptcp_meta_sk(sk), alpha);
7807 ++}
7808 ++
7809 ++static void mptcp_ccc_init(struct sock *sk)
7810 ++{
7811 ++ if (mptcp(tcp_sk(sk))) {
7812 ++ mptcp_set_forced(mptcp_meta_sk(sk), 0);
7813 ++ mptcp_set_alpha(mptcp_meta_sk(sk), 1);
7814 ++ }
7815 ++ /* If we do not mptcp, behave like reno: return */
7816 ++}
7817 ++
7818 ++static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event)
7819 ++{
7820 ++ if (event == CA_EVENT_LOSS)
7821 ++ mptcp_ccc_recalc_alpha(sk);
7822 ++}
7823 ++
7824 ++static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state)
7825 ++{
7826 ++ if (!mptcp(tcp_sk(sk)))
7827 ++ return;
7828 ++
7829 ++ mptcp_set_forced(mptcp_meta_sk(sk), 1);
7830 ++}
7831 ++
7832 ++static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked)
7833 ++{
7834 ++ struct tcp_sock *tp = tcp_sk(sk);
7835 ++ const struct mptcp_cb *mpcb = tp->mpcb;
7836 ++ int snd_cwnd;
7837 ++
7838 ++ if (!mptcp(tp)) {
7839 ++ tcp_reno_cong_avoid(sk, ack, acked);
7840 ++ return;
7841 ++ }
7842 ++
7843 ++ if (!tcp_is_cwnd_limited(sk))
7844 ++ return;
7845 ++
7846 ++ if (tp->snd_cwnd <= tp->snd_ssthresh) {
7847 ++ /* In "safe" area, increase. */
7848 ++ tcp_slow_start(tp, acked);
7849 ++ mptcp_ccc_recalc_alpha(sk);
7850 ++ return;
7851 ++ }
7852 ++
7853 ++ if (mptcp_get_forced(mptcp_meta_sk(sk))) {
7854 ++ mptcp_ccc_recalc_alpha(sk);
7855 ++ mptcp_set_forced(mptcp_meta_sk(sk), 0);
7856 ++ }
7857 ++
7858 ++ if (mpcb->cnt_established > 1) {
7859 ++ u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk));
7860 ++
7861 ++ /* This may happen, if at the initialization, the mpcb
7862 ++ * was not yet attached to the sock, and thus
7863 ++ * initializing alpha failed.
7864 ++ */
7865 ++ if (unlikely(!alpha))
7866 ++ alpha = 1;
7867 ++
7868 ++ snd_cwnd = (int) div_u64 ((u64) mptcp_ccc_scale(1, alpha_scale),
7869 ++ alpha);
7870 ++
7871 ++ /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd)
7872 ++ * Thus, we select here the max value.
7873 ++ */
7874 ++ if (snd_cwnd < tp->snd_cwnd)
7875 ++ snd_cwnd = tp->snd_cwnd;
7876 ++ } else {
7877 ++ snd_cwnd = tp->snd_cwnd;
7878 ++ }
7879 ++
7880 ++ if (tp->snd_cwnd_cnt >= snd_cwnd) {
7881 ++ if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
7882 ++ tp->snd_cwnd++;
7883 ++ mptcp_ccc_recalc_alpha(sk);
7884 ++ }
7885 ++
7886 ++ tp->snd_cwnd_cnt = 0;
7887 ++ } else {
7888 ++ tp->snd_cwnd_cnt++;
7889 ++ }
7890 ++}
7891 ++
7892 ++static struct tcp_congestion_ops mptcp_ccc = {
7893 ++ .init = mptcp_ccc_init,
7894 ++ .ssthresh = tcp_reno_ssthresh,
7895 ++ .cong_avoid = mptcp_ccc_cong_avoid,
7896 ++ .cwnd_event = mptcp_ccc_cwnd_event,
7897 ++ .set_state = mptcp_ccc_set_state,
7898 ++ .owner = THIS_MODULE,
7899 ++ .name = "lia",
7900 ++};
7901 ++
7902 ++static int __init mptcp_ccc_register(void)
7903 ++{
7904 ++ BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE);
7905 ++ return tcp_register_congestion_control(&mptcp_ccc);
7906 ++}
7907 ++
7908 ++static void __exit mptcp_ccc_unregister(void)
7909 ++{
7910 ++ tcp_unregister_congestion_control(&mptcp_ccc);
7911 ++}
7912 ++
7913 ++module_init(mptcp_ccc_register);
7914 ++module_exit(mptcp_ccc_unregister);
7915 ++
7916 ++MODULE_AUTHOR("Christoph Paasch, Sébastien Barré");
7917 ++MODULE_LICENSE("GPL");
7918 ++MODULE_DESCRIPTION("MPTCP LINKED INCREASE CONGESTION CONTROL ALGORITHM");
7919 ++MODULE_VERSION("0.1");
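
For orientation, the coupled congestion control above follows RFC 6356 ("Linked
Increases"): alpha = cwnd_total * max_i(cwnd_i / rtt_i^2) / (sum_i cwnd_i / rtt_i)^2.
The code keeps a fixed-point rescaling of this (alpha_scale_num/den, srtt_us as the
RTT estimate), folding the cwnd_total factor in at increase time in
mptcp_ccc_cong_avoid(). Below is a minimal user-space sketch of the formula in
floating point; it is not part of the patch, and the subflow numbers are made up.

/* User-space sketch (not part of the patch) of the RFC 6356 alpha that
 * mptcp_ccc_recalc_alpha() computes above with fixed-point arithmetic.
 * Subflow values are hypothetical.
 */
#include <stdio.h>

struct subflow { double cwnd; double rtt; };

static double lia_alpha(const struct subflow *sf, int n)
{
	double cwnd_total = 0.0, best = 0.0, denom = 0.0;
	int i;

	for (i = 0; i < n; i++) {
		double num = sf[i].cwnd / (sf[i].rtt * sf[i].rtt);

		cwnd_total += sf[i].cwnd;
		if (num > best)
			best = num;		/* best cwnd_i / rtt_i^2 */
		denom += sf[i].cwnd / sf[i].rtt;
	}
	/* alpha = cwnd_total * max_i(cwnd_i/rtt_i^2) / (sum_i cwnd_i/rtt_i)^2 */
	return cwnd_total * best / (denom * denom);
}

int main(void)
{
	struct subflow sf[2] = { { 10.0, 0.040 }, { 20.0, 0.120 } };

	printf("alpha = %f\n", lia_alpha(sf, 2));
	return 0;
}
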
7920 +diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c
7921 +new file mode 100644
7922 +index 000000000000..28dfa0479f5e
7923 +--- /dev/null
7924 ++++ b/net/mptcp/mptcp_ctrl.c
7925 +@@ -0,0 +1,2401 @@
7926 ++/*
7927 ++ * MPTCP implementation - MPTCP-control
7928 ++ *
7929 ++ * Initial Design & Implementation:
7930 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
7931 ++ *
7932 ++ * Current Maintainer & Author:
7933 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
7934 ++ *
7935 ++ * Additional authors:
7936 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
7937 ++ * Gregory Detal <gregory.detal@×××××××××.be>
7938 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
7939 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
7940 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
7941 ++ * Andreas Ripke <ripke@××××××.eu>
7942 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
7943 ++ * Octavian Purdila <octavian.purdila@×××××.com>
7944 ++ * John Ronan <jronan@××××.org>
7945 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
7946 ++ * Brandon Heller <brandonh@××××××××.edu>
7947 ++ *
7948 ++ *
7949 ++ * This program is free software; you can redistribute it and/or
7950 ++ * modify it under the terms of the GNU General Public License
7951 ++ * as published by the Free Software Foundation; either version
7952 ++ * 2 of the License, or (at your option) any later version.
7953 ++ */
7954 ++
7955 ++#include <net/inet_common.h>
7956 ++#include <net/inet6_hashtables.h>
7957 ++#include <net/ipv6.h>
7958 ++#include <net/ip6_checksum.h>
7959 ++#include <net/mptcp.h>
7960 ++#include <net/mptcp_v4.h>
7961 ++#if IS_ENABLED(CONFIG_IPV6)
7962 ++#include <net/ip6_route.h>
7963 ++#include <net/mptcp_v6.h>
7964 ++#endif
7965 ++#include <net/sock.h>
7966 ++#include <net/tcp.h>
7967 ++#include <net/tcp_states.h>
7968 ++#include <net/transp_v6.h>
7969 ++#include <net/xfrm.h>
7970 ++
7971 ++#include <linux/cryptohash.h>
7972 ++#include <linux/kconfig.h>
7973 ++#include <linux/module.h>
7974 ++#include <linux/netpoll.h>
7975 ++#include <linux/list.h>
7976 ++#include <linux/jhash.h>
7977 ++#include <linux/tcp.h>
7978 ++#include <linux/net.h>
7979 ++#include <linux/in.h>
7980 ++#include <linux/random.h>
7981 ++#include <linux/inetdevice.h>
7982 ++#include <linux/workqueue.h>
7983 ++#include <linux/atomic.h>
7984 ++#include <linux/sysctl.h>
7985 ++
7986 ++static struct kmem_cache *mptcp_sock_cache __read_mostly;
7987 ++static struct kmem_cache *mptcp_cb_cache __read_mostly;
7988 ++static struct kmem_cache *mptcp_tw_cache __read_mostly;
7989 ++
7990 ++int sysctl_mptcp_enabled __read_mostly = 1;
7991 ++int sysctl_mptcp_checksum __read_mostly = 1;
7992 ++int sysctl_mptcp_debug __read_mostly;
7993 ++EXPORT_SYMBOL(sysctl_mptcp_debug);
7994 ++int sysctl_mptcp_syn_retries __read_mostly = 3;
7995 ++
7996 ++bool mptcp_init_failed __read_mostly;
7997 ++
7998 ++struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE;
7999 ++EXPORT_SYMBOL(mptcp_static_key);
8000 ++
8001 ++static int proc_mptcp_path_manager(ctl_table *ctl, int write,
8002 ++ void __user *buffer, size_t *lenp,
8003 ++ loff_t *ppos)
8004 ++{
8005 ++ char val[MPTCP_PM_NAME_MAX];
8006 ++ ctl_table tbl = {
8007 ++ .data = val,
8008 ++ .maxlen = MPTCP_PM_NAME_MAX,
8009 ++ };
8010 ++ int ret;
8011 ++
8012 ++ mptcp_get_default_path_manager(val);
8013 ++
8014 ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
8015 ++ if (write && ret == 0)
8016 ++ ret = mptcp_set_default_path_manager(val);
8017 ++ return ret;
8018 ++}
8019 ++
8020 ++static int proc_mptcp_scheduler(ctl_table *ctl, int write,
8021 ++ void __user *buffer, size_t *lenp,
8022 ++ loff_t *ppos)
8023 ++{
8024 ++ char val[MPTCP_SCHED_NAME_MAX];
8025 ++ ctl_table tbl = {
8026 ++ .data = val,
8027 ++ .maxlen = MPTCP_SCHED_NAME_MAX,
8028 ++ };
8029 ++ int ret;
8030 ++
8031 ++ mptcp_get_default_scheduler(val);
8032 ++
8033 ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
8034 ++ if (write && ret == 0)
8035 ++ ret = mptcp_set_default_scheduler(val);
8036 ++ return ret;
8037 ++}
8038 ++
8039 ++static struct ctl_table mptcp_table[] = {
8040 ++ {
8041 ++ .procname = "mptcp_enabled",
8042 ++ .data = &sysctl_mptcp_enabled,
8043 ++ .maxlen = sizeof(int),
8044 ++ .mode = 0644,
8045 ++ .proc_handler = &proc_dointvec
8046 ++ },
8047 ++ {
8048 ++ .procname = "mptcp_checksum",
8049 ++ .data = &sysctl_mptcp_checksum,
8050 ++ .maxlen = sizeof(int),
8051 ++ .mode = 0644,
8052 ++ .proc_handler = &proc_dointvec
8053 ++ },
8054 ++ {
8055 ++ .procname = "mptcp_debug",
8056 ++ .data = &sysctl_mptcp_debug,
8057 ++ .maxlen = sizeof(int),
8058 ++ .mode = 0644,
8059 ++ .proc_handler = &proc_dointvec
8060 ++ },
8061 ++ {
8062 ++ .procname = "mptcp_syn_retries",
8063 ++ .data = &sysctl_mptcp_syn_retries,
8064 ++ .maxlen = sizeof(int),
8065 ++ .mode = 0644,
8066 ++ .proc_handler = &proc_dointvec
8067 ++ },
8068 ++ {
8069 ++ .procname = "mptcp_path_manager",
8070 ++ .mode = 0644,
8071 ++ .maxlen = MPTCP_PM_NAME_MAX,
8072 ++ .proc_handler = proc_mptcp_path_manager,
8073 ++ },
8074 ++ {
8075 ++ .procname = "mptcp_scheduler",
8076 ++ .mode = 0644,
8077 ++ .maxlen = MPTCP_SCHED_NAME_MAX,
8078 ++ .proc_handler = proc_mptcp_scheduler,
8079 ++ },
8080 ++ { }
8081 ++};
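
These knobs are exposed through sysctl. Assuming the table is registered under
net.mptcp, as in the multipath-tcp.org kernels, they appear as
/proc/sys/net/mptcp/mptcp_enabled and friends. A small illustration-only reader
follows; the path is an assumption based on that registration.

/* Illustration only: read the mptcp_enabled sysctl from procfs.  The
 * /proc/sys/net/mptcp/ path assumes the table above is registered under
 * "net.mptcp".
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/mptcp/mptcp_enabled", "r");
	int val;

	if (!f || fscanf(f, "%d", &val) != 1) {
		perror("mptcp_enabled");
		if (f)
			fclose(f);
		return 1;
	}
	printf("mptcp_enabled = %d\n", val);
	fclose(f);
	return 0;
}
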
8082 ++
8083 ++static inline u32 mptcp_hash_tk(u32 token)
8084 ++{
8085 ++ return token % MPTCP_HASH_SIZE;
8086 ++}
8087 ++
8088 ++struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
8089 ++EXPORT_SYMBOL(tk_hashtable);
8090 ++
8091 ++/* This second hashtable is needed to retrieve request socks
8092 ++ * created as a result of a join request. While the SYN contains
8093 ++ * the token, the final ack does not, so we need a separate hashtable
8094 ++ * to retrieve the mpcb.
8095 ++ */
8096 ++struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
8097 ++spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
8098 ++
8099 ++/* The following hash table is used to avoid collision of token */
8100 ++static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE];
8101 ++spinlock_t mptcp_tk_hashlock; /* hashtable protection */
8102 ++
8103 ++static bool mptcp_reqsk_find_tk(const u32 token)
8104 ++{
8105 ++ const u32 hash = mptcp_hash_tk(token);
8106 ++ const struct mptcp_request_sock *mtreqsk;
8107 ++ const struct hlist_nulls_node *node;
8108 ++
8109 ++begin:
8110 ++ hlist_nulls_for_each_entry_rcu(mtreqsk, node,
8111 ++ &mptcp_reqsk_tk_htb[hash], hash_entry) {
8112 ++ if (token == mtreqsk->mptcp_loc_token)
8113 ++ return true;
8114 ++ }
8115 ++ /* A request-socket is destroyed by RCU. So, it might have been recycled
8116 ++ * and put into another hash-table list. So, after the lookup we may
8117 ++ * end up in a different list. So, we may need to restart.
8118 ++ *
8119 ++ * See also the comment in __inet_lookup_established.
8120 ++ */
8121 ++ if (get_nulls_value(node) != hash)
8122 ++ goto begin;
8123 ++ return false;
8124 ++}
8125 ++
8126 ++static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, const u32 token)
8127 ++{
8128 ++ u32 hash = mptcp_hash_tk(token);
8129 ++
8130 ++ hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->hash_entry,
8131 ++ &mptcp_reqsk_tk_htb[hash]);
8132 ++}
8133 ++
8134 ++static void mptcp_reqsk_remove_tk(const struct request_sock *reqsk)
8135 ++{
8136 ++ rcu_read_lock();
8137 ++ spin_lock(&mptcp_tk_hashlock);
8138 ++ hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->hash_entry);
8139 ++ spin_unlock(&mptcp_tk_hashlock);
8140 ++ rcu_read_unlock();
8141 ++}
8142 ++
8143 ++void mptcp_reqsk_destructor(struct request_sock *req)
8144 ++{
8145 ++ if (!mptcp_rsk(req)->is_sub) {
8146 ++ if (in_softirq()) {
8147 ++ mptcp_reqsk_remove_tk(req);
8148 ++ } else {
8149 ++ rcu_read_lock_bh();
8150 ++ spin_lock(&mptcp_tk_hashlock);
8151 ++ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry);
8152 ++ spin_unlock(&mptcp_tk_hashlock);
8153 ++ rcu_read_unlock_bh();
8154 ++ }
8155 ++ } else {
8156 ++ mptcp_hash_request_remove(req);
8157 ++ }
8158 ++}
8159 ++
8160 ++static void __mptcp_hash_insert(struct tcp_sock *meta_tp, const u32 token)
8161 ++{
8162 ++ u32 hash = mptcp_hash_tk(token);
8163 ++ hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]);
8164 ++ meta_tp->inside_tk_table = 1;
8165 ++}
8166 ++
8167 ++static bool mptcp_find_token(u32 token)
8168 ++{
8169 ++ const u32 hash = mptcp_hash_tk(token);
8170 ++ const struct tcp_sock *meta_tp;
8171 ++ const struct hlist_nulls_node *node;
8172 ++
8173 ++begin:
8174 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) {
8175 ++ if (token == meta_tp->mptcp_loc_token)
8176 ++ return true;
8177 ++ }
8178 ++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled
8179 ++ * and put into another hash-table list. So, after the lookup we may
8180 ++ * end up in a different list. So, we may need to restart.
8181 ++ *
8182 ++ * See also the comment in __inet_lookup_established.
8183 ++ */
8184 ++ if (get_nulls_value(node) != hash)
8185 ++ goto begin;
8186 ++ return false;
8187 ++}
8188 ++
8189 ++static void mptcp_set_key_reqsk(struct request_sock *req,
8190 ++ const struct sk_buff *skb)
8191 ++{
8192 ++ const struct inet_request_sock *ireq = inet_rsk(req);
8193 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
8194 ++
8195 ++ if (skb->protocol == htons(ETH_P_IP)) {
8196 ++ mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr,
8197 ++ ip_hdr(skb)->daddr,
8198 ++ htons(ireq->ir_num),
8199 ++ ireq->ir_rmt_port);
8200 ++#if IS_ENABLED(CONFIG_IPV6)
8201 ++ } else {
8202 ++ mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32,
8203 ++ ipv6_hdr(skb)->daddr.s6_addr32,
8204 ++ htons(ireq->ir_num),
8205 ++ ireq->ir_rmt_port);
8206 ++#endif
8207 ++ }
8208 ++
8209 ++ mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
8210 ++}
8211 ++
8212 ++/* New MPTCP-connection request, prepare a new token for the meta-socket that
8213 ++ * will be created in mptcp_check_req_master(), and store the received token.
8214 ++ */
8215 ++void mptcp_reqsk_new_mptcp(struct request_sock *req,
8216 ++ const struct mptcp_options_received *mopt,
8217 ++ const struct sk_buff *skb)
8218 ++{
8219 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
8220 ++
8221 ++ inet_rsk(req)->saw_mpc = 1;
8222 ++
8223 ++ rcu_read_lock();
8224 ++ spin_lock(&mptcp_tk_hashlock);
8225 ++ do {
8226 ++ mptcp_set_key_reqsk(req, skb);
8227 ++ } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
8228 ++ mptcp_find_token(mtreq->mptcp_loc_token));
8229 ++
8230 ++ mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token);
8231 ++ spin_unlock(&mptcp_tk_hashlock);
8232 ++ rcu_read_unlock();
8233 ++ mtreq->mptcp_rem_key = mopt->mptcp_key;
8234 ++}
8235 ++
8236 ++static void mptcp_set_key_sk(const struct sock *sk)
8237 ++{
8238 ++ struct tcp_sock *tp = tcp_sk(sk);
8239 ++ const struct inet_sock *isk = inet_sk(sk);
8240 ++
8241 ++ if (sk->sk_family == AF_INET)
8242 ++ tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr,
8243 ++ isk->inet_daddr,
8244 ++ isk->inet_sport,
8245 ++ isk->inet_dport);
8246 ++#if IS_ENABLED(CONFIG_IPV6)
8247 ++ else
8248 ++ tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32,
8249 ++ sk->sk_v6_daddr.s6_addr32,
8250 ++ isk->inet_sport,
8251 ++ isk->inet_dport);
8252 ++#endif
8253 ++
8254 ++ mptcp_key_sha1(tp->mptcp_loc_key,
8255 ++ &tp->mptcp_loc_token, NULL);
8256 ++}
8257 ++
8258 ++void mptcp_connect_init(struct sock *sk)
8259 ++{
8260 ++ struct tcp_sock *tp = tcp_sk(sk);
8261 ++
8262 ++ rcu_read_lock_bh();
8263 ++ spin_lock(&mptcp_tk_hashlock);
8264 ++ do {
8265 ++ mptcp_set_key_sk(sk);
8266 ++ } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) ||
8267 ++ mptcp_find_token(tp->mptcp_loc_token));
8268 ++
8269 ++ __mptcp_hash_insert(tp, tp->mptcp_loc_token);
8270 ++ spin_unlock(&mptcp_tk_hashlock);
8271 ++ rcu_read_unlock_bh();
8272 ++}
8273 ++
8274 ++/**
8275 ++ * This function increments the refcount of the mpcb struct.
8276 ++ * It is the responsibility of the caller to decrement when releasing
8277 ++ * the structure.
8278 ++ */
8279 ++struct sock *mptcp_hash_find(const struct net *net, const u32 token)
8280 ++{
8281 ++ const u32 hash = mptcp_hash_tk(token);
8282 ++ const struct tcp_sock *meta_tp;
8283 ++ struct sock *meta_sk = NULL;
8284 ++ const struct hlist_nulls_node *node;
8285 ++
8286 ++ rcu_read_lock();
8287 ++begin:
8288 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash],
8289 ++ tk_table) {
8290 ++ meta_sk = (struct sock *)meta_tp;
8291 ++ if (token == meta_tp->mptcp_loc_token &&
8292 ++ net_eq(net, sock_net(meta_sk))) {
8293 ++ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
8294 ++ goto out;
8295 ++ if (unlikely(token != meta_tp->mptcp_loc_token ||
8296 ++ !net_eq(net, sock_net(meta_sk)))) {
8297 ++ sock_gen_put(meta_sk);
8298 ++ goto begin;
8299 ++ }
8300 ++ goto found;
8301 ++ }
8302 ++ }
8303 ++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled
8304 ++ * and put into another hash-table list. So, after the lookup we may
8305 ++ * end up in a different list. So, we may need to restart.
8306 ++ *
8307 ++ * See also the comment in __inet_lookup_established.
8308 ++ */
8309 ++ if (get_nulls_value(node) != hash)
8310 ++ goto begin;
8311 ++out:
8312 ++ meta_sk = NULL;
8313 ++found:
8314 ++ rcu_read_unlock();
8315 ++ return meta_sk;
8316 ++}
8317 ++
8318 ++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp)
8319 ++{
8320 ++ /* remove from the token hashtable */
8321 ++ rcu_read_lock_bh();
8322 ++ spin_lock(&mptcp_tk_hashlock);
8323 ++ hlist_nulls_del_init_rcu(&meta_tp->tk_table);
8324 ++ meta_tp->inside_tk_table = 0;
8325 ++ spin_unlock(&mptcp_tk_hashlock);
8326 ++ rcu_read_unlock_bh();
8327 ++}
8328 ++
8329 ++void mptcp_hash_remove(struct tcp_sock *meta_tp)
8330 ++{
8331 ++ rcu_read_lock();
8332 ++ spin_lock(&mptcp_tk_hashlock);
8333 ++ hlist_nulls_del_init_rcu(&meta_tp->tk_table);
8334 ++ meta_tp->inside_tk_table = 0;
8335 ++ spin_unlock(&mptcp_tk_hashlock);
8336 ++ rcu_read_unlock();
8337 ++}
8338 ++
8339 ++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk)
8340 ++{
8341 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
8342 ++ struct sock *sk, *rttsk = NULL, *lastsk = NULL;
8343 ++ u32 min_time = 0, last_active = 0;
8344 ++
8345 ++ mptcp_for_each_sk(meta_tp->mpcb, sk) {
8346 ++ struct tcp_sock *tp = tcp_sk(sk);
8347 ++ u32 elapsed;
8348 ++
8349 ++ if (!mptcp_sk_can_send_ack(sk) || tp->pf)
8350 ++ continue;
8351 ++
8352 ++ elapsed = keepalive_time_elapsed(tp);
8353 ++
8354 ++ /* We take the one with the lowest RTT within a reasonable
8355 ++ * (meta-RTO)-timeframe
8356 ++ */
8357 ++ if (elapsed < inet_csk(meta_sk)->icsk_rto) {
8358 ++ if (!min_time || tp->srtt_us < min_time) {
8359 ++ min_time = tp->srtt_us;
8360 ++ rttsk = sk;
8361 ++ }
8362 ++ continue;
8363 ++ }
8364 ++
8365 ++ /* Otherwise, we just take the most recent active */
8366 ++ if (!rttsk && (!last_active || elapsed < last_active)) {
8367 ++ last_active = elapsed;
8368 ++ lastsk = sk;
8369 ++ }
8370 ++ }
8371 ++
8372 ++ if (rttsk)
8373 ++ return rttsk;
8374 ++
8375 ++ return lastsk;
8376 ++}
8377 ++EXPORT_SYMBOL(mptcp_select_ack_sock);
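
The selection rule above prefers the lowest-srtt subflow among those that were
active within the meta retransmission timeout, and otherwise falls back to the most
recently active subflow. A user-space sketch of the same rule over hypothetical
subflow samples:

/* Sketch of the ACK-subflow selection above: lowest srtt within the
 * meta-RTO window, else the most recently active subflow.  All numbers
 * are hypothetical.
 */
#include <stdio.h>

struct sub {
	const char *name;
	unsigned int srtt_us;	/* smoothed RTT */
	unsigned int elapsed;	/* time since last activity */
};

static const struct sub *select_ack_sub(const struct sub *s, int n,
					unsigned int meta_rto)
{
	const struct sub *rtt_best = NULL, *last_active = NULL;

	for (int i = 0; i < n; i++) {
		if (s[i].elapsed < meta_rto) {
			if (!rtt_best || s[i].srtt_us < rtt_best->srtt_us)
				rtt_best = &s[i];
			continue;
		}
		if (!rtt_best &&
		    (!last_active || s[i].elapsed < last_active->elapsed))
			last_active = &s[i];
	}
	return rtt_best ? rtt_best : last_active;
}

int main(void)
{
	struct sub subs[2] = {
		{ "wifi", 20000, 50 },
		{ "lte",  60000, 10 },
	};

	printf("ack on: %s\n", select_ack_sub(subs, 2, 200)->name);
	return 0;
}
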
8378 ++
8379 ++static void mptcp_sock_def_error_report(struct sock *sk)
8380 ++{
8381 ++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
8382 ++
8383 ++ if (!sock_flag(sk, SOCK_DEAD))
8384 ++ mptcp_sub_close(sk, 0);
8385 ++
8386 ++ if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd ||
8387 ++ mpcb->send_infinite_mapping) {
8388 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
8389 ++
8390 ++ meta_sk->sk_err = sk->sk_err;
8391 ++ meta_sk->sk_err_soft = sk->sk_err_soft;
8392 ++
8393 ++ if (!sock_flag(meta_sk, SOCK_DEAD))
8394 ++ meta_sk->sk_error_report(meta_sk);
8395 ++
8396 ++ tcp_done(meta_sk);
8397 ++ }
8398 ++
8399 ++ sk->sk_err = 0;
8400 ++ return;
8401 ++}
8402 ++
8403 ++static void mptcp_mpcb_put(struct mptcp_cb *mpcb)
8404 ++{
8405 ++ if (atomic_dec_and_test(&mpcb->mpcb_refcnt)) {
8406 ++ mptcp_cleanup_path_manager(mpcb);
8407 ++ mptcp_cleanup_scheduler(mpcb);
8408 ++ kmem_cache_free(mptcp_cb_cache, mpcb);
8409 ++ }
8410 ++}
8411 ++
8412 ++static void mptcp_sock_destruct(struct sock *sk)
8413 ++{
8414 ++ struct tcp_sock *tp = tcp_sk(sk);
8415 ++
8416 ++ inet_sock_destruct(sk);
8417 ++
8418 ++ if (!is_meta_sk(sk) && !tp->was_meta_sk) {
8419 ++ BUG_ON(!hlist_unhashed(&tp->mptcp->cb_list));
8420 ++
8421 ++ kmem_cache_free(mptcp_sock_cache, tp->mptcp);
8422 ++ tp->mptcp = NULL;
8423 ++
8424 ++ /* Taken when mpcb pointer was set */
8425 ++ sock_put(mptcp_meta_sk(sk));
8426 ++ mptcp_mpcb_put(tp->mpcb);
8427 ++ } else {
8428 ++ struct mptcp_cb *mpcb = tp->mpcb;
8429 ++ struct mptcp_tw *mptw;
8430 ++
8431 ++ /* The mpcb is disappearing - we can make the final
8432 ++ * update to the rcv_nxt of the time-wait-sock and remove
8433 ++ * its reference to the mpcb.
8434 ++ */
8435 ++ spin_lock_bh(&mpcb->tw_lock);
8436 ++ list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) {
8437 ++ list_del_rcu(&mptw->list);
8438 ++ mptw->in_list = 0;
8439 ++ mptcp_mpcb_put(mpcb);
8440 ++ rcu_assign_pointer(mptw->mpcb, NULL);
8441 ++ }
8442 ++ spin_unlock_bh(&mpcb->tw_lock);
8443 ++
8444 ++ mptcp_mpcb_put(mpcb);
8445 ++
8446 ++ mptcp_debug("%s destroying meta-sk\n", __func__);
8447 ++ }
8448 ++
8449 ++ WARN_ON(!static_key_false(&mptcp_static_key));
8450 ++ /* Must be the last call, because is_meta_sk() above still needs the
8451 ++ * static key
8452 ++ */
8453 ++ static_key_slow_dec(&mptcp_static_key);
8454 ++}
8455 ++
8456 ++void mptcp_destroy_sock(struct sock *sk)
8457 ++{
8458 ++ if (is_meta_sk(sk)) {
8459 ++ struct sock *sk_it, *tmpsk;
8460 ++
8461 ++ __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue);
8462 ++ mptcp_purge_ofo_queue(tcp_sk(sk));
8463 ++
8464 ++ /* We have to close all remaining subflows. Normally, they
8465 ++ * should all be about to get closed. But, if the kernel is
8466 ++ * forcing a closure (e.g., tcp_write_err), the subflows might
8467 ++ * not have been closed properly (as we are waiting for the
8468 ++ * DATA_ACK of the DATA_FIN).
8469 ++ */
8470 ++ mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) {
8471 ++			/* tcp_close has already been called - waiting for graceful
8472 ++ * closure, or if we are retransmitting fast-close on
8473 ++ * the subflow. The reset (or timeout) will kill the
8474 ++			 * subflow.
8475 ++ */
8476 ++ if (tcp_sk(sk_it)->closing ||
8477 ++ tcp_sk(sk_it)->send_mp_fclose)
8478 ++ continue;
8479 ++
8480 ++ /* Allow the delayed work first to prevent time-wait state */
8481 ++ if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work))
8482 ++ continue;
8483 ++
8484 ++ mptcp_sub_close(sk_it, 0);
8485 ++ }
8486 ++
8487 ++ mptcp_delete_synack_timer(sk);
8488 ++ } else {
8489 ++ mptcp_del_sock(sk);
8490 ++ }
8491 ++}
8492 ++
8493 ++static void mptcp_set_state(struct sock *sk)
8494 ++{
8495 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
8496 ++
8497 ++ /* Meta is not yet established - wake up the application */
8498 ++ if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) &&
8499 ++ sk->sk_state == TCP_ESTABLISHED) {
8500 ++ tcp_set_state(meta_sk, TCP_ESTABLISHED);
8501 ++
8502 ++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
8503 ++ meta_sk->sk_state_change(meta_sk);
8504 ++ sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT);
8505 ++ }
8506 ++ }
8507 ++
8508 ++ if (sk->sk_state == TCP_ESTABLISHED) {
8509 ++ tcp_sk(sk)->mptcp->establish_increased = 1;
8510 ++ tcp_sk(sk)->mpcb->cnt_established++;
8511 ++ }
8512 ++}
8513 ++
8514 ++void mptcp_init_congestion_control(struct sock *sk)
8515 ++{
8516 ++ struct inet_connection_sock *icsk = inet_csk(sk);
8517 ++ struct inet_connection_sock *meta_icsk = inet_csk(mptcp_meta_sk(sk));
8518 ++ const struct tcp_congestion_ops *ca = meta_icsk->icsk_ca_ops;
8519 ++
8520 ++	/* The application didn't set the congestion control to use;
8521 ++	 * fall back to the default one.
8522 ++ */
8523 ++ if (ca == &tcp_init_congestion_ops)
8524 ++ goto use_default;
8525 ++
8526 ++ /* Use the same congestion control as set by the user. If the
8527 ++	 * module is not available, fall back to the default one.
8528 ++ */
8529 ++ if (!try_module_get(ca->owner)) {
8530 ++ pr_warn("%s: fallback to the system default CC\n", __func__);
8531 ++ goto use_default;
8532 ++ }
8533 ++
8534 ++ icsk->icsk_ca_ops = ca;
8535 ++ if (icsk->icsk_ca_ops->init)
8536 ++ icsk->icsk_ca_ops->init(sk);
8537 ++
8538 ++ return;
8539 ++
8540 ++use_default:
8541 ++ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
8542 ++ tcp_init_congestion_control(sk);
8543 ++}
8544 ++
8545 ++u32 mptcp_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned;
8546 ++u32 mptcp_seed = 0;
8547 ++
8548 ++void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn)
8549 ++{
8550 ++ u32 workspace[SHA_WORKSPACE_WORDS];
8551 ++ u32 mptcp_hashed_key[SHA_DIGEST_WORDS];
8552 ++ u8 input[64];
8553 ++ int i;
8554 ++
8555 ++ memset(workspace, 0, sizeof(workspace));
8556 ++
8557 ++ /* Initialize input with appropriate padding */
8558 ++ memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte
8559 ++ * is explicitly set too
8560 ++ */
8561 ++ memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */
8562 ++ input[8] = 0x80; /* Padding: First bit after message = 1 */
8563 ++ input[63] = 0x40; /* Padding: Length of the message = 64 bits */
8564 ++
8565 ++ sha_init(mptcp_hashed_key);
8566 ++ sha_transform(mptcp_hashed_key, input, workspace);
8567 ++
8568 ++ for (i = 0; i < 5; i++)
8569 ++ mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]);
8570 ++
8571 ++ if (token)
8572 ++ *token = mptcp_hashed_key[0];
8573 ++ if (idsn)
8574 ++ *idsn = *((u64 *)&mptcp_hashed_key[3]);
8575 ++}
8576 ++
8577 ++void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
8578 ++ u32 *hash_out)
8579 ++{
8580 ++ u32 workspace[SHA_WORKSPACE_WORDS];
8581 ++ u8 input[128]; /* 2 512-bit blocks */
8582 ++ int i;
8583 ++
8584 ++ memset(workspace, 0, sizeof(workspace));
8585 ++
8586 ++ /* Generate key xored with ipad */
8587 ++ memset(input, 0x36, 64);
8588 ++ for (i = 0; i < 8; i++)
8589 ++ input[i] ^= key_1[i];
8590 ++ for (i = 0; i < 8; i++)
8591 ++ input[i + 8] ^= key_2[i];
8592 ++
8593 ++ memcpy(&input[64], rand_1, 4);
8594 ++ memcpy(&input[68], rand_2, 4);
8595 ++ input[72] = 0x80; /* Padding: First bit after message = 1 */
8596 ++ memset(&input[73], 0, 53);
8597 ++
8598 ++ /* Padding: Length of the message = 512 + 64 bits */
8599 ++ input[126] = 0x02;
8600 ++ input[127] = 0x40;
8601 ++
8602 ++ sha_init(hash_out);
8603 ++ sha_transform(hash_out, input, workspace);
8604 ++ memset(workspace, 0, sizeof(workspace));
8605 ++
8606 ++ sha_transform(hash_out, &input[64], workspace);
8607 ++ memset(workspace, 0, sizeof(workspace));
8608 ++
8609 ++ for (i = 0; i < 5; i++)
8610 ++ hash_out[i] = cpu_to_be32(hash_out[i]);
8611 ++
8612 ++ /* Prepare second part of hmac */
8613 ++ memset(input, 0x5C, 64);
8614 ++ for (i = 0; i < 8; i++)
8615 ++ input[i] ^= key_1[i];
8616 ++ for (i = 0; i < 8; i++)
8617 ++ input[i + 8] ^= key_2[i];
8618 ++
8619 ++ memcpy(&input[64], hash_out, 20);
8620 ++ input[84] = 0x80;
8621 ++ memset(&input[85], 0, 41);
8622 ++
8623 ++ /* Padding: Length of the message = 512 + 160 bits */
8624 ++ input[126] = 0x02;
8625 ++ input[127] = 0xA0;
8626 ++
8627 ++ sha_init(hash_out);
8628 ++ sha_transform(hash_out, input, workspace);
8629 ++ memset(workspace, 0, sizeof(workspace));
8630 ++
8631 ++ sha_transform(hash_out, &input[64], workspace);
8632 ++
8633 ++ for (i = 0; i < 5; i++)
8634 ++ hash_out[i] = cpu_to_be32(hash_out[i]);
8635 ++}
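
mptcp_key_sha1() above hashes the 64-bit key by hand-building one pre-padded SHA-1
block (the 8 key bytes, the 0x80 terminator, zero fill, and the 64-bit message
length in the last byte), then uses digest word 0 as the 32-bit token and words 3-4
as the 64-bit IDSN. A sketch of just that block layout, illustration only, with a
made-up key:

/* Sketch of the one-block SHA-1 padding built by mptcp_key_sha1(): 8 key
 * bytes, the 0x80 padding terminator, zeroes, and the message length
 * (64 bits -> 0x40) in the final byte.  The key is made up.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint64_t key = 0x0123456789abcdefULL;	/* hypothetical local key */
	uint8_t block[64];

	memset(block, 0, sizeof(block));
	memcpy(block, &key, sizeof(key));	/* message: the raw key */
	block[8]  = 0x80;			/* first padding bit */
	block[63] = 0x40;			/* length field: 64 bits */

	for (int i = 0; i < 64; i++)
		printf("%02x%c", block[i], (i % 16 == 15) ? '\n' : ' ');
	return 0;
}
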
8636 ++
8637 ++static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk)
8638 ++{
8639 ++ /* Socket-options handled by sk_clone_lock while creating the meta-sk.
8640 ++ * ======
8641 ++ * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT,
8642 ++ * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER,
8643 ++ * TCP_NODELAY, TCP_CORK
8644 ++ *
8645 ++ * Socket-options handled in this function here
8646 ++ * ======
8647 ++ * TCP_DEFER_ACCEPT
8648 ++ * SO_KEEPALIVE
8649 ++ *
8650 ++ * Socket-options on the todo-list
8651 ++ * ======
8652 ++ * SO_BINDTODEVICE - should probably prevent creation of new subsocks
8653 ++ * across other devices. - what about the api-draft?
8654 ++ * SO_DEBUG
8655 ++ * SO_REUSEADDR - probably we don't care about this
8656 ++ * SO_DONTROUTE, SO_BROADCAST
8657 ++ * SO_OOBINLINE
8658 ++ * SO_LINGER
8659 ++ * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM
8660 ++ * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM
8661 ++ * SO_RXQ_OVFL
8662 ++ * TCP_COOKIE_TRANSACTIONS
8663 ++ * TCP_MAXSEG
8664 ++ * TCP_THIN_* - Handled by sk_clone_lock, but we need to support this
8665 ++ * in mptcp_retransmit_timer. AND we need to check what is
8666 ++ * about the subsockets.
8667 ++ * TCP_LINGER2
8668 ++ * TCP_WINDOW_CLAMP
8669 ++ * TCP_USER_TIMEOUT
8670 ++ * TCP_MD5SIG
8671 ++ *
8672 ++ * Socket-options of no concern for the meta-socket (but for the subsocket)
8673 ++ * ======
8674 ++ * SO_PRIORITY
8675 ++ * SO_MARK
8676 ++ * TCP_CONGESTION
8677 ++ * TCP_SYNCNT
8678 ++ * TCP_QUICKACK
8679 ++ */
8680 ++
8681 ++ /* DEFER_ACCEPT should not be set on the meta, as we want to accept new subflows directly */
8682 ++ inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0;
8683 ++
8684 ++ /* Keepalives are handled entirely at the MPTCP-layer */
8685 ++ if (sock_flag(meta_sk, SOCK_KEEPOPEN)) {
8686 ++ inet_csk_reset_keepalive_timer(meta_sk,
8687 ++ keepalive_time_when(tcp_sk(meta_sk)));
8688 ++ sock_reset_flag(master_sk, SOCK_KEEPOPEN);
8689 ++ inet_csk_delete_keepalive_timer(master_sk);
8690 ++ }
8691 ++
8692 ++ /* Do not propagate subflow-errors up to the MPTCP-layer */
8693 ++ inet_sk(master_sk)->recverr = 0;
8694 ++}
8695 ++
8696 ++static void mptcp_sub_inherit_sockopts(const struct sock *meta_sk, struct sock *sub_sk)
8697 ++{
8698 ++ /* IP_TOS also goes to the subflow. */
8699 ++ if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) {
8700 ++ inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos;
8701 ++ sub_sk->sk_priority = meta_sk->sk_priority;
8702 ++ sk_dst_reset(sub_sk);
8703 ++ }
8704 ++
8705 ++ /* Inherit SO_REUSEADDR */
8706 ++ sub_sk->sk_reuse = meta_sk->sk_reuse;
8707 ++
8708 ++ /* Inherit snd/rcv-buffer locks */
8709 ++ sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
8710 ++
8711 ++ /* Nagle/Cork is forced off on the subflows. It is handled at the meta-layer */
8712 ++ tcp_sk(sub_sk)->nonagle = TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
8713 ++
8714 ++ /* Keepalives are handled entirely at the MPTCP-layer */
8715 ++ if (sock_flag(sub_sk, SOCK_KEEPOPEN)) {
8716 ++ sock_reset_flag(sub_sk, SOCK_KEEPOPEN);
8717 ++ inet_csk_delete_keepalive_timer(sub_sk);
8718 ++ }
8719 ++
8720 ++ /* Do not propagate subflow-errors up to the MPTCP-layer */
8721 ++ inet_sk(sub_sk)->recverr = 0;
8722 ++}
8723 ++
8724 ++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb)
8725 ++{
8726 ++	/* skb->sk may be NULL if we receive a packet immediately after the
8727 ++ * SYN/ACK + MP_CAPABLE.
8728 ++ */
8729 ++ struct sock *sk = skb->sk ? skb->sk : meta_sk;
8730 ++ int ret = 0;
8731 ++
8732 ++ skb->sk = NULL;
8733 ++
8734 ++ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
8735 ++ kfree_skb(skb);
8736 ++ return 0;
8737 ++ }
8738 ++
8739 ++ if (sk->sk_family == AF_INET)
8740 ++ ret = tcp_v4_do_rcv(sk, skb);
8741 ++#if IS_ENABLED(CONFIG_IPV6)
8742 ++ else
8743 ++ ret = tcp_v6_do_rcv(sk, skb);
8744 ++#endif
8745 ++
8746 ++ sock_put(sk);
8747 ++ return ret;
8748 ++}
8749 ++
8750 ++struct lock_class_key meta_key;
8751 ++struct lock_class_key meta_slock_key;
8752 ++
8753 ++static void mptcp_synack_timer_handler(unsigned long data)
8754 ++{
8755 ++ struct sock *meta_sk = (struct sock *) data;
8756 ++ struct listen_sock *lopt = inet_csk(meta_sk)->icsk_accept_queue.listen_opt;
8757 ++
8758 ++ /* Only process if socket is not in use. */
8759 ++ bh_lock_sock(meta_sk);
8760 ++
8761 ++ if (sock_owned_by_user(meta_sk)) {
8762 ++ /* Try again later. */
8763 ++ mptcp_reset_synack_timer(meta_sk, HZ/20);
8764 ++ goto out;
8765 ++ }
8766 ++
8767 ++ /* May happen if the queue got destructed in mptcp_close */
8768 ++ if (!lopt)
8769 ++ goto out;
8770 ++
8771 ++ inet_csk_reqsk_queue_prune(meta_sk, TCP_SYNQ_INTERVAL,
8772 ++ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
8773 ++
8774 ++ if (lopt->qlen)
8775 ++ mptcp_reset_synack_timer(meta_sk, TCP_SYNQ_INTERVAL);
8776 ++
8777 ++out:
8778 ++ bh_unlock_sock(meta_sk);
8779 ++ sock_put(meta_sk);
8780 ++}
8781 ++
8782 ++static const struct tcp_sock_ops mptcp_meta_specific = {
8783 ++ .__select_window = __mptcp_select_window,
8784 ++ .select_window = mptcp_select_window,
8785 ++ .select_initial_window = mptcp_select_initial_window,
8786 ++ .init_buffer_space = mptcp_init_buffer_space,
8787 ++ .set_rto = mptcp_tcp_set_rto,
8788 ++ .should_expand_sndbuf = mptcp_should_expand_sndbuf,
8789 ++ .init_congestion_control = mptcp_init_congestion_control,
8790 ++ .send_fin = mptcp_send_fin,
8791 ++ .write_xmit = mptcp_write_xmit,
8792 ++ .send_active_reset = mptcp_send_active_reset,
8793 ++ .write_wakeup = mptcp_write_wakeup,
8794 ++ .prune_ofo_queue = mptcp_prune_ofo_queue,
8795 ++ .retransmit_timer = mptcp_retransmit_timer,
8796 ++ .time_wait = mptcp_time_wait,
8797 ++ .cleanup_rbuf = mptcp_cleanup_rbuf,
8798 ++};
8799 ++
8800 ++static const struct tcp_sock_ops mptcp_sub_specific = {
8801 ++ .__select_window = __mptcp_select_window,
8802 ++ .select_window = mptcp_select_window,
8803 ++ .select_initial_window = mptcp_select_initial_window,
8804 ++ .init_buffer_space = mptcp_init_buffer_space,
8805 ++ .set_rto = mptcp_tcp_set_rto,
8806 ++ .should_expand_sndbuf = mptcp_should_expand_sndbuf,
8807 ++ .init_congestion_control = mptcp_init_congestion_control,
8808 ++ .send_fin = tcp_send_fin,
8809 ++ .write_xmit = tcp_write_xmit,
8810 ++ .send_active_reset = tcp_send_active_reset,
8811 ++ .write_wakeup = tcp_write_wakeup,
8812 ++ .prune_ofo_queue = tcp_prune_ofo_queue,
8813 ++ .retransmit_timer = tcp_retransmit_timer,
8814 ++ .time_wait = tcp_time_wait,
8815 ++ .cleanup_rbuf = tcp_cleanup_rbuf,
8816 ++};
8817 ++
8818 ++static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, u32 window)
8819 ++{
8820 ++ struct mptcp_cb *mpcb;
8821 ++ struct sock *master_sk;
8822 ++ struct inet_connection_sock *master_icsk, *meta_icsk = inet_csk(meta_sk);
8823 ++ struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk);
8824 ++ u64 idsn;
8825 ++
8826 ++ dst_release(meta_sk->sk_rx_dst);
8827 ++ meta_sk->sk_rx_dst = NULL;
8828 ++	/* This flag is set to tell sock_lock_init to
8829 ++ * reclassify the lock-class of the master socket.
8830 ++ */
8831 ++ meta_tp->is_master_sk = 1;
8832 ++ master_sk = sk_clone_lock(meta_sk, GFP_ATOMIC | __GFP_ZERO);
8833 ++ meta_tp->is_master_sk = 0;
8834 ++ if (!master_sk)
8835 ++ return -ENOBUFS;
8836 ++
8837 ++ master_tp = tcp_sk(master_sk);
8838 ++ master_icsk = inet_csk(master_sk);
8839 ++
8840 ++ mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC);
8841 ++ if (!mpcb) {
8842 ++		/* sk_free (and __sk_free) requires wmem_alloc to be 1.
8843 ++ * All the rest is set to 0 thanks to __GFP_ZERO above.
8844 ++ */
8845 ++ atomic_set(&master_sk->sk_wmem_alloc, 1);
8846 ++ sk_free(master_sk);
8847 ++ return -ENOBUFS;
8848 ++ }
8849 ++
8850 ++#if IS_ENABLED(CONFIG_IPV6)
8851 ++ if (meta_icsk->icsk_af_ops == &mptcp_v6_mapped) {
8852 ++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
8853 ++
8854 ++ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
8855 ++
8856 ++ newnp = inet6_sk(master_sk);
8857 ++ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
8858 ++
8859 ++ newnp->ipv6_mc_list = NULL;
8860 ++ newnp->ipv6_ac_list = NULL;
8861 ++ newnp->ipv6_fl_list = NULL;
8862 ++ newnp->opt = NULL;
8863 ++ newnp->pktoptions = NULL;
8864 ++ (void)xchg(&newnp->rxpmtu, NULL);
8865 ++ } else if (meta_sk->sk_family == AF_INET6) {
8866 ++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
8867 ++
8868 ++ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
8869 ++
8870 ++ newnp = inet6_sk(master_sk);
8871 ++ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
8872 ++
8873 ++ newnp->hop_limit = -1;
8874 ++ newnp->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
8875 ++ newnp->mc_loop = 1;
8876 ++ newnp->pmtudisc = IPV6_PMTUDISC_WANT;
8877 ++ newnp->ipv6only = sock_net(master_sk)->ipv6.sysctl.bindv6only;
8878 ++ }
8879 ++#endif
8880 ++
8881 ++ meta_tp->mptcp = NULL;
8882 ++
8883 ++ /* Store the keys and generate the peer's token */
8884 ++ mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key;
8885 ++ mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token;
8886 ++
8887 ++ /* Generate Initial data-sequence-numbers */
8888 ++ mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn);
8889 ++ idsn = ntohll(idsn) + 1;
8890 ++ mpcb->snd_high_order[0] = idsn >> 32;
8891 ++ mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1;
8892 ++
8893 ++ meta_tp->write_seq = (u32)idsn;
8894 ++ meta_tp->snd_sml = meta_tp->write_seq;
8895 ++ meta_tp->snd_una = meta_tp->write_seq;
8896 ++ meta_tp->snd_nxt = meta_tp->write_seq;
8897 ++ meta_tp->pushed_seq = meta_tp->write_seq;
8898 ++ meta_tp->snd_up = meta_tp->write_seq;
8899 ++
8900 ++ mpcb->mptcp_rem_key = remote_key;
8901 ++ mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn);
8902 ++ idsn = ntohll(idsn) + 1;
8903 ++ mpcb->rcv_high_order[0] = idsn >> 32;
8904 ++ mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1;
8905 ++ meta_tp->copied_seq = (u32) idsn;
8906 ++ meta_tp->rcv_nxt = (u32) idsn;
8907 ++ meta_tp->rcv_wup = (u32) idsn;
8908 ++
8909 ++ meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
8910 ++ meta_tp->snd_wnd = window;
8911 ++ meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */
8912 ++
8913 ++ meta_tp->packets_out = 0;
8914 ++ meta_icsk->icsk_probes_out = 0;
8915 ++
8916 ++ /* Set mptcp-pointers */
8917 ++ master_tp->mpcb = mpcb;
8918 ++ master_tp->meta_sk = meta_sk;
8919 ++ meta_tp->mpcb = mpcb;
8920 ++ meta_tp->meta_sk = meta_sk;
8921 ++ mpcb->meta_sk = meta_sk;
8922 ++ mpcb->master_sk = master_sk;
8923 ++
8924 ++ meta_tp->was_meta_sk = 0;
8925 ++
8926 ++ /* Initialize the queues */
8927 ++ skb_queue_head_init(&mpcb->reinject_queue);
8928 ++ skb_queue_head_init(&master_tp->out_of_order_queue);
8929 ++ tcp_prequeue_init(master_tp);
8930 ++ INIT_LIST_HEAD(&master_tp->tsq_node);
8931 ++
8932 ++ master_tp->tsq_flags = 0;
8933 ++
8934 ++ mutex_init(&mpcb->mpcb_mutex);
8935 ++
8936 ++	/* Init the accept_queue structure. We support a queue of 32 pending
8937 ++	 * connections; it does not need to be huge, since we only store
8938 ++	 * pending subflow creations here.
8939 ++ */
8940 ++ if (reqsk_queue_alloc(&meta_icsk->icsk_accept_queue, 32, GFP_ATOMIC)) {
8941 ++ inet_put_port(master_sk);
8942 ++ kmem_cache_free(mptcp_cb_cache, mpcb);
8943 ++ sk_free(master_sk);
8944 ++ return -ENOMEM;
8945 ++ }
8946 ++
8947 ++ /* Redefine function-pointers as the meta-sk is now fully ready */
8948 ++ static_key_slow_inc(&mptcp_static_key);
8949 ++ meta_tp->mpc = 1;
8950 ++ meta_tp->ops = &mptcp_meta_specific;
8951 ++
8952 ++ meta_sk->sk_backlog_rcv = mptcp_backlog_rcv;
8953 ++ meta_sk->sk_destruct = mptcp_sock_destruct;
8954 ++
8955 ++ /* Meta-level retransmit timer */
8956 ++ meta_icsk->icsk_rto *= 2; /* Double of initial - rto */
8957 ++
8958 ++ tcp_init_xmit_timers(master_sk);
8959 ++ /* Has been set for sending out the SYN */
8960 ++ inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS);
8961 ++
8962 ++ if (!meta_tp->inside_tk_table) {
8963 ++ /* Adding the meta_tp in the token hashtable - coming from server-side */
8964 ++ rcu_read_lock();
8965 ++ spin_lock(&mptcp_tk_hashlock);
8966 ++
8967 ++ __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token);
8968 ++
8969 ++ spin_unlock(&mptcp_tk_hashlock);
8970 ++ rcu_read_unlock();
8971 ++ }
8972 ++ master_tp->inside_tk_table = 0;
8973 ++
8974 ++ /* Init time-wait stuff */
8975 ++ INIT_LIST_HEAD(&mpcb->tw_list);
8976 ++ spin_lock_init(&mpcb->tw_lock);
8977 ++
8978 ++ INIT_HLIST_HEAD(&mpcb->callback_list);
8979 ++
8980 ++ mptcp_mpcb_inherit_sockopts(meta_sk, master_sk);
8981 ++
8982 ++ mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf;
8983 ++ mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf;
8984 ++ mpcb->orig_window_clamp = meta_tp->window_clamp;
8985 ++
8986 ++ /* The meta is directly linked - set refcnt to 1 */
8987 ++ atomic_set(&mpcb->mpcb_refcnt, 1);
8988 ++
8989 ++ mptcp_init_path_manager(mpcb);
8990 ++ mptcp_init_scheduler(mpcb);
8991 ++
8992 ++ setup_timer(&mpcb->synack_timer, mptcp_synack_timer_handler,
8993 ++ (unsigned long)meta_sk);
8994 ++
8995 ++ mptcp_debug("%s: created mpcb with token %#x\n",
8996 ++ __func__, mpcb->mptcp_loc_token);
8997 ++
8998 ++ return 0;
8999 ++}
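
The IDSN handling above (SHA-1 of the key, incremented, then split) seeds the meta
sequence numbers: the low 32 bits become write_seq/rcv_nxt and the high 32 bits are
stored in snd_high_order[]/rcv_high_order[], so that full 64-bit data sequence
numbers can be rebuilt from the 32-bit values used on the wire. A small sketch of
the split; the IDSN value is made up:

/* Sketch of the IDSN split in mptcp_alloc_mpcb() above.  The 64-bit IDSN
 * is SHA-1-derived in the patch; here it is hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t idsn = 0x1122334455667788ULL + 1;
	uint32_t high = (uint32_t)(idsn >> 32);	/* snd_high_order[0] */
	uint32_t seq  = (uint32_t)idsn;		/* initial write_seq */

	printf("high order = %#x, initial seq = %#x\n", high, seq);
	printf("full DSN rebuilt = %#llx\n",
	       ((unsigned long long)high << 32) | seq);
	return 0;
}
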
9000 ++
9001 ++void mptcp_fallback_meta_sk(struct sock *meta_sk)
9002 ++{
9003 ++ kfree(inet_csk(meta_sk)->icsk_accept_queue.listen_opt);
9004 ++ kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb);
9005 ++}
9006 ++
9007 ++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
9008 ++ gfp_t flags)
9009 ++{
9010 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
9011 ++ struct tcp_sock *tp = tcp_sk(sk);
9012 ++
9013 ++ tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags);
9014 ++ if (!tp->mptcp)
9015 ++ return -ENOMEM;
9016 ++
9017 ++ tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb);
9018 ++ /* No more space for more subflows? */
9019 ++ if (!tp->mptcp->path_index) {
9020 ++ kmem_cache_free(mptcp_sock_cache, tp->mptcp);
9021 ++ return -EPERM;
9022 ++ }
9023 ++
9024 ++ INIT_HLIST_NODE(&tp->mptcp->cb_list);
9025 ++
9026 ++ tp->mptcp->tp = tp;
9027 ++ tp->mpcb = mpcb;
9028 ++ tp->meta_sk = meta_sk;
9029 ++
9030 ++ static_key_slow_inc(&mptcp_static_key);
9031 ++ tp->mpc = 1;
9032 ++ tp->ops = &mptcp_sub_specific;
9033 ++
9034 ++ tp->mptcp->loc_id = loc_id;
9035 ++ tp->mptcp->rem_id = rem_id;
9036 ++ if (mpcb->sched_ops->init)
9037 ++ mpcb->sched_ops->init(sk);
9038 ++
9039 ++ /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be
9040 ++ * included in mptcp_del_sock(), because the mpcb must remain alive
9041 ++ * until the last subsocket is completely destroyed.
9042 ++ */
9043 ++ sock_hold(meta_sk);
9044 ++ atomic_inc(&mpcb->mpcb_refcnt);
9045 ++
9046 ++ tp->mptcp->next = mpcb->connection_list;
9047 ++ mpcb->connection_list = tp;
9048 ++ tp->mptcp->attached = 1;
9049 ++
9050 ++ mpcb->cnt_subflows++;
9051 ++ atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc),
9052 ++ &meta_sk->sk_rmem_alloc);
9053 ++
9054 ++ mptcp_sub_inherit_sockopts(meta_sk, sk);
9055 ++ INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq);
9056 ++
9057 ++ /* As we successfully allocated the mptcp_tcp_sock, we have to
9058 ++ * change the function-pointers here (for sk_destruct to work correctly)
9059 ++ */
9060 ++ sk->sk_error_report = mptcp_sock_def_error_report;
9061 ++ sk->sk_data_ready = mptcp_data_ready;
9062 ++ sk->sk_write_space = mptcp_write_space;
9063 ++ sk->sk_state_change = mptcp_set_state;
9064 ++ sk->sk_destruct = mptcp_sock_destruct;
9065 ++
9066 ++ if (sk->sk_family == AF_INET)
9067 ++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n",
9068 ++ __func__ , mpcb->mptcp_loc_token,
9069 ++ tp->mptcp->path_index,
9070 ++ &((struct inet_sock *)tp)->inet_saddr,
9071 ++ ntohs(((struct inet_sock *)tp)->inet_sport),
9072 ++ &((struct inet_sock *)tp)->inet_daddr,
9073 ++ ntohs(((struct inet_sock *)tp)->inet_dport),
9074 ++ mpcb->cnt_subflows);
9075 ++#if IS_ENABLED(CONFIG_IPV6)
9076 ++ else
9077 ++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n",
9078 ++ __func__ , mpcb->mptcp_loc_token,
9079 ++ tp->mptcp->path_index, &inet6_sk(sk)->saddr,
9080 ++ ntohs(((struct inet_sock *)tp)->inet_sport),
9081 ++ &sk->sk_v6_daddr,
9082 ++ ntohs(((struct inet_sock *)tp)->inet_dport),
9083 ++ mpcb->cnt_subflows);
9084 ++#endif
9085 ++
9086 ++ return 0;
9087 ++}
9088 ++
9089 ++void mptcp_del_sock(struct sock *sk)
9090 ++{
9091 ++ struct tcp_sock *tp = tcp_sk(sk), *tp_prev;
9092 ++ struct mptcp_cb *mpcb;
9093 ++
9094 ++ if (!tp->mptcp || !tp->mptcp->attached)
9095 ++ return;
9096 ++
9097 ++ mpcb = tp->mpcb;
9098 ++ tp_prev = mpcb->connection_list;
9099 ++
9100 ++ mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n",
9101 ++ __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
9102 ++ sk->sk_state, is_meta_sk(sk));
9103 ++
9104 ++ if (tp_prev == tp) {
9105 ++ mpcb->connection_list = tp->mptcp->next;
9106 ++ } else {
9107 ++ for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) {
9108 ++ if (tp_prev->mptcp->next == tp) {
9109 ++ tp_prev->mptcp->next = tp->mptcp->next;
9110 ++ break;
9111 ++ }
9112 ++ }
9113 ++ }
9114 ++ mpcb->cnt_subflows--;
9115 ++ if (tp->mptcp->establish_increased)
9116 ++ mpcb->cnt_established--;
9117 ++
9118 ++ tp->mptcp->next = NULL;
9119 ++ tp->mptcp->attached = 0;
9120 ++ mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index);
9121 ++
9122 ++ if (!skb_queue_empty(&sk->sk_write_queue))
9123 ++ mptcp_reinject_data(sk, 0);
9124 ++
9125 ++ if (is_master_tp(tp))
9126 ++ mpcb->master_sk = NULL;
9127 ++ else if (tp->mptcp->pre_established)
9128 ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
9129 ++
9130 ++ rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL);
9131 ++}
9132 ++
9133 ++/* Updates the metasocket ULID/port data, based on the given sock.
9134 ++ * The argument sock must be the sock accessible to the application.
9135 ++ * In this function, we update the meta socket info, based on the changes
9136 ++ * in the application socket (bind, address allocation, ...)
9137 ++ */
9138 ++void mptcp_update_metasocket(struct sock *sk, const struct sock *meta_sk)
9139 ++{
9140 ++ if (tcp_sk(sk)->mpcb->pm_ops->new_session)
9141 ++ tcp_sk(sk)->mpcb->pm_ops->new_session(meta_sk);
9142 ++
9143 ++ tcp_sk(sk)->mptcp->send_mp_prio = tcp_sk(sk)->mptcp->low_prio;
9144 ++}
9145 ++
9146 ++/* Clean up the receive buffer for full frames taken by the user,
9147 ++ * then send an ACK if necessary. COPIED is the number of bytes
9148 ++ * tcp_recvmsg has given to the user so far, it speeds up the
9149 ++ * calculation of whether or not we must ACK for the sake of
9150 ++ * a window update.
9151 ++ */
9152 ++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied)
9153 ++{
9154 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
9155 ++ struct sock *sk;
9156 ++ __u32 rcv_window_now = 0;
9157 ++
9158 ++ if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) {
9159 ++ rcv_window_now = tcp_receive_window(meta_tp);
9160 ++
9161 ++ if (2 * rcv_window_now > meta_tp->window_clamp)
9162 ++ rcv_window_now = 0;
9163 ++ }
9164 ++
9165 ++ mptcp_for_each_sk(meta_tp->mpcb, sk) {
9166 ++ struct tcp_sock *tp = tcp_sk(sk);
9167 ++ const struct inet_connection_sock *icsk = inet_csk(sk);
9168 ++
9169 ++ if (!mptcp_sk_can_send_ack(sk))
9170 ++ continue;
9171 ++
9172 ++ if (!inet_csk_ack_scheduled(sk))
9173 ++ goto second_part;
9174 ++ /* Delayed ACKs frequently hit locked sockets during bulk
9175 ++ * receive.
9176 ++ */
9177 ++ if (icsk->icsk_ack.blocked ||
9178 ++ /* Once-per-two-segments ACK was not sent by tcp_input.c */
9179 ++ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
9180 ++ /* If this read emptied read buffer, we send ACK, if
9181 ++ * connection is not bidirectional, user drained
9182 ++ * receive buffer and there was a small segment
9183 ++ * in queue.
9184 ++ */
9185 ++ (copied > 0 &&
9186 ++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
9187 ++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
9188 ++ !icsk->icsk_ack.pingpong)) &&
9189 ++ !atomic_read(&meta_sk->sk_rmem_alloc))) {
9190 ++ tcp_send_ack(sk);
9191 ++ continue;
9192 ++ }
9193 ++
9194 ++second_part:
9195 ++ /* This here is the second part of tcp_cleanup_rbuf */
9196 ++ if (rcv_window_now) {
9197 ++ __u32 new_window = tp->ops->__select_window(sk);
9198 ++
9199 ++ /* Send ACK now, if this read freed lots of space
9200 ++ * in our buffer. Certainly, new_window is new window.
9201 ++ * We can advertise it now, if it is not less than
9202 ++ * current one.
9203 ++ * "Lots" means "at least twice" here.
9204 ++ */
9205 ++ if (new_window && new_window >= 2 * rcv_window_now)
9206 ++ tcp_send_ack(sk);
9207 ++ }
9208 ++ }
9209 ++}
9210 ++
9211 ++static int mptcp_sub_send_fin(struct sock *sk)
9212 ++{
9213 ++ struct tcp_sock *tp = tcp_sk(sk);
9214 ++ struct sk_buff *skb = tcp_write_queue_tail(sk);
9215 ++ int mss_now;
9216 ++
9217 ++ /* Optimization, tack on the FIN if we have a queue of
9218 ++ * unsent frames. But be careful about outgoing SACKS
9219 ++ * and IP options.
9220 ++ */
9221 ++ mss_now = tcp_current_mss(sk);
9222 ++
9223 ++ if (tcp_send_head(sk) != NULL) {
9224 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
9225 ++ TCP_SKB_CB(skb)->end_seq++;
9226 ++ tp->write_seq++;
9227 ++ } else {
9228 ++ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC);
9229 ++ if (!skb)
9230 ++ return 1;
9231 ++
9232 ++ /* Reserve space for headers and prepare control bits. */
9233 ++ skb_reserve(skb, MAX_TCP_HEADER);
9234 ++ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
9235 ++ tcp_init_nondata_skb(skb, tp->write_seq,
9236 ++ TCPHDR_ACK | TCPHDR_FIN);
9237 ++ tcp_queue_skb(sk, skb);
9238 ++ }
9239 ++ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
9240 ++
9241 ++ return 0;
9242 ++}
9243 ++
9244 ++void mptcp_sub_close_wq(struct work_struct *work)
9245 ++{
9246 ++ struct tcp_sock *tp = container_of(work, struct mptcp_tcp_sock, work.work)->tp;
9247 ++ struct sock *sk = (struct sock *)tp;
9248 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
9249 ++
9250 ++ mutex_lock(&tp->mpcb->mpcb_mutex);
9251 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
9252 ++
9253 ++ if (sock_flag(sk, SOCK_DEAD))
9254 ++ goto exit;
9255 ++
9256 ++ /* We come from tcp_disconnect. We are sure that meta_sk is set */
9257 ++ if (!mptcp(tp)) {
9258 ++ tp->closing = 1;
9259 ++ sock_rps_reset_flow(sk);
9260 ++ tcp_close(sk, 0);
9261 ++ goto exit;
9262 ++ }
9263 ++
9264 ++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) {
9265 ++ tp->closing = 1;
9266 ++ sock_rps_reset_flow(sk);
9267 ++ tcp_close(sk, 0);
9268 ++ } else if (tcp_close_state(sk)) {
9269 ++ sk->sk_shutdown |= SEND_SHUTDOWN;
9270 ++ tcp_send_fin(sk);
9271 ++ }
9272 ++
9273 ++exit:
9274 ++ release_sock(meta_sk);
9275 ++ mutex_unlock(&tp->mpcb->mpcb_mutex);
9276 ++ sock_put(sk);
9277 ++}
9278 ++
9279 ++void mptcp_sub_close(struct sock *sk, unsigned long delay)
9280 ++{
9281 ++ struct tcp_sock *tp = tcp_sk(sk);
9282 ++ struct delayed_work *work = &tcp_sk(sk)->mptcp->work;
9283 ++
9284 ++ /* We are already closing - e.g., call from sock_def_error_report upon
9285 ++ * tcp_disconnect in tcp_close.
9286 ++ */
9287 ++ if (tp->closing)
9288 ++ return;
9289 ++
9290 ++ /* Work already scheduled ? */
9291 ++ if (work_pending(&work->work)) {
9292 ++ /* Work present - who will be first ? */
9293 ++ if (jiffies + delay > work->timer.expires)
9294 ++ return;
9295 ++
9296 ++ /* Try canceling - if it fails, work will be executed soon */
9297 ++ if (!cancel_delayed_work(work))
9298 ++ return;
9299 ++ sock_put(sk);
9300 ++ }
9301 ++
9302 ++ if (!delay) {
9303 ++ unsigned char old_state = sk->sk_state;
9304 ++
9305 ++ /* If we are in user-context we can directly do the closing
9306 ++ * procedure. No need to schedule a work-queue.
9307 ++ */
9308 ++ if (!in_softirq()) {
9309 ++ if (sock_flag(sk, SOCK_DEAD))
9310 ++ return;
9311 ++
9312 ++ if (!mptcp(tp)) {
9313 ++ tp->closing = 1;
9314 ++ sock_rps_reset_flow(sk);
9315 ++ tcp_close(sk, 0);
9316 ++ return;
9317 ++ }
9318 ++
9319 ++ if (mptcp_meta_sk(sk)->sk_shutdown == SHUTDOWN_MASK ||
9320 ++ sk->sk_state == TCP_CLOSE) {
9321 ++ tp->closing = 1;
9322 ++ sock_rps_reset_flow(sk);
9323 ++ tcp_close(sk, 0);
9324 ++ } else if (tcp_close_state(sk)) {
9325 ++ sk->sk_shutdown |= SEND_SHUTDOWN;
9326 ++ tcp_send_fin(sk);
9327 ++ }
9328 ++
9329 ++ return;
9330 ++ }
9331 ++
9332 ++		/* We send the FIN directly, because it may take a long time
9333 ++		 * until the work-queue gets scheduled...
9334 ++ *
9335 ++ * If mptcp_sub_send_fin returns 1, it failed and thus we reset
9336 ++ * the old state so that tcp_close will finally send the fin
9337 ++ * in user-context.
9338 ++ */
9339 ++ if (!sk->sk_err && old_state != TCP_CLOSE &&
9340 ++ tcp_close_state(sk) && mptcp_sub_send_fin(sk)) {
9341 ++ if (old_state == TCP_ESTABLISHED)
9342 ++ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
9343 ++ sk->sk_state = old_state;
9344 ++ }
9345 ++ }
9346 ++
9347 ++ sock_hold(sk);
9348 ++ queue_delayed_work(mptcp_wq, work, delay);
9349 ++}
9350 ++
9351 ++void mptcp_sub_force_close(struct sock *sk)
9352 ++{
9353 ++	/* The below tcp_done may have freed the socket, if it is already dead.
9354 ++ * Thus, we are not allowed to access it afterwards. That's why
9355 ++ * we have to store the dead-state in this local variable.
9356 ++ */
9357 ++ int sock_is_dead = sock_flag(sk, SOCK_DEAD);
9358 ++
9359 ++ tcp_sk(sk)->mp_killed = 1;
9360 ++
9361 ++ if (sk->sk_state != TCP_CLOSE)
9362 ++ tcp_done(sk);
9363 ++
9364 ++ if (!sock_is_dead)
9365 ++ mptcp_sub_close(sk, 0);
9366 ++}
9367 ++EXPORT_SYMBOL(mptcp_sub_force_close);
9368 ++
9369 ++/* Update the mpcb send window, based on the contributions
9370 ++ * of each subflow
9371 ++ */
9372 ++void mptcp_update_sndbuf(const struct tcp_sock *tp)
9373 ++{
9374 ++ struct sock *meta_sk = tp->meta_sk, *sk;
9375 ++ int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf;
9376 ++
9377 ++ mptcp_for_each_sk(tp->mpcb, sk) {
9378 ++ if (!mptcp_sk_can_send(sk))
9379 ++ continue;
9380 ++
9381 ++ new_sndbuf += sk->sk_sndbuf;
9382 ++
9383 ++ if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) {
9384 ++ new_sndbuf = sysctl_tcp_wmem[2];
9385 ++ break;
9386 ++ }
9387 ++ }
9388 ++ meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf);
9389 ++
9390 ++ /* The subflow's call to sk_write_space in tcp_new_space ends up in
9391 ++ * mptcp_write_space.
9392 ++ * It has nothing to do with waking up the application.
9393 ++ * So, we do it here.
9394 ++ */
9395 ++ if (old_sndbuf != meta_sk->sk_sndbuf)
9396 ++ meta_sk->sk_write_space(meta_sk);
9397 ++}
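
The rule above sizes the meta send buffer as the sum of the sendable subflows'
sk_sndbuf, clamped to sysctl_tcp_wmem[2] and never shrunk below the current meta
value. A user-space sketch with hypothetical numbers:

/* Sketch of the meta send-buffer rule in mptcp_update_sndbuf(): sum the
 * (sendable) subflows' sndbufs, clamp to the tcp_wmem[2] maximum, never
 * shrink the meta value.  All numbers are hypothetical.
 */
#include <stdio.h>

static int meta_sndbuf(const int *sub, int n, int wmem_max, int cur)
{
	long sum = 0;

	for (int i = 0; i < n; i++) {
		sum += sub[i];
		if (sum > wmem_max) {		/* clamp, also vs. overflow */
			sum = wmem_max;
			break;
		}
	}
	return (int)sum > cur ? (int)sum : cur;	/* max(min(sum, max), cur) */
}

int main(void)
{
	int subs[2] = { 87380, 131072 };

	printf("meta sndbuf = %d\n", meta_sndbuf(subs, 2, 4194304, 16384));
	return 0;
}
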
9398 ++
9399 ++void mptcp_close(struct sock *meta_sk, long timeout)
9400 ++{
9401 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
9402 ++ struct sock *sk_it, *tmpsk;
9403 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
9404 ++ struct sk_buff *skb;
9405 ++ int data_was_unread = 0;
9406 ++ int state;
9407 ++
9408 ++ mptcp_debug("%s: Close of meta_sk with tok %#x\n",
9409 ++ __func__, mpcb->mptcp_loc_token);
9410 ++
9411 ++ mutex_lock(&mpcb->mpcb_mutex);
9412 ++ lock_sock(meta_sk);
9413 ++
9414 ++ if (meta_tp->inside_tk_table) {
9415 ++ /* Detach the mpcb from the token hashtable */
9416 ++ mptcp_hash_remove_bh(meta_tp);
9417 ++ reqsk_queue_destroy(&inet_csk(meta_sk)->icsk_accept_queue);
9418 ++ }
9419 ++
9420 ++ meta_sk->sk_shutdown = SHUTDOWN_MASK;
9421 ++ /* We need to flush the recv. buffs. We do this only on the
9422 ++ * descriptor close, not protocol-sourced closes, because the
9423 ++ * reader process may not have drained the data yet!
9424 ++ */
9425 ++ while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) {
9426 ++ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
9427 ++ tcp_hdr(skb)->fin;
9428 ++ data_was_unread += len;
9429 ++ __kfree_skb(skb);
9430 ++ }
9431 ++
9432 ++ sk_mem_reclaim(meta_sk);
9433 ++
9434 ++ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
9435 ++ if (meta_sk->sk_state == TCP_CLOSE) {
9436 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
9437 ++ if (tcp_sk(sk_it)->send_mp_fclose)
9438 ++ continue;
9439 ++ mptcp_sub_close(sk_it, 0);
9440 ++ }
9441 ++ goto adjudge_to_death;
9442 ++ }
9443 ++
9444 ++ if (data_was_unread) {
9445 ++ /* Unread data was tossed, zap the connection. */
9446 ++ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE);
9447 ++ tcp_set_state(meta_sk, TCP_CLOSE);
9448 ++ tcp_sk(meta_sk)->ops->send_active_reset(meta_sk,
9449 ++ meta_sk->sk_allocation);
9450 ++ } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) {
9451 ++ /* Check zero linger _after_ checking for unread data. */
9452 ++ meta_sk->sk_prot->disconnect(meta_sk, 0);
9453 ++ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
9454 ++ } else if (tcp_close_state(meta_sk)) {
9455 ++ mptcp_send_fin(meta_sk);
9456 ++ } else if (meta_tp->snd_una == meta_tp->write_seq) {
9457 ++ /* The DATA_FIN has been sent and acknowledged
9458 ++ * (e.g., by sk_shutdown). Close all the other subflows
9459 ++ */
9460 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
9461 ++ unsigned long delay = 0;
9462 ++ /* If we are the passive closer, don't trigger
9463 ++ * subflow-fin until the subflow has been finned
9464 ++ * by the peer. - thus we add a delay
9465 ++ */
9466 ++ if (mpcb->passive_close &&
9467 ++ sk_it->sk_state == TCP_ESTABLISHED)
9468 ++ delay = inet_csk(sk_it)->icsk_rto << 3;
9469 ++
9470 ++ mptcp_sub_close(sk_it, delay);
9471 ++ }
9472 ++ }
9473 ++
9474 ++ sk_stream_wait_close(meta_sk, timeout);
9475 ++
9476 ++adjudge_to_death:
9477 ++ state = meta_sk->sk_state;
9478 ++ sock_hold(meta_sk);
9479 ++ sock_orphan(meta_sk);
9480 ++
9481 ++ /* socket will be freed after mptcp_close - we have to prevent
9482 ++ * access from the subflows.
9483 ++ */
9484 ++ mptcp_for_each_sk(mpcb, sk_it) {
9485 ++ /* Similar to sock_orphan, but we don't set it DEAD, because
9486 ++ * the callbacks are still set and must be called.
9487 ++ */
9488 ++ write_lock_bh(&sk_it->sk_callback_lock);
9489 ++ sk_set_socket(sk_it, NULL);
9490 ++ sk_it->sk_wq = NULL;
9491 ++ write_unlock_bh(&sk_it->sk_callback_lock);
9492 ++ }
9493 ++
9494 ++ /* It is the last release_sock in its life. It will remove backlog. */
9495 ++ release_sock(meta_sk);
9496 ++
9497 ++ /* Now socket is owned by kernel and we acquire BH lock
9498 ++ * to finish close. No need to check for user refs.
9499 ++ */
9500 ++ local_bh_disable();
9501 ++ bh_lock_sock(meta_sk);
9502 ++ WARN_ON(sock_owned_by_user(meta_sk));
9503 ++
9504 ++ percpu_counter_inc(meta_sk->sk_prot->orphan_count);
9505 ++
9506 ++ /* Have we already been destroyed by a softirq or backlog? */
9507 ++ if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE)
9508 ++ goto out;
9509 ++
9510 ++	/* This is a (useful) BSD violation of the RFC. There is a
9511 ++ * problem with TCP as specified in that the other end could
9512 ++ * keep a socket open forever with no application left this end.
9513 ++ * We use a 3 minute timeout (about the same as BSD) then kill
9514 ++ * our end. If they send after that then tough - BUT: long enough
9515 ++ * that we won't make the old 4*rto = almost no time - whoops
9516 ++ * reset mistake.
9517 ++ *
9518 ++ * Nope, it was not mistake. It is really desired behaviour
9519 ++ * f.e. on http servers, when such sockets are useless, but
9520 ++ * consume significant resources. Let's do it with special
9521 ++ * linger2 option. --ANK
9522 ++ */
9523 ++
9524 ++ if (meta_sk->sk_state == TCP_FIN_WAIT2) {
9525 ++ if (meta_tp->linger2 < 0) {
9526 ++ tcp_set_state(meta_sk, TCP_CLOSE);
9527 ++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
9528 ++ NET_INC_STATS_BH(sock_net(meta_sk),
9529 ++ LINUX_MIB_TCPABORTONLINGER);
9530 ++ } else {
9531 ++ const int tmo = tcp_fin_time(meta_sk);
9532 ++
9533 ++ if (tmo > TCP_TIMEWAIT_LEN) {
9534 ++ inet_csk_reset_keepalive_timer(meta_sk,
9535 ++ tmo - TCP_TIMEWAIT_LEN);
9536 ++ } else {
9537 ++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2,
9538 ++ tmo);
9539 ++ goto out;
9540 ++ }
9541 ++ }
9542 ++ }
9543 ++ if (meta_sk->sk_state != TCP_CLOSE) {
9544 ++ sk_mem_reclaim(meta_sk);
9545 ++ if (tcp_too_many_orphans(meta_sk, 0)) {
9546 ++ if (net_ratelimit())
9547 ++				pr_info("MPTCP: too many orphaned sockets\n");
9548 ++ tcp_set_state(meta_sk, TCP_CLOSE);
9549 ++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
9550 ++ NET_INC_STATS_BH(sock_net(meta_sk),
9551 ++ LINUX_MIB_TCPABORTONMEMORY);
9552 ++ }
9553 ++ }
9554 ++
9555 ++
9556 ++ if (meta_sk->sk_state == TCP_CLOSE)
9557 ++ inet_csk_destroy_sock(meta_sk);
9558 ++ /* Otherwise, socket is reprieved until protocol close. */
9559 ++
9560 ++out:
9561 ++ bh_unlock_sock(meta_sk);
9562 ++ local_bh_enable();
9563 ++ mutex_unlock(&mpcb->mpcb_mutex);
9564 ++ sock_put(meta_sk); /* Taken by sock_hold */
9565 ++}
9566 ++
9567 ++void mptcp_disconnect(struct sock *sk)
9568 ++{
9569 ++ struct sock *subsk, *tmpsk;
9570 ++ struct tcp_sock *tp = tcp_sk(sk);
9571 ++
9572 ++ mptcp_delete_synack_timer(sk);
9573 ++
9574 ++ __skb_queue_purge(&tp->mpcb->reinject_queue);
9575 ++
9576 ++ if (tp->inside_tk_table) {
9577 ++ mptcp_hash_remove_bh(tp);
9578 ++ reqsk_queue_destroy(&inet_csk(tp->meta_sk)->icsk_accept_queue);
9579 ++ }
9580 ++
9581 ++ local_bh_disable();
9582 ++ mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) {
9583 ++ /* The socket will get removed from the subsocket-list
9584 ++ * and made non-mptcp by setting mpc to 0.
9585 ++ *
9586 ++ * This is necessary, because tcp_disconnect assumes
9587 ++ * that the connection is completely dead afterwards.
9588 ++ * Thus we need to do a mptcp_del_sock. Due to this call
9589 ++ * we have to make it non-mptcp.
9590 ++ *
9591 ++ * We have to lock the socket, because we set mpc to 0.
9592 ++ * An incoming packet would take the subsocket's lock
9593 ++ * and go on into the receive-path.
9594 ++ * This would be a race.
9595 ++ */
9596 ++
9597 ++ bh_lock_sock(subsk);
9598 ++ mptcp_del_sock(subsk);
9599 ++ tcp_sk(subsk)->mpc = 0;
9600 ++ tcp_sk(subsk)->ops = &tcp_specific;
9601 ++ mptcp_sub_force_close(subsk);
9602 ++ bh_unlock_sock(subsk);
9603 ++ }
9604 ++ local_bh_enable();
9605 ++
9606 ++ tp->was_meta_sk = 1;
9607 ++ tp->mpc = 0;
9608 ++ tp->ops = &tcp_specific;
9609 ++}
9610 ++
9611 ++
9612 ++/* Returns 1 if we should enable MPTCP for that socket. */
9613 ++int mptcp_doit(struct sock *sk)
9614 ++{
9615 ++ /* Do not allow MPTCP enabling if the MPTCP initialization failed */
9616 ++ if (mptcp_init_failed)
9617 ++ return 0;
9618 ++
9619 ++ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
9620 ++ return 0;
9621 ++
9622 ++ /* Socket may already be established (e.g., called from tcp_recvmsg) */
9623 ++ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->request_mptcp)
9624 ++ return 1;
9625 ++
9626 ++ /* Don't do mptcp over loopback */
9627 ++ if (sk->sk_family == AF_INET &&
9628 ++ (ipv4_is_loopback(inet_sk(sk)->inet_daddr) ||
9629 ++ ipv4_is_loopback(inet_sk(sk)->inet_saddr)))
9630 ++ return 0;
9631 ++#if IS_ENABLED(CONFIG_IPV6)
9632 ++ if (sk->sk_family == AF_INET6 &&
9633 ++ (ipv6_addr_loopback(&sk->sk_v6_daddr) ||
9634 ++ ipv6_addr_loopback(&inet6_sk(sk)->saddr)))
9635 ++ return 0;
9636 ++#endif
9637 ++ if (mptcp_v6_is_v4_mapped(sk) &&
9638 ++ ipv4_is_loopback(inet_sk(sk)->inet_saddr))
9639 ++ return 0;
9640 ++
9641 ++#ifdef CONFIG_TCP_MD5SIG
9642 ++ /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */
9643 ++ if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk))
9644 ++ return 0;
9645 ++#endif
9646 ++
9647 ++ return 1;
9648 ++}
9649 ++
9650 ++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window)
9651 ++{
9652 ++ struct tcp_sock *master_tp;
9653 ++ struct sock *master_sk;
9654 ++
9655 ++ if (mptcp_alloc_mpcb(meta_sk, remote_key, window))
9656 ++ goto err_alloc_mpcb;
9657 ++
9658 ++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
9659 ++ master_tp = tcp_sk(master_sk);
9660 ++
9661 ++ if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC))
9662 ++ goto err_add_sock;
9663 ++
9664 ++ if (__inet_inherit_port(meta_sk, master_sk) < 0)
9665 ++ goto err_add_sock;
9666 ++
9667 ++ meta_sk->sk_prot->unhash(meta_sk);
9668 ++
9669 ++ if (master_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(master_sk))
9670 ++ __inet_hash_nolisten(master_sk, NULL);
9671 ++#if IS_ENABLED(CONFIG_IPV6)
9672 ++ else
9673 ++ __inet6_hash(master_sk, NULL);
9674 ++#endif
9675 ++
9676 ++ master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd;
9677 ++
9678 ++ return 0;
9679 ++
9680 ++err_add_sock:
9681 ++ mptcp_fallback_meta_sk(meta_sk);
9682 ++
9683 ++ inet_csk_prepare_forced_close(master_sk);
9684 ++ tcp_done(master_sk);
9685 ++ inet_csk_prepare_forced_close(meta_sk);
9686 ++ tcp_done(meta_sk);
9687 ++
9688 ++err_alloc_mpcb:
9689 ++ return -ENOBUFS;
9690 ++}
9691 ++
9692 ++static int __mptcp_check_req_master(struct sock *child,
9693 ++ struct request_sock *req)
9694 ++{
9695 ++ struct tcp_sock *child_tp = tcp_sk(child);
9696 ++ struct sock *meta_sk = child;
9697 ++ struct mptcp_cb *mpcb;
9698 ++ struct mptcp_request_sock *mtreq;
9699 ++
9700 ++ /* Never contained an MP_CAPABLE */
9701 ++ if (!inet_rsk(req)->mptcp_rqsk)
9702 ++ return 1;
9703 ++
9704 ++ if (!inet_rsk(req)->saw_mpc) {
9705 ++ /* Fall back to regular TCP, because we saw a SYN without
9706 ++ * MP_CAPABLE. In tcp_check_req we continue the regular path.
9707 ++ * But, the socket has been added to the reqsk_tk_htb, so we
9708 ++ * must still remove it.
9709 ++ */
9710 ++ mptcp_reqsk_remove_tk(req);
9711 ++ return 1;
9712 ++ }
9713 ++
9714 ++ /* Just set these values to pass them to mptcp_alloc_mpcb */
9715 ++ mtreq = mptcp_rsk(req);
9716 ++ child_tp->mptcp_loc_key = mtreq->mptcp_loc_key;
9717 ++ child_tp->mptcp_loc_token = mtreq->mptcp_loc_token;
9718 ++
9719 ++ if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key,
9720 ++ child_tp->snd_wnd))
9721 ++ return -ENOBUFS;
9722 ++
9723 ++ child = tcp_sk(child)->mpcb->master_sk;
9724 ++ child_tp = tcp_sk(child);
9725 ++ mpcb = child_tp->mpcb;
9726 ++
9727 ++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
9728 ++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
9729 ++
9730 ++ mpcb->dss_csum = mtreq->dss_csum;
9731 ++ mpcb->server_side = 1;
9732 ++
9733 ++ /* Will be moved to ESTABLISHED by tcp_rcv_state_process() */
9734 ++ mptcp_update_metasocket(child, meta_sk);
9735 ++
9736 ++ /* Needs to be done here additionally, because when accepting a
9737 ++ * new connection we pass by __reqsk_free and not reqsk_free.
9738 ++ */
9739 ++ mptcp_reqsk_remove_tk(req);
9740 ++
9741 ++ /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */
9742 ++ sock_put(meta_sk);
9743 ++
9744 ++ return 0;
9745 ++}
9746 ++
9747 ++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req)
9748 ++{
9749 ++ struct sock *meta_sk = child, *master_sk;
9750 ++ struct sk_buff *skb;
9751 ++ u32 new_mapping;
9752 ++ int ret;
9753 ++
9754 ++ ret = __mptcp_check_req_master(child, req);
9755 ++ if (ret)
9756 ++ return ret;
9757 ++
9758 ++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
9759 ++
9760 ++ /* We need to rewind copied_seq as it is set to IDSN + 1 and as we have
9761 ++ * pre-MPTCP data in the receive queue.
9762 ++ */
9763 ++ tcp_sk(meta_sk)->copied_seq -= tcp_sk(master_sk)->rcv_nxt -
9764 ++ tcp_rsk(req)->rcv_isn - 1;
9765 ++
9766 ++ /* Map subflow sequence numbers to data sequence numbers. We need to map
9767 ++ * these data to [IDSN - len - 1, IDSN).
9768 ++ */
9769 ++ new_mapping = tcp_sk(meta_sk)->copied_seq - tcp_rsk(req)->rcv_isn - 1;
9770 ++
9771 ++ /* There should be only one skb: the SYN + data. */
9772 ++ skb_queue_walk(&meta_sk->sk_receive_queue, skb) {
9773 ++ TCP_SKB_CB(skb)->seq += new_mapping;
9774 ++ TCP_SKB_CB(skb)->end_seq += new_mapping;
9775 ++ }
9776 ++
9777 ++ /* With fastopen we change the semantics of the relative subflow
9778 ++ * sequence numbers to deal with middleboxes that could add/remove
9779 ++ * multiple bytes in the SYN. We chose to start counting at rcv_nxt - 1
9780 ++ * instead of the regular TCP ISN.
9781 ++ */
9782 ++ tcp_sk(master_sk)->mptcp->rcv_isn = tcp_sk(master_sk)->rcv_nxt - 1;
9783 ++
9784 ++ /* We need to update copied_seq of the master_sk to account for the
9785 ++ * already moved data to the meta receive queue.
9786 ++ */
9787 ++ tcp_sk(master_sk)->copied_seq = tcp_sk(master_sk)->rcv_nxt;
9788 ++
9789 ++ /* Handled by the master_sk */
9790 ++ tcp_sk(meta_sk)->fastopen_rsk = NULL;
9791 ++
9792 ++ return 0;
9793 ++}
9794 ++
9795 ++int mptcp_check_req_master(struct sock *sk, struct sock *child,
9796 ++ struct request_sock *req,
9797 ++ struct request_sock **prev)
9798 ++{
9799 ++ struct sock *meta_sk = child;
9800 ++ int ret;
9801 ++
9802 ++ ret = __mptcp_check_req_master(child, req);
9803 ++ if (ret)
9804 ++ return ret;
9805 ++
9806 ++ inet_csk_reqsk_queue_unlink(sk, req, prev);
9807 ++ inet_csk_reqsk_queue_removed(sk, req);
9808 ++ inet_csk_reqsk_queue_add(sk, req, meta_sk);
9809 ++
9810 ++ return 0;
9811 ++}
9812 ++
9813 ++struct sock *mptcp_check_req_child(struct sock *meta_sk, struct sock *child,
9814 ++ struct request_sock *req,
9815 ++ struct request_sock **prev,
9816 ++ const struct mptcp_options_received *mopt)
9817 ++{
9818 ++ struct tcp_sock *child_tp = tcp_sk(child);
9819 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
9820 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
9821 ++ u8 hash_mac_check[20];
9822 ++
9823 ++ child_tp->inside_tk_table = 0;
9824 ++
9825 ++ if (!mopt->join_ack)
9826 ++ goto teardown;
9827 ++
9828 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
9829 ++ (u8 *)&mpcb->mptcp_loc_key,
9830 ++ (u8 *)&mtreq->mptcp_rem_nonce,
9831 ++ (u8 *)&mtreq->mptcp_loc_nonce,
9832 ++ (u32 *)hash_mac_check);
9833 ++
9834 ++ if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20))
9835 ++ goto teardown;
9836 ++
9837 ++ /* Point it to the same struct socket and wq as the meta_sk */
9838 ++ sk_set_socket(child, meta_sk->sk_socket);
9839 ++ child->sk_wq = meta_sk->sk_wq;
9840 ++
9841 ++ if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) {
9842 ++ /* Has been inherited, but now child_tp->mptcp is NULL */
9843 ++ child_tp->mpc = 0;
9844 ++ child_tp->ops = &tcp_specific;
9845 ++
9846 ++ /* TODO when we support acking the third ack for new subflows,
9847 ++ * we should silently discard this third ack, by returning NULL.
9848 ++ *
9849 ++ * Maybe, at the retransmission we will have enough memory to
9850 ++ * fully add the socket to the meta-sk.
9851 ++ */
9852 ++ goto teardown;
9853 ++ }
9854 ++
9855 ++ /* The child is a clone of the meta socket; we must now reset
9856 ++ * some of the fields.
9857 ++ */
9858 ++ child_tp->mptcp->rcv_low_prio = mtreq->rcv_low_prio;
9859 ++
9860 ++ /* We should allow proper increase of the snd/rcv-buffers. Thus, we
9861 ++ * use the original values instead of the bloated up ones from the
9862 ++ * clone.
9863 ++ */
9864 ++ child->sk_sndbuf = mpcb->orig_sk_sndbuf;
9865 ++ child->sk_rcvbuf = mpcb->orig_sk_rcvbuf;
9866 ++
9867 ++ child_tp->mptcp->slave_sk = 1;
9868 ++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
9869 ++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
9870 ++ child_tp->mptcp->init_rcv_wnd = req->rcv_wnd;
9871 ++
9872 ++ child_tp->tsq_flags = 0;
9873 ++
9874 ++ /* Subflows do not use the accept queue, as they
9875 ++ * are attached immediately to the mpcb.
9876 ++ */
9877 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
9878 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
9879 ++ reqsk_free(req);
9880 ++ return child;
9881 ++
9882 ++teardown:
9883 ++ /* Drop this request - sock creation failed. */
9884 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
9885 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
9886 ++ reqsk_free(req);
9887 ++ inet_csk_prepare_forced_close(child);
9888 ++ tcp_done(child);
9889 ++ return meta_sk;
9890 ++}
9891 ++
9892 ++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw)
9893 ++{
9894 ++ struct mptcp_tw *mptw;
9895 ++ struct tcp_sock *tp = tcp_sk(sk);
9896 ++ struct mptcp_cb *mpcb = tp->mpcb;
9897 ++
9898 ++ /* A subsocket in tw can only receive data. So, if we are in
9899 ++ * infinite-receive, then we should not reply with a data-ack or act
9900 ++ * upon general MPTCP-signaling. We prevent this by simply not creating
9901 ++ * the mptcp_tw_sock.
9902 ++ */
9903 ++ if (mpcb->infinite_mapping_rcv) {
9904 ++ tw->mptcp_tw = NULL;
9905 ++ return 0;
9906 ++ }
9907 ++
9908 ++ /* Alloc MPTCP-tw-sock */
9909 ++ mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC);
9910 ++ if (!mptw)
9911 ++ return -ENOBUFS;
9912 ++
9913 ++ atomic_inc(&mpcb->mpcb_refcnt);
9914 ++
9915 ++ tw->mptcp_tw = mptw;
9916 ++ mptw->loc_key = mpcb->mptcp_loc_key;
9917 ++ mptw->meta_tw = mpcb->in_time_wait;
9918 ++ if (mptw->meta_tw) {
9919 ++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
9920 ++ if (mpcb->mptw_state != TCP_TIME_WAIT)
9921 ++ mptw->rcv_nxt++;
9922 ++ }
9923 ++ rcu_assign_pointer(mptw->mpcb, mpcb);
9924 ++
9925 ++ spin_lock(&mpcb->tw_lock);
9926 ++ list_add_rcu(&mptw->list, &tp->mpcb->tw_list);
9927 ++ mptw->in_list = 1;
9928 ++ spin_unlock(&mpcb->tw_lock);
9929 ++
9930 ++ return 0;
9931 ++}
9932 ++
9933 ++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw)
9934 ++{
9935 ++ struct mptcp_cb *mpcb;
9936 ++
9937 ++ rcu_read_lock();
9938 ++ mpcb = rcu_dereference(tw->mptcp_tw->mpcb);
9939 ++
9940 ++ /* If we are still holding a ref to the mpcb, we have to remove ourselves
9941 ++ * from the list and drop the ref properly.
9942 ++ */
9943 ++ if (mpcb && atomic_inc_not_zero(&mpcb->mpcb_refcnt)) {
9944 ++ spin_lock(&mpcb->tw_lock);
9945 ++ if (tw->mptcp_tw->in_list) {
9946 ++ list_del_rcu(&tw->mptcp_tw->list);
9947 ++ tw->mptcp_tw->in_list = 0;
9948 ++ }
9949 ++ spin_unlock(&mpcb->tw_lock);
9950 ++
9951 ++ /* Twice, because we increased it above */
9952 ++ mptcp_mpcb_put(mpcb);
9953 ++ mptcp_mpcb_put(mpcb);
9954 ++ }
9955 ++
9956 ++ rcu_read_unlock();
9957 ++
9958 ++ kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw);
9959 ++}
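
mptcp_twsk_destructor() relies on atomic_inc_not_zero() as a conditional "try-get": the time-wait side only walks tw_list if it can still take a reference on the mpcb, and then drops that reference twice (the one it just took plus its own). A minimal user-space sketch of the inc-not-zero idiom with C11 atomics - hypothetical obj/refcnt names, not the MPTCP structures - looks like this:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdlib.h>

    /* Hypothetical refcounted object; only the try-get idiom is illustrated. */
    struct obj {
            atomic_int refcnt;
    };

    /* Take a reference only if the count has not already dropped to zero. */
    static bool obj_get_not_zero(struct obj *o)
    {
            int old = atomic_load(&o->refcnt);

            while (old != 0) {
                    /* On failure, 'old' is reloaded with the current value. */
                    if (atomic_compare_exchange_weak(&o->refcnt, &old, old + 1))
                            return true;
            }
            return false;   /* object is already dying - do not touch it */
    }

    static void obj_put(struct obj *o)
    {
            if (atomic_fetch_sub(&o->refcnt, 1) == 1)
                    free(o);        /* last reference gone */
    }
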
9960 ++
9961 ++/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a
9962 ++ * data-fin.
9963 ++ */
9964 ++void mptcp_time_wait(struct sock *sk, int state, int timeo)
9965 ++{
9966 ++ struct tcp_sock *tp = tcp_sk(sk);
9967 ++ struct mptcp_tw *mptw;
9968 ++
9969 ++ /* Used for sockets that go into tw after the meta
9970 ++ * (see mptcp_init_tw_sock())
9971 ++ */
9972 ++ tp->mpcb->in_time_wait = 1;
9973 ++ tp->mpcb->mptw_state = state;
9974 ++
9975 ++ /* Update the time-wait-sock's information */
9976 ++ rcu_read_lock_bh();
9977 ++ list_for_each_entry_rcu(mptw, &tp->mpcb->tw_list, list) {
9978 ++ mptw->meta_tw = 1;
9979 ++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(tp);
9980 ++
9981 ++ /* We want to ack a DATA_FIN, but are still in FIN_WAIT_2 -
9982 ++ * pretend as if the DATA_FIN has already reached us; that way
9983 ++ * the checks in tcp_timewait_state_process will succeed when the
9984 ++ * DATA_FIN comes in.
9985 ++ */
9986 ++ if (state != TCP_TIME_WAIT)
9987 ++ mptw->rcv_nxt++;
9988 ++ }
9989 ++ rcu_read_unlock_bh();
9990 ++
9991 ++ tcp_done(sk);
9992 ++}
9993 ++
9994 ++void mptcp_tsq_flags(struct sock *sk)
9995 ++{
9996 ++ struct tcp_sock *tp = tcp_sk(sk);
9997 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
9998 ++
9999 ++ /* It will be handled as a regular deferred-call */
10000 ++ if (is_meta_sk(sk))
10001 ++ return;
10002 ++
10003 ++ if (hlist_unhashed(&tp->mptcp->cb_list)) {
10004 ++ hlist_add_head(&tp->mptcp->cb_list, &tp->mpcb->callback_list);
10005 ++ /* We need to hold it here, as the sock_hold is not ensured
10006 ++ * by release_sock as it is in regular TCP.
10007 ++ *
10008 ++ * The subsocket may get inet_csk_destroy'd while it is inside
10009 ++ * the callback_list.
10010 ++ */
10011 ++ sock_hold(sk);
10012 ++ }
10013 ++
10014 ++ if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &tcp_sk(meta_sk)->tsq_flags))
10015 ++ sock_hold(meta_sk);
10016 ++}
10017 ++
10018 ++void mptcp_tsq_sub_deferred(struct sock *meta_sk)
10019 ++{
10020 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
10021 ++ struct mptcp_tcp_sock *mptcp;
10022 ++ struct hlist_node *tmp;
10023 ++
10024 ++ BUG_ON(!is_meta_sk(meta_sk) && !meta_tp->was_meta_sk);
10025 ++
10026 ++ __sock_put(meta_sk);
10027 ++ hlist_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) {
10028 ++ struct tcp_sock *tp = mptcp->tp;
10029 ++ struct sock *sk = (struct sock *)tp;
10030 ++
10031 ++ hlist_del_init(&mptcp->cb_list);
10032 ++ sk->sk_prot->release_cb(sk);
10033 ++ /* Final sock_put (cf. mptcp_tsq_flags()) */
10034 ++ sock_put(sk);
10035 ++ }
10036 ++}
10037 ++
10038 ++void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req,
10039 ++ struct sk_buff *skb)
10040 ++{
10041 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
10042 ++ struct mptcp_options_received mopt;
10043 ++ u8 mptcp_hash_mac[20];
10044 ++
10045 ++ mptcp_init_mp_opt(&mopt);
10046 ++ tcp_parse_mptcp_options(skb, &mopt);
10047 ++
10048 ++ mtreq = mptcp_rsk(req);
10049 ++ mtreq->mptcp_mpcb = mpcb;
10050 ++ mtreq->is_sub = 1;
10051 ++ inet_rsk(req)->mptcp_rqsk = 1;
10052 ++
10053 ++ mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
10054 ++
10055 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
10056 ++ (u8 *)&mpcb->mptcp_rem_key,
10057 ++ (u8 *)&mtreq->mptcp_loc_nonce,
10058 ++ (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac);
10059 ++ mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
10060 ++
10061 ++ mtreq->rem_id = mopt.rem_id;
10062 ++ mtreq->rcv_low_prio = mopt.low_prio;
10063 ++ inet_rsk(req)->saw_mpc = 1;
10064 ++}
10065 ++
10066 ++void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb)
10067 ++{
10068 ++ struct mptcp_options_received mopt;
10069 ++ struct mptcp_request_sock *mreq = mptcp_rsk(req);
10070 ++
10071 ++ mptcp_init_mp_opt(&mopt);
10072 ++ tcp_parse_mptcp_options(skb, &mopt);
10073 ++
10074 ++ mreq->is_sub = 0;
10075 ++ inet_rsk(req)->mptcp_rqsk = 1;
10076 ++ mreq->dss_csum = mopt.dss_csum;
10077 ++ mreq->hash_entry.pprev = NULL;
10078 ++
10079 ++ mptcp_reqsk_new_mptcp(req, &mopt, skb);
10080 ++}
10081 ++
10082 ++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb)
10083 ++{
10084 ++ struct mptcp_options_received mopt;
10085 ++ const struct tcp_sock *tp = tcp_sk(sk);
10086 ++ __u32 isn = TCP_SKB_CB(skb)->when;
10087 ++ bool want_cookie = false;
10088 ++
10089 ++ if ((sysctl_tcp_syncookies == 2 ||
10090 ++ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
10091 ++ want_cookie = tcp_syn_flood_action(sk, skb,
10092 ++ mptcp_request_sock_ops.slab_name);
10093 ++ if (!want_cookie)
10094 ++ goto drop;
10095 ++ }
10096 ++
10097 ++ mptcp_init_mp_opt(&mopt);
10098 ++ tcp_parse_mptcp_options(skb, &mopt);
10099 ++
10100 ++ if (mopt.is_mp_join)
10101 ++ return mptcp_do_join_short(skb, &mopt, sock_net(sk));
10102 ++ if (mopt.drop_me)
10103 ++ goto drop;
10104 ++
10105 ++ if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled)
10106 ++ mopt.saw_mpc = 0;
10107 ++
10108 ++ if (skb->protocol == htons(ETH_P_IP)) {
10109 ++ if (mopt.saw_mpc && !want_cookie) {
10110 ++ if (skb_rtable(skb)->rt_flags &
10111 ++ (RTCF_BROADCAST | RTCF_MULTICAST))
10112 ++ goto drop;
10113 ++
10114 ++ return tcp_conn_request(&mptcp_request_sock_ops,
10115 ++ &mptcp_request_sock_ipv4_ops,
10116 ++ sk, skb);
10117 ++ }
10118 ++
10119 ++ return tcp_v4_conn_request(sk, skb);
10120 ++#if IS_ENABLED(CONFIG_IPV6)
10121 ++ } else {
10122 ++ if (mopt.saw_mpc && !want_cookie) {
10123 ++ if (!ipv6_unicast_destination(skb))
10124 ++ goto drop;
10125 ++
10126 ++ return tcp_conn_request(&mptcp6_request_sock_ops,
10127 ++ &mptcp_request_sock_ipv6_ops,
10128 ++ sk, skb);
10129 ++ }
10130 ++
10131 ++ return tcp_v6_conn_request(sk, skb);
10132 ++#endif
10133 ++ }
10134 ++drop:
10135 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
10136 ++ return 0;
10137 ++}
10138 ++
10139 ++struct workqueue_struct *mptcp_wq;
10140 ++EXPORT_SYMBOL(mptcp_wq);
10141 ++
10142 ++/* Output /proc/net/mptcp */
10143 ++static int mptcp_pm_seq_show(struct seq_file *seq, void *v)
10144 ++{
10145 ++ struct tcp_sock *meta_tp;
10146 ++ const struct net *net = seq->private;
10147 ++ int i, n = 0;
10148 ++
10149 ++ seq_printf(seq, " sl loc_tok rem_tok v6 local_address remote_address st ns tx_queue rx_queue inode");
10150 ++ seq_putc(seq, '\n');
10151 ++
10152 ++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
10153 ++ struct hlist_nulls_node *node;
10154 ++ rcu_read_lock_bh();
10155 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node,
10156 ++ &tk_hashtable[i], tk_table) {
10157 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
10158 ++ struct sock *meta_sk = (struct sock *)meta_tp;
10159 ++ struct inet_sock *isk = inet_sk(meta_sk);
10160 ++
10161 ++ if (!mptcp(meta_tp) || !net_eq(net, sock_net(meta_sk)))
10162 ++ continue;
10163 ++
10164 ++ if (capable(CAP_NET_ADMIN)) {
10165 ++ seq_printf(seq, "%4d: %04X %04X ", n++,
10166 ++ mpcb->mptcp_loc_token,
10167 ++ mpcb->mptcp_rem_token);
10168 ++ } else {
10169 ++ seq_printf(seq, "%4d: %04X %04X ", n++, -1, -1);
10170 ++ }
10171 ++ if (meta_sk->sk_family == AF_INET ||
10172 ++ mptcp_v6_is_v4_mapped(meta_sk)) {
10173 ++ seq_printf(seq, " 0 %08X:%04X %08X:%04X ",
10174 ++ isk->inet_rcv_saddr,
10175 ++ ntohs(isk->inet_sport),
10176 ++ isk->inet_daddr,
10177 ++ ntohs(isk->inet_dport));
10178 ++#if IS_ENABLED(CONFIG_IPV6)
10179 ++ } else if (meta_sk->sk_family == AF_INET6) {
10180 ++ struct in6_addr *src = &meta_sk->sk_v6_rcv_saddr;
10181 ++ struct in6_addr *dst = &meta_sk->sk_v6_daddr;
10182 ++ seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X",
10183 ++ src->s6_addr32[0], src->s6_addr32[1],
10184 ++ src->s6_addr32[2], src->s6_addr32[3],
10185 ++ ntohs(isk->inet_sport),
10186 ++ dst->s6_addr32[0], dst->s6_addr32[1],
10187 ++ dst->s6_addr32[2], dst->s6_addr32[3],
10188 ++ ntohs(isk->inet_dport));
10189 ++#endif
10190 ++ }
10191 ++ seq_printf(seq, " %02X %02X %08X:%08X %lu",
10192 ++ meta_sk->sk_state, mpcb->cnt_subflows,
10193 ++ meta_tp->write_seq - meta_tp->snd_una,
10194 ++ max_t(int, meta_tp->rcv_nxt -
10195 ++ meta_tp->copied_seq, 0),
10196 ++ sock_i_ino(meta_sk));
10197 ++ seq_putc(seq, '\n');
10198 ++ }
10199 ++
10200 ++ rcu_read_unlock_bh();
10201 ++ }
10202 ++
10203 ++ return 0;
10204 ++}
10205 ++
10206 ++static int mptcp_pm_seq_open(struct inode *inode, struct file *file)
10207 ++{
10208 ++ return single_open_net(inode, file, mptcp_pm_seq_show);
10209 ++}
10210 ++
10211 ++static const struct file_operations mptcp_pm_seq_fops = {
10212 ++ .owner = THIS_MODULE,
10213 ++ .open = mptcp_pm_seq_open,
10214 ++ .read = seq_read,
10215 ++ .llseek = seq_lseek,
10216 ++ .release = single_release_net,
10217 ++};
10218 ++
10219 ++static int mptcp_pm_init_net(struct net *net)
10220 ++{
10221 ++ if (!proc_create("mptcp", S_IRUGO, net->proc_net, &mptcp_pm_seq_fops))
10222 ++ return -ENOMEM;
10223 ++
10224 ++ return 0;
10225 ++}
10226 ++
10227 ++static void mptcp_pm_exit_net(struct net *net)
10228 ++{
10229 ++ remove_proc_entry("mptcp", net->proc_net);
10230 ++}
10231 ++
10232 ++static struct pernet_operations mptcp_pm_proc_ops = {
10233 ++ .init = mptcp_pm_init_net,
10234 ++ .exit = mptcp_pm_exit_net,
10235 ++};
10236 ++
10237 ++/* General initialization of mptcp */
10238 ++void __init mptcp_init(void)
10239 ++{
10240 ++ int i;
10241 ++ struct ctl_table_header *mptcp_sysctl;
10242 ++
10243 ++ mptcp_sock_cache = kmem_cache_create("mptcp_sock",
10244 ++ sizeof(struct mptcp_tcp_sock),
10245 ++ 0, SLAB_HWCACHE_ALIGN,
10246 ++ NULL);
10247 ++ if (!mptcp_sock_cache)
10248 ++ goto mptcp_sock_cache_failed;
10249 ++
10250 ++ mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb),
10251 ++ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
10252 ++ NULL);
10253 ++ if (!mptcp_cb_cache)
10254 ++ goto mptcp_cb_cache_failed;
10255 ++
10256 ++ mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw),
10257 ++ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
10258 ++ NULL);
10259 ++ if (!mptcp_tw_cache)
10260 ++ goto mptcp_tw_cache_failed;
10261 ++
10262 ++ get_random_bytes(mptcp_secret, sizeof(mptcp_secret));
10263 ++
10264 ++ mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
10265 ++ if (!mptcp_wq)
10266 ++ goto alloc_workqueue_failed;
10267 ++
10268 ++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
10269 ++ INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i);
10270 ++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_htb[i],
10271 ++ i + MPTCP_REQSK_NULLS_BASE);
10272 ++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i);
10273 ++ }
10274 ++
10275 ++ spin_lock_init(&mptcp_reqsk_hlock);
10276 ++ spin_lock_init(&mptcp_tk_hashlock);
10277 ++
10278 ++ if (register_pernet_subsys(&mptcp_pm_proc_ops))
10279 ++ goto pernet_failed;
10280 ++
10281 ++#if IS_ENABLED(CONFIG_IPV6)
10282 ++ if (mptcp_pm_v6_init())
10283 ++ goto mptcp_pm_v6_failed;
10284 ++#endif
10285 ++ if (mptcp_pm_v4_init())
10286 ++ goto mptcp_pm_v4_failed;
10287 ++
10288 ++ mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table);
10289 ++ if (!mptcp_sysctl)
10290 ++ goto register_sysctl_failed;
10291 ++
10292 ++ if (mptcp_register_path_manager(&mptcp_pm_default))
10293 ++ goto register_pm_failed;
10294 ++
10295 ++ if (mptcp_register_scheduler(&mptcp_sched_default))
10296 ++ goto register_sched_failed;
10297 ++
10298 ++ pr_info("MPTCP: Stable release v0.89.0-rc\n");
10299 ++
10300 ++ mptcp_init_failed = false;
10301 ++
10302 ++ return;
10303 ++
10304 ++register_sched_failed:
10305 ++ mptcp_unregister_path_manager(&mptcp_pm_default);
10306 ++register_pm_failed:
10307 ++ unregister_net_sysctl_table(mptcp_sysctl);
10308 ++register_sysctl_failed:
10309 ++ mptcp_pm_v4_undo();
10310 ++mptcp_pm_v4_failed:
10311 ++#if IS_ENABLED(CONFIG_IPV6)
10312 ++ mptcp_pm_v6_undo();
10313 ++mptcp_pm_v6_failed:
10314 ++#endif
10315 ++ unregister_pernet_subsys(&mptcp_pm_proc_ops);
10316 ++pernet_failed:
10317 ++ destroy_workqueue(mptcp_wq);
10318 ++alloc_workqueue_failed:
10319 ++ kmem_cache_destroy(mptcp_tw_cache);
10320 ++mptcp_tw_cache_failed:
10321 ++ kmem_cache_destroy(mptcp_cb_cache);
10322 ++mptcp_cb_cache_failed:
10323 ++ kmem_cache_destroy(mptcp_sock_cache);
10324 ++mptcp_sock_cache_failed:
10325 ++ mptcp_init_failed = true;
10326 ++}
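
mptcp_init() uses the usual kernel staged-initialization pattern: every cache allocation and registration has a matching error label, and a failure at step N jumps to the label that tears down steps N-1 back to 1 in reverse order, so nothing set up earlier is leaked. A self-contained sketch of the same unwind ladder (three hypothetical resources instead of the MPTCP caches and registrations):

    #include <stdio.h>
    #include <stdlib.h>

    static int setup_everything(void)
    {
            char *a, *b, *c;

            a = malloc(16);                 /* step 1 */
            if (!a)
                    goto err_a;

            b = malloc(16);                 /* step 2 */
            if (!b)
                    goto err_b;

            c = malloc(16);                 /* step 3 */
            if (!c)
                    goto err_c;

            printf("all steps succeeded\n");
            free(c);
            free(b);
            free(a);
            return 0;

            /* Unwind in the reverse order of construction. */
    err_c:
            free(b);
    err_b:
            free(a);
    err_a:
            return -1;
    }

    int main(void)
    {
            return setup_everything() ? 1 : 0;
    }
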
10327 +diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c
10328 +new file mode 100644
10329 +index 000000000000..3a54413ce25b
10330 +--- /dev/null
10331 ++++ b/net/mptcp/mptcp_fullmesh.c
10332 +@@ -0,0 +1,1722 @@
10333 ++#include <linux/module.h>
10334 ++
10335 ++#include <net/mptcp.h>
10336 ++#include <net/mptcp_v4.h>
10337 ++
10338 ++#if IS_ENABLED(CONFIG_IPV6)
10339 ++#include <net/mptcp_v6.h>
10340 ++#include <net/addrconf.h>
10341 ++#endif
10342 ++
10343 ++enum {
10344 ++ MPTCP_EVENT_ADD = 1,
10345 ++ MPTCP_EVENT_DEL,
10346 ++ MPTCP_EVENT_MOD,
10347 ++};
10348 ++
10349 ++#define MPTCP_SUBFLOW_RETRY_DELAY 1000
10350 ++
10351 ++/* Max number of local or remote addresses we can store.
10352 ++ * When changing, see the bitfield below in fullmesh_rem4/6.
10353 ++ */
10354 ++#define MPTCP_MAX_ADDR 8
10355 ++
10356 ++struct fullmesh_rem4 {
10357 ++ u8 rem4_id;
10358 ++ u8 bitfield;
10359 ++ u8 retry_bitfield;
10360 ++ __be16 port;
10361 ++ struct in_addr addr;
10362 ++};
10363 ++
10364 ++struct fullmesh_rem6 {
10365 ++ u8 rem6_id;
10366 ++ u8 bitfield;
10367 ++ u8 retry_bitfield;
10368 ++ __be16 port;
10369 ++ struct in6_addr addr;
10370 ++};
10371 ++
10372 ++struct mptcp_loc_addr {
10373 ++ struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR];
10374 ++ u8 loc4_bits;
10375 ++ u8 next_v4_index;
10376 ++
10377 ++ struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR];
10378 ++ u8 loc6_bits;
10379 ++ u8 next_v6_index;
10380 ++};
10381 ++
10382 ++struct mptcp_addr_event {
10383 ++ struct list_head list;
10384 ++ unsigned short family;
10385 ++ u8 code:7,
10386 ++ low_prio:1;
10387 ++ union inet_addr addr;
10388 ++};
10389 ++
10390 ++struct fullmesh_priv {
10391 ++ /* Worker struct for subflow establishment */
10392 ++ struct work_struct subflow_work;
10393 ++ /* Delayed worker, when the routing-tables are not yet ready. */
10394 ++ struct delayed_work subflow_retry_work;
10395 ++
10396 ++ /* Remote addresses */
10397 ++ struct fullmesh_rem4 remaddr4[MPTCP_MAX_ADDR];
10398 ++ struct fullmesh_rem6 remaddr6[MPTCP_MAX_ADDR];
10399 ++
10400 ++ struct mptcp_cb *mpcb;
10401 ++
10402 ++ u16 remove_addrs; /* Addresses to remove */
10403 ++ u8 announced_addrs_v4; /* IPv4 Addresses we did announce */
10404 ++ u8 announced_addrs_v6; /* IPv6 Addresses we did announce */
10405 ++
10406 ++ u8 add_addr; /* Are we sending an add_addr? */
10407 ++
10408 ++ u8 rem4_bits;
10409 ++ u8 rem6_bits;
10410 ++};
10411 ++
10412 ++struct mptcp_fm_ns {
10413 ++ struct mptcp_loc_addr __rcu *local;
10414 ++ spinlock_t local_lock; /* Protecting the above pointer */
10415 ++ struct list_head events;
10416 ++ struct delayed_work address_worker;
10417 ++
10418 ++ struct net *net;
10419 ++};
10420 ++
10421 ++static struct mptcp_pm_ops full_mesh __read_mostly;
10422 ++
10423 ++static void full_mesh_create_subflows(struct sock *meta_sk);
10424 ++
10425 ++static struct mptcp_fm_ns *fm_get_ns(const struct net *net)
10426 ++{
10427 ++ return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH];
10428 ++}
10429 ++
10430 ++static struct fullmesh_priv *fullmesh_get_priv(const struct mptcp_cb *mpcb)
10431 ++{
10432 ++ return (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
10433 ++}
10434 ++
10435 ++/* Find the first free index in the bitfield */
10436 ++static int __mptcp_find_free_index(u8 bitfield, u8 base)
10437 ++{
10438 ++ int i;
10439 ++
10440 ++ /* There are no free bits anyway... */
10441 ++ if (bitfield == 0xff)
10442 ++ goto exit;
10443 ++
10444 ++ i = ffs(~(bitfield >> base)) - 1;
10445 ++ if (i < 0)
10446 ++ goto exit;
10447 ++
10448 ++ /* No free bits when starting at base, try again from 0 */
10449 ++ if (i + base >= sizeof(bitfield) * 8)
10450 ++ return __mptcp_find_free_index(bitfield, 0);
10451 ++
10452 ++ return i + base;
10453 ++exit:
10454 ++ return -1;
10455 ++}
10456 ++
10457 ++static int mptcp_find_free_index(u8 bitfield)
10458 ++{
10459 ++ return __mptcp_find_free_index(bitfield, 0);
10460 ++}
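
__mptcp_find_free_index() looks for the first unset bit at or after 'base' and wraps around to bit 0 when only the upper part of the byte is occupied; a completely full 0xff bitfield is rejected up front. The same search can be exercised on its own - this standalone sketch mirrors the logic for an 8-bit slot map (function and variable names are illustrative):

    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    static int find_free_index(unsigned char bitfield, unsigned int base)
    {
            int i;

            if (bitfield == 0xff)           /* no free slot at all */
                    return -1;

            i = ffs(~(bitfield >> base) & 0xff) - 1;

            /* Nothing free at or above 'base': wrap and retry from 0. */
            if (i < 0 || i + base >= 8)
                    return find_free_index(bitfield, 0);

            return (int)(i + base);
    }

    int main(void)
    {
            /* Slots 0, 1 and 3 taken: first free is 2 from base 0, 4 from base 3. */
            printf("%d %d\n", find_free_index(0x0b, 0), find_free_index(0x0b, 3));
            return 0;
    }
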
10461 ++
10462 ++static void mptcp_addv4_raddr(struct mptcp_cb *mpcb,
10463 ++ const struct in_addr *addr,
10464 ++ __be16 port, u8 id)
10465 ++{
10466 ++ int i;
10467 ++ struct fullmesh_rem4 *rem4;
10468 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10469 ++
10470 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10471 ++ rem4 = &fmp->remaddr4[i];
10472 ++
10473 ++ /* Address is already in the list --- nothing to do */
10474 ++ if (rem4->rem4_id == id &&
10475 ++ rem4->addr.s_addr == addr->s_addr && rem4->port == port)
10476 ++ return;
10477 ++
10478 ++ /* This may be the case when the peer is behind a NAT. It is
10479 ++ * trying to JOIN, thus sending the JOIN with a certain ID.
10480 ++ * However, the src_addr of the IP packet has been changed. We
10481 ++ * update the addr in the list, because this is the address as
10482 ++ * OUR BOX sees it.
10483 ++ */
10484 ++ if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) {
10485 ++ /* update the address */
10486 ++ mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n",
10487 ++ __func__, &rem4->addr.s_addr,
10488 ++ &addr->s_addr, id);
10489 ++ rem4->addr.s_addr = addr->s_addr;
10490 ++ rem4->port = port;
10491 ++ mpcb->list_rcvd = 1;
10492 ++ return;
10493 ++ }
10494 ++ }
10495 ++
10496 ++ i = mptcp_find_free_index(fmp->rem4_bits);
10497 ++ /* Do we already have the maximum number of local/remote addresses? */
10498 ++ if (i < 0) {
10499 ++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n",
10500 ++ __func__, MPTCP_MAX_ADDR, &addr->s_addr);
10501 ++ return;
10502 ++ }
10503 ++
10504 ++ rem4 = &fmp->remaddr4[i];
10505 ++
10506 ++ /* Address is not known yet, store it */
10507 ++ rem4->addr.s_addr = addr->s_addr;
10508 ++ rem4->port = port;
10509 ++ rem4->bitfield = 0;
10510 ++ rem4->retry_bitfield = 0;
10511 ++ rem4->rem4_id = id;
10512 ++ mpcb->list_rcvd = 1;
10513 ++ fmp->rem4_bits |= (1 << i);
10514 ++
10515 ++ return;
10516 ++}
10517 ++
10518 ++static void mptcp_addv6_raddr(struct mptcp_cb *mpcb,
10519 ++ const struct in6_addr *addr,
10520 ++ __be16 port, u8 id)
10521 ++{
10522 ++ int i;
10523 ++ struct fullmesh_rem6 *rem6;
10524 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10525 ++
10526 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10527 ++ rem6 = &fmp->remaddr6[i];
10528 ++
10529 ++ /* Address is already in the list --- nothing to do */
10530 ++ if (rem6->rem6_id == id &&
10531 ++ ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port)
10532 ++ return;
10533 ++
10534 ++ /* This may be the case when the peer is behind a NAT. It is
10535 ++ * trying to JOIN, thus sending the JOIN with a certain ID.
10536 ++ * However, the src_addr of the IP packet has been changed. We
10537 ++ * update the addr in the list, because this is the address as
10538 ++ * OUR BOX sees it.
10539 ++ */
10540 ++ if (rem6->rem6_id == id) {
10541 ++ /* update the address */
10542 ++ mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n",
10543 ++ __func__, &rem6->addr, addr, id);
10544 ++ rem6->addr = *addr;
10545 ++ rem6->port = port;
10546 ++ mpcb->list_rcvd = 1;
10547 ++ return;
10548 ++ }
10549 ++ }
10550 ++
10551 ++ i = mptcp_find_free_index(fmp->rem6_bits);
10552 ++ /* Do we already have the maximum number of local/remote addresses? */
10553 ++ if (i < 0) {
10554 ++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n",
10555 ++ __func__, MPTCP_MAX_ADDR, addr);
10556 ++ return;
10557 ++ }
10558 ++
10559 ++ rem6 = &fmp->remaddr6[i];
10560 ++
10561 ++ /* Address is not known yet, store it */
10562 ++ rem6->addr = *addr;
10563 ++ rem6->port = port;
10564 ++ rem6->bitfield = 0;
10565 ++ rem6->retry_bitfield = 0;
10566 ++ rem6->rem6_id = id;
10567 ++ mpcb->list_rcvd = 1;
10568 ++ fmp->rem6_bits |= (1 << i);
10569 ++
10570 ++ return;
10571 ++}
10572 ++
10573 ++static void mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id)
10574 ++{
10575 ++ int i;
10576 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10577 ++
10578 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10579 ++ if (fmp->remaddr4[i].rem4_id == id) {
10580 ++ /* remove address from bitfield */
10581 ++ fmp->rem4_bits &= ~(1 << i);
10582 ++
10583 ++ break;
10584 ++ }
10585 ++ }
10586 ++}
10587 ++
10588 ++static void mptcp_v6_rem_raddress(const struct mptcp_cb *mpcb, u8 id)
10589 ++{
10590 ++ int i;
10591 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10592 ++
10593 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10594 ++ if (fmp->remaddr6[i].rem6_id == id) {
10595 ++ /* remove address from bitfield */
10596 ++ fmp->rem6_bits &= ~(1 << i);
10597 ++
10598 ++ break;
10599 ++ }
10600 ++ }
10601 ++}
10602 ++
10603 ++/* Sets the bitfield of the remote-address field */
10604 ++static void mptcp_v4_set_init_addr_bit(const struct mptcp_cb *mpcb,
10605 ++ const struct in_addr *addr, u8 index)
10606 ++{
10607 ++ int i;
10608 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10609 ++
10610 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10611 ++ if (fmp->remaddr4[i].addr.s_addr == addr->s_addr) {
10612 ++ fmp->remaddr4[i].bitfield |= (1 << index);
10613 ++ return;
10614 ++ }
10615 ++ }
10616 ++}
10617 ++
10618 ++/* Sets the bitfield of the remote-address field */
10619 ++static void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
10620 ++ const struct in6_addr *addr, u8 index)
10621 ++{
10622 ++ int i;
10623 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10624 ++
10625 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10626 ++ if (ipv6_addr_equal(&fmp->remaddr6[i].addr, addr)) {
10627 ++ fmp->remaddr6[i].bitfield |= (1 << index);
10628 ++ return;
10629 ++ }
10630 ++ }
10631 ++}
10632 ++
10633 ++static void mptcp_set_init_addr_bit(struct mptcp_cb *mpcb,
10634 ++ const union inet_addr *addr,
10635 ++ sa_family_t family, u8 id)
10636 ++{
10637 ++ if (family == AF_INET)
10638 ++ mptcp_v4_set_init_addr_bit(mpcb, &addr->in, id);
10639 ++ else
10640 ++ mptcp_v6_set_init_addr_bit(mpcb, &addr->in6, id);
10641 ++}
10642 ++
10643 ++static void retry_subflow_worker(struct work_struct *work)
10644 ++{
10645 ++ struct delayed_work *delayed_work = container_of(work,
10646 ++ struct delayed_work,
10647 ++ work);
10648 ++ struct fullmesh_priv *fmp = container_of(delayed_work,
10649 ++ struct fullmesh_priv,
10650 ++ subflow_retry_work);
10651 ++ struct mptcp_cb *mpcb = fmp->mpcb;
10652 ++ struct sock *meta_sk = mpcb->meta_sk;
10653 ++ struct mptcp_loc_addr *mptcp_local;
10654 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
10655 ++ int iter = 0, i;
10656 ++
10657 ++ /* We need a local (stable) copy of the address-list. Really, it is not
10658 ++ * such a big deal if the address-list is not 100% up-to-date.
10659 ++ */
10660 ++ rcu_read_lock_bh();
10661 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
10662 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
10663 ++ rcu_read_unlock_bh();
10664 ++
10665 ++ if (!mptcp_local)
10666 ++ return;
10667 ++
10668 ++next_subflow:
10669 ++ if (iter) {
10670 ++ release_sock(meta_sk);
10671 ++ mutex_unlock(&mpcb->mpcb_mutex);
10672 ++
10673 ++ cond_resched();
10674 ++ }
10675 ++ mutex_lock(&mpcb->mpcb_mutex);
10676 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
10677 ++
10678 ++ iter++;
10679 ++
10680 ++ if (sock_flag(meta_sk, SOCK_DEAD))
10681 ++ goto exit;
10682 ++
10683 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10684 ++ struct fullmesh_rem4 *rem = &fmp->remaddr4[i];
10685 ++ /* Do we need to retry establishing a subflow? */
10686 ++ if (rem->retry_bitfield) {
10687 ++ int i = mptcp_find_free_index(~rem->retry_bitfield);
10688 ++ struct mptcp_rem4 rem4;
10689 ++
10690 ++ rem->bitfield |= (1 << i);
10691 ++ rem->retry_bitfield &= ~(1 << i);
10692 ++
10693 ++ rem4.addr = rem->addr;
10694 ++ rem4.port = rem->port;
10695 ++ rem4.rem4_id = rem->rem4_id;
10696 ++
10697 ++ mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], &rem4);
10698 ++ goto next_subflow;
10699 ++ }
10700 ++ }
10701 ++
10702 ++#if IS_ENABLED(CONFIG_IPV6)
10703 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10704 ++ struct fullmesh_rem6 *rem = &fmp->remaddr6[i];
10705 ++
10706 ++ /* Do we need to retry establishing a subflow? */
10707 ++ if (rem->retry_bitfield) {
10708 ++ int i = mptcp_find_free_index(~rem->retry_bitfield);
10709 ++ struct mptcp_rem6 rem6;
10710 ++
10711 ++ rem->bitfield |= (1 << i);
10712 ++ rem->retry_bitfield &= ~(1 << i);
10713 ++
10714 ++ rem6.addr = rem->addr;
10715 ++ rem6.port = rem->port;
10716 ++ rem6.rem6_id = rem->rem6_id;
10717 ++
10718 ++ mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], &rem6);
10719 ++ goto next_subflow;
10720 ++ }
10721 ++ }
10722 ++#endif
10723 ++
10724 ++exit:
10725 ++ kfree(mptcp_local);
10726 ++ release_sock(meta_sk);
10727 ++ mutex_unlock(&mpcb->mpcb_mutex);
10728 ++ sock_put(meta_sk);
10729 ++}
10730 ++
10731 ++/**
10732 ++ * Create all new subflows by calling mptcp_initX_subsockets
10733 ++ *
10734 ++ * This function uses a goto to next_subflow to allow releasing the lock between
10735 ++ * new subflows and giving other processes a chance to do some work on the
10736 ++ * socket and potentially finish the communication.
10737 ++ **/
10738 ++static void create_subflow_worker(struct work_struct *work)
10739 ++{
10740 ++ struct fullmesh_priv *fmp = container_of(work, struct fullmesh_priv,
10741 ++ subflow_work);
10742 ++ struct mptcp_cb *mpcb = fmp->mpcb;
10743 ++ struct sock *meta_sk = mpcb->meta_sk;
10744 ++ struct mptcp_loc_addr *mptcp_local;
10745 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
10746 ++ int iter = 0, retry = 0;
10747 ++ int i;
10748 ++
10749 ++ /* We need a local (stable) copy of the address-list. Really, it is not
10750 ++ * such a big deal if the address-list is not 100% up-to-date.
10751 ++ */
10752 ++ rcu_read_lock_bh();
10753 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
10754 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
10755 ++ rcu_read_unlock_bh();
10756 ++
10757 ++ if (!mptcp_local)
10758 ++ return;
10759 ++
10760 ++next_subflow:
10761 ++ if (iter) {
10762 ++ release_sock(meta_sk);
10763 ++ mutex_unlock(&mpcb->mpcb_mutex);
10764 ++
10765 ++ cond_resched();
10766 ++ }
10767 ++ mutex_lock(&mpcb->mpcb_mutex);
10768 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
10769 ++
10770 ++ iter++;
10771 ++
10772 ++ if (sock_flag(meta_sk, SOCK_DEAD))
10773 ++ goto exit;
10774 ++
10775 ++ if (mpcb->master_sk &&
10776 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
10777 ++ goto exit;
10778 ++
10779 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10780 ++ struct fullmesh_rem4 *rem;
10781 ++ u8 remaining_bits;
10782 ++
10783 ++ rem = &fmp->remaddr4[i];
10784 ++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits;
10785 ++
10786 ++ /* Are there still combinations to handle? */
10787 ++ if (remaining_bits) {
10788 ++ int i = mptcp_find_free_index(~remaining_bits);
10789 ++ struct mptcp_rem4 rem4;
10790 ++
10791 ++ rem->bitfield |= (1 << i);
10792 ++
10793 ++ rem4.addr = rem->addr;
10794 ++ rem4.port = rem->port;
10795 ++ rem4.rem4_id = rem->rem4_id;
10796 ++
10797 ++ /* If a route is not yet available then retry once */
10798 ++ if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i],
10799 ++ &rem4) == -ENETUNREACH)
10800 ++ retry = rem->retry_bitfield |= (1 << i);
10801 ++ goto next_subflow;
10802 ++ }
10803 ++ }
10804 ++
10805 ++#if IS_ENABLED(CONFIG_IPV6)
10806 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10807 ++ struct fullmesh_rem6 *rem;
10808 ++ u8 remaining_bits;
10809 ++
10810 ++ rem = &fmp->remaddr6[i];
10811 ++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits;
10812 ++
10813 ++ /* Are there still combinations to handle? */
10814 ++ if (remaining_bits) {
10815 ++ int i = mptcp_find_free_index(~remaining_bits);
10816 ++ struct mptcp_rem6 rem6;
10817 ++
10818 ++ rem->bitfield |= (1 << i);
10819 ++
10820 ++ rem6.addr = rem->addr;
10821 ++ rem6.port = rem->port;
10822 ++ rem6.rem6_id = rem->rem6_id;
10823 ++
10824 ++ /* If a route is not yet available then retry once */
10825 ++ if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i],
10826 ++ &rem6) == -ENETUNREACH)
10827 ++ retry = rem->retry_bitfield |= (1 << i);
10828 ++ goto next_subflow;
10829 ++ }
10830 ++ }
10831 ++#endif
10832 ++
10833 ++ if (retry && !delayed_work_pending(&fmp->subflow_retry_work)) {
10834 ++ sock_hold(meta_sk);
10835 ++ queue_delayed_work(mptcp_wq, &fmp->subflow_retry_work,
10836 ++ msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY));
10837 ++ }
10838 ++
10839 ++exit:
10840 ++ kfree(mptcp_local);
10841 ++ release_sock(meta_sk);
10842 ++ mutex_unlock(&mpcb->mpcb_mutex);
10843 ++ sock_put(meta_sk);
10844 ++}
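
create_subflow_worker() intentionally drops both the mpcb mutex and the meta-socket lock between subflow attempts (the 'goto next_subflow' / cond_resched() dance), so that packet processing and user-space calls on the socket can interleave with subflow creation. A user-space sketch of that "one item per lock hold, then unlock and yield" shape, using a pthread mutex and hypothetical work items:

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>

    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;

    static void create_all(int n_items)
    {
            int done = 0;

            for (;;) {
                    pthread_mutex_lock(&state_lock);

                    if (done >= n_items) {          /* nothing left to set up */
                            pthread_mutex_unlock(&state_lock);
                            return;
                    }

                    printf("setting up item %d under the lock\n", done);
                    done++;

                    /* Drop the lock and yield between items so other threads
                     * can run against the shared state, much like the
                     * release_sock()/cond_resched() calls in the worker above.
                     */
                    pthread_mutex_unlock(&state_lock);
                    sched_yield();
            }
    }

    int main(void)
    {
            create_all(3);
            return 0;
    }
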
10845 ++
10846 ++static void announce_remove_addr(u8 addr_id, struct sock *meta_sk)
10847 ++{
10848 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
10849 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10850 ++ struct sock *sk = mptcp_select_ack_sock(meta_sk);
10851 ++
10852 ++ fmp->remove_addrs |= (1 << addr_id);
10853 ++ mpcb->addr_signal = 1;
10854 ++
10855 ++ if (sk)
10856 ++ tcp_send_ack(sk);
10857 ++}
10858 ++
10859 ++static void update_addr_bitfields(struct sock *meta_sk,
10860 ++ const struct mptcp_loc_addr *mptcp_local)
10861 ++{
10862 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
10863 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10864 ++ int i;
10865 ++
10866 ++ /* The bits in announced_addrs_* always match with loc*_bits. So, a
10867 ++ * simple & operation unsets the correct bits, because these go from
10868 ++ * announced to non-announced
10869 ++ */
10870 ++ fmp->announced_addrs_v4 &= mptcp_local->loc4_bits;
10871 ++
10872 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10873 ++ fmp->remaddr4[i].bitfield &= mptcp_local->loc4_bits;
10874 ++ fmp->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits;
10875 ++ }
10876 ++
10877 ++ fmp->announced_addrs_v6 &= mptcp_local->loc6_bits;
10878 ++
10879 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10880 ++ fmp->remaddr6[i].bitfield &= mptcp_local->loc6_bits;
10881 ++ fmp->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits;
10882 ++ }
10883 ++}
10884 ++
10885 ++static int mptcp_find_address(const struct mptcp_loc_addr *mptcp_local,
10886 ++ sa_family_t family, const union inet_addr *addr)
10887 ++{
10888 ++ int i;
10889 ++ u8 loc_bits;
10890 ++ bool found = false;
10891 ++
10892 ++ if (family == AF_INET)
10893 ++ loc_bits = mptcp_local->loc4_bits;
10894 ++ else
10895 ++ loc_bits = mptcp_local->loc6_bits;
10896 ++
10897 ++ mptcp_for_each_bit_set(loc_bits, i) {
10898 ++ if (family == AF_INET &&
10899 ++ mptcp_local->locaddr4[i].addr.s_addr == addr->in.s_addr) {
10900 ++ found = true;
10901 ++ break;
10902 ++ }
10903 ++ if (family == AF_INET6 &&
10904 ++ ipv6_addr_equal(&mptcp_local->locaddr6[i].addr,
10905 ++ &addr->in6)) {
10906 ++ found = true;
10907 ++ break;
10908 ++ }
10909 ++ }
10910 ++
10911 ++ if (!found)
10912 ++ return -1;
10913 ++
10914 ++ return i;
10915 ++}
10916 ++
10917 ++static void mptcp_address_worker(struct work_struct *work)
10918 ++{
10919 ++ const struct delayed_work *delayed_work = container_of(work,
10920 ++ struct delayed_work,
10921 ++ work);
10922 ++ struct mptcp_fm_ns *fm_ns = container_of(delayed_work,
10923 ++ struct mptcp_fm_ns,
10924 ++ address_worker);
10925 ++ struct net *net = fm_ns->net;
10926 ++ struct mptcp_addr_event *event = NULL;
10927 ++ struct mptcp_loc_addr *mptcp_local, *old;
10928 ++ int i, id = -1; /* id is used in the socket-code on a delete-event */
10929 ++ bool success; /* Used to indicate if we succeeded handling the event */
10930 ++
10931 ++next_event:
10932 ++ success = false;
10933 ++ kfree(event);
10934 ++
10935 ++ /* First, let's dequeue an event from our event-list */
10936 ++ rcu_read_lock_bh();
10937 ++ spin_lock(&fm_ns->local_lock);
10938 ++
10939 ++ event = list_first_entry_or_null(&fm_ns->events,
10940 ++ struct mptcp_addr_event, list);
10941 ++ if (!event) {
10942 ++ spin_unlock(&fm_ns->local_lock);
10943 ++ rcu_read_unlock_bh();
10944 ++ return;
10945 ++ }
10946 ++
10947 ++ list_del(&event->list);
10948 ++
10949 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
10950 ++
10951 ++ if (event->code == MPTCP_EVENT_DEL) {
10952 ++ id = mptcp_find_address(mptcp_local, event->family, &event->addr);
10953 ++
10954 ++ /* Not in the list - so we don't care */
10955 ++ if (id < 0) {
10956 ++ mptcp_debug("%s could not find id\n", __func__);
10957 ++ goto duno;
10958 ++ }
10959 ++
10960 ++ old = mptcp_local;
10961 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
10962 ++ GFP_ATOMIC);
10963 ++ if (!mptcp_local)
10964 ++ goto duno;
10965 ++
10966 ++ if (event->family == AF_INET)
10967 ++ mptcp_local->loc4_bits &= ~(1 << id);
10968 ++ else
10969 ++ mptcp_local->loc6_bits &= ~(1 << id);
10970 ++
10971 ++ rcu_assign_pointer(fm_ns->local, mptcp_local);
10972 ++ kfree(old);
10973 ++ } else {
10974 ++ int i = mptcp_find_address(mptcp_local, event->family, &event->addr);
10975 ++ int j = i;
10976 ++
10977 ++ if (j < 0) {
10978 ++ /* Not in the list, so we have to find an empty slot */
10979 ++ if (event->family == AF_INET)
10980 ++ i = __mptcp_find_free_index(mptcp_local->loc4_bits,
10981 ++ mptcp_local->next_v4_index);
10982 ++ if (event->family == AF_INET6)
10983 ++ i = __mptcp_find_free_index(mptcp_local->loc6_bits,
10984 ++ mptcp_local->next_v6_index);
10985 ++
10986 ++ if (i < 0) {
10987 ++ mptcp_debug("%s no more space\n", __func__);
10988 ++ goto duno;
10989 ++ }
10990 ++
10991 ++ /* It might have been a MOD-event. */
10992 ++ event->code = MPTCP_EVENT_ADD;
10993 ++ } else {
10994 ++ /* Let's check if anything changes */
10995 ++ if (event->family == AF_INET &&
10996 ++ event->low_prio == mptcp_local->locaddr4[i].low_prio)
10997 ++ goto duno;
10998 ++
10999 ++ if (event->family == AF_INET6 &&
11000 ++ event->low_prio == mptcp_local->locaddr6[i].low_prio)
11001 ++ goto duno;
11002 ++ }
11003 ++
11004 ++ old = mptcp_local;
11005 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
11006 ++ GFP_ATOMIC);
11007 ++ if (!mptcp_local)
11008 ++ goto duno;
11009 ++
11010 ++ if (event->family == AF_INET) {
11011 ++ mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr;
11012 ++ mptcp_local->locaddr4[i].loc4_id = i + 1;
11013 ++ mptcp_local->locaddr4[i].low_prio = event->low_prio;
11014 ++ } else {
11015 ++ mptcp_local->locaddr6[i].addr = event->addr.in6;
11016 ++ mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR;
11017 ++ mptcp_local->locaddr6[i].low_prio = event->low_prio;
11018 ++ }
11019 ++
11020 ++ if (j < 0) {
11021 ++ if (event->family == AF_INET) {
11022 ++ mptcp_local->loc4_bits |= (1 << i);
11023 ++ mptcp_local->next_v4_index = i + 1;
11024 ++ } else {
11025 ++ mptcp_local->loc6_bits |= (1 << i);
11026 ++ mptcp_local->next_v6_index = i + 1;
11027 ++ }
11028 ++ }
11029 ++
11030 ++ rcu_assign_pointer(fm_ns->local, mptcp_local);
11031 ++ kfree(old);
11032 ++ }
11033 ++ success = true;
11034 ++
11035 ++duno:
11036 ++ spin_unlock(&fm_ns->local_lock);
11037 ++ rcu_read_unlock_bh();
11038 ++
11039 ++ if (!success)
11040 ++ goto next_event;
11041 ++
11042 ++ /* Now we iterate over the MPTCP-sockets and apply the event. */
11043 ++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
11044 ++ const struct hlist_nulls_node *node;
11045 ++ struct tcp_sock *meta_tp;
11046 ++
11047 ++ rcu_read_lock_bh();
11048 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i],
11049 ++ tk_table) {
11050 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
11051 ++ struct sock *meta_sk = (struct sock *)meta_tp, *sk;
11052 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11053 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11054 ++
11055 ++ if (sock_net(meta_sk) != net)
11056 ++ continue;
11057 ++
11058 ++ if (meta_v4) {
11059 ++ /* skip IPv6 events if meta is IPv4 */
11060 ++ if (event->family == AF_INET6)
11061 ++ continue;
11062 ++ }
11063 ++ /* skip IPv4 events if IPV6_V6ONLY is set */
11064 ++ else if (event->family == AF_INET &&
11065 ++ inet6_sk(meta_sk)->ipv6only)
11066 ++ continue;
11067 ++
11068 ++ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
11069 ++ continue;
11070 ++
11071 ++ bh_lock_sock(meta_sk);
11072 ++
11073 ++ if (!mptcp(meta_tp) || !is_meta_sk(meta_sk) ||
11074 ++ mpcb->infinite_mapping_snd ||
11075 ++ mpcb->infinite_mapping_rcv ||
11076 ++ mpcb->send_infinite_mapping)
11077 ++ goto next;
11078 ++
11079 ++ /* The pm may have changed in the meantime */
11080 ++ if (mpcb->pm_ops != &full_mesh)
11081 ++ goto next;
11082 ++
11083 ++ if (sock_owned_by_user(meta_sk)) {
11084 ++ if (!test_and_set_bit(MPTCP_PATH_MANAGER,
11085 ++ &meta_tp->tsq_flags))
11086 ++ sock_hold(meta_sk);
11087 ++
11088 ++ goto next;
11089 ++ }
11090 ++
11091 ++ if (event->code == MPTCP_EVENT_ADD) {
11092 ++ fmp->add_addr++;
11093 ++ mpcb->addr_signal = 1;
11094 ++
11095 ++ sk = mptcp_select_ack_sock(meta_sk);
11096 ++ if (sk)
11097 ++ tcp_send_ack(sk);
11098 ++
11099 ++ full_mesh_create_subflows(meta_sk);
11100 ++ }
11101 ++
11102 ++ if (event->code == MPTCP_EVENT_DEL) {
11103 ++ struct sock *sk, *tmpsk;
11104 ++ struct mptcp_loc_addr *mptcp_local;
11105 ++ bool found = false;
11106 ++
11107 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
11108 ++
11109 ++ /* In any case, we need to update our bitfields */
11110 ++ if (id >= 0)
11111 ++ update_addr_bitfields(meta_sk, mptcp_local);
11112 ++
11113 ++ /* Look for the socket and remove it */
11114 ++ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
11115 ++ if ((event->family == AF_INET6 &&
11116 ++ (sk->sk_family == AF_INET ||
11117 ++ mptcp_v6_is_v4_mapped(sk))) ||
11118 ++ (event->family == AF_INET &&
11119 ++ (sk->sk_family == AF_INET6 &&
11120 ++ !mptcp_v6_is_v4_mapped(sk))))
11121 ++ continue;
11122 ++
11123 ++ if (event->family == AF_INET &&
11124 ++ (sk->sk_family == AF_INET ||
11125 ++ mptcp_v6_is_v4_mapped(sk)) &&
11126 ++ inet_sk(sk)->inet_saddr != event->addr.in.s_addr)
11127 ++ continue;
11128 ++
11129 ++ if (event->family == AF_INET6 &&
11130 ++ sk->sk_family == AF_INET6 &&
11131 ++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6))
11132 ++ continue;
11133 ++
11134 ++ /* Reinject, so that pf = 1 and so we
11135 ++ * won't select this one as the
11136 ++ * ack-sock.
11137 ++ */
11138 ++ mptcp_reinject_data(sk, 0);
11139 ++
11140 ++ /* We announce the removal of this id */
11141 ++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, meta_sk);
11142 ++
11143 ++ mptcp_sub_force_close(sk);
11144 ++ found = true;
11145 ++ }
11146 ++
11147 ++ if (found)
11148 ++ goto next;
11149 ++
11150 ++ /* The id may have been given by the event,
11151 ++ * matching on a local address. And it may not
11152 ++ * have matched on one of the above sockets,
11153 ++ * because the client never created a subflow.
11154 ++ * So, we have to finally remove it here.
11155 ++ */
11156 ++ if (id > 0)
11157 ++ announce_remove_addr(id, meta_sk);
11158 ++ }
11159 ++
11160 ++ if (event->code == MPTCP_EVENT_MOD) {
11161 ++ struct sock *sk;
11162 ++
11163 ++ mptcp_for_each_sk(mpcb, sk) {
11164 ++ struct tcp_sock *tp = tcp_sk(sk);
11165 ++ if (event->family == AF_INET &&
11166 ++ (sk->sk_family == AF_INET ||
11167 ++ mptcp_v6_is_v4_mapped(sk)) &&
11168 ++ inet_sk(sk)->inet_saddr == event->addr.in.s_addr) {
11169 ++ if (event->low_prio != tp->mptcp->low_prio) {
11170 ++ tp->mptcp->send_mp_prio = 1;
11171 ++ tp->mptcp->low_prio = event->low_prio;
11172 ++
11173 ++ tcp_send_ack(sk);
11174 ++ }
11175 ++ }
11176 ++
11177 ++ if (event->family == AF_INET6 &&
11178 ++ sk->sk_family == AF_INET6 &&
11179 ++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) {
11180 ++ if (event->low_prio != tp->mptcp->low_prio) {
11181 ++ tp->mptcp->send_mp_prio = 1;
11182 ++ tp->mptcp->low_prio = event->low_prio;
11183 ++
11184 ++ tcp_send_ack(sk);
11185 ++ }
11186 ++ }
11187 ++ }
11188 ++ }
11189 ++next:
11190 ++ bh_unlock_sock(meta_sk);
11191 ++ sock_put(meta_sk);
11192 ++ }
11193 ++ rcu_read_unlock_bh();
11194 ++ }
11195 ++ goto next_event;
11196 ++}
11197 ++
11198 ++static struct mptcp_addr_event *lookup_similar_event(const struct net *net,
11199 ++ const struct mptcp_addr_event *event)
11200 ++{
11201 ++ struct mptcp_addr_event *eventq;
11202 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11203 ++
11204 ++ list_for_each_entry(eventq, &fm_ns->events, list) {
11205 ++ if (eventq->family != event->family)
11206 ++ continue;
11207 ++ if (event->family == AF_INET) {
11208 ++ if (eventq->addr.in.s_addr == event->addr.in.s_addr)
11209 ++ return eventq;
11210 ++ } else {
11211 ++ if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6))
11212 ++ return eventq;
11213 ++ }
11214 ++ }
11215 ++ return NULL;
11216 ++}
11217 ++
11218 ++/* We already hold the net-namespace MPTCP-lock */
11219 ++static void add_pm_event(struct net *net, const struct mptcp_addr_event *event)
11220 ++{
11221 ++ struct mptcp_addr_event *eventq = lookup_similar_event(net, event);
11222 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11223 ++
11224 ++ if (eventq) {
11225 ++ switch (event->code) {
11226 ++ case MPTCP_EVENT_DEL:
11227 ++ mptcp_debug("%s del old_code %u\n", __func__, eventq->code);
11228 ++ list_del(&eventq->list);
11229 ++ kfree(eventq);
11230 ++ break;
11231 ++ case MPTCP_EVENT_ADD:
11232 ++ mptcp_debug("%s add old_code %u\n", __func__, eventq->code);
11233 ++ eventq->low_prio = event->low_prio;
11234 ++ eventq->code = MPTCP_EVENT_ADD;
11235 ++ return;
11236 ++ case MPTCP_EVENT_MOD:
11237 ++ mptcp_debug("%s mod old_code %u\n", __func__, eventq->code);
11238 ++ eventq->low_prio = event->low_prio;
11239 ++ eventq->code = MPTCP_EVENT_MOD;
11240 ++ return;
11241 ++ }
11242 ++ }
11243 ++
11244 ++ /* OK, we have to add the new address to the wait queue */
11245 ++ eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC);
11246 ++ if (!eventq)
11247 ++ return;
11248 ++
11249 ++ list_add_tail(&eventq->list, &fm_ns->events);
11250 ++
11251 ++ /* Queue the address worker if it is not already pending */
11252 ++ if (!delayed_work_pending(&fm_ns->address_worker))
11253 ++ queue_delayed_work(mptcp_wq, &fm_ns->address_worker,
11254 ++ msecs_to_jiffies(500));
11255 ++}
11256 ++
11257 ++static void addr4_event_handler(const struct in_ifaddr *ifa, unsigned long event,
11258 ++ struct net *net)
11259 ++{
11260 ++ const struct net_device *netdev = ifa->ifa_dev->dev;
11261 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11262 ++ struct mptcp_addr_event mpevent;
11263 ++
11264 ++ if (ifa->ifa_scope > RT_SCOPE_LINK ||
11265 ++ ipv4_is_loopback(ifa->ifa_local))
11266 ++ return;
11267 ++
11268 ++ spin_lock_bh(&fm_ns->local_lock);
11269 ++
11270 ++ mpevent.family = AF_INET;
11271 ++ mpevent.addr.in.s_addr = ifa->ifa_local;
11272 ++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
11273 ++
11274 ++ if (event == NETDEV_DOWN || !netif_running(netdev) ||
11275 ++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
11276 ++ mpevent.code = MPTCP_EVENT_DEL;
11277 ++ else if (event == NETDEV_UP)
11278 ++ mpevent.code = MPTCP_EVENT_ADD;
11279 ++ else if (event == NETDEV_CHANGE)
11280 ++ mpevent.code = MPTCP_EVENT_MOD;
11281 ++
11282 ++ mptcp_debug("%s created event for %pI4, code %u prio %u\n", __func__,
11283 ++ &ifa->ifa_local, mpevent.code, mpevent.low_prio);
11284 ++ add_pm_event(net, &mpevent);
11285 ++
11286 ++ spin_unlock_bh(&fm_ns->local_lock);
11287 ++ return;
11288 ++}
11289 ++
11290 ++ /* React to IPv4-address add/remove events */
11291 ++static int mptcp_pm_inetaddr_event(struct notifier_block *this,
11292 ++ unsigned long event, void *ptr)
11293 ++{
11294 ++ const struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
11295 ++ struct net *net = dev_net(ifa->ifa_dev->dev);
11296 ++
11297 ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
11298 ++ event == NETDEV_CHANGE))
11299 ++ return NOTIFY_DONE;
11300 ++
11301 ++ addr4_event_handler(ifa, event, net);
11302 ++
11303 ++ return NOTIFY_DONE;
11304 ++}
11305 ++
11306 ++static struct notifier_block mptcp_pm_inetaddr_notifier = {
11307 ++ .notifier_call = mptcp_pm_inetaddr_event,
11308 ++};
11309 ++
11310 ++#if IS_ENABLED(CONFIG_IPV6)
11311 ++
11312 ++/* IPV6-related address/interface watchers */
11313 ++struct mptcp_dad_data {
11314 ++ struct timer_list timer;
11315 ++ struct inet6_ifaddr *ifa;
11316 ++};
11317 ++
11318 ++static void dad_callback(unsigned long arg);
11319 ++static int inet6_addr_event(struct notifier_block *this,
11320 ++ unsigned long event, void *ptr);
11321 ++
11322 ++static int ipv6_is_in_dad_state(const struct inet6_ifaddr *ifa)
11323 ++{
11324 ++ return (ifa->flags & IFA_F_TENTATIVE) &&
11325 ++ ifa->state == INET6_IFADDR_STATE_DAD;
11326 ++}
11327 ++
11328 ++static void dad_init_timer(struct mptcp_dad_data *data,
11329 ++ struct inet6_ifaddr *ifa)
11330 ++{
11331 ++ data->ifa = ifa;
11332 ++ data->timer.data = (unsigned long)data;
11333 ++ data->timer.function = dad_callback;
11334 ++ if (ifa->idev->cnf.rtr_solicit_delay)
11335 ++ data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay;
11336 ++ else
11337 ++ data->timer.expires = jiffies + (HZ/10);
11338 ++}
11339 ++
11340 ++static void dad_callback(unsigned long arg)
11341 ++{
11342 ++ struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg;
11343 ++
11344 ++ if (ipv6_is_in_dad_state(data->ifa)) {
11345 ++ dad_init_timer(data, data->ifa);
11346 ++ add_timer(&data->timer);
11347 ++ } else {
11348 ++ inet6_addr_event(NULL, NETDEV_UP, data->ifa);
11349 ++ in6_ifa_put(data->ifa);
11350 ++ kfree(data);
11351 ++ }
11352 ++}
11353 ++
11354 ++static inline void dad_setup_timer(struct inet6_ifaddr *ifa)
11355 ++{
11356 ++ struct mptcp_dad_data *data;
11357 ++
11358 ++ data = kmalloc(sizeof(*data), GFP_ATOMIC);
11359 ++
11360 ++ if (!data)
11361 ++ return;
11362 ++
11363 ++ init_timer(&data->timer);
11364 ++ dad_init_timer(data, ifa);
11365 ++ add_timer(&data->timer);
11366 ++ in6_ifa_hold(ifa);
11367 ++}
11368 ++
11369 ++static void addr6_event_handler(const struct inet6_ifaddr *ifa, unsigned long event,
11370 ++ struct net *net)
11371 ++{
11372 ++ const struct net_device *netdev = ifa->idev->dev;
11373 ++ int addr_type = ipv6_addr_type(&ifa->addr);
11374 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11375 ++ struct mptcp_addr_event mpevent;
11376 ++
11377 ++ if (ifa->scope > RT_SCOPE_LINK ||
11378 ++ addr_type == IPV6_ADDR_ANY ||
11379 ++ (addr_type & IPV6_ADDR_LOOPBACK) ||
11380 ++ (addr_type & IPV6_ADDR_LINKLOCAL))
11381 ++ return;
11382 ++
11383 ++ spin_lock_bh(&fm_ns->local_lock);
11384 ++
11385 ++ mpevent.family = AF_INET6;
11386 ++ mpevent.addr.in6 = ifa->addr;
11387 ++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
11388 ++
11389 ++ if (event == NETDEV_DOWN || !netif_running(netdev) ||
11390 ++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
11391 ++ mpevent.code = MPTCP_EVENT_DEL;
11392 ++ else if (event == NETDEV_UP)
11393 ++ mpevent.code = MPTCP_EVENT_ADD;
11394 ++ else if (event == NETDEV_CHANGE)
11395 ++ mpevent.code = MPTCP_EVENT_MOD;
11396 ++
11397 ++ mptcp_debug("%s created event for %pI6, code %u prio %u\n", __func__,
11398 ++ &ifa->addr, mpevent.code, mpevent.low_prio);
11399 ++ add_pm_event(net, &mpevent);
11400 ++
11401 ++ spin_unlock_bh(&fm_ns->local_lock);
11402 ++ return;
11403 ++}
11404 ++
11405 ++ /* React to IPv6-address add/remove events */
11406 ++static int inet6_addr_event(struct notifier_block *this, unsigned long event,
11407 ++ void *ptr)
11408 ++{
11409 ++ struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr;
11410 ++ struct net *net = dev_net(ifa6->idev->dev);
11411 ++
11412 ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
11413 ++ event == NETDEV_CHANGE))
11414 ++ return NOTIFY_DONE;
11415 ++
11416 ++ if (ipv6_is_in_dad_state(ifa6))
11417 ++ dad_setup_timer(ifa6);
11418 ++ else
11419 ++ addr6_event_handler(ifa6, event, net);
11420 ++
11421 ++ return NOTIFY_DONE;
11422 ++}
11423 ++
11424 ++static struct notifier_block inet6_addr_notifier = {
11425 ++ .notifier_call = inet6_addr_event,
11426 ++};
11427 ++
11428 ++#endif
11429 ++
11430 ++ /* React to ifup/down events */
11431 ++static int netdev_event(struct notifier_block *this, unsigned long event,
11432 ++ void *ptr)
11433 ++{
11434 ++ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
11435 ++ struct in_device *in_dev;
11436 ++#if IS_ENABLED(CONFIG_IPV6)
11437 ++ struct inet6_dev *in6_dev;
11438 ++#endif
11439 ++
11440 ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
11441 ++ event == NETDEV_CHANGE))
11442 ++ return NOTIFY_DONE;
11443 ++
11444 ++ rcu_read_lock();
11445 ++ in_dev = __in_dev_get_rtnl(dev);
11446 ++
11447 ++ if (in_dev) {
11448 ++ for_ifa(in_dev) {
11449 ++ mptcp_pm_inetaddr_event(NULL, event, ifa);
11450 ++ } endfor_ifa(in_dev);
11451 ++ }
11452 ++
11453 ++#if IS_ENABLED(CONFIG_IPV6)
11454 ++ in6_dev = __in6_dev_get(dev);
11455 ++
11456 ++ if (in6_dev) {
11457 ++ struct inet6_ifaddr *ifa6;
11458 ++ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list)
11459 ++ inet6_addr_event(NULL, event, ifa6);
11460 ++ }
11461 ++#endif
11462 ++
11463 ++ rcu_read_unlock();
11464 ++ return NOTIFY_DONE;
11465 ++}
11466 ++
11467 ++static struct notifier_block mptcp_pm_netdev_notifier = {
11468 ++ .notifier_call = netdev_event,
11469 ++};
11470 ++
11471 ++static void full_mesh_add_raddr(struct mptcp_cb *mpcb,
11472 ++ const union inet_addr *addr,
11473 ++ sa_family_t family, __be16 port, u8 id)
11474 ++{
11475 ++ if (family == AF_INET)
11476 ++ mptcp_addv4_raddr(mpcb, &addr->in, port, id);
11477 ++ else
11478 ++ mptcp_addv6_raddr(mpcb, &addr->in6, port, id);
11479 ++}
11480 ++
11481 ++static void full_mesh_new_session(const struct sock *meta_sk)
11482 ++{
11483 ++ struct mptcp_loc_addr *mptcp_local;
11484 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11485 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11486 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
11487 ++ int i, index;
11488 ++ union inet_addr saddr, daddr;
11489 ++ sa_family_t family;
11490 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11491 ++
11492 ++ /* Init local variables necessary for the rest */
11493 ++ if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk)) {
11494 ++ saddr.ip = inet_sk(meta_sk)->inet_saddr;
11495 ++ daddr.ip = inet_sk(meta_sk)->inet_daddr;
11496 ++ family = AF_INET;
11497 ++#if IS_ENABLED(CONFIG_IPV6)
11498 ++ } else {
11499 ++ saddr.in6 = inet6_sk(meta_sk)->saddr;
11500 ++ daddr.in6 = meta_sk->sk_v6_daddr;
11501 ++ family = AF_INET6;
11502 ++#endif
11503 ++ }
11504 ++
11505 ++ rcu_read_lock();
11506 ++ mptcp_local = rcu_dereference(fm_ns->local);
11507 ++
11508 ++ index = mptcp_find_address(mptcp_local, family, &saddr);
11509 ++ if (index < 0)
11510 ++ goto fallback;
11511 ++
11512 ++ full_mesh_add_raddr(mpcb, &daddr, family, 0, 0);
11513 ++ mptcp_set_init_addr_bit(mpcb, &daddr, family, index);
11514 ++
11515 ++ /* Initialize workqueue-struct */
11516 ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
11517 ++ INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker);
11518 ++ fmp->mpcb = mpcb;
11519 ++
11520 ++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
11521 ++ goto skip_ipv4;
11522 ++
11523 ++ /* Look for the address among the local addresses */
11524 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11525 ++ __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr;
11526 ++
11527 ++ /* We do not need to announce the initial subflow's address again */
11528 ++ if (family == AF_INET && saddr.ip == ifa_address)
11529 ++ continue;
11530 ++
11531 ++ fmp->add_addr++;
11532 ++ mpcb->addr_signal = 1;
11533 ++ }
11534 ++
11535 ++skip_ipv4:
11536 ++#if IS_ENABLED(CONFIG_IPV6)
11537 ++ /* skip IPv6 addresses if meta-socket is IPv4 */
11538 ++ if (meta_v4)
11539 ++ goto skip_ipv6;
11540 ++
11541 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11542 ++ const struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr;
11543 ++
11544 ++ /* We do not need to announce the initial subflow's address again */
11545 ++ if (family == AF_INET6 && ipv6_addr_equal(&saddr.in6, ifa6))
11546 ++ continue;
11547 ++
11548 ++ fmp->add_addr++;
11549 ++ mpcb->addr_signal = 1;
11550 ++ }
11551 ++
11552 ++skip_ipv6:
11553 ++#endif
11554 ++
11555 ++ rcu_read_unlock();
11556 ++
11557 ++ if (family == AF_INET)
11558 ++ fmp->announced_addrs_v4 |= (1 << index);
11559 ++ else
11560 ++ fmp->announced_addrs_v6 |= (1 << index);
11561 ++
11562 ++ for (i = fmp->add_addr; i && fmp->add_addr; i--)
11563 ++ tcp_send_ack(mpcb->master_sk);
11564 ++
11565 ++ return;
11566 ++
11567 ++fallback:
11568 ++ rcu_read_unlock();
11569 ++ mptcp_fallback_default(mpcb);
11570 ++ return;
11571 ++}
11572 ++
11573 ++static void full_mesh_create_subflows(struct sock *meta_sk)
11574 ++{
11575 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11576 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11577 ++
11578 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
11579 ++ mpcb->send_infinite_mapping ||
11580 ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
11581 ++ return;
11582 ++
11583 ++ if (mpcb->master_sk &&
11584 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
11585 ++ return;
11586 ++
11587 ++ if (!work_pending(&fmp->subflow_work)) {
11588 ++ sock_hold(meta_sk);
11589 ++ queue_work(mptcp_wq, &fmp->subflow_work);
11590 ++ }
11591 ++}
11592 ++
11593 ++/* Called upon release_sock, if the socket was owned by the user during
11594 ++ * a path-management event.
11595 ++ */
11596 ++static void full_mesh_release_sock(struct sock *meta_sk)
11597 ++{
11598 ++ struct mptcp_loc_addr *mptcp_local;
11599 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11600 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11601 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
11602 ++ struct sock *sk, *tmpsk;
11603 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11604 ++ int i;
11605 ++
11606 ++ rcu_read_lock();
11607 ++ mptcp_local = rcu_dereference(fm_ns->local);
11608 ++
11609 ++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
11610 ++ goto skip_ipv4;
11611 ++
11612 ++ /* First, detect modifications or additions */
11613 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11614 ++ struct in_addr ifa = mptcp_local->locaddr4[i].addr;
11615 ++ bool found = false;
11616 ++
11617 ++ mptcp_for_each_sk(mpcb, sk) {
11618 ++ struct tcp_sock *tp = tcp_sk(sk);
11619 ++
11620 ++ if (sk->sk_family == AF_INET6 &&
11621 ++ !mptcp_v6_is_v4_mapped(sk))
11622 ++ continue;
11623 ++
11624 ++ if (inet_sk(sk)->inet_saddr != ifa.s_addr)
11625 ++ continue;
11626 ++
11627 ++ found = true;
11628 ++
11629 ++ if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) {
11630 ++ tp->mptcp->send_mp_prio = 1;
11631 ++ tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio;
11632 ++
11633 ++ tcp_send_ack(sk);
11634 ++ }
11635 ++ }
11636 ++
11637 ++ if (!found) {
11638 ++ fmp->add_addr++;
11639 ++ mpcb->addr_signal = 1;
11640 ++
11641 ++ sk = mptcp_select_ack_sock(meta_sk);
11642 ++ if (sk)
11643 ++ tcp_send_ack(sk);
11644 ++ full_mesh_create_subflows(meta_sk);
11645 ++ }
11646 ++ }
11647 ++
11648 ++skip_ipv4:
11649 ++#if IS_ENABLED(CONFIG_IPV6)
11650 ++ /* skip IPv6 addresses if meta-socket is IPv4 */
11651 ++ if (meta_v4)
11652 ++ goto removal;
11653 ++
11654 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11655 ++ struct in6_addr ifa = mptcp_local->locaddr6[i].addr;
11656 ++ bool found = false;
11657 ++
11658 ++ mptcp_for_each_sk(mpcb, sk) {
11659 ++ struct tcp_sock *tp = tcp_sk(sk);
11660 ++
11661 ++ if (sk->sk_family == AF_INET ||
11662 ++ mptcp_v6_is_v4_mapped(sk))
11663 ++ continue;
11664 ++
11665 ++ if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa))
11666 ++ continue;
11667 ++
11668 ++ found = true;
11669 ++
11670 ++ if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) {
11671 ++ tp->mptcp->send_mp_prio = 1;
11672 ++ tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio;
11673 ++
11674 ++ tcp_send_ack(sk);
11675 ++ }
11676 ++ }
11677 ++
11678 ++ if (!found) {
11679 ++ fmp->add_addr++;
11680 ++ mpcb->addr_signal = 1;
11681 ++
11682 ++ sk = mptcp_select_ack_sock(meta_sk);
11683 ++ if (sk)
11684 ++ tcp_send_ack(sk);
11685 ++ full_mesh_create_subflows(meta_sk);
11686 ++ }
11687 ++ }
11688 ++
11689 ++removal:
11690 ++#endif
11691 ++
11692 ++ /* Now, detect address-removals */
11693 ++ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
11694 ++ bool shall_remove = true;
11695 ++
11696 ++ if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
11697 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11698 ++ if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) {
11699 ++ shall_remove = false;
11700 ++ break;
11701 ++ }
11702 ++ }
11703 ++ } else {
11704 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11705 ++ if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) {
11706 ++ shall_remove = false;
11707 ++ break;
11708 ++ }
11709 ++ }
11710 ++ }
11711 ++
11712 ++ if (shall_remove) {
11713 ++ /* Reinject, so that pf = 1 and so we
11714 ++ * won't select this one as the
11715 ++ * ack-sock.
11716 ++ */
11717 ++ mptcp_reinject_data(sk, 0);
11718 ++
11719 ++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id,
11720 ++ meta_sk);
11721 ++
11722 ++ mptcp_sub_force_close(sk);
11723 ++ }
11724 ++ }
11725 ++
11726 ++ /* Just call it optimistically. It actually cannot do any harm */
11727 ++ update_addr_bitfields(meta_sk, mptcp_local);
11728 ++
11729 ++ rcu_read_unlock();
11730 ++}
11731 ++
11732 ++static int full_mesh_get_local_id(sa_family_t family, union inet_addr *addr,
11733 ++ struct net *net, bool *low_prio)
11734 ++{
11735 ++ struct mptcp_loc_addr *mptcp_local;
11736 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11737 ++ int index, id = -1;
11738 ++
11739 ++ /* Handle the backup-flows */
11740 ++ rcu_read_lock();
11741 ++ mptcp_local = rcu_dereference(fm_ns->local);
11742 ++
11743 ++ index = mptcp_find_address(mptcp_local, family, addr);
11744 ++
11745 ++ if (index != -1) {
11746 ++ if (family == AF_INET) {
11747 ++ id = mptcp_local->locaddr4[index].loc4_id;
11748 ++ *low_prio = mptcp_local->locaddr4[index].low_prio;
11749 ++ } else {
11750 ++ id = mptcp_local->locaddr6[index].loc6_id;
11751 ++ *low_prio = mptcp_local->locaddr6[index].low_prio;
11752 ++ }
11753 ++ }
11754 ++
11755 ++
11756 ++ rcu_read_unlock();
11757 ++
11758 ++ return id;
11759 ++}
11760 ++
11761 ++static void full_mesh_addr_signal(struct sock *sk, unsigned *size,
11762 ++ struct tcp_out_options *opts,
11763 ++ struct sk_buff *skb)
11764 ++{
11765 ++ const struct tcp_sock *tp = tcp_sk(sk);
11766 ++ struct mptcp_cb *mpcb = tp->mpcb;
11767 ++ struct sock *meta_sk = mpcb->meta_sk;
11768 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11769 ++ struct mptcp_loc_addr *mptcp_local;
11770 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk));
11771 ++ int remove_addr_len;
11772 ++ u8 unannouncedv4 = 0, unannouncedv6 = 0;
11773 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11774 ++
11775 ++ mpcb->addr_signal = 0;
11776 ++
11777 ++ if (likely(!fmp->add_addr))
11778 ++ goto remove_addr;
11779 ++
11780 ++ rcu_read_lock();
11781 ++ mptcp_local = rcu_dereference(fm_ns->local);
11782 ++
11783 ++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
11784 ++ goto skip_ipv4;
11785 ++
11786 ++ /* IPv4 */
11787 ++ unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits;
11788 ++ if (unannouncedv4 &&
11789 ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) {
11790 ++ int ind = mptcp_find_free_index(~unannouncedv4);
11791 ++
11792 ++ opts->options |= OPTION_MPTCP;
11793 ++ opts->mptcp_options |= OPTION_ADD_ADDR;
11794 ++ opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id;
11795 ++ opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr;
11796 ++ opts->add_addr_v4 = 1;
11797 ++
11798 ++ if (skb) {
11799 ++ fmp->announced_addrs_v4 |= (1 << ind);
11800 ++ fmp->add_addr--;
11801 ++ }
11802 ++ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN;
11803 ++ }
11804 ++
11805 ++ if (meta_v4)
11806 ++ goto skip_ipv6;
11807 ++
11808 ++skip_ipv4:
11809 ++ /* IPv6 */
11810 ++ unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits;
11811 ++ if (unannouncedv6 &&
11812 ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) {
11813 ++ int ind = mptcp_find_free_index(~unannouncedv6);
11814 ++
11815 ++ opts->options |= OPTION_MPTCP;
11816 ++ opts->mptcp_options |= OPTION_ADD_ADDR;
11817 ++ opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id;
11818 ++ opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr;
11819 ++ opts->add_addr_v6 = 1;
11820 ++
11821 ++ if (skb) {
11822 ++ fmp->announced_addrs_v6 |= (1 << ind);
11823 ++ fmp->add_addr--;
11824 ++ }
11825 ++ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN;
11826 ++ }
11827 ++
11828 ++skip_ipv6:
11829 ++ rcu_read_unlock();
11830 ++
11831 ++ if (!unannouncedv4 && !unannouncedv6 && skb)
11832 ++ fmp->add_addr--;
11833 ++
11834 ++remove_addr:
11835 ++ if (likely(!fmp->remove_addrs))
11836 ++ goto exit;
11837 ++
11838 ++ remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs);
11839 ++ if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len)
11840 ++ goto exit;
11841 ++
11842 ++ opts->options |= OPTION_MPTCP;
11843 ++ opts->mptcp_options |= OPTION_REMOVE_ADDR;
11844 ++ opts->remove_addrs = fmp->remove_addrs;
11845 ++ *size += remove_addr_len;
11846 ++ if (skb)
11847 ++ fmp->remove_addrs = 0;
11848 ++
11849 ++exit:
11850 ++ mpcb->addr_signal = !!(fmp->add_addr || fmp->remove_addrs);
11851 ++}
11852 ++
11853 ++static void full_mesh_rem_raddr(struct mptcp_cb *mpcb, u8 rem_id)
11854 ++{
11855 ++ mptcp_v4_rem_raddress(mpcb, rem_id);
11856 ++ mptcp_v6_rem_raddress(mpcb, rem_id);
11857 ++}
11858 ++
11859 ++/* Output /proc/net/mptcp_fullmesh */
11860 ++static int mptcp_fm_seq_show(struct seq_file *seq, void *v)
11861 ++{
11862 ++ const struct net *net = seq->private;
11863 ++ struct mptcp_loc_addr *mptcp_local;
11864 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11865 ++ int i;
11866 ++
11867 ++ seq_printf(seq, "Index, Address-ID, Backup, IP-address\n");
11868 ++
11869 ++ rcu_read_lock_bh();
11870 ++ mptcp_local = rcu_dereference(fm_ns->local);
11871 ++
11872 ++ seq_printf(seq, "IPv4, next v4-index: %u\n", mptcp_local->next_v4_index);
11873 ++
11874 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11875 ++ struct mptcp_loc4 *loc4 = &mptcp_local->locaddr4[i];
11876 ++
11877 ++ seq_printf(seq, "%u, %u, %u, %pI4\n", i, loc4->loc4_id,
11878 ++ loc4->low_prio, &loc4->addr);
11879 ++ }
11880 ++
11881 ++ seq_printf(seq, "IPv6, next v6-index: %u\n", mptcp_local->next_v6_index);
11882 ++
11883 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11884 ++ struct mptcp_loc6 *loc6 = &mptcp_local->locaddr6[i];
11885 ++
11886 ++ seq_printf(seq, "%u, %u, %u, %pI6\n", i, loc6->loc6_id,
11887 ++ loc6->low_prio, &loc6->addr);
11888 ++ }
11889 ++ rcu_read_unlock_bh();
11890 ++
11891 ++ return 0;
11892 ++}
11893 ++
11894 ++static int mptcp_fm_seq_open(struct inode *inode, struct file *file)
11895 ++{
11896 ++ return single_open_net(inode, file, mptcp_fm_seq_show);
11897 ++}
11898 ++
11899 ++static const struct file_operations mptcp_fm_seq_fops = {
11900 ++ .owner = THIS_MODULE,
11901 ++ .open = mptcp_fm_seq_open,
11902 ++ .read = seq_read,
11903 ++ .llseek = seq_lseek,
11904 ++ .release = single_release_net,
11905 ++};
11906 ++
11907 ++static int mptcp_fm_init_net(struct net *net)
11908 ++{
11909 ++ struct mptcp_loc_addr *mptcp_local;
11910 ++ struct mptcp_fm_ns *fm_ns;
11911 ++ int err = 0;
11912 ++
11913 ++ fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL);
11914 ++ if (!fm_ns)
11915 ++ return -ENOBUFS;
11916 ++
11917 ++ mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL);
11918 ++ if (!mptcp_local) {
11919 ++ err = -ENOBUFS;
11920 ++ goto err_mptcp_local;
11921 ++ }
11922 ++
11923 ++ if (!proc_create("mptcp_fullmesh", S_IRUGO, net->proc_net,
11924 ++ &mptcp_fm_seq_fops)) {
11925 ++ err = -ENOMEM;
11926 ++ goto err_seq_fops;
11927 ++ }
11928 ++
11929 ++ mptcp_local->next_v4_index = 1;
11930 ++
11931 ++ rcu_assign_pointer(fm_ns->local, mptcp_local);
11932 ++ INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker);
11933 ++ INIT_LIST_HEAD(&fm_ns->events);
11934 ++ spin_lock_init(&fm_ns->local_lock);
11935 ++ fm_ns->net = net;
11936 ++ net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns;
11937 ++
11938 ++ return 0;
11939 ++err_seq_fops:
11940 ++ kfree(mptcp_local);
11941 ++err_mptcp_local:
11942 ++ kfree(fm_ns);
11943 ++ return err;
11944 ++}
11945 ++
11946 ++static void mptcp_fm_exit_net(struct net *net)
11947 ++{
11948 ++ struct mptcp_addr_event *eventq, *tmp;
11949 ++ struct mptcp_fm_ns *fm_ns;
11950 ++ struct mptcp_loc_addr *mptcp_local;
11951 ++
11952 ++ fm_ns = fm_get_ns(net);
11953 ++ cancel_delayed_work_sync(&fm_ns->address_worker);
11954 ++
11955 ++ rcu_read_lock_bh();
11956 ++
11957 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
11958 ++ kfree(mptcp_local);
11959 ++
11960 ++ spin_lock(&fm_ns->local_lock);
11961 ++ list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) {
11962 ++ list_del(&eventq->list);
11963 ++ kfree(eventq);
11964 ++ }
11965 ++ spin_unlock(&fm_ns->local_lock);
11966 ++
11967 ++ rcu_read_unlock_bh();
11968 ++
11969 ++ remove_proc_entry("mptcp_fullmesh", net->proc_net);
11970 ++
11971 ++ kfree(fm_ns);
11972 ++}
11973 ++
11974 ++static struct pernet_operations full_mesh_net_ops = {
11975 ++ .init = mptcp_fm_init_net,
11976 ++ .exit = mptcp_fm_exit_net,
11977 ++};
11978 ++
11979 ++static struct mptcp_pm_ops full_mesh __read_mostly = {
11980 ++ .new_session = full_mesh_new_session,
11981 ++ .release_sock = full_mesh_release_sock,
11982 ++ .fully_established = full_mesh_create_subflows,
11983 ++ .new_remote_address = full_mesh_create_subflows,
11984 ++ .get_local_id = full_mesh_get_local_id,
11985 ++ .addr_signal = full_mesh_addr_signal,
11986 ++ .add_raddr = full_mesh_add_raddr,
11987 ++ .rem_raddr = full_mesh_rem_raddr,
11988 ++ .name = "fullmesh",
11989 ++ .owner = THIS_MODULE,
11990 ++};
11991 ++
11992 ++/* General initialization of MPTCP_PM */
11993 ++static int __init full_mesh_register(void)
11994 ++{
11995 ++ int ret;
11996 ++
11997 ++ BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE);
11998 ++
11999 ++ ret = register_pernet_subsys(&full_mesh_net_ops);
12000 ++ if (ret)
12001 ++ goto out;
12002 ++
12003 ++ ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
12004 ++ if (ret)
12005 ++ goto err_reg_inetaddr;
12006 ++ ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier);
12007 ++ if (ret)
12008 ++ goto err_reg_netdev;
12009 ++
12010 ++#if IS_ENABLED(CONFIG_IPV6)
12011 ++ ret = register_inet6addr_notifier(&inet6_addr_notifier);
12012 ++ if (ret)
12013 ++ goto err_reg_inet6addr;
12014 ++#endif
12015 ++
12016 ++ ret = mptcp_register_path_manager(&full_mesh);
12017 ++ if (ret)
12018 ++ goto err_reg_pm;
12019 ++
12020 ++out:
12021 ++ return ret;
12022 ++
12023 ++
12024 ++err_reg_pm:
12025 ++#if IS_ENABLED(CONFIG_IPV6)
12026 ++ unregister_inet6addr_notifier(&inet6_addr_notifier);
12027 ++err_reg_inet6addr:
12028 ++#endif
12029 ++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
12030 ++err_reg_netdev:
12031 ++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
12032 ++err_reg_inetaddr:
12033 ++ unregister_pernet_subsys(&full_mesh_net_ops);
12034 ++ goto out;
12035 ++}
12036 ++
12037 ++static void full_mesh_unregister(void)
12038 ++{
12039 ++#if IS_ENABLED(CONFIG_IPV6)
12040 ++ unregister_inet6addr_notifier(&inet6_addr_notifier);
12041 ++#endif
12042 ++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
12043 ++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
12044 ++ unregister_pernet_subsys(&full_mesh_net_ops);
12045 ++ mptcp_unregister_path_manager(&full_mesh);
12046 ++}
12047 ++
12048 ++module_init(full_mesh_register);
12049 ++module_exit(full_mesh_unregister);
12050 ++
12051 ++MODULE_AUTHOR("Christoph Paasch");
12052 ++MODULE_LICENSE("GPL");
12053 ++MODULE_DESCRIPTION("Full-Mesh MPTCP");
12054 ++MODULE_VERSION("0.88");
12055 +diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c
12056 +new file mode 100644
12057 +index 000000000000..43704ccb639e
12058 +--- /dev/null
12059 ++++ b/net/mptcp/mptcp_input.c
12060 +@@ -0,0 +1,2405 @@
12061 ++/*
12062 ++ * MPTCP implementation - Sending side
12063 ++ *
12064 ++ * Initial Design & Implementation:
12065 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
12066 ++ *
12067 ++ * Current Maintainer & Author:
12068 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
12069 ++ *
12070 ++ * Additional authors:
12071 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
12072 ++ * Gregory Detal <gregory.detal@×××××××××.be>
12073 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
12074 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
12075 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
12076 ++ * Andreas Ripke <ripke@××××××.eu>
12077 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
12078 ++ * Octavian Purdila <octavian.purdila@×××××.com>
12079 ++ * John Ronan <jronan@××××.org>
12080 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
12081 ++ * Brandon Heller <brandonh@××××××××.edu>
12082 ++ *
12083 ++ *
12084 ++ * This program is free software; you can redistribute it and/or
12085 ++ * modify it under the terms of the GNU General Public License
12086 ++ * as published by the Free Software Foundation; either version
12087 ++ * 2 of the License, or (at your option) any later version.
12088 ++ */
12089 ++
12090 ++#include <asm/unaligned.h>
12091 ++
12092 ++#include <net/mptcp.h>
12093 ++#include <net/mptcp_v4.h>
12094 ++#include <net/mptcp_v6.h>
12095 ++
12096 ++#include <linux/kconfig.h>
12097 ++
12098 ++/* is seq1 < seq2 ? */
12099 ++static inline bool before64(const u64 seq1, const u64 seq2)
12100 ++{
12101 ++ return (s64)(seq1 - seq2) < 0;
12102 ++}
12103 ++
12104 ++/* is seq1 > seq2 ? */
12105 ++#define after64(seq1, seq2) before64(seq2, seq1)
12106 ++
12107 ++static inline void mptcp_become_fully_estab(struct sock *sk)
12108 ++{
12109 ++ tcp_sk(sk)->mptcp->fully_established = 1;
12110 ++
12111 ++ if (is_master_tp(tcp_sk(sk)) &&
12112 ++ tcp_sk(sk)->mpcb->pm_ops->fully_established)
12113 ++ tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk));
12114 ++}
12115 ++
12116 ++/* Similar to tcp_tso_acked without any memory accounting */
12117 ++static inline int mptcp_tso_acked_reinject(const struct sock *meta_sk,
12118 ++ struct sk_buff *skb)
12119 ++{
12120 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12121 ++ u32 packets_acked, len;
12122 ++
12123 ++ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una));
12124 ++
12125 ++ packets_acked = tcp_skb_pcount(skb);
12126 ++
12127 ++ if (skb_unclone(skb, GFP_ATOMIC))
12128 ++ return 0;
12129 ++
12130 ++ len = meta_tp->snd_una - TCP_SKB_CB(skb)->seq;
12131 ++ __pskb_trim_head(skb, len);
12132 ++
12133 ++ TCP_SKB_CB(skb)->seq += len;
12134 ++ skb->ip_summed = CHECKSUM_PARTIAL;
12135 ++ skb->truesize -= len;
12136 ++
12137 ++ /* Any change of skb->len requires recalculation of tso factor. */
12138 ++ if (tcp_skb_pcount(skb) > 1)
12139 ++ tcp_set_skb_tso_segs(meta_sk, skb, tcp_skb_mss(skb));
12140 ++ packets_acked -= tcp_skb_pcount(skb);
12141 ++
12142 ++ if (packets_acked) {
12143 ++ BUG_ON(tcp_skb_pcount(skb) == 0);
12144 ++ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
12145 ++ }
12146 ++
12147 ++ return packets_acked;
12148 ++}
12149 ++
12150 ++/**
12151 ++ * Cleans the meta-socket retransmission queue and the reinject-queue.
12152 ++ * @meta_sk must be the meta-socket.
12153 ++ */
12154 ++static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una)
12155 ++{
12156 ++ struct sk_buff *skb, *tmp;
12157 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12158 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
12159 ++ bool acked = false;
12160 ++ u32 acked_pcount;
12161 ++
12162 ++ while ((skb = tcp_write_queue_head(meta_sk)) &&
12163 ++ skb != tcp_send_head(meta_sk)) {
12164 ++ bool fully_acked = true;
12165 ++
12166 ++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
12167 ++ if (tcp_skb_pcount(skb) == 1 ||
12168 ++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
12169 ++ break;
12170 ++
12171 ++ acked_pcount = tcp_tso_acked(meta_sk, skb);
12172 ++ if (!acked_pcount)
12173 ++ break;
12174 ++
12175 ++ fully_acked = false;
12176 ++ } else {
12177 ++ acked_pcount = tcp_skb_pcount(skb);
12178 ++ }
12179 ++
12180 ++ acked = true;
12181 ++ meta_tp->packets_out -= acked_pcount;
12182 ++ meta_tp->retrans_stamp = 0;
12183 ++
12184 ++ if (!fully_acked)
12185 ++ break;
12186 ++
12187 ++ tcp_unlink_write_queue(skb, meta_sk);
12188 ++
12189 ++ if (mptcp_is_data_fin(skb)) {
12190 ++ struct sock *sk_it;
12191 ++
12192 ++ /* DATA_FIN has been acknowledged - now we can close
12193 ++ * the subflows
12194 ++ */
12195 ++ mptcp_for_each_sk(mpcb, sk_it) {
12196 ++ unsigned long delay = 0;
12197 ++
12198 ++ /* If we are the passive closer, don't trigger
12199 ++ * subflow-fin until the subflow has been finned
12200 ++ * by the peer - thus we add a delay.
12201 ++ */
12202 ++ if (mpcb->passive_close &&
12203 ++ sk_it->sk_state == TCP_ESTABLISHED)
12204 ++ delay = inet_csk(sk_it)->icsk_rto << 3;
12205 ++
12206 ++ mptcp_sub_close(sk_it, delay);
12207 ++ }
12208 ++ }
12209 ++ sk_wmem_free_skb(meta_sk, skb);
12210 ++ }
12211 ++ /* Remove acknowledged data from the reinject queue */
12212 ++ skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) {
12213 ++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
12214 ++ if (tcp_skb_pcount(skb) == 1 ||
12215 ++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
12216 ++ break;
12217 ++
12218 ++ mptcp_tso_acked_reinject(meta_sk, skb);
12219 ++ break;
12220 ++ }
12221 ++
12222 ++ __skb_unlink(skb, &mpcb->reinject_queue);
12223 ++ __kfree_skb(skb);
12224 ++ }
12225 ++
12226 ++ if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una)))
12227 ++ meta_tp->snd_up = meta_tp->snd_una;
12228 ++
12229 ++ if (acked) {
12230 ++ tcp_rearm_rto(meta_sk);
12231 ++ /* Normally this is done in tcp_try_undo_loss - but MPTCP
12232 ++ * does not call this function.
12233 ++ */
12234 ++ inet_csk(meta_sk)->icsk_retransmits = 0;
12235 ++ }
12236 ++}
12237 ++
12238 ++/* Inspired by tcp_rcv_state_process */
12239 ++static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk,
12240 ++ const struct sk_buff *skb, u32 data_seq,
12241 ++ u16 data_len)
12242 ++{
12243 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
12244 ++ const struct tcphdr *th = tcp_hdr(skb);
12245 ++
12246 ++ /* State-machine handling if FIN has been enqueued and it has
12247 ++ * been acked (snd_una == write_seq) - it's important that this
12248 ++ * runs after sk_wmem_free_skb because otherwise
12249 ++ * sk_forward_alloc is wrong upon inet_csk_destroy_sock()
12250 ++ */
12251 ++ switch (meta_sk->sk_state) {
12252 ++ case TCP_FIN_WAIT1: {
12253 ++ struct dst_entry *dst;
12254 ++ int tmo;
12255 ++
12256 ++ if (meta_tp->snd_una != meta_tp->write_seq)
12257 ++ break;
12258 ++
12259 ++ tcp_set_state(meta_sk, TCP_FIN_WAIT2);
12260 ++ meta_sk->sk_shutdown |= SEND_SHUTDOWN;
12261 ++
12262 ++ dst = __sk_dst_get(sk);
12263 ++ if (dst)
12264 ++ dst_confirm(dst);
12265 ++
12266 ++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
12267 ++ /* Wake up lingering close() */
12268 ++ meta_sk->sk_state_change(meta_sk);
12269 ++ break;
12270 ++ }
12271 ++
12272 ++ if (meta_tp->linger2 < 0 ||
12273 ++ (data_len &&
12274 ++ after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0),
12275 ++ meta_tp->rcv_nxt))) {
12276 ++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
12277 ++ tcp_done(meta_sk);
12278 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
12279 ++ return 1;
12280 ++ }
12281 ++
12282 ++ tmo = tcp_fin_time(meta_sk);
12283 ++ if (tmo > TCP_TIMEWAIT_LEN) {
12284 ++ inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN);
12285 ++ } else if (mptcp_is_data_fin2(skb, tp) || sock_owned_by_user(meta_sk)) {
12286 ++ /* Bad case. We could lose such FIN otherwise.
12287 ++ * It is not a big problem, but it looks confusing
12288 ++ * and not so rare event. We still can lose it now,
12289 ++ * if it spins in bh_lock_sock(), but it is really
12290 ++ * marginal case.
12291 ++ */
12292 ++ inet_csk_reset_keepalive_timer(meta_sk, tmo);
12293 ++ } else {
12294 ++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
12295 ++ }
12296 ++ break;
12297 ++ }
12298 ++ case TCP_CLOSING:
12299 ++ case TCP_LAST_ACK:
12300 ++ if (meta_tp->snd_una == meta_tp->write_seq) {
12301 ++ tcp_done(meta_sk);
12302 ++ return 1;
12303 ++ }
12304 ++ break;
12305 ++ }
12306 ++
12307 ++ /* step 7: process the segment text */
12308 ++ switch (meta_sk->sk_state) {
12309 ++ case TCP_FIN_WAIT1:
12310 ++ case TCP_FIN_WAIT2:
12311 ++ /* RFC 793 says to queue data in these states,
12312 ++ * RFC 1122 says we MUST send a reset.
12313 ++ * BSD 4.4 also does reset.
12314 ++ */
12315 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
12316 ++ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
12317 ++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
12318 ++ !mptcp_is_data_fin2(skb, tp)) {
12319 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
12320 ++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
12321 ++ tcp_reset(meta_sk);
12322 ++ return 1;
12323 ++ }
12324 ++ }
12325 ++ break;
12326 ++ }
12327 ++
12328 ++ return 0;
12329 ++}
12330 ++
12331 ++/**
12332 ++ * @return:
12333 ++ * i) 1: Everything's fine.
12334 ++ * ii) -1: A reset has been sent on the subflow - csum-failure
12335 ++ * iii) 0: csum-failure but no reset sent, because it's the last subflow.
12336 ++ * Last packet should not be destroyed by the caller because it has
12337 ++ * been done here.
12338 ++ */
12339 ++static int mptcp_verif_dss_csum(struct sock *sk)
12340 ++{
12341 ++ struct tcp_sock *tp = tcp_sk(sk);
12342 ++ struct sk_buff *tmp, *tmp1, *last = NULL;
12343 ++ __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */
12344 ++ int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0;
12345 ++ int iter = 0;
12346 ++
12347 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) {
12348 ++ unsigned int csum_len;
12349 ++
12350 ++ if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq))
12351 ++ /* Mapping ends in the middle of the packet -
12352 ++ * csum only these bytes
12353 ++ */
12354 ++ csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq;
12355 ++ else
12356 ++ csum_len = tmp->len;
12357 ++
12358 ++ offset = 0;
12359 ++ if (overflowed) {
12360 ++ char first_word[4];
12361 ++ first_word[0] = 0;
12362 ++ first_word[1] = 0;
12363 ++ first_word[2] = 0;
12364 ++ first_word[3] = *(tmp->data);
12365 ++ csum_tcp = csum_partial(first_word, 4, csum_tcp);
12366 ++ offset = 1;
12367 ++ csum_len--;
12368 ++ overflowed = 0;
12369 ++ }
12370 ++
12371 ++ csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp);
12372 ++
12373 ++ /* Was it on an odd-length? Then we have to merge the next byte
12374 ++ * correctly (see above)
12375 ++ */
12376 ++ if (csum_len != (csum_len & (~1)))
12377 ++ overflowed = 1;
12378 ++
12379 ++ if (mptcp_is_data_seq(tmp) && !dss_csum_added) {
12380 ++ __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32));
12381 ++
12382 ++ /* If a 64-bit dss is present, we increase the offset
12383 ++ * by 4 bytes, as the high-order 32 bits will be added
12384 ++ * in the final csum_partial-call.
12385 ++ */
12386 ++ u32 offset = skb_transport_offset(tmp) +
12387 ++ TCP_SKB_CB(tmp)->dss_off;
12388 ++ if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET)
12389 ++ offset += 4;
12390 ++
12391 ++ csum_tcp = skb_checksum(tmp, offset,
12392 ++ MPTCP_SUB_LEN_SEQ_CSUM,
12393 ++ csum_tcp);
12394 ++
12395 ++ csum_tcp = csum_partial(&data_seq,
12396 ++ sizeof(data_seq), csum_tcp);
12397 ++
12398 ++ dss_csum_added = 1; /* Just do it once */
12399 ++ }
12400 ++ last = tmp;
12401 ++ iter++;
12402 ++
12403 ++ if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) &&
12404 ++ !before(TCP_SKB_CB(tmp1)->seq,
12405 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12406 ++ break;
12407 ++ }
12408 ++
12409 ++ /* Now, checksum must be 0 */
12410 ++ if (unlikely(csum_fold(csum_tcp))) {
12411 ++ pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n",
12412 ++ __func__, csum_fold(csum_tcp), TCP_SKB_CB(last)->seq,
12413 ++ dss_csum_added, overflowed, iter);
12414 ++
12415 ++ tp->mptcp->send_mp_fail = 1;
12416 ++
12417 ++ /* map_data_seq is the data-seq number of the
12418 ++ * mapping we are currently checking
12419 ++ */
12420 ++ tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq;
12421 ++
12422 ++ if (tp->mpcb->cnt_subflows > 1) {
12423 ++ mptcp_send_reset(sk);
12424 ++ ans = -1;
12425 ++ } else {
12426 ++ tp->mpcb->send_infinite_mapping = 1;
12427 ++
12428 ++ /* Need to purge the rcv-queue as it's no longer valid */
12429 ++ while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
12430 ++ tp->copied_seq = TCP_SKB_CB(tmp)->end_seq;
12431 ++ kfree_skb(tmp);
12432 ++ }
12433 ++
12434 ++ ans = 0;
12435 ++ }
12436 ++ }
12437 ++
12438 ++ return ans;
12439 ++}
12440 ++
12441 ++static inline void mptcp_prepare_skb(struct sk_buff *skb,
12442 ++ const struct sock *sk)
12443 ++{
12444 ++ const struct tcp_sock *tp = tcp_sk(sk);
12445 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
12446 ++ u32 inc = 0;
12447 ++
12448 ++ /* If skb is the end of this mapping (end is always at mapping-boundary
12449 ++ * thanks to the splitting/trimming), then we need to increase
12450 ++ * data-end-seq by 1 if this here is a data-fin.
12451 ++ *
12452 ++ * We need to do -1 because end_seq includes the subflow-FIN.
12453 ++ */
12454 ++ if (tp->mptcp->map_data_fin &&
12455 ++ (tcb->end_seq - (tcp_hdr(skb)->fin ? 1 : 0)) ==
12456 ++ (tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
12457 ++ inc = 1;
12458 ++
12459 ++ /* We manually set the fin-flag if it is a data-fin. For easy
12460 ++ * processing in tcp_recvmsg.
12461 ++ */
12462 ++ tcp_hdr(skb)->fin = 1;
12463 ++ } else {
12464 ++ /* We may have a subflow-fin with data but without data-fin */
12465 ++ tcp_hdr(skb)->fin = 0;
12466 ++ }
12467 ++
12468 ++ /* Adapt data-seq's to the packet itself. We kinda transform the
12469 ++ * dss-mapping to a per-packet granularity. This is necessary to
12470 ++ * correctly handle overlapping mappings coming from different
12471 ++ * subflows. Otherwise it would be a complete mess.
12472 ++ */
12473 ++ tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq;
12474 ++ tcb->end_seq = tcb->seq + skb->len + inc;
12475 ++}
12476 ++
12477 ++/**
12478 ++ * @return: 1 if the segment has been eaten and can be suppressed,
12479 ++ * otherwise 0.
12480 ++ */
12481 ++static inline int mptcp_direct_copy(const struct sk_buff *skb,
12482 ++ struct sock *meta_sk)
12483 ++{
12484 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12485 ++ int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len);
12486 ++ int eaten = 0;
12487 ++
12488 ++ __set_current_state(TASK_RUNNING);
12489 ++
12490 ++ local_bh_enable();
12491 ++ if (!skb_copy_datagram_iovec(skb, 0, meta_tp->ucopy.iov, chunk)) {
12492 ++ meta_tp->ucopy.len -= chunk;
12493 ++ meta_tp->copied_seq += chunk;
12494 ++ eaten = (chunk == skb->len);
12495 ++ tcp_rcv_space_adjust(meta_sk);
12496 ++ }
12497 ++ local_bh_disable();
12498 ++ return eaten;
12499 ++}
12500 ++
12501 ++static inline void mptcp_reset_mapping(struct tcp_sock *tp)
12502 ++{
12503 ++ tp->mptcp->map_data_len = 0;
12504 ++ tp->mptcp->map_data_seq = 0;
12505 ++ tp->mptcp->map_subseq = 0;
12506 ++ tp->mptcp->map_data_fin = 0;
12507 ++ tp->mptcp->mapping_present = 0;
12508 ++}
12509 ++
12510 ++/* The DSS-mapping received on the sk only covers the second half of the skb
12511 ++ * (cut at seq). We trim the head from the skb.
12512 ++ * Data will be freed upon kfree().
12513 ++ *
12514 ++ * Inspired by tcp_trim_head().
12515 ++ */
12516 ++static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq)
12517 ++{
12518 ++ int len = seq - TCP_SKB_CB(skb)->seq;
12519 ++ u32 new_seq = TCP_SKB_CB(skb)->seq + len;
12520 ++
12521 ++ if (len < skb_headlen(skb))
12522 ++ __skb_pull(skb, len);
12523 ++ else
12524 ++ __pskb_trim_head(skb, len - skb_headlen(skb));
12525 ++
12526 ++ TCP_SKB_CB(skb)->seq = new_seq;
12527 ++
12528 ++ skb->truesize -= len;
12529 ++ atomic_sub(len, &sk->sk_rmem_alloc);
12530 ++ sk_mem_uncharge(sk, len);
12531 ++}
12532 ++
12533 ++/* The DSS-mapping received on the sk only covers the first half of the skb
12534 ++ * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue
12535 ++ * as further packets may resolve the mapping of the second half of data.
12536 ++ *
12537 ++ * Inspired by tcp_fragment().
12538 ++ */
12539 ++static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq)
12540 ++{
12541 ++ struct sk_buff *buff;
12542 ++ int nsize;
12543 ++ int nlen, len;
12544 ++
12545 ++ len = seq - TCP_SKB_CB(skb)->seq;
12546 ++ nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len;
12547 ++ if (nsize < 0)
12548 ++ nsize = 0;
12549 ++
12550 ++ /* Get a new skb... force flag on. */
12551 ++ buff = alloc_skb(nsize, GFP_ATOMIC);
12552 ++ if (buff == NULL)
12553 ++ return -ENOMEM;
12554 ++
12555 ++ skb_reserve(buff, tcp_sk(sk)->tcp_header_len);
12556 ++ skb_reset_transport_header(buff);
12557 ++
12558 ++ tcp_hdr(buff)->fin = tcp_hdr(skb)->fin;
12559 ++ tcp_hdr(skb)->fin = 0;
12560 ++
12561 ++ /* We absolutely need to call skb_set_owner_r before refreshing the
12562 ++ * truesize of buff, otherwise the moved data would be accounted twice.
12563 ++ */
12564 ++ skb_set_owner_r(buff, sk);
12565 ++ nlen = skb->len - len - nsize;
12566 ++ buff->truesize += nlen;
12567 ++ skb->truesize -= nlen;
12568 ++
12569 ++ /* Correct the sequence numbers. */
12570 ++ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
12571 ++ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
12572 ++ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
12573 ++
12574 ++ skb_split(skb, buff, len);
12575 ++
12576 ++ __skb_queue_after(&sk->sk_receive_queue, skb, buff);
12577 ++
12578 ++ return 0;
12579 ++}
12580 ++
12581 ++/* @return: 0 everything is fine. Just continue processing
12582 ++ * 1 subflow is broken stop everything
12583 ++ * -1 this packet was broken - continue with the next one.
12584 ++ */
12585 ++static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb)
12586 ++{
12587 ++ struct tcp_sock *tp = tcp_sk(sk);
12588 ++
12589 ++ /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */
12590 ++ if (!skb->len && tcp_hdr(skb)->fin && !mptcp_is_data_fin(skb) &&
12591 ++ !tp->mpcb->infinite_mapping_rcv) {
12592 ++ /* Remove a pure subflow-fin from the queue and increase
12593 ++ * copied_seq.
12594 ++ */
12595 ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
12596 ++ __skb_unlink(skb, &sk->sk_receive_queue);
12597 ++ __kfree_skb(skb);
12598 ++ return -1;
12599 ++ }
12600 ++
12601 ++ /* If we are not yet fully established and do not know the mapping for
12602 ++ * this segment, this path has to fall back to the infinite mapping or be torn down.
12603 ++ */
12604 ++ if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
12605 ++ !tp->mptcp->mapping_present && !tp->mpcb->infinite_mapping_rcv) {
12606 ++ pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n",
12607 ++ __func__, tp->mpcb->mptcp_loc_token,
12608 ++ tp->mptcp->path_index, __builtin_return_address(0),
12609 ++ TCP_SKB_CB(skb)->seq);
12610 ++
12611 ++ if (!is_master_tp(tp)) {
12612 ++ mptcp_send_reset(sk);
12613 ++ return 1;
12614 ++ }
12615 ++
12616 ++ tp->mpcb->infinite_mapping_snd = 1;
12617 ++ tp->mpcb->infinite_mapping_rcv = 1;
12618 ++ /* We do a seamless fallback and should not send an infinite mapping. */
12619 ++ tp->mpcb->send_infinite_mapping = 0;
12620 ++ tp->mptcp->fully_established = 1;
12621 ++ }
12622 ++
12623 ++ /* Receiver-side becomes fully established when a whole rcv-window has
12624 ++ * been received without the need to fallback due to the previous
12625 ++ * condition.
12626 ++ */
12627 ++ if (!tp->mptcp->fully_established) {
12628 ++ tp->mptcp->init_rcv_wnd -= skb->len;
12629 ++ if (tp->mptcp->init_rcv_wnd < 0)
12630 ++ mptcp_become_fully_estab(sk);
12631 ++ }
12632 ++
12633 ++ return 0;
12634 ++}
12635 ++
12636 ++/* @return: 0 everything is fine. Just continue processing
12637 ++ * 1 subflow is broken stop everything
12638 ++ * -1 this packet was broken - continue with the next one.
12639 ++ */
12640 ++static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb)
12641 ++{
12642 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
12643 ++ struct mptcp_cb *mpcb = tp->mpcb;
12644 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
12645 ++ u32 *ptr;
12646 ++ u32 data_seq, sub_seq, data_len, tcp_end_seq;
12647 ++
12648 ++ /* If we are in infinite-mapping-mode, the subflow is guaranteed to be
12649 ++ * in-order at the data-level. Thus data-seq-numbers can be inferred
12650 ++ * from what is expected at the data-level.
12651 ++ */
12652 ++ if (mpcb->infinite_mapping_rcv) {
12653 ++ tp->mptcp->map_data_seq = mptcp_get_rcv_nxt_64(meta_tp);
12654 ++ tp->mptcp->map_subseq = tcb->seq;
12655 ++ tp->mptcp->map_data_len = skb->len;
12656 ++ tp->mptcp->map_data_fin = tcp_hdr(skb)->fin;
12657 ++ tp->mptcp->mapping_present = 1;
12658 ++ return 0;
12659 ++ }
12660 ++
12661 ++ /* No mapping here? Exit - it is either already set or still on its way */
12662 ++ if (!mptcp_is_data_seq(skb)) {
12663 ++ /* Too many packets without a mapping - this subflow is broken */
12664 ++ if (!tp->mptcp->mapping_present &&
12665 ++ tp->rcv_nxt - tp->copied_seq > 65536) {
12666 ++ mptcp_send_reset(sk);
12667 ++ return 1;
12668 ++ }
12669 ++
12670 ++ return 0;
12671 ++ }
12672 ++
12673 ++ ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb);
12674 ++ ptr++;
12675 ++ sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn;
12676 ++ ptr++;
12677 ++ data_len = get_unaligned_be16(ptr);
12678 ++
12679 ++ /* If it's an empty skb with DATA_FIN, sub_seq must get fixed.
12680 ++ * The draft sets it to 0, but we really would like to have the
12681 ++ * real value, to make handling easier later in this
12682 ++ * function.
12683 ++ */
12684 ++ if (mptcp_is_data_fin(skb) && skb->len == 0)
12685 ++ sub_seq = TCP_SKB_CB(skb)->seq;
12686 ++
12687 ++ /* If there is already a mapping - we check if it maps with the current
12688 ++ * one. If not - we reset.
12689 ++ */
12690 ++ if (tp->mptcp->mapping_present &&
12691 ++ (data_seq != (u32)tp->mptcp->map_data_seq ||
12692 ++ sub_seq != tp->mptcp->map_subseq ||
12693 ++ data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin ||
12694 ++ mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) {
12695 ++ /* Mapping in packet is different from what we want */
12696 ++ pr_err("%s Mappings do not match!\n", __func__);
12697 ++ pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n",
12698 ++ __func__, data_seq, (u32)tp->mptcp->map_data_seq,
12699 ++ sub_seq, tp->mptcp->map_subseq, data_len,
12700 ++ tp->mptcp->map_data_len, mptcp_is_data_fin(skb),
12701 ++ tp->mptcp->map_data_fin);
12702 ++ mptcp_send_reset(sk);
12703 ++ return 1;
12704 ++ }
12705 ++
12706 ++ /* If the previous check was good, the current mapping is valid and we exit. */
12707 ++ if (tp->mptcp->mapping_present)
12708 ++ return 0;
12709 ++
12710 ++ /* Mapping not yet set on this subflow - we set it here! */
12711 ++
12712 ++ if (!data_len) {
12713 ++ mpcb->infinite_mapping_rcv = 1;
12714 ++ tp->mptcp->fully_established = 1;
12715 ++ /* We need to repeat mp_fail's until the sender fell
12716 ++ * back to infinite-mapping - here we stop repeating it.
12717 ++ */
12718 ++ tp->mptcp->send_mp_fail = 0;
12719 ++
12720 ++ /* We have to fixup data_len - it must be the same as skb->len */
12721 ++ data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0);
12722 ++ sub_seq = tcb->seq;
12723 ++
12724 ++ /* TODO kill all other subflows than this one */
12725 ++ /* data_seq and so on are set correctly */
12726 ++
12727 ++ /* At this point, the meta-ofo-queue has to be emptied,
12728 ++ * as the following data is guaranteed to be in-order at
12729 ++ * the data and subflow-level
12730 ++ */
12731 ++ mptcp_purge_ofo_queue(meta_tp);
12732 ++ }
12733 ++
12734 ++ /* We are sending mp-fail's and thus are in fallback mode.
12735 ++ * Ignore packets which do not announce the fallback and still
12736 ++ * want to provide a mapping.
12737 ++ */
12738 ++ if (tp->mptcp->send_mp_fail) {
12739 ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
12740 ++ __skb_unlink(skb, &sk->sk_receive_queue);
12741 ++ __kfree_skb(skb);
12742 ++ return -1;
12743 ++ }
12744 ++
12745 ++ /* FIN increased the mapping-length by 1 */
12746 ++ if (mptcp_is_data_fin(skb))
12747 ++ data_len--;
12748 ++
12749 ++ /* Subflow-sequences of the packet must
12750 ++ * (at least partially) be part of the DSS-mapping's
12751 ++ * subflow-sequence-space.
12752 ++ *
12753 ++ * Basically the mapping is not valid, if either of the
12754 ++ * following conditions is true:
12755 ++ *
12756 ++ * 1. It's not a data_fin and
12757 ++ * MPTCP-sub_seq >= TCP-end_seq
12758 ++ *
12759 ++ * 2. It's a data_fin and TCP-end_seq > TCP-seq and
12760 ++ * MPTCP-sub_seq >= TCP-end_seq
12761 ++ *
12762 ++ * The previous two can be merged into:
12763 ++ * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq
12764 ++ * Because if it's not a data-fin, TCP-end_seq > TCP-seq
12765 ++ *
12766 ++ * 3. It's a data_fin and skb->len == 0 and
12767 ++ * MPTCP-sub_seq > TCP-end_seq
12768 ++ *
12769 ++ * 4. It's not a data_fin and TCP-end_seq > TCP-seq and
12770 ++ * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq
12771 ++ *
12772 ++ * 5. MPTCP-sub_seq is prior to what we already copied (copied_seq)
12773 ++ */
12774 ++
12775 ++ /* subflow-fin is not part of the mapping - ignore it here ! */
12776 ++ tcp_end_seq = tcb->end_seq - tcp_hdr(skb)->fin;
12777 ++ if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) ||
12778 ++ (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) ||
12779 ++ (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq)) ||
12780 ++ before(sub_seq, tp->copied_seq)) {
12781 ++ /* The packet's subflow-sequences differ from what is in the
12782 ++ * packet's dss-mapping. The peer is misbehaving - reset
12783 ++ */
12784 ++ pr_err("%s Packet's mapping does not map to the DSS sub_seq %u "
12785 ++ "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u"
12786 ++ "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb),
12787 ++ skb->len, data_len, tp->copied_seq);
12788 ++ mptcp_send_reset(sk);
12789 ++ return 1;
12790 ++ }
12791 ++
12792 ++ /* Did the DSS have 64-bit seqnums? */
12793 ++ if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
12794 ++ /* Wrapped around? */
12795 ++ if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
12796 ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);
12797 ++ } else {
12798 ++ /* Else, access the default high-order bits */
12799 ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq);
12800 ++ }
12801 ++ } else {
12802 ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq);
12803 ++
12804 ++ if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) {
12805 ++ /* We make sure that the data_seq is invalid.
12806 ++ * It will be dropped later.
12807 ++ */
12808 ++ tp->mptcp->map_data_seq += 0xFFFFFFFF;
12809 ++ tp->mptcp->map_data_seq += 0xFFFFFFFF;
12810 ++ }
12811 ++ }
12812 ++
12813 ++ tp->mptcp->map_data_len = data_len;
12814 ++ tp->mptcp->map_subseq = sub_seq;
12815 ++ tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
12816 ++ tp->mptcp->mapping_present = 1;
12817 ++
12818 ++ return 0;
12819 ++}
12820 ++
12821 ++/* Similar to tcp_sequence(...) */
12822 ++static inline bool mptcp_sequence(const struct tcp_sock *meta_tp,
12823 ++ u64 data_seq, u64 end_data_seq)
12824 ++{
12825 ++ const struct mptcp_cb *mpcb = meta_tp->mpcb;
12826 ++ u64 rcv_wup64;
12827 ++
12828 ++ /* Wrap-around? */
12829 ++ if (meta_tp->rcv_wup > meta_tp->rcv_nxt) {
12830 ++ rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) |
12831 ++ meta_tp->rcv_wup;
12832 ++ } else {
12833 ++ rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
12834 ++ meta_tp->rcv_wup);
12835 ++ }
12836 ++
12837 ++ return !before64(end_data_seq, rcv_wup64) &&
12838 ++ !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp));
12839 ++}
12840 ++
12841 ++/* @return: 0 everything is fine. Just continue processing
12842 ++ * -1 this packet was broken - continue with the next one.
12843 ++ */
12844 ++static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb)
12845 ++{
12846 ++ struct tcp_sock *tp = tcp_sk(sk);
12847 ++ struct sk_buff *tmp, *tmp1;
12848 ++ u32 tcp_end_seq;
12849 ++
12850 ++ if (!tp->mptcp->mapping_present)
12851 ++ return 0;
12852 ++
12853 ++ /* either, the new skb gave us the mapping and the first segment
12854 ++ * in the sub-rcv-queue has to be trimmed ...
12855 ++ */
12856 ++ tmp = skb_peek(&sk->sk_receive_queue);
12857 ++ if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) &&
12858 ++ after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq))
12859 ++ mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq);
12860 ++
12861 ++ /* ... or the new skb (tail) has to be split at the end. */
12862 ++ tcp_end_seq = TCP_SKB_CB(skb)->end_seq - (tcp_hdr(skb)->fin ? 1 : 0);
12863 ++ if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
12864 ++ u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len;
12865 ++ if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */
12866 ++ /* TODO : maybe handle this here better.
12867 ++ * We now just force meta-retransmission.
12868 ++ */
12869 ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
12870 ++ __skb_unlink(skb, &sk->sk_receive_queue);
12871 ++ __kfree_skb(skb);
12872 ++ return -1;
12873 ++ }
12874 ++ }
12875 ++
12876 ++ /* Now, remove old sk_buff's from the receive-queue.
12877 ++ * This may happen if the mapping has been lost for these segments and
12878 ++ * the next mapping has already been received.
12879 ++ */
12880 ++ if (before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) {
12881 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12882 ++ if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq))
12883 ++ break;
12884 ++
12885 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12886 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12887 ++
12888 ++ /* Impossible that we could free the skb here, because its
12889 ++ * mapping is known to be valid from previous checks
12890 ++ */
12891 ++ __kfree_skb(tmp1);
12892 ++ }
12893 ++ }
12894 ++
12895 ++ return 0;
12896 ++}
12897 ++
12898 ++/* @return: 0 everything is fine. Just continue processing
12899 ++ * 1 subflow is broken stop everything
12900 ++ * -1 this mapping has been put in the meta-receive-queue
12901 ++ * -2 this mapping has been eaten by the application
12902 ++ */
12903 ++static int mptcp_queue_skb(struct sock *sk)
12904 ++{
12905 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
12906 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
12907 ++ struct mptcp_cb *mpcb = tp->mpcb;
12908 ++ struct sk_buff *tmp, *tmp1;
12909 ++ u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp);
12910 ++ bool data_queued = false;
12911 ++
12912 ++ /* Have we not yet received the full mapping? */
12913 ++ if (!tp->mptcp->mapping_present ||
12914 ++ before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12915 ++ return 0;
12916 ++
12917 ++ /* Is this an overlapping mapping? rcv_nxt >= end_data_seq
12918 ++ * OR
12919 ++ * This mapping is out of window
12920 ++ */
12921 ++ if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) ||
12922 ++ !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq,
12923 ++ tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) {
12924 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12925 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12926 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12927 ++ __kfree_skb(tmp1);
12928 ++
12929 ++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
12930 ++ !before(TCP_SKB_CB(tmp)->seq,
12931 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12932 ++ break;
12933 ++ }
12934 ++
12935 ++ mptcp_reset_mapping(tp);
12936 ++
12937 ++ return -1;
12938 ++ }
12939 ++
12940 ++ /* Record it, because we want to send our data_fin on the same path */
12941 ++ if (tp->mptcp->map_data_fin) {
12942 ++ mpcb->dfin_path_index = tp->mptcp->path_index;
12943 ++ mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN);
12944 ++ }
12945 ++
12946 ++ /* Verify the checksum */
12947 ++ if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) {
12948 ++ int ret = mptcp_verif_dss_csum(sk);
12949 ++
12950 ++ if (ret <= 0) {
12951 ++ mptcp_reset_mapping(tp);
12952 ++ return 1;
12953 ++ }
12954 ++ }
12955 ++
12956 ++ if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) {
12957 ++		/* Segments have to go to the meta-ofo-queue */
12958 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12959 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12960 ++ mptcp_prepare_skb(tmp1, sk);
12961 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12962 ++ /* MUST be done here, because fragstolen may be true later.
12963 ++ * Then, kfree_skb_partial will not account the memory.
12964 ++ */
12965 ++ skb_orphan(tmp1);
12966 ++
12967 ++ if (!mpcb->in_time_wait) /* In time-wait, do not receive data */
12968 ++ mptcp_add_meta_ofo_queue(meta_sk, tmp1, sk);
12969 ++ else
12970 ++ __kfree_skb(tmp1);
12971 ++
12972 ++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
12973 ++ !before(TCP_SKB_CB(tmp)->seq,
12974 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12975 ++ break;
12976 ++ }
12977 ++ tcp_enter_quickack_mode(sk);
12978 ++ } else {
12979 ++ /* Ready for the meta-rcv-queue */
12980 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12981 ++ int eaten = 0;
12982 ++ const bool copied_early = false;
12983 ++ bool fragstolen = false;
12984 ++ u32 old_rcv_nxt = meta_tp->rcv_nxt;
12985 ++
12986 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12987 ++ mptcp_prepare_skb(tmp1, sk);
12988 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12989 ++ /* MUST be done here, because fragstolen may be true.
12990 ++ * Then, kfree_skb_partial will not account the memory.
12991 ++ */
12992 ++ skb_orphan(tmp1);
12993 ++
12994 ++ /* This segment has already been received */
12995 ++ if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) {
12996 ++ __kfree_skb(tmp1);
12997 ++ goto next;
12998 ++ }
12999 ++
13000 ++#ifdef CONFIG_NET_DMA
13001 ++ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
13002 ++ meta_tp->ucopy.task == current &&
13003 ++ meta_tp->copied_seq == meta_tp->rcv_nxt &&
13004 ++ tmp1->len <= meta_tp->ucopy.len &&
13005 ++ sock_owned_by_user(meta_sk) &&
13006 ++ tcp_dma_try_early_copy(meta_sk, tmp1, 0)) {
13007 ++ copied_early = true;
13008 ++ eaten = 1;
13009 ++ }
13010 ++#endif
13011 ++
13012 ++ /* Is direct copy possible ? */
13013 ++ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
13014 ++ meta_tp->ucopy.task == current &&
13015 ++ meta_tp->copied_seq == meta_tp->rcv_nxt &&
13016 ++ meta_tp->ucopy.len && sock_owned_by_user(meta_sk) &&
13017 ++ !copied_early)
13018 ++ eaten = mptcp_direct_copy(tmp1, meta_sk);
13019 ++
13020 ++ if (mpcb->in_time_wait) /* In time-wait, do not receive data */
13021 ++ eaten = 1;
13022 ++
13023 ++ if (!eaten)
13024 ++ eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen);
13025 ++
13026 ++ meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq;
13027 ++ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
13028 ++
13029 ++#ifdef CONFIG_NET_DMA
13030 ++ if (copied_early)
13031 ++ meta_tp->cleanup_rbuf(meta_sk, tmp1->len);
13032 ++#endif
13033 ++
13034 ++ if (tcp_hdr(tmp1)->fin && !mpcb->in_time_wait)
13035 ++ mptcp_fin(meta_sk);
13036 ++
13037 ++ /* Check if this fills a gap in the ofo queue */
13038 ++ if (!skb_queue_empty(&meta_tp->out_of_order_queue))
13039 ++ mptcp_ofo_queue(meta_sk);
13040 ++
13041 ++#ifdef CONFIG_NET_DMA
13042 ++ if (copied_early)
13043 ++ __skb_queue_tail(&meta_sk->sk_async_wait_queue,
13044 ++ tmp1);
13045 ++ else
13046 ++#endif
13047 ++ if (eaten)
13048 ++ kfree_skb_partial(tmp1, fragstolen);
13049 ++
13050 ++ data_queued = true;
13051 ++next:
13052 ++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
13053 ++ !before(TCP_SKB_CB(tmp)->seq,
13054 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
13055 ++ break;
13056 ++ }
13057 ++ }
13058 ++
13059 ++ inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp;
13060 ++ mptcp_reset_mapping(tp);
13061 ++
13062 ++ return data_queued ? -1 : -2;
13063 ++}
13064 ++
13065 ++void mptcp_data_ready(struct sock *sk)
13066 ++{
13067 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
13068 ++ struct sk_buff *skb, *tmp;
13069 ++ int queued = 0;
13070 ++
13071 ++ /* restart before the check, because mptcp_fin might have changed the
13072 ++ * state.
13073 ++ */
13074 ++restart:
13075 ++ /* If the meta cannot receive data, there is no point in pushing data.
13076 ++ * If we are in time-wait, we may still be waiting for the final FIN.
13077 ++ * So, we should proceed with the processing.
13078 ++ */
13079 ++ if (!mptcp_sk_can_recv(meta_sk) && !tcp_sk(sk)->mpcb->in_time_wait) {
13080 ++ skb_queue_purge(&sk->sk_receive_queue);
13081 ++ tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt;
13082 ++ goto exit;
13083 ++ }
13084 ++
13085 ++ /* Iterate over all segments, detect their mapping (if we don't have
13086 ++ * one yet), validate them and push everything one level higher.
13087 ++ */
13088 ++ skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
13089 ++ int ret;
13090 ++ /* Pre-validation - e.g., early fallback */
13091 ++ ret = mptcp_prevalidate_skb(sk, skb);
13092 ++ if (ret < 0)
13093 ++ goto restart;
13094 ++ else if (ret > 0)
13095 ++ break;
13096 ++
13097 ++ /* Set the current mapping */
13098 ++ ret = mptcp_detect_mapping(sk, skb);
13099 ++ if (ret < 0)
13100 ++ goto restart;
13101 ++ else if (ret > 0)
13102 ++ break;
13103 ++
13104 ++ /* Validation */
13105 ++ if (mptcp_validate_mapping(sk, skb) < 0)
13106 ++ goto restart;
13107 ++
13108 ++ /* Push a level higher */
13109 ++ ret = mptcp_queue_skb(sk);
13110 ++ if (ret < 0) {
13111 ++ if (ret == -1)
13112 ++ queued = ret;
13113 ++ goto restart;
13114 ++ } else if (ret == 0) {
13115 ++ continue;
13116 ++ } else { /* ret == 1 */
13117 ++ break;
13118 ++ }
13119 ++ }
13120 ++
13121 ++exit:
13122 ++ if (tcp_sk(sk)->close_it) {
13123 ++ tcp_send_ack(sk);
13124 ++ tcp_sk(sk)->ops->time_wait(sk, TCP_TIME_WAIT, 0);
13125 ++ }
13126 ++
13127 ++ if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD))
13128 ++ meta_sk->sk_data_ready(meta_sk);
13129 ++}
13130 ++
13131 ++
13132 ++int mptcp_check_req(struct sk_buff *skb, struct net *net)
13133 ++{
13134 ++ const struct tcphdr *th = tcp_hdr(skb);
13135 ++ struct sock *meta_sk = NULL;
13136 ++
13137 ++ /* MPTCP structures not initialized */
13138 ++ if (mptcp_init_failed)
13139 ++ return 0;
13140 ++
13141 ++ if (skb->protocol == htons(ETH_P_IP))
13142 ++ meta_sk = mptcp_v4_search_req(th->source, ip_hdr(skb)->saddr,
13143 ++ ip_hdr(skb)->daddr, net);
13144 ++#if IS_ENABLED(CONFIG_IPV6)
13145 ++ else /* IPv6 */
13146 ++ meta_sk = mptcp_v6_search_req(th->source, &ipv6_hdr(skb)->saddr,
13147 ++ &ipv6_hdr(skb)->daddr, net);
13148 ++#endif /* CONFIG_IPV6 */
13149 ++
13150 ++ if (!meta_sk)
13151 ++ return 0;
13152 ++
13153 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
13154 ++
13155 ++ bh_lock_sock_nested(meta_sk);
13156 ++ if (sock_owned_by_user(meta_sk)) {
13157 ++ skb->sk = meta_sk;
13158 ++ if (unlikely(sk_add_backlog(meta_sk, skb,
13159 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
13160 ++ bh_unlock_sock(meta_sk);
13161 ++ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
13162 ++ sock_put(meta_sk); /* Taken by mptcp_search_req */
13163 ++ kfree_skb(skb);
13164 ++ return 1;
13165 ++ }
13166 ++ } else if (skb->protocol == htons(ETH_P_IP)) {
13167 ++ tcp_v4_do_rcv(meta_sk, skb);
13168 ++#if IS_ENABLED(CONFIG_IPV6)
13169 ++ } else { /* IPv6 */
13170 ++ tcp_v6_do_rcv(meta_sk, skb);
13171 ++#endif /* CONFIG_IPV6 */
13172 ++ }
13173 ++ bh_unlock_sock(meta_sk);
13174 ++ sock_put(meta_sk); /* Taken by mptcp_vX_search_req */
13175 ++ return 1;
13176 ++}
13177 ++
13178 ++struct mp_join *mptcp_find_join(const struct sk_buff *skb)
13179 ++{
13180 ++ const struct tcphdr *th = tcp_hdr(skb);
13181 ++ unsigned char *ptr;
13182 ++ int length = (th->doff * 4) - sizeof(struct tcphdr);
13183 ++
13184 ++ /* Jump through the options to check whether JOIN is there */
13185 ++ ptr = (unsigned char *)(th + 1);
13186 ++ while (length > 0) {
13187 ++ int opcode = *ptr++;
13188 ++ int opsize;
13189 ++
13190 ++ switch (opcode) {
13191 ++ case TCPOPT_EOL:
13192 ++ return NULL;
13193 ++ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
13194 ++ length--;
13195 ++ continue;
13196 ++ default:
13197 ++ opsize = *ptr++;
13198 ++ if (opsize < 2) /* "silly options" */
13199 ++ return NULL;
13200 ++ if (opsize > length)
13201 ++ return NULL; /* don't parse partial options */
13202 ++ if (opcode == TCPOPT_MPTCP &&
13203 ++ ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) {
13204 ++ return (struct mp_join *)(ptr - 2);
13205 ++ }
13206 ++ ptr += opsize - 2;
13207 ++ length -= opsize;
13208 ++ }
13209 ++ }
13210 ++ return NULL;
13211 ++}
13212 ++
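++/* A simplified standalone sketch of the option walk done by mptcp_find_join()
++ * above: TCP options are kind/length encoded, with EOL and NOP as the only
++ * single-byte kinds, and MPTCP lives under option kind 30 with MP_JOIN as
++ * subtype 1 (RFC 6824). The helper below uses assumed names and is an
++ * illustration, not part of the patch.
++ */
++#if 0	/* illustration only - compile separately as plain C */
++#include <stddef.h>
++#include <stdint.h>
++
++static const uint8_t *find_tcp_option(const uint8_t *opt, int len, uint8_t kind)
++{
++	while (len > 0) {
++		uint8_t opcode = opt[0];
++
++		if (opcode == 0)		/* TCPOPT_EOL */
++			return NULL;
++		if (opcode == 1) {		/* TCPOPT_NOP */
++			opt++;
++			len--;
++			continue;
++		}
++		if (len < 2 || opt[1] < 2 || opt[1] > len)
++			return NULL;		/* malformed option list */
++		if (opcode == kind)
++			return opt;		/* points at kind, length, payload */
++		len -= opt[1];
++		opt += opt[1];
++	}
++	return NULL;
++}
++#endif
++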
13213 ++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw)
13214 ++{
13215 ++ const struct mptcp_cb *mpcb;
13216 ++ struct sock *meta_sk;
13217 ++ u32 token;
13218 ++ bool meta_v4;
13219 ++ struct mp_join *join_opt = mptcp_find_join(skb);
13220 ++ if (!join_opt)
13221 ++ return 0;
13222 ++
13223 ++ /* MPTCP structures were not initialized, so return error */
13224 ++ if (mptcp_init_failed)
13225 ++ return -1;
13226 ++
13227 ++ token = join_opt->u.syn.token;
13228 ++ meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token);
13229 ++ if (!meta_sk) {
13230 ++ mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
13231 ++ return -1;
13232 ++ }
13233 ++
13234 ++ meta_v4 = meta_sk->sk_family == AF_INET;
13235 ++ if (meta_v4) {
13236 ++ if (skb->protocol == htons(ETH_P_IPV6)) {
13237 ++ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
13238 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13239 ++ return -1;
13240 ++ }
13241 ++ } else if (skb->protocol == htons(ETH_P_IP) &&
13242 ++ inet6_sk(meta_sk)->ipv6only) {
13243 ++ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
13244 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13245 ++ return -1;
13246 ++ }
13247 ++
13248 ++ mpcb = tcp_sk(meta_sk)->mpcb;
13249 ++ if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) {
13250 ++ /* We are in fallback-mode on the reception-side -
13251 ++ * no new subflows!
13252 ++ */
13253 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13254 ++ return -1;
13255 ++ }
13256 ++
13257 ++ /* Coming from time-wait-sock processing in tcp_v4_rcv.
13258 ++ * We have to deschedule it before continuing, because otherwise
13259 ++ * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req.
13260 ++ */
13261 ++ if (tw) {
13262 ++ inet_twsk_deschedule(tw, &tcp_death_row);
13263 ++ inet_twsk_put(tw);
13264 ++ }
13265 ++
13266 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
13267 ++ /* OK, this is a new syn/join, let's create a new open request and
13268 ++ * send syn+ack
13269 ++ */
13270 ++ bh_lock_sock_nested(meta_sk);
13271 ++ if (sock_owned_by_user(meta_sk)) {
13272 ++ skb->sk = meta_sk;
13273 ++ if (unlikely(sk_add_backlog(meta_sk, skb,
13274 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
13275 ++ bh_unlock_sock(meta_sk);
13276 ++ NET_INC_STATS_BH(sock_net(meta_sk),
13277 ++ LINUX_MIB_TCPBACKLOGDROP);
13278 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13279 ++ kfree_skb(skb);
13280 ++ return 1;
13281 ++ }
13282 ++ } else if (skb->protocol == htons(ETH_P_IP)) {
13283 ++ tcp_v4_do_rcv(meta_sk, skb);
13284 ++#if IS_ENABLED(CONFIG_IPV6)
13285 ++ } else {
13286 ++ tcp_v6_do_rcv(meta_sk, skb);
13287 ++#endif /* CONFIG_IPV6 */
13288 ++ }
13289 ++ bh_unlock_sock(meta_sk);
13290 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13291 ++ return 1;
13292 ++}
13293 ++
13294 ++int mptcp_do_join_short(struct sk_buff *skb,
13295 ++ const struct mptcp_options_received *mopt,
13296 ++ struct net *net)
13297 ++{
13298 ++ struct sock *meta_sk;
13299 ++ u32 token;
13300 ++ bool meta_v4;
13301 ++
13302 ++ token = mopt->mptcp_rem_token;
13303 ++ meta_sk = mptcp_hash_find(net, token);
13304 ++ if (!meta_sk) {
13305 ++ mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
13306 ++ return -1;
13307 ++ }
13308 ++
13309 ++ meta_v4 = meta_sk->sk_family == AF_INET;
13310 ++ if (meta_v4) {
13311 ++ if (skb->protocol == htons(ETH_P_IPV6)) {
13312 ++ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
13313 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13314 ++ return -1;
13315 ++ }
13316 ++ } else if (skb->protocol == htons(ETH_P_IP) &&
13317 ++ inet6_sk(meta_sk)->ipv6only) {
13318 ++ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
13319 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13320 ++ return -1;
13321 ++ }
13322 ++
13323 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
13324 ++
13325 ++ /* OK, this is a new syn/join, let's create a new open request and
13326 ++ * send syn+ack
13327 ++ */
13328 ++ bh_lock_sock(meta_sk);
13329 ++
13330 ++ /* This check is also done in mptcp_vX_do_rcv. But, there we cannot
13331 ++	 * call tcp_vX_send_reset, because we already hold two socket-locks.
13332 ++ * (the listener and the meta from above)
13333 ++ *
13334 ++ * And the send-reset will try to take yet another one (ip_send_reply).
13335 ++ * Thus, we propagate the reset up to tcp_rcv_state_process.
13336 ++ */
13337 ++ if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv ||
13338 ++ tcp_sk(meta_sk)->mpcb->send_infinite_mapping ||
13339 ++ meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) {
13340 ++ bh_unlock_sock(meta_sk);
13341 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13342 ++ return -1;
13343 ++ }
13344 ++
13345 ++ if (sock_owned_by_user(meta_sk)) {
13346 ++ skb->sk = meta_sk;
13347 ++ if (unlikely(sk_add_backlog(meta_sk, skb,
13348 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf)))
13349 ++ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
13350 ++ else
13351 ++ /* Must make sure that upper layers won't free the
13352 ++ * skb if it is added to the backlog-queue.
13353 ++ */
13354 ++ skb_get(skb);
13355 ++ } else {
13356 ++ /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as
13357 ++ * the skb will finally be freed by tcp_v4_do_rcv (where we are
13358 ++ * coming from)
13359 ++ */
13360 ++ skb_get(skb);
13361 ++ if (skb->protocol == htons(ETH_P_IP)) {
13362 ++ tcp_v4_do_rcv(meta_sk, skb);
13363 ++#if IS_ENABLED(CONFIG_IPV6)
13364 ++ } else { /* IPv6 */
13365 ++ tcp_v6_do_rcv(meta_sk, skb);
13366 ++#endif /* CONFIG_IPV6 */
13367 ++ }
13368 ++ }
13369 ++
13370 ++ bh_unlock_sock(meta_sk);
13371 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13372 ++ return 0;
13373 ++}
13374 ++
13375 ++/**
13376 ++ * Equivalent of tcp_fin() for MPTCP
13377 ++ * Can be called only when the FIN is validly part
13378 ++ * of the data seqnum space, not earlier while holes remain.
13379 ++ */
13380 ++void mptcp_fin(struct sock *meta_sk)
13381 ++{
13382 ++ struct sock *sk = NULL, *sk_it;
13383 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
13384 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
13385 ++
13386 ++ mptcp_for_each_sk(mpcb, sk_it) {
13387 ++ if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) {
13388 ++ sk = sk_it;
13389 ++ break;
13390 ++ }
13391 ++ }
13392 ++
13393 ++ if (!sk || sk->sk_state == TCP_CLOSE)
13394 ++ sk = mptcp_select_ack_sock(meta_sk);
13395 ++
13396 ++ inet_csk_schedule_ack(sk);
13397 ++
13398 ++ meta_sk->sk_shutdown |= RCV_SHUTDOWN;
13399 ++ sock_set_flag(meta_sk, SOCK_DONE);
13400 ++
13401 ++ switch (meta_sk->sk_state) {
13402 ++ case TCP_SYN_RECV:
13403 ++ case TCP_ESTABLISHED:
13404 ++ /* Move to CLOSE_WAIT */
13405 ++ tcp_set_state(meta_sk, TCP_CLOSE_WAIT);
13406 ++ inet_csk(sk)->icsk_ack.pingpong = 1;
13407 ++ break;
13408 ++
13409 ++ case TCP_CLOSE_WAIT:
13410 ++ case TCP_CLOSING:
13411 ++ /* Received a retransmission of the FIN, do
13412 ++ * nothing.
13413 ++ */
13414 ++ break;
13415 ++ case TCP_LAST_ACK:
13416 ++ /* RFC793: Remain in the LAST-ACK state. */
13417 ++ break;
13418 ++
13419 ++ case TCP_FIN_WAIT1:
13420 ++ /* This case occurs when a simultaneous close
13421 ++ * happens, we must ack the received FIN and
13422 ++ * enter the CLOSING state.
13423 ++ */
13424 ++ tcp_send_ack(sk);
13425 ++ tcp_set_state(meta_sk, TCP_CLOSING);
13426 ++ break;
13427 ++ case TCP_FIN_WAIT2:
13428 ++ /* Received a FIN -- send ACK and enter TIME_WAIT. */
13429 ++ tcp_send_ack(sk);
13430 ++ meta_tp->ops->time_wait(meta_sk, TCP_TIME_WAIT, 0);
13431 ++ break;
13432 ++ default:
13433 ++ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
13434 ++ * cases we should never reach this piece of code.
13435 ++ */
13436 ++ pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__,
13437 ++ meta_sk->sk_state);
13438 ++ break;
13439 ++ }
13440 ++
13441 ++	/* It _is_ possible that we have something out-of-order _after_ the FIN.
13442 ++ * Probably, we should reset in this case. For now drop them.
13443 ++ */
13444 ++ mptcp_purge_ofo_queue(meta_tp);
13445 ++ sk_mem_reclaim(meta_sk);
13446 ++
13447 ++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
13448 ++ meta_sk->sk_state_change(meta_sk);
13449 ++
13450 ++ /* Do not send POLL_HUP for half duplex close. */
13451 ++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK ||
13452 ++ meta_sk->sk_state == TCP_CLOSE)
13453 ++ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP);
13454 ++ else
13455 ++ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN);
13456 ++ }
13457 ++
13458 ++ return;
13459 ++}
13460 ++
13461 ++static void mptcp_xmit_retransmit_queue(struct sock *meta_sk)
13462 ++{
13463 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
13464 ++ struct sk_buff *skb;
13465 ++
13466 ++ if (!meta_tp->packets_out)
13467 ++ return;
13468 ++
13469 ++ tcp_for_write_queue(skb, meta_sk) {
13470 ++ if (skb == tcp_send_head(meta_sk))
13471 ++ break;
13472 ++
13473 ++ if (mptcp_retransmit_skb(meta_sk, skb))
13474 ++ return;
13475 ++
13476 ++ if (skb == tcp_write_queue_head(meta_sk))
13477 ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
13478 ++ inet_csk(meta_sk)->icsk_rto,
13479 ++ TCP_RTO_MAX);
13480 ++ }
13481 ++}
13482 ++
13483 ++/* Handle the DATA_ACK */
13484 ++static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb)
13485 ++{
13486 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
13487 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
13488 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
13489 ++ u32 prior_snd_una = meta_tp->snd_una;
13490 ++ int prior_packets;
13491 ++ u32 nwin, data_ack, data_seq;
13492 ++ u16 data_len = 0;
13493 ++
13494 ++ /* A valid packet came in - subflow is operational again */
13495 ++ tp->pf = 0;
13496 ++
13497 ++ /* Even if there is no data-ack, we stop retransmitting.
13498 ++ * Except if this is a SYN/ACK. Then it is just a retransmission
13499 ++ */
13500 ++ if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) {
13501 ++ tp->mptcp->pre_established = 0;
13502 ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
13503 ++ }
13504 ++
13505 ++ /* If we are in infinite mapping mode, rx_opt.data_ack has been
13506 ++ * set by mptcp_clean_rtx_infinite.
13507 ++ */
13508 ++ if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd)
13509 ++ goto exit;
13510 ++
13511 ++ data_ack = tp->mptcp->rx_opt.data_ack;
13512 ++
13513 ++ if (unlikely(!tp->mptcp->fully_established) &&
13514 ++ tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)
13515 ++		/* As soon as an ack on the subflow (not one merely acking the SYN, i.e. snt_isn + 1)
13516 ++ * includes a data-ack, we are fully established
13517 ++ */
13518 ++ mptcp_become_fully_estab(sk);
13519 ++
13520 ++ /* Get the data_seq */
13521 ++ if (mptcp_is_data_seq(skb)) {
13522 ++ data_seq = tp->mptcp->rx_opt.data_seq;
13523 ++ data_len = tp->mptcp->rx_opt.data_len;
13524 ++ } else {
13525 ++ data_seq = meta_tp->snd_wl1;
13526 ++ }
13527 ++
13528 ++ /* If the ack is older than previous acks
13529 ++ * then we can probably ignore it.
13530 ++ */
13531 ++ if (before(data_ack, prior_snd_una))
13532 ++ goto exit;
13533 ++
13534 ++ /* If the ack includes data we haven't sent yet, discard
13535 ++ * this segment (RFC793 Section 3.9).
13536 ++ */
13537 ++ if (after(data_ack, meta_tp->snd_nxt))
13538 ++ goto exit;
13539 ++
13540 ++ /*** Now, update the window - inspired by tcp_ack_update_window ***/
13541 ++ nwin = ntohs(tcp_hdr(skb)->window);
13542 ++
13543 ++ if (likely(!tcp_hdr(skb)->syn))
13544 ++ nwin <<= tp->rx_opt.snd_wscale;
13545 ++
13546 ++ if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) {
13547 ++ tcp_update_wl(meta_tp, data_seq);
13548 ++
13549 ++ /* Draft v09, Section 3.3.5:
13550 ++ * [...] It should only update its local receive window values
13551 ++ * when the largest sequence number allowed (i.e. DATA_ACK +
13552 ++ * receive window) increases. [...]
13553 ++ */
13554 ++ if (meta_tp->snd_wnd != nwin &&
13555 ++ !before(data_ack + nwin, tcp_wnd_end(meta_tp))) {
13556 ++ meta_tp->snd_wnd = nwin;
13557 ++
13558 ++ if (nwin > meta_tp->max_window)
13559 ++ meta_tp->max_window = nwin;
13560 ++ }
13561 ++ }
13562 ++ /*** Done, update the window ***/
13563 ++
13564 ++ /* We passed data and got it acked, remove any soft error
13565 ++ * log. Something worked...
13566 ++ */
13567 ++ sk->sk_err_soft = 0;
13568 ++ inet_csk(meta_sk)->icsk_probes_out = 0;
13569 ++ meta_tp->rcv_tstamp = tcp_time_stamp;
13570 ++ prior_packets = meta_tp->packets_out;
13571 ++ if (!prior_packets)
13572 ++ goto no_queue;
13573 ++
13574 ++ meta_tp->snd_una = data_ack;
13575 ++
13576 ++ mptcp_clean_rtx_queue(meta_sk, prior_snd_una);
13577 ++
13578 ++ /* We are in loss-state, and something got acked, retransmit the whole
13579 ++ * queue now!
13580 ++ */
13581 ++ if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss &&
13582 ++ after(data_ack, prior_snd_una)) {
13583 ++ mptcp_xmit_retransmit_queue(meta_sk);
13584 ++ inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open;
13585 ++ }
13586 ++
13587 ++ /* Simplified version of tcp_new_space, because the snd-buffer
13588 ++ * is handled by all the subflows.
13589 ++ */
13590 ++ if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) {
13591 ++ sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK);
13592 ++ if (meta_sk->sk_socket &&
13593 ++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
13594 ++ meta_sk->sk_write_space(meta_sk);
13595 ++ }
13596 ++
13597 ++ if (meta_sk->sk_state != TCP_ESTABLISHED &&
13598 ++ mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len))
13599 ++ return;
13600 ++
13601 ++exit:
13602 ++ mptcp_push_pending_frames(meta_sk);
13603 ++
13604 ++ return;
13605 ++
13606 ++no_queue:
13607 ++ if (tcp_send_head(meta_sk))
13608 ++ tcp_ack_probe(meta_sk);
13609 ++
13610 ++ mptcp_push_pending_frames(meta_sk);
13611 ++
13612 ++ return;
13613 ++}
13614 ++
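++/* A worked example (made-up numbers) of the window rule applied in
++ * mptcp_data_ack() above: the shared receive window is only adopted when the
++ * resulting right edge, DATA_ACK + window, does not fall behind the current
++ * right edge snd_una + snd_wnd. Assuming snd_una = 1000 and snd_wnd = 2000
++ * (right edge 3000):
++ *   DATA_ACK 1500, advertised window 1600 -> right edge 3100 >= 3000: adopt
++ *   DATA_ACK 1200, advertised window 1500 -> right edge 2700 <  3000: keep
++ */
++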
13615 ++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk)
13616 ++{
13617 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk));
13618 ++
13619 ++ if (!tp->mpcb->infinite_mapping_snd)
13620 ++ return;
13621 ++
13622 ++	/* The difference between the two write_seq's represents the offset
13623 ++	 * between the data-sequence and the subflow-sequence space. In
13624 ++	 * infinite-mapping mode the two advance in lock-step, so this offset is fixed.
13625 ++ *
13626 ++ * Thus, from this difference we can infer the meta snd_una.
13627 ++ */
13628 ++ tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt +
13629 ++ tp->snd_una;
13630 ++
13631 ++ mptcp_data_ack(sk, skb);
13632 ++}
13633 ++
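++/* A worked example (made-up numbers) for the inference in
++ * mptcp_clean_rtx_infinite() above. In infinite-mapping mode the data-sequence
++ * and subflow-sequence spaces advance in lock-step, so the subflow ack can be
++ * translated into a meta-level DATA_ACK:
++ *
++ *   meta_tp->snd_nxt = 10000, tp->snd_nxt = 4000, tp->snd_una = 3500
++ *   offset   = 10000 - 4000 = 6000
++ *   data_ack =  3500 + 6000 = 9500   (the implied meta-level snd_una)
++ */
++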
13634 ++/**** static functions used by mptcp_parse_options */
13635 ++
13636 ++static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id)
13637 ++{
13638 ++ struct sock *sk_it, *tmpsk;
13639 ++
13640 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
13641 ++ if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) {
13642 ++ mptcp_reinject_data(sk_it, 0);
13643 ++ sk_it->sk_err = ECONNRESET;
13644 ++ if (tcp_need_reset(sk_it->sk_state))
13645 ++ tcp_sk(sk_it)->ops->send_active_reset(sk_it,
13646 ++ GFP_ATOMIC);
13647 ++ mptcp_sub_force_close(sk_it);
13648 ++ }
13649 ++ }
13650 ++}
13651 ++
13652 ++void mptcp_parse_options(const uint8_t *ptr, int opsize,
13653 ++ struct mptcp_options_received *mopt,
13654 ++ const struct sk_buff *skb)
13655 ++{
13656 ++ const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr;
13657 ++
13658 ++ /* If the socket is mp-capable we would have a mopt. */
13659 ++ if (!mopt)
13660 ++ return;
13661 ++
13662 ++ switch (mp_opt->sub) {
13663 ++ case MPTCP_SUB_CAPABLE:
13664 ++ {
13665 ++ const struct mp_capable *mpcapable = (struct mp_capable *)ptr;
13666 ++
13667 ++ if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN &&
13668 ++ opsize != MPTCP_SUB_LEN_CAPABLE_ACK) {
13669 ++ mptcp_debug("%s: mp_capable: bad option size %d\n",
13670 ++ __func__, opsize);
13671 ++ break;
13672 ++ }
13673 ++
13674 ++ if (!sysctl_mptcp_enabled)
13675 ++ break;
13676 ++
13677 ++ /* We only support MPTCP version 0 */
13678 ++ if (mpcapable->ver != 0)
13679 ++ break;
13680 ++
13681 ++ /* MPTCP-RFC 6824:
13682 ++ * "If receiving a message with the 'B' flag set to 1, and this
13683 ++ * is not understood, then this SYN MUST be silently ignored;
13684 ++ */
13685 ++ if (mpcapable->b) {
13686 ++ mopt->drop_me = 1;
13687 ++ break;
13688 ++ }
13689 ++
13690 ++ /* MPTCP-RFC 6824:
13691 ++ * "An implementation that only supports this method MUST set
13692 ++ * bit "H" to 1, and bits "C" through "G" to 0."
13693 ++ */
13694 ++ if (!mpcapable->h)
13695 ++ break;
13696 ++
13697 ++ mopt->saw_mpc = 1;
13698 ++ mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a;
13699 ++
13700 ++ if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN)
13701 ++ mopt->mptcp_key = mpcapable->sender_key;
13702 ++
13703 ++ break;
13704 ++ }
13705 ++ case MPTCP_SUB_JOIN:
13706 ++ {
13707 ++ const struct mp_join *mpjoin = (struct mp_join *)ptr;
13708 ++
13709 ++ if (opsize != MPTCP_SUB_LEN_JOIN_SYN &&
13710 ++ opsize != MPTCP_SUB_LEN_JOIN_SYNACK &&
13711 ++ opsize != MPTCP_SUB_LEN_JOIN_ACK) {
13712 ++ mptcp_debug("%s: mp_join: bad option size %d\n",
13713 ++ __func__, opsize);
13714 ++ break;
13715 ++ }
13716 ++
13717 ++ /* saw_mpc must be set, because in tcp_check_req we assume that
13718 ++ * it is set to support falling back to reg. TCP if a rexmitted
13719 ++ * SYN has no MP_CAPABLE or MP_JOIN
13720 ++ */
13721 ++ switch (opsize) {
13722 ++ case MPTCP_SUB_LEN_JOIN_SYN:
13723 ++ mopt->is_mp_join = 1;
13724 ++ mopt->saw_mpc = 1;
13725 ++ mopt->low_prio = mpjoin->b;
13726 ++ mopt->rem_id = mpjoin->addr_id;
13727 ++ mopt->mptcp_rem_token = mpjoin->u.syn.token;
13728 ++ mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce;
13729 ++ break;
13730 ++ case MPTCP_SUB_LEN_JOIN_SYNACK:
13731 ++ mopt->saw_mpc = 1;
13732 ++ mopt->low_prio = mpjoin->b;
13733 ++ mopt->rem_id = mpjoin->addr_id;
13734 ++ mopt->mptcp_recv_tmac = mpjoin->u.synack.mac;
13735 ++ mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce;
13736 ++ break;
13737 ++ case MPTCP_SUB_LEN_JOIN_ACK:
13738 ++ mopt->saw_mpc = 1;
13739 ++ mopt->join_ack = 1;
13740 ++ memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20);
13741 ++ break;
13742 ++ }
13743 ++ break;
13744 ++ }
13745 ++ case MPTCP_SUB_DSS:
13746 ++ {
13747 ++ const struct mp_dss *mdss = (struct mp_dss *)ptr;
13748 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
13749 ++
13750 ++		/* We check opsize for both the csum and the non-csum case,
13751 ++ * because the draft says that the csum SHOULD be ignored if
13752 ++ * it has not been negotiated in the MP_CAPABLE but still is
13753 ++ * present in the data.
13754 ++ *
13755 ++ * It will get ignored later in mptcp_queue_skb.
13756 ++ */
13757 ++ if (opsize != mptcp_sub_len_dss(mdss, 0) &&
13758 ++ opsize != mptcp_sub_len_dss(mdss, 1)) {
13759 ++ mptcp_debug("%s: mp_dss: bad option size %d\n",
13760 ++ __func__, opsize);
13761 ++ break;
13762 ++ }
13763 ++
13764 ++ ptr += 4;
13765 ++
13766 ++ if (mdss->A) {
13767 ++ tcb->mptcp_flags |= MPTCPHDR_ACK;
13768 ++
13769 ++ if (mdss->a) {
13770 ++ mopt->data_ack = (u32) get_unaligned_be64(ptr);
13771 ++ ptr += MPTCP_SUB_LEN_ACK_64;
13772 ++ } else {
13773 ++ mopt->data_ack = get_unaligned_be32(ptr);
13774 ++ ptr += MPTCP_SUB_LEN_ACK;
13775 ++ }
13776 ++ }
13777 ++
13778 ++ tcb->dss_off = (ptr - skb_transport_header(skb));
13779 ++
13780 ++ if (mdss->M) {
13781 ++ if (mdss->m) {
13782 ++ u64 data_seq64 = get_unaligned_be64(ptr);
13783 ++
13784 ++ tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET;
13785 ++ mopt->data_seq = (u32) data_seq64;
13786 ++
13787 ++ ptr += 12; /* 64-bit dseq + subseq */
13788 ++ } else {
13789 ++ mopt->data_seq = get_unaligned_be32(ptr);
13790 ++ ptr += 8; /* 32-bit dseq + subseq */
13791 ++ }
13792 ++ mopt->data_len = get_unaligned_be16(ptr);
13793 ++
13794 ++ tcb->mptcp_flags |= MPTCPHDR_SEQ;
13795 ++
13796 ++ /* Is a check-sum present? */
13797 ++ if (opsize == mptcp_sub_len_dss(mdss, 1))
13798 ++ tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM;
13799 ++
13800 ++ /* DATA_FIN only possible with DSS-mapping */
13801 ++ if (mdss->F)
13802 ++ tcb->mptcp_flags |= MPTCPHDR_FIN;
13803 ++ }
13804 ++
13805 ++ break;
13806 ++ }
13807 ++ case MPTCP_SUB_ADD_ADDR:
13808 ++ {
13809 ++#if IS_ENABLED(CONFIG_IPV6)
13810 ++ const struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
13811 ++
13812 ++ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
13813 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
13814 ++ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
13815 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) {
13816 ++#else
13817 ++ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
13818 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) {
13819 ++#endif /* CONFIG_IPV6 */
13820 ++ mptcp_debug("%s: mp_add_addr: bad option size %d\n",
13821 ++ __func__, opsize);
13822 ++ break;
13823 ++ }
13824 ++
13825 ++ /* We have to manually parse the options if we got two of them. */
13826 ++ if (mopt->saw_add_addr) {
13827 ++ mopt->more_add_addr = 1;
13828 ++ break;
13829 ++ }
13830 ++ mopt->saw_add_addr = 1;
13831 ++ mopt->add_addr_ptr = ptr;
13832 ++ break;
13833 ++ }
13834 ++ case MPTCP_SUB_REMOVE_ADDR:
13835 ++ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
13836 ++ mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
13837 ++ __func__, opsize);
13838 ++ break;
13839 ++ }
13840 ++
13841 ++ if (mopt->saw_rem_addr) {
13842 ++ mopt->more_rem_addr = 1;
13843 ++ break;
13844 ++ }
13845 ++ mopt->saw_rem_addr = 1;
13846 ++ mopt->rem_addr_ptr = ptr;
13847 ++ break;
13848 ++ case MPTCP_SUB_PRIO:
13849 ++ {
13850 ++ const struct mp_prio *mpprio = (struct mp_prio *)ptr;
13851 ++
13852 ++ if (opsize != MPTCP_SUB_LEN_PRIO &&
13853 ++ opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
13854 ++ mptcp_debug("%s: mp_prio: bad option size %d\n",
13855 ++ __func__, opsize);
13856 ++ break;
13857 ++ }
13858 ++
13859 ++ mopt->saw_low_prio = 1;
13860 ++ mopt->low_prio = mpprio->b;
13861 ++
13862 ++ if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
13863 ++ mopt->saw_low_prio = 2;
13864 ++ mopt->prio_addr_id = mpprio->addr_id;
13865 ++ }
13866 ++ break;
13867 ++ }
13868 ++ case MPTCP_SUB_FAIL:
13869 ++ if (opsize != MPTCP_SUB_LEN_FAIL) {
13870 ++ mptcp_debug("%s: mp_fail: bad option size %d\n",
13871 ++ __func__, opsize);
13872 ++ break;
13873 ++ }
13874 ++ mopt->mp_fail = 1;
13875 ++ break;
13876 ++ case MPTCP_SUB_FCLOSE:
13877 ++ if (opsize != MPTCP_SUB_LEN_FCLOSE) {
13878 ++ mptcp_debug("%s: mp_fclose: bad option size %d\n",
13879 ++ __func__, opsize);
13880 ++ break;
13881 ++ }
13882 ++
13883 ++ mopt->mp_fclose = 1;
13884 ++ mopt->mptcp_key = ((struct mp_fclose *)ptr)->key;
13885 ++
13886 ++ break;
13887 ++ default:
13888 ++		mptcp_debug("%s: Received unknown subtype: %d\n",
13889 ++ __func__, mp_opt->sub);
13890 ++ break;
13891 ++ }
13892 ++}
13893 ++
13894 ++/** Parse only MPTCP options */
13895 ++void tcp_parse_mptcp_options(const struct sk_buff *skb,
13896 ++ struct mptcp_options_received *mopt)
13897 ++{
13898 ++ const struct tcphdr *th = tcp_hdr(skb);
13899 ++ int length = (th->doff * 4) - sizeof(struct tcphdr);
13900 ++ const unsigned char *ptr = (const unsigned char *)(th + 1);
13901 ++
13902 ++ while (length > 0) {
13903 ++ int opcode = *ptr++;
13904 ++ int opsize;
13905 ++
13906 ++ switch (opcode) {
13907 ++ case TCPOPT_EOL:
13908 ++ return;
13909 ++ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
13910 ++ length--;
13911 ++ continue;
13912 ++ default:
13913 ++ opsize = *ptr++;
13914 ++ if (opsize < 2) /* "silly options" */
13915 ++ return;
13916 ++ if (opsize > length)
13917 ++ return; /* don't parse partial options */
13918 ++ if (opcode == TCPOPT_MPTCP)
13919 ++ mptcp_parse_options(ptr - 2, opsize, mopt, skb);
13920 ++ }
13921 ++ ptr += opsize - 2;
13922 ++ length -= opsize;
13923 ++ }
13924 ++}
13925 ++
13926 ++int mptcp_check_rtt(const struct tcp_sock *tp, int time)
13927 ++{
13928 ++ struct mptcp_cb *mpcb = tp->mpcb;
13929 ++ struct sock *sk;
13930 ++ u32 rtt_max = 0;
13931 ++
13932 ++ /* In MPTCP, we take the max delay across all flows,
13933 ++ * in order to take into account meta-reordering buffers.
13934 ++ */
13935 ++ mptcp_for_each_sk(mpcb, sk) {
13936 ++ if (!mptcp_sk_can_recv(sk))
13937 ++ continue;
13938 ++
13939 ++ if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt)
13940 ++ rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt;
13941 ++ }
13942 ++ if (time < (rtt_max >> 3) || !rtt_max)
13943 ++ return 1;
13944 ++
13945 ++ return 0;
13946 ++}
13947 ++
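++/* Note on the comparison in mptcp_check_rtt() above: as in mainline
++ * tcp_rcv_space_adjust(), rcv_rtt_est.rtt is kept scaled by 8, so
++ * "rtt_max >> 3" is the un-scaled estimate. Taking the maximum across
++ * subflows makes meta-level receive-buffer autotuning wait at least one RTT
++ * of the slowest subflow - roughly how long data may sit in the meta
++ * reordering queue. Made-up example: stored estimates 80 and 240 on two
++ * subflows give rtt_max = 240, and any measurement interval shorter than
++ * 240 >> 3 = 30 time units makes the function return 1 ("too early").
++ */
++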
13948 ++static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk)
13949 ++{
13950 ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
13951 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
13952 ++ __be16 port = 0;
13953 ++ union inet_addr addr;
13954 ++ sa_family_t family;
13955 ++
13956 ++ if (mpadd->ipver == 4) {
13957 ++ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2)
13958 ++ port = mpadd->u.v4.port;
13959 ++ family = AF_INET;
13960 ++ addr.in = mpadd->u.v4.addr;
13961 ++#if IS_ENABLED(CONFIG_IPV6)
13962 ++ } else if (mpadd->ipver == 6) {
13963 ++ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2)
13964 ++ port = mpadd->u.v6.port;
13965 ++ family = AF_INET6;
13966 ++ addr.in6 = mpadd->u.v6.addr;
13967 ++#endif /* CONFIG_IPV6 */
13968 ++ } else {
13969 ++ return;
13970 ++ }
13971 ++
13972 ++ if (mpcb->pm_ops->add_raddr)
13973 ++ mpcb->pm_ops->add_raddr(mpcb, &addr, family, port, mpadd->addr_id);
13974 ++}
13975 ++
13976 ++static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk)
13977 ++{
13978 ++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
13979 ++ int i;
13980 ++ u8 rem_id;
13981 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
13982 ++
13983 ++ for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) {
13984 ++ rem_id = (&mprem->addrs_id)[i];
13985 ++
13986 ++ if (mpcb->pm_ops->rem_raddr)
13987 ++ mpcb->pm_ops->rem_raddr(mpcb, rem_id);
13988 ++ mptcp_send_reset_rem_id(mpcb, rem_id);
13989 ++ }
13990 ++}
13991 ++
13992 ++static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk)
13993 ++{
13994 ++ struct tcphdr *th = tcp_hdr(skb);
13995 ++ unsigned char *ptr;
13996 ++ int length = (th->doff * 4) - sizeof(struct tcphdr);
13997 ++
13998 ++ /* Jump through the options to check whether ADD_ADDR is there */
13999 ++ ptr = (unsigned char *)(th + 1);
14000 ++ while (length > 0) {
14001 ++ int opcode = *ptr++;
14002 ++ int opsize;
14003 ++
14004 ++ switch (opcode) {
14005 ++ case TCPOPT_EOL:
14006 ++ return;
14007 ++ case TCPOPT_NOP:
14008 ++ length--;
14009 ++ continue;
14010 ++ default:
14011 ++ opsize = *ptr++;
14012 ++ if (opsize < 2)
14013 ++ return;
14014 ++ if (opsize > length)
14015 ++ return; /* don't parse partial options */
14016 ++ if (opcode == TCPOPT_MPTCP &&
14017 ++ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) {
14018 ++#if IS_ENABLED(CONFIG_IPV6)
14019 ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
14020 ++ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
14021 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
14022 ++ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
14023 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2))
14024 ++#else
14025 ++ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
14026 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2)
14027 ++#endif /* CONFIG_IPV6 */
14028 ++ goto cont;
14029 ++
14030 ++ mptcp_handle_add_addr(ptr, sk);
14031 ++ }
14032 ++ if (opcode == TCPOPT_MPTCP &&
14033 ++ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) {
14034 ++ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0)
14035 ++ goto cont;
14036 ++
14037 ++ mptcp_handle_rem_addr(ptr, sk);
14038 ++ }
14039 ++cont:
14040 ++ ptr += opsize - 2;
14041 ++ length -= opsize;
14042 ++ }
14043 ++ }
14044 ++ return;
14045 ++}
14046 ++
14047 ++static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th)
14048 ++{
14049 ++ struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
14050 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
14051 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
14052 ++
14053 ++ if (unlikely(mptcp->rx_opt.mp_fail)) {
14054 ++ mptcp->rx_opt.mp_fail = 0;
14055 ++
14056 ++ if (!th->rst && !mpcb->infinite_mapping_snd) {
14057 ++ struct sock *sk_it;
14058 ++
14059 ++ mpcb->send_infinite_mapping = 1;
14060 ++ /* We resend everything that has not been acknowledged */
14061 ++ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
14062 ++
14063 ++ /* We artificially restart the whole send-queue. Thus,
14064 ++ * it is as if no packets are in flight
14065 ++ */
14066 ++ tcp_sk(meta_sk)->packets_out = 0;
14067 ++
14068 ++ /* If the snd_nxt already wrapped around, we have to
14069 ++ * undo the wrapping, as we are restarting from snd_una
14070 ++ * on.
14071 ++ */
14072 ++ if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) {
14073 ++ mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2;
14074 ++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
14075 ++ }
14076 ++ tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una;
14077 ++
14078 ++ /* Trigger a sending on the meta. */
14079 ++ mptcp_push_pending_frames(meta_sk);
14080 ++
14081 ++ mptcp_for_each_sk(mpcb, sk_it) {
14082 ++ if (sk != sk_it)
14083 ++ mptcp_sub_force_close(sk_it);
14084 ++ }
14085 ++ }
14086 ++
14087 ++ return 0;
14088 ++ }
14089 ++
14090 ++ if (unlikely(mptcp->rx_opt.mp_fclose)) {
14091 ++ struct sock *sk_it, *tmpsk;
14092 ++
14093 ++ mptcp->rx_opt.mp_fclose = 0;
14094 ++ if (mptcp->rx_opt.mptcp_key != mpcb->mptcp_loc_key)
14095 ++ return 0;
14096 ++
14097 ++ if (tcp_need_reset(sk->sk_state))
14098 ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
14099 ++
14100 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk)
14101 ++ mptcp_sub_force_close(sk_it);
14102 ++
14103 ++ tcp_reset(meta_sk);
14104 ++
14105 ++ return 1;
14106 ++ }
14107 ++
14108 ++ return 0;
14109 ++}
14110 ++
14111 ++static inline void mptcp_path_array_check(struct sock *meta_sk)
14112 ++{
14113 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
14114 ++
14115 ++ if (unlikely(mpcb->list_rcvd)) {
14116 ++ mpcb->list_rcvd = 0;
14117 ++ if (mpcb->pm_ops->new_remote_address)
14118 ++ mpcb->pm_ops->new_remote_address(meta_sk);
14119 ++ }
14120 ++}
14121 ++
14122 ++int mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
14123 ++ const struct sk_buff *skb)
14124 ++{
14125 ++ struct tcp_sock *tp = tcp_sk(sk);
14126 ++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
14127 ++
14128 ++ if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd)
14129 ++ return 0;
14130 ++
14131 ++ if (mptcp_mp_fail_rcvd(sk, th))
14132 ++ return 1;
14133 ++
14134 ++ /* RFC 6824, Section 3.3:
14135 ++ * If a checksum is not present when its use has been negotiated, the
14136 ++ * receiver MUST close the subflow with a RST as it is considered broken.
14137 ++ */
14138 ++ if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum &&
14139 ++ !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) {
14140 ++ if (tcp_need_reset(sk->sk_state))
14141 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
14142 ++
14143 ++ mptcp_sub_force_close(sk);
14144 ++ return 1;
14145 ++ }
14146 ++
14147 ++ /* We have to acknowledge retransmissions of the third
14148 ++ * ack.
14149 ++ */
14150 ++ if (mopt->join_ack) {
14151 ++ tcp_send_delayed_ack(sk);
14152 ++ mopt->join_ack = 0;
14153 ++ }
14154 ++
14155 ++ if (mopt->saw_add_addr || mopt->saw_rem_addr) {
14156 ++ if (mopt->more_add_addr || mopt->more_rem_addr) {
14157 ++ mptcp_parse_addropt(skb, sk);
14158 ++ } else {
14159 ++ if (mopt->saw_add_addr)
14160 ++ mptcp_handle_add_addr(mopt->add_addr_ptr, sk);
14161 ++ if (mopt->saw_rem_addr)
14162 ++ mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk);
14163 ++ }
14164 ++
14165 ++ mopt->more_add_addr = 0;
14166 ++ mopt->saw_add_addr = 0;
14167 ++ mopt->more_rem_addr = 0;
14168 ++ mopt->saw_rem_addr = 0;
14169 ++ }
14170 ++ if (mopt->saw_low_prio) {
14171 ++ if (mopt->saw_low_prio == 1) {
14172 ++ tp->mptcp->rcv_low_prio = mopt->low_prio;
14173 ++ } else {
14174 ++ struct sock *sk_it;
14175 ++ mptcp_for_each_sk(tp->mpcb, sk_it) {
14176 ++ struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp;
14177 ++ if (mptcp->rem_id == mopt->prio_addr_id)
14178 ++ mptcp->rcv_low_prio = mopt->low_prio;
14179 ++ }
14180 ++ }
14181 ++ mopt->saw_low_prio = 0;
14182 ++ }
14183 ++
14184 ++ mptcp_data_ack(sk, skb);
14185 ++
14186 ++ mptcp_path_array_check(mptcp_meta_sk(sk));
14187 ++ /* Socket may have been mp_killed by a REMOVE_ADDR */
14188 ++ if (tp->mp_killed)
14189 ++ return 1;
14190 ++
14191 ++ return 0;
14192 ++}
14193 ++
14194 ++/* In case of fastopen, some data can already be in the write queue.
14195 ++ * We need to update the sequence numbers of these segments, as they
14196 ++ * initially carried subflow-level TCP sequence numbers.
14197 ++ */
14198 ++static void mptcp_rcv_synsent_fastopen(struct sock *meta_sk)
14199 ++{
14200 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14201 ++ struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk);
14202 ++ struct sk_buff *skb;
14203 ++ u32 new_mapping = meta_tp->write_seq - master_tp->snd_una;
14204 ++
14205 ++ /* There should only be one skb in write queue: the data not
14206 ++ * acknowledged in the SYN+ACK. In this case, we need to map
14207 ++ * this data to data sequence numbers.
14208 ++ */
14209 ++ skb_queue_walk(&meta_sk->sk_write_queue, skb) {
14210 ++ /* If the server only acknowledges partially the data sent in
14211 ++ * the SYN, we need to trim the acknowledged part because
14212 ++ * we don't want to retransmit this already received data.
14213 ++ * When we reach this point, tcp_ack() has already cleaned up
14214 ++ * fully acked segments. However, tcp trims partially acked
14215 ++ * segments only when retransmitting. Since MPTCP comes into
14216 ++ * play only now, we will fake an initial transmit, and
14217 ++ * retransmit_skb() will not be called. The following fragment
14218 ++ * comes from __tcp_retransmit_skb().
14219 ++ */
14220 ++ if (before(TCP_SKB_CB(skb)->seq, master_tp->snd_una)) {
14221 ++ BUG_ON(before(TCP_SKB_CB(skb)->end_seq,
14222 ++ master_tp->snd_una));
14223 ++			/* tcp_trim_head can only return ENOMEM if the skb is
14224 ++			 * cloned, which is not the case here (see
14225 ++ * tcp_send_syn_data).
14226 ++ */
14227 ++ BUG_ON(tcp_trim_head(meta_sk, skb, master_tp->snd_una -
14228 ++ TCP_SKB_CB(skb)->seq));
14229 ++ }
14230 ++
14231 ++ TCP_SKB_CB(skb)->seq += new_mapping;
14232 ++ TCP_SKB_CB(skb)->end_seq += new_mapping;
14233 ++ }
14234 ++
14235 ++	/* We can advance write_seq by the number of unacknowledged bytes
14236 ++	 * that were mapped in the previous loop.
14237 ++ */
14238 ++ meta_tp->write_seq += master_tp->write_seq - master_tp->snd_una;
14239 ++
14240 ++	/* The packets from the master_sk will be queued to it later.
14241 ++	 * Until that time, its write queue is empty, and
14242 ++	 * write_seq must align with snd_una.
14243 ++ */
14244 ++ master_tp->snd_nxt = master_tp->write_seq = master_tp->snd_una;
14245 ++ master_tp->packets_out = 0;
14246 ++
14247 ++	/* Although this data has already been sent over the subsk,
14248 ++	 * it has never been sent over the meta_sk, so we rewind
14249 ++	 * the send_head so that tcp considers it an initial send
14250 ++ * (instead of retransmit).
14251 ++ */
14252 ++ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
14253 ++}
14254 ++
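++/* A made-up example for mptcp_rcv_synsent_fastopen() above: suppose the
++ * client sent 100 bytes of TFO data in the SYN and the server acknowledged
++ * 40 of them in the SYN+ACK. The partially-acked skb in the meta write queue
++ * is first trimmed by 40 bytes; its seq/end_seq are then shifted by
++ * new_mapping = meta_tp->write_seq - master_tp->snd_una into the
++ * data-sequence space, and meta write_seq advances by the 60 bytes that are
++ * still unacknowledged (master_tp->write_seq - master_tp->snd_una).
++ */
++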
14255 ++/* The skptr is needed, because if we become MPTCP-capable, we have to switch
14256 ++ * from meta-socket to master-socket.
14257 ++ *
14258 ++ * @return: 1 - we want to reset this connection
14259 ++ * 2 - we want to discard the received syn/ack
14260 ++ * 0 - everything is fine - continue
14261 ++ */
14262 ++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
14263 ++ const struct sk_buff *skb,
14264 ++ const struct mptcp_options_received *mopt)
14265 ++{
14266 ++ struct tcp_sock *tp = tcp_sk(sk);
14267 ++
14268 ++ if (mptcp(tp)) {
14269 ++ u8 hash_mac_check[20];
14270 ++ struct mptcp_cb *mpcb = tp->mpcb;
14271 ++
14272 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
14273 ++ (u8 *)&mpcb->mptcp_loc_key,
14274 ++ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
14275 ++ (u8 *)&tp->mptcp->mptcp_loc_nonce,
14276 ++ (u32 *)hash_mac_check);
14277 ++ if (memcmp(hash_mac_check,
14278 ++ (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) {
14279 ++ mptcp_sub_force_close(sk);
14280 ++ return 1;
14281 ++ }
14282 ++
14283 ++ /* Set this flag in order to postpone data sending
14284 ++ * until the 4th ack arrives.
14285 ++ */
14286 ++ tp->mptcp->pre_established = 1;
14287 ++ tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio;
14288 ++
14289 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
14290 ++ (u8 *)&mpcb->mptcp_rem_key,
14291 ++ (u8 *)&tp->mptcp->mptcp_loc_nonce,
14292 ++ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
14293 ++ (u32 *)&tp->mptcp->sender_mac[0]);
14294 ++
14295 ++ } else if (mopt->saw_mpc) {
14296 ++ struct sock *meta_sk = sk;
14297 ++
14298 ++ if (mptcp_create_master_sk(sk, mopt->mptcp_key,
14299 ++ ntohs(tcp_hdr(skb)->window)))
14300 ++ return 2;
14301 ++
14302 ++ sk = tcp_sk(sk)->mpcb->master_sk;
14303 ++ *skptr = sk;
14304 ++ tp = tcp_sk(sk);
14305 ++
14306 ++		/* If fastopen was used, data might be in the send queue. We
14307 ++ * need to update their sequence number to MPTCP-level seqno.
14308 ++ * Note that it can happen in rare cases that fastopen_req is
14309 ++ * NULL and syn_data is 0 but fastopen indeed occurred and
14310 ++ * data has been queued in the write queue (but not sent).
14311 ++ * Example of such rare cases: connect is non-blocking and
14312 ++ * TFO is configured to work without cookies.
14313 ++ */
14314 ++ if (!skb_queue_empty(&meta_sk->sk_write_queue))
14315 ++ mptcp_rcv_synsent_fastopen(meta_sk);
14316 ++
14317 ++ /* -1, because the SYN consumed 1 byte. In case of TFO, we
14318 ++ * start the subflow-sequence number as if the data of the SYN
14319 ++ * is not part of any mapping.
14320 ++ */
14321 ++ tp->mptcp->snt_isn = tp->snd_una - 1;
14322 ++ tp->mpcb->dss_csum = mopt->dss_csum;
14323 ++ tp->mptcp->include_mpc = 1;
14324 ++
14325 ++ /* Ensure that fastopen is handled at the meta-level. */
14326 ++ tp->fastopen_req = NULL;
14327 ++
14328 ++ sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket);
14329 ++ sk->sk_wq = mptcp_meta_sk(sk)->sk_wq;
14330 ++
14331 ++ /* hold in sk_clone_lock due to initialization to 2 */
14332 ++ sock_put(sk);
14333 ++ } else {
14334 ++ tp->request_mptcp = 0;
14335 ++
14336 ++ if (tp->inside_tk_table)
14337 ++ mptcp_hash_remove(tp);
14338 ++ }
14339 ++
14340 ++ if (mptcp(tp))
14341 ++ tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq;
14342 ++
14343 ++ return 0;
14344 ++}
14345 ++
14346 ++bool mptcp_should_expand_sndbuf(const struct sock *sk)
14347 ++{
14348 ++ const struct sock *sk_it;
14349 ++ const struct sock *meta_sk = mptcp_meta_sk(sk);
14350 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14351 ++ int cnt_backups = 0;
14352 ++ int backup_available = 0;
14353 ++
14354 ++ /* We circumvent this check in tcp_check_space, because we want to
14355 ++ * always call sk_write_space. So, we reproduce the check here.
14356 ++ */
14357 ++ if (!meta_sk->sk_socket ||
14358 ++ !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
14359 ++ return false;
14360 ++
14361 ++ /* If the user specified a specific send buffer setting, do
14362 ++ * not modify it.
14363 ++ */
14364 ++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
14365 ++ return false;
14366 ++
14367 ++ /* If we are under global TCP memory pressure, do not expand. */
14368 ++ if (sk_under_memory_pressure(meta_sk))
14369 ++ return false;
14370 ++
14371 ++ /* If we are under soft global TCP memory pressure, do not expand. */
14372 ++ if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0))
14373 ++ return false;
14374 ++
14375 ++
14376 ++ /* For MPTCP we look for a subsocket that could send data.
14377 ++ * If we found one, then we update the send-buffer.
14378 ++ */
14379 ++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
14380 ++ struct tcp_sock *tp_it = tcp_sk(sk_it);
14381 ++
14382 ++ if (!mptcp_sk_can_send(sk_it))
14383 ++ continue;
14384 ++
14385 ++ /* Backup-flows have to be counted - if there is no other
14386 ++ * subflow we take the backup-flow into account.
14387 ++ */
14388 ++ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio)
14389 ++ cnt_backups++;
14390 ++
14391 ++ if (tp_it->packets_out < tp_it->snd_cwnd) {
14392 ++ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) {
14393 ++ backup_available = 1;
14394 ++ continue;
14395 ++ }
14396 ++ return true;
14397 ++ }
14398 ++ }
14399 ++
14400 ++ /* Backup-flow is available for sending - update send-buffer */
14401 ++ if (meta_tp->mpcb->cnt_established == cnt_backups && backup_available)
14402 ++ return true;
14403 ++ return false;
14404 ++}
14405 ++
14406 ++void mptcp_init_buffer_space(struct sock *sk)
14407 ++{
14408 ++ struct tcp_sock *tp = tcp_sk(sk);
14409 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
14410 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14411 ++ int space;
14412 ++
14413 ++ tcp_init_buffer_space(sk);
14414 ++
14415 ++ if (is_master_tp(tp)) {
14416 ++ meta_tp->rcvq_space.space = meta_tp->rcv_wnd;
14417 ++ meta_tp->rcvq_space.time = tcp_time_stamp;
14418 ++ meta_tp->rcvq_space.seq = meta_tp->copied_seq;
14419 ++
14420 ++ /* If there is only one subflow, we just use regular TCP
14421 ++ * autotuning. User-locks are handled already by
14422 ++ * tcp_init_buffer_space
14423 ++ */
14424 ++ meta_tp->window_clamp = tp->window_clamp;
14425 ++ meta_tp->rcv_ssthresh = tp->rcv_ssthresh;
14426 ++ meta_sk->sk_rcvbuf = sk->sk_rcvbuf;
14427 ++ meta_sk->sk_sndbuf = sk->sk_sndbuf;
14428 ++
14429 ++ return;
14430 ++ }
14431 ++
14432 ++ if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK)
14433 ++ goto snd_buf;
14434 ++
14435 ++ /* Adding a new subflow to the rcv-buffer space. We make a simple
14436 ++ * addition, to give some space to allow traffic on the new subflow.
14437 ++ * Autotuning will increase it further later on.
14438 ++ */
14439 ++ space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]);
14440 ++ if (space > meta_sk->sk_rcvbuf) {
14441 ++ meta_tp->window_clamp += tp->window_clamp;
14442 ++ meta_tp->rcv_ssthresh += tp->rcv_ssthresh;
14443 ++ meta_sk->sk_rcvbuf = space;
14444 ++ }
14445 ++
14446 ++snd_buf:
14447 ++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
14448 ++ return;
14449 ++
14450 ++ /* Adding a new subflow to the send-buffer space. We make a simple
14451 ++ * addition, to give some space to allow traffic on the new subflow.
14452 ++ * Autotuning will increase it further later on.
14453 ++ */
14454 ++ space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, sysctl_tcp_wmem[2]);
14455 ++ if (space > meta_sk->sk_sndbuf) {
14456 ++ meta_sk->sk_sndbuf = space;
14457 ++ meta_sk->sk_write_space(meta_sk);
14458 ++ }
14459 ++}
14460 ++
14461 ++void mptcp_tcp_set_rto(struct sock *sk)
14462 ++{
14463 ++ tcp_set_rto(sk);
14464 ++ mptcp_set_rto(sk);
14465 ++}
14466 +diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c
14467 +new file mode 100644
14468 +index 000000000000..1183d1305d35
14469 +--- /dev/null
14470 ++++ b/net/mptcp/mptcp_ipv4.c
14471 +@@ -0,0 +1,483 @@
14472 ++/*
14473 ++ * MPTCP implementation - IPv4-specific functions
14474 ++ *
14475 ++ * Initial Design & Implementation:
14476 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
14477 ++ *
14478 ++ * Current Maintainer:
14479 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
14480 ++ *
14481 ++ * Additional authors:
14482 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
14483 ++ * Gregory Detal <gregory.detal@×××××××××.be>
14484 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
14485 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
14486 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
14487 ++ * Andreas Ripke <ripke@××××××.eu>
14488 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
14489 ++ * Octavian Purdila <octavian.purdila@×××××.com>
14490 ++ * John Ronan <jronan@××××.org>
14491 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
14492 ++ * Brandon Heller <brandonh@××××××××.edu>
14493 ++ *
14494 ++ *
14495 ++ * This program is free software; you can redistribute it and/or
14496 ++ * modify it under the terms of the GNU General Public License
14497 ++ * as published by the Free Software Foundation; either version
14498 ++ * 2 of the License, or (at your option) any later version.
14499 ++ */
14500 ++
14501 ++#include <linux/export.h>
14502 ++#include <linux/ip.h>
14503 ++#include <linux/list.h>
14504 ++#include <linux/skbuff.h>
14505 ++#include <linux/spinlock.h>
14506 ++#include <linux/tcp.h>
14507 ++
14508 ++#include <net/inet_common.h>
14509 ++#include <net/inet_connection_sock.h>
14510 ++#include <net/mptcp.h>
14511 ++#include <net/mptcp_v4.h>
14512 ++#include <net/request_sock.h>
14513 ++#include <net/tcp.h>
14514 ++
14515 ++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
14516 ++{
14517 ++ u32 hash[MD5_DIGEST_WORDS];
14518 ++
14519 ++ hash[0] = (__force u32)saddr;
14520 ++ hash[1] = (__force u32)daddr;
14521 ++ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
14522 ++ hash[3] = mptcp_seed++;
14523 ++
14524 ++ md5_transform(hash, mptcp_secret);
14525 ++
14526 ++ return hash[0];
14527 ++}
14528 ++
14529 ++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
14530 ++{
14531 ++ u32 hash[MD5_DIGEST_WORDS];
14532 ++
14533 ++ hash[0] = (__force u32)saddr;
14534 ++ hash[1] = (__force u32)daddr;
14535 ++ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
14536 ++ hash[3] = mptcp_seed++;
14537 ++
14538 ++ md5_transform(hash, mptcp_secret);
14539 ++
14540 ++ return *((u64 *)hash);
14541 ++}
14542 ++
14543 ++
14544 ++static void mptcp_v4_reqsk_destructor(struct request_sock *req)
14545 ++{
14546 ++ mptcp_reqsk_destructor(req);
14547 ++
14548 ++ tcp_v4_reqsk_destructor(req);
14549 ++}
14550 ++
14551 ++static int mptcp_v4_init_req(struct request_sock *req, struct sock *sk,
14552 ++ struct sk_buff *skb)
14553 ++{
14554 ++ tcp_request_sock_ipv4_ops.init_req(req, sk, skb);
14555 ++ mptcp_reqsk_init(req, skb);
14556 ++
14557 ++ return 0;
14558 ++}
14559 ++
14560 ++static int mptcp_v4_join_init_req(struct request_sock *req, struct sock *sk,
14561 ++ struct sk_buff *skb)
14562 ++{
14563 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
14564 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
14565 ++ union inet_addr addr;
14566 ++ int loc_id;
14567 ++ bool low_prio = false;
14568 ++
14569 ++	/* We need to do this as early as possible, because if we fail later
14570 ++	 * (e.g., in get_local_id), reqsk_free tries to remove the
14571 ++	 * request-socket from the htb in mptcp_hash_request_remove, as pprev
14572 ++ * may be different from NULL.
14573 ++ */
14574 ++ mtreq->hash_entry.pprev = NULL;
14575 ++
14576 ++ tcp_request_sock_ipv4_ops.init_req(req, sk, skb);
14577 ++
14578 ++ mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(ip_hdr(skb)->saddr,
14579 ++ ip_hdr(skb)->daddr,
14580 ++ tcp_hdr(skb)->source,
14581 ++ tcp_hdr(skb)->dest);
14582 ++ addr.ip = inet_rsk(req)->ir_loc_addr;
14583 ++ loc_id = mpcb->pm_ops->get_local_id(AF_INET, &addr, sock_net(sk), &low_prio);
14584 ++ if (loc_id == -1)
14585 ++ return -1;
14586 ++ mtreq->loc_id = loc_id;
14587 ++ mtreq->low_prio = low_prio;
14588 ++
14589 ++ mptcp_join_reqsk_init(mpcb, req, skb);
14590 ++
14591 ++ return 0;
14592 ++}
14593 ++
14594 ++/* Similar to tcp_request_sock_ops */
14595 ++struct request_sock_ops mptcp_request_sock_ops __read_mostly = {
14596 ++ .family = PF_INET,
14597 ++ .obj_size = sizeof(struct mptcp_request_sock),
14598 ++ .rtx_syn_ack = tcp_rtx_synack,
14599 ++ .send_ack = tcp_v4_reqsk_send_ack,
14600 ++ .destructor = mptcp_v4_reqsk_destructor,
14601 ++ .send_reset = tcp_v4_send_reset,
14602 ++ .syn_ack_timeout = tcp_syn_ack_timeout,
14603 ++};
14604 ++
14605 ++static void mptcp_v4_reqsk_queue_hash_add(struct sock *meta_sk,
14606 ++ struct request_sock *req,
14607 ++ const unsigned long timeout)
14608 ++{
14609 ++ const u32 h1 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
14610 ++ inet_rsk(req)->ir_rmt_port,
14611 ++ 0, MPTCP_HASH_SIZE);
14612 ++ /* We cannot call inet_csk_reqsk_queue_hash_add(), because we do not
14613 ++ * want to reset the keepalive-timer (responsible for retransmitting
14614 ++ * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
14615 ++ * overload the keepalive timer. Also, it's not a big deal, because the
14616 ++ * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
14617 ++ * if the third ACK gets lost, the client will handle the retransmission
14618 ++ * anyways. If our SYN/ACK gets lost, the client will retransmit the
14619 ++ * SYN.
14620 ++ */
14621 ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
14622 ++ struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
14623 ++ const u32 h2 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
14624 ++ inet_rsk(req)->ir_rmt_port,
14625 ++ lopt->hash_rnd, lopt->nr_table_entries);
14626 ++
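++	/* h2 indexes the meta-socket's own SYN queue, while h1 indexes the
++	 * global MPTCP request table that mptcp_v4_search_req() walks.
++	 */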
14627 ++ reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
14628 ++ if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0)
14629 ++ mptcp_reset_synack_timer(meta_sk, timeout);
14630 ++
14631 ++ rcu_read_lock();
14632 ++ spin_lock(&mptcp_reqsk_hlock);
14633 ++ hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]);
14634 ++ spin_unlock(&mptcp_reqsk_hlock);
14635 ++ rcu_read_unlock();
14636 ++}
14637 ++
14638 ++/* Similar to tcp_v4_conn_request */
14639 ++static int mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb)
14640 ++{
14641 ++ return tcp_conn_request(&mptcp_request_sock_ops,
14642 ++ &mptcp_join_request_sock_ipv4_ops,
14643 ++ meta_sk, skb);
14644 ++}
14645 ++
14646 ++/* We only process join requests here (either the SYN or the final ACK). */
14647 ++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
14648 ++{
14649 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
14650 ++ struct sock *child, *rsk = NULL;
14651 ++ int ret;
14652 ++
14653 ++ if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
14654 ++ struct tcphdr *th = tcp_hdr(skb);
14655 ++ const struct iphdr *iph = ip_hdr(skb);
14656 ++ struct sock *sk;
14657 ++
14658 ++ sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
14659 ++ iph->saddr, th->source, iph->daddr,
14660 ++ th->dest, inet_iif(skb));
14661 ++
14662 ++ if (!sk) {
14663 ++ kfree_skb(skb);
14664 ++ return 0;
14665 ++ }
14666 ++ if (is_meta_sk(sk)) {
14667 ++			WARN("%s Did not find a sub-sk - found the meta instead!\n", __func__);
14668 ++ kfree_skb(skb);
14669 ++ sock_put(sk);
14670 ++ return 0;
14671 ++ }
14672 ++
14673 ++ if (sk->sk_state == TCP_TIME_WAIT) {
14674 ++ inet_twsk_put(inet_twsk(sk));
14675 ++ kfree_skb(skb);
14676 ++ return 0;
14677 ++ }
14678 ++
14679 ++ ret = tcp_v4_do_rcv(sk, skb);
14680 ++ sock_put(sk);
14681 ++
14682 ++ return ret;
14683 ++ }
14684 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
14685 ++
14686 ++ /* Has been removed from the tk-table. Thus, no new subflows.
14687 ++ *
14688 ++ * Check for close-state is necessary, because we may have been closed
14689 ++ * without passing by mptcp_close().
14690 ++ *
14691 ++ * When falling back, no new subflows are allowed either.
14692 ++ */
14693 ++ if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
14694 ++ mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
14695 ++ goto reset_and_discard;
14696 ++
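++	/* tcp_v4_hnd_req() returns the meta_sk itself when no pending request
++	 * matches this segment (a SYN then opens a new MP_JOIN request below),
++	 * or a new, locked subflow socket when the segment completes a pending
++	 * request (the final ACK of the MP_JOIN handshake).
++	 */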
14697 ++ child = tcp_v4_hnd_req(meta_sk, skb);
14698 ++
14699 ++ if (!child)
14700 ++ goto discard;
14701 ++
14702 ++ if (child != meta_sk) {
14703 ++ sock_rps_save_rxhash(child, skb);
14704 ++		/* We don't call tcp_child_process here, because we already
14705 ++		 * hold the meta-sk-lock and are sure that it is not owned
14706 ++ * by the user.
14707 ++ */
14708 ++ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
14709 ++ bh_unlock_sock(child);
14710 ++ sock_put(child);
14711 ++ if (ret) {
14712 ++ rsk = child;
14713 ++ goto reset_and_discard;
14714 ++ }
14715 ++ } else {
14716 ++ if (tcp_hdr(skb)->syn) {
14717 ++ mptcp_v4_join_request(meta_sk, skb);
14718 ++ goto discard;
14719 ++ }
14720 ++ goto reset_and_discard;
14721 ++ }
14722 ++ return 0;
14723 ++
14724 ++reset_and_discard:
14725 ++ if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) {
14726 ++ const struct tcphdr *th = tcp_hdr(skb);
14727 ++ const struct iphdr *iph = ip_hdr(skb);
14728 ++ struct request_sock **prev, *req;
14729 ++ /* If we end up here, it means we should not have matched on the
14730 ++ * request-socket. But, because the request-sock queue is only
14731 ++ * destroyed in mptcp_close, the socket may actually already be
14732 ++ * in close-state (e.g., through shutdown()) while still having
14733 ++ * pending request sockets.
14734 ++ */
14735 ++ req = inet_csk_search_req(meta_sk, &prev, th->source,
14736 ++ iph->saddr, iph->daddr);
14737 ++ if (req) {
14738 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
14739 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue,
14740 ++ req);
14741 ++ reqsk_free(req);
14742 ++ }
14743 ++ }
14744 ++
14745 ++ tcp_v4_send_reset(rsk, skb);
14746 ++discard:
14747 ++ kfree_skb(skb);
14748 ++ return 0;
14749 ++}
14750 ++
14751 ++/* After this, the ref count of the meta_sk associated with the request_sock
14752 ++ * is incremented. Thus it is the responsibility of the caller
14753 ++ * to call sock_put() when the reference is not needed anymore.
14754 ++ */
14755 ++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
14756 ++ const __be32 laddr, const struct net *net)
14757 ++{
14758 ++ const struct mptcp_request_sock *mtreq;
14759 ++ struct sock *meta_sk = NULL;
14760 ++ const struct hlist_nulls_node *node;
14761 ++ const u32 hash = inet_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE);
14762 ++
14763 ++ rcu_read_lock();
14764 ++begin:
14765 ++ hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash],
14766 ++ hash_entry) {
14767 ++ struct inet_request_sock *ireq = inet_rsk(rev_mptcp_rsk(mtreq));
14768 ++ meta_sk = mtreq->mptcp_mpcb->meta_sk;
14769 ++
14770 ++ if (ireq->ir_rmt_port == rport &&
14771 ++ ireq->ir_rmt_addr == raddr &&
14772 ++ ireq->ir_loc_addr == laddr &&
14773 ++ rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET &&
14774 ++ net_eq(net, sock_net(meta_sk)))
14775 ++ goto found;
14776 ++ meta_sk = NULL;
14777 ++ }
14778 ++	/* A request-socket is destroyed by RCU, so it might have been recycled
14779 ++	 * and put into another hash-table list. After the lookup we may
14780 ++	 * therefore end up in a different list and may need to restart.
14781 ++ *
14782 ++ * See also the comment in __inet_lookup_established.
14783 ++ */
14784 ++ if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE)
14785 ++ goto begin;
14786 ++
14787 ++found:
14788 ++ if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
14789 ++ meta_sk = NULL;
14790 ++ rcu_read_unlock();
14791 ++
14792 ++ return meta_sk;
14793 ++}
14794 ++
14795 ++/* Create a new IPv4 subflow.
14796 ++ *
14797 ++ * We are in user-context and the meta-sock lock is held.
14798 ++ */
14799 ++int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
14800 ++ struct mptcp_rem4 *rem)
14801 ++{
14802 ++ struct tcp_sock *tp;
14803 ++ struct sock *sk;
14804 ++ struct sockaddr_in loc_in, rem_in;
14805 ++ struct socket sock;
14806 ++ int ret;
14807 ++
14808 ++ /** First, create and prepare the new socket */
14809 ++
14810 ++ sock.type = meta_sk->sk_socket->type;
14811 ++ sock.state = SS_UNCONNECTED;
14812 ++ sock.wq = meta_sk->sk_socket->wq;
14813 ++ sock.file = meta_sk->sk_socket->file;
14814 ++ sock.ops = NULL;
14815 ++
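++	/* The on-stack struct socket is only a vehicle to drive inet_create(),
++	 * bind() and connect(); the resulting sk is attached to the meta-socket's
++	 * real struct socket at the end of this function.
++	 */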
14816 ++ ret = inet_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
14817 ++ if (unlikely(ret < 0)) {
14818 ++ mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret);
14819 ++ return ret;
14820 ++ }
14821 ++
14822 ++ sk = sock.sk;
14823 ++ tp = tcp_sk(sk);
14824 ++
14825 ++ /* All subsockets need the MPTCP-lock-class */
14826 ++ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
14827 ++ lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
14828 ++
14829 ++ if (mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL))
14830 ++ goto error;
14831 ++
14832 ++ tp->mptcp->slave_sk = 1;
14833 ++ tp->mptcp->low_prio = loc->low_prio;
14834 ++
14835 ++ /* Initializing the timer for an MPTCP subflow */
14836 ++ setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
14837 ++
14838 ++ /** Then, connect the socket to the peer */
14839 ++ loc_in.sin_family = AF_INET;
14840 ++ rem_in.sin_family = AF_INET;
14841 ++ loc_in.sin_port = 0;
14842 ++ if (rem->port)
14843 ++ rem_in.sin_port = rem->port;
14844 ++ else
14845 ++ rem_in.sin_port = inet_sk(meta_sk)->inet_dport;
14846 ++ loc_in.sin_addr = loc->addr;
14847 ++ rem_in.sin_addr = rem->addr;
14848 ++
14849 ++ ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in));
14850 ++ if (ret < 0) {
14851 ++ mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
14852 ++ __func__, ret);
14853 ++ goto error;
14854 ++ }
14855 ++
14856 ++ mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d\n",
14857 ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
14858 ++ tp->mptcp->path_index, &loc_in.sin_addr,
14859 ++ ntohs(loc_in.sin_port), &rem_in.sin_addr,
14860 ++ ntohs(rem_in.sin_port));
14861 ++
14862 ++ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4)
14863 ++ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr);
14864 ++
14865 ++ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
14866 ++ sizeof(struct sockaddr_in), O_NONBLOCK);
14867 ++ if (ret < 0 && ret != -EINPROGRESS) {
14868 ++ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
14869 ++ __func__, ret);
14870 ++ goto error;
14871 ++ }
14872 ++
14873 ++ sk_set_socket(sk, meta_sk->sk_socket);
14874 ++ sk->sk_wq = meta_sk->sk_wq;
14875 ++
14876 ++ return 0;
14877 ++
14878 ++error:
14879 ++ /* May happen if mptcp_add_sock fails first */
14880 ++ if (!mptcp(tp)) {
14881 ++ tcp_close(sk, 0);
14882 ++ } else {
14883 ++ local_bh_disable();
14884 ++ mptcp_sub_force_close(sk);
14885 ++ local_bh_enable();
14886 ++ }
14887 ++ return ret;
14888 ++}
14889 ++EXPORT_SYMBOL(mptcp_init4_subsockets);
14890 ++
14891 ++const struct inet_connection_sock_af_ops mptcp_v4_specific = {
14892 ++ .queue_xmit = ip_queue_xmit,
14893 ++ .send_check = tcp_v4_send_check,
14894 ++ .rebuild_header = inet_sk_rebuild_header,
14895 ++ .sk_rx_dst_set = inet_sk_rx_dst_set,
14896 ++ .conn_request = mptcp_conn_request,
14897 ++ .syn_recv_sock = tcp_v4_syn_recv_sock,
14898 ++ .net_header_len = sizeof(struct iphdr),
14899 ++ .setsockopt = ip_setsockopt,
14900 ++ .getsockopt = ip_getsockopt,
14901 ++ .addr2sockaddr = inet_csk_addr2sockaddr,
14902 ++ .sockaddr_len = sizeof(struct sockaddr_in),
14903 ++ .bind_conflict = inet_csk_bind_conflict,
14904 ++#ifdef CONFIG_COMPAT
14905 ++ .compat_setsockopt = compat_ip_setsockopt,
14906 ++ .compat_getsockopt = compat_ip_getsockopt,
14907 ++#endif
14908 ++};
14909 ++
14910 ++struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
14911 ++struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
14912 ++
14913 ++/* General initialization of IPv4 for MPTCP */
14914 ++int mptcp_pm_v4_init(void)
14915 ++{
14916 ++ int ret = 0;
14917 ++ struct request_sock_ops *ops = &mptcp_request_sock_ops;
14918 ++
14919 ++ mptcp_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
14920 ++ mptcp_request_sock_ipv4_ops.init_req = mptcp_v4_init_req;
14921 ++
14922 ++ mptcp_join_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
14923 ++ mptcp_join_request_sock_ipv4_ops.init_req = mptcp_v4_join_init_req;
14924 ++ mptcp_join_request_sock_ipv4_ops.queue_hash_add = mptcp_v4_reqsk_queue_hash_add;
14925 ++
14926 ++ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP");
14927 ++ if (ops->slab_name == NULL) {
14928 ++ ret = -ENOMEM;
14929 ++ goto out;
14930 ++ }
14931 ++
14932 ++ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
14933 ++ SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
14934 ++ NULL);
14935 ++
14936 ++ if (ops->slab == NULL) {
14937 ++ ret = -ENOMEM;
14938 ++ goto err_reqsk_create;
14939 ++ }
14940 ++
14941 ++out:
14942 ++ return ret;
14943 ++
14944 ++err_reqsk_create:
14945 ++ kfree(ops->slab_name);
14946 ++ ops->slab_name = NULL;
14947 ++ goto out;
14948 ++}
14949 ++
14950 ++void mptcp_pm_v4_undo(void)
14951 ++{
14952 ++ kmem_cache_destroy(mptcp_request_sock_ops.slab);
14953 ++ kfree(mptcp_request_sock_ops.slab_name);
14954 ++}
14955 +diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c
14956 +new file mode 100644
14957 +index 000000000000..1036973aa855
14958 +--- /dev/null
14959 ++++ b/net/mptcp/mptcp_ipv6.c
14960 +@@ -0,0 +1,518 @@
14961 ++/*
14962 ++ * MPTCP implementation - IPv6-specific functions
14963 ++ *
14964 ++ * Initial Design & Implementation:
14965 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
14966 ++ *
14967 ++ * Current Maintainer:
14968 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
14969 ++ *
14970 ++ * Additional authors:
14971 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
14972 ++ * Gregory Detal <gregory.detal@×××××××××.be>
14973 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
14974 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
14975 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
14976 ++ * Andreas Ripke <ripke@××××××.eu>
14977 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
14978 ++ * Octavian Purdila <octavian.purdila@×××××.com>
14979 ++ * John Ronan <jronan@××××.org>
14980 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
14981 ++ * Brandon Heller <brandonh@××××××××.edu>
14982 ++ *
14983 ++ *
14984 ++ * This program is free software; you can redistribute it and/or
14985 ++ * modify it under the terms of the GNU General Public License
14986 ++ * as published by the Free Software Foundation; either version
14987 ++ * 2 of the License, or (at your option) any later version.
14988 ++ */
14989 ++
14990 ++#include <linux/export.h>
14991 ++#include <linux/in6.h>
14992 ++#include <linux/kernel.h>
14993 ++
14994 ++#include <net/addrconf.h>
14995 ++#include <net/flow.h>
14996 ++#include <net/inet6_connection_sock.h>
14997 ++#include <net/inet6_hashtables.h>
14998 ++#include <net/inet_common.h>
14999 ++#include <net/ipv6.h>
15000 ++#include <net/ip6_checksum.h>
15001 ++#include <net/ip6_route.h>
15002 ++#include <net/mptcp.h>
15003 ++#include <net/mptcp_v6.h>
15004 ++#include <net/tcp.h>
15005 ++#include <net/transp_v6.h>
15006 ++
15007 ++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
15008 ++ __be16 sport, __be16 dport)
15009 ++{
15010 ++ u32 secret[MD5_MESSAGE_BYTES / 4];
15011 ++ u32 hash[MD5_DIGEST_WORDS];
15012 ++ u32 i;
15013 ++
15014 ++ memcpy(hash, saddr, 16);
15015 ++ for (i = 0; i < 4; i++)
15016 ++ secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
15017 ++ secret[4] = mptcp_secret[4] +
15018 ++ (((__force u16)sport << 16) + (__force u16)dport);
15019 ++ secret[5] = mptcp_seed++;
15020 ++ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
15021 ++ secret[i] = mptcp_secret[i];
15022 ++
15023 ++ md5_transform(hash, secret);
15024 ++
15025 ++ return hash[0];
15026 ++}
15027 ++
15028 ++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
15029 ++ __be16 sport, __be16 dport)
15030 ++{
15031 ++ u32 secret[MD5_MESSAGE_BYTES / 4];
15032 ++ u32 hash[MD5_DIGEST_WORDS];
15033 ++ u32 i;
15034 ++
15035 ++ memcpy(hash, saddr, 16);
15036 ++ for (i = 0; i < 4; i++)
15037 ++ secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
15038 ++ secret[4] = mptcp_secret[4] +
15039 ++ (((__force u16)sport << 16) + (__force u16)dport);
15040 ++ secret[5] = mptcp_seed++;
15041 ++ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
15042 ++ secret[i] = mptcp_secret[i];
15043 ++
15044 ++ md5_transform(hash, secret);
15045 ++
15046 ++ return *((u64 *)hash);
15047 ++}
15048 ++
15049 ++static void mptcp_v6_reqsk_destructor(struct request_sock *req)
15050 ++{
15051 ++ mptcp_reqsk_destructor(req);
15052 ++
15053 ++ tcp_v6_reqsk_destructor(req);
15054 ++}
15055 ++
15056 ++static int mptcp_v6_init_req(struct request_sock *req, struct sock *sk,
15057 ++ struct sk_buff *skb)
15058 ++{
15059 ++ tcp_request_sock_ipv6_ops.init_req(req, sk, skb);
15060 ++ mptcp_reqsk_init(req, skb);
15061 ++
15062 ++ return 0;
15063 ++}
15064 ++
15065 ++static int mptcp_v6_join_init_req(struct request_sock *req, struct sock *sk,
15066 ++ struct sk_buff *skb)
15067 ++{
15068 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
15069 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
15070 ++ union inet_addr addr;
15071 ++ int loc_id;
15072 ++ bool low_prio = false;
15073 ++
15074 ++	/* We need to do this as early as possible, because if we fail later
15075 ++ * (e.g., get_local_id), then reqsk_free tries to remove the
15076 ++ * request-socket from the htb in mptcp_hash_request_remove as pprev
15077 ++ * may be different from NULL.
15078 ++ */
15079 ++ mtreq->hash_entry.pprev = NULL;
15080 ++
15081 ++ tcp_request_sock_ipv6_ops.init_req(req, sk, skb);
15082 ++
15083 ++ mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->saddr.s6_addr32,
15084 ++ ipv6_hdr(skb)->daddr.s6_addr32,
15085 ++ tcp_hdr(skb)->source,
15086 ++ tcp_hdr(skb)->dest);
15087 ++ addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
15088 ++ loc_id = mpcb->pm_ops->get_local_id(AF_INET6, &addr, sock_net(sk), &low_prio);
15089 ++ if (loc_id == -1)
15090 ++ return -1;
15091 ++ mtreq->loc_id = loc_id;
15092 ++ mtreq->low_prio = low_prio;
15093 ++
15094 ++ mptcp_join_reqsk_init(mpcb, req, skb);
15095 ++
15096 ++ return 0;
15097 ++}
15098 ++
15099 ++/* Similar to tcp6_request_sock_ops */
15100 ++struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
15101 ++ .family = AF_INET6,
15102 ++ .obj_size = sizeof(struct mptcp_request_sock),
15103 ++ .rtx_syn_ack = tcp_v6_rtx_synack,
15104 ++ .send_ack = tcp_v6_reqsk_send_ack,
15105 ++ .destructor = mptcp_v6_reqsk_destructor,
15106 ++ .send_reset = tcp_v6_send_reset,
15107 ++ .syn_ack_timeout = tcp_syn_ack_timeout,
15108 ++};
15109 ++
15110 ++static void mptcp_v6_reqsk_queue_hash_add(struct sock *meta_sk,
15111 ++ struct request_sock *req,
15112 ++ const unsigned long timeout)
15113 ++{
15114 ++ const u32 h1 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
15115 ++ inet_rsk(req)->ir_rmt_port,
15116 ++ 0, MPTCP_HASH_SIZE);
15117 ++ /* We cannot call inet6_csk_reqsk_queue_hash_add(), because we do not
15118 ++ * want to reset the keepalive-timer (responsible for retransmitting
15119 ++ * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
15120 ++ * overload the keepalive timer. Also, it's not a big deal, because the
15121 ++ * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
15122 ++ * if the third ACK gets lost, the client will handle the retransmission
15123 ++ * anyways. If our SYN/ACK gets lost, the client will retransmit the
15124 ++ * SYN.
15125 ++ */
15126 ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
15127 ++ struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
15128 ++ const u32 h2 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
15129 ++ inet_rsk(req)->ir_rmt_port,
15130 ++ lopt->hash_rnd, lopt->nr_table_entries);
15131 ++
15132 ++ reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
15133 ++ if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0)
15134 ++ mptcp_reset_synack_timer(meta_sk, timeout);
15135 ++
15136 ++ rcu_read_lock();
15137 ++ spin_lock(&mptcp_reqsk_hlock);
15138 ++ hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]);
15139 ++ spin_unlock(&mptcp_reqsk_hlock);
15140 ++ rcu_read_unlock();
15141 ++}
15142 ++
15143 ++static int mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
15144 ++{
15145 ++ return tcp_conn_request(&mptcp6_request_sock_ops,
15146 ++ &mptcp_join_request_sock_ipv6_ops,
15147 ++ meta_sk, skb);
15148 ++}
15149 ++
15150 ++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
15151 ++{
15152 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
15153 ++ struct sock *child, *rsk = NULL;
15154 ++ int ret;
15155 ++
15156 ++ if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
15157 ++ struct tcphdr *th = tcp_hdr(skb);
15158 ++ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
15159 ++ struct sock *sk;
15160 ++
15161 ++ sk = __inet6_lookup_established(sock_net(meta_sk),
15162 ++ &tcp_hashinfo,
15163 ++ &ip6h->saddr, th->source,
15164 ++ &ip6h->daddr, ntohs(th->dest),
15165 ++ inet6_iif(skb));
15166 ++
15167 ++ if (!sk) {
15168 ++ kfree_skb(skb);
15169 ++ return 0;
15170 ++ }
15171 ++ if (is_meta_sk(sk)) {
15172 ++ WARN("%s Did not find a sub-sk!\n", __func__);
15173 ++ kfree_skb(skb);
15174 ++ sock_put(sk);
15175 ++ return 0;
15176 ++ }
15177 ++
15178 ++ if (sk->sk_state == TCP_TIME_WAIT) {
15179 ++ inet_twsk_put(inet_twsk(sk));
15180 ++ kfree_skb(skb);
15181 ++ return 0;
15182 ++ }
15183 ++
15184 ++ ret = tcp_v6_do_rcv(sk, skb);
15185 ++ sock_put(sk);
15186 ++
15187 ++ return ret;
15188 ++ }
15189 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
15190 ++
15191 ++ /* Has been removed from the tk-table. Thus, no new subflows.
15192 ++ *
15193 ++ * Check for close-state is necessary, because we may have been closed
15194 ++ * without passing by mptcp_close().
15195 ++ *
15196 ++ * When falling back, no new subflows are allowed either.
15197 ++ */
15198 ++ if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
15199 ++ mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
15200 ++ goto reset_and_discard;
15201 ++
15202 ++ child = tcp_v6_hnd_req(meta_sk, skb);
15203 ++
15204 ++ if (!child)
15205 ++ goto discard;
15206 ++
15207 ++ if (child != meta_sk) {
15208 ++ sock_rps_save_rxhash(child, skb);
15209 ++		/* We don't call tcp_child_process here, because we already
15210 ++		 * hold the meta-sk-lock and are sure that it is not owned
15211 ++ * by the user.
15212 ++ */
15213 ++ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
15214 ++ bh_unlock_sock(child);
15215 ++ sock_put(child);
15216 ++ if (ret) {
15217 ++ rsk = child;
15218 ++ goto reset_and_discard;
15219 ++ }
15220 ++ } else {
15221 ++ if (tcp_hdr(skb)->syn) {
15222 ++ mptcp_v6_join_request(meta_sk, skb);
15223 ++ goto discard;
15224 ++ }
15225 ++ goto reset_and_discard;
15226 ++ }
15227 ++ return 0;
15228 ++
15229 ++reset_and_discard:
15230 ++ if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) {
15231 ++ const struct tcphdr *th = tcp_hdr(skb);
15232 ++ struct request_sock **prev, *req;
15233 ++ /* If we end up here, it means we should not have matched on the
15234 ++ * request-socket. But, because the request-sock queue is only
15235 ++ * destroyed in mptcp_close, the socket may actually already be
15236 ++ * in close-state (e.g., through shutdown()) while still having
15237 ++ * pending request sockets.
15238 ++ */
15239 ++ req = inet6_csk_search_req(meta_sk, &prev, th->source,
15240 ++ &ipv6_hdr(skb)->saddr,
15241 ++ &ipv6_hdr(skb)->daddr, inet6_iif(skb));
15242 ++ if (req) {
15243 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
15244 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue,
15245 ++ req);
15246 ++ reqsk_free(req);
15247 ++ }
15248 ++ }
15249 ++
15250 ++ tcp_v6_send_reset(rsk, skb);
15251 ++discard:
15252 ++ kfree_skb(skb);
15253 ++ return 0;
15254 ++}
15255 ++
15256 ++/* After this, the ref count of the meta_sk associated with the request_sock
15257 ++ * is incremented. Thus it is the responsibility of the caller
15258 ++ * to call sock_put() when the reference is not needed anymore.
15259 ++ */
15260 ++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
15261 ++ const struct in6_addr *laddr, const struct net *net)
15262 ++{
15263 ++ const struct mptcp_request_sock *mtreq;
15264 ++ struct sock *meta_sk = NULL;
15265 ++ const struct hlist_nulls_node *node;
15266 ++ const u32 hash = inet6_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE);
15267 ++
15268 ++ rcu_read_lock();
15269 ++begin:
15270 ++ hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash],
15271 ++ hash_entry) {
15272 ++ struct inet_request_sock *treq = inet_rsk(rev_mptcp_rsk(mtreq));
15273 ++ meta_sk = mtreq->mptcp_mpcb->meta_sk;
15274 ++
15275 ++ if (inet_rsk(rev_mptcp_rsk(mtreq))->ir_rmt_port == rport &&
15276 ++ rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET6 &&
15277 ++ ipv6_addr_equal(&treq->ir_v6_rmt_addr, raddr) &&
15278 ++ ipv6_addr_equal(&treq->ir_v6_loc_addr, laddr) &&
15279 ++ net_eq(net, sock_net(meta_sk)))
15280 ++ goto found;
15281 ++ meta_sk = NULL;
15282 ++ }
15283 ++	/* A request-socket is destroyed by RCU, so it might have been recycled
15284 ++	 * and put into another hash-table list. After the lookup we may
15285 ++	 * therefore end up in a different list and may need to restart.
15286 ++ *
15287 ++ * See also the comment in __inet_lookup_established.
15288 ++ */
15289 ++ if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE)
15290 ++ goto begin;
15291 ++
15292 ++found:
15293 ++ if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
15294 ++ meta_sk = NULL;
15295 ++ rcu_read_unlock();
15296 ++
15297 ++ return meta_sk;
15298 ++}
15299 ++
15300 ++/* Create a new IPv6 subflow.
15301 ++ *
15302 ++ * We are in user-context and the meta-sock lock is held.
15303 ++ */
15304 ++int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
15305 ++ struct mptcp_rem6 *rem)
15306 ++{
15307 ++ struct tcp_sock *tp;
15308 ++ struct sock *sk;
15309 ++ struct sockaddr_in6 loc_in, rem_in;
15310 ++ struct socket sock;
15311 ++ int ret;
15312 ++
15313 ++ /** First, create and prepare the new socket */
15314 ++
15315 ++ sock.type = meta_sk->sk_socket->type;
15316 ++ sock.state = SS_UNCONNECTED;
15317 ++ sock.wq = meta_sk->sk_socket->wq;
15318 ++ sock.file = meta_sk->sk_socket->file;
15319 ++ sock.ops = NULL;
15320 ++
15321 ++ ret = inet6_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
15322 ++ if (unlikely(ret < 0)) {
15323 ++ mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret);
15324 ++ return ret;
15325 ++ }
15326 ++
15327 ++ sk = sock.sk;
15328 ++ tp = tcp_sk(sk);
15329 ++
15330 ++ /* All subsockets need the MPTCP-lock-class */
15331 ++ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
15332 ++ lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
15333 ++
15334 ++ if (mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL))
15335 ++ goto error;
15336 ++
15337 ++ tp->mptcp->slave_sk = 1;
15338 ++ tp->mptcp->low_prio = loc->low_prio;
15339 ++
15340 ++ /* Initializing the timer for an MPTCP subflow */
15341 ++ setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
15342 ++
15343 ++ /** Then, connect the socket to the peer */
15344 ++ loc_in.sin6_family = AF_INET6;
15345 ++ rem_in.sin6_family = AF_INET6;
15346 ++ loc_in.sin6_port = 0;
15347 ++ if (rem->port)
15348 ++ rem_in.sin6_port = rem->port;
15349 ++ else
15350 ++ rem_in.sin6_port = inet_sk(meta_sk)->inet_dport;
15351 ++ loc_in.sin6_addr = loc->addr;
15352 ++ rem_in.sin6_addr = rem->addr;
15353 ++
15354 ++ ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in6));
15355 ++ if (ret < 0) {
15356 ++		mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
15357 ++ __func__, ret);
15358 ++ goto error;
15359 ++ }
15360 ++
15361 ++ mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d\n",
15362 ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
15363 ++ tp->mptcp->path_index, &loc_in.sin6_addr,
15364 ++ ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
15365 ++ ntohs(rem_in.sin6_port));
15366 ++
15367 ++ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6)
15368 ++ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr);
15369 ++
15370 ++ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
15371 ++ sizeof(struct sockaddr_in6), O_NONBLOCK);
15372 ++ if (ret < 0 && ret != -EINPROGRESS) {
15373 ++ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
15374 ++ __func__, ret);
15375 ++ goto error;
15376 ++ }
15377 ++
15378 ++ sk_set_socket(sk, meta_sk->sk_socket);
15379 ++ sk->sk_wq = meta_sk->sk_wq;
15380 ++
15381 ++ return 0;
15382 ++
15383 ++error:
15384 ++ /* May happen if mptcp_add_sock fails first */
15385 ++ if (!mptcp(tp)) {
15386 ++ tcp_close(sk, 0);
15387 ++ } else {
15388 ++ local_bh_disable();
15389 ++ mptcp_sub_force_close(sk);
15390 ++ local_bh_enable();
15391 ++ }
15392 ++ return ret;
15393 ++}
15394 ++EXPORT_SYMBOL(mptcp_init6_subsockets);
15395 ++
15396 ++const struct inet_connection_sock_af_ops mptcp_v6_specific = {
15397 ++ .queue_xmit = inet6_csk_xmit,
15398 ++ .send_check = tcp_v6_send_check,
15399 ++ .rebuild_header = inet6_sk_rebuild_header,
15400 ++ .sk_rx_dst_set = inet6_sk_rx_dst_set,
15401 ++ .conn_request = mptcp_conn_request,
15402 ++ .syn_recv_sock = tcp_v6_syn_recv_sock,
15403 ++ .net_header_len = sizeof(struct ipv6hdr),
15404 ++ .net_frag_header_len = sizeof(struct frag_hdr),
15405 ++ .setsockopt = ipv6_setsockopt,
15406 ++ .getsockopt = ipv6_getsockopt,
15407 ++ .addr2sockaddr = inet6_csk_addr2sockaddr,
15408 ++ .sockaddr_len = sizeof(struct sockaddr_in6),
15409 ++ .bind_conflict = inet6_csk_bind_conflict,
15410 ++#ifdef CONFIG_COMPAT
15411 ++ .compat_setsockopt = compat_ipv6_setsockopt,
15412 ++ .compat_getsockopt = compat_ipv6_getsockopt,
15413 ++#endif
15414 ++};
15415 ++
15416 ++const struct inet_connection_sock_af_ops mptcp_v6_mapped = {
15417 ++ .queue_xmit = ip_queue_xmit,
15418 ++ .send_check = tcp_v4_send_check,
15419 ++ .rebuild_header = inet_sk_rebuild_header,
15420 ++ .sk_rx_dst_set = inet_sk_rx_dst_set,
15421 ++ .conn_request = mptcp_conn_request,
15422 ++ .syn_recv_sock = tcp_v6_syn_recv_sock,
15423 ++ .net_header_len = sizeof(struct iphdr),
15424 ++ .setsockopt = ipv6_setsockopt,
15425 ++ .getsockopt = ipv6_getsockopt,
15426 ++ .addr2sockaddr = inet6_csk_addr2sockaddr,
15427 ++ .sockaddr_len = sizeof(struct sockaddr_in6),
15428 ++ .bind_conflict = inet6_csk_bind_conflict,
15429 ++#ifdef CONFIG_COMPAT
15430 ++ .compat_setsockopt = compat_ipv6_setsockopt,
15431 ++ .compat_getsockopt = compat_ipv6_getsockopt,
15432 ++#endif
15433 ++};
15434 ++
15435 ++struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
15436 ++struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
15437 ++
15438 ++int mptcp_pm_v6_init(void)
15439 ++{
15440 ++ int ret = 0;
15441 ++ struct request_sock_ops *ops = &mptcp6_request_sock_ops;
15442 ++
15443 ++ mptcp_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
15444 ++ mptcp_request_sock_ipv6_ops.init_req = mptcp_v6_init_req;
15445 ++
15446 ++ mptcp_join_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
15447 ++ mptcp_join_request_sock_ipv6_ops.init_req = mptcp_v6_join_init_req;
15448 ++ mptcp_join_request_sock_ipv6_ops.queue_hash_add = mptcp_v6_reqsk_queue_hash_add;
15449 ++
15450 ++ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6");
15451 ++ if (ops->slab_name == NULL) {
15452 ++ ret = -ENOMEM;
15453 ++ goto out;
15454 ++ }
15455 ++
15456 ++ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
15457 ++ SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
15458 ++ NULL);
15459 ++
15460 ++ if (ops->slab == NULL) {
15461 ++ ret = -ENOMEM;
15462 ++ goto err_reqsk_create;
15463 ++ }
15464 ++
15465 ++out:
15466 ++ return ret;
15467 ++
15468 ++err_reqsk_create:
15469 ++ kfree(ops->slab_name);
15470 ++ ops->slab_name = NULL;
15471 ++ goto out;
15472 ++}
15473 ++
15474 ++void mptcp_pm_v6_undo(void)
15475 ++{
15476 ++ kmem_cache_destroy(mptcp6_request_sock_ops.slab);
15477 ++ kfree(mptcp6_request_sock_ops.slab_name);
15478 ++}
15479 +diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c
15480 +new file mode 100644
15481 +index 000000000000..6f5087983175
15482 +--- /dev/null
15483 ++++ b/net/mptcp/mptcp_ndiffports.c
15484 +@@ -0,0 +1,161 @@
15485 ++#include <linux/module.h>
15486 ++
15487 ++#include <net/mptcp.h>
15488 ++#include <net/mptcp_v4.h>
15489 ++
15490 ++#if IS_ENABLED(CONFIG_IPV6)
15491 ++#include <net/mptcp_v6.h>
15492 ++#endif
15493 ++
15494 ++struct ndiffports_priv {
15495 ++ /* Worker struct for subflow establishment */
15496 ++ struct work_struct subflow_work;
15497 ++
15498 ++ struct mptcp_cb *mpcb;
15499 ++};
15500 ++
15501 ++static int num_subflows __read_mostly = 2;
15502 ++module_param(num_subflows, int, 0644);
15503 ++MODULE_PARM_DESC(num_subflows, "choose the number of subflows per MPTCP connection");
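++/* With permissions 0644 the parameter can presumably also be changed at
++ * run-time through /sys/module/<module name>/parameters/num_subflows.
++ */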
15504 ++
15505 ++/**
15506 ++ * Create all new subflows by calling mptcp_initX_subsockets.
15507 ++ *
15508 ++ * This function uses a goto next_subflow to release the lock between the
15509 ++ * creation of new subflows, giving other processes a chance to do some work
15510 ++ * on the socket and potentially finish the communication.
15511 ++ **/
15512 ++static void create_subflow_worker(struct work_struct *work)
15513 ++{
15514 ++ const struct ndiffports_priv *pm_priv = container_of(work,
15515 ++ struct ndiffports_priv,
15516 ++ subflow_work);
15517 ++ struct mptcp_cb *mpcb = pm_priv->mpcb;
15518 ++ struct sock *meta_sk = mpcb->meta_sk;
15519 ++ int iter = 0;
15520 ++
15521 ++next_subflow:
15522 ++ if (iter) {
15523 ++ release_sock(meta_sk);
15524 ++ mutex_unlock(&mpcb->mpcb_mutex);
15525 ++
15526 ++ cond_resched();
15527 ++ }
15528 ++ mutex_lock(&mpcb->mpcb_mutex);
15529 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
15530 ++
15531 ++ iter++;
15532 ++
15533 ++ if (sock_flag(meta_sk, SOCK_DEAD))
15534 ++ goto exit;
15535 ++
15536 ++ if (mpcb->master_sk &&
15537 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
15538 ++ goto exit;
15539 ++
15540 ++ if (num_subflows > iter && num_subflows > mpcb->cnt_subflows) {
15541 ++ if (meta_sk->sk_family == AF_INET ||
15542 ++ mptcp_v6_is_v4_mapped(meta_sk)) {
15543 ++ struct mptcp_loc4 loc;
15544 ++ struct mptcp_rem4 rem;
15545 ++
15546 ++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
15547 ++ loc.loc4_id = 0;
15548 ++ loc.low_prio = 0;
15549 ++
15550 ++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
15551 ++ rem.port = inet_sk(meta_sk)->inet_dport;
15552 ++ rem.rem4_id = 0; /* Default 0 */
15553 ++
15554 ++ mptcp_init4_subsockets(meta_sk, &loc, &rem);
15555 ++ } else {
15556 ++#if IS_ENABLED(CONFIG_IPV6)
15557 ++ struct mptcp_loc6 loc;
15558 ++ struct mptcp_rem6 rem;
15559 ++
15560 ++ loc.addr = inet6_sk(meta_sk)->saddr;
15561 ++ loc.loc6_id = 0;
15562 ++ loc.low_prio = 0;
15563 ++
15564 ++ rem.addr = meta_sk->sk_v6_daddr;
15565 ++ rem.port = inet_sk(meta_sk)->inet_dport;
15566 ++ rem.rem6_id = 0; /* Default 0 */
15567 ++
15568 ++ mptcp_init6_subsockets(meta_sk, &loc, &rem);
15569 ++#endif
15570 ++ }
15571 ++ goto next_subflow;
15572 ++ }
15573 ++
15574 ++exit:
15575 ++ release_sock(meta_sk);
15576 ++ mutex_unlock(&mpcb->mpcb_mutex);
15577 ++ sock_put(meta_sk);
15578 ++}
15579 ++
15580 ++static void ndiffports_new_session(const struct sock *meta_sk)
15581 ++{
15582 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
15583 ++ struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
15584 ++
15585 ++ /* Initialize workqueue-struct */
15586 ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
15587 ++ fmp->mpcb = mpcb;
15588 ++}
15589 ++
15590 ++static void ndiffports_create_subflows(struct sock *meta_sk)
15591 ++{
15592 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
15593 ++ struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
15594 ++
15595 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
15596 ++ mpcb->send_infinite_mapping ||
15597 ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
15598 ++ return;
15599 ++
15600 ++ if (!work_pending(&pm_priv->subflow_work)) {
15601 ++ sock_hold(meta_sk);
15602 ++ queue_work(mptcp_wq, &pm_priv->subflow_work);
15603 ++ }
15604 ++}
15605 ++
15606 ++static int ndiffports_get_local_id(sa_family_t family, union inet_addr *addr,
15607 ++ struct net *net, bool *low_prio)
15608 ++{
15609 ++ return 0;
15610 ++}
15611 ++
15612 ++static struct mptcp_pm_ops ndiffports __read_mostly = {
15613 ++ .new_session = ndiffports_new_session,
15614 ++ .fully_established = ndiffports_create_subflows,
15615 ++ .get_local_id = ndiffports_get_local_id,
15616 ++ .name = "ndiffports",
15617 ++ .owner = THIS_MODULE,
15618 ++};
15619 ++
15620 ++/* General initialization of MPTCP_PM */
15621 ++static int __init ndiffports_register(void)
15622 ++{
15623 ++ BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE);
15624 ++
15625 ++ if (mptcp_register_path_manager(&ndiffports))
15626 ++ goto exit;
15627 ++
15628 ++ return 0;
15629 ++
15630 ++exit:
15631 ++ return -1;
15632 ++}
15633 ++
15634 ++static void ndiffports_unregister(void)
15635 ++{
15636 ++ mptcp_unregister_path_manager(&ndiffports);
15637 ++}
15638 ++
15639 ++module_init(ndiffports_register);
15640 ++module_exit(ndiffports_unregister);
15641 ++
15642 ++MODULE_AUTHOR("Christoph Paasch");
15643 ++MODULE_LICENSE("GPL");
15644 ++MODULE_DESCRIPTION("NDIFF-PORTS MPTCP");
15645 ++MODULE_VERSION("0.88");
15646 +diff --git a/net/mptcp/mptcp_ofo_queue.c b/net/mptcp/mptcp_ofo_queue.c
15647 +new file mode 100644
15648 +index 000000000000..ec4e98622637
15649 +--- /dev/null
15650 ++++ b/net/mptcp/mptcp_ofo_queue.c
15651 +@@ -0,0 +1,295 @@
15652 ++/*
15653 ++ * MPTCP implementation - Fast algorithm for MPTCP meta-reordering
15654 ++ *
15655 ++ * Initial Design & Implementation:
15656 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
15657 ++ *
15658 ++ * Current Maintainer & Author:
15659 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
15660 ++ *
15661 ++ * Additional authors:
15662 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
15663 ++ * Gregory Detal <gregory.detal@×××××××××.be>
15664 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
15665 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
15666 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
15667 ++ * Andreas Ripke <ripke@××××××.eu>
15668 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
15669 ++ * Octavian Purdila <octavian.purdila@×××××.com>
15670 ++ * John Ronan <jronan@××××.org>
15671 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
15672 ++ * Brandon Heller <brandonh@××××××××.edu>
15673 ++ *
15674 ++ * This program is free software; you can redistribute it and/or
15675 ++ * modify it under the terms of the GNU General Public License
15676 ++ * as published by the Free Software Foundation; either version
15677 ++ * 2 of the License, or (at your option) any later version.
15678 ++ */
15679 ++
15680 ++#include <linux/skbuff.h>
15681 ++#include <linux/slab.h>
15682 ++#include <net/tcp.h>
15683 ++#include <net/mptcp.h>
15684 ++
15685 ++void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
15686 ++ const struct sk_buff *skb)
15687 ++{
15688 ++ struct tcp_sock *tp;
15689 ++
15690 ++ mptcp_for_each_tp(mpcb, tp) {
15691 ++ if (tp->mptcp->shortcut_ofoqueue == skb) {
15692 ++ tp->mptcp->shortcut_ofoqueue = NULL;
15693 ++ return;
15694 ++ }
15695 ++ }
15696 ++}
15697 ++
15698 ++/* Does 'skb' fit after 'here' in the queue 'head'? If yes, we queue it
15699 ++ * and return 1 (or free it and return -1 if it was coalesced into 'here').
15700 ++ */
15701 ++static int mptcp_ofo_queue_after(struct sk_buff_head *head,
15702 ++ struct sk_buff *skb, struct sk_buff *here,
15703 ++ const struct tcp_sock *tp)
15704 ++{
15705 ++ struct sock *meta_sk = tp->meta_sk;
15706 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
15707 ++ u32 seq = TCP_SKB_CB(skb)->seq;
15708 ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
15709 ++
15710 ++ /* We want to queue skb after here, thus seq >= end_seq */
15711 ++ if (before(seq, TCP_SKB_CB(here)->end_seq))
15712 ++ return 0;
15713 ++
15714 ++ if (seq == TCP_SKB_CB(here)->end_seq) {
15715 ++ bool fragstolen = false;
15716 ++
15717 ++ if (!tcp_try_coalesce(meta_sk, here, skb, &fragstolen)) {
15718 ++ __skb_queue_after(&meta_tp->out_of_order_queue, here, skb);
15719 ++ return 1;
15720 ++ } else {
15721 ++ kfree_skb_partial(skb, fragstolen);
15722 ++ return -1;
15723 ++ }
15724 ++ }
15725 ++
15726 ++ /* If here is the last one, we can always queue it */
15727 ++ if (skb_queue_is_last(head, here)) {
15728 ++ __skb_queue_after(head, here, skb);
15729 ++ return 1;
15730 ++ } else {
15731 ++ struct sk_buff *skb1 = skb_queue_next(head, here);
15732 ++		/* It's not the last one, but does it fit between 'here' and
15733 ++		 * the one after 'here'? That is, does end_seq <= after_here->seq?
15734 ++		 */
15735 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) {
15736 ++ __skb_queue_after(head, here, skb);
15737 ++ return 1;
15738 ++ }
15739 ++ }
15740 ++
15741 ++ return 0;
15742 ++}
15743 ++
15744 ++static void try_shortcut(struct sk_buff *shortcut, struct sk_buff *skb,
15745 ++ struct sk_buff_head *head, struct tcp_sock *tp)
15746 ++{
15747 ++ struct sock *meta_sk = tp->meta_sk;
15748 ++ struct tcp_sock *tp_it, *meta_tp = tcp_sk(meta_sk);
15749 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
15750 ++ struct sk_buff *skb1, *best_shortcut = NULL;
15751 ++ u32 seq = TCP_SKB_CB(skb)->seq;
15752 ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
15753 ++ u32 distance = 0xffffffff;
15754 ++
15755 ++ /* First, check the tp's shortcut */
15756 ++ if (!shortcut) {
15757 ++ if (skb_queue_empty(head)) {
15758 ++ __skb_queue_head(head, skb);
15759 ++ goto end;
15760 ++ }
15761 ++ } else {
15762 ++ int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
15763 ++		/* Is the tp's shortcut a hit? If yes, we insert. */
15764 ++
15765 ++ if (ret) {
15766 ++ skb = (ret > 0) ? skb : NULL;
15767 ++ goto end;
15768 ++ }
15769 ++ }
15770 ++
15771 ++ /* Check the shortcuts of the other subsockets. */
15772 ++ mptcp_for_each_tp(mpcb, tp_it) {
15773 ++ shortcut = tp_it->mptcp->shortcut_ofoqueue;
15774 ++ /* Can we queue it here? If yes, do so! */
15775 ++ if (shortcut) {
15776 ++ int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
15777 ++
15778 ++ if (ret) {
15779 ++ skb = (ret > 0) ? skb : NULL;
15780 ++ goto end;
15781 ++ }
15782 ++ }
15783 ++
15784 ++ /* Could not queue it, check if we are close.
15785 ++ * We are looking for a shortcut, close enough to seq to
15786 ++ * set skb1 prematurely and thus improve the subsequent lookup,
15787 ++ * which tries to find a skb1 so that skb1->seq <= seq.
15788 ++ *
15789 ++ * So, here we only take shortcuts, whose shortcut->seq > seq,
15790 ++ * and minimize the distance between shortcut->seq and seq and
15791 ++ * set best_shortcut to this one with the minimal distance.
15792 ++ *
15793 ++ * That way, the subsequent while-loop is shortest.
15794 ++ */
15795 ++ if (shortcut && after(TCP_SKB_CB(shortcut)->seq, seq)) {
15796 ++ /* Are we closer than the current best shortcut? */
15797 ++ if ((u32)(TCP_SKB_CB(shortcut)->seq - seq) < distance) {
15798 ++ distance = (u32)(TCP_SKB_CB(shortcut)->seq - seq);
15799 ++ best_shortcut = shortcut;
15800 ++ }
15801 ++ }
15802 ++ }
15803 ++
15804 ++ if (best_shortcut)
15805 ++ skb1 = best_shortcut;
15806 ++ else
15807 ++ skb1 = skb_peek_tail(head);
15808 ++
15809 ++ if (seq == TCP_SKB_CB(skb1)->end_seq) {
15810 ++ bool fragstolen = false;
15811 ++
15812 ++ if (!tcp_try_coalesce(meta_sk, skb1, skb, &fragstolen)) {
15813 ++ __skb_queue_after(&meta_tp->out_of_order_queue, skb1, skb);
15814 ++ } else {
15815 ++ kfree_skb_partial(skb, fragstolen);
15816 ++ skb = NULL;
15817 ++ }
15818 ++
15819 ++ goto end;
15820 ++ }
15821 ++
15822 ++ /* Find the insertion point, starting from best_shortcut if available.
15823 ++ *
15824 ++	 * Inspired by tcp_data_queue_ofo.
15825 ++ */
15826 ++ while (1) {
15827 ++ /* skb1->seq <= seq */
15828 ++ if (!after(TCP_SKB_CB(skb1)->seq, seq))
15829 ++ break;
15830 ++ if (skb_queue_is_first(head, skb1)) {
15831 ++ skb1 = NULL;
15832 ++ break;
15833 ++ }
15834 ++ skb1 = skb_queue_prev(head, skb1);
15835 ++ }
15836 ++
15837 ++	/* Does skb overlap the previous one? */
15838 ++ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
15839 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
15840 ++ /* All the bits are present. */
15841 ++ __kfree_skb(skb);
15842 ++ skb = NULL;
15843 ++ goto end;
15844 ++ }
15845 ++ if (seq == TCP_SKB_CB(skb1)->seq) {
15846 ++ if (skb_queue_is_first(head, skb1))
15847 ++ skb1 = NULL;
15848 ++ else
15849 ++ skb1 = skb_queue_prev(head, skb1);
15850 ++ }
15851 ++ }
15852 ++ if (!skb1)
15853 ++ __skb_queue_head(head, skb);
15854 ++ else
15855 ++ __skb_queue_after(head, skb1, skb);
15856 ++
15857 ++ /* And clean segments covered by new one as whole. */
15858 ++ while (!skb_queue_is_last(head, skb)) {
15859 ++ skb1 = skb_queue_next(head, skb);
15860 ++
15861 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
15862 ++ break;
15863 ++
15864 ++ __skb_unlink(skb1, head);
15865 ++ mptcp_remove_shortcuts(mpcb, skb1);
15866 ++ __kfree_skb(skb1);
15867 ++ }
15868 ++
15869 ++end:
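++	/* Remember where the skb was queued: it becomes this subflow's shortcut
++	 * for the next insertion. skb is NULL if it was coalesced or dropped above.
++	 */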
15870 ++ if (skb) {
15871 ++ skb_set_owner_r(skb, meta_sk);
15872 ++ tp->mptcp->shortcut_ofoqueue = skb;
15873 ++ }
15874 ++
15875 ++ return;
15876 ++}
15877 ++
15878 ++/**
15879 ++ * @sk: the subflow that received this skb.
15880 ++ */
15881 ++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
15882 ++ struct sock *sk)
15883 ++{
15884 ++ struct tcp_sock *tp = tcp_sk(sk);
15885 ++
15886 ++ try_shortcut(tp->mptcp->shortcut_ofoqueue, skb,
15887 ++ &tcp_sk(meta_sk)->out_of_order_queue, tp);
15888 ++}
15889 ++
15890 ++bool mptcp_prune_ofo_queue(struct sock *sk)
15891 ++{
15892 ++ struct tcp_sock *tp = tcp_sk(sk);
15893 ++ bool res = false;
15894 ++
15895 ++ if (!skb_queue_empty(&tp->out_of_order_queue)) {
15896 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
15897 ++ mptcp_purge_ofo_queue(tp);
15898 ++
15899 ++ /* No sack at the mptcp-level */
15900 ++ sk_mem_reclaim(sk);
15901 ++ res = true;
15902 ++ }
15903 ++
15904 ++ return res;
15905 ++}
15906 ++
15907 ++void mptcp_ofo_queue(struct sock *meta_sk)
15908 ++{
15909 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
15910 ++ struct sk_buff *skb;
15911 ++
15912 ++ while ((skb = skb_peek(&meta_tp->out_of_order_queue)) != NULL) {
15913 ++ u32 old_rcv_nxt = meta_tp->rcv_nxt;
15914 ++ if (after(TCP_SKB_CB(skb)->seq, meta_tp->rcv_nxt))
15915 ++ break;
15916 ++
15917 ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->rcv_nxt)) {
15918 ++ __skb_unlink(skb, &meta_tp->out_of_order_queue);
15919 ++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
15920 ++ __kfree_skb(skb);
15921 ++ continue;
15922 ++ }
15923 ++
15924 ++ __skb_unlink(skb, &meta_tp->out_of_order_queue);
15925 ++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
15926 ++
15927 ++ __skb_queue_tail(&meta_sk->sk_receive_queue, skb);
15928 ++ meta_tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
15929 ++ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
15930 ++
15931 ++ if (tcp_hdr(skb)->fin)
15932 ++ mptcp_fin(meta_sk);
15933 ++ }
15934 ++}
15935 ++
15936 ++void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp)
15937 ++{
15938 ++ struct sk_buff_head *head = &meta_tp->out_of_order_queue;
15939 ++ struct sk_buff *skb, *tmp;
15940 ++
15941 ++ skb_queue_walk_safe(head, skb, tmp) {
15942 ++ __skb_unlink(skb, head);
15943 ++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
15944 ++ kfree_skb(skb);
15945 ++ }
15946 ++}
15947 +diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c
15948 +new file mode 100644
15949 +index 000000000000..53f5c43bb488
15950 +--- /dev/null
15951 ++++ b/net/mptcp/mptcp_olia.c
15952 +@@ -0,0 +1,311 @@
15953 ++/*
15954 ++ * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL:
15955 ++ *
15956 ++ * Algorithm design:
15957 ++ * Ramin Khalili <ramin.khalili@××××.ch>
15958 ++ * Nicolas Gast <nicolas.gast@××××.ch>
15959 ++ * Jean-Yves Le Boudec <jean-yves.leboudec@××××.ch>
15960 ++ *
15961 ++ * Implementation:
15962 ++ * Ramin Khalili <ramin.khalili@××××.ch>
15963 ++ *
15964 ++ * Ported to the official MPTCP-kernel:
15965 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
15966 ++ *
15967 ++ * This program is free software; you can redistribute it and/or
15968 ++ * modify it under the terms of the GNU General Public License
15969 ++ * as published by the Free Software Foundation; either version
15970 ++ * 2 of the License, or (at your option) any later version.
15971 ++ */
15972 ++
15973 ++
15974 ++#include <net/tcp.h>
15975 ++#include <net/mptcp.h>
15976 ++
15977 ++#include <linux/module.h>
15978 ++
15979 ++static int scale = 10;
15980 ++
15981 ++struct mptcp_olia {
15982 ++ u32 mptcp_loss1;
15983 ++ u32 mptcp_loss2;
15984 ++ u32 mptcp_loss3;
15985 ++ int epsilon_num;
15986 ++ u32 epsilon_den;
15987 ++ int mptcp_snd_cwnd_cnt;
15988 ++};
15989 ++
15990 ++static inline int mptcp_olia_sk_can_send(const struct sock *sk)
15991 ++{
15992 ++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
15993 ++}
15994 ++
15995 ++static inline u64 mptcp_olia_scale(u64 val, int scale)
15996 ++{
15997 ++ return (u64) val << scale;
15998 ++}
15999 ++
16000 ++/* Take care of the artificial inflation of cwnd (see RFC 5681)
16001 ++ * during the fast-retransmit phase.
16002 ++ */
16003 ++static u32 mptcp_get_crt_cwnd(struct sock *sk)
16004 ++{
16005 ++ const struct inet_connection_sock *icsk = inet_csk(sk);
16006 ++
16007 ++ if (icsk->icsk_ca_state == TCP_CA_Recovery)
16008 ++ return tcp_sk(sk)->snd_ssthresh;
16009 ++ else
16010 ++ return tcp_sk(sk)->snd_cwnd;
16011 ++}
16012 ++
16013 ++/* Return the denominator of the first term of the increase term. */
16014 ++static u64 mptcp_get_rate(const struct mptcp_cb *mpcb , u32 path_rtt)
16015 ++{
16016 ++ struct sock *sk;
16017 ++ u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */
16018 ++
16019 ++ mptcp_for_each_sk(mpcb, sk) {
16020 ++ struct tcp_sock *tp = tcp_sk(sk);
16021 ++ u64 scaled_num;
16022 ++ u32 tmp_cwnd;
16023 ++
16024 ++ if (!mptcp_olia_sk_can_send(sk))
16025 ++ continue;
16026 ++
16027 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16028 ++ scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt;
16029 ++ rate += div_u64(scaled_num , tp->srtt_us);
16030 ++ }
16031 ++ rate *= rate;
16032 ++ return rate;
16033 ++}
16034 ++
16035 ++/* find the maximum cwnd, used to find set M */
16036 ++static u32 mptcp_get_max_cwnd(const struct mptcp_cb *mpcb)
16037 ++{
16038 ++ struct sock *sk;
16039 ++ u32 best_cwnd = 0;
16040 ++
16041 ++ mptcp_for_each_sk(mpcb, sk) {
16042 ++ u32 tmp_cwnd;
16043 ++
16044 ++ if (!mptcp_olia_sk_can_send(sk))
16045 ++ continue;
16046 ++
16047 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16048 ++ if (tmp_cwnd > best_cwnd)
16049 ++ best_cwnd = tmp_cwnd;
16050 ++ }
16051 ++ return best_cwnd;
16052 ++}
16053 ++
16054 ++static void mptcp_get_epsilon(const struct mptcp_cb *mpcb)
16055 ++{
16056 ++ struct mptcp_olia *ca;
16057 ++ struct tcp_sock *tp;
16058 ++ struct sock *sk;
16059 ++ u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1;
16060 ++ u32 max_cwnd = 1, best_cwnd = 1, tmp_cwnd;
16061 ++ u8 M = 0, B_not_M = 0;
16062 ++
16063 ++ /* TODO - integrate this in the following loop - we just want to iterate once */
16064 ++
16065 ++ max_cwnd = mptcp_get_max_cwnd(mpcb);
16066 ++
16067 ++ /* find the best path */
16068 ++ mptcp_for_each_sk(mpcb, sk) {
16069 ++ tp = tcp_sk(sk);
16070 ++ ca = inet_csk_ca(sk);
16071 ++
16072 ++ if (!mptcp_olia_sk_can_send(sk))
16073 ++ continue;
16074 ++
16075 ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
16076 ++ /* TODO - check here and rename variables */
16077 ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
16078 ++ ca->mptcp_loss2 - ca->mptcp_loss1);
16079 ++
16080 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16081 ++ if ((u64)tmp_int * best_rtt >= (u64)best_int * tmp_rtt) {
16082 ++ best_rtt = tmp_rtt;
16083 ++ best_int = tmp_int;
16084 ++ best_cwnd = tmp_cwnd;
16085 ++ }
16086 ++ }
16087 ++
16088 ++ /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */
16089 ++ /* find the size of M and B_not_M */
16090 ++ mptcp_for_each_sk(mpcb, sk) {
16091 ++ tp = tcp_sk(sk);
16092 ++ ca = inet_csk_ca(sk);
16093 ++
16094 ++ if (!mptcp_olia_sk_can_send(sk))
16095 ++ continue;
16096 ++
16097 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16098 ++ if (tmp_cwnd == max_cwnd) {
16099 ++ M++;
16100 ++ } else {
16101 ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
16102 ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
16103 ++ ca->mptcp_loss2 - ca->mptcp_loss1);
16104 ++
16105 ++ if ((u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt)
16106 ++ B_not_M++;
16107 ++ }
16108 ++ }
16109 ++
16110 ++ /* check if the path is in M or B_not_M and set the value of epsilon accordingly */
16111 ++ mptcp_for_each_sk(mpcb, sk) {
16112 ++ tp = tcp_sk(sk);
16113 ++ ca = inet_csk_ca(sk);
16114 ++
16115 ++ if (!mptcp_olia_sk_can_send(sk))
16116 ++ continue;
16117 ++
16118 ++ if (B_not_M == 0) {
16119 ++ ca->epsilon_num = 0;
16120 ++ ca->epsilon_den = 1;
16121 ++ } else {
16122 ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
16123 ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
16124 ++ ca->mptcp_loss2 - ca->mptcp_loss1);
16125 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16126 ++
16127 ++ if (tmp_cwnd < max_cwnd &&
16128 ++ (u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) {
16129 ++ ca->epsilon_num = 1;
16130 ++ ca->epsilon_den = mpcb->cnt_established * B_not_M;
16131 ++ } else if (tmp_cwnd == max_cwnd) {
16132 ++ ca->epsilon_num = -1;
16133 ++ ca->epsilon_den = mpcb->cnt_established * M;
16134 ++ } else {
16135 ++ ca->epsilon_num = 0;
16136 ++ ca->epsilon_den = 1;
16137 ++ }
16138 ++ }
16139 ++ }
16140 ++}
16141 ++
16142 ++/* setting the initial values */
16143 ++static void mptcp_olia_init(struct sock *sk)
16144 ++{
16145 ++ const struct tcp_sock *tp = tcp_sk(sk);
16146 ++ struct mptcp_olia *ca = inet_csk_ca(sk);
16147 ++
16148 ++ if (mptcp(tp)) {
16149 ++ ca->mptcp_loss1 = tp->snd_una;
16150 ++ ca->mptcp_loss2 = tp->snd_una;
16151 ++ ca->mptcp_loss3 = tp->snd_una;
16152 ++ ca->mptcp_snd_cwnd_cnt = 0;
16153 ++ ca->epsilon_num = 0;
16154 ++ ca->epsilon_den = 1;
16155 ++ }
16156 ++}
16157 ++
16158 ++/* updating inter-loss distance and ssthresh */
16159 ++static void mptcp_olia_set_state(struct sock *sk, u8 new_state)
16160 ++{
16161 ++ if (!mptcp(tcp_sk(sk)))
16162 ++ return;
16163 ++
16164 ++ if (new_state == TCP_CA_Loss ||
16165 ++ new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) {
16166 ++ struct mptcp_olia *ca = inet_csk_ca(sk);
16167 ++
16168 ++ if (ca->mptcp_loss3 != ca->mptcp_loss2 &&
16169 ++ !inet_csk(sk)->icsk_retransmits) {
16170 ++ ca->mptcp_loss1 = ca->mptcp_loss2;
16171 ++ ca->mptcp_loss2 = ca->mptcp_loss3;
16172 ++ }
16173 ++ }
16174 ++}
16175 ++
16176 ++/* main algorithm */
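++/* Per ACK on subflow r, with w_p the cwnd and rtt_p the smoothed RTT of each
++ * established subflow p, the computation below effectively amounts to
++ *
++ *	w_r += (w_r / rtt_r^2) / (sum_p w_p / rtt_p)^2  +  epsilon_r / w_r
++ *
++ * where epsilon_r is 1/(N * |B\M|) for the best paths that do not have the
++ * maximal window, -1/(N * |M|) for the paths with the maximal window, and 0
++ * otherwise (N = number of established subflows; all epsilons are 0 when
++ * B\M is empty).
++ */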
16177 ++static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked)
16178 ++{
16179 ++ struct tcp_sock *tp = tcp_sk(sk);
16180 ++ struct mptcp_olia *ca = inet_csk_ca(sk);
16181 ++ const struct mptcp_cb *mpcb = tp->mpcb;
16182 ++
16183 ++ u64 inc_num, inc_den, rate, cwnd_scaled;
16184 ++
16185 ++ if (!mptcp(tp)) {
16186 ++ tcp_reno_cong_avoid(sk, ack, acked);
16187 ++ return;
16188 ++ }
16189 ++
16190 ++ ca->mptcp_loss3 = tp->snd_una;
16191 ++
16192 ++ if (!tcp_is_cwnd_limited(sk))
16193 ++ return;
16194 ++
16195 ++ /* slow start if it is in the safe area */
16196 ++ if (tp->snd_cwnd <= tp->snd_ssthresh) {
16197 ++ tcp_slow_start(tp, acked);
16198 ++ return;
16199 ++ }
16200 ++
16201 ++ mptcp_get_epsilon(mpcb);
16202 ++ rate = mptcp_get_rate(mpcb, tp->srtt_us);
16203 ++ cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale);
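++	/* The '?:' keeps inc_den at least 1, so the divisions below never see a
++	 * zero denominator.
++	 */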
16204 ++ inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1;
16205 ++
16206 ++	/* Calculate the increase term; scaling is used to reduce the rounding effect. */
16207 ++ if (ca->epsilon_num == -1) {
16208 ++ if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) {
16209 ++ inc_num = rate - ca->epsilon_den *
16210 ++ cwnd_scaled * cwnd_scaled;
16211 ++ ca->mptcp_snd_cwnd_cnt -= div64_u64(
16212 ++ mptcp_olia_scale(inc_num , scale) , inc_den);
16213 ++ } else {
16214 ++ inc_num = ca->epsilon_den *
16215 ++ cwnd_scaled * cwnd_scaled - rate;
16216 ++ ca->mptcp_snd_cwnd_cnt += div64_u64(
16217 ++ mptcp_olia_scale(inc_num , scale) , inc_den);
16218 ++ }
16219 ++ } else {
16220 ++ inc_num = ca->epsilon_num * rate +
16221 ++ ca->epsilon_den * cwnd_scaled * cwnd_scaled;
16222 ++ ca->mptcp_snd_cwnd_cnt += div64_u64(
16223 ++ mptcp_olia_scale(inc_num , scale) , inc_den);
16224 ++ }
16225 ++
16226 ++
16227 ++ if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) {
16228 ++ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
16229 ++ tp->snd_cwnd++;
16230 ++ ca->mptcp_snd_cwnd_cnt = 0;
16231 ++ } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) {
16232 ++ tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1);
16233 ++ ca->mptcp_snd_cwnd_cnt = 0;
16234 ++ }
16235 ++}
16236 ++
16237 ++static struct tcp_congestion_ops mptcp_olia = {
16238 ++ .init = mptcp_olia_init,
16239 ++ .ssthresh = tcp_reno_ssthresh,
16240 ++ .cong_avoid = mptcp_olia_cong_avoid,
16241 ++ .set_state = mptcp_olia_set_state,
16242 ++ .owner = THIS_MODULE,
16243 ++ .name = "olia",
16244 ++};
16245 ++
16246 ++static int __init mptcp_olia_register(void)
16247 ++{
16248 ++ BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE);
16249 ++ return tcp_register_congestion_control(&mptcp_olia);
16250 ++}
16251 ++
16252 ++static void __exit mptcp_olia_unregister(void)
16253 ++{
16254 ++ tcp_unregister_congestion_control(&mptcp_olia);
16255 ++}
16256 ++
16257 ++module_init(mptcp_olia_register);
16258 ++module_exit(mptcp_olia_unregister);
16259 ++
16260 ++MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec");
16261 ++MODULE_LICENSE("GPL");
16262 ++MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
16263 ++MODULE_VERSION("0.1");
16264 +diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c
16265 +new file mode 100644
16266 +index 000000000000..400ea254c078
16267 +--- /dev/null
16268 ++++ b/net/mptcp/mptcp_output.c
16269 +@@ -0,0 +1,1743 @@
16270 ++/*
16271 ++ * MPTCP implementation - Sending side
16272 ++ *
16273 ++ * Initial Design & Implementation:
16274 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
16275 ++ *
16276 ++ * Current Maintainer & Author:
16277 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
16278 ++ *
16279 ++ * Additional authors:
16280 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
16281 ++ * Gregory Detal <gregory.detal@×××××××××.be>
16282 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
16283 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
16284 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
16285 ++ * Andreas Ripke <ripke@××××××.eu>
16286 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
16287 ++ * Octavian Purdila <octavian.purdila@×××××.com>
16288 ++ * John Ronan <jronan@××××.org>
16289 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
16290 ++ * Brandon Heller <brandonh@××××××××.edu>
16291 ++ *
16292 ++ *
16293 ++ * This program is free software; you can redistribute it and/or
16294 ++ * modify it under the terms of the GNU General Public License
16295 ++ * as published by the Free Software Foundation; either version
16296 ++ * 2 of the License, or (at your option) any later version.
16297 ++ */
16298 ++
16299 ++#include <linux/kconfig.h>
16300 ++#include <linux/skbuff.h>
16301 ++#include <linux/tcp.h>
16302 ++
16303 ++#include <net/mptcp.h>
16304 ++#include <net/mptcp_v4.h>
16305 ++#include <net/mptcp_v6.h>
16306 ++#include <net/sock.h>
16307 ++
16308 ++static const int mptcp_dss_len = MPTCP_SUB_LEN_DSS_ALIGN +
16309 ++ MPTCP_SUB_LEN_ACK_ALIGN +
16310 ++ MPTCP_SUB_LEN_SEQ_ALIGN;
16311 ++
16312 ++static inline int mptcp_sub_len_remove_addr(u16 bitfield)
16313 ++{
16314 ++ unsigned int c;
16315 ++ for (c = 0; bitfield; c++)
16316 ++ bitfield &= bitfield - 1;
16317 ++ return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1;
16318 ++}
16319 ++
16320 ++int mptcp_sub_len_remove_addr_align(u16 bitfield)
16321 ++{
16322 ++ return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4);
16323 ++}
16324 ++EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align);
16325 ++
16326 ++/* get the data-seq and end-data-seq and store them again in the
16327 ++ * tcp_skb_cb
16328 ++ */
16329 ++static int mptcp_reconstruct_mapping(struct sk_buff *skb)
16330 ++{
16331 ++ const struct mp_dss *mpdss = (struct mp_dss *)TCP_SKB_CB(skb)->dss;
16332 ++ u32 *p32;
16333 ++ u16 *p16;
16334 ++
16335 ++ if (!mpdss->M)
16336 ++ return 1;
16337 ++
16338 ++ /* Move the pointer to the data-seq */
16339 ++ p32 = (u32 *)mpdss;
16340 ++ p32++;
16341 ++ if (mpdss->A) {
16342 ++ p32++;
16343 ++ if (mpdss->a)
16344 ++ p32++;
16345 ++ }
16346 ++
16347 ++ TCP_SKB_CB(skb)->seq = ntohl(*p32);
16348 ++
16349 ++ /* Get the data_len to calculate the end_data_seq */
16350 ++ p32++;
16351 ++ p32++;
16352 ++ p16 = (u16 *)p32;
16353 ++ TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq;
16354 ++
16355 ++ return 0;
16356 ++}
16357 ++
16358 ++static void mptcp_find_and_set_pathmask(const struct sock *meta_sk, struct sk_buff *skb)
16359 ++{
16360 ++ struct sk_buff *skb_it;
16361 ++
16362 ++ skb_it = tcp_write_queue_head(meta_sk);
16363 ++
16364 ++ tcp_for_write_queue_from(skb_it, meta_sk) {
16365 ++ if (skb_it == tcp_send_head(meta_sk))
16366 ++ break;
16367 ++
16368 ++ if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) {
16369 ++ TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask;
16370 ++ break;
16371 ++ }
16372 ++ }
16373 ++}
16374 ++
16375 ++/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are
16376 ++ * coming from the meta-retransmit-timer
16377 ++ */
16378 ++static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk,
16379 ++ struct sock *sk, int clone_it)
16380 ++{
16381 ++ struct sk_buff *skb, *skb1;
16382 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16383 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
16384 ++ u32 seq, end_seq;
16385 ++
16386 ++ if (clone_it) {
16387 ++		/* pskb_copy is necessary here, because the TCP/IP headers
16388 ++		 * will be changed when the skb is reinjected on another
16389 ++		 * subflow.
16390 ++ */
16391 ++ skb = pskb_copy_for_clone(orig_skb, GFP_ATOMIC);
16392 ++ } else {
16393 ++ __skb_unlink(orig_skb, &sk->sk_write_queue);
16394 ++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
16395 ++ sk->sk_wmem_queued -= orig_skb->truesize;
16396 ++ sk_mem_uncharge(sk, orig_skb->truesize);
16397 ++ skb = orig_skb;
16398 ++ }
16399 ++ if (unlikely(!skb))
16400 ++ return;
16401 ++
16402 ++ if (sk && mptcp_reconstruct_mapping(skb)) {
16403 ++ __kfree_skb(skb);
16404 ++ return;
16405 ++ }
16406 ++
16407 ++ skb->sk = meta_sk;
16408 ++
16409 ++	/* If it has already reached the destination, we don't have to reinject it */
16410 ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
16411 ++ __kfree_skb(skb);
16412 ++ return;
16413 ++ }
16414 ++
16415 ++ /* Only reinject segments that are fully covered by the mapping */
16416 ++ if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) !=
16417 ++ TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
16418 ++ u32 seq = TCP_SKB_CB(skb)->seq;
16419 ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
16420 ++
16421 ++ __kfree_skb(skb);
16422 ++
16423 ++ /* Ok, now we have to look for the full mapping in the meta
16424 ++ * send-queue :S
16425 ++ */
16426 ++ tcp_for_write_queue(skb, meta_sk) {
16427 ++ /* Not yet at the mapping? */
16428 ++ if (before(TCP_SKB_CB(skb)->seq, seq))
16429 ++ continue;
16430 ++ /* We have passed by the mapping */
16431 ++ if (after(TCP_SKB_CB(skb)->end_seq, end_seq))
16432 ++ return;
16433 ++
16434 ++ __mptcp_reinject_data(skb, meta_sk, NULL, 1);
16435 ++ }
16436 ++ return;
16437 ++ }
16438 ++
16439 ++ /* Segment goes back to the MPTCP-layer. So, we need to zero the
16440 ++ * path_mask/dss.
16441 ++ */
16442 ++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
16443 ++
16444 ++ /* We need to find out the path-mask from the meta-write-queue
16445 ++ * to properly select a subflow.
16446 ++ */
16447 ++ mptcp_find_and_set_pathmask(meta_sk, skb);
16448 ++
16449 ++ /* If it's empty, just add */
16450 ++ if (skb_queue_empty(&mpcb->reinject_queue)) {
16451 ++ skb_queue_head(&mpcb->reinject_queue, skb);
16452 ++ return;
16453 ++ }
16454 ++
16455 ++	/* Find the place to insert the skb - or even 'drop' it, if the
16456 ++	 * data is already covered by other skb's in the reinject-queue.
16457 ++ *
16458 ++ * This is inspired by code from tcp_data_queue.
16459 ++ */
16460 ++
16461 ++ skb1 = skb_peek_tail(&mpcb->reinject_queue);
16462 ++ seq = TCP_SKB_CB(skb)->seq;
16463 ++ while (1) {
16464 ++ if (!after(TCP_SKB_CB(skb1)->seq, seq))
16465 ++ break;
16466 ++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) {
16467 ++ skb1 = NULL;
16468 ++ break;
16469 ++ }
16470 ++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
16471 ++ }
16472 ++
16473 ++	/* Does the skb overlap the previous one? */
16474 ++ end_seq = TCP_SKB_CB(skb)->end_seq;
16475 ++ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
16476 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
16477 ++ /* All the bits are present. Don't reinject */
16478 ++ __kfree_skb(skb);
16479 ++ return;
16480 ++ }
16481 ++ if (seq == TCP_SKB_CB(skb1)->seq) {
16482 ++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1))
16483 ++ skb1 = NULL;
16484 ++ else
16485 ++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
16486 ++ }
16487 ++ }
16488 ++ if (!skb1)
16489 ++ __skb_queue_head(&mpcb->reinject_queue, skb);
16490 ++ else
16491 ++ __skb_queue_after(&mpcb->reinject_queue, skb1, skb);
16492 ++
16493 ++	/* And remove segments that are fully covered by the new one. */
16494 ++ while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) {
16495 ++ skb1 = skb_queue_next(&mpcb->reinject_queue, skb);
16496 ++
16497 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
16498 ++ break;
16499 ++
16500 ++ __skb_unlink(skb1, &mpcb->reinject_queue);
16501 ++ __kfree_skb(skb1);
16502 ++ }
16503 ++ return;
16504 ++}
16505 ++
16506 ++/* Inserts data into the reinject queue */
16507 ++void mptcp_reinject_data(struct sock *sk, int clone_it)
16508 ++{
16509 ++ struct sk_buff *skb_it, *tmp;
16510 ++ struct tcp_sock *tp = tcp_sk(sk);
16511 ++ struct sock *meta_sk = tp->meta_sk;
16512 ++
16513 ++ /* It has already been closed - there is really no point in reinjecting */
16514 ++ if (meta_sk->sk_state == TCP_CLOSE)
16515 ++ return;
16516 ++
16517 ++ skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) {
16518 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it);
16519 ++		/* Subflow syn's and fin's are not reinjected.
16520 ++		 *
16521 ++		 * Neither are empty subflow-fins that carry a data-fin;
16522 ++		 * they are reinjected below (without the subflow-fin flag).
16523 ++ */
16524 ++ if (tcb->tcp_flags & TCPHDR_SYN ||
16525 ++ (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) ||
16526 ++ (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len))
16527 ++ continue;
16528 ++
16529 ++ __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it);
16530 ++ }
16531 ++
16532 ++ skb_it = tcp_write_queue_tail(meta_sk);
16533 ++ /* If sk has sent the empty data-fin, we have to reinject it too. */
16534 ++ if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 &&
16535 ++ TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) {
16536 ++ __mptcp_reinject_data(skb_it, meta_sk, NULL, 1);
16537 ++ }
16538 ++
16539 ++ mptcp_push_pending_frames(meta_sk);
16540 ++
16541 ++ tp->pf = 1;
16542 ++}
16543 ++EXPORT_SYMBOL(mptcp_reinject_data);
16544 ++
16545 ++static void mptcp_combine_dfin(const struct sk_buff *skb, const struct sock *meta_sk,
16546 ++ struct sock *subsk)
16547 ++{
16548 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16549 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
16550 ++ struct sock *sk_it;
16551 ++ int all_empty = 1, all_acked;
16552 ++
16553 ++ /* In infinite mapping we always try to combine */
16554 ++ if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) {
16555 ++ subsk->sk_shutdown |= SEND_SHUTDOWN;
16556 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
16557 ++ return;
16558 ++ }
16559 ++
16560 ++ /* Don't combine, if they didn't combine - otherwise we end up in
16561 ++ * TIME_WAIT, even if our app is smart enough to avoid it
16562 ++ */
16563 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
16564 ++ if (!mpcb->dfin_combined)
16565 ++ return;
16566 ++ }
16567 ++
16568 ++ /* If no other subflow has data to send, we can combine */
16569 ++ mptcp_for_each_sk(mpcb, sk_it) {
16570 ++ if (!mptcp_sk_can_send(sk_it))
16571 ++ continue;
16572 ++
16573 ++ if (!tcp_write_queue_empty(sk_it))
16574 ++ all_empty = 0;
16575 ++ }
16576 ++
16577 ++ /* If all data has been DATA_ACKed, we can combine.
16578 ++ * -1, because the data_fin consumed one byte
16579 ++ */
16580 ++ all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1));
16581 ++
16582 ++ if ((all_empty || all_acked) && tcp_close_state(subsk)) {
16583 ++ subsk->sk_shutdown |= SEND_SHUTDOWN;
16584 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
16585 ++ }
16586 ++}
16587 ++
16588 ++static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb,
16589 ++ __be32 *ptr)
16590 ++{
16591 ++ const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
16592 ++ __be32 *start = ptr;
16593 ++ __u16 data_len;
16594 ++
16595 ++ *ptr++ = htonl(tcb->seq); /* data_seq */
16596 ++
16597 ++ /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
16598 ++ if (mptcp_is_data_fin(skb) && skb->len == 0)
16599 ++ *ptr++ = 0; /* subseq */
16600 ++ else
16601 ++ *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */
16602 ++
16603 ++ if (tcb->mptcp_flags & MPTCPHDR_INF)
16604 ++ data_len = 0;
16605 ++ else
16606 ++ data_len = tcb->end_seq - tcb->seq;
16607 ++
16608 ++ if (tp->mpcb->dss_csum && data_len) {
16609 ++ __be16 *p16 = (__be16 *)ptr;
16610 ++ __be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb);
16611 ++ __wsum csum;
16612 ++
16613 ++ *ptr = htonl(((data_len) << 16) |
16614 ++ (TCPOPT_EOL << 8) |
16615 ++ (TCPOPT_EOL));
16616 ++ csum = csum_partial(ptr - 2, 12, skb->csum);
16617 ++ p16++;
16618 ++ *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
16619 ++ } else {
16620 ++ *ptr++ = htonl(((data_len) << 16) |
16621 ++ (TCPOPT_NOP << 8) |
16622 ++ (TCPOPT_NOP));
16623 ++ }
16624 ++
16625 ++ return ptr - start;
16626 ++}
16627 ++
16628 ++static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb,
16629 ++ __be32 *ptr)
16630 ++{
16631 ++ struct mp_dss *mdss = (struct mp_dss *)ptr;
16632 ++ __be32 *start = ptr;
16633 ++
16634 ++ mdss->kind = TCPOPT_MPTCP;
16635 ++ mdss->sub = MPTCP_SUB_DSS;
16636 ++ mdss->rsv1 = 0;
16637 ++ mdss->rsv2 = 0;
16638 ++ mdss->F = mptcp_is_data_fin(skb) ? 1 : 0;
16639 ++ mdss->m = 0;
16640 ++ mdss->M = mptcp_is_data_seq(skb) ? 1 : 0;
16641 ++ mdss->a = 0;
16642 ++ mdss->A = 1;
16643 ++ mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
16644 ++ ptr++;
16645 ++
16646 ++ *ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt);
16647 ++
16648 ++ return ptr - start;
16649 ++}
16650 ++
16651 ++/* RFC6824 states that once a particular subflow mapping has been sent
16652 ++ * out it must never be changed. However, packets may be split while
16653 ++ * they are in the retransmission queue (due to SACK or ACKs) and that
16654 ++ * arguably means that we would change the mapping (e.g. it splits it,
16655 ++ * or sends out a subset of the initial mapping).
16656 ++ *
16657 ++ * Furthermore, the skb checksum is not always preserved across splits
16658 ++ * (e.g. mptcp_fragment) which would mean that we need to recompute
16659 ++ * the DSS checksum in this case.
16660 ++ *
16661 ++ * To avoid this we save the initial DSS mapping which allows us to
16662 ++ * send the same DSS mapping even for fragmented retransmits.
16663 ++ */
16664 ++static void mptcp_save_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb)
16665 ++{
16666 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
16667 ++ __be32 *ptr = (__be32 *)tcb->dss;
16668 ++
16669 ++ tcb->mptcp_flags |= MPTCPHDR_SEQ;
16670 ++
16671 ++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
16672 ++ ptr += mptcp_write_dss_mapping(tp, skb, ptr);
16673 ++}
16674 ++
16675 ++/* Write the saved DSS mapping to the header */
16676 ++static int mptcp_write_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb,
16677 ++ __be32 *ptr)
16678 ++{
16679 ++ __be32 *start = ptr;
16680 ++
16681 ++ memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len);
16682 ++
16683 ++ /* update the data_ack */
16684 ++ start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt);
16685 ++
16686 ++ /* dss is in a union with inet_skb_parm and
16687 ++ * the IP layer expects zeroed IPCB fields.
16688 ++ */
16689 ++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
16690 ++
16691 ++ return mptcp_dss_len/sizeof(*ptr);
16692 ++}
16693 ++
16694 ++static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject)
16695 ++{
16696 ++ struct tcp_sock *tp = tcp_sk(sk);
16697 ++ const struct sock *meta_sk = mptcp_meta_sk(sk);
16698 ++ const struct mptcp_cb *mpcb = tp->mpcb;
16699 ++ struct tcp_skb_cb *tcb;
16700 ++ struct sk_buff *subskb = NULL;
16701 ++
16702 ++ if (!reinject)
16703 ++ TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ?
16704 ++ MPTCPHDR_SEQ64_INDEX : 0);
16705 ++
16706 ++ subskb = pskb_copy_for_clone(skb, GFP_ATOMIC);
16707 ++ if (!subskb)
16708 ++ return false;
16709 ++
16710 ++	/* At the subflow-level we need to call tcp_init_tso_segs again. We
16711 ++	 * force this by setting gso_segs to 0. It has been set to 1 prior to
16712 ++ * the call to mptcp_skb_entail.
16713 ++ */
16714 ++ skb_shinfo(subskb)->gso_segs = 0;
16715 ++
16716 ++ TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index);
16717 ++
16718 ++ if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) &&
16719 ++ skb->ip_summed == CHECKSUM_PARTIAL) {
16720 ++ subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0);
16721 ++ subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE;
16722 ++ }
16723 ++
16724 ++ tcb = TCP_SKB_CB(subskb);
16725 ++
16726 ++ if (tp->mpcb->send_infinite_mapping &&
16727 ++ !tp->mpcb->infinite_mapping_snd &&
16728 ++ !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) {
16729 ++ tp->mptcp->fully_established = 1;
16730 ++ tp->mpcb->infinite_mapping_snd = 1;
16731 ++ tp->mptcp->infinite_cutoff_seq = tp->write_seq;
16732 ++ tcb->mptcp_flags |= MPTCPHDR_INF;
16733 ++ }
16734 ++
16735 ++ if (mptcp_is_data_fin(subskb))
16736 ++ mptcp_combine_dfin(subskb, meta_sk, sk);
16737 ++
16738 ++ mptcp_save_dss_data_seq(tp, subskb);
16739 ++
16740 ++ tcb->seq = tp->write_seq;
16741 ++ tcb->sacked = 0; /* reset the sacked field: from the point of view
16742 ++ * of this subflow, we are sending a brand new
16743 ++ * segment
16744 ++ */
16745 ++ /* Take into account seg len */
16746 ++ tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0);
16747 ++ tcb->end_seq = tp->write_seq;
16748 ++
16749 ++ /* If it's a non-payload DATA_FIN (also no subflow-fin), the
16750 ++ * segment is not part of the subflow but on a meta-only-level.
16751 ++ */
16752 ++ if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) {
16753 ++ tcp_add_write_queue_tail(sk, subskb);
16754 ++ sk->sk_wmem_queued += subskb->truesize;
16755 ++ sk_mem_charge(sk, subskb->truesize);
16756 ++ } else {
16757 ++ int err;
16758 ++
16759 ++ /* Necessary to initialize for tcp_transmit_skb. mss of 1, as
16760 ++ * skb->len = 0 will force tso_segs to 1.
16761 ++ */
16762 ++ tcp_init_tso_segs(sk, subskb, 1);
16763 ++		/* Empty data-fins are sent immediately on the subflow */
16764 ++ TCP_SKB_CB(subskb)->when = tcp_time_stamp;
16765 ++ err = tcp_transmit_skb(sk, subskb, 1, GFP_ATOMIC);
16766 ++
16767 ++ /* It has not been queued, we can free it now. */
16768 ++ kfree_skb(subskb);
16769 ++
16770 ++ if (err)
16771 ++ return false;
16772 ++ }
16773 ++
16774 ++ if (!tp->mptcp->fully_established) {
16775 ++ tp->mptcp->second_packet = 1;
16776 ++ tp->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq;
16777 ++ }
16778 ++
16779 ++ return true;
16780 ++}
16781 ++
16782 ++/* Fragment an skb and update the mptcp meta-data. Due to reinject, we
16783 ++ * might need to undo some operations done by tcp_fragment.
16784 ++ */
16785 ++static int mptcp_fragment(struct sock *meta_sk, struct sk_buff *skb, u32 len,
16786 ++ gfp_t gfp, int reinject)
16787 ++{
16788 ++ int ret, diff, old_factor;
16789 ++ struct sk_buff *buff;
16790 ++ u8 flags;
16791 ++
16792 ++ if (skb_headlen(skb) < len)
16793 ++ diff = skb->len - len;
16794 ++ else
16795 ++ diff = skb->data_len;
16796 ++ old_factor = tcp_skb_pcount(skb);
16797 ++
16798 ++ /* The mss_now in tcp_fragment is used to set the tso_segs of the skb.
16799 ++ * At the MPTCP-level we do not care about the absolute value. All we
16800 ++ * care about is that it is set to 1 for accurate packets_out
16801 ++ * accounting.
16802 ++ */
16803 ++ ret = tcp_fragment(meta_sk, skb, len, UINT_MAX, gfp);
16804 ++ if (ret)
16805 ++ return ret;
16806 ++
16807 ++ buff = skb->next;
16808 ++
16809 ++ flags = TCP_SKB_CB(skb)->mptcp_flags;
16810 ++ TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
16811 ++ TCP_SKB_CB(buff)->mptcp_flags = flags;
16812 ++ TCP_SKB_CB(buff)->path_mask = TCP_SKB_CB(skb)->path_mask;
16813 ++
16814 ++ /* If reinject == 1, the buff will be added to the reinject
16815 ++ * queue, which is currently not part of memory accounting. So
16816 ++ * undo the changes done by tcp_fragment and update the
16817 ++ * reinject queue. Also, undo changes to the packet counters.
16818 ++ */
16819 ++ if (reinject == 1) {
16820 ++ int undo = buff->truesize - diff;
16821 ++ meta_sk->sk_wmem_queued -= undo;
16822 ++ sk_mem_uncharge(meta_sk, undo);
16823 ++
16824 ++ tcp_sk(meta_sk)->mpcb->reinject_queue.qlen++;
16825 ++ meta_sk->sk_write_queue.qlen--;
16826 ++
16827 ++ if (!before(tcp_sk(meta_sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
16828 ++ undo = old_factor - tcp_skb_pcount(skb) -
16829 ++ tcp_skb_pcount(buff);
16830 ++ if (undo)
16831 ++ tcp_adjust_pcount(meta_sk, skb, -undo);
16832 ++ }
16833 ++ }
16834 ++
16835 ++ return 0;
16836 ++}
16837 ++
16838 ++/* Inspired by tcp_write_wakeup */
16839 ++int mptcp_write_wakeup(struct sock *meta_sk)
16840 ++{
16841 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16842 ++ struct sk_buff *skb;
16843 ++ struct sock *sk_it;
16844 ++ int ans = 0;
16845 ++
16846 ++ if (meta_sk->sk_state == TCP_CLOSE)
16847 ++ return -1;
16848 ++
16849 ++ skb = tcp_send_head(meta_sk);
16850 ++ if (skb &&
16851 ++ before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) {
16852 ++ unsigned int mss;
16853 ++ unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq;
16854 ++ struct sock *subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, true);
16855 ++ struct tcp_sock *subtp;
16856 ++ if (!subsk)
16857 ++ goto window_probe;
16858 ++ subtp = tcp_sk(subsk);
16859 ++ mss = tcp_current_mss(subsk);
16860 ++
16861 ++ seg_size = min(tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq,
16862 ++ tcp_wnd_end(subtp) - subtp->write_seq);
16863 ++
16864 ++ if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
16865 ++ meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
16866 ++
16867 ++		/* We are probing the opening of a window,
16868 ++		 * but the window size is != 0;
16869 ++		 * it must have been a result of SWS avoidance (sender side).
16870 ++ */
16871 ++ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
16872 ++ skb->len > mss) {
16873 ++ seg_size = min(seg_size, mss);
16874 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
16875 ++ if (mptcp_fragment(meta_sk, skb, seg_size,
16876 ++ GFP_ATOMIC, 0))
16877 ++ return -1;
16878 ++ } else if (!tcp_skb_pcount(skb)) {
16879 ++ /* see mptcp_write_xmit on why we use UINT_MAX */
16880 ++ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX);
16881 ++ }
16882 ++
16883 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
16884 ++ if (!mptcp_skb_entail(subsk, skb, 0))
16885 ++ return -1;
16886 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
16887 ++
16888 ++ mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq -
16889 ++ TCP_SKB_CB(skb)->seq);
16890 ++ tcp_event_new_data_sent(meta_sk, skb);
16891 ++
16892 ++ __tcp_push_pending_frames(subsk, mss, TCP_NAGLE_PUSH);
16893 ++
16894 ++ return 0;
16895 ++ } else {
16896 ++window_probe:
16897 ++ if (between(meta_tp->snd_up, meta_tp->snd_una + 1,
16898 ++ meta_tp->snd_una + 0xFFFF)) {
16899 ++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
16900 ++ if (mptcp_sk_can_send_ack(sk_it))
16901 ++ tcp_xmit_probe_skb(sk_it, 1);
16902 ++ }
16903 ++ }
16904 ++
16905 ++ /* At least one of the tcp_xmit_probe_skb's has to succeed */
16906 ++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
16907 ++ int ret;
16908 ++
16909 ++ if (!mptcp_sk_can_send_ack(sk_it))
16910 ++ continue;
16911 ++
16912 ++ ret = tcp_xmit_probe_skb(sk_it, 0);
16913 ++ if (unlikely(ret > 0))
16914 ++ ans = ret;
16915 ++ }
16916 ++ return ans;
16917 ++ }
16918 ++}
16919 ++
16920 ++bool mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle,
16921 ++ int push_one, gfp_t gfp)
16922 ++{
16923 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp;
16924 ++ struct sock *subsk = NULL;
16925 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
16926 ++ struct sk_buff *skb;
16927 ++ unsigned int sent_pkts;
16928 ++ int reinject = 0;
16929 ++ unsigned int sublimit;
16930 ++
16931 ++ sent_pkts = 0;
16932 ++
16933 ++ while ((skb = mpcb->sched_ops->next_segment(meta_sk, &reinject, &subsk,
16934 ++ &sublimit))) {
16935 ++ unsigned int limit;
16936 ++
16937 ++ subtp = tcp_sk(subsk);
16938 ++ mss_now = tcp_current_mss(subsk);
16939 ++
16940 ++ if (reinject == 1) {
16941 ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
16942 ++ /* Segment already reached the peer, take the next one */
16943 ++ __skb_unlink(skb, &mpcb->reinject_queue);
16944 ++ __kfree_skb(skb);
16945 ++ continue;
16946 ++ }
16947 ++ }
16948 ++
16949 ++ /* If the segment was cloned (e.g. a meta retransmission),
16950 ++ * the header must be expanded/copied so that there is no
16951 ++ * corruption of TSO information.
16952 ++ */
16953 ++ if (skb_unclone(skb, GFP_ATOMIC))
16954 ++ break;
16955 ++
16956 ++ if (unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now)))
16957 ++ break;
16958 ++
16959 ++ /* Force tso_segs to 1 by using UINT_MAX.
16960 ++ * We actually don't care about the exact number of segments
16961 ++ * emitted on the subflow. We need just to set tso_segs, because
16962 ++ * we still need an accurate packets_out count in
16963 ++ * tcp_event_new_data_sent.
16964 ++ */
16965 ++ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX);
16966 ++
16967 ++		/* Check for nagle, regardless of tso_segs. If the segment is
16968 ++ * actually larger than mss_now (TSO segment), then
16969 ++ * tcp_nagle_check will have partial == false and always trigger
16970 ++ * the transmission.
16971 ++ * tcp_write_xmit has a TSO-level nagle check which is not
16972 ++ * subject to the MPTCP-level. It is based on the properties of
16973 ++ * the subflow, not the MPTCP-level.
16974 ++ */
16975 ++ if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now,
16976 ++ (tcp_skb_is_last(meta_sk, skb) ?
16977 ++ nonagle : TCP_NAGLE_PUSH))))
16978 ++ break;
16979 ++
16980 ++ limit = mss_now;
16981 ++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in
16982 ++ * tcp_write_xmit. Otherwise split-point would return 0.
16983 ++ */
16984 ++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
16985 ++ /* We limit the size of the skb so that it fits into the
16986 ++ * window. Call tcp_mss_split_point to avoid duplicating
16987 ++ * code.
16988 ++ * We really only care about fitting the skb into the
16989 ++ * window. That's why we use UINT_MAX. If the skb does
16990 ++ * not fit into the cwnd_quota or the NIC's max-segs
16991 ++ * limitation, it will be split by the subflow's
16992 ++ * tcp_write_xmit which does the appropriate call to
16993 ++ * tcp_mss_split_point.
16994 ++ */
16995 ++ limit = tcp_mss_split_point(meta_sk, skb, mss_now,
16996 ++ UINT_MAX / mss_now,
16997 ++ nonagle);
16998 ++
16999 ++ if (sublimit)
17000 ++ limit = min(limit, sublimit);
17001 ++
17002 ++ if (skb->len > limit &&
17003 ++ unlikely(mptcp_fragment(meta_sk, skb, limit, gfp, reinject)))
17004 ++ break;
17005 ++
17006 ++ if (!mptcp_skb_entail(subsk, skb, reinject))
17007 ++ break;
17008 ++ /* Nagle is handled at the MPTCP-layer, so
17009 ++ * always push on the subflow
17010 ++ */
17011 ++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
17012 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
17013 ++
17014 ++ if (!reinject) {
17015 ++ mptcp_check_sndseq_wrap(meta_tp,
17016 ++ TCP_SKB_CB(skb)->end_seq -
17017 ++ TCP_SKB_CB(skb)->seq);
17018 ++ tcp_event_new_data_sent(meta_sk, skb);
17019 ++ }
17020 ++
17021 ++ tcp_minshall_update(meta_tp, mss_now, skb);
17022 ++ sent_pkts += tcp_skb_pcount(skb);
17023 ++
17024 ++ if (reinject > 0) {
17025 ++ __skb_unlink(skb, &mpcb->reinject_queue);
17026 ++ kfree_skb(skb);
17027 ++ }
17028 ++
17029 ++ if (push_one)
17030 ++ break;
17031 ++ }
17032 ++
17033 ++ return !meta_tp->packets_out && tcp_send_head(meta_sk);
17034 ++}
17035 ++
17036 ++void mptcp_write_space(struct sock *sk)
17037 ++{
17038 ++ mptcp_push_pending_frames(mptcp_meta_sk(sk));
17039 ++}
17040 ++
17041 ++u32 __mptcp_select_window(struct sock *sk)
17042 ++{
17043 ++ struct inet_connection_sock *icsk = inet_csk(sk);
17044 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
17045 ++ int mss, free_space, full_space, window;
17046 ++
17047 ++ /* MSS for the peer's data. Previous versions used mss_clamp
17048 ++ * here. I don't know if the value based on our guesses
17049 ++ * of peer's MSS is better for the performance. It's more correct
17050 ++ * but may be worse for the performance because of rcv_mss
17051 ++ * fluctuations. --SAW 1998/11/1
17052 ++ */
17053 ++ mss = icsk->icsk_ack.rcv_mss;
17054 ++ free_space = tcp_space(sk);
17055 ++ full_space = min_t(int, meta_tp->window_clamp,
17056 ++ tcp_full_space(sk));
17057 ++
17058 ++ if (mss > full_space)
17059 ++ mss = full_space;
17060 ++
17061 ++ if (free_space < (full_space >> 1)) {
17062 ++ icsk->icsk_ack.quick = 0;
17063 ++
17064 ++ if (tcp_memory_pressure)
17065 ++ /* TODO this has to be adapted when we support different
17066 ++ * MSS's among the subflows.
17067 ++ */
17068 ++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh,
17069 ++ 4U * meta_tp->advmss);
17070 ++
17071 ++ if (free_space < mss)
17072 ++ return 0;
17073 ++ }
17074 ++
17075 ++ if (free_space > meta_tp->rcv_ssthresh)
17076 ++ free_space = meta_tp->rcv_ssthresh;
17077 ++
17078 ++ /* Don't do rounding if we are using window scaling, since the
17079 ++ * scaled window will not line up with the MSS boundary anyway.
17080 ++ */
17081 ++ window = meta_tp->rcv_wnd;
17082 ++ if (tp->rx_opt.rcv_wscale) {
17083 ++ window = free_space;
17084 ++
17085 ++ /* Advertise enough space so that it won't get scaled away.
17086 ++		 * Important case: prevent zero window announcement if
17087 ++ * 1<<rcv_wscale > mss.
17088 ++ */
17089 ++ if (((window >> tp->rx_opt.rcv_wscale) << tp->
17090 ++ rx_opt.rcv_wscale) != window)
17091 ++ window = (((window >> tp->rx_opt.rcv_wscale) + 1)
17092 ++ << tp->rx_opt.rcv_wscale);
17093 ++ } else {
17094 ++ /* Get the largest window that is a nice multiple of mss.
17095 ++ * Window clamp already applied above.
17096 ++ * If our current window offering is within 1 mss of the
17097 ++ * free space we just keep it. This prevents the divide
17098 ++ * and multiply from happening most of the time.
17099 ++ * We also don't do any window rounding when the free space
17100 ++ * is too small.
17101 ++ */
17102 ++ if (window <= free_space - mss || window > free_space)
17103 ++ window = (free_space / mss) * mss;
17104 ++ else if (mss == full_space &&
17105 ++ free_space > window + (full_space >> 1))
17106 ++ window = free_space;
17107 ++ }
17108 ++
17109 ++ return window;
17110 ++}
17111 ++
17112 ++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
17113 ++ unsigned *remaining)
17114 ++{
17115 ++ const struct tcp_sock *tp = tcp_sk(sk);
17116 ++
17117 ++ opts->options |= OPTION_MPTCP;
17118 ++ if (is_master_tp(tp)) {
17119 ++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN;
17120 ++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
17121 ++ opts->mp_capable.sender_key = tp->mptcp_loc_key;
17122 ++ opts->dss_csum = !!sysctl_mptcp_checksum;
17123 ++ } else {
17124 ++ const struct mptcp_cb *mpcb = tp->mpcb;
17125 ++
17126 ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN;
17127 ++ *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN;
17128 ++ opts->mp_join_syns.token = mpcb->mptcp_rem_token;
17129 ++ opts->mp_join_syns.low_prio = tp->mptcp->low_prio;
17130 ++ opts->addr_id = tp->mptcp->loc_id;
17131 ++ opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce;
17132 ++ }
17133 ++}
17134 ++
17135 ++void mptcp_synack_options(struct request_sock *req,
17136 ++ struct tcp_out_options *opts, unsigned *remaining)
17137 ++{
17138 ++ struct mptcp_request_sock *mtreq;
17139 ++ mtreq = mptcp_rsk(req);
17140 ++
17141 ++ opts->options |= OPTION_MPTCP;
17142 ++ /* MPCB not yet set - thus it's a new MPTCP-session */
17143 ++ if (!mtreq->is_sub) {
17144 ++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK;
17145 ++ opts->mp_capable.sender_key = mtreq->mptcp_loc_key;
17146 ++ opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum;
17147 ++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
17148 ++ } else {
17149 ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
17150 ++ opts->mp_join_syns.sender_truncated_mac =
17151 ++ mtreq->mptcp_hash_tmac;
17152 ++ opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce;
17153 ++ opts->mp_join_syns.low_prio = mtreq->low_prio;
17154 ++ opts->addr_id = mtreq->loc_id;
17155 ++ *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN;
17156 ++ }
17157 ++}
17158 ++
17159 ++void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
17160 ++ struct tcp_out_options *opts, unsigned *size)
17161 ++{
17162 ++ struct tcp_sock *tp = tcp_sk(sk);
17163 ++ struct mptcp_cb *mpcb = tp->mpcb;
17164 ++ const struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
17165 ++
17166 ++ /* We are coming from tcp_current_mss with the meta_sk as an argument.
17167 ++ * It does not make sense to check for the options, because when the
17168 ++ * segment gets sent, another subflow will be chosen.
17169 ++ */
17170 ++ if (!skb && is_meta_sk(sk))
17171 ++ return;
17172 ++
17173 ++ /* In fallback mp_fail-mode, we have to repeat it until the fallback
17174 ++ * has been done by the sender
17175 ++ */
17176 ++ if (unlikely(tp->mptcp->send_mp_fail)) {
17177 ++ opts->options |= OPTION_MPTCP;
17178 ++ opts->mptcp_options |= OPTION_MP_FAIL;
17179 ++ *size += MPTCP_SUB_LEN_FAIL;
17180 ++ return;
17181 ++ }
17182 ++
17183 ++ if (unlikely(tp->send_mp_fclose)) {
17184 ++ opts->options |= OPTION_MPTCP;
17185 ++ opts->mptcp_options |= OPTION_MP_FCLOSE;
17186 ++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
17187 ++ *size += MPTCP_SUB_LEN_FCLOSE_ALIGN;
17188 ++ return;
17189 ++ }
17190 ++
17191 ++ /* 1. If we are the sender of the infinite-mapping, we need the
17192 ++ * MPTCPHDR_INF-flag, because a retransmission of the
17193 ++	 * infinite-announcement still needs the mptcp-option.
17194 ++ *
17195 ++ * We need infinite_cutoff_seq, because retransmissions from before
17196 ++ * the infinite-cutoff-moment still need the MPTCP-signalling to stay
17197 ++ * consistent.
17198 ++ *
17199 ++ * 2. If we are the receiver of the infinite-mapping, we always skip
17200 ++ * mptcp-options, because acknowledgments from before the
17201 ++ * infinite-mapping point have already been sent out.
17202 ++ *
17203 ++ * I know, the whole infinite-mapping stuff is ugly...
17204 ++ *
17205 ++ * TODO: Handle wrapped data-sequence numbers
17206 ++ * (even if it's very unlikely)
17207 ++ */
17208 ++ if (unlikely(mpcb->infinite_mapping_snd) &&
17209 ++ ((mpcb->send_infinite_mapping && tcb &&
17210 ++ mptcp_is_data_seq(skb) &&
17211 ++ !(tcb->mptcp_flags & MPTCPHDR_INF) &&
17212 ++ !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) ||
17213 ++ !mpcb->send_infinite_mapping))
17214 ++ return;
17215 ++
17216 ++ if (unlikely(tp->mptcp->include_mpc)) {
17217 ++ opts->options |= OPTION_MPTCP;
17218 ++ opts->mptcp_options |= OPTION_MP_CAPABLE |
17219 ++ OPTION_TYPE_ACK;
17220 ++ *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN;
17221 ++ opts->mp_capable.sender_key = mpcb->mptcp_loc_key;
17222 ++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
17223 ++ opts->dss_csum = mpcb->dss_csum;
17224 ++
17225 ++ if (skb)
17226 ++ tp->mptcp->include_mpc = 0;
17227 ++ }
17228 ++ if (unlikely(tp->mptcp->pre_established)) {
17229 ++ opts->options |= OPTION_MPTCP;
17230 ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK;
17231 ++ *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN;
17232 ++ }
17233 ++
17234 ++ if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) {
17235 ++ opts->options |= OPTION_MPTCP;
17236 ++ opts->mptcp_options |= OPTION_DATA_ACK;
17237 ++ /* If !skb, we come from tcp_current_mss and thus we always
17238 ++ * assume that the DSS-option will be set for the data-packet.
17239 ++ */
17240 ++ if (skb && !mptcp_is_data_seq(skb)) {
17241 ++ *size += MPTCP_SUB_LEN_ACK_ALIGN;
17242 ++ } else {
17243 ++			/* It doesn't matter whether the csum is included or not. It will be
17244 ++ * either 10 or 12, and thus aligned = 12
17245 ++ */
17246 ++ *size += MPTCP_SUB_LEN_ACK_ALIGN +
17247 ++ MPTCP_SUB_LEN_SEQ_ALIGN;
17248 ++ }
17249 ++
17250 ++ *size += MPTCP_SUB_LEN_DSS_ALIGN;
17251 ++ }
17252 ++
17253 ++ if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal)
17254 ++ mpcb->pm_ops->addr_signal(sk, size, opts, skb);
17255 ++
17256 ++ if (unlikely(tp->mptcp->send_mp_prio) &&
17257 ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) {
17258 ++ opts->options |= OPTION_MPTCP;
17259 ++ opts->mptcp_options |= OPTION_MP_PRIO;
17260 ++ if (skb)
17261 ++ tp->mptcp->send_mp_prio = 0;
17262 ++ *size += MPTCP_SUB_LEN_PRIO_ALIGN;
17263 ++ }
17264 ++
17265 ++ return;
17266 ++}
17267 ++
17268 ++u16 mptcp_select_window(struct sock *sk)
17269 ++{
17270 ++ u16 new_win = tcp_select_window(sk);
17271 ++ struct tcp_sock *tp = tcp_sk(sk);
17272 ++ struct tcp_sock *meta_tp = mptcp_meta_tp(tp);
17273 ++
17274 ++ meta_tp->rcv_wnd = tp->rcv_wnd;
17275 ++ meta_tp->rcv_wup = meta_tp->rcv_nxt;
17276 ++
17277 ++ return new_win;
17278 ++}
17279 ++
17280 ++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
17281 ++ const struct tcp_out_options *opts,
17282 ++ struct sk_buff *skb)
17283 ++{
17284 ++ if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) {
17285 ++ struct mp_capable *mpc = (struct mp_capable *)ptr;
17286 ++
17287 ++ mpc->kind = TCPOPT_MPTCP;
17288 ++
17289 ++ if ((OPTION_TYPE_SYN & opts->mptcp_options) ||
17290 ++ (OPTION_TYPE_SYNACK & opts->mptcp_options)) {
17291 ++ mpc->sender_key = opts->mp_capable.sender_key;
17292 ++ mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
17293 ++ ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
17294 ++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
17295 ++ mpc->sender_key = opts->mp_capable.sender_key;
17296 ++ mpc->receiver_key = opts->mp_capable.receiver_key;
17297 ++ mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK;
17298 ++ ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2;
17299 ++ }
17300 ++
17301 ++ mpc->sub = MPTCP_SUB_CAPABLE;
17302 ++ mpc->ver = 0;
17303 ++ mpc->a = opts->dss_csum;
17304 ++ mpc->b = 0;
17305 ++ mpc->rsv = 0;
17306 ++ mpc->h = 1;
17307 ++ }
17308 ++
17309 ++ if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) {
17310 ++ struct mp_join *mpj = (struct mp_join *)ptr;
17311 ++
17312 ++ mpj->kind = TCPOPT_MPTCP;
17313 ++ mpj->sub = MPTCP_SUB_JOIN;
17314 ++ mpj->rsv = 0;
17315 ++
17316 ++ if (OPTION_TYPE_SYN & opts->mptcp_options) {
17317 ++ mpj->len = MPTCP_SUB_LEN_JOIN_SYN;
17318 ++ mpj->u.syn.token = opts->mp_join_syns.token;
17319 ++ mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce;
17320 ++ mpj->b = opts->mp_join_syns.low_prio;
17321 ++ mpj->addr_id = opts->addr_id;
17322 ++ ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2;
17323 ++ } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) {
17324 ++ mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK;
17325 ++ mpj->u.synack.mac =
17326 ++ opts->mp_join_syns.sender_truncated_mac;
17327 ++ mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce;
17328 ++ mpj->b = opts->mp_join_syns.low_prio;
17329 ++ mpj->addr_id = opts->addr_id;
17330 ++ ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2;
17331 ++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
17332 ++ mpj->len = MPTCP_SUB_LEN_JOIN_ACK;
17333 ++ mpj->addr_id = 0; /* addr_id is rsv (RFC 6824, p. 21) */
17334 ++ memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20);
17335 ++ ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2;
17336 ++ }
17337 ++ }
17338 ++ if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) {
17339 ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
17340 ++
17341 ++ mpadd->kind = TCPOPT_MPTCP;
17342 ++ if (opts->add_addr_v4) {
17343 ++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4;
17344 ++ mpadd->sub = MPTCP_SUB_ADD_ADDR;
17345 ++ mpadd->ipver = 4;
17346 ++ mpadd->addr_id = opts->add_addr4.addr_id;
17347 ++ mpadd->u.v4.addr = opts->add_addr4.addr;
17348 ++ ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2;
17349 ++ } else if (opts->add_addr_v6) {
17350 ++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6;
17351 ++ mpadd->sub = MPTCP_SUB_ADD_ADDR;
17352 ++ mpadd->ipver = 6;
17353 ++ mpadd->addr_id = opts->add_addr6.addr_id;
17354 ++ memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr,
17355 ++ sizeof(mpadd->u.v6.addr));
17356 ++ ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2;
17357 ++ }
17358 ++ }
17359 ++ if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) {
17360 ++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
17361 ++ u8 *addrs_id;
17362 ++ int id, len, len_align;
17363 ++
17364 ++ len = mptcp_sub_len_remove_addr(opts->remove_addrs);
17365 ++ len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs);
17366 ++
17367 ++ mprem->kind = TCPOPT_MPTCP;
17368 ++ mprem->len = len;
17369 ++ mprem->sub = MPTCP_SUB_REMOVE_ADDR;
17370 ++ mprem->rsv = 0;
17371 ++ addrs_id = &mprem->addrs_id;
17372 ++
17373 ++ mptcp_for_each_bit_set(opts->remove_addrs, id)
17374 ++ *(addrs_id++) = id;
17375 ++
17376 ++ /* Fill the rest with NOP's */
17377 ++ if (len_align > len) {
17378 ++ int i;
17379 ++ for (i = 0; i < len_align - len; i++)
17380 ++ *(addrs_id++) = TCPOPT_NOP;
17381 ++ }
17382 ++
17383 ++ ptr += len_align >> 2;
17384 ++ }
17385 ++ if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) {
17386 ++ struct mp_fail *mpfail = (struct mp_fail *)ptr;
17387 ++
17388 ++ mpfail->kind = TCPOPT_MPTCP;
17389 ++ mpfail->len = MPTCP_SUB_LEN_FAIL;
17390 ++ mpfail->sub = MPTCP_SUB_FAIL;
17391 ++ mpfail->rsv1 = 0;
17392 ++ mpfail->rsv2 = 0;
17393 ++ mpfail->data_seq = htonll(tp->mpcb->csum_cutoff_seq);
17394 ++
17395 ++ ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2;
17396 ++ }
17397 ++ if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) {
17398 ++ struct mp_fclose *mpfclose = (struct mp_fclose *)ptr;
17399 ++
17400 ++ mpfclose->kind = TCPOPT_MPTCP;
17401 ++ mpfclose->len = MPTCP_SUB_LEN_FCLOSE;
17402 ++ mpfclose->sub = MPTCP_SUB_FCLOSE;
17403 ++ mpfclose->rsv1 = 0;
17404 ++ mpfclose->rsv2 = 0;
17405 ++ mpfclose->key = opts->mp_capable.receiver_key;
17406 ++
17407 ++ ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2;
17408 ++ }
17409 ++
17410 ++ if (OPTION_DATA_ACK & opts->mptcp_options) {
17411 ++ if (!mptcp_is_data_seq(skb))
17412 ++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
17413 ++ else
17414 ++ ptr += mptcp_write_dss_data_seq(tp, skb, ptr);
17415 ++ }
17416 ++ if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) {
17417 ++ struct mp_prio *mpprio = (struct mp_prio *)ptr;
17418 ++
17419 ++ mpprio->kind = TCPOPT_MPTCP;
17420 ++ mpprio->len = MPTCP_SUB_LEN_PRIO;
17421 ++ mpprio->sub = MPTCP_SUB_PRIO;
17422 ++ mpprio->rsv = 0;
17423 ++ mpprio->b = tp->mptcp->low_prio;
17424 ++ mpprio->addr_id = TCPOPT_NOP;
17425 ++
17426 ++ ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2;
17427 ++ }
17428 ++}
17429 ++
17430 ++/* Sends the datafin */
17431 ++void mptcp_send_fin(struct sock *meta_sk)
17432 ++{
17433 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17434 ++ struct sk_buff *skb = tcp_write_queue_tail(meta_sk);
17435 ++ int mss_now;
17436 ++
17437 ++ if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
17438 ++ meta_tp->mpcb->passive_close = 1;
17439 ++
17440 ++ /* Optimization, tack on the FIN if we have a queue of
17441 ++ * unsent frames. But be careful about outgoing SACKS
17442 ++ * and IP options.
17443 ++ */
17444 ++ mss_now = mptcp_current_mss(meta_sk);
17445 ++
17446 ++ if (tcp_send_head(meta_sk) != NULL) {
17447 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
17448 ++ TCP_SKB_CB(skb)->end_seq++;
17449 ++ meta_tp->write_seq++;
17450 ++ } else {
17451 ++ /* Socket is locked, keep trying until memory is available. */
17452 ++ for (;;) {
17453 ++ skb = alloc_skb_fclone(MAX_TCP_HEADER,
17454 ++ meta_sk->sk_allocation);
17455 ++ if (skb)
17456 ++ break;
17457 ++ yield();
17458 ++ }
17459 ++ /* Reserve space for headers and prepare control bits. */
17460 ++ skb_reserve(skb, MAX_TCP_HEADER);
17461 ++
17462 ++ tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK);
17463 ++ TCP_SKB_CB(skb)->end_seq++;
17464 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
17465 ++ tcp_queue_skb(meta_sk, skb);
17466 ++ }
17467 ++ __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF);
17468 ++}
17469 ++
17470 ++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority)
17471 ++{
17472 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17473 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
17474 ++ struct sock *sk = NULL, *sk_it = NULL, *tmpsk;
17475 ++
17476 ++ if (!mpcb->cnt_subflows)
17477 ++ return;
17478 ++
17479 ++ WARN_ON(meta_tp->send_mp_fclose);
17480 ++
17481 ++ /* First - select a socket */
17482 ++ sk = mptcp_select_ack_sock(meta_sk);
17483 ++
17484 ++ /* May happen if no subflow is in an appropriate state */
17485 ++ if (!sk)
17486 ++ return;
17487 ++
17488 ++ /* We are in infinite mode - just send a reset */
17489 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) {
17490 ++ sk->sk_err = ECONNRESET;
17491 ++ if (tcp_need_reset(sk->sk_state))
17492 ++ tcp_send_active_reset(sk, priority);
17493 ++ mptcp_sub_force_close(sk);
17494 ++ return;
17495 ++ }
17496 ++
17497 ++
17498 ++ tcp_sk(sk)->send_mp_fclose = 1;
17499 ++	/* Reset all other subflows */
17500 ++
17501 ++ /* tcp_done must be handled with bh disabled */
17502 ++ if (!in_serving_softirq())
17503 ++ local_bh_disable();
17504 ++
17505 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
17506 ++ if (tcp_sk(sk_it)->send_mp_fclose)
17507 ++ continue;
17508 ++
17509 ++ sk_it->sk_err = ECONNRESET;
17510 ++ if (tcp_need_reset(sk_it->sk_state))
17511 ++ tcp_send_active_reset(sk_it, GFP_ATOMIC);
17512 ++ mptcp_sub_force_close(sk_it);
17513 ++ }
17514 ++
17515 ++ if (!in_serving_softirq())
17516 ++ local_bh_enable();
17517 ++
17518 ++ tcp_send_ack(sk);
17519 ++ inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto);
17520 ++
17521 ++ meta_tp->send_mp_fclose = 1;
17522 ++}
17523 ++
17524 ++static void mptcp_ack_retransmit_timer(struct sock *sk)
17525 ++{
17526 ++ struct sk_buff *skb;
17527 ++ struct tcp_sock *tp = tcp_sk(sk);
17528 ++ struct inet_connection_sock *icsk = inet_csk(sk);
17529 ++
17530 ++ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
17531 ++ goto out; /* Routing failure or similar */
17532 ++
17533 ++ if (!tp->retrans_stamp)
17534 ++ tp->retrans_stamp = tcp_time_stamp ? : 1;
17535 ++
17536 ++ if (tcp_write_timeout(sk)) {
17537 ++ tp->mptcp->pre_established = 0;
17538 ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
17539 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
17540 ++ goto out;
17541 ++ }
17542 ++
17543 ++ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
17544 ++ if (skb == NULL) {
17545 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
17546 ++ jiffies + icsk->icsk_rto);
17547 ++ return;
17548 ++ }
17549 ++
17550 ++ /* Reserve space for headers and prepare control bits */
17551 ++ skb_reserve(skb, MAX_TCP_HEADER);
17552 ++ tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK);
17553 ++
17554 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
17555 ++ if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) {
17556 ++ /* Retransmission failed because of local congestion,
17557 ++ * do not backoff.
17558 ++ */
17559 ++ if (!icsk->icsk_retransmits)
17560 ++ icsk->icsk_retransmits = 1;
17561 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
17562 ++ jiffies + icsk->icsk_rto);
17563 ++ return;
17564 ++ }
17565 ++
17566 ++
17567 ++ icsk->icsk_retransmits++;
17568 ++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
17569 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
17570 ++ jiffies + icsk->icsk_rto);
17571 ++ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
17572 ++ __sk_dst_reset(sk);
17573 ++
17574 ++out:;
17575 ++}
17576 ++
17577 ++void mptcp_ack_handler(unsigned long data)
17578 ++{
17579 ++ struct sock *sk = (struct sock *)data;
17580 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
17581 ++
17582 ++ bh_lock_sock(meta_sk);
17583 ++ if (sock_owned_by_user(meta_sk)) {
17584 ++ /* Try again later */
17585 ++ sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer,
17586 ++ jiffies + (HZ / 20));
17587 ++ goto out_unlock;
17588 ++ }
17589 ++
17590 ++ if (sk->sk_state == TCP_CLOSE)
17591 ++ goto out_unlock;
17592 ++ if (!tcp_sk(sk)->mptcp->pre_established)
17593 ++ goto out_unlock;
17594 ++
17595 ++ mptcp_ack_retransmit_timer(sk);
17596 ++
17597 ++ sk_mem_reclaim(sk);
17598 ++
17599 ++out_unlock:
17600 ++ bh_unlock_sock(meta_sk);
17601 ++ sock_put(sk);
17602 ++}
17603 ++
17604 ++/* Similar to tcp_retransmit_skb
17605 ++ *
17606 ++ * The diff is that we handle the retransmission-stats (retrans_stamp) at the
17607 ++ * meta-level.
17608 ++ */
17609 ++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb)
17610 ++{
17611 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17612 ++ struct sock *subsk;
17613 ++ unsigned int limit, mss_now;
17614 ++ int err = -1;
17615 ++
17616 ++	/* Do not send more than we queued. 1/4 is reserved for possible
17617 ++ * copying overhead: fragmentation, tunneling, mangling etc.
17618 ++ *
17619 ++ * This is a meta-retransmission thus we check on the meta-socket.
17620 ++ */
17621 ++ if (atomic_read(&meta_sk->sk_wmem_alloc) >
17622 ++ min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) {
17623 ++ return -EAGAIN;
17624 ++ }
17625 ++
17626 ++ /* We need to make sure that the retransmitted segment can be sent on a
17627 ++ * subflow right now. If it is too big, it needs to be fragmented.
17628 ++ */
17629 ++ subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, false);
17630 ++ if (!subsk) {
17631 ++ /* We want to increase icsk_retransmits, thus return 0, so that
17632 ++ * mptcp_retransmit_timer enters the desired branch.
17633 ++ */
17634 ++ err = 0;
17635 ++ goto failed;
17636 ++ }
17637 ++ mss_now = tcp_current_mss(subsk);
17638 ++
17639 ++ /* If the segment was cloned (e.g. a meta retransmission), the header
17640 ++ * must be expanded/copied so that there is no corruption of TSO
17641 ++ * information.
17642 ++ */
17643 ++ if (skb_unclone(skb, GFP_ATOMIC)) {
17644 ++ err = -ENOMEM;
17645 ++ goto failed;
17646 ++ }
17647 ++
17648 ++ /* Must have been set by mptcp_write_xmit before */
17649 ++ BUG_ON(!tcp_skb_pcount(skb));
17650 ++
17651 ++ limit = mss_now;
17652 ++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in
17653 ++ * tcp_write_xmit. Otherwise split-point would return 0.
17654 ++ */
17655 ++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
17656 ++ limit = tcp_mss_split_point(meta_sk, skb, mss_now,
17657 ++ UINT_MAX / mss_now,
17658 ++ TCP_NAGLE_OFF);
17659 ++
17660 ++ if (skb->len > limit &&
17661 ++ unlikely(mptcp_fragment(meta_sk, skb, limit,
17662 ++ GFP_ATOMIC, 0)))
17663 ++ goto failed;
17664 ++
17665 ++ if (!mptcp_skb_entail(subsk, skb, -1))
17666 ++ goto failed;
17667 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
17668 ++
17669 ++ /* Update global TCP statistics. */
17670 ++ TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS);
17671 ++
17672 ++ /* Diff to tcp_retransmit_skb */
17673 ++
17674 ++ /* Save stamp of the first retransmit. */
17675 ++ if (!meta_tp->retrans_stamp)
17676 ++ meta_tp->retrans_stamp = TCP_SKB_CB(skb)->when;
17677 ++
17678 ++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
17679 ++
17680 ++ return 0;
17681 ++
17682 ++failed:
17683 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPRETRANSFAIL);
17684 ++ return err;
17685 ++}
17686 ++
17687 ++/* Similar to tcp_retransmit_timer
17688 ++ *
17689 ++ * The diff is that we have to handle retransmissions of the FAST_CLOSE-message
17690 ++ * and that we don't have an srtt estimation at the meta-level.
17691 ++ */
17692 ++void mptcp_retransmit_timer(struct sock *meta_sk)
17693 ++{
17694 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17695 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
17696 ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
17697 ++ int err;
17698 ++
17699 ++ /* In fallback, retransmission is handled at the subflow-level */
17700 ++ if (!meta_tp->packets_out || mpcb->infinite_mapping_snd ||
17701 ++ mpcb->send_infinite_mapping)
17702 ++ return;
17703 ++
17704 ++ WARN_ON(tcp_write_queue_empty(meta_sk));
17705 ++
17706 ++ if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) &&
17707 ++ !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
17708 ++ /* Receiver dastardly shrinks window. Our retransmits
17709 ++ * become zero probes, but we should not timeout this
17710 ++ * connection. If the socket is an orphan, time it out,
17711 ++ * we cannot allow such beasts to hang infinitely.
17712 ++ */
17713 ++ struct inet_sock *meta_inet = inet_sk(meta_sk);
17714 ++ if (meta_sk->sk_family == AF_INET) {
17715 ++ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
17716 ++ &meta_inet->inet_daddr,
17717 ++ ntohs(meta_inet->inet_dport),
17718 ++ meta_inet->inet_num, meta_tp->snd_una,
17719 ++ meta_tp->snd_nxt);
17720 ++ }
17721 ++#if IS_ENABLED(CONFIG_IPV6)
17722 ++ else if (meta_sk->sk_family == AF_INET6) {
17723 ++ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
17724 ++ &meta_sk->sk_v6_daddr,
17725 ++ ntohs(meta_inet->inet_dport),
17726 ++ meta_inet->inet_num, meta_tp->snd_una,
17727 ++ meta_tp->snd_nxt);
17728 ++ }
17729 ++#endif
17730 ++ if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) {
17731 ++ tcp_write_err(meta_sk);
17732 ++ return;
17733 ++ }
17734 ++
17735 ++ mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
17736 ++ goto out_reset_timer;
17737 ++ }
17738 ++
17739 ++ if (tcp_write_timeout(meta_sk))
17740 ++ return;
17741 ++
17742 ++ if (meta_icsk->icsk_retransmits == 0)
17743 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS);
17744 ++
17745 ++ meta_icsk->icsk_ca_state = TCP_CA_Loss;
17746 ++
17747 ++ err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
17748 ++ if (err > 0) {
17749 ++ /* Retransmission failed because of local congestion,
17750 ++ * do not backoff.
17751 ++ */
17752 ++ if (!meta_icsk->icsk_retransmits)
17753 ++ meta_icsk->icsk_retransmits = 1;
17754 ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
17755 ++ min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
17756 ++ TCP_RTO_MAX);
17757 ++ return;
17758 ++ }
17759 ++
17760 ++ /* Increase the timeout each time we retransmit. Note that
17761 ++ * we do not increase the rtt estimate. rto is initialized
17762 ++ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
17763 ++ * that doubling rto each time is the least we can get away with.
17764 ++ * In KA9Q, Karn uses this for the first few times, and then
17765 ++ * goes to quadratic. netBSD doubles, but only goes up to *64,
17766 ++ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
17767 ++ * defined in the protocol as the maximum possible RTT. I guess
17768 ++ * we'll have to use something other than TCP to talk to the
17769 ++ * University of Mars.
17770 ++ *
17771 ++ * PAWS allows us longer timeouts and large windows, so once
17772 ++ * implemented ftp to mars will work nicely. We will have to fix
17773 ++ * the 120 second clamps though!
17774 ++ */
17775 ++ meta_icsk->icsk_backoff++;
17776 ++ meta_icsk->icsk_retransmits++;
17777 ++
17778 ++out_reset_timer:
17779 ++ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
17780 ++ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
17781 ++ * might be increased if the stream oscillates between thin and thick,
17782 ++ * thus the old value might already be too high compared to the value
17783 ++ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
17784 ++ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
17785 ++	 * exponential backoff behaviour, to avoid continuing to hammer
17786 ++ * linear-timeout retransmissions into a black hole
17787 ++ */
17788 ++ if (meta_sk->sk_state == TCP_ESTABLISHED &&
17789 ++ (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
17790 ++ tcp_stream_is_thin(meta_tp) &&
17791 ++ meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
17792 ++ meta_icsk->icsk_backoff = 0;
17793 ++ /* We cannot do the same as in tcp_write_timer because the
17794 ++ * srtt is not set here.
17795 ++ */
17796 ++ mptcp_set_rto(meta_sk);
17797 ++ } else {
17798 ++ /* Use normal (exponential) backoff */
17799 ++ meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX);
17800 ++ }
17801 ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX);
17802 ++
17803 ++ return;
17804 ++}
17805 ++
17806 ++/* Adjust the values to the MPTCP level for the initial window of new subflows */
17807 ++void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
17808 ++ __u32 *window_clamp, int wscale_ok,
17809 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
17810 ++ const struct sock *sk)
17811 ++{
17812 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
17813 ++
17814 ++ *window_clamp = mpcb->orig_window_clamp;
17815 ++ __space = tcp_win_from_space(mpcb->orig_sk_rcvbuf);
17816 ++
17817 ++ tcp_select_initial_window(__space, mss, rcv_wnd, window_clamp,
17818 ++ wscale_ok, rcv_wscale, init_rcv_wnd, sk);
17819 ++}
17820 ++
17821 ++static inline u64 mptcp_calc_rate(const struct sock *meta_sk, unsigned int mss,
17822 ++ unsigned int (*mss_cb)(struct sock *sk))
17823 ++{
17824 ++ struct sock *sk;
17825 ++ u64 rate = 0;
17826 ++
17827 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
17828 ++ struct tcp_sock *tp = tcp_sk(sk);
17829 ++ int this_mss;
17830 ++ u64 this_rate;
17831 ++
17832 ++ if (!mptcp_sk_can_send(sk))
17833 ++ continue;
17834 ++
17835 ++		/* Do not consider subflows without an RTT estimation yet,
17836 ++ * otherwise this_rate >>> rate.
17837 ++ */
17838 ++ if (unlikely(!tp->srtt_us))
17839 ++ continue;
17840 ++
17841 ++ this_mss = mss_cb(sk);
17842 ++
17843 ++		/* If this_mss is smaller than mss, a segment will be split in
17844 ++		 * two (or more) when pushed on this subflow. For example, with
17845 ++		 * mss = 1428 and this_mss = 1420, two segments are generated: a
17846 ++		 * 1420-byte and an 8-byte segment. The latter introduces a large
17847 ++		 * overhead, as a single data segment now uses 2 slots in the
17848 ++		 * congestion window, roughly halving the potential throughput of
17849 ++		 * this subflow. Indeed, 1428 bytes are sent in those two slots,
17850 ++		 * while 2840 could have been sent if mss were 1420, reducing the
17851 ++		 * throughput by a factor of 2840 / 1428.
17852 ++ *
17853 ++		 * The following algorithm takes this overhead into account
17854 ++ * when computing the potential throughput that MPTCP can
17855 ++ * achieve when generating mss-byte segments.
17856 ++ *
17857 ++		 * The formula is the following:
17858 ++ * \sum_{\forall sub} ratio * \frac{mss * cwnd_sub}{rtt_sub}
17859 ++ * Where ratio is computed as follows:
17860 ++ * \frac{mss}{\ceil{mss / mss_sub} * mss_sub}
17861 ++ *
17862 ++ * ratio gives the reduction factor of the theoretical
17863 ++ * throughput a subflow can achieve if MPTCP uses a specific
17864 ++ * MSS value.
17865 ++ */
17866 ++ this_rate = div64_u64((u64)mss * mss * (USEC_PER_SEC << 3) *
17867 ++ max(tp->snd_cwnd, tp->packets_out),
17868 ++ (u64)tp->srtt_us *
17869 ++ DIV_ROUND_UP(mss, this_mss) * this_mss);
17870 ++ rate += this_rate;
17871 ++ }
17872 ++
17873 ++ return rate;
17874 ++}
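
To make the ratio used above concrete with the numbers from the comment: for mss = 1428 and mss_sub = 1420, ceil(1428 / 1420) = 2, so

    ratio = 1428 / (2 * 1420) = 1428 / 2840 ≈ 0.503,

i.e. such a subflow is credited with only about half of its nominal mss * cwnd / rtt throughput, while a subflow whose mss_sub is at least mss keeps ratio = 1.
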
17875 ++
17876 ++static unsigned int __mptcp_current_mss(const struct sock *meta_sk,
17877 ++ unsigned int (*mss_cb)(struct sock *sk))
17878 ++{
17879 ++ unsigned int mss = 0;
17880 ++ u64 rate = 0;
17881 ++ struct sock *sk;
17882 ++
17883 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
17884 ++ int this_mss;
17885 ++ u64 this_rate;
17886 ++
17887 ++ if (!mptcp_sk_can_send(sk))
17888 ++ continue;
17889 ++
17890 ++ this_mss = mss_cb(sk);
17891 ++
17892 ++ /* Same mss values will produce the same throughput. */
17893 ++ if (this_mss == mss)
17894 ++ continue;
17895 ++
17896 ++		/* See whether using this mss value can theoretically
17897 ++		 * improve performance.
17898 ++ */
17899 ++ this_rate = mptcp_calc_rate(meta_sk, this_mss, mss_cb);
17900 ++ if (this_rate >= rate) {
17901 ++ mss = this_mss;
17902 ++ rate = this_rate;
17903 ++ }
17904 ++ }
17905 ++
17906 ++ return mss;
17907 ++}
17908 ++
17909 ++unsigned int mptcp_current_mss(struct sock *meta_sk)
17910 ++{
17911 ++ unsigned int mss = __mptcp_current_mss(meta_sk, tcp_current_mss);
17912 ++
17913 ++ /* If no subflow is available, we take a default-mss from the
17914 ++ * meta-socket.
17915 ++ */
17916 ++ return !mss ? tcp_current_mss(meta_sk) : mss;
17917 ++}
17918 ++
17919 ++static unsigned int mptcp_select_size_mss(struct sock *sk)
17920 ++{
17921 ++ return tcp_sk(sk)->mss_cache;
17922 ++}
17923 ++
17924 ++int mptcp_select_size(const struct sock *meta_sk, bool sg)
17925 ++{
17926 ++ unsigned int mss = __mptcp_current_mss(meta_sk, mptcp_select_size_mss);
17927 ++
17928 ++ if (sg) {
17929 ++ if (mptcp_sk_can_gso(meta_sk)) {
17930 ++ mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
17931 ++ } else {
17932 ++ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
17933 ++
17934 ++ if (mss >= pgbreak &&
17935 ++ mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
17936 ++ mss = pgbreak;
17937 ++ }
17938 ++ }
17939 ++
17940 ++ return !mss ? tcp_sk(meta_sk)->mss_cache : mss;
17941 ++}
17942 ++
17943 ++int mptcp_check_snd_buf(const struct tcp_sock *tp)
17944 ++{
17945 ++ const struct sock *sk;
17946 ++ u32 rtt_max = tp->srtt_us;
17947 ++ u64 bw_est;
17948 ++
17949 ++ if (!tp->srtt_us)
17950 ++ return tp->reordering + 1;
17951 ++
17952 ++ mptcp_for_each_sk(tp->mpcb, sk) {
17953 ++ if (!mptcp_sk_can_send(sk))
17954 ++ continue;
17955 ++
17956 ++ if (rtt_max < tcp_sk(sk)->srtt_us)
17957 ++ rtt_max = tcp_sk(sk)->srtt_us;
17958 ++ }
17959 ++
17960 ++ bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16,
17961 ++ (u64)tp->srtt_us);
17962 ++
17963 ++ return max_t(unsigned int, (u32)(bw_est >> 16),
17964 ++ tp->reordering + 1);
17965 ++}
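
As a rough illustration of mptcp_check_snd_buf (the numbers are hypothetical): with snd_cwnd = 10 and srtt_us = 40000 on this subflow, and a slowest subflow at srtt_us = 120000, bw_est >> 16 = 10 * 120000 / 40000 = 30, so the returned value sizes the send buffer for about 30 segments, enough to keep the connection busy for one RTT of the slowest path, and never for less than reordering + 1.
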
17966 ++
17967 ++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
17968 ++ int large_allowed)
17969 ++{
17970 ++ struct sock *sk;
17971 ++ u32 xmit_size_goal = 0;
17972 ++
17973 ++ if (large_allowed && mptcp_sk_can_gso(meta_sk)) {
17974 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
17975 ++ int this_size_goal;
17976 ++
17977 ++ if (!mptcp_sk_can_send(sk))
17978 ++ continue;
17979 ++
17980 ++ this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1);
17981 ++ if (this_size_goal > xmit_size_goal)
17982 ++ xmit_size_goal = this_size_goal;
17983 ++ }
17984 ++ }
17985 ++
17986 ++ return max(xmit_size_goal, mss_now);
17987 ++}
17988 ++
17989 ++/* Similar to tcp_trim_head - but we correctly copy the DSS-option */
17990 ++int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
17991 ++{
17992 ++ if (skb_cloned(skb)) {
17993 ++ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
17994 ++ return -ENOMEM;
17995 ++ }
17996 ++
17997 ++ __pskb_trim_head(skb, len);
17998 ++
17999 ++ TCP_SKB_CB(skb)->seq += len;
18000 ++ skb->ip_summed = CHECKSUM_PARTIAL;
18001 ++
18002 ++ skb->truesize -= len;
18003 ++ sk->sk_wmem_queued -= len;
18004 ++ sk_mem_uncharge(sk, len);
18005 ++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
18006 ++
18007 ++ /* Any change of skb->len requires recalculation of tso factor. */
18008 ++ if (tcp_skb_pcount(skb) > 1)
18009 ++ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
18010 ++
18011 ++ return 0;
18012 ++}
18013 +diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c
18014 +new file mode 100644
18015 +index 000000000000..9542f950729f
18016 +--- /dev/null
18017 ++++ b/net/mptcp/mptcp_pm.c
18018 +@@ -0,0 +1,169 @@
18019 ++/*
18020 ++ * MPTCP implementation - MPTCP-subflow-management
18021 ++ *
18022 ++ * Initial Design & Implementation:
18023 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
18024 ++ *
18025 ++ * Current Maintainer & Author:
18026 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
18027 ++ *
18028 ++ * Additional authors:
18029 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
18030 ++ * Gregory Detal <gregory.detal@×××××××××.be>
18031 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
18032 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
18033 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
18034 ++ * Andreas Ripke <ripke@××××××.eu>
18035 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
18036 ++ * Octavian Purdila <octavian.purdila@×××××.com>
18037 ++ * John Ronan <jronan@××××.org>
18038 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
18039 ++ * Brandon Heller <brandonh@××××××××.edu>
18040 ++ *
18041 ++ *
18042 ++ * This program is free software; you can redistribute it and/or
18043 ++ * modify it under the terms of the GNU General Public License
18044 ++ * as published by the Free Software Foundation; either version
18045 ++ * 2 of the License, or (at your option) any later version.
18046 ++ */
18047 ++
18048 ++
18049 ++#include <linux/module.h>
18050 ++#include <net/mptcp.h>
18051 ++
18052 ++static DEFINE_SPINLOCK(mptcp_pm_list_lock);
18053 ++static LIST_HEAD(mptcp_pm_list);
18054 ++
18055 ++static int mptcp_default_id(sa_family_t family, union inet_addr *addr,
18056 ++ struct net *net, bool *low_prio)
18057 ++{
18058 ++ return 0;
18059 ++}
18060 ++
18061 ++struct mptcp_pm_ops mptcp_pm_default = {
18062 ++ .get_local_id = mptcp_default_id, /* We do not care */
18063 ++ .name = "default",
18064 ++ .owner = THIS_MODULE,
18065 ++};
18066 ++
18067 ++static struct mptcp_pm_ops *mptcp_pm_find(const char *name)
18068 ++{
18069 ++ struct mptcp_pm_ops *e;
18070 ++
18071 ++ list_for_each_entry_rcu(e, &mptcp_pm_list, list) {
18072 ++ if (strcmp(e->name, name) == 0)
18073 ++ return e;
18074 ++ }
18075 ++
18076 ++ return NULL;
18077 ++}
18078 ++
18079 ++int mptcp_register_path_manager(struct mptcp_pm_ops *pm)
18080 ++{
18081 ++ int ret = 0;
18082 ++
18083 ++ if (!pm->get_local_id)
18084 ++ return -EINVAL;
18085 ++
18086 ++ spin_lock(&mptcp_pm_list_lock);
18087 ++ if (mptcp_pm_find(pm->name)) {
18088 ++ pr_notice("%s already registered\n", pm->name);
18089 ++ ret = -EEXIST;
18090 ++ } else {
18091 ++ list_add_tail_rcu(&pm->list, &mptcp_pm_list);
18092 ++ pr_info("%s registered\n", pm->name);
18093 ++ }
18094 ++ spin_unlock(&mptcp_pm_list_lock);
18095 ++
18096 ++ return ret;
18097 ++}
18098 ++EXPORT_SYMBOL_GPL(mptcp_register_path_manager);
18099 ++
18100 ++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm)
18101 ++{
18102 ++ spin_lock(&mptcp_pm_list_lock);
18103 ++ list_del_rcu(&pm->list);
18104 ++ spin_unlock(&mptcp_pm_list_lock);
18105 ++}
18106 ++EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager);
18107 ++
18108 ++void mptcp_get_default_path_manager(char *name)
18109 ++{
18110 ++ struct mptcp_pm_ops *pm;
18111 ++
18112 ++ BUG_ON(list_empty(&mptcp_pm_list));
18113 ++
18114 ++ rcu_read_lock();
18115 ++ pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list);
18116 ++ strncpy(name, pm->name, MPTCP_PM_NAME_MAX);
18117 ++ rcu_read_unlock();
18118 ++}
18119 ++
18120 ++int mptcp_set_default_path_manager(const char *name)
18121 ++{
18122 ++ struct mptcp_pm_ops *pm;
18123 ++ int ret = -ENOENT;
18124 ++
18125 ++ spin_lock(&mptcp_pm_list_lock);
18126 ++ pm = mptcp_pm_find(name);
18127 ++#ifdef CONFIG_MODULES
18128 ++ if (!pm && capable(CAP_NET_ADMIN)) {
18129 ++ spin_unlock(&mptcp_pm_list_lock);
18130 ++
18131 ++ request_module("mptcp_%s", name);
18132 ++ spin_lock(&mptcp_pm_list_lock);
18133 ++ pm = mptcp_pm_find(name);
18134 ++ }
18135 ++#endif
18136 ++
18137 ++ if (pm) {
18138 ++ list_move(&pm->list, &mptcp_pm_list);
18139 ++ ret = 0;
18140 ++ } else {
18141 ++ pr_info("%s is not available\n", name);
18142 ++ }
18143 ++ spin_unlock(&mptcp_pm_list_lock);
18144 ++
18145 ++ return ret;
18146 ++}
18147 ++
18148 ++void mptcp_init_path_manager(struct mptcp_cb *mpcb)
18149 ++{
18150 ++ struct mptcp_pm_ops *pm;
18151 ++
18152 ++ rcu_read_lock();
18153 ++ list_for_each_entry_rcu(pm, &mptcp_pm_list, list) {
18154 ++ if (try_module_get(pm->owner)) {
18155 ++ mpcb->pm_ops = pm;
18156 ++ break;
18157 ++ }
18158 ++ }
18159 ++ rcu_read_unlock();
18160 ++}
18161 ++
18162 ++/* Manage refcounts on socket close. */
18163 ++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb)
18164 ++{
18165 ++ module_put(mpcb->pm_ops->owner);
18166 ++}
18167 ++
18168 ++/* Fallback to the default path-manager. */
18169 ++void mptcp_fallback_default(struct mptcp_cb *mpcb)
18170 ++{
18171 ++ struct mptcp_pm_ops *pm;
18172 ++
18173 ++ mptcp_cleanup_path_manager(mpcb);
18174 ++ pm = mptcp_pm_find("default");
18175 ++
18176 ++ /* Cannot fail - it's the default module */
18177 ++ try_module_get(pm->owner);
18178 ++ mpcb->pm_ops = pm;
18179 ++}
18180 ++EXPORT_SYMBOL_GPL(mptcp_fallback_default);
18181 ++
18182 ++/* Set default value from kernel configuration at bootup */
18183 ++static int __init mptcp_path_manager_default(void)
18184 ++{
18185 ++ return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM);
18186 ++}
18187 ++late_initcall(mptcp_path_manager_default);
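
The registration API above is all a path manager needs. As a minimal, hypothetical sketch (not part of this patch; the "noop" names are made up), an out-of-tree module mirroring the built-in "default" manager could look like this:

#include <linux/module.h>
#include <net/mptcp.h>

/* Hypothetical path manager: like "default", it never announces
 * additional addresses or creates extra subflows.
 */
static int noop_get_local_id(sa_family_t family, union inet_addr *addr,
			     struct net *net, bool *low_prio)
{
	return 0;
}

static struct mptcp_pm_ops noop_pm = {
	.get_local_id	= noop_get_local_id,
	.name		= "noop",
	.owner		= THIS_MODULE,
};

static int __init noop_pm_init(void)
{
	return mptcp_register_path_manager(&noop_pm);
}

static void __exit noop_pm_exit(void)
{
	mptcp_unregister_path_manager(&noop_pm);
}

module_init(noop_pm_init);
module_exit(noop_pm_exit);
MODULE_LICENSE("GPL");

Once registered, it becomes selectable by name through mptcp_set_default_path_manager(), shown above, which also tries request_module("mptcp_noop") when the module is not yet loaded.
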
18188 +diff --git a/net/mptcp/mptcp_rr.c b/net/mptcp/mptcp_rr.c
18189 +new file mode 100644
18190 +index 000000000000..93278f684069
18191 +--- /dev/null
18192 ++++ b/net/mptcp/mptcp_rr.c
18193 +@@ -0,0 +1,301 @@
18194 ++/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
18195 ++
18196 ++#include <linux/module.h>
18197 ++#include <net/mptcp.h>
18198 ++
18199 ++static unsigned char num_segments __read_mostly = 1;
18200 ++module_param(num_segments, byte, 0644);
18201 ++MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst");
18202 ++
18203 ++static bool cwnd_limited __read_mostly = 1;
18204 ++module_param(cwnd_limited, bool, 0644);
18205 ++MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows");
18206 ++
18207 ++struct rrsched_priv {
18208 ++ unsigned char quota;
18209 ++};
18210 ++
18211 ++static struct rrsched_priv *rrsched_get_priv(const struct tcp_sock *tp)
18212 ++{
18213 ++ return (struct rrsched_priv *)&tp->mptcp->mptcp_sched[0];
18214 ++}
18215 ++
18216 ++/* Is the sub-socket sk available to send the skb? */
18217 ++static bool mptcp_rr_is_available(const struct sock *sk, const struct sk_buff *skb,
18218 ++ bool zero_wnd_test, bool cwnd_test)
18219 ++{
18220 ++ const struct tcp_sock *tp = tcp_sk(sk);
18221 ++ unsigned int space, in_flight;
18222 ++
18223 ++ /* Set of states for which we are allowed to send data */
18224 ++ if (!mptcp_sk_can_send(sk))
18225 ++ return false;
18226 ++
18227 ++ /* We do not send data on this subflow unless it is
18228 ++ * fully established, i.e. the 4th ack has been received.
18229 ++ */
18230 ++ if (tp->mptcp->pre_established)
18231 ++ return false;
18232 ++
18233 ++ if (tp->pf)
18234 ++ return false;
18235 ++
18236 ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
18237 ++ /* If SACK is disabled, and we got a loss, TCP does not exit
18238 ++ * the loss-state until something above high_seq has been acked.
18239 ++ * (see tcp_try_undo_recovery)
18240 ++ *
18241 ++ * high_seq is the snd_nxt at the moment of the RTO. As soon
18242 ++ * as we have an RTO, we won't push data on the subflow.
18243 ++ * Thus, snd_una can never go beyond high_seq.
18244 ++ */
18245 ++ if (!tcp_is_reno(tp))
18246 ++ return false;
18247 ++ else if (tp->snd_una != tp->high_seq)
18248 ++ return false;
18249 ++ }
18250 ++
18251 ++ if (!tp->mptcp->fully_established) {
18252 ++ /* Make sure that we send in-order data */
18253 ++ if (skb && tp->mptcp->second_packet &&
18254 ++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
18255 ++ return false;
18256 ++ }
18257 ++
18258 ++ if (!cwnd_test)
18259 ++ goto zero_wnd_test;
18260 ++
18261 ++ in_flight = tcp_packets_in_flight(tp);
18262 ++ /* Not even a single spot in the cwnd */
18263 ++ if (in_flight >= tp->snd_cwnd)
18264 ++ return false;
18265 ++
18266 ++ /* Now, check if what is queued in the subflow's send-queue
18267 ++ * already fills the cwnd.
18268 ++ */
18269 ++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
18270 ++
18271 ++ if (tp->write_seq - tp->snd_nxt > space)
18272 ++ return false;
18273 ++
18274 ++zero_wnd_test:
18275 ++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
18276 ++ return false;
18277 ++
18278 ++ return true;
18279 ++}
18280 ++
18281 ++/* Are we not allowed to reinject this skb on tp? */
18282 ++static int mptcp_rr_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
18283 ++{
18284 ++ /* If the skb has already been enqueued in this sk, try to find
18285 ++ * another one.
18286 ++ */
18287 ++ return skb &&
18288 ++ /* Has the skb already been enqueued into this subsocket? */
18289 ++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
18290 ++}
18291 ++
18292 ++/* We just look for any subflow that is available */
18293 ++static struct sock *rr_get_available_subflow(struct sock *meta_sk,
18294 ++ struct sk_buff *skb,
18295 ++ bool zero_wnd_test)
18296 ++{
18297 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18298 ++ struct sock *sk, *bestsk = NULL, *backupsk = NULL;
18299 ++
18300 ++ /* Answer data_fin on same subflow!!! */
18301 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
18302 ++ skb && mptcp_is_data_fin(skb)) {
18303 ++ mptcp_for_each_sk(mpcb, sk) {
18304 ++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
18305 ++ mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
18306 ++ return sk;
18307 ++ }
18308 ++ }
18309 ++
18310 ++ /* First, find the best subflow */
18311 ++ mptcp_for_each_sk(mpcb, sk) {
18312 ++ struct tcp_sock *tp = tcp_sk(sk);
18313 ++
18314 ++ if (!mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
18315 ++ continue;
18316 ++
18317 ++ if (mptcp_rr_dont_reinject_skb(tp, skb)) {
18318 ++ backupsk = sk;
18319 ++ continue;
18320 ++ }
18321 ++
18322 ++ bestsk = sk;
18323 ++ }
18324 ++
18325 ++ if (bestsk) {
18326 ++ sk = bestsk;
18327 ++ } else if (backupsk) {
18328 ++ /* It has been sent on all subflows once - let's give it a
18329 ++ * chance again by restarting its pathmask.
18330 ++ */
18331 ++ if (skb)
18332 ++ TCP_SKB_CB(skb)->path_mask = 0;
18333 ++ sk = backupsk;
18334 ++ }
18335 ++
18336 ++ return sk;
18337 ++}
18338 ++
18339 ++/* Returns the next segment to be sent from the mptcp meta-queue.
18340 ++ * (chooses the reinject queue if any segment is waiting in it, otherwise,
18341 ++ * chooses the normal write queue).
18342 ++ * Sets *@reinject to 1 if the returned segment comes from the
18343 ++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
18344 ++ * and sets it to -1 if it is a meta-level retransmission to optimize the
18345 ++ * receive-buffer.
18346 ++ */
18347 ++static struct sk_buff *__mptcp_rr_next_segment(const struct sock *meta_sk, int *reinject)
18348 ++{
18349 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18350 ++ struct sk_buff *skb = NULL;
18351 ++
18352 ++ *reinject = 0;
18353 ++
18354 ++ /* If we are in fallback-mode, just take from the meta-send-queue */
18355 ++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
18356 ++ return tcp_send_head(meta_sk);
18357 ++
18358 ++ skb = skb_peek(&mpcb->reinject_queue);
18359 ++
18360 ++ if (skb)
18361 ++ *reinject = 1;
18362 ++ else
18363 ++ skb = tcp_send_head(meta_sk);
18364 ++ return skb;
18365 ++}
18366 ++
18367 ++static struct sk_buff *mptcp_rr_next_segment(struct sock *meta_sk,
18368 ++ int *reinject,
18369 ++ struct sock **subsk,
18370 ++ unsigned int *limit)
18371 ++{
18372 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18373 ++ struct sock *sk_it, *choose_sk = NULL;
18374 ++ struct sk_buff *skb = __mptcp_rr_next_segment(meta_sk, reinject);
18375 ++ unsigned char split = num_segments;
18376 ++ unsigned char iter = 0, full_subs = 0;
18377 ++
18378 ++ /* As we set it, we have to reset it as well. */
18379 ++ *limit = 0;
18380 ++
18381 ++ if (!skb)
18382 ++ return NULL;
18383 ++
18384 ++ if (*reinject) {
18385 ++ *subsk = rr_get_available_subflow(meta_sk, skb, false);
18386 ++ if (!*subsk)
18387 ++ return NULL;
18388 ++
18389 ++ return skb;
18390 ++ }
18391 ++
18392 ++retry:
18393 ++
18394 ++	/* First, we look for a subflow that is currently being used */
18395 ++ mptcp_for_each_sk(mpcb, sk_it) {
18396 ++ struct tcp_sock *tp_it = tcp_sk(sk_it);
18397 ++ struct rrsched_priv *rsp = rrsched_get_priv(tp_it);
18398 ++
18399 ++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
18400 ++ continue;
18401 ++
18402 ++ iter++;
18403 ++
18404 ++ /* Is this subflow currently being used? */
18405 ++ if (rsp->quota > 0 && rsp->quota < num_segments) {
18406 ++ split = num_segments - rsp->quota;
18407 ++ choose_sk = sk_it;
18408 ++ goto found;
18409 ++ }
18410 ++
18411 ++ /* Or, it's totally unused */
18412 ++ if (!rsp->quota) {
18413 ++ split = num_segments;
18414 ++ choose_sk = sk_it;
18415 ++ }
18416 ++
18417 ++ /* Or, it must then be fully used */
18418 ++ if (rsp->quota == num_segments)
18419 ++ full_subs++;
18420 ++ }
18421 ++
18422 ++ /* All considered subflows have a full quota, and we considered at
18423 ++ * least one.
18424 ++ */
18425 ++ if (iter && iter == full_subs) {
18426 ++		/* So, we restart this round by setting the quota to 0 and
18427 ++		 * retrying to find a subflow.
18428 ++ */
18429 ++ mptcp_for_each_sk(mpcb, sk_it) {
18430 ++ struct tcp_sock *tp_it = tcp_sk(sk_it);
18431 ++ struct rrsched_priv *rsp = rrsched_get_priv(tp_it);
18432 ++
18433 ++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
18434 ++ continue;
18435 ++
18436 ++ rsp->quota = 0;
18437 ++ }
18438 ++
18439 ++ goto retry;
18440 ++ }
18441 ++
18442 ++found:
18443 ++ if (choose_sk) {
18444 ++ unsigned int mss_now;
18445 ++ struct tcp_sock *choose_tp = tcp_sk(choose_sk);
18446 ++ struct rrsched_priv *rsp = rrsched_get_priv(choose_tp);
18447 ++
18448 ++ if (!mptcp_rr_is_available(choose_sk, skb, false, true))
18449 ++ return NULL;
18450 ++
18451 ++ *subsk = choose_sk;
18452 ++ mss_now = tcp_current_mss(*subsk);
18453 ++ *limit = split * mss_now;
18454 ++
18455 ++ if (skb->len > mss_now)
18456 ++ rsp->quota += DIV_ROUND_UP(skb->len, mss_now);
18457 ++ else
18458 ++ rsp->quota++;
18459 ++
18460 ++ return skb;
18461 ++ }
18462 ++
18463 ++ return NULL;
18464 ++}
18465 ++
18466 ++static struct mptcp_sched_ops mptcp_sched_rr = {
18467 ++ .get_subflow = rr_get_available_subflow,
18468 ++ .next_segment = mptcp_rr_next_segment,
18469 ++ .name = "roundrobin",
18470 ++ .owner = THIS_MODULE,
18471 ++};
18472 ++
18473 ++static int __init rr_register(void)
18474 ++{
18475 ++ BUILD_BUG_ON(sizeof(struct rrsched_priv) > MPTCP_SCHED_SIZE);
18476 ++
18477 ++ if (mptcp_register_scheduler(&mptcp_sched_rr))
18478 ++ return -1;
18479 ++
18480 ++ return 0;
18481 ++}
18482 ++
18483 ++static void rr_unregister(void)
18484 ++{
18485 ++ mptcp_unregister_scheduler(&mptcp_sched_rr);
18486 ++}
18487 ++
18488 ++module_init(rr_register);
18489 ++module_exit(rr_unregister);
18490 ++
18491 ++MODULE_AUTHOR("Christoph Paasch");
18492 ++MODULE_LICENSE("GPL");
18493 ++MODULE_DESCRIPTION("ROUNDROBIN MPTCP");
18494 ++MODULE_VERSION("0.89");
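
A brief worked trace of the quota logic above, with hypothetical values: num_segments = 3 and two sendable subflows whose quotas are both 0. One of them is chosen with split = 3, so *limit allows it up to 3 * mss_now bytes; a 4500-byte skb on a 1500-byte MSS bumps its quota by DIV_ROUND_UP(4500, 1500) = 3 in one go, marking it full. The next call picks the other subflow the same way, and once iter == full_subs (every considered subflow has quota == num_segments) all quotas are reset to 0 and the round starts over.
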
18495 +diff --git a/net/mptcp/mptcp_sched.c b/net/mptcp/mptcp_sched.c
18496 +new file mode 100644
18497 +index 000000000000..6c7ff4eceac1
18498 +--- /dev/null
18499 ++++ b/net/mptcp/mptcp_sched.c
18500 +@@ -0,0 +1,493 @@
18501 ++/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
18502 ++
18503 ++#include <linux/module.h>
18504 ++#include <net/mptcp.h>
18505 ++
18506 ++static DEFINE_SPINLOCK(mptcp_sched_list_lock);
18507 ++static LIST_HEAD(mptcp_sched_list);
18508 ++
18509 ++struct defsched_priv {
18510 ++ u32 last_rbuf_opti;
18511 ++};
18512 ++
18513 ++static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp)
18514 ++{
18515 ++ return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0];
18516 ++}
18517 ++
18518 ++/* Is the sub-socket sk available to send the skb? */
18519 ++static bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb,
18520 ++ bool zero_wnd_test)
18521 ++{
18522 ++ const struct tcp_sock *tp = tcp_sk(sk);
18523 ++ unsigned int mss_now, space, in_flight;
18524 ++
18525 ++ /* Set of states for which we are allowed to send data */
18526 ++ if (!mptcp_sk_can_send(sk))
18527 ++ return false;
18528 ++
18529 ++ /* We do not send data on this subflow unless it is
18530 ++ * fully established, i.e. the 4th ack has been received.
18531 ++ */
18532 ++ if (tp->mptcp->pre_established)
18533 ++ return false;
18534 ++
18535 ++ if (tp->pf)
18536 ++ return false;
18537 ++
18538 ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
18539 ++ /* If SACK is disabled, and we got a loss, TCP does not exit
18540 ++ * the loss-state until something above high_seq has been acked.
18541 ++ * (see tcp_try_undo_recovery)
18542 ++ *
18543 ++ * high_seq is the snd_nxt at the moment of the RTO. As soon
18544 ++ * as we have an RTO, we won't push data on the subflow.
18545 ++ * Thus, snd_una can never go beyond high_seq.
18546 ++ */
18547 ++ if (!tcp_is_reno(tp))
18548 ++ return false;
18549 ++ else if (tp->snd_una != tp->high_seq)
18550 ++ return false;
18551 ++ }
18552 ++
18553 ++ if (!tp->mptcp->fully_established) {
18554 ++ /* Make sure that we send in-order data */
18555 ++ if (skb && tp->mptcp->second_packet &&
18556 ++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
18557 ++ return false;
18558 ++ }
18559 ++
18560 ++ /* If TSQ is already throttling us, do not send on this subflow. When
18561 ++ * TSQ gets cleared the subflow becomes eligible again.
18562 ++ */
18563 ++ if (test_bit(TSQ_THROTTLED, &tp->tsq_flags))
18564 ++ return false;
18565 ++
18566 ++ in_flight = tcp_packets_in_flight(tp);
18567 ++ /* Not even a single spot in the cwnd */
18568 ++ if (in_flight >= tp->snd_cwnd)
18569 ++ return false;
18570 ++
18571 ++ /* Now, check if what is queued in the subflow's send-queue
18572 ++ * already fills the cwnd.
18573 ++ */
18574 ++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
18575 ++
18576 ++ if (tp->write_seq - tp->snd_nxt > space)
18577 ++ return false;
18578 ++
18579 ++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
18580 ++ return false;
18581 ++
18582 ++ mss_now = tcp_current_mss(sk);
18583 ++
18584 ++ /* Don't send on this subflow if we bypass the allowed send-window at
18585 ++	 * the per-subflow level. Similar to tcp_snd_wnd_test, but with a manually
18586 ++	 * calculated end_seq (because at this point end_seq is still at
18587 ++ * the meta-level).
18588 ++ */
18589 ++ if (skb && !zero_wnd_test &&
18590 ++ after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
18591 ++ return false;
18592 ++
18593 ++ return true;
18594 ++}
18595 ++
18596 ++/* Are we not allowed to reinject this skb on tp? */
18597 ++static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
18598 ++{
18599 ++ /* If the skb has already been enqueued in this sk, try to find
18600 ++ * another one.
18601 ++ */
18602 ++ return skb &&
18603 ++ /* Has the skb already been enqueued into this subsocket? */
18604 ++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
18605 ++}
18606 ++
18607 ++/* This is the scheduler. This function decides on which flow to send
18608 ++ * a given MSS. If all subflows are found to be busy, NULL is returned.
18609 ++ * The flow is selected based on the shortest RTT.
18610 ++ * If all paths have full cong windows, we simply return NULL.
18611 ++ *
18612 ++ * Additionally, this function is aware of the backup-subflows.
18613 ++ */
18614 ++static struct sock *get_available_subflow(struct sock *meta_sk,
18615 ++ struct sk_buff *skb,
18616 ++ bool zero_wnd_test)
18617 ++{
18618 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18619 ++ struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL;
18620 ++ u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff;
18621 ++ int cnt_backups = 0;
18622 ++
18623 ++ /* if there is only one subflow, bypass the scheduling function */
18624 ++ if (mpcb->cnt_subflows == 1) {
18625 ++ bestsk = (struct sock *)mpcb->connection_list;
18626 ++ if (!mptcp_is_available(bestsk, skb, zero_wnd_test))
18627 ++ bestsk = NULL;
18628 ++ return bestsk;
18629 ++ }
18630 ++
18631 ++ /* Answer data_fin on same subflow!!! */
18632 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
18633 ++ skb && mptcp_is_data_fin(skb)) {
18634 ++ mptcp_for_each_sk(mpcb, sk) {
18635 ++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
18636 ++ mptcp_is_available(sk, skb, zero_wnd_test))
18637 ++ return sk;
18638 ++ }
18639 ++ }
18640 ++
18641 ++ /* First, find the best subflow */
18642 ++ mptcp_for_each_sk(mpcb, sk) {
18643 ++ struct tcp_sock *tp = tcp_sk(sk);
18644 ++
18645 ++ if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio)
18646 ++ cnt_backups++;
18647 ++
18648 ++ if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
18649 ++ tp->srtt_us < lowprio_min_time_to_peer) {
18650 ++ if (!mptcp_is_available(sk, skb, zero_wnd_test))
18651 ++ continue;
18652 ++
18653 ++ if (mptcp_dont_reinject_skb(tp, skb)) {
18654 ++ backupsk = sk;
18655 ++ continue;
18656 ++ }
18657 ++
18658 ++ lowprio_min_time_to_peer = tp->srtt_us;
18659 ++ lowpriosk = sk;
18660 ++ } else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
18661 ++ tp->srtt_us < min_time_to_peer) {
18662 ++ if (!mptcp_is_available(sk, skb, zero_wnd_test))
18663 ++ continue;
18664 ++
18665 ++ if (mptcp_dont_reinject_skb(tp, skb)) {
18666 ++ backupsk = sk;
18667 ++ continue;
18668 ++ }
18669 ++
18670 ++ min_time_to_peer = tp->srtt_us;
18671 ++ bestsk = sk;
18672 ++ }
18673 ++ }
18674 ++
18675 ++ if (mpcb->cnt_established == cnt_backups && lowpriosk) {
18676 ++ sk = lowpriosk;
18677 ++ } else if (bestsk) {
18678 ++ sk = bestsk;
18679 ++ } else if (backupsk) {
18680 ++ /* It has been sent on all subflows once - let's give it a
18681 ++ * chance again by restarting its pathmask.
18682 ++ */
18683 ++ if (skb)
18684 ++ TCP_SKB_CB(skb)->path_mask = 0;
18685 ++ sk = backupsk;
18686 ++ }
18687 ++
18688 ++ return sk;
18689 ++}
18690 ++
18691 ++static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal)
18692 ++{
18693 ++ struct sock *meta_sk;
18694 ++ const struct tcp_sock *tp = tcp_sk(sk);
18695 ++ struct tcp_sock *tp_it;
18696 ++ struct sk_buff *skb_head;
18697 ++ struct defsched_priv *dsp = defsched_get_priv(tp);
18698 ++
18699 ++ if (tp->mpcb->cnt_subflows == 1)
18700 ++ return NULL;
18701 ++
18702 ++ meta_sk = mptcp_meta_sk(sk);
18703 ++ skb_head = tcp_write_queue_head(meta_sk);
18704 ++
18705 ++ if (!skb_head || skb_head == tcp_send_head(meta_sk))
18706 ++ return NULL;
18707 ++
18708 ++	/* If penalization is optional (coming from mptcp_next_segment()) and
18709 ++	 * we are not send-buffer-limited, we do not penalize. The retransmission
18710 ++	 * is just an optimization to fix the idle-time due to the delay before
18711 ++ * we wake up the application.
18712 ++ */
18713 ++ if (!penal && sk_stream_memory_free(meta_sk))
18714 ++ goto retrans;
18715 ++
18716 ++ /* Only penalize again after an RTT has elapsed */
18717 ++ if (tcp_time_stamp - dsp->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3))
18718 ++ goto retrans;
18719 ++
18720 ++ /* Half the cwnd of the slow flow */
18721 ++ mptcp_for_each_tp(tp->mpcb, tp_it) {
18722 ++ if (tp_it != tp &&
18723 ++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
18724 ++ if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
18725 ++ tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);
18726 ++ if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH)
18727 ++ tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);
18728 ++
18729 ++ dsp->last_rbuf_opti = tcp_time_stamp;
18730 ++ }
18731 ++ break;
18732 ++ }
18733 ++ }
18734 ++
18735 ++retrans:
18736 ++
18737 ++ /* Segment not yet injected into this path? Take it!!! */
18738 ++ if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
18739 ++ bool do_retrans = false;
18740 ++ mptcp_for_each_tp(tp->mpcb, tp_it) {
18741 ++ if (tp_it != tp &&
18742 ++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
18743 ++ if (tp_it->snd_cwnd <= 4) {
18744 ++ do_retrans = true;
18745 ++ break;
18746 ++ }
18747 ++
18748 ++ if (4 * tp->srtt_us >= tp_it->srtt_us) {
18749 ++ do_retrans = false;
18750 ++ break;
18751 ++ } else {
18752 ++ do_retrans = true;
18753 ++ }
18754 ++ }
18755 ++ }
18756 ++
18757 ++ if (do_retrans && mptcp_is_available(sk, skb_head, false))
18758 ++ return skb_head;
18759 ++ }
18760 ++ return NULL;
18761 ++}
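
For example (hypothetical values): subflow A has an srtt of 10 ms and the head of the meta write-queue was sent only on subflow B, which has an srtt of 50 ms and is in TCP_CA_Open. When A calls mptcp_rcv_buf_optimization() with penal set, B's cwnd and ssthresh are halved (at most once per RTT, tracked via last_rbuf_opti), and because 4 * 10 ms < 50 ms the head is handed back so A can retransmit it, filling the hole at the meta level without waiting for B.
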
18762 ++
18763 ++/* Returns the next segment to be sent from the mptcp meta-queue.
18764 ++ * (chooses the reinject queue if any segment is waiting in it, otherwise,
18765 ++ * chooses the normal write queue).
18766 ++ * Sets *@reinject to 1 if the returned segment comes from the
18767 ++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
18768 ++ * and sets it to -1 if it is a meta-level retransmission to optimize the
18769 ++ * receive-buffer.
18770 ++ */
18771 ++static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject)
18772 ++{
18773 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18774 ++ struct sk_buff *skb = NULL;
18775 ++
18776 ++ *reinject = 0;
18777 ++
18778 ++ /* If we are in fallback-mode, just take from the meta-send-queue */
18779 ++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
18780 ++ return tcp_send_head(meta_sk);
18781 ++
18782 ++ skb = skb_peek(&mpcb->reinject_queue);
18783 ++
18784 ++ if (skb) {
18785 ++ *reinject = 1;
18786 ++ } else {
18787 ++ skb = tcp_send_head(meta_sk);
18788 ++
18789 ++ if (!skb && meta_sk->sk_socket &&
18790 ++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
18791 ++ sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
18792 ++ struct sock *subsk = get_available_subflow(meta_sk, NULL,
18793 ++ false);
18794 ++ if (!subsk)
18795 ++ return NULL;
18796 ++
18797 ++ skb = mptcp_rcv_buf_optimization(subsk, 0);
18798 ++ if (skb)
18799 ++ *reinject = -1;
18800 ++ }
18801 ++ }
18802 ++ return skb;
18803 ++}
18804 ++
18805 ++static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
18806 ++ int *reinject,
18807 ++ struct sock **subsk,
18808 ++ unsigned int *limit)
18809 ++{
18810 ++ struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
18811 ++ unsigned int mss_now;
18812 ++ struct tcp_sock *subtp;
18813 ++ u16 gso_max_segs;
18814 ++ u32 max_len, max_segs, window, needed;
18815 ++
18816 ++ /* As we set it, we have to reset it as well. */
18817 ++ *limit = 0;
18818 ++
18819 ++ if (!skb)
18820 ++ return NULL;
18821 ++
18822 ++ *subsk = get_available_subflow(meta_sk, skb, false);
18823 ++ if (!*subsk)
18824 ++ return NULL;
18825 ++
18826 ++ subtp = tcp_sk(*subsk);
18827 ++ mss_now = tcp_current_mss(*subsk);
18828 ++
18829 ++ if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) {
18830 ++ skb = mptcp_rcv_buf_optimization(*subsk, 1);
18831 ++ if (skb)
18832 ++ *reinject = -1;
18833 ++ else
18834 ++ return NULL;
18835 ++ }
18836 ++
18837 ++ /* No splitting required, as we will only send one single segment */
18838 ++ if (skb->len <= mss_now)
18839 ++ return skb;
18840 ++
18841 ++ /* The following is similar to tcp_mss_split_point, but
18842 ++	 * we do not care about Nagle, because we will anyway
18843 ++ * use TCP_NAGLE_PUSH, which overrides this.
18844 ++ *
18845 ++ * So, we first limit according to the cwnd/gso-size and then according
18846 ++ * to the subflow's window.
18847 ++ */
18848 ++
18849 ++ gso_max_segs = (*subsk)->sk_gso_max_segs;
18850 ++ if (!gso_max_segs) /* No gso supported on the subflow's NIC */
18851 ++ gso_max_segs = 1;
18852 ++ max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs);
18853 ++ if (!max_segs)
18854 ++ return NULL;
18855 ++
18856 ++ max_len = mss_now * max_segs;
18857 ++ window = tcp_wnd_end(subtp) - subtp->write_seq;
18858 ++
18859 ++ needed = min(skb->len, window);
18860 ++ if (max_len <= skb->len)
18861 ++		/* Take max_len, which is actually the cwnd/gso-size limit */
18862 ++ *limit = max_len;
18863 ++ else
18864 ++ /* Or, take the window */
18865 ++ *limit = needed;
18866 ++
18867 ++ return skb;
18868 ++}
18869 ++
18870 ++static void defsched_init(struct sock *sk)
18871 ++{
18872 ++ struct defsched_priv *dsp = defsched_get_priv(tcp_sk(sk));
18873 ++
18874 ++ dsp->last_rbuf_opti = tcp_time_stamp;
18875 ++}
18876 ++
18877 ++struct mptcp_sched_ops mptcp_sched_default = {
18878 ++ .get_subflow = get_available_subflow,
18879 ++ .next_segment = mptcp_next_segment,
18880 ++ .init = defsched_init,
18881 ++ .name = "default",
18882 ++ .owner = THIS_MODULE,
18883 ++};
18884 ++
18885 ++static struct mptcp_sched_ops *mptcp_sched_find(const char *name)
18886 ++{
18887 ++ struct mptcp_sched_ops *e;
18888 ++
18889 ++ list_for_each_entry_rcu(e, &mptcp_sched_list, list) {
18890 ++ if (strcmp(e->name, name) == 0)
18891 ++ return e;
18892 ++ }
18893 ++
18894 ++ return NULL;
18895 ++}
18896 ++
18897 ++int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
18898 ++{
18899 ++ int ret = 0;
18900 ++
18901 ++ if (!sched->get_subflow || !sched->next_segment)
18902 ++ return -EINVAL;
18903 ++
18904 ++ spin_lock(&mptcp_sched_list_lock);
18905 ++ if (mptcp_sched_find(sched->name)) {
18906 ++ pr_notice("%s already registered\n", sched->name);
18907 ++ ret = -EEXIST;
18908 ++ } else {
18909 ++ list_add_tail_rcu(&sched->list, &mptcp_sched_list);
18910 ++ pr_info("%s registered\n", sched->name);
18911 ++ }
18912 ++ spin_unlock(&mptcp_sched_list_lock);
18913 ++
18914 ++ return ret;
18915 ++}
18916 ++EXPORT_SYMBOL_GPL(mptcp_register_scheduler);
18917 ++
18918 ++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
18919 ++{
18920 ++ spin_lock(&mptcp_sched_list_lock);
18921 ++ list_del_rcu(&sched->list);
18922 ++ spin_unlock(&mptcp_sched_list_lock);
18923 ++}
18924 ++EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler);
18925 ++
18926 ++void mptcp_get_default_scheduler(char *name)
18927 ++{
18928 ++ struct mptcp_sched_ops *sched;
18929 ++
18930 ++ BUG_ON(list_empty(&mptcp_sched_list));
18931 ++
18932 ++ rcu_read_lock();
18933 ++ sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list);
18934 ++ strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX);
18935 ++ rcu_read_unlock();
18936 ++}
18937 ++
18938 ++int mptcp_set_default_scheduler(const char *name)
18939 ++{
18940 ++ struct mptcp_sched_ops *sched;
18941 ++ int ret = -ENOENT;
18942 ++
18943 ++ spin_lock(&mptcp_sched_list_lock);
18944 ++ sched = mptcp_sched_find(name);
18945 ++#ifdef CONFIG_MODULES
18946 ++ if (!sched && capable(CAP_NET_ADMIN)) {
18947 ++ spin_unlock(&mptcp_sched_list_lock);
18948 ++
18949 ++ request_module("mptcp_%s", name);
18950 ++ spin_lock(&mptcp_sched_list_lock);
18951 ++ sched = mptcp_sched_find(name);
18952 ++ }
18953 ++#endif
18954 ++
18955 ++ if (sched) {
18956 ++ list_move(&sched->list, &mptcp_sched_list);
18957 ++ ret = 0;
18958 ++ } else {
18959 ++ pr_info("%s is not available\n", name);
18960 ++ }
18961 ++ spin_unlock(&mptcp_sched_list_lock);
18962 ++
18963 ++ return ret;
18964 ++}
18965 ++
18966 ++void mptcp_init_scheduler(struct mptcp_cb *mpcb)
18967 ++{
18968 ++ struct mptcp_sched_ops *sched;
18969 ++
18970 ++ rcu_read_lock();
18971 ++ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
18972 ++ if (try_module_get(sched->owner)) {
18973 ++ mpcb->sched_ops = sched;
18974 ++ break;
18975 ++ }
18976 ++ }
18977 ++ rcu_read_unlock();
18978 ++}
18979 ++
18980 ++/* Manage refcounts on socket close. */
18981 ++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb)
18982 ++{
18983 ++ module_put(mpcb->sched_ops->owner);
18984 ++}
18985 ++
18986 ++/* Set default value from kernel configuration at bootup */
18987 ++static int __init mptcp_scheduler_default(void)
18988 ++{
18989 ++ BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE);
18990 ++
18991 ++ return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED);
18992 ++}
18993 ++late_initcall(mptcp_scheduler_default);
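
For completeness, here is a minimal, hypothetical sketch of a third-party scheduler built on the ops interface above (none of these names exist in the patch); it simply always picks the first sendable subflow and hands out the meta send-head without splitting:

#include <linux/module.h>
#include <net/mptcp.h>

static struct sock *firstflow_get_subflow(struct sock *meta_sk,
					  struct sk_buff *skb,
					  bool zero_wnd_test)
{
	struct sock *sk;

	/* Pick the first subflow that is in a state allowing transmission */
	mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
		if (mptcp_sk_can_send(sk))
			return sk;
	}
	return NULL;
}

static struct sk_buff *firstflow_next_segment(struct sock *meta_sk,
					      int *reinject,
					      struct sock **subsk,
					      unsigned int *limit)
{
	struct sk_buff *skb = tcp_send_head(meta_sk);

	*reinject = 0;
	*limit = 0;	/* no split limit, as in the default scheduler's
			 * "no splitting required" path above
			 */
	if (!skb)
		return NULL;

	*subsk = firstflow_get_subflow(meta_sk, skb, false);
	return *subsk ? skb : NULL;
}

static struct mptcp_sched_ops mptcp_sched_firstflow = {
	.get_subflow	= firstflow_get_subflow,
	.next_segment	= firstflow_next_segment,
	.name		= "firstflow",
	.owner		= THIS_MODULE,
};

static int __init firstflow_register(void)
{
	return mptcp_register_scheduler(&mptcp_sched_firstflow);
}

static void __exit firstflow_unregister(void)
{
	mptcp_unregister_scheduler(&mptcp_sched_firstflow);
}

module_init(firstflow_register);
module_exit(firstflow_unregister);
MODULE_LICENSE("GPL");
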
18994 +diff --git a/net/mptcp/mptcp_wvegas.c b/net/mptcp/mptcp_wvegas.c
18995 +new file mode 100644
18996 +index 000000000000..29ca1d868d17
18997 +--- /dev/null
18998 ++++ b/net/mptcp/mptcp_wvegas.c
18999 +@@ -0,0 +1,268 @@
19000 ++/*
19001 ++ * MPTCP implementation - WEIGHTED VEGAS
19002 ++ *
19003 ++ * Algorithm design:
19004 ++ * Yu Cao <cyAnalyst@×××.com>
19005 ++ * Mingwei Xu <xmw@××××××××××××××××××××××.cn>
19006 ++ * Xiaoming Fu <fu@××××××××××××××××××.de>
19007 ++ *
19008 ++ * Implementation:
19009 ++ * Yu Cao <cyAnalyst@×××.com>
19010 ++ * Enhuan Dong <deh13@××××××××××××××××××.cn>
19011 ++ *
19012 ++ * Ported to the official MPTCP-kernel:
19013 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
19014 ++ *
19015 ++ * This program is free software; you can redistribute it and/or
19016 ++ * modify it under the terms of the GNU General Public License
19017 ++ * as published by the Free Software Foundation; either version
19018 ++ * 2 of the License, or (at your option) any later version.
19019 ++ */
19020 ++
19021 ++#include <linux/skbuff.h>
19022 ++#include <net/tcp.h>
19023 ++#include <net/mptcp.h>
19024 ++#include <linux/module.h>
19025 ++#include <linux/tcp.h>
19026 ++
19027 ++static int initial_alpha = 2;
19028 ++static int total_alpha = 10;
19029 ++static int gamma = 1;
19030 ++
19031 ++module_param(initial_alpha, int, 0644);
19032 ++MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows");
19033 ++module_param(total_alpha, int, 0644);
19034 ++MODULE_PARM_DESC(total_alpha, "total alpha for all subflows");
19035 ++module_param(gamma, int, 0644);
19036 ++MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
19037 ++
19038 ++#define MPTCP_WVEGAS_SCALE 16
19039 ++
19040 ++/* wVegas variables */
19041 ++struct wvegas {
19042 ++ u32 beg_snd_nxt; /* right edge during last RTT */
19043 ++	u8	doing_wvegas_now; /* if true, do wvegas for this RTT */
19044 ++
19045 ++ u16 cnt_rtt; /* # of RTTs measured within last RTT */
19046 ++ u32 sampled_rtt; /* cumulative RTTs measured within last RTT (in usec) */
19047 ++ u32 base_rtt; /* the min of all wVegas RTT measurements seen (in usec) */
19048 ++
19049 ++ u64 instant_rate; /* cwnd / srtt_us, unit: pkts/us * 2^16 */
19050 ++ u64 weight; /* the ratio of subflow's rate to the total rate, * 2^16 */
19051 ++ int alpha; /* alpha for each subflows */
19052 ++
19053 ++	u32 queue_delay;	/* queue delay */
19054 ++};
19055 ++
19056 ++
19057 ++static inline u64 mptcp_wvegas_scale(u32 val, int scale)
19058 ++{
19059 ++ return (u64) val << scale;
19060 ++}
19061 ++
19062 ++static void wvegas_enable(const struct sock *sk)
19063 ++{
19064 ++ const struct tcp_sock *tp = tcp_sk(sk);
19065 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19066 ++
19067 ++ wvegas->doing_wvegas_now = 1;
19068 ++
19069 ++ wvegas->beg_snd_nxt = tp->snd_nxt;
19070 ++
19071 ++ wvegas->cnt_rtt = 0;
19072 ++ wvegas->sampled_rtt = 0;
19073 ++
19074 ++ wvegas->instant_rate = 0;
19075 ++ wvegas->alpha = initial_alpha;
19076 ++ wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE);
19077 ++
19078 ++ wvegas->queue_delay = 0;
19079 ++}
19080 ++
19081 ++static inline void wvegas_disable(const struct sock *sk)
19082 ++{
19083 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19084 ++
19085 ++ wvegas->doing_wvegas_now = 0;
19086 ++}
19087 ++
19088 ++static void mptcp_wvegas_init(struct sock *sk)
19089 ++{
19090 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19091 ++
19092 ++ wvegas->base_rtt = 0x7fffffff;
19093 ++ wvegas_enable(sk);
19094 ++}
19095 ++
19096 ++static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us)
19097 ++{
19098 ++ return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us);
19099 ++}
19100 ++
19101 ++static void mptcp_wvegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
19102 ++{
19103 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19104 ++ u32 vrtt;
19105 ++
19106 ++ if (rtt_us < 0)
19107 ++ return;
19108 ++
19109 ++ vrtt = rtt_us + 1;
19110 ++
19111 ++ if (vrtt < wvegas->base_rtt)
19112 ++ wvegas->base_rtt = vrtt;
19113 ++
19114 ++ wvegas->sampled_rtt += vrtt;
19115 ++ wvegas->cnt_rtt++;
19116 ++}
19117 ++
19118 ++static void mptcp_wvegas_state(struct sock *sk, u8 ca_state)
19119 ++{
19120 ++ if (ca_state == TCP_CA_Open)
19121 ++ wvegas_enable(sk);
19122 ++ else
19123 ++ wvegas_disable(sk);
19124 ++}
19125 ++
19126 ++static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
19127 ++{
19128 ++ if (event == CA_EVENT_CWND_RESTART) {
19129 ++ mptcp_wvegas_init(sk);
19130 ++ } else if (event == CA_EVENT_LOSS) {
19131 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19132 ++ wvegas->instant_rate = 0;
19133 ++ }
19134 ++}
19135 ++
19136 ++static inline u32 mptcp_wvegas_ssthresh(const struct tcp_sock *tp)
19137 ++{
19138 ++ return min(tp->snd_ssthresh, tp->snd_cwnd - 1);
19139 ++}
19140 ++
19141 ++static u64 mptcp_wvegas_weight(const struct mptcp_cb *mpcb, const struct sock *sk)
19142 ++{
19143 ++ u64 total_rate = 0;
19144 ++ struct sock *sub_sk;
19145 ++ const struct wvegas *wvegas = inet_csk_ca(sk);
19146 ++
19147 ++ if (!mpcb)
19148 ++ return wvegas->weight;
19149 ++
19150 ++
19151 ++ mptcp_for_each_sk(mpcb, sub_sk) {
19152 ++ struct wvegas *sub_wvegas = inet_csk_ca(sub_sk);
19153 ++
19154 ++		/* sampled_rtt is initialized to 0 */
19155 ++ if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0))
19156 ++ total_rate += sub_wvegas->instant_rate;
19157 ++ }
19158 ++
19159 ++ if (total_rate && wvegas->instant_rate)
19160 ++ return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate);
19161 ++ else
19162 ++ return wvegas->weight;
19163 ++}
19164 ++
19165 ++static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
19166 ++{
19167 ++ struct tcp_sock *tp = tcp_sk(sk);
19168 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19169 ++
19170 ++ if (!wvegas->doing_wvegas_now) {
19171 ++ tcp_reno_cong_avoid(sk, ack, acked);
19172 ++ return;
19173 ++ }
19174 ++
19175 ++ if (after(ack, wvegas->beg_snd_nxt)) {
19176 ++ wvegas->beg_snd_nxt = tp->snd_nxt;
19177 ++
19178 ++ if (wvegas->cnt_rtt <= 2) {
19179 ++ tcp_reno_cong_avoid(sk, ack, acked);
19180 ++ } else {
19181 ++ u32 rtt, diff, q_delay;
19182 ++ u64 target_cwnd;
19183 ++
19184 ++ rtt = wvegas->sampled_rtt / wvegas->cnt_rtt;
19185 ++ target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt);
19186 ++
19187 ++ diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt);
19188 ++
19189 ++ if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
19190 ++ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
19191 ++ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
19192 ++
19193 ++ } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
19194 ++ tcp_slow_start(tp, acked);
19195 ++ } else {
19196 ++ if (diff >= wvegas->alpha) {
19197 ++ wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt);
19198 ++ wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk);
19199 ++ wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE));
19200 ++ }
19201 ++ if (diff > wvegas->alpha) {
19202 ++ tp->snd_cwnd--;
19203 ++ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
19204 ++ } else if (diff < wvegas->alpha) {
19205 ++ tp->snd_cwnd++;
19206 ++ }
19207 ++
19208 ++				/* Try to drain the link queue if needed */
19209 ++ q_delay = rtt - wvegas->base_rtt;
19210 ++ if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay))
19211 ++ wvegas->queue_delay = q_delay;
19212 ++
19213 ++ if (q_delay >= 2 * wvegas->queue_delay) {
19214 ++ u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt);
19215 ++ tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE;
19216 ++ wvegas->queue_delay = 0;
19217 ++ }
19218 ++ }
19219 ++
19220 ++ if (tp->snd_cwnd < 2)
19221 ++ tp->snd_cwnd = 2;
19222 ++ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
19223 ++ tp->snd_cwnd = tp->snd_cwnd_clamp;
19224 ++
19225 ++ tp->snd_ssthresh = tcp_current_ssthresh(sk);
19226 ++ }
19227 ++
19228 ++ wvegas->cnt_rtt = 0;
19229 ++ wvegas->sampled_rtt = 0;
19230 ++ }
19231 ++ /* Use normal slow start */
19232 ++ else if (tp->snd_cwnd <= tp->snd_ssthresh)
19233 ++ tcp_slow_start(tp, acked);
19234 ++}
19235 ++
19236 ++
19237 ++static struct tcp_congestion_ops mptcp_wvegas __read_mostly = {
19238 ++ .init = mptcp_wvegas_init,
19239 ++ .ssthresh = tcp_reno_ssthresh,
19240 ++ .cong_avoid = mptcp_wvegas_cong_avoid,
19241 ++ .pkts_acked = mptcp_wvegas_pkts_acked,
19242 ++ .set_state = mptcp_wvegas_state,
19243 ++ .cwnd_event = mptcp_wvegas_cwnd_event,
19244 ++
19245 ++ .owner = THIS_MODULE,
19246 ++ .name = "wvegas",
19247 ++};
19248 ++
19249 ++static int __init mptcp_wvegas_register(void)
19250 ++{
19251 ++ BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE);
19252 ++ tcp_register_congestion_control(&mptcp_wvegas);
19253 ++ return 0;
19254 ++}
19255 ++
19256 ++static void __exit mptcp_wvegas_unregister(void)
19257 ++{
19258 ++ tcp_unregister_congestion_control(&mptcp_wvegas);
19259 ++}
19260 ++
19261 ++module_init(mptcp_wvegas_register);
19262 ++module_exit(mptcp_wvegas_unregister);
19263 ++
19264 ++MODULE_AUTHOR("Yu Cao, Enhuan Dong");
19265 ++MODULE_LICENSE("GPL");
19266 ++MODULE_DESCRIPTION("MPTCP wVegas");
19267 ++MODULE_VERSION("0.1");
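
To see how the adjustment above behaves once a subflow is out of slow start, take hypothetical values: snd_cwnd = 20, base_rtt = 100 ms and a measured rtt = 125 ms give diff = 20 * (125 - 100) / 125 = 4, i.e. about four packets sitting in network queues. Since diff >= the initial alpha of 2, alpha is first re-derived from the subflow's weight; with two equally fast subflows the weight is about one half, so alpha becomes roughly total_alpha / 2 = 5. As diff = 4 < 5, the window grows by one, whereas a diff above the new alpha would shrink it, so each subflow is steered toward keeping about alpha packets queued and the per-subflow alphas sum to roughly total_alpha for the whole connection.
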
19268
19269 diff --git a/4567_distro-Gentoo-Kconfig.patch b/4567_distro-Gentoo-Kconfig.patch
19270 index 71dbf09..652e2a7 100644
19271 --- a/4567_distro-Gentoo-Kconfig.patch
19272 +++ b/4567_distro-Gentoo-Kconfig.patch
19273 @@ -1,15 +1,15 @@
19274 ---- a/Kconfig 2014-04-02 09:45:05.389224541 -0400
19275 -+++ b/Kconfig 2014-04-02 09:45:39.269224273 -0400
19276 +--- a/Kconfig 2014-04-02 09:45:05.389224541 -0400
19277 ++++ b/Kconfig 2014-04-02 09:45:39.269224273 -0400
19278 @@ -8,4 +8,6 @@ config SRCARCH
19279 - string
19280 - option env="SRCARCH"
19281 -
19282 + string
19283 + option env="SRCARCH"
19284 +
19285 +source "distro/Kconfig"
19286 +
19287 source "arch/$SRCARCH/Kconfig"
19288 ---- /dev/null 2014-09-22 14:19:24.316977284 -0400
19289 -+++ distro/Kconfig 2014-09-22 19:30:35.670959281 -0400
19290 -@@ -0,0 +1,109 @@
19291 +--- 1969-12-31 19:00:00.000000000 -0500
19292 ++++ b/distro/Kconfig 2014-04-02 09:57:03.539218861 -0400
19293 +@@ -0,0 +1,108 @@
19294 +menu "Gentoo Linux"
19295 +
19296 +config GENTOO_LINUX
19297 @@ -34,8 +34,6 @@
19298 + select DEVTMPFS
19299 + select TMPFS
19300 +
19301 -+ select FHANDLE
19302 -+
19303 + select MMU
19304 + select SHMEM
19305 +
19306 @@ -91,6 +89,7 @@
19307 + select CGROUPS
19308 + select EPOLL
19309 + select FANOTIFY
19310 ++ select FHANDLE
19311 + select INOTIFY_USER
19312 + select NET
19313 + select NET_NS