Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:5.15 commit in: /
Date: Sun, 01 May 2022 17:03:20 +0000
Message-Id: 1651424578.aa3aade4f155b96481a44b6733e806c8181271cc.mpagano@gentoo
1 commit: aa3aade4f155b96481a44b6733e806c8181271cc
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Sun May 1 17:02:58 2022 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Sun May 1 17:02:58 2022 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=aa3aade4
7
8 Linux patch 5.15.37
9
10 Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>
11
12 0000_README | 4 +
13 1036_linux-5.15.37.patch | 4223 ++++++++++++++++++++++++++++++++++++++++++++++
14 2 files changed, 4227 insertions(+)
15
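Patches listed in 0000_README are applied in sequence on top of the vanilla kernel sources. A minimal sketch of applying this new patch by hand, assuming an unpacked vanilla linux-5.15.36 tree and GNU patch (the gentoo-sources ebuild normally handles this step automatically; the path below is a placeholder):

  cd linux-5.15.36
  # dry run first to confirm the patch applies cleanly
  patch -p1 --dry-run < /path/to/1036_linux-5.15.37.patch
  # apply for real; the top-level Makefile SUBLEVEL becomes 37
  patch -p1 < /path/to/1036_linux-5.15.37.patch
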
16 diff --git a/0000_README b/0000_README
17 index 0f44e39b..cb4266b1 100644
18 --- a/0000_README
19 +++ b/0000_README
20 @@ -187,6 +187,10 @@ Patch: 1035_linux-5.15.36.patch
21 From: http://www.kernel.org
22 Desc: Linux 5.15.36
23
24 +Patch: 1036_linux-5.15.37.patch
25 +From: http://www.kernel.org
26 +Desc: Linux 5.15.37
27 +
28 Patch: 1500_XATTR_USER_PREFIX.patch
29 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
30 Desc: Support for namespace user.pax.* on tmpfs.
31
32 diff --git a/1036_linux-5.15.37.patch b/1036_linux-5.15.37.patch
33 new file mode 100644
34 index 00000000..b9d4c0ea
35 --- /dev/null
36 +++ b/1036_linux-5.15.37.patch
37 @@ -0,0 +1,4223 @@
38 +diff --git a/Makefile b/Makefile
39 +index e0710f9837847..50b1688a4ca2c 100644
40 +--- a/Makefile
41 ++++ b/Makefile
42 +@@ -1,7 +1,7 @@
43 + # SPDX-License-Identifier: GPL-2.0
44 + VERSION = 5
45 + PATCHLEVEL = 15
46 +-SUBLEVEL = 36
47 ++SUBLEVEL = 37
48 + EXTRAVERSION =
49 + NAME = Trick or Treat
50 +
51 +diff --git a/arch/arm/boot/dts/socfpga.dtsi b/arch/arm/boot/dts/socfpga.dtsi
52 +index 0b021eef0b538..7c1d6423d7f8c 100644
53 +--- a/arch/arm/boot/dts/socfpga.dtsi
54 ++++ b/arch/arm/boot/dts/socfpga.dtsi
55 +@@ -782,7 +782,7 @@
56 + };
57 +
58 + qspi: spi@ff705000 {
59 +- compatible = "cdns,qspi-nor";
60 ++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
61 + #address-cells = <1>;
62 + #size-cells = <0>;
63 + reg = <0xff705000 0x1000>,
64 +diff --git a/arch/arm/boot/dts/socfpga_arria10.dtsi b/arch/arm/boot/dts/socfpga_arria10.dtsi
65 +index a574ea91d9d3f..3ba431dfa8c94 100644
66 +--- a/arch/arm/boot/dts/socfpga_arria10.dtsi
67 ++++ b/arch/arm/boot/dts/socfpga_arria10.dtsi
68 +@@ -756,7 +756,7 @@
69 + };
70 +
71 + qspi: spi@ff809000 {
72 +- compatible = "cdns,qspi-nor";
73 ++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
74 + #address-cells = <1>;
75 + #size-cells = <0>;
76 + reg = <0xff809000 0x100>,
77 +diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
78 +index d301ac0d406bf..3ec301bd08a91 100644
79 +--- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
80 ++++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
81 +@@ -594,7 +594,7 @@
82 + };
83 +
84 + qspi: spi@ff8d2000 {
85 +- compatible = "cdns,qspi-nor";
86 ++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
87 + #address-cells = <1>;
88 + #size-cells = <0>;
89 + reg = <0xff8d2000 0x100>,
90 +diff --git a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi
91 +index de1e98c99ec5b..f4270cf189962 100644
92 +--- a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi
93 ++++ b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi
94 +@@ -628,7 +628,7 @@
95 + };
96 +
97 + qspi: spi@ff8d2000 {
98 +- compatible = "cdns,qspi-nor";
99 ++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
100 + #address-cells = <1>;
101 + #size-cells = <0>;
102 + reg = <0xff8d2000 0x100>,
103 +diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
104 +index d89cf802d9aa7..6568823cf3063 100644
105 +--- a/arch/powerpc/kernel/kvm.c
106 ++++ b/arch/powerpc/kernel/kvm.c
107 +@@ -669,7 +669,8 @@ static void __init kvm_use_magic_page(void)
108 + on_each_cpu(kvm_map_magic_page, &features, 1);
109 +
110 + /* Quick self-test to see if the mapping works */
111 +- if (fault_in_pages_readable((const char *)KVM_MAGIC_PAGE, sizeof(u32))) {
112 ++ if (fault_in_readable((const char __user *)KVM_MAGIC_PAGE,
113 ++ sizeof(u32))) {
114 + kvm_patching_worked = false;
115 + return;
116 + }
117 +diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
118 +index f2da879264bcd..3e053e2fd6b69 100644
119 +--- a/arch/powerpc/kernel/signal_32.c
120 ++++ b/arch/powerpc/kernel/signal_32.c
121 +@@ -1048,7 +1048,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
122 + if (new_ctx == NULL)
123 + return 0;
124 + if (!access_ok(new_ctx, ctx_size) ||
125 +- fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
126 ++ fault_in_readable((char __user *)new_ctx, ctx_size))
127 + return -EFAULT;
128 +
129 + /*
130 +@@ -1239,7 +1239,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx,
131 + #endif
132 +
133 + if (!access_ok(ctx, sizeof(*ctx)) ||
134 +- fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx)))
135 ++ fault_in_readable((char __user *)ctx, sizeof(*ctx)))
136 + return -EFAULT;
137 +
138 + /*
139 +diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
140 +index bb9c077ac1322..d1e1fc0acbea3 100644
141 +--- a/arch/powerpc/kernel/signal_64.c
142 ++++ b/arch/powerpc/kernel/signal_64.c
143 +@@ -688,7 +688,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
144 + if (new_ctx == NULL)
145 + return 0;
146 + if (!access_ok(new_ctx, ctx_size) ||
147 +- fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
148 ++ fault_in_readable((char __user *)new_ctx, ctx_size))
149 + return -EFAULT;
150 +
151 + /*
152 +diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
153 +index 831b25c5e7058..7f71bd4dcd0d6 100644
154 +--- a/arch/x86/kernel/fpu/signal.c
155 ++++ b/arch/x86/kernel/fpu/signal.c
156 +@@ -205,7 +205,7 @@ retry:
157 + fpregs_unlock();
158 +
159 + if (ret) {
160 +- if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size))
161 ++ if (!fault_in_writeable(buf_fx, fpu_user_xstate_size))
162 + goto retry;
163 + return -EFAULT;
164 + }
165 +@@ -278,10 +278,9 @@ retry:
166 + if (ret != -EFAULT)
167 + return -EINVAL;
168 +
169 +- ret = fault_in_pages_readable(buf, size);
170 +- if (!ret)
171 ++ if (!fault_in_readable(buf, size))
172 + goto retry;
173 +- return ret;
174 ++ return -EFAULT;
175 + }
176 +
177 + /*
178 +diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
179 +index ab3e37aa1830c..f93cb989241cc 100644
180 +--- a/drivers/block/Kconfig
181 ++++ b/drivers/block/Kconfig
182 +@@ -33,6 +33,22 @@ config BLK_DEV_FD
183 + To compile this driver as a module, choose M here: the
184 + module will be called floppy.
185 +
186 ++config BLK_DEV_FD_RAWCMD
187 ++ bool "Support for raw floppy disk commands (DEPRECATED)"
188 ++ depends on BLK_DEV_FD
189 ++ help
190 ++ If you want to use actual physical floppies and expect to do
191 ++ special low-level hardware accesses to them (access and use
192 ++ non-standard formats, for example), then enable this.
193 ++
194 ++ Note that the code enabled by this option is rarely used and
195 ++ might be unstable or insecure, and distros should not enable it.
196 ++
197 ++ Note: FDRAWCMD is deprecated and will be removed from the kernel
198 ++ in the near future.
199 ++
200 ++ If unsure, say N.
201 ++
202 + config AMIGA_FLOPPY
203 + tristate "Amiga floppy support"
204 + depends on AMIGA
205 +diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
206 +index 0f58594c5a4d6..1c152b542a52d 100644
207 +--- a/drivers/block/floppy.c
208 ++++ b/drivers/block/floppy.c
209 +@@ -2984,6 +2984,8 @@ static const char *drive_name(int type, int drive)
210 + return "(null)";
211 + }
212 +
213 ++#ifdef CONFIG_BLK_DEV_FD_RAWCMD
214 ++
215 + /* raw commands */
216 + static void raw_cmd_done(int flag)
217 + {
218 +@@ -3183,6 +3185,35 @@ static int raw_cmd_ioctl(int cmd, void __user *param)
219 + return ret;
220 + }
221 +
222 ++static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
223 ++ void __user *param)
224 ++{
225 ++ int ret;
226 ++
227 ++ pr_warn_once("Note: FDRAWCMD is deprecated and will be removed from the kernel in the near future.\n");
228 ++
229 ++ if (type)
230 ++ return -EINVAL;
231 ++ if (lock_fdc(drive))
232 ++ return -EINTR;
233 ++ set_floppy(drive);
234 ++ ret = raw_cmd_ioctl(cmd, param);
235 ++ if (ret == -EINTR)
236 ++ return -EINTR;
237 ++ process_fd_request();
238 ++ return ret;
239 ++}
240 ++
241 ++#else /* CONFIG_BLK_DEV_FD_RAWCMD */
242 ++
243 ++static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
244 ++ void __user *param)
245 ++{
246 ++ return -EOPNOTSUPP;
247 ++}
248 ++
249 ++#endif
250 ++
251 + static int invalidate_drive(struct block_device *bdev)
252 + {
253 + /* invalidate the buffer track to force a reread */
254 +@@ -3371,7 +3402,6 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
255 + {
256 + int drive = (long)bdev->bd_disk->private_data;
257 + int type = ITYPE(drive_state[drive].fd_device);
258 +- int i;
259 + int ret;
260 + int size;
261 + union inparam {
262 +@@ -3522,16 +3552,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
263 + outparam = &write_errors[drive];
264 + break;
265 + case FDRAWCMD:
266 +- if (type)
267 +- return -EINVAL;
268 +- if (lock_fdc(drive))
269 +- return -EINTR;
270 +- set_floppy(drive);
271 +- i = raw_cmd_ioctl(cmd, (void __user *)param);
272 +- if (i == -EINTR)
273 +- return -EINTR;
274 +- process_fd_request();
275 +- return i;
276 ++ return floppy_raw_cmd_ioctl(type, drive, cmd, (void __user *)param);
277 + case FDTWADDLE:
278 + if (lock_fdc(drive))
279 + return -EINTR;
280 +diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
281 +index 21909642ee4ca..8fbb25913327c 100644
282 +--- a/drivers/gpu/drm/armada/armada_gem.c
283 ++++ b/drivers/gpu/drm/armada/armada_gem.c
284 +@@ -336,7 +336,7 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
285 + struct drm_armada_gem_pwrite *args = data;
286 + struct armada_gem_object *dobj;
287 + char __user *ptr;
288 +- int ret;
289 ++ int ret = 0;
290 +
291 + DRM_DEBUG_DRIVER("handle %u off %u size %u ptr 0x%llx\n",
292 + args->handle, args->offset, args->size, args->ptr);
293 +@@ -349,9 +349,8 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
294 + if (!access_ok(ptr, args->size))
295 + return -EFAULT;
296 +
297 +- ret = fault_in_pages_readable(ptr, args->size);
298 +- if (ret)
299 +- return ret;
300 ++ if (fault_in_readable(ptr, args->size))
301 ++ return -EFAULT;
302 +
303 + dobj = armada_gem_object_lookup(file, args->handle);
304 + if (dobj == NULL)
305 +diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c
306 +index 75680eecd2f7d..2714ba02b176b 100644
307 +--- a/drivers/spi/spi-cadence-quadspi.c
308 ++++ b/drivers/spi/spi-cadence-quadspi.c
309 +@@ -36,6 +36,7 @@
310 + /* Quirks */
311 + #define CQSPI_NEEDS_WR_DELAY BIT(0)
312 + #define CQSPI_DISABLE_DAC_MODE BIT(1)
313 ++#define CQSPI_NO_SUPPORT_WR_COMPLETION BIT(3)
314 +
315 + /* Capabilities */
316 + #define CQSPI_SUPPORTS_OCTAL BIT(0)
317 +@@ -83,6 +84,7 @@ struct cqspi_st {
318 + u32 wr_delay;
319 + bool use_direct_mode;
320 + struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT];
321 ++ bool wr_completion;
322 + };
323 +
324 + struct cqspi_driver_platdata {
325 +@@ -797,9 +799,11 @@ static int cqspi_write_setup(struct cqspi_flash_pdata *f_pdata,
326 + * polling on the controller's side. spinand and spi-nor will take
327 + * care of polling the status register.
328 + */
329 +- reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
330 +- reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL;
331 +- writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
332 ++ if (cqspi->wr_completion) {
333 ++ reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
334 ++ reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL;
335 ++ writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
336 ++ }
337 +
338 + reg = readl(reg_base + CQSPI_REG_SIZE);
339 + reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK;
340 +@@ -1532,6 +1536,10 @@ static int cqspi_probe(struct platform_device *pdev)
341 +
342 + cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk);
343 + master->max_speed_hz = cqspi->master_ref_clk_hz;
344 ++
345 ++ /* write completion is supported by default */
346 ++ cqspi->wr_completion = true;
347 ++
348 + ddata = of_device_get_match_data(dev);
349 + if (ddata) {
350 + if (ddata->quirks & CQSPI_NEEDS_WR_DELAY)
351 +@@ -1541,6 +1549,8 @@ static int cqspi_probe(struct platform_device *pdev)
352 + master->mode_bits |= SPI_RX_OCTAL | SPI_TX_OCTAL;
353 + if (!(ddata->quirks & CQSPI_DISABLE_DAC_MODE))
354 + cqspi->use_direct_mode = true;
355 ++ if (ddata->quirks & CQSPI_NO_SUPPORT_WR_COMPLETION)
356 ++ cqspi->wr_completion = false;
357 + }
358 +
359 + ret = devm_request_irq(dev, irq, cqspi_irq_handler, 0,
360 +@@ -1649,6 +1659,10 @@ static const struct cqspi_driver_platdata intel_lgm_qspi = {
361 + .quirks = CQSPI_DISABLE_DAC_MODE,
362 + };
363 +
364 ++static const struct cqspi_driver_platdata socfpga_qspi = {
365 ++ .quirks = CQSPI_NO_SUPPORT_WR_COMPLETION,
366 ++};
367 ++
368 + static const struct of_device_id cqspi_dt_ids[] = {
369 + {
370 + .compatible = "cdns,qspi-nor",
371 +@@ -1666,6 +1680,10 @@ static const struct of_device_id cqspi_dt_ids[] = {
372 + .compatible = "intel,lgm-qspi",
373 + .data = &intel_lgm_qspi,
374 + },
375 ++ {
376 ++ .compatible = "intel,socfpga-qspi",
377 ++ .data = (void *)&socfpga_qspi,
378 ++ },
379 + { /* end of table */ }
380 + };
381 +
382 +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
383 +index dc1e4d1b72914..ff578c934bbcf 100644
384 +--- a/fs/btrfs/file.c
385 ++++ b/fs/btrfs/file.c
386 +@@ -1709,7 +1709,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
387 + * Fault pages before locking them in prepare_pages
388 + * to avoid recursive lock
389 + */
390 +- if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
391 ++ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
392 + ret = -EFAULT;
393 + break;
394 + }
395 +@@ -1903,16 +1903,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
396 +
397 + static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
398 + {
399 ++ const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
400 + struct file *file = iocb->ki_filp;
401 + struct inode *inode = file_inode(file);
402 + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
403 + loff_t pos;
404 + ssize_t written = 0;
405 + ssize_t written_buffered;
406 ++ size_t prev_left = 0;
407 + loff_t endbyte;
408 + ssize_t err;
409 + unsigned int ilock_flags = 0;
410 +- struct iomap_dio *dio = NULL;
411 +
412 + if (iocb->ki_flags & IOCB_NOWAIT)
413 + ilock_flags |= BTRFS_ILOCK_TRY;
414 +@@ -1955,23 +1956,80 @@ relock:
415 + goto buffered;
416 + }
417 +
418 +- dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
419 +- 0);
420 ++ /*
421 ++ * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
422 ++ * calls generic_write_sync() (through iomap_dio_complete()), because
423 ++ * that results in calling fsync (btrfs_sync_file()) which will try to
424 ++ * lock the inode in exclusive/write mode.
425 ++ */
426 ++ if (is_sync_write)
427 ++ iocb->ki_flags &= ~IOCB_DSYNC;
428 +
429 +- btrfs_inode_unlock(inode, ilock_flags);
430 ++ /*
431 ++ * The iov_iter can be mapped to the same file range we are writing to.
432 ++ * If that's the case, then we will deadlock in the iomap code, because
433 ++ * it first calls our callback btrfs_dio_iomap_begin(), which will create
434 ++ * an ordered extent, and after that it will fault in the pages that the
435 ++ * iov_iter refers to. During the fault in we end up in the readahead
436 ++ * pages code (starting at btrfs_readahead()), which will lock the range,
437 ++ * find that ordered extent and then wait for it to complete (at
438 ++ * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
439 ++ * obviously the ordered extent can never complete as we didn't submit
440 ++ * yet the respective bio(s). This always happens when the buffer is
441 ++ * memory mapped to the same file range, since the iomap DIO code always
442 ++ * invalidates pages in the target file range (after starting and waiting
443 ++ * for any writeback).
444 ++ *
445 ++ * So here we disable page faults in the iov_iter and then retry if we
446 ++ * got -EFAULT, faulting in the pages before the retry.
447 ++ */
448 ++again:
449 ++ from->nofault = true;
450 ++ err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
451 ++ IOMAP_DIO_PARTIAL, written);
452 ++ from->nofault = false;
453 +
454 +- if (IS_ERR_OR_NULL(dio)) {
455 +- err = PTR_ERR_OR_ZERO(dio);
456 +- if (err < 0 && err != -ENOTBLK)
457 +- goto out;
458 +- } else {
459 +- written = iomap_dio_complete(dio);
460 ++ /* No increment (+=) because iomap returns a cumulative value. */
461 ++ if (err > 0)
462 ++ written = err;
463 ++
464 ++ if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
465 ++ const size_t left = iov_iter_count(from);
466 ++ /*
467 ++ * We have more data left to write. Try to fault in as many as
468 ++ * possible of the remainder pages and retry. We do this without
469 ++ * releasing and locking again the inode, to prevent races with
470 ++ * truncate.
471 ++ *
472 ++ * Also, in case the iov refers to pages in the file range of the
473 ++ * file we want to write to (due to a mmap), we could enter an
474 ++ * infinite loop if we retry after faulting the pages in, since
475 ++ * iomap will invalidate any pages in the range early on, before
476 ++ * it tries to fault in the pages of the iov. So we keep track of
477 ++ * how much was left of iov in the previous EFAULT and fallback
478 ++ * to buffered IO in case we haven't made any progress.
479 ++ */
480 ++ if (left == prev_left) {
481 ++ err = -ENOTBLK;
482 ++ } else {
483 ++ fault_in_iov_iter_readable(from, left);
484 ++ prev_left = left;
485 ++ goto again;
486 ++ }
487 + }
488 +
489 +- if (written < 0 || !iov_iter_count(from)) {
490 +- err = written;
491 ++ btrfs_inode_unlock(inode, ilock_flags);
492 ++
493 ++ /*
494 ++ * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
495 ++ * the fsync (call generic_write_sync()).
496 ++ */
497 ++ if (is_sync_write)
498 ++ iocb->ki_flags |= IOCB_DSYNC;
499 ++
500 ++ /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
501 ++ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
502 + goto out;
503 +- }
504 +
505 + buffered:
506 + pos = iocb->ki_pos;
507 +@@ -1996,7 +2054,7 @@ buffered:
508 + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
509 + endbyte >> PAGE_SHIFT);
510 + out:
511 +- return written ? written : err;
512 ++ return err < 0 ? err : written;
513 + }
514 +
515 + static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
516 +@@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
517 + static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
518 + {
519 + struct inode *inode = file_inode(iocb->ki_filp);
520 ++ size_t prev_left = 0;
521 ++ ssize_t read = 0;
522 + ssize_t ret;
523 +
524 + if (fsverity_active(inode))
525 +@@ -3668,9 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
526 + return 0;
527 +
528 + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
529 +- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
530 ++again:
531 ++ /*
532 ++ * This is similar to what we do for direct IO writes, see the comment
533 ++ * at btrfs_direct_write(), but we also disable page faults in addition
534 ++ * to disabling them only at the iov_iter level. This is because when
535 ++ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
536 ++ * which can still trigger page fault ins despite having set ->nofault
537 ++ * to true of our 'to' iov_iter.
538 ++ *
539 ++ * The difference to direct IO writes is that we deadlock when trying
540 ++ * to lock the extent range in the inode's tree during he page reads
541 ++ * triggered by the fault in (while for writes it is due to waiting for
542 ++ * our own ordered extent). This is because for direct IO reads,
543 ++ * btrfs_dio_iomap_begin() returns with the extent range locked, which
544 ++ * is only unlocked in the endio callback (end_bio_extent_readpage()).
545 ++ */
546 ++ pagefault_disable();
547 ++ to->nofault = true;
548 ++ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
549 ++ IOMAP_DIO_PARTIAL, read);
550 ++ to->nofault = false;
551 ++ pagefault_enable();
552 ++
553 ++ /* No increment (+=) because iomap returns a cumulative value. */
554 ++ if (ret > 0)
555 ++ read = ret;
556 ++
557 ++ if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
558 ++ const size_t left = iov_iter_count(to);
559 ++
560 ++ if (left == prev_left) {
561 ++ /*
562 ++ * We didn't make any progress since the last attempt,
563 ++ * fallback to a buffered read for the remainder of the
564 ++ * range. This is just to avoid any possibility of looping
565 ++ * for too long.
566 ++ */
567 ++ ret = read;
568 ++ } else {
569 ++ /*
570 ++ * We made some progress since the last retry or this is
571 ++ * the first time we are retrying. Fault in as many pages
572 ++ * as possible and retry.
573 ++ */
574 ++ fault_in_iov_iter_writeable(to, left);
575 ++ prev_left = left;
576 ++ goto again;
577 ++ }
578 ++ }
579 + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
580 +- return ret;
581 ++ return ret < 0 ? ret : read;
582 + }
583 +
584 + static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
585 +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
586 +index 6266a706bff7d..044d584c3467c 100644
587 +--- a/fs/btrfs/inode.c
588 ++++ b/fs/btrfs/inode.c
589 +@@ -7961,6 +7961,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
590 + }
591 +
592 + len = min(len, em->len - (start - em->start));
593 ++
594 ++ /*
595 ++ * If we have a NOWAIT request and the range contains multiple extents
596 ++ * (or a mix of extents and holes), then we return -EAGAIN to make the
597 ++ * caller fallback to a context where it can do a blocking (without
598 ++ * NOWAIT) request. This way we avoid doing partial IO and returning
599 ++ * success to the caller, which is not optimal for writes and for reads
600 ++ * it can result in unexpected behaviour for an application.
601 ++ *
602 ++ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
603 ++ * iomap_dio_rw(), we can end up returning less data then what the caller
604 ++ * asked for, resulting in an unexpected, and incorrect, short read.
605 ++ * That is, the caller asked to read N bytes and we return less than that,
606 ++ * which is wrong unless we are crossing EOF. This happens if we get a
607 ++ * page fault error when trying to fault in pages for the buffer that is
608 ++ * associated to the struct iov_iter passed to iomap_dio_rw(), and we
609 ++ * have previously submitted bios for other extents in the range, in
610 ++ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
611 ++ * those bios have completed by the time we get the page fault error,
612 ++ * which we return back to our caller - we should only return EIOCBQUEUED
613 ++ * after we have submitted bios for all the extents in the range.
614 ++ */
615 ++ if ((flags & IOMAP_NOWAIT) && len < length) {
616 ++ free_extent_map(em);
617 ++ ret = -EAGAIN;
618 ++ goto unlock_err;
619 ++ }
620 ++
621 + if (write) {
622 + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
623 + start, len);
624 +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
625 +index 6a863b3f6de03..bf53af8694f8e 100644
626 +--- a/fs/btrfs/ioctl.c
627 ++++ b/fs/btrfs/ioctl.c
628 +@@ -2258,9 +2258,8 @@ static noinline int search_ioctl(struct inode *inode,
629 + key.offset = sk->min_offset;
630 +
631 + while (1) {
632 +- ret = fault_in_pages_writeable(ubuf + sk_offset,
633 +- *buf_size - sk_offset);
634 +- if (ret)
635 ++ ret = -EFAULT;
636 ++ if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
637 + break;
638 +
639 + ret = btrfs_search_forward(root, &key, path, sk->min_transid);
640 +diff --git a/fs/erofs/data.c b/fs/erofs/data.c
641 +index 9db8297156527..16a41d0db55a3 100644
642 +--- a/fs/erofs/data.c
643 ++++ b/fs/erofs/data.c
644 +@@ -287,7 +287,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
645 +
646 + if (!err)
647 + return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
648 +- NULL, 0);
649 ++ NULL, 0, 0);
650 + if (err < 0)
651 + return err;
652 + }
653 +diff --git a/fs/ext4/file.c b/fs/ext4/file.c
654 +index ac0e11bbb4450..b25c1f8f7c4f1 100644
655 +--- a/fs/ext4/file.c
656 ++++ b/fs/ext4/file.c
657 +@@ -74,7 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
658 + return generic_file_read_iter(iocb, to);
659 + }
660 +
661 +- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
662 ++ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0);
663 + inode_unlock_shared(inode);
664 +
665 + file_accessed(iocb->ki_filp);
666 +@@ -566,7 +566,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
667 + if (ilock_shared)
668 + iomap_ops = &ext4_iomap_overwrite_ops;
669 + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
670 +- (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
671 ++ (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
672 ++ 0);
673 + if (ret == -ENOTBLK)
674 + ret = 0;
675 +
676 +diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
677 +index 0e14dc41ed4e6..8ef92719c6799 100644
678 +--- a/fs/f2fs/file.c
679 ++++ b/fs/f2fs/file.c
680 +@@ -4279,7 +4279,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
681 + size_t target_size = 0;
682 + int err;
683 +
684 +- if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
685 ++ if (fault_in_iov_iter_readable(from, iov_iter_count(from)))
686 + set_inode_flag(inode, FI_NO_PREALLOC);
687 +
688 + if ((iocb->ki_flags & IOCB_NOWAIT)) {
689 +diff --git a/fs/fuse/file.c b/fs/fuse/file.c
690 +index bc50a9fa84a0c..71e9e301e569d 100644
691 +--- a/fs/fuse/file.c
692 ++++ b/fs/fuse/file.c
693 +@@ -1164,7 +1164,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
694 +
695 + again:
696 + err = -EFAULT;
697 +- if (iov_iter_fault_in_readable(ii, bytes))
698 ++ if (fault_in_iov_iter_readable(ii, bytes))
699 + break;
700 +
701 + err = -ENOMEM;
702 +diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
703 +index bb9014ced702a..fbdb7a30470a3 100644
704 +--- a/fs/gfs2/bmap.c
705 ++++ b/fs/gfs2/bmap.c
706 +@@ -961,46 +961,6 @@ hole_found:
707 + goto out;
708 + }
709 +
710 +-static int gfs2_write_lock(struct inode *inode)
711 +-{
712 +- struct gfs2_inode *ip = GFS2_I(inode);
713 +- struct gfs2_sbd *sdp = GFS2_SB(inode);
714 +- int error;
715 +-
716 +- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
717 +- error = gfs2_glock_nq(&ip->i_gh);
718 +- if (error)
719 +- goto out_uninit;
720 +- if (&ip->i_inode == sdp->sd_rindex) {
721 +- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
722 +-
723 +- error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
724 +- GL_NOCACHE, &m_ip->i_gh);
725 +- if (error)
726 +- goto out_unlock;
727 +- }
728 +- return 0;
729 +-
730 +-out_unlock:
731 +- gfs2_glock_dq(&ip->i_gh);
732 +-out_uninit:
733 +- gfs2_holder_uninit(&ip->i_gh);
734 +- return error;
735 +-}
736 +-
737 +-static void gfs2_write_unlock(struct inode *inode)
738 +-{
739 +- struct gfs2_inode *ip = GFS2_I(inode);
740 +- struct gfs2_sbd *sdp = GFS2_SB(inode);
741 +-
742 +- if (&ip->i_inode == sdp->sd_rindex) {
743 +- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
744 +-
745 +- gfs2_glock_dq_uninit(&m_ip->i_gh);
746 +- }
747 +- gfs2_glock_dq_uninit(&ip->i_gh);
748 +-}
749 +-
750 + static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
751 + unsigned len)
752 + {
753 +@@ -1118,11 +1078,6 @@ out_qunlock:
754 + return ret;
755 + }
756 +
757 +-static inline bool gfs2_iomap_need_write_lock(unsigned flags)
758 +-{
759 +- return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT);
760 +-}
761 +-
762 + static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
763 + unsigned flags, struct iomap *iomap,
764 + struct iomap *srcmap)
765 +@@ -1135,12 +1090,6 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
766 + iomap->flags |= IOMAP_F_BUFFER_HEAD;
767 +
768 + trace_gfs2_iomap_start(ip, pos, length, flags);
769 +- if (gfs2_iomap_need_write_lock(flags)) {
770 +- ret = gfs2_write_lock(inode);
771 +- if (ret)
772 +- goto out;
773 +- }
774 +-
775 + ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
776 + if (ret)
777 + goto out_unlock;
778 +@@ -1168,10 +1117,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
779 + ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
780 +
781 + out_unlock:
782 +- if (ret && gfs2_iomap_need_write_lock(flags))
783 +- gfs2_write_unlock(inode);
784 + release_metapath(&mp);
785 +-out:
786 + trace_gfs2_iomap_end(ip, iomap, ret);
787 + return ret;
788 + }
789 +@@ -1219,15 +1165,11 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
790 + }
791 +
792 + if (unlikely(!written))
793 +- goto out_unlock;
794 ++ return 0;
795 +
796 + if (iomap->flags & IOMAP_F_SIZE_CHANGED)
797 + mark_inode_dirty(inode);
798 + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
799 +-
800 +-out_unlock:
801 +- if (gfs2_iomap_need_write_lock(flags))
802 +- gfs2_write_unlock(inode);
803 + return 0;
804 + }
805 +
806 +diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
807 +index 1c8b747072cba..247b8d95b5ef4 100644
808 +--- a/fs/gfs2/file.c
809 ++++ b/fs/gfs2/file.c
810 +@@ -777,27 +777,99 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
811 + return ret ? ret : ret1;
812 + }
813 +
814 ++static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
815 ++ size_t *prev_count,
816 ++ size_t *window_size)
817 ++{
818 ++ char __user *p = i->iov[0].iov_base + i->iov_offset;
819 ++ size_t count = iov_iter_count(i);
820 ++ int pages = 1;
821 ++
822 ++ if (likely(!count))
823 ++ return false;
824 ++ if (ret <= 0 && ret != -EFAULT)
825 ++ return false;
826 ++ if (!iter_is_iovec(i))
827 ++ return false;
828 ++
829 ++ if (*prev_count != count || !*window_size) {
830 ++ int pages, nr_dirtied;
831 ++
832 ++ pages = min_t(int, BIO_MAX_VECS,
833 ++ DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE));
834 ++ nr_dirtied = max(current->nr_dirtied_pause -
835 ++ current->nr_dirtied, 1);
836 ++ pages = min(pages, nr_dirtied);
837 ++ }
838 ++
839 ++ *prev_count = count;
840 ++ *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p);
841 ++ return true;
842 ++}
843 ++
844 + static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
845 + struct gfs2_holder *gh)
846 + {
847 + struct file *file = iocb->ki_filp;
848 + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
849 +- size_t count = iov_iter_count(to);
850 ++ size_t prev_count = 0, window_size = 0;
851 ++ size_t written = 0;
852 + ssize_t ret;
853 +
854 +- if (!count)
855 ++ /*
856 ++ * In this function, we disable page faults when we're holding the
857 ++ * inode glock while doing I/O. If a page fault occurs, we indicate
858 ++ * that the inode glock may be dropped, fault in the pages manually,
859 ++ * and retry.
860 ++ *
861 ++ * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
862 ++ * physical as well as manual page faults, and we need to disable both
863 ++ * kinds.
864 ++ *
865 ++ * For direct I/O, gfs2 takes the inode glock in deferred mode. This
866 ++ * locking mode is compatible with other deferred holders, so multiple
867 ++ * processes and nodes can do direct I/O to a file at the same time.
868 ++ * There's no guarantee that reads or writes will be atomic. Any
869 ++ * coordination among readers and writers needs to happen externally.
870 ++ */
871 ++
872 ++ if (!iov_iter_count(to))
873 + return 0; /* skip atime */
874 +
875 + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
876 ++retry:
877 + ret = gfs2_glock_nq(gh);
878 + if (ret)
879 + goto out_uninit;
880 ++retry_under_glock:
881 ++ pagefault_disable();
882 ++ to->nofault = true;
883 ++ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
884 ++ IOMAP_DIO_PARTIAL, written);
885 ++ to->nofault = false;
886 ++ pagefault_enable();
887 ++ if (ret > 0)
888 ++ written = ret;
889 ++
890 ++ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
891 ++ size_t leftover;
892 +
893 +- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
894 +- gfs2_glock_dq(gh);
895 ++ gfs2_holder_allow_demote(gh);
896 ++ leftover = fault_in_iov_iter_writeable(to, window_size);
897 ++ gfs2_holder_disallow_demote(gh);
898 ++ if (leftover != window_size) {
899 ++ if (!gfs2_holder_queued(gh))
900 ++ goto retry;
901 ++ goto retry_under_glock;
902 ++ }
903 ++ }
904 ++ if (gfs2_holder_queued(gh))
905 ++ gfs2_glock_dq(gh);
906 + out_uninit:
907 + gfs2_holder_uninit(gh);
908 +- return ret;
909 ++ if (ret < 0)
910 ++ return ret;
911 ++ return written;
912 + }
913 +
914 + static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
915 +@@ -806,10 +878,20 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
916 + struct file *file = iocb->ki_filp;
917 + struct inode *inode = file->f_mapping->host;
918 + struct gfs2_inode *ip = GFS2_I(inode);
919 +- size_t len = iov_iter_count(from);
920 +- loff_t offset = iocb->ki_pos;
921 ++ size_t prev_count = 0, window_size = 0;
922 ++ size_t read = 0;
923 + ssize_t ret;
924 +
925 ++ /*
926 ++ * In this function, we disable page faults when we're holding the
927 ++ * inode glock while doing I/O. If a page fault occurs, we indicate
928 ++ * that the inode glock may be dropped, fault in the pages manually,
929 ++ * and retry.
930 ++ *
931 ++ * For writes, iomap_dio_rw only triggers manual page faults, so we
932 ++ * don't need to disable physical ones.
933 ++ */
934 ++
935 + /*
936 + * Deferred lock, even if its a write, since we do no allocation on
937 + * this path. All we need to change is the atime, and this lock mode
938 +@@ -819,31 +901,62 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
939 + * VFS does.
940 + */
941 + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
942 ++retry:
943 + ret = gfs2_glock_nq(gh);
944 + if (ret)
945 + goto out_uninit;
946 +-
947 ++retry_under_glock:
948 + /* Silently fall back to buffered I/O when writing beyond EOF */
949 +- if (offset + len > i_size_read(&ip->i_inode))
950 ++ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
951 + goto out;
952 +
953 +- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
954 ++ from->nofault = true;
955 ++ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
956 ++ IOMAP_DIO_PARTIAL, read);
957 ++ from->nofault = false;
958 ++
959 + if (ret == -ENOTBLK)
960 + ret = 0;
961 ++ if (ret > 0)
962 ++ read = ret;
963 ++
964 ++ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
965 ++ size_t leftover;
966 ++
967 ++ gfs2_holder_allow_demote(gh);
968 ++ leftover = fault_in_iov_iter_readable(from, window_size);
969 ++ gfs2_holder_disallow_demote(gh);
970 ++ if (leftover != window_size) {
971 ++ if (!gfs2_holder_queued(gh))
972 ++ goto retry;
973 ++ goto retry_under_glock;
974 ++ }
975 ++ }
976 + out:
977 +- gfs2_glock_dq(gh);
978 ++ if (gfs2_holder_queued(gh))
979 ++ gfs2_glock_dq(gh);
980 + out_uninit:
981 + gfs2_holder_uninit(gh);
982 +- return ret;
983 ++ if (ret < 0)
984 ++ return ret;
985 ++ return read;
986 + }
987 +
988 + static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
989 + {
990 + struct gfs2_inode *ip;
991 + struct gfs2_holder gh;
992 ++ size_t prev_count = 0, window_size = 0;
993 + size_t written = 0;
994 + ssize_t ret;
995 +
996 ++ /*
997 ++ * In this function, we disable page faults when we're holding the
998 ++ * inode glock while doing I/O. If a page fault occurs, we indicate
999 ++ * that the inode glock may be dropped, fault in the pages manually,
1000 ++ * and retry.
1001 ++ */
1002 ++
1003 + if (iocb->ki_flags & IOCB_DIRECT) {
1004 + ret = gfs2_file_direct_read(iocb, to, &gh);
1005 + if (likely(ret != -ENOTBLK))
1006 +@@ -865,18 +978,118 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1007 + }
1008 + ip = GFS2_I(iocb->ki_filp->f_mapping->host);
1009 + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
1010 ++retry:
1011 + ret = gfs2_glock_nq(&gh);
1012 + if (ret)
1013 + goto out_uninit;
1014 ++retry_under_glock:
1015 ++ pagefault_disable();
1016 + ret = generic_file_read_iter(iocb, to);
1017 ++ pagefault_enable();
1018 + if (ret > 0)
1019 + written += ret;
1020 +- gfs2_glock_dq(&gh);
1021 ++
1022 ++ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
1023 ++ size_t leftover;
1024 ++
1025 ++ gfs2_holder_allow_demote(&gh);
1026 ++ leftover = fault_in_iov_iter_writeable(to, window_size);
1027 ++ gfs2_holder_disallow_demote(&gh);
1028 ++ if (leftover != window_size) {
1029 ++ if (!gfs2_holder_queued(&gh)) {
1030 ++ if (written)
1031 ++ goto out_uninit;
1032 ++ goto retry;
1033 ++ }
1034 ++ goto retry_under_glock;
1035 ++ }
1036 ++ }
1037 ++ if (gfs2_holder_queued(&gh))
1038 ++ gfs2_glock_dq(&gh);
1039 + out_uninit:
1040 + gfs2_holder_uninit(&gh);
1041 + return written ? written : ret;
1042 + }
1043 +
1044 ++static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
1045 ++ struct iov_iter *from,
1046 ++ struct gfs2_holder *gh)
1047 ++{
1048 ++ struct file *file = iocb->ki_filp;
1049 ++ struct inode *inode = file_inode(file);
1050 ++ struct gfs2_inode *ip = GFS2_I(inode);
1051 ++ struct gfs2_sbd *sdp = GFS2_SB(inode);
1052 ++ struct gfs2_holder *statfs_gh = NULL;
1053 ++ size_t prev_count = 0, window_size = 0;
1054 ++ size_t read = 0;
1055 ++ ssize_t ret;
1056 ++
1057 ++ /*
1058 ++ * In this function, we disable page faults when we're holding the
1059 ++ * inode glock while doing I/O. If a page fault occurs, we indicate
1060 ++ * that the inode glock may be dropped, fault in the pages manually,
1061 ++ * and retry.
1062 ++ */
1063 ++
1064 ++ if (inode == sdp->sd_rindex) {
1065 ++ statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
1066 ++ if (!statfs_gh)
1067 ++ return -ENOMEM;
1068 ++ }
1069 ++
1070 ++ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
1071 ++retry:
1072 ++ ret = gfs2_glock_nq(gh);
1073 ++ if (ret)
1074 ++ goto out_uninit;
1075 ++retry_under_glock:
1076 ++ if (inode == sdp->sd_rindex) {
1077 ++ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
1078 ++
1079 ++ ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
1080 ++ GL_NOCACHE, statfs_gh);
1081 ++ if (ret)
1082 ++ goto out_unlock;
1083 ++ }
1084 ++
1085 ++ current->backing_dev_info = inode_to_bdi(inode);
1086 ++ pagefault_disable();
1087 ++ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
1088 ++ pagefault_enable();
1089 ++ current->backing_dev_info = NULL;
1090 ++ if (ret > 0) {
1091 ++ iocb->ki_pos += ret;
1092 ++ read += ret;
1093 ++ }
1094 ++
1095 ++ if (inode == sdp->sd_rindex)
1096 ++ gfs2_glock_dq_uninit(statfs_gh);
1097 ++
1098 ++ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
1099 ++ size_t leftover;
1100 ++
1101 ++ gfs2_holder_allow_demote(gh);
1102 ++ leftover = fault_in_iov_iter_readable(from, window_size);
1103 ++ gfs2_holder_disallow_demote(gh);
1104 ++ if (leftover != window_size) {
1105 ++ if (!gfs2_holder_queued(gh)) {
1106 ++ if (read)
1107 ++ goto out_uninit;
1108 ++ goto retry;
1109 ++ }
1110 ++ goto retry_under_glock;
1111 ++ }
1112 ++ }
1113 ++out_unlock:
1114 ++ if (gfs2_holder_queued(gh))
1115 ++ gfs2_glock_dq(gh);
1116 ++out_uninit:
1117 ++ gfs2_holder_uninit(gh);
1118 ++ if (statfs_gh)
1119 ++ kfree(statfs_gh);
1120 ++ return read ? read : ret;
1121 ++}
1122 ++
1123 + /**
1124 + * gfs2_file_write_iter - Perform a write to a file
1125 + * @iocb: The io context
1126 +@@ -928,9 +1141,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1127 + goto out_unlock;
1128 +
1129 + iocb->ki_flags |= IOCB_DSYNC;
1130 +- current->backing_dev_info = inode_to_bdi(inode);
1131 +- buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
1132 +- current->backing_dev_info = NULL;
1133 ++ buffered = gfs2_file_buffered_write(iocb, from, &gh);
1134 + if (unlikely(buffered <= 0)) {
1135 + if (!ret)
1136 + ret = buffered;
1137 +@@ -944,7 +1155,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1138 + * the direct I/O range as we don't know if the buffered pages
1139 + * made it to disk.
1140 + */
1141 +- iocb->ki_pos += buffered;
1142 + ret2 = generic_write_sync(iocb, buffered);
1143 + invalidate_mapping_pages(mapping,
1144 + (iocb->ki_pos - buffered) >> PAGE_SHIFT,
1145 +@@ -952,13 +1162,9 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1146 + if (!ret || ret2 > 0)
1147 + ret += ret2;
1148 + } else {
1149 +- current->backing_dev_info = inode_to_bdi(inode);
1150 +- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
1151 +- current->backing_dev_info = NULL;
1152 +- if (likely(ret > 0)) {
1153 +- iocb->ki_pos += ret;
1154 ++ ret = gfs2_file_buffered_write(iocb, from, &gh);
1155 ++ if (likely(ret > 0))
1156 + ret = generic_write_sync(iocb, ret);
1157 +- }
1158 + }
1159 +
1160 + out_unlock:
1161 +diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
1162 +index 02cd0ae98208d..e85ef6b14777d 100644
1163 +--- a/fs/gfs2/glock.c
1164 ++++ b/fs/gfs2/glock.c
1165 +@@ -58,6 +58,7 @@ struct gfs2_glock_iter {
1166 + typedef void (*glock_examiner) (struct gfs2_glock * gl);
1167 +
1168 + static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
1169 ++static void __gfs2_glock_dq(struct gfs2_holder *gh);
1170 +
1171 + static struct dentry *gfs2_root;
1172 + static struct workqueue_struct *glock_workqueue;
1173 +@@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_glock *gl)
1174 +
1175 + if (gl->gl_state == LM_ST_UNLOCKED)
1176 + return 0;
1177 ++ /*
1178 ++ * Note that demote_ok is used for the lru process of disposing of
1179 ++ * glocks. For this purpose, we don't care if the glock's holders
1180 ++ * have the HIF_MAY_DEMOTE flag set or not. If someone is using
1181 ++ * them, don't demote.
1182 ++ */
1183 + if (!list_empty(&gl->gl_holders))
1184 + return 0;
1185 + if (glops->go_demote_ok)
1186 +@@ -301,46 +308,59 @@ void gfs2_glock_put(struct gfs2_glock *gl)
1187 + }
1188 +
1189 + /**
1190 +- * may_grant - check if its ok to grant a new lock
1191 ++ * may_grant - check if it's ok to grant a new lock
1192 + * @gl: The glock
1193 ++ * @current_gh: One of the current holders of @gl
1194 + * @gh: The lock request which we wish to grant
1195 + *
1196 +- * Returns: true if its ok to grant the lock
1197 ++ * With our current compatibility rules, if a glock has one or more active
1198 ++ * holders (HIF_HOLDER flag set), any of those holders can be passed in as
1199 ++ * @current_gh; they are all the same as far as compatibility with the new @gh
1200 ++ * goes.
1201 ++ *
1202 ++ * Returns true if it's ok to grant the lock.
1203 + */
1204 +
1205 +-static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
1206 +-{
1207 +- const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list);
1208 ++static inline bool may_grant(struct gfs2_glock *gl,
1209 ++ struct gfs2_holder *current_gh,
1210 ++ struct gfs2_holder *gh)
1211 ++{
1212 ++ if (current_gh) {
1213 ++ GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, &current_gh->gh_iflags));
1214 ++
1215 ++ switch(current_gh->gh_state) {
1216 ++ case LM_ST_EXCLUSIVE:
1217 ++ /*
1218 ++ * Here we make a special exception to grant holders
1219 ++ * who agree to share the EX lock with other holders
1220 ++ * who also have the bit set. If the original holder
1221 ++ * has the LM_FLAG_NODE_SCOPE bit set, we grant more
1222 ++ * holders with the bit set.
1223 ++ */
1224 ++ return gh->gh_state == LM_ST_EXCLUSIVE &&
1225 ++ (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) &&
1226 ++ (gh->gh_flags & LM_FLAG_NODE_SCOPE);
1227 +
1228 +- if (gh != gh_head) {
1229 +- /**
1230 +- * Here we make a special exception to grant holders who agree
1231 +- * to share the EX lock with other holders who also have the
1232 +- * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit
1233 +- * is set, we grant more holders with the bit set.
1234 +- */
1235 +- if (gh_head->gh_state == LM_ST_EXCLUSIVE &&
1236 +- (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) &&
1237 +- gh->gh_state == LM_ST_EXCLUSIVE &&
1238 +- (gh->gh_flags & LM_FLAG_NODE_SCOPE))
1239 +- return 1;
1240 +- if ((gh->gh_state == LM_ST_EXCLUSIVE ||
1241 +- gh_head->gh_state == LM_ST_EXCLUSIVE))
1242 +- return 0;
1243 ++ case LM_ST_SHARED:
1244 ++ case LM_ST_DEFERRED:
1245 ++ return gh->gh_state == current_gh->gh_state;
1246 ++
1247 ++ default:
1248 ++ return false;
1249 ++ }
1250 + }
1251 ++
1252 + if (gl->gl_state == gh->gh_state)
1253 +- return 1;
1254 ++ return true;
1255 + if (gh->gh_flags & GL_EXACT)
1256 +- return 0;
1257 ++ return false;
1258 + if (gl->gl_state == LM_ST_EXCLUSIVE) {
1259 +- if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
1260 +- return 1;
1261 +- if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
1262 +- return 1;
1263 ++ return gh->gh_state == LM_ST_SHARED ||
1264 ++ gh->gh_state == LM_ST_DEFERRED;
1265 + }
1266 +- if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
1267 +- return 1;
1268 +- return 0;
1269 ++ if (gh->gh_flags & LM_FLAG_ANY)
1270 ++ return gl->gl_state != LM_ST_UNLOCKED;
1271 ++ return false;
1272 + }
1273 +
1274 + static void gfs2_holder_wake(struct gfs2_holder *gh)
1275 +@@ -366,7 +386,7 @@ static void do_error(struct gfs2_glock *gl, const int ret)
1276 + struct gfs2_holder *gh, *tmp;
1277 +
1278 + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
1279 +- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1280 ++ if (!test_bit(HIF_WAIT, &gh->gh_iflags))
1281 + continue;
1282 + if (ret & LM_OUT_ERROR)
1283 + gh->gh_error = -EIO;
1284 +@@ -380,6 +400,78 @@ static void do_error(struct gfs2_glock *gl, const int ret)
1285 + }
1286 + }
1287 +
1288 ++/**
1289 ++ * demote_incompat_holders - demote incompatible demoteable holders
1290 ++ * @gl: the glock we want to promote
1291 ++ * @new_gh: the new holder to be promoted
1292 ++ */
1293 ++static void demote_incompat_holders(struct gfs2_glock *gl,
1294 ++ struct gfs2_holder *new_gh)
1295 ++{
1296 ++ struct gfs2_holder *gh;
1297 ++
1298 ++ /*
1299 ++ * Demote incompatible holders before we make ourselves eligible.
1300 ++ * (This holder may or may not allow auto-demoting, but we don't want
1301 ++ * to demote the new holder before it's even granted.)
1302 ++ */
1303 ++ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1304 ++ /*
1305 ++ * Since holders are at the front of the list, we stop when we
1306 ++ * find the first non-holder.
1307 ++ */
1308 ++ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
1309 ++ return;
1310 ++ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
1311 ++ !may_grant(gl, new_gh, gh)) {
1312 ++ /*
1313 ++ * We should not recurse into do_promote because
1314 ++ * __gfs2_glock_dq only calls handle_callback,
1315 ++ * gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
1316 ++ */
1317 ++ __gfs2_glock_dq(gh);
1318 ++ }
1319 ++ }
1320 ++}
1321 ++
1322 ++/**
1323 ++ * find_first_holder - find the first "holder" gh
1324 ++ * @gl: the glock
1325 ++ */
1326 ++
1327 ++static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
1328 ++{
1329 ++ struct gfs2_holder *gh;
1330 ++
1331 ++ if (!list_empty(&gl->gl_holders)) {
1332 ++ gh = list_first_entry(&gl->gl_holders, struct gfs2_holder,
1333 ++ gh_list);
1334 ++ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1335 ++ return gh;
1336 ++ }
1337 ++ return NULL;
1338 ++}
1339 ++
1340 ++/**
1341 ++ * find_first_strong_holder - find the first non-demoteable holder
1342 ++ * @gl: the glock
1343 ++ *
1344 ++ * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
1345 ++ */
1346 ++static inline struct gfs2_holder *
1347 ++find_first_strong_holder(struct gfs2_glock *gl)
1348 ++{
1349 ++ struct gfs2_holder *gh;
1350 ++
1351 ++ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1352 ++ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
1353 ++ return NULL;
1354 ++ if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
1355 ++ return gh;
1356 ++ }
1357 ++ return NULL;
1358 ++}
1359 ++
1360 + /**
1361 + * do_promote - promote as many requests as possible on the current queue
1362 + * @gl: The glock
1363 +@@ -393,14 +485,21 @@ __releases(&gl->gl_lockref.lock)
1364 + __acquires(&gl->gl_lockref.lock)
1365 + {
1366 + const struct gfs2_glock_operations *glops = gl->gl_ops;
1367 +- struct gfs2_holder *gh, *tmp;
1368 ++ struct gfs2_holder *gh, *tmp, *first_gh;
1369 ++ bool incompat_holders_demoted = false;
1370 + int ret;
1371 +
1372 + restart:
1373 ++ first_gh = find_first_strong_holder(gl);
1374 + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
1375 +- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1376 ++ if (!test_bit(HIF_WAIT, &gh->gh_iflags))
1377 + continue;
1378 +- if (may_grant(gl, gh)) {
1379 ++ if (may_grant(gl, first_gh, gh)) {
1380 ++ if (!incompat_holders_demoted) {
1381 ++ demote_incompat_holders(gl, first_gh);
1382 ++ incompat_holders_demoted = true;
1383 ++ first_gh = gh;
1384 ++ }
1385 + if (gh->gh_list.prev == &gl->gl_holders &&
1386 + glops->go_lock) {
1387 + spin_unlock(&gl->gl_lockref.lock);
1388 +@@ -426,6 +525,11 @@ restart:
1389 + gfs2_holder_wake(gh);
1390 + continue;
1391 + }
1392 ++ /*
1393 ++ * If we get here, it means we may not grant this holder for
1394 ++ * some reason. If this holder is the head of the list, it
1395 ++ * means we have a blocked holder at the head, so return 1.
1396 ++ */
1397 + if (gh->gh_list.prev == &gl->gl_holders)
1398 + return 1;
1399 + do_error(gl, 0);
1400 +@@ -722,23 +826,6 @@ out:
1401 + spin_lock(&gl->gl_lockref.lock);
1402 + }
1403 +
1404 +-/**
1405 +- * find_first_holder - find the first "holder" gh
1406 +- * @gl: the glock
1407 +- */
1408 +-
1409 +-static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
1410 +-{
1411 +- struct gfs2_holder *gh;
1412 +-
1413 +- if (!list_empty(&gl->gl_holders)) {
1414 +- gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
1415 +- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1416 +- return gh;
1417 +- }
1418 +- return NULL;
1419 +-}
1420 +-
1421 + /**
1422 + * run_queue - do all outstanding tasks related to a glock
1423 + * @gl: The glock in question
1424 +@@ -1354,15 +1441,20 @@ __acquires(&gl->gl_lockref.lock)
1425 + GLOCK_BUG_ON(gl, true);
1426 +
1427 + if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1428 +- if (test_bit(GLF_LOCK, &gl->gl_flags))
1429 +- try_futile = !may_grant(gl, gh);
1430 ++ if (test_bit(GLF_LOCK, &gl->gl_flags)) {
1431 ++ struct gfs2_holder *first_gh;
1432 ++
1433 ++ first_gh = find_first_strong_holder(gl);
1434 ++ try_futile = !may_grant(gl, first_gh, gh);
1435 ++ }
1436 + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
1437 + goto fail;
1438 + }
1439 +
1440 + list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
1441 + if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
1442 +- (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
1443 ++ (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
1444 ++ !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
1445 + goto trap_recursive;
1446 + if (try_futile &&
1447 + !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
1448 +@@ -1458,51 +1550,83 @@ int gfs2_glock_poll(struct gfs2_holder *gh)
1449 + return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
1450 + }
1451 +
1452 +-/**
1453 +- * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
1454 +- * @gh: the glock holder
1455 +- *
1456 +- */
1457 ++static inline bool needs_demote(struct gfs2_glock *gl)
1458 ++{
1459 ++ return (test_bit(GLF_DEMOTE, &gl->gl_flags) ||
1460 ++ test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags));
1461 ++}
1462 +
1463 +-void gfs2_glock_dq(struct gfs2_holder *gh)
1464 ++static void __gfs2_glock_dq(struct gfs2_holder *gh)
1465 + {
1466 + struct gfs2_glock *gl = gh->gh_gl;
1467 + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1468 + unsigned delay = 0;
1469 + int fast_path = 0;
1470 +
1471 +- spin_lock(&gl->gl_lockref.lock);
1472 + /*
1473 +- * If we're in the process of file system withdraw, we cannot just
1474 +- * dequeue any glocks until our journal is recovered, lest we
1475 +- * introduce file system corruption. We need two exceptions to this
1476 +- * rule: We need to allow unlocking of nondisk glocks and the glock
1477 +- * for our own journal that needs recovery.
1478 ++ * This while loop is similar to function demote_incompat_holders:
1479 ++ * If the glock is due to be demoted (which may be from another node
1480 ++ * or even if this holder is GL_NOCACHE), the weak holders are
1481 ++ * demoted as well, allowing the glock to be demoted.
1482 + */
1483 +- if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
1484 +- glock_blocked_by_withdraw(gl) &&
1485 +- gh->gh_gl != sdp->sd_jinode_gl) {
1486 +- sdp->sd_glock_dqs_held++;
1487 +- spin_unlock(&gl->gl_lockref.lock);
1488 +- might_sleep();
1489 +- wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
1490 +- TASK_UNINTERRUPTIBLE);
1491 +- spin_lock(&gl->gl_lockref.lock);
1492 +- }
1493 +- if (gh->gh_flags & GL_NOCACHE)
1494 +- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
1495 ++ while (gh) {
1496 ++ /*
1497 ++ * If we're in the process of file system withdraw, we cannot
1498 ++ * just dequeue any glocks until our journal is recovered, lest
1499 ++ * we introduce file system corruption. We need two exceptions
1500 ++ * to this rule: We need to allow unlocking of nondisk glocks
1501 ++ * and the glock for our own journal that needs recovery.
1502 ++ */
1503 ++ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
1504 ++ glock_blocked_by_withdraw(gl) &&
1505 ++ gh->gh_gl != sdp->sd_jinode_gl) {
1506 ++ sdp->sd_glock_dqs_held++;
1507 ++ spin_unlock(&gl->gl_lockref.lock);
1508 ++ might_sleep();
1509 ++ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
1510 ++ TASK_UNINTERRUPTIBLE);
1511 ++ spin_lock(&gl->gl_lockref.lock);
1512 ++ }
1513 +
1514 +- list_del_init(&gh->gh_list);
1515 +- clear_bit(HIF_HOLDER, &gh->gh_iflags);
1516 +- if (list_empty(&gl->gl_holders) &&
1517 +- !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
1518 +- !test_bit(GLF_DEMOTE, &gl->gl_flags))
1519 +- fast_path = 1;
1520 ++ /*
1521 ++ * This holder should not be cached, so mark it for demote.
1522 ++ * Note: this should be done before the check for needs_demote
1523 ++ * below.
1524 ++ */
1525 ++ if (gh->gh_flags & GL_NOCACHE)
1526 ++ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
1527 ++
1528 ++ list_del_init(&gh->gh_list);
1529 ++ clear_bit(HIF_HOLDER, &gh->gh_iflags);
1530 ++ trace_gfs2_glock_queue(gh, 0);
1531 ++
1532 ++ /*
1533 ++ * If there hasn't been a demote request we are done.
1534 ++ * (Let the remaining holders, if any, keep holding it.)
1535 ++ */
1536 ++ if (!needs_demote(gl)) {
1537 ++ if (list_empty(&gl->gl_holders))
1538 ++ fast_path = 1;
1539 ++ break;
1540 ++ }
1541 ++ /*
1542 ++ * If we have another strong holder (we cannot auto-demote)
1543 ++ * we are done. It keeps holding it until it is done.
1544 ++ */
1545 ++ if (find_first_strong_holder(gl))
1546 ++ break;
1547 ++
1548 ++ /*
1549 ++ * If we have a weak holder at the head of the list, it
1550 ++ * (and all others like it) must be auto-demoted. If there
1551 ++ * are no more weak holders, we exit the while loop.
1552 ++ */
1553 ++ gh = find_first_holder(gl);
1554 ++ }
1555 +
1556 + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
1557 + gfs2_glock_add_to_lru(gl);
1558 +
1559 +- trace_gfs2_glock_queue(gh, 0);
1560 + if (unlikely(!fast_path)) {
1561 + gl->gl_lockref.count++;
1562 + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
1563 +@@ -1511,6 +1635,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1564 + delay = gl->gl_hold_time;
1565 + __gfs2_glock_queue_work(gl, delay);
1566 + }
1567 ++}
1568 ++
1569 ++/**
1570 ++ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
1571 ++ * @gh: the glock holder
1572 ++ *
1573 ++ */
1574 ++void gfs2_glock_dq(struct gfs2_holder *gh)
1575 ++{
1576 ++ struct gfs2_glock *gl = gh->gh_gl;
1577 ++
1578 ++ spin_lock(&gl->gl_lockref.lock);
1579 ++ __gfs2_glock_dq(gh);
1580 + spin_unlock(&gl->gl_lockref.lock);
1581 + }
1582 +
1583 +@@ -1673,6 +1810,7 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1584 +
1585 + void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1586 + {
1587 ++ struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, };
1588 + unsigned long delay = 0;
1589 + unsigned long holdtime;
1590 + unsigned long now = jiffies;
1591 +@@ -1687,6 +1825,28 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1592 + if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1593 + delay = gl->gl_hold_time;
1594 + }
1595 ++ /*
1596 ++ * Note 1: We cannot call demote_incompat_holders from handle_callback
1597 ++ * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq ->
1598 ++ * handle_callback -> demote_incompat_holders -> gfs2_glock_dq
1599 ++ * Plus, we only want to demote the holders if the request comes from
1600 ++ * a remote cluster node because local holder conflicts are resolved
1601 ++ * elsewhere.
1602 ++ *
1603 ++ * Note 2: if a remote node wants this glock in EX mode, lock_dlm will
1604 ++ * request that we set our state to UNLOCKED. Here we mock up a holder
1605 ++ * to make it look like someone wants the lock EX locally. Any SH
1606 ++ * and DF requests should be able to share the lock without demoting.
1607 ++ *
1608 ++ * Note 3: We only want to demote the demoteable holders when there
1609 ++ * are no more strong holders. The demoteable holders might as well
1610 ++ * keep the glock until the last strong holder is done with it.
1611 ++ */
1612 ++ if (!find_first_strong_holder(gl)) {
1613 ++ if (state == LM_ST_UNLOCKED)
1614 ++ mock_gh.gh_state = LM_ST_EXCLUSIVE;
1615 ++ demote_incompat_holders(gl, &mock_gh);
1616 ++ }
1617 + handle_callback(gl, state, delay, true);
1618 + __gfs2_glock_queue_work(gl, delay);
1619 + spin_unlock(&gl->gl_lockref.lock);
1620 +@@ -2078,6 +2238,8 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
1621 + *p++ = 'H';
1622 + if (test_bit(HIF_WAIT, &iflags))
1623 + *p++ = 'W';
1624 ++ if (test_bit(HIF_MAY_DEMOTE, &iflags))
1625 ++ *p++ = 'D';
1626 + *p = 0;
1627 + return buf;
1628 + }
1629 +diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
1630 +index 31a8f2f649b52..9012487da4c69 100644
1631 +--- a/fs/gfs2/glock.h
1632 ++++ b/fs/gfs2/glock.h
1633 +@@ -150,6 +150,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
1634 + list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1635 + if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
1636 + break;
1637 ++ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
1638 ++ continue;
1639 + if (gh->gh_owner_pid == pid)
1640 + goto out;
1641 + }
1642 +@@ -325,6 +327,24 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
1643 + spin_unlock(&gl->gl_lockref.lock);
1644 + }
1645 +
1646 ++static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
1647 ++{
1648 ++ struct gfs2_glock *gl = gh->gh_gl;
1649 ++
1650 ++ spin_lock(&gl->gl_lockref.lock);
1651 ++ set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
1652 ++ spin_unlock(&gl->gl_lockref.lock);
1653 ++}
1654 ++
1655 ++static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh)
1656 ++{
1657 ++ struct gfs2_glock *gl = gh->gh_gl;
1658 ++
1659 ++ spin_lock(&gl->gl_lockref.lock);
1660 ++ clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
1661 ++ spin_unlock(&gl->gl_lockref.lock);
1662 ++}
1663 ++
1664 + extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
1665 + extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
1666 +
1667 +diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
1668 +index 0fe49770166ea..ca42d310fd4d6 100644
1669 +--- a/fs/gfs2/incore.h
1670 ++++ b/fs/gfs2/incore.h
1671 +@@ -252,6 +252,7 @@ struct gfs2_lkstats {
1672 +
1673 + enum {
1674 + /* States */
1675 ++ HIF_MAY_DEMOTE = 1,
1676 + HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
1677 + HIF_WAIT = 10,
1678 + };
1679 +@@ -386,9 +387,8 @@ struct gfs2_inode {
1680 + u64 i_generation;
1681 + u64 i_eattr;
1682 + unsigned long i_flags; /* GIF_... */
1683 +- struct gfs2_glock *i_gl; /* Move into i_gh? */
1684 ++ struct gfs2_glock *i_gl;
1685 + struct gfs2_holder i_iopen_gh;
1686 +- struct gfs2_holder i_gh; /* for prepare/commit_write only */
1687 + struct gfs2_qadata *i_qadata; /* quota allocation data */
1688 + struct gfs2_holder i_rgd_gh;
1689 + struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
1690 +diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
1691 +index 97119ec3b8503..fe10d8a30f6bd 100644
1692 +--- a/fs/iomap/buffered-io.c
1693 ++++ b/fs/iomap/buffered-io.c
1694 +@@ -757,7 +757,7 @@ again:
1695 + * same page as we're writing to, without it being marked
1696 + * up-to-date.
1697 + */
1698 +- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
1699 ++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
1700 + status = -EFAULT;
1701 + break;
1702 + }
1703 +diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
1704 +index 4ecd255e0511c..468dcbba45bcb 100644
1705 +--- a/fs/iomap/direct-io.c
1706 ++++ b/fs/iomap/direct-io.c
1707 +@@ -31,6 +31,7 @@ struct iomap_dio {
1708 + atomic_t ref;
1709 + unsigned flags;
1710 + int error;
1711 ++ size_t done_before;
1712 + bool wait_for_completion;
1713 +
1714 + union {
1715 +@@ -124,6 +125,9 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
1716 + if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
1717 + ret = generic_write_sync(iocb, ret);
1718 +
1719 ++ if (ret > 0)
1720 ++ ret += dio->done_before;
1721 ++
1722 + kfree(dio);
1723 +
1724 + return ret;
1725 +@@ -371,6 +375,8 @@ static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
1726 + loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
1727 +
1728 + dio->size += length;
1729 ++ if (!length)
1730 ++ return -EFAULT;
1731 + return length;
1732 + }
1733 +
1734 +@@ -402,6 +408,8 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
1735 + copied = copy_to_iter(inline_data, length, iter);
1736 + }
1737 + dio->size += copied;
1738 ++ if (!copied)
1739 ++ return -EFAULT;
1740 + return copied;
1741 + }
1742 +
1743 +@@ -446,13 +454,21 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
1744 + * may be pure data writes. In that case, we still need to do a full data sync
1745 + * completion.
1746 + *
1747 ++ * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
1748 ++ * __iomap_dio_rw can return a partial result if it encounters a non-resident
1749 ++ * page in @iter after preparing a transfer. In that case, the non-resident
1750 ++ * pages can be faulted in and the request resumed with @done_before set to the
1751 ++ * number of bytes previously transferred. The request will then complete with
1752 ++ * the correct total number of bytes transferred; this is essential for
1753 ++ * completing partial requests asynchronously.
1754 ++ *
1755 + * Returns -ENOTBLK in case of a page invalidation failure for
1756 + * writes. The caller needs to fall back to buffered I/O in this case.
1757 + */
1758 + struct iomap_dio *
1759 + __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1760 + const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
1761 +- unsigned int dio_flags)
1762 ++ unsigned int dio_flags, size_t done_before)
1763 + {
1764 + struct address_space *mapping = iocb->ki_filp->f_mapping;
1765 + struct inode *inode = file_inode(iocb->ki_filp);
1766 +@@ -482,6 +498,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1767 + dio->dops = dops;
1768 + dio->error = 0;
1769 + dio->flags = 0;
1770 ++ dio->done_before = done_before;
1771 +
1772 + dio->submit.iter = iter;
1773 + dio->submit.waiter = current;
1774 +@@ -577,6 +594,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1775 + if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
1776 + iov_iter_revert(iter, iomi.pos - dio->i_size);
1777 +
1778 ++ if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
1779 ++ if (!(iocb->ki_flags & IOCB_NOWAIT))
1780 ++ wait_for_completion = true;
1781 ++ ret = 0;
1782 ++ }
1783 ++
1784 + /* magic error code to fall back to buffered I/O */
1785 + if (ret == -ENOTBLK) {
1786 + wait_for_completion = true;
1787 +@@ -642,11 +665,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
1788 + ssize_t
1789 + iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1790 + const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
1791 +- unsigned int dio_flags)
1792 ++ unsigned int dio_flags, size_t done_before)
1793 + {
1794 + struct iomap_dio *dio;
1795 +
1796 +- dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags);
1797 ++ dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
1798 + if (IS_ERR_OR_NULL(dio))
1799 + return PTR_ERR_OR_ZERO(dio);
1800 + return iomap_dio_complete(dio);
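Illustrative aside (not part of the patch): the new done_before argument, together with IOMAP_DIO_PARTIAL, lets a filesystem resume a direct I/O request after faulting in the user buffer with the reworked fault_in helpers elsewhere in this patch. A minimal sketch of the caller pattern follows, assuming a hypothetical example_dio_write() and ignoring the pagefault_disable()/locking details a real filesystem needs.

static ssize_t example_dio_write(struct kiocb *iocb, struct iov_iter *from,
				 const struct iomap_ops *ops)
{
	size_t written = 0;
	ssize_t ret;

retry:
	/* With IOMAP_DIO_PARTIAL, hitting a non-resident page ends the
	 * request early with a partial result instead of failing it. */
	ret = iomap_dio_rw(iocb, from, ops, NULL, IOMAP_DIO_PARTIAL, written);
	if (ret > 0)
		written = ret;	/* includes done_before, so this is a total */

	if (ret == -EFAULT || (ret > 0 && iov_iter_count(from))) {
		size_t len = iov_iter_count(from);

		/* fault_in_iov_iter_readable() returns the number of bytes
		 * it could not fault in; retry if it made any progress. */
		if (fault_in_iov_iter_readable(from, len) != len)
			goto retry;
	}
	return written ? written : ret;
}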
1801 +diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
1802 +index ab4f3362466d0..a43adeacd930c 100644
1803 +--- a/fs/ntfs/file.c
1804 ++++ b/fs/ntfs/file.c
1805 +@@ -1829,7 +1829,7 @@ again:
1806 + * pages being swapped out between us bringing them into memory
1807 + * and doing the actual copying.
1808 + */
1809 +- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
1810 ++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
1811 + status = -EFAULT;
1812 + break;
1813 + }
1814 +diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
1815 +index 43b1451bff539..54b9599640ef4 100644
1816 +--- a/fs/ntfs3/file.c
1817 ++++ b/fs/ntfs3/file.c
1818 +@@ -989,7 +989,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
1819 + frame_vbo = pos & ~(frame_size - 1);
1820 + index = frame_vbo >> PAGE_SHIFT;
1821 +
1822 +- if (unlikely(iov_iter_fault_in_readable(from, bytes))) {
1823 ++ if (unlikely(fault_in_iov_iter_readable(from, bytes))) {
1824 + err = -EFAULT;
1825 + goto out;
1826 + }
1827 +diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
1828 +index 7aa943edfc02f..240eb932c014b 100644
1829 +--- a/fs/xfs/xfs_file.c
1830 ++++ b/fs/xfs/xfs_file.c
1831 +@@ -259,7 +259,7 @@ xfs_file_dio_read(
1832 + ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
1833 + if (ret)
1834 + return ret;
1835 +- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
1836 ++ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0);
1837 + xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1838 +
1839 + return ret;
1840 +@@ -569,7 +569,7 @@ xfs_file_dio_write_aligned(
1841 + }
1842 + trace_xfs_file_direct_write(iocb, from);
1843 + ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
1844 +- &xfs_dio_write_ops, 0);
1845 ++ &xfs_dio_write_ops, 0, 0);
1846 + out_unlock:
1847 + if (iolock)
1848 + xfs_iunlock(ip, iolock);
1849 +@@ -647,7 +647,7 @@ retry_exclusive:
1850 +
1851 + trace_xfs_file_direct_write(iocb, from);
1852 + ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
1853 +- &xfs_dio_write_ops, flags);
1854 ++ &xfs_dio_write_ops, flags, 0);
1855 +
1856 + /*
1857 + * Retry unaligned I/O with exclusive blocking semantics if the DIO
1858 +diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
1859 +index 807f33553a8eb..bced33b76beac 100644
1860 +--- a/fs/zonefs/super.c
1861 ++++ b/fs/zonefs/super.c
1862 +@@ -852,7 +852,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
1863 + ret = zonefs_file_dio_append(iocb, from);
1864 + else
1865 + ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
1866 +- &zonefs_write_dio_ops, 0);
1867 ++ &zonefs_write_dio_ops, 0, 0);
1868 + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
1869 + (ret > 0 || ret == -EIOCBQUEUED)) {
1870 + if (ret > 0)
1871 +@@ -987,7 +987,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1872 + }
1873 + file_accessed(iocb->ki_filp);
1874 + ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
1875 +- &zonefs_read_dio_ops, 0);
1876 ++ &zonefs_read_dio_ops, 0, 0);
1877 + } else {
1878 + ret = generic_file_read_iter(iocb, to);
1879 + if (ret == -EIO)
1880 +diff --git a/include/linux/bpf.h b/include/linux/bpf.h
1881 +index 15b690a0cecb0..c5c4b6f09e230 100644
1882 +--- a/include/linux/bpf.h
1883 ++++ b/include/linux/bpf.h
1884 +@@ -293,6 +293,34 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
1885 +
1886 + extern const struct bpf_map_ops bpf_map_offload_ops;
1887 +
1888 ++/* bpf_type_flag contains a set of flags that are applicable to the values of
1889 ++ * arg_type, ret_type and reg_type. For example, a pointer value may be null,
1890 ++ * or a memory is read-only. We classify types into two categories: base types
1891 ++ * and extended types. Extended types are base types combined with a type flag.
1892 ++ *
1893 ++ * Currently there are no more than 32 base types in arg_type, ret_type and
1894 ++ * reg_types.
1895 ++ */
1896 ++#define BPF_BASE_TYPE_BITS 8
1897 ++
1898 ++enum bpf_type_flag {
1899 ++ /* PTR may be NULL. */
1900 ++ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS),
1901 ++
1902 ++ /* MEM is read-only. When applied on bpf_arg, it indicates the arg is
1903 ++ * compatible with both mutable and immutable memory.
1904 ++ */
1905 ++ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS),
1906 ++
1907 ++ __BPF_TYPE_LAST_FLAG = MEM_RDONLY,
1908 ++};
1909 ++
1910 ++/* Max number of base types. */
1911 ++#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS)
1912 ++
1913 ++/* Max number of all types. */
1914 ++#define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1))
1915 ++
1916 + /* function argument constraints */
1917 + enum bpf_arg_type {
1918 + ARG_DONTCARE = 0, /* unused argument in helper function */
1919 +@@ -304,13 +332,11 @@ enum bpf_arg_type {
1920 + ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */
1921 + ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */
1922 + ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */
1923 +- ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */
1924 +
1925 + /* the following constraints used to prototype bpf_memcmp() and other
1926 + * functions that access data on eBPF program stack
1927 + */
1928 + ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */
1929 +- ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */
1930 + ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized,
1931 + * helper function must fill all bytes or clear
1932 + * them in error case.
1933 +@@ -320,42 +346,65 @@ enum bpf_arg_type {
1934 + ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */
1935 +
1936 + ARG_PTR_TO_CTX, /* pointer to context */
1937 +- ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */
1938 + ARG_ANYTHING, /* any (initialized) argument is ok */
1939 + ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */
1940 + ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */
1941 + ARG_PTR_TO_INT, /* pointer to int */
1942 + ARG_PTR_TO_LONG, /* pointer to long */
1943 + ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */
1944 +- ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */
1945 + ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */
1946 + ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */
1947 +- ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */
1948 + ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */
1949 + ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
1950 + ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */
1951 + ARG_PTR_TO_FUNC, /* pointer to a bpf program function */
1952 +- ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */
1953 ++ ARG_PTR_TO_STACK, /* pointer to stack */
1954 + ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */
1955 + ARG_PTR_TO_TIMER, /* pointer to bpf_timer */
1956 + __BPF_ARG_TYPE_MAX,
1957 ++
1958 ++ /* Extended arg_types. */
1959 ++ ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE,
1960 ++ ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM,
1961 ++ ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX,
1962 ++ ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET,
1963 ++ ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM,
1964 ++ ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK,
1965 ++
1966 ++ /* This must be the last entry. Its purpose is to ensure the enum is
1967 ++ * wide enough to hold the higher bits reserved for bpf_type_flag.
1968 ++ */
1969 ++ __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT,
1970 + };
1971 ++static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
1972 +
1973 + /* type of values returned from helper functions */
1974 + enum bpf_return_type {
1975 + RET_INTEGER, /* function returns integer */
1976 + RET_VOID, /* function doesn't return anything */
1977 + RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */
1978 +- RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */
1979 +- RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */
1980 +- RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */
1981 +- RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */
1982 +- RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */
1983 +- RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */
1984 +- RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */
1985 ++ RET_PTR_TO_SOCKET, /* returns a pointer to a socket */
1986 ++ RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */
1987 ++ RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */
1988 ++ RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */
1989 + RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */
1990 + RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */
1991 ++ __BPF_RET_TYPE_MAX,
1992 ++
1993 ++ /* Extended ret_types. */
1994 ++ RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE,
1995 ++ RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET,
1996 ++ RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK,
1997 ++ RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON,
1998 ++ RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM,
1999 ++ RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID,
2000 ++
2001 ++ /* This must be the last entry. Its purpose is to ensure the enum is
2002 ++ * wide enough to hold the higher bits reserved for bpf_type_flag.
2003 ++ */
2004 ++ __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT,
2005 + };
2006 ++static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
2007 +
2008 + /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
2009 + * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
2010 +@@ -417,18 +466,15 @@ enum bpf_reg_type {
2011 + PTR_TO_CTX, /* reg points to bpf_context */
2012 + CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
2013 + PTR_TO_MAP_VALUE, /* reg points to map element value */
2014 +- PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
2015 ++ PTR_TO_MAP_KEY, /* reg points to a map element key */
2016 + PTR_TO_STACK, /* reg == frame_pointer + offset */
2017 + PTR_TO_PACKET_META, /* skb->data - meta_len */
2018 + PTR_TO_PACKET, /* reg points to skb->data */
2019 + PTR_TO_PACKET_END, /* skb->data + headlen */
2020 + PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */
2021 + PTR_TO_SOCKET, /* reg points to struct bpf_sock */
2022 +- PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */
2023 + PTR_TO_SOCK_COMMON, /* reg points to sock_common */
2024 +- PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
2025 + PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */
2026 +- PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
2027 + PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */
2028 + PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */
2029 + /* PTR_TO_BTF_ID points to a kernel struct that does not need
2030 +@@ -446,18 +492,25 @@ enum bpf_reg_type {
2031 + * been checked for null. Used primarily to inform the verifier
2032 + * an explicit null check is required for this struct.
2033 + */
2034 +- PTR_TO_BTF_ID_OR_NULL,
2035 + PTR_TO_MEM, /* reg points to valid memory region */
2036 +- PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */
2037 +- PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */
2038 +- PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */
2039 +- PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */
2040 +- PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
2041 ++ PTR_TO_BUF, /* reg points to a read/write buffer */
2042 + PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */
2043 + PTR_TO_FUNC, /* reg points to a bpf program function */
2044 +- PTR_TO_MAP_KEY, /* reg points to a map element key */
2045 + __BPF_REG_TYPE_MAX,
2046 ++
2047 ++ /* Extended reg_types. */
2048 ++ PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE,
2049 ++ PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET,
2050 ++ PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON,
2051 ++ PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK,
2052 ++ PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID,
2053 ++
2054 ++ /* This must be the last entry. Its purpose is to ensure the enum is
2055 ++ * wide enough to hold the higher bits reserved for bpf_type_flag.
2056 ++ */
2057 ++ __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT,
2058 + };
2059 ++static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
2060 +
2061 + /* The information passed from prog-specific *_is_valid_access
2062 + * back to the verifier.
2063 +diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
2064 +index 364550dd19c4a..bb1cc3fbc4bab 100644
2065 +--- a/include/linux/bpf_verifier.h
2066 ++++ b/include/linux/bpf_verifier.h
2067 +@@ -18,6 +18,8 @@
2068 + * that converting umax_value to int cannot overflow.
2069 + */
2070 + #define BPF_MAX_VAR_SIZ (1 << 29)
2071 ++/* size of type_str_buf in bpf_verifier. */
2072 ++#define TYPE_STR_BUF_LEN 64
2073 +
2074 + /* Liveness marks, used for registers and spilled-regs (in stack slots).
2075 + * Read marks propagate upwards until they find a write mark; they record that
2076 +@@ -474,6 +476,8 @@ struct bpf_verifier_env {
2077 + /* longest register parentage chain walked for liveness marking */
2078 + u32 longest_mark_read_walk;
2079 + bpfptr_t fd_array;
2080 ++ /* buffer used in reg_type_str() to generate reg_type string */
2081 ++ char type_str_buf[TYPE_STR_BUF_LEN];
2082 + };
2083 +
2084 + __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
2085 +@@ -535,4 +539,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
2086 + u32 btf_id,
2087 + struct bpf_attach_target_info *tgt_info);
2088 +
2089 ++#define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0)
2090 ++
2091 ++/* extract base type from bpf_{arg, return, reg}_type. */
2092 ++static inline u32 base_type(u32 type)
2093 ++{
2094 ++ return type & BPF_BASE_TYPE_MASK;
2095 ++}
2096 ++
2097 ++/* extract flags from an extended type. See bpf_type_flag in bpf.h. */
2098 ++static inline u32 type_flag(u32 type)
2099 ++{
2100 ++ return type & ~BPF_BASE_TYPE_MASK;
2101 ++}
2102 ++
2103 + #endif /* _LINUX_BPF_VERIFIER_H */
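Illustrative aside (not part of the patch): with BPF_BASE_TYPE_BITS == 8, the low eight bits of an arg/ret/reg type hold the base type and the flags live above them, which is all base_type() and type_flag() extract. A small stand-alone sketch of the decomposition, using a made-up numeric value in place of the real enum constant:

#include <stdio.h>
#include <stdint.h>

#define BPF_BASE_TYPE_BITS	8
#define BPF_BASE_TYPE_MASK	((1u << BPF_BASE_TYPE_BITS) - 1)
#define PTR_MAYBE_NULL		(1u << (0 + BPF_BASE_TYPE_BITS))
#define MEM_RDONLY		(1u << (1 + BPF_BASE_TYPE_BITS))

/* Mirrors base_type()/type_flag() from the bpf_verifier.h hunk above. */
static uint32_t base_type(uint32_t t) { return t & BPF_BASE_TYPE_MASK; }
static uint32_t type_flag(uint32_t t) { return t & ~BPF_BASE_TYPE_MASK; }

int main(void)
{
	uint32_t ptr_to_mem = 20;	/* stand-in for the real enum value */
	uint32_t t = ptr_to_mem | PTR_MAYBE_NULL | MEM_RDONLY;

	printf("base=%u flags=0x%x\n", base_type(t), type_flag(t));

	t &= ~PTR_MAYBE_NULL;		/* what mark_ptr_not_null_reg() now does */
	printf("may_be_null=%d rdonly=%d\n",
	       !!(t & PTR_MAYBE_NULL), !!(t & MEM_RDONLY));
	return 0;
}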
2104 +diff --git a/include/linux/iomap.h b/include/linux/iomap.h
2105 +index 24f8489583ca7..829f2325ecbab 100644
2106 +--- a/include/linux/iomap.h
2107 ++++ b/include/linux/iomap.h
2108 +@@ -330,12 +330,19 @@ struct iomap_dio_ops {
2109 + */
2110 + #define IOMAP_DIO_OVERWRITE_ONLY (1 << 1)
2111 +
2112 ++/*
2113 ++ * When a page fault occurs, return a partial synchronous result and allow
2114 ++ * the caller to retry the rest of the operation after dealing with the page
2115 ++ * fault.
2116 ++ */
2117 ++#define IOMAP_DIO_PARTIAL (1 << 2)
2118 ++
2119 + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
2120 + const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
2121 +- unsigned int dio_flags);
2122 ++ unsigned int dio_flags, size_t done_before);
2123 + struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
2124 + const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
2125 +- unsigned int dio_flags);
2126 ++ unsigned int dio_flags, size_t done_before);
2127 + ssize_t iomap_dio_complete(struct iomap_dio *dio);
2128 + int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
2129 +
2130 +diff --git a/include/linux/mm.h b/include/linux/mm.h
2131 +index 90c2d7f3c7a88..04345ff97f8ca 100644
2132 +--- a/include/linux/mm.h
2133 ++++ b/include/linux/mm.h
2134 +@@ -2858,7 +2858,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
2135 + #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
2136 + #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
2137 + * and return without waiting upon it */
2138 +-#define FOLL_POPULATE 0x40 /* fault in page */
2139 ++#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */
2140 ++#define FOLL_NOFAULT 0x80 /* do not fault in pages */
2141 + #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
2142 + #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
2143 + #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
2144 +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
2145 +index 62db6b0176b95..2f7dd14083d94 100644
2146 +--- a/include/linux/pagemap.h
2147 ++++ b/include/linux/pagemap.h
2148 +@@ -733,61 +733,11 @@ int wait_on_page_private_2_killable(struct page *page);
2149 + extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
2150 +
2151 + /*
2152 +- * Fault everything in given userspace address range in.
2153 ++ * Fault in userspace address range.
2154 + */
2155 +-static inline int fault_in_pages_writeable(char __user *uaddr, size_t size)
2156 +-{
2157 +- char __user *end = uaddr + size - 1;
2158 +-
2159 +- if (unlikely(size == 0))
2160 +- return 0;
2161 +-
2162 +- if (unlikely(uaddr > end))
2163 +- return -EFAULT;
2164 +- /*
2165 +- * Writing zeroes into userspace here is OK, because we know that if
2166 +- * the zero gets there, we'll be overwriting it.
2167 +- */
2168 +- do {
2169 +- if (unlikely(__put_user(0, uaddr) != 0))
2170 +- return -EFAULT;
2171 +- uaddr += PAGE_SIZE;
2172 +- } while (uaddr <= end);
2173 +-
2174 +- /* Check whether the range spilled into the next page. */
2175 +- if (((unsigned long)uaddr & PAGE_MASK) ==
2176 +- ((unsigned long)end & PAGE_MASK))
2177 +- return __put_user(0, end);
2178 +-
2179 +- return 0;
2180 +-}
2181 +-
2182 +-static inline int fault_in_pages_readable(const char __user *uaddr, size_t size)
2183 +-{
2184 +- volatile char c;
2185 +- const char __user *end = uaddr + size - 1;
2186 +-
2187 +- if (unlikely(size == 0))
2188 +- return 0;
2189 +-
2190 +- if (unlikely(uaddr > end))
2191 +- return -EFAULT;
2192 +-
2193 +- do {
2194 +- if (unlikely(__get_user(c, uaddr) != 0))
2195 +- return -EFAULT;
2196 +- uaddr += PAGE_SIZE;
2197 +- } while (uaddr <= end);
2198 +-
2199 +- /* Check whether the range spilled into the next page. */
2200 +- if (((unsigned long)uaddr & PAGE_MASK) ==
2201 +- ((unsigned long)end & PAGE_MASK)) {
2202 +- return __get_user(c, end);
2203 +- }
2204 +-
2205 +- (void)c;
2206 +- return 0;
2207 +-}
2208 ++size_t fault_in_writeable(char __user *uaddr, size_t size);
2209 ++size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
2210 ++size_t fault_in_readable(const char __user *uaddr, size_t size);
2211 +
2212 + int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
2213 + pgoff_t index, gfp_t gfp_mask);
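For orientation (not part of the patch): the replacement helpers invert the old calling convention. Instead of returning 0 or -EFAULT, they return the number of bytes that could not be faulted in, with 0 meaning complete success. A small illustrative fragment, with ubuf and len as placeholder variables:

	size_t left;

	left = fault_in_readable(ubuf, len);
	if (left == len)
		return -EFAULT;		/* nothing could be faulted in */
	/* At least (len - left) leading bytes are now accessible; callers
	 * that need the whole range, like the iomap and ntfs hunks above,
	 * simply treat any non-zero result as -EFAULT. */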
2214 +diff --git a/include/linux/uio.h b/include/linux/uio.h
2215 +index 207101a9c5c32..6350354f97e90 100644
2216 +--- a/include/linux/uio.h
2217 ++++ b/include/linux/uio.h
2218 +@@ -35,6 +35,7 @@ struct iov_iter_state {
2219 +
2220 + struct iov_iter {
2221 + u8 iter_type;
2222 ++ bool nofault;
2223 + bool data_source;
2224 + size_t iov_offset;
2225 + size_t count;
2226 +@@ -133,7 +134,8 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
2227 + size_t bytes, struct iov_iter *i);
2228 + void iov_iter_advance(struct iov_iter *i, size_t bytes);
2229 + void iov_iter_revert(struct iov_iter *i, size_t bytes);
2230 +-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes);
2231 ++size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
2232 ++size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
2233 + size_t iov_iter_single_seg_count(const struct iov_iter *i);
2234 + size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
2235 + struct iov_iter *i);
2236 +diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
2237 +index 09406b0e215e1..40df35088cdbd 100644
2238 +--- a/kernel/bpf/btf.c
2239 ++++ b/kernel/bpf/btf.c
2240 +@@ -4800,10 +4800,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
2241 + /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
2242 + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
2243 + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
2244 ++ u32 type, flag;
2245 +
2246 +- if (ctx_arg_info->offset == off &&
2247 +- (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL ||
2248 +- ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) {
2249 ++ type = base_type(ctx_arg_info->reg_type);
2250 ++ flag = type_flag(ctx_arg_info->reg_type);
2251 ++ if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
2252 ++ (flag & PTR_MAYBE_NULL)) {
2253 + info->reg_type = ctx_arg_info->reg_type;
2254 + return true;
2255 + }
2256 +@@ -5508,9 +5510,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
2257 + if (reg->type == PTR_TO_BTF_ID) {
2258 + reg_btf = reg->btf;
2259 + reg_ref_id = reg->btf_id;
2260 +- } else if (reg2btf_ids[reg->type]) {
2261 ++ } else if (reg2btf_ids[base_type(reg->type)]) {
2262 + reg_btf = btf_vmlinux;
2263 +- reg_ref_id = *reg2btf_ids[reg->type];
2264 ++ reg_ref_id = *reg2btf_ids[base_type(reg->type)];
2265 + } else {
2266 + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
2267 + func_name, i,
2268 +@@ -5717,7 +5719,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
2269 + return -EINVAL;
2270 + }
2271 +
2272 +- reg->type = PTR_TO_MEM_OR_NULL;
2273 ++ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
2274 + reg->id = ++env->id_gen;
2275 +
2276 + continue;
2277 +@@ -6229,7 +6231,7 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
2278 + .func = bpf_btf_find_by_name_kind,
2279 + .gpl_only = false,
2280 + .ret_type = RET_INTEGER,
2281 +- .arg1_type = ARG_PTR_TO_MEM,
2282 ++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2283 + .arg2_type = ARG_CONST_SIZE,
2284 + .arg3_type = ARG_ANYTHING,
2285 + .arg4_type = ARG_ANYTHING,
2286 +diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
2287 +index 7dbd68195a2b0..fe053ffd89329 100644
2288 +--- a/kernel/bpf/cgroup.c
2289 ++++ b/kernel/bpf/cgroup.c
2290 +@@ -1753,7 +1753,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
2291 + .gpl_only = false,
2292 + .ret_type = RET_INTEGER,
2293 + .arg1_type = ARG_PTR_TO_CTX,
2294 +- .arg2_type = ARG_PTR_TO_MEM,
2295 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2296 + .arg3_type = ARG_CONST_SIZE,
2297 + };
2298 +
2299 +diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
2300 +index 6f600cc95ccda..a711ffe238932 100644
2301 +--- a/kernel/bpf/helpers.c
2302 ++++ b/kernel/bpf/helpers.c
2303 +@@ -530,7 +530,7 @@ const struct bpf_func_proto bpf_strtol_proto = {
2304 + .func = bpf_strtol,
2305 + .gpl_only = false,
2306 + .ret_type = RET_INTEGER,
2307 +- .arg1_type = ARG_PTR_TO_MEM,
2308 ++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2309 + .arg2_type = ARG_CONST_SIZE,
2310 + .arg3_type = ARG_ANYTHING,
2311 + .arg4_type = ARG_PTR_TO_LONG,
2312 +@@ -558,7 +558,7 @@ const struct bpf_func_proto bpf_strtoul_proto = {
2313 + .func = bpf_strtoul,
2314 + .gpl_only = false,
2315 + .ret_type = RET_INTEGER,
2316 +- .arg1_type = ARG_PTR_TO_MEM,
2317 ++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2318 + .arg2_type = ARG_CONST_SIZE,
2319 + .arg3_type = ARG_ANYTHING,
2320 + .arg4_type = ARG_PTR_TO_LONG,
2321 +@@ -630,7 +630,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
2322 + .arg1_type = ARG_PTR_TO_CTX,
2323 + .arg2_type = ARG_CONST_MAP_PTR,
2324 + .arg3_type = ARG_ANYTHING,
2325 +- .arg4_type = ARG_PTR_TO_MEM,
2326 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2327 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
2328 + };
2329 +
2330 +@@ -667,7 +667,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
2331 + const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
2332 + .func = bpf_per_cpu_ptr,
2333 + .gpl_only = false,
2334 +- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
2335 ++ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
2336 + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
2337 + .arg2_type = ARG_ANYTHING,
2338 + };
2339 +@@ -680,7 +680,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
2340 + const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
2341 + .func = bpf_this_cpu_ptr,
2342 + .gpl_only = false,
2343 +- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID,
2344 ++ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
2345 + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
2346 + };
2347 +
2348 +@@ -1013,7 +1013,7 @@ const struct bpf_func_proto bpf_snprintf_proto = {
2349 + .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
2350 + .arg2_type = ARG_CONST_SIZE_OR_ZERO,
2351 + .arg3_type = ARG_PTR_TO_CONST_STR,
2352 +- .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
2353 ++ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
2354 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
2355 + };
2356 +
2357 +diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
2358 +index 6a9542af4212a..b0fa190b09790 100644
2359 +--- a/kernel/bpf/map_iter.c
2360 ++++ b/kernel/bpf/map_iter.c
2361 +@@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = {
2362 + .ctx_arg_info_size = 2,
2363 + .ctx_arg_info = {
2364 + { offsetof(struct bpf_iter__bpf_map_elem, key),
2365 +- PTR_TO_RDONLY_BUF_OR_NULL },
2366 ++ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
2367 + { offsetof(struct bpf_iter__bpf_map_elem, value),
2368 +- PTR_TO_RDWR_BUF_OR_NULL },
2369 ++ PTR_TO_BUF | PTR_MAYBE_NULL },
2370 + },
2371 + };
2372 +
2373 +diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
2374 +index f1c51c45667d3..710ba9de12ce4 100644
2375 +--- a/kernel/bpf/ringbuf.c
2376 ++++ b/kernel/bpf/ringbuf.c
2377 +@@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = {
2378 + .func = bpf_ringbuf_output,
2379 + .ret_type = RET_INTEGER,
2380 + .arg1_type = ARG_CONST_MAP_PTR,
2381 +- .arg2_type = ARG_PTR_TO_MEM,
2382 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2383 + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
2384 + .arg4_type = ARG_ANYTHING,
2385 + };
2386 +diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
2387 +index 42490c39dfbf5..48e02a725563f 100644
2388 +--- a/kernel/bpf/syscall.c
2389 ++++ b/kernel/bpf/syscall.c
2390 +@@ -4753,7 +4753,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
2391 + .gpl_only = false,
2392 + .ret_type = RET_INTEGER,
2393 + .arg1_type = ARG_ANYTHING,
2394 +- .arg2_type = ARG_PTR_TO_MEM,
2395 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2396 + .arg3_type = ARG_CONST_SIZE,
2397 + };
2398 +
2399 +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
2400 +index 670721e39c0e8..d2b119b4fbe74 100644
2401 +--- a/kernel/bpf/verifier.c
2402 ++++ b/kernel/bpf/verifier.c
2403 +@@ -445,18 +445,6 @@ static bool reg_type_not_null(enum bpf_reg_type type)
2404 + type == PTR_TO_SOCK_COMMON;
2405 + }
2406 +
2407 +-static bool reg_type_may_be_null(enum bpf_reg_type type)
2408 +-{
2409 +- return type == PTR_TO_MAP_VALUE_OR_NULL ||
2410 +- type == PTR_TO_SOCKET_OR_NULL ||
2411 +- type == PTR_TO_SOCK_COMMON_OR_NULL ||
2412 +- type == PTR_TO_TCP_SOCK_OR_NULL ||
2413 +- type == PTR_TO_BTF_ID_OR_NULL ||
2414 +- type == PTR_TO_MEM_OR_NULL ||
2415 +- type == PTR_TO_RDONLY_BUF_OR_NULL ||
2416 +- type == PTR_TO_RDWR_BUF_OR_NULL;
2417 +-}
2418 +-
2419 + static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
2420 + {
2421 + return reg->type == PTR_TO_MAP_VALUE &&
2422 +@@ -465,12 +453,14 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
2423 +
2424 + static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
2425 + {
2426 +- return type == PTR_TO_SOCKET ||
2427 +- type == PTR_TO_SOCKET_OR_NULL ||
2428 +- type == PTR_TO_TCP_SOCK ||
2429 +- type == PTR_TO_TCP_SOCK_OR_NULL ||
2430 +- type == PTR_TO_MEM ||
2431 +- type == PTR_TO_MEM_OR_NULL;
2432 ++ return base_type(type) == PTR_TO_SOCKET ||
2433 ++ base_type(type) == PTR_TO_TCP_SOCK ||
2434 ++ base_type(type) == PTR_TO_MEM;
2435 ++}
2436 ++
2437 ++static bool type_is_rdonly_mem(u32 type)
2438 ++{
2439 ++ return type & MEM_RDONLY;
2440 + }
2441 +
2442 + static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
2443 +@@ -478,14 +468,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
2444 + return type == ARG_PTR_TO_SOCK_COMMON;
2445 + }
2446 +
2447 +-static bool arg_type_may_be_null(enum bpf_arg_type type)
2448 ++static bool type_may_be_null(u32 type)
2449 + {
2450 +- return type == ARG_PTR_TO_MAP_VALUE_OR_NULL ||
2451 +- type == ARG_PTR_TO_MEM_OR_NULL ||
2452 +- type == ARG_PTR_TO_CTX_OR_NULL ||
2453 +- type == ARG_PTR_TO_SOCKET_OR_NULL ||
2454 +- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
2455 +- type == ARG_PTR_TO_STACK_OR_NULL;
2456 ++ return type & PTR_MAYBE_NULL;
2457 + }
2458 +
2459 + /* Determine whether the function releases some resources allocated by another
2460 +@@ -545,39 +530,54 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
2461 + insn->imm == BPF_CMPXCHG;
2462 + }
2463 +
2464 +-/* string representation of 'enum bpf_reg_type' */
2465 +-static const char * const reg_type_str[] = {
2466 +- [NOT_INIT] = "?",
2467 +- [SCALAR_VALUE] = "inv",
2468 +- [PTR_TO_CTX] = "ctx",
2469 +- [CONST_PTR_TO_MAP] = "map_ptr",
2470 +- [PTR_TO_MAP_VALUE] = "map_value",
2471 +- [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
2472 +- [PTR_TO_STACK] = "fp",
2473 +- [PTR_TO_PACKET] = "pkt",
2474 +- [PTR_TO_PACKET_META] = "pkt_meta",
2475 +- [PTR_TO_PACKET_END] = "pkt_end",
2476 +- [PTR_TO_FLOW_KEYS] = "flow_keys",
2477 +- [PTR_TO_SOCKET] = "sock",
2478 +- [PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
2479 +- [PTR_TO_SOCK_COMMON] = "sock_common",
2480 +- [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
2481 +- [PTR_TO_TCP_SOCK] = "tcp_sock",
2482 +- [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
2483 +- [PTR_TO_TP_BUFFER] = "tp_buffer",
2484 +- [PTR_TO_XDP_SOCK] = "xdp_sock",
2485 +- [PTR_TO_BTF_ID] = "ptr_",
2486 +- [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
2487 +- [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
2488 +- [PTR_TO_MEM] = "mem",
2489 +- [PTR_TO_MEM_OR_NULL] = "mem_or_null",
2490 +- [PTR_TO_RDONLY_BUF] = "rdonly_buf",
2491 +- [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
2492 +- [PTR_TO_RDWR_BUF] = "rdwr_buf",
2493 +- [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
2494 +- [PTR_TO_FUNC] = "func",
2495 +- [PTR_TO_MAP_KEY] = "map_key",
2496 +-};
2497 ++/* string representation of 'enum bpf_reg_type'
2498 ++ *
2499 ++ * Note that reg_type_str() can not appear more than once in a single verbose()
2500 ++ * statement.
2501 ++ */
2502 ++static const char *reg_type_str(struct bpf_verifier_env *env,
2503 ++ enum bpf_reg_type type)
2504 ++{
2505 ++ char postfix[16] = {0}, prefix[16] = {0};
2506 ++ static const char * const str[] = {
2507 ++ [NOT_INIT] = "?",
2508 ++ [SCALAR_VALUE] = "inv",
2509 ++ [PTR_TO_CTX] = "ctx",
2510 ++ [CONST_PTR_TO_MAP] = "map_ptr",
2511 ++ [PTR_TO_MAP_VALUE] = "map_value",
2512 ++ [PTR_TO_STACK] = "fp",
2513 ++ [PTR_TO_PACKET] = "pkt",
2514 ++ [PTR_TO_PACKET_META] = "pkt_meta",
2515 ++ [PTR_TO_PACKET_END] = "pkt_end",
2516 ++ [PTR_TO_FLOW_KEYS] = "flow_keys",
2517 ++ [PTR_TO_SOCKET] = "sock",
2518 ++ [PTR_TO_SOCK_COMMON] = "sock_common",
2519 ++ [PTR_TO_TCP_SOCK] = "tcp_sock",
2520 ++ [PTR_TO_TP_BUFFER] = "tp_buffer",
2521 ++ [PTR_TO_XDP_SOCK] = "xdp_sock",
2522 ++ [PTR_TO_BTF_ID] = "ptr_",
2523 ++ [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
2524 ++ [PTR_TO_MEM] = "mem",
2525 ++ [PTR_TO_BUF] = "buf",
2526 ++ [PTR_TO_FUNC] = "func",
2527 ++ [PTR_TO_MAP_KEY] = "map_key",
2528 ++ };
2529 ++
2530 ++ if (type & PTR_MAYBE_NULL) {
2531 ++ if (base_type(type) == PTR_TO_BTF_ID ||
2532 ++ base_type(type) == PTR_TO_PERCPU_BTF_ID)
2533 ++ strncpy(postfix, "or_null_", 16);
2534 ++ else
2535 ++ strncpy(postfix, "_or_null", 16);
2536 ++ }
2537 ++
2538 ++ if (type & MEM_RDONLY)
2539 ++ strncpy(prefix, "rdonly_", 16);
2540 ++
2541 ++ snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
2542 ++ prefix, str[base_type(type)], postfix);
2543 ++ return env->type_str_buf;
2544 ++}
2545 +
2546 + static char slot_type_char[] = {
2547 + [STACK_INVALID] = '?',
2548 +@@ -628,7 +628,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
2549 + continue;
2550 + verbose(env, " R%d", i);
2551 + print_liveness(env, reg->live);
2552 +- verbose(env, "=%s", reg_type_str[t]);
2553 ++ verbose(env, "=%s", reg_type_str(env, t));
2554 + if (t == SCALAR_VALUE && reg->precise)
2555 + verbose(env, "P");
2556 + if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
2557 +@@ -636,9 +636,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
2558 + /* reg->off should be 0 for SCALAR_VALUE */
2559 + verbose(env, "%lld", reg->var_off.value + reg->off);
2560 + } else {
2561 +- if (t == PTR_TO_BTF_ID ||
2562 +- t == PTR_TO_BTF_ID_OR_NULL ||
2563 +- t == PTR_TO_PERCPU_BTF_ID)
2564 ++ if (base_type(t) == PTR_TO_BTF_ID ||
2565 ++ base_type(t) == PTR_TO_PERCPU_BTF_ID)
2566 + verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
2567 + verbose(env, "(id=%d", reg->id);
2568 + if (reg_type_may_be_refcounted_or_null(t))
2569 +@@ -647,10 +646,9 @@ static void print_verifier_state(struct bpf_verifier_env *env,
2570 + verbose(env, ",off=%d", reg->off);
2571 + if (type_is_pkt_pointer(t))
2572 + verbose(env, ",r=%d", reg->range);
2573 +- else if (t == CONST_PTR_TO_MAP ||
2574 +- t == PTR_TO_MAP_KEY ||
2575 +- t == PTR_TO_MAP_VALUE ||
2576 +- t == PTR_TO_MAP_VALUE_OR_NULL)
2577 ++ else if (base_type(t) == CONST_PTR_TO_MAP ||
2578 ++ base_type(t) == PTR_TO_MAP_KEY ||
2579 ++ base_type(t) == PTR_TO_MAP_VALUE)
2580 + verbose(env, ",ks=%d,vs=%d",
2581 + reg->map_ptr->key_size,
2582 + reg->map_ptr->value_size);
2583 +@@ -720,7 +718,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
2584 + if (state->stack[i].slot_type[0] == STACK_SPILL) {
2585 + reg = &state->stack[i].spilled_ptr;
2586 + t = reg->type;
2587 +- verbose(env, "=%s", reg_type_str[t]);
2588 ++ verbose(env, "=%s", reg_type_str(env, t));
2589 + if (t == SCALAR_VALUE && reg->precise)
2590 + verbose(env, "P");
2591 + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
2592 +@@ -1133,8 +1131,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
2593 +
2594 + static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
2595 + {
2596 +- switch (reg->type) {
2597 +- case PTR_TO_MAP_VALUE_OR_NULL: {
2598 ++ if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
2599 + const struct bpf_map *map = reg->map_ptr;
2600 +
2601 + if (map->inner_map_meta) {
2602 +@@ -1153,32 +1150,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
2603 + } else {
2604 + reg->type = PTR_TO_MAP_VALUE;
2605 + }
2606 +- break;
2607 +- }
2608 +- case PTR_TO_SOCKET_OR_NULL:
2609 +- reg->type = PTR_TO_SOCKET;
2610 +- break;
2611 +- case PTR_TO_SOCK_COMMON_OR_NULL:
2612 +- reg->type = PTR_TO_SOCK_COMMON;
2613 +- break;
2614 +- case PTR_TO_TCP_SOCK_OR_NULL:
2615 +- reg->type = PTR_TO_TCP_SOCK;
2616 +- break;
2617 +- case PTR_TO_BTF_ID_OR_NULL:
2618 +- reg->type = PTR_TO_BTF_ID;
2619 +- break;
2620 +- case PTR_TO_MEM_OR_NULL:
2621 +- reg->type = PTR_TO_MEM;
2622 +- break;
2623 +- case PTR_TO_RDONLY_BUF_OR_NULL:
2624 +- reg->type = PTR_TO_RDONLY_BUF;
2625 +- break;
2626 +- case PTR_TO_RDWR_BUF_OR_NULL:
2627 +- reg->type = PTR_TO_RDWR_BUF;
2628 +- break;
2629 +- default:
2630 +- WARN_ONCE(1, "unknown nullable register type");
2631 ++ return;
2632 + }
2633 ++
2634 ++ reg->type &= ~PTR_MAYBE_NULL;
2635 + }
2636 +
2637 + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
2638 +@@ -1906,7 +1881,7 @@ static int mark_reg_read(struct bpf_verifier_env *env,
2639 + break;
2640 + if (parent->live & REG_LIVE_DONE) {
2641 + verbose(env, "verifier BUG type %s var_off %lld off %d\n",
2642 +- reg_type_str[parent->type],
2643 ++ reg_type_str(env, parent->type),
2644 + parent->var_off.value, parent->off);
2645 + return -EFAULT;
2646 + }
2647 +@@ -2564,9 +2539,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
2648 +
2649 + static bool is_spillable_regtype(enum bpf_reg_type type)
2650 + {
2651 +- switch (type) {
2652 ++ switch (base_type(type)) {
2653 + case PTR_TO_MAP_VALUE:
2654 +- case PTR_TO_MAP_VALUE_OR_NULL:
2655 + case PTR_TO_STACK:
2656 + case PTR_TO_CTX:
2657 + case PTR_TO_PACKET:
2658 +@@ -2575,21 +2549,13 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
2659 + case PTR_TO_FLOW_KEYS:
2660 + case CONST_PTR_TO_MAP:
2661 + case PTR_TO_SOCKET:
2662 +- case PTR_TO_SOCKET_OR_NULL:
2663 + case PTR_TO_SOCK_COMMON:
2664 +- case PTR_TO_SOCK_COMMON_OR_NULL:
2665 + case PTR_TO_TCP_SOCK:
2666 +- case PTR_TO_TCP_SOCK_OR_NULL:
2667 + case PTR_TO_XDP_SOCK:
2668 + case PTR_TO_BTF_ID:
2669 +- case PTR_TO_BTF_ID_OR_NULL:
2670 +- case PTR_TO_RDONLY_BUF:
2671 +- case PTR_TO_RDONLY_BUF_OR_NULL:
2672 +- case PTR_TO_RDWR_BUF:
2673 +- case PTR_TO_RDWR_BUF_OR_NULL:
2674 ++ case PTR_TO_BUF:
2675 + case PTR_TO_PERCPU_BTF_ID:
2676 + case PTR_TO_MEM:
2677 +- case PTR_TO_MEM_OR_NULL:
2678 + case PTR_TO_FUNC:
2679 + case PTR_TO_MAP_KEY:
2680 + return true;
2681 +@@ -3405,7 +3371,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
2682 + */
2683 + *reg_type = info.reg_type;
2684 +
2685 +- if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) {
2686 ++ if (base_type(*reg_type) == PTR_TO_BTF_ID) {
2687 + *btf = info.btf;
2688 + *btf_id = info.btf_id;
2689 + } else {
2690 +@@ -3473,7 +3439,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
2691 + }
2692 +
2693 + verbose(env, "R%d invalid %s access off=%d size=%d\n",
2694 +- regno, reg_type_str[reg->type], off, size);
2695 ++ regno, reg_type_str(env, reg->type), off, size);
2696 +
2697 + return -EACCES;
2698 + }
2699 +@@ -4200,15 +4166,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2700 + mark_reg_unknown(env, regs, value_regno);
2701 + }
2702 + }
2703 +- } else if (reg->type == PTR_TO_MEM) {
2704 ++ } else if (base_type(reg->type) == PTR_TO_MEM) {
2705 ++ bool rdonly_mem = type_is_rdonly_mem(reg->type);
2706 ++
2707 ++ if (type_may_be_null(reg->type)) {
2708 ++ verbose(env, "R%d invalid mem access '%s'\n", regno,
2709 ++ reg_type_str(env, reg->type));
2710 ++ return -EACCES;
2711 ++ }
2712 ++
2713 ++ if (t == BPF_WRITE && rdonly_mem) {
2714 ++ verbose(env, "R%d cannot write into %s\n",
2715 ++ regno, reg_type_str(env, reg->type));
2716 ++ return -EACCES;
2717 ++ }
2718 ++
2719 + if (t == BPF_WRITE && value_regno >= 0 &&
2720 + is_pointer_value(env, value_regno)) {
2721 + verbose(env, "R%d leaks addr into mem\n", value_regno);
2722 + return -EACCES;
2723 + }
2724 ++
2725 + err = check_mem_region_access(env, regno, off, size,
2726 + reg->mem_size, false);
2727 +- if (!err && t == BPF_READ && value_regno >= 0)
2728 ++ if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
2729 + mark_reg_unknown(env, regs, value_regno);
2730 + } else if (reg->type == PTR_TO_CTX) {
2731 + enum bpf_reg_type reg_type = SCALAR_VALUE;
2732 +@@ -4238,7 +4219,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2733 + } else {
2734 + mark_reg_known_zero(env, regs,
2735 + value_regno);
2736 +- if (reg_type_may_be_null(reg_type))
2737 ++ if (type_may_be_null(reg_type))
2738 + regs[value_regno].id = ++env->id_gen;
2739 + /* A load of ctx field could have different
2740 + * actual load size with the one encoded in the
2741 +@@ -4246,8 +4227,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2742 + * a sub-register.
2743 + */
2744 + regs[value_regno].subreg_def = DEF_NOT_SUBREG;
2745 +- if (reg_type == PTR_TO_BTF_ID ||
2746 +- reg_type == PTR_TO_BTF_ID_OR_NULL) {
2747 ++ if (base_type(reg_type) == PTR_TO_BTF_ID) {
2748 + regs[value_regno].btf = btf;
2749 + regs[value_regno].btf_id = btf_id;
2750 + }
2751 +@@ -4300,7 +4280,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2752 + } else if (type_is_sk_pointer(reg->type)) {
2753 + if (t == BPF_WRITE) {
2754 + verbose(env, "R%d cannot write into %s\n",
2755 +- regno, reg_type_str[reg->type]);
2756 ++ regno, reg_type_str(env, reg->type));
2757 + return -EACCES;
2758 + }
2759 + err = check_sock_access(env, insn_idx, regno, off, size, t);
2760 +@@ -4316,26 +4296,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2761 + } else if (reg->type == CONST_PTR_TO_MAP) {
2762 + err = check_ptr_to_map_access(env, regs, regno, off, size, t,
2763 + value_regno);
2764 +- } else if (reg->type == PTR_TO_RDONLY_BUF) {
2765 +- if (t == BPF_WRITE) {
2766 +- verbose(env, "R%d cannot write into %s\n",
2767 +- regno, reg_type_str[reg->type]);
2768 +- return -EACCES;
2769 ++ } else if (base_type(reg->type) == PTR_TO_BUF) {
2770 ++ bool rdonly_mem = type_is_rdonly_mem(reg->type);
2771 ++ const char *buf_info;
2772 ++ u32 *max_access;
2773 ++
2774 ++ if (rdonly_mem) {
2775 ++ if (t == BPF_WRITE) {
2776 ++ verbose(env, "R%d cannot write into %s\n",
2777 ++ regno, reg_type_str(env, reg->type));
2778 ++ return -EACCES;
2779 ++ }
2780 ++ buf_info = "rdonly";
2781 ++ max_access = &env->prog->aux->max_rdonly_access;
2782 ++ } else {
2783 ++ buf_info = "rdwr";
2784 ++ max_access = &env->prog->aux->max_rdwr_access;
2785 + }
2786 ++
2787 + err = check_buffer_access(env, reg, regno, off, size, false,
2788 +- "rdonly",
2789 +- &env->prog->aux->max_rdonly_access);
2790 +- if (!err && value_regno >= 0)
2791 +- mark_reg_unknown(env, regs, value_regno);
2792 +- } else if (reg->type == PTR_TO_RDWR_BUF) {
2793 +- err = check_buffer_access(env, reg, regno, off, size, false,
2794 +- "rdwr",
2795 +- &env->prog->aux->max_rdwr_access);
2796 +- if (!err && t == BPF_READ && value_regno >= 0)
2797 ++ buf_info, max_access);
2798 ++
2799 ++ if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
2800 + mark_reg_unknown(env, regs, value_regno);
2801 + } else {
2802 + verbose(env, "R%d invalid mem access '%s'\n", regno,
2803 +- reg_type_str[reg->type]);
2804 ++ reg_type_str(env, reg->type));
2805 + return -EACCES;
2806 + }
2807 +
2808 +@@ -4409,7 +4395,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
2809 + is_sk_reg(env, insn->dst_reg)) {
2810 + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
2811 + insn->dst_reg,
2812 +- reg_type_str[reg_state(env, insn->dst_reg)->type]);
2813 ++ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
2814 + return -EACCES;
2815 + }
2816 +
2817 +@@ -4592,8 +4578,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
2818 + struct bpf_call_arg_meta *meta)
2819 + {
2820 + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
2821 ++ const char *buf_info;
2822 ++ u32 *max_access;
2823 +
2824 +- switch (reg->type) {
2825 ++ switch (base_type(reg->type)) {
2826 + case PTR_TO_PACKET:
2827 + case PTR_TO_PACKET_META:
2828 + return check_packet_access(env, regno, reg->off, access_size,
2829 +@@ -4612,18 +4600,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
2830 + return check_mem_region_access(env, regno, reg->off,
2831 + access_size, reg->mem_size,
2832 + zero_size_allowed);
2833 +- case PTR_TO_RDONLY_BUF:
2834 +- if (meta && meta->raw_mode)
2835 +- return -EACCES;
2836 +- return check_buffer_access(env, reg, regno, reg->off,
2837 +- access_size, zero_size_allowed,
2838 +- "rdonly",
2839 +- &env->prog->aux->max_rdonly_access);
2840 +- case PTR_TO_RDWR_BUF:
2841 ++ case PTR_TO_BUF:
2842 ++ if (type_is_rdonly_mem(reg->type)) {
2843 ++ if (meta && meta->raw_mode)
2844 ++ return -EACCES;
2845 ++
2846 ++ buf_info = "rdonly";
2847 ++ max_access = &env->prog->aux->max_rdonly_access;
2848 ++ } else {
2849 ++ buf_info = "rdwr";
2850 ++ max_access = &env->prog->aux->max_rdwr_access;
2851 ++ }
2852 + return check_buffer_access(env, reg, regno, reg->off,
2853 + access_size, zero_size_allowed,
2854 +- "rdwr",
2855 +- &env->prog->aux->max_rdwr_access);
2856 ++ buf_info, max_access);
2857 + case PTR_TO_STACK:
2858 + return check_stack_range_initialized(
2859 + env,
2860 +@@ -4635,9 +4625,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
2861 + register_is_null(reg))
2862 + return 0;
2863 +
2864 +- verbose(env, "R%d type=%s expected=%s\n", regno,
2865 +- reg_type_str[reg->type],
2866 +- reg_type_str[PTR_TO_STACK]);
2867 ++ verbose(env, "R%d type=%s ", regno,
2868 ++ reg_type_str(env, reg->type));
2869 ++ verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
2870 + return -EACCES;
2871 + }
2872 + }
2873 +@@ -4648,7 +4638,7 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
2874 + if (register_is_null(reg))
2875 + return 0;
2876 +
2877 +- if (reg_type_may_be_null(reg->type)) {
2878 ++ if (type_may_be_null(reg->type)) {
2879 + /* Assuming that the register contains a value check if the memory
2880 + * access is safe. Temporarily save and restore the register's state as
2881 + * the conversion shouldn't be visible to a caller.
2882 +@@ -4796,9 +4786,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
2883 +
2884 + static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
2885 + {
2886 +- return type == ARG_PTR_TO_MEM ||
2887 +- type == ARG_PTR_TO_MEM_OR_NULL ||
2888 +- type == ARG_PTR_TO_UNINIT_MEM;
2889 ++ return base_type(type) == ARG_PTR_TO_MEM ||
2890 ++ base_type(type) == ARG_PTR_TO_UNINIT_MEM;
2891 + }
2892 +
2893 + static bool arg_type_is_mem_size(enum bpf_arg_type type)
2894 +@@ -4900,8 +4889,7 @@ static const struct bpf_reg_types mem_types = {
2895 + PTR_TO_MAP_KEY,
2896 + PTR_TO_MAP_VALUE,
2897 + PTR_TO_MEM,
2898 +- PTR_TO_RDONLY_BUF,
2899 +- PTR_TO_RDWR_BUF,
2900 ++ PTR_TO_BUF,
2901 + },
2902 + };
2903 +
2904 +@@ -4932,31 +4920,26 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
2905 + [ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
2906 + [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
2907 + [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types,
2908 +- [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types,
2909 + [ARG_CONST_SIZE] = &scalar_types,
2910 + [ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
2911 + [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
2912 + [ARG_CONST_MAP_PTR] = &const_map_ptr_types,
2913 + [ARG_PTR_TO_CTX] = &context_types,
2914 +- [ARG_PTR_TO_CTX_OR_NULL] = &context_types,
2915 + [ARG_PTR_TO_SOCK_COMMON] = &sock_types,
2916 + #ifdef CONFIG_NET
2917 + [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
2918 + #endif
2919 + [ARG_PTR_TO_SOCKET] = &fullsock_types,
2920 +- [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types,
2921 + [ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
2922 + [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
2923 + [ARG_PTR_TO_MEM] = &mem_types,
2924 +- [ARG_PTR_TO_MEM_OR_NULL] = &mem_types,
2925 + [ARG_PTR_TO_UNINIT_MEM] = &mem_types,
2926 + [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
2927 +- [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types,
2928 + [ARG_PTR_TO_INT] = &int_ptr_types,
2929 + [ARG_PTR_TO_LONG] = &int_ptr_types,
2930 + [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
2931 + [ARG_PTR_TO_FUNC] = &func_ptr_types,
2932 +- [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
2933 ++ [ARG_PTR_TO_STACK] = &stack_ptr_types,
2934 + [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
2935 + [ARG_PTR_TO_TIMER] = &timer_types,
2936 + };
2937 +@@ -4970,12 +4953,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
2938 + const struct bpf_reg_types *compatible;
2939 + int i, j;
2940 +
2941 +- compatible = compatible_reg_types[arg_type];
2942 ++ compatible = compatible_reg_types[base_type(arg_type)];
2943 + if (!compatible) {
2944 + verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
2945 + return -EFAULT;
2946 + }
2947 +
2948 ++ /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY,
2949 ++ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY
2950 ++ *
2951 ++ * Same for MAYBE_NULL:
2952 ++ *
2953 ++ * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
2954 ++ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
2955 ++ *
2956 ++ * Therefore we fold these flags depending on the arg_type before comparison.
2957 ++ */
2958 ++ if (arg_type & MEM_RDONLY)
2959 ++ type &= ~MEM_RDONLY;
2960 ++ if (arg_type & PTR_MAYBE_NULL)
2961 ++ type &= ~PTR_MAYBE_NULL;
2962 ++
2963 + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
2964 + expected = compatible->types[i];
2965 + if (expected == NOT_INIT)
2966 +@@ -4985,14 +4983,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
2967 + goto found;
2968 + }
2969 +
2970 +- verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]);
2971 ++ verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
2972 + for (j = 0; j + 1 < i; j++)
2973 +- verbose(env, "%s, ", reg_type_str[compatible->types[j]]);
2974 +- verbose(env, "%s\n", reg_type_str[compatible->types[j]]);
2975 ++ verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
2976 ++ verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
2977 + return -EACCES;
2978 +
2979 + found:
2980 +- if (type == PTR_TO_BTF_ID) {
2981 ++ if (reg->type == PTR_TO_BTF_ID) {
2982 + if (!arg_btf_id) {
2983 + if (!compatible->btf_id) {
2984 + verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
2985 +@@ -5051,15 +5049,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
2986 + return -EACCES;
2987 + }
2988 +
2989 +- if (arg_type == ARG_PTR_TO_MAP_VALUE ||
2990 +- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
2991 +- arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
2992 ++ if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
2993 ++ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
2994 + err = resolve_map_arg_type(env, meta, &arg_type);
2995 + if (err)
2996 + return err;
2997 + }
2998 +
2999 +- if (register_is_null(reg) && arg_type_may_be_null(arg_type))
3000 ++ if (register_is_null(reg) && type_may_be_null(arg_type))
3001 + /* A NULL register has a SCALAR_VALUE type, so skip
3002 + * type checking.
3003 + */
3004 +@@ -5128,10 +5125,11 @@ skip_type_check:
3005 + err = check_helper_mem_access(env, regno,
3006 + meta->map_ptr->key_size, false,
3007 + NULL);
3008 +- } else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
3009 +- (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
3010 +- !register_is_null(reg)) ||
3011 +- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
3012 ++ } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
3013 ++ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
3014 ++ if (type_may_be_null(arg_type) && register_is_null(reg))
3015 ++ return 0;
3016 ++
3017 + /* bpf_map_xxx(..., map_ptr, ..., value) call:
3018 + * check [value, value + map->value_size) validity
3019 + */
3020 +@@ -6206,6 +6204,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
3021 + int *insn_idx_p)
3022 + {
3023 + const struct bpf_func_proto *fn = NULL;
3024 ++ enum bpf_return_type ret_type;
3025 ++ enum bpf_type_flag ret_flag;
3026 + struct bpf_reg_state *regs;
3027 + struct bpf_call_arg_meta meta;
3028 + int insn_idx = *insn_idx_p;
3029 +@@ -6339,13 +6339,14 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
3030 + regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
3031 +
3032 + /* update return register (already marked as written above) */
3033 +- if (fn->ret_type == RET_INTEGER) {
3034 ++ ret_type = fn->ret_type;
3035 ++ ret_flag = type_flag(fn->ret_type);
3036 ++ if (ret_type == RET_INTEGER) {
3037 + /* sets type to SCALAR_VALUE */
3038 + mark_reg_unknown(env, regs, BPF_REG_0);
3039 +- } else if (fn->ret_type == RET_VOID) {
3040 ++ } else if (ret_type == RET_VOID) {
3041 + regs[BPF_REG_0].type = NOT_INIT;
3042 +- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
3043 +- fn->ret_type == RET_PTR_TO_MAP_VALUE) {
3044 ++ } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) {
3045 + /* There is no offset yet applied, variable or fixed */
3046 + mark_reg_known_zero(env, regs, BPF_REG_0);
3047 + /* remember map_ptr, so that check_map_access()
3048 +@@ -6359,28 +6360,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
3049 + }
3050 + regs[BPF_REG_0].map_ptr = meta.map_ptr;
3051 + regs[BPF_REG_0].map_uid = meta.map_uid;
3052 +- if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
3053 +- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
3054 +- if (map_value_has_spin_lock(meta.map_ptr))
3055 +- regs[BPF_REG_0].id = ++env->id_gen;
3056 +- } else {
3057 +- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
3058 ++ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
3059 ++ if (!type_may_be_null(ret_type) &&
3060 ++ map_value_has_spin_lock(meta.map_ptr)) {
3061 ++ regs[BPF_REG_0].id = ++env->id_gen;
3062 + }
3063 +- } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
3064 ++ } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) {
3065 + mark_reg_known_zero(env, regs, BPF_REG_0);
3066 +- regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
3067 +- } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
3068 ++ regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
3069 ++ } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) {
3070 + mark_reg_known_zero(env, regs, BPF_REG_0);
3071 +- regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
3072 +- } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
3073 ++ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
3074 ++ } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) {
3075 + mark_reg_known_zero(env, regs, BPF_REG_0);
3076 +- regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
3077 +- } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
3078 ++ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
3079 ++ } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
3080 + mark_reg_known_zero(env, regs, BPF_REG_0);
3081 +- regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
3082 ++ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
3083 + regs[BPF_REG_0].mem_size = meta.mem_size;
3084 +- } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
3085 +- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
3086 ++ } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
3087 + const struct btf_type *t;
3088 +
3089 + mark_reg_known_zero(env, regs, BPF_REG_0);
3090 +@@ -6398,29 +6396,30 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
3091 + tname, PTR_ERR(ret));
3092 + return -EINVAL;
3093 + }
3094 +- regs[BPF_REG_0].type =
3095 +- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
3096 +- PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
3097 ++ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
3098 + regs[BPF_REG_0].mem_size = tsize;
3099 + } else {
3100 +- regs[BPF_REG_0].type =
3101 +- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
3102 +- PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
3103 ++ /* MEM_RDONLY may be carried from ret_flag, but it
3104 ++ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
3105 ++ * it will confuse the check of PTR_TO_BTF_ID in
3106 ++ * check_mem_access().
3107 ++ */
3108 ++ ret_flag &= ~MEM_RDONLY;
3109 ++
3110 ++ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
3111 + regs[BPF_REG_0].btf = meta.ret_btf;
3112 + regs[BPF_REG_0].btf_id = meta.ret_btf_id;
3113 + }
3114 +- } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL ||
3115 +- fn->ret_type == RET_PTR_TO_BTF_ID) {
3116 ++ } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
3117 + int ret_btf_id;
3118 +
3119 + mark_reg_known_zero(env, regs, BPF_REG_0);
3120 +- regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ?
3121 +- PTR_TO_BTF_ID :
3122 +- PTR_TO_BTF_ID_OR_NULL;
3123 ++ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
3124 + ret_btf_id = *fn->ret_btf_id;
3125 + if (ret_btf_id == 0) {
3126 +- verbose(env, "invalid return type %d of func %s#%d\n",
3127 +- fn->ret_type, func_id_name(func_id), func_id);
3128 ++ verbose(env, "invalid return type %u of func %s#%d\n",
3129 ++ base_type(ret_type), func_id_name(func_id),
3130 ++ func_id);
3131 + return -EINVAL;
3132 + }
3133 + /* current BPF helper definitions are only coming from
3134 +@@ -6429,12 +6428,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
3135 + regs[BPF_REG_0].btf = btf_vmlinux;
3136 + regs[BPF_REG_0].btf_id = ret_btf_id;
3137 + } else {
3138 +- verbose(env, "unknown return type %d of func %s#%d\n",
3139 +- fn->ret_type, func_id_name(func_id), func_id);
3140 ++ verbose(env, "unknown return type %u of func %s#%d\n",
3141 ++ base_type(ret_type), func_id_name(func_id), func_id);
3142 + return -EINVAL;
3143 + }
3144 +
3145 +- if (reg_type_may_be_null(regs[BPF_REG_0].type))
3146 ++ if (type_may_be_null(regs[BPF_REG_0].type))
3147 + regs[BPF_REG_0].id = ++env->id_gen;
3148 +
3149 + if (is_ptr_cast_function(func_id)) {
3150 +@@ -6633,25 +6632,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
3151 +
3152 + if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
3153 + verbose(env, "math between %s pointer and %lld is not allowed\n",
3154 +- reg_type_str[type], val);
3155 ++ reg_type_str(env, type), val);
3156 + return false;
3157 + }
3158 +
3159 + if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
3160 + verbose(env, "%s pointer offset %d is not allowed\n",
3161 +- reg_type_str[type], reg->off);
3162 ++ reg_type_str(env, type), reg->off);
3163 + return false;
3164 + }
3165 +
3166 + if (smin == S64_MIN) {
3167 + verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
3168 +- reg_type_str[type]);
3169 ++ reg_type_str(env, type));
3170 + return false;
3171 + }
3172 +
3173 + if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
3174 + verbose(env, "value %lld makes %s pointer be out of bounds\n",
3175 +- smin, reg_type_str[type]);
3176 ++ smin, reg_type_str(env, type));
3177 + return false;
3178 + }
3179 +
3180 +@@ -7028,11 +7027,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
3181 + return -EACCES;
3182 + }
3183 +
3184 +- switch (ptr_reg->type) {
3185 +- case PTR_TO_MAP_VALUE_OR_NULL:
3186 ++ if (ptr_reg->type & PTR_MAYBE_NULL) {
3187 + verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
3188 +- dst, reg_type_str[ptr_reg->type]);
3189 ++ dst, reg_type_str(env, ptr_reg->type));
3190 + return -EACCES;
3191 ++ }
3192 ++
3193 ++ switch (base_type(ptr_reg->type)) {
3194 + case CONST_PTR_TO_MAP:
3195 + /* smin_val represents the known value */
3196 + if (known && smin_val == 0 && opcode == BPF_ADD)
3197 +@@ -7045,10 +7046,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
3198 + case PTR_TO_XDP_SOCK:
3199 + reject:
3200 + verbose(env, "R%d pointer arithmetic on %s prohibited\n",
3201 +- dst, reg_type_str[ptr_reg->type]);
3202 ++ dst, reg_type_str(env, ptr_reg->type));
3203 + return -EACCES;
3204 + default:
3205 +- if (reg_type_may_be_null(ptr_reg->type))
3206 ++ if (type_may_be_null(ptr_reg->type))
3207 + goto reject;
3208 + break;
3209 + }
3210 +@@ -8770,7 +8771,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
3211 + struct bpf_reg_state *reg, u32 id,
3212 + bool is_null)
3213 + {
3214 +- if (reg_type_may_be_null(reg->type) && reg->id == id &&
3215 ++ if (type_may_be_null(reg->type) && reg->id == id &&
3216 + !WARN_ON_ONCE(!reg->id)) {
3217 + if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
3218 + !tnum_equals_const(reg->var_off, 0) ||
3219 +@@ -9148,7 +9149,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
3220 + */
3221 + if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
3222 + insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
3223 +- reg_type_may_be_null(dst_reg->type)) {
3224 ++ type_may_be_null(dst_reg->type)) {
3225 + /* Mark all identical registers in each branch as either
3226 + * safe or unknown depending R == 0 or R != 0 conditional.
3227 + */
3228 +@@ -9207,7 +9208,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
3229 +
3230 + if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
3231 + dst_reg->type = aux->btf_var.reg_type;
3232 +- switch (dst_reg->type) {
3233 ++ switch (base_type(dst_reg->type)) {
3234 + case PTR_TO_MEM:
3235 + dst_reg->mem_size = aux->btf_var.mem_size;
3236 + break;
3237 +@@ -9404,7 +9405,7 @@ static int check_return_code(struct bpf_verifier_env *env)
3238 + /* enforce return zero from async callbacks like timer */
3239 + if (reg->type != SCALAR_VALUE) {
3240 + verbose(env, "In async callback the register R0 is not a known value (%s)\n",
3241 +- reg_type_str[reg->type]);
3242 ++ reg_type_str(env, reg->type));
3243 + return -EINVAL;
3244 + }
3245 +
3246 +@@ -9418,7 +9419,7 @@ static int check_return_code(struct bpf_verifier_env *env)
3247 + if (is_subprog) {
3248 + if (reg->type != SCALAR_VALUE) {
3249 + verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
3250 +- reg_type_str[reg->type]);
3251 ++ reg_type_str(env, reg->type));
3252 + return -EINVAL;
3253 + }
3254 + return 0;
3255 +@@ -9482,7 +9483,7 @@ static int check_return_code(struct bpf_verifier_env *env)
3256 +
3257 + if (reg->type != SCALAR_VALUE) {
3258 + verbose(env, "At program exit the register R0 is not a known value (%s)\n",
3259 +- reg_type_str[reg->type]);
3260 ++ reg_type_str(env, reg->type));
3261 + return -EINVAL;
3262 + }
3263 +
3264 +@@ -10263,7 +10264,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
3265 + return true;
3266 + if (rcur->type == NOT_INIT)
3267 + return false;
3268 +- switch (rold->type) {
3269 ++ switch (base_type(rold->type)) {
3270 + case SCALAR_VALUE:
3271 + if (env->explore_alu_limits)
3272 + return false;
3273 +@@ -10285,6 +10286,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
3274 + }
3275 + case PTR_TO_MAP_KEY:
3276 + case PTR_TO_MAP_VALUE:
3277 ++ /* a PTR_TO_MAP_VALUE could be safe to use as a
3278 ++ * PTR_TO_MAP_VALUE_OR_NULL into the same map.
3279 ++ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
3280 ++ * checked, doing so could have affected others with the same
3281 ++ * id, and we can't check for that because we lost the id when
3282 ++ * we converted to a PTR_TO_MAP_VALUE.
3283 ++ */
3284 ++ if (type_may_be_null(rold->type)) {
3285 ++ if (!type_may_be_null(rcur->type))
3286 ++ return false;
3287 ++ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
3288 ++ return false;
3289 ++ /* Check our ids match any regs they're supposed to */
3290 ++ return check_ids(rold->id, rcur->id, idmap);
3291 ++ }
3292 ++
3293 + /* If the new min/max/var_off satisfy the old ones and
3294 + * everything else matches, we are OK.
3295 + * 'id' is not compared, since it's only used for maps with
3296 +@@ -10296,20 +10313,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
3297 + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
3298 + range_within(rold, rcur) &&
3299 + tnum_in(rold->var_off, rcur->var_off);
3300 +- case PTR_TO_MAP_VALUE_OR_NULL:
3301 +- /* a PTR_TO_MAP_VALUE could be safe to use as a
3302 +- * PTR_TO_MAP_VALUE_OR_NULL into the same map.
3303 +- * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
3304 +- * checked, doing so could have affected others with the same
3305 +- * id, and we can't check for that because we lost the id when
3306 +- * we converted to a PTR_TO_MAP_VALUE.
3307 +- */
3308 +- if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
3309 +- return false;
3310 +- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
3311 +- return false;
3312 +- /* Check our ids match any regs they're supposed to */
3313 +- return check_ids(rold->id, rcur->id, idmap);
3314 + case PTR_TO_PACKET_META:
3315 + case PTR_TO_PACKET:
3316 + if (rcur->type != rold->type)
3317 +@@ -10338,11 +10341,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
3318 + case PTR_TO_PACKET_END:
3319 + case PTR_TO_FLOW_KEYS:
3320 + case PTR_TO_SOCKET:
3321 +- case PTR_TO_SOCKET_OR_NULL:
3322 + case PTR_TO_SOCK_COMMON:
3323 +- case PTR_TO_SOCK_COMMON_OR_NULL:
3324 + case PTR_TO_TCP_SOCK:
3325 +- case PTR_TO_TCP_SOCK_OR_NULL:
3326 + case PTR_TO_XDP_SOCK:
3327 + /* Only valid matches are exact, which memcmp() above
3328 + * would have accepted
3329 +@@ -10868,17 +10868,13 @@ next:
3330 + /* Return true if it's OK to have the same insn return a different type. */
3331 + static bool reg_type_mismatch_ok(enum bpf_reg_type type)
3332 + {
3333 +- switch (type) {
3334 ++ switch (base_type(type)) {
3335 + case PTR_TO_CTX:
3336 + case PTR_TO_SOCKET:
3337 +- case PTR_TO_SOCKET_OR_NULL:
3338 + case PTR_TO_SOCK_COMMON:
3339 +- case PTR_TO_SOCK_COMMON_OR_NULL:
3340 + case PTR_TO_TCP_SOCK:
3341 +- case PTR_TO_TCP_SOCK_OR_NULL:
3342 + case PTR_TO_XDP_SOCK:
3343 + case PTR_TO_BTF_ID:
3344 +- case PTR_TO_BTF_ID_OR_NULL:
3345 + return false;
3346 + default:
3347 + return true;
3348 +@@ -11102,7 +11098,7 @@ static int do_check(struct bpf_verifier_env *env)
3349 + if (is_ctx_reg(env, insn->dst_reg)) {
3350 + verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
3351 + insn->dst_reg,
3352 +- reg_type_str[reg_state(env, insn->dst_reg)->type]);
3353 ++ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
3354 + return -EACCES;
3355 + }
3356 +
3357 +@@ -11353,7 +11349,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
3358 + err = -EINVAL;
3359 + goto err_put;
3360 + }
3361 +- aux->btf_var.reg_type = PTR_TO_MEM;
3362 ++ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
3363 + aux->btf_var.mem_size = tsize;
3364 + } else {
3365 + aux->btf_var.reg_type = PTR_TO_BTF_ID;
3366 +@@ -13175,7 +13171,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
3367 + mark_reg_known_zero(env, regs, i);
3368 + else if (regs[i].type == SCALAR_VALUE)
3369 + mark_reg_unknown(env, regs, i);
3370 +- else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
3371 ++ else if (base_type(regs[i].type) == PTR_TO_MEM) {
3372 + const u32 mem_size = regs[i].mem_size;
3373 +
3374 + mark_reg_known_zero(env, regs, i);
3375 +diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
3376 +index 5a18b861fcf75..c289010b0964e 100644
3377 +--- a/kernel/trace/bpf_trace.c
3378 ++++ b/kernel/trace/bpf_trace.c
3379 +@@ -345,7 +345,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
3380 + .gpl_only = true,
3381 + .ret_type = RET_INTEGER,
3382 + .arg1_type = ARG_ANYTHING,
3383 +- .arg2_type = ARG_PTR_TO_MEM,
3384 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3385 + .arg3_type = ARG_CONST_SIZE,
3386 + };
3387 +
3388 +@@ -394,7 +394,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
3389 + .func = bpf_trace_printk,
3390 + .gpl_only = true,
3391 + .ret_type = RET_INTEGER,
3392 +- .arg1_type = ARG_PTR_TO_MEM,
3393 ++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3394 + .arg2_type = ARG_CONST_SIZE,
3395 + };
3396 +
3397 +@@ -446,9 +446,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = {
3398 + .ret_type = RET_INTEGER,
3399 + .arg1_type = ARG_PTR_TO_BTF_ID,
3400 + .arg1_btf_id = &btf_seq_file_ids[0],
3401 +- .arg2_type = ARG_PTR_TO_MEM,
3402 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3403 + .arg3_type = ARG_CONST_SIZE,
3404 +- .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
3405 ++ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
3406 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3407 + };
3408 +
3409 +@@ -463,7 +463,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = {
3410 + .ret_type = RET_INTEGER,
3411 + .arg1_type = ARG_PTR_TO_BTF_ID,
3412 + .arg1_btf_id = &btf_seq_file_ids[0],
3413 +- .arg2_type = ARG_PTR_TO_MEM,
3414 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3415 + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
3416 + };
3417 +
3418 +@@ -487,7 +487,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = {
3419 + .ret_type = RET_INTEGER,
3420 + .arg1_type = ARG_PTR_TO_BTF_ID,
3421 + .arg1_btf_id = &btf_seq_file_ids[0],
3422 +- .arg2_type = ARG_PTR_TO_MEM,
3423 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3424 + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
3425 + .arg4_type = ARG_ANYTHING,
3426 + };
3427 +@@ -648,7 +648,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
3428 + .arg1_type = ARG_PTR_TO_CTX,
3429 + .arg2_type = ARG_CONST_MAP_PTR,
3430 + .arg3_type = ARG_ANYTHING,
3431 +- .arg4_type = ARG_PTR_TO_MEM,
3432 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3433 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3434 + };
3435 +
3436 +@@ -958,7 +958,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
3437 + .ret_type = RET_INTEGER,
3438 + .arg1_type = ARG_PTR_TO_MEM,
3439 + .arg2_type = ARG_CONST_SIZE,
3440 +- .arg3_type = ARG_PTR_TO_MEM,
3441 ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3442 + .arg4_type = ARG_CONST_SIZE,
3443 + .arg5_type = ARG_ANYTHING,
3444 + };
3445 +@@ -1207,7 +1207,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
3446 + .arg1_type = ARG_PTR_TO_CTX,
3447 + .arg2_type = ARG_CONST_MAP_PTR,
3448 + .arg3_type = ARG_ANYTHING,
3449 +- .arg4_type = ARG_PTR_TO_MEM,
3450 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3451 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3452 + };
3453 +
3454 +@@ -1429,7 +1429,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
3455 + .arg1_type = ARG_PTR_TO_CTX,
3456 + .arg2_type = ARG_CONST_MAP_PTR,
3457 + .arg3_type = ARG_ANYTHING,
3458 +- .arg4_type = ARG_PTR_TO_MEM,
3459 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3460 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3461 + };
3462 +
3463 +@@ -1483,7 +1483,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
3464 + .gpl_only = true,
3465 + .ret_type = RET_INTEGER,
3466 + .arg1_type = ARG_PTR_TO_CTX,
3467 +- .arg2_type = ARG_PTR_TO_MEM,
3468 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3469 + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
3470 + .arg4_type = ARG_ANYTHING,
3471 + };
3472 +diff --git a/lib/iov_iter.c b/lib/iov_iter.c
3473 +index c5b2f0f4b8a84..6d146f77601d7 100644
3474 +--- a/lib/iov_iter.c
3475 ++++ b/lib/iov_iter.c
3476 +@@ -191,7 +191,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
3477 + buf = iov->iov_base + skip;
3478 + copy = min(bytes, iov->iov_len - skip);
3479 +
3480 +- if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
3481 ++ if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
3482 + kaddr = kmap_atomic(page);
3483 + from = kaddr + offset;
3484 +
3485 +@@ -275,7 +275,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
3486 + buf = iov->iov_base + skip;
3487 + copy = min(bytes, iov->iov_len - skip);
3488 +
3489 +- if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
3490 ++ if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
3491 + kaddr = kmap_atomic(page);
3492 + to = kaddr + offset;
3493 +
3494 +@@ -431,35 +431,81 @@ out:
3495 + }
3496 +
3497 + /*
3498 ++ * fault_in_iov_iter_readable - fault in iov iterator for reading
3499 ++ * @i: iterator
3500 ++ * @size: maximum length
3501 ++ *
3502 + * Fault in one or more iovecs of the given iov_iter, to a maximum length of
3503 +- * bytes. For each iovec, fault in each page that constitutes the iovec.
3504 ++ * @size. For each iovec, fault in each page that constitutes the iovec.
3505 ++ *
3506 ++ * Returns the number of bytes not faulted in (like copy_to_user() and
3507 ++ * copy_from_user()).
3508 + *
3509 +- * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
3510 +- * because it is an invalid address).
3511 ++ * Always returns 0 for non-userspace iterators.
3512 + */
3513 +-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
3514 ++size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
3515 + {
3516 + if (iter_is_iovec(i)) {
3517 ++ size_t count = min(size, iov_iter_count(i));
3518 + const struct iovec *p;
3519 + size_t skip;
3520 +
3521 +- if (bytes > i->count)
3522 +- bytes = i->count;
3523 +- for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
3524 +- size_t len = min(bytes, p->iov_len - skip);
3525 +- int err;
3526 ++ size -= count;
3527 ++ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
3528 ++ size_t len = min(count, p->iov_len - skip);
3529 ++ size_t ret;
3530 +
3531 + if (unlikely(!len))
3532 + continue;
3533 +- err = fault_in_pages_readable(p->iov_base + skip, len);
3534 +- if (unlikely(err))
3535 +- return err;
3536 +- bytes -= len;
3537 ++ ret = fault_in_readable(p->iov_base + skip, len);
3538 ++ count -= len - ret;
3539 ++ if (ret)
3540 ++ break;
3541 + }
3542 ++ return count + size;
3543 + }
3544 + return 0;
3545 + }
3546 +-EXPORT_SYMBOL(iov_iter_fault_in_readable);
3547 ++EXPORT_SYMBOL(fault_in_iov_iter_readable);
3548 ++
3549 ++/*
3550 ++ * fault_in_iov_iter_writeable - fault in iov iterator for writing
3551 ++ * @i: iterator
3552 ++ * @size: maximum length
3553 ++ *
3554 ++ * Faults in the iterator using get_user_pages(), i.e., without triggering
3555 ++ * hardware page faults. This is primarily useful when we already know that
3556 ++ * some or all of the pages in @i aren't in memory.
3557 ++ *
3558 ++ * Returns the number of bytes not faulted in, like copy_to_user() and
3559 ++ * copy_from_user().
3560 ++ *
3561 ++ * Always returns 0 for non-user-space iterators.
3562 ++ */
3563 ++size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
3564 ++{
3565 ++ if (iter_is_iovec(i)) {
3566 ++ size_t count = min(size, iov_iter_count(i));
3567 ++ const struct iovec *p;
3568 ++ size_t skip;
3569 ++
3570 ++ size -= count;
3571 ++ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
3572 ++ size_t len = min(count, p->iov_len - skip);
3573 ++ size_t ret;
3574 ++
3575 ++ if (unlikely(!len))
3576 ++ continue;
3577 ++ ret = fault_in_safe_writeable(p->iov_base + skip, len);
3578 ++ count -= len - ret;
3579 ++ if (ret)
3580 ++ break;
3581 ++ }
3582 ++ return count + size;
3583 ++ }
3584 ++ return 0;
3585 ++}
3586 ++EXPORT_SYMBOL(fault_in_iov_iter_writeable);
3587 +
3588 + void iov_iter_init(struct iov_iter *i, unsigned int direction,
3589 + const struct iovec *iov, unsigned long nr_segs,
3590 +@@ -468,6 +514,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
3591 + WARN_ON(direction & ~(READ | WRITE));
3592 + *i = (struct iov_iter) {
3593 + .iter_type = ITER_IOVEC,
3594 ++ .nofault = false,
3595 + .data_source = direction,
3596 + .iov = iov,
3597 + .nr_segs = nr_segs,
3598 +@@ -1483,13 +1530,17 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
3599 + return 0;
3600 +
3601 + if (likely(iter_is_iovec(i))) {
3602 ++ unsigned int gup_flags = 0;
3603 + unsigned long addr;
3604 +
3605 ++ if (iov_iter_rw(i) != WRITE)
3606 ++ gup_flags |= FOLL_WRITE;
3607 ++ if (i->nofault)
3608 ++ gup_flags |= FOLL_NOFAULT;
3609 ++
3610 + addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
3611 + n = DIV_ROUND_UP(len, PAGE_SIZE);
3612 +- res = get_user_pages_fast(addr, n,
3613 +- iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
3614 +- pages);
3615 ++ res = get_user_pages_fast(addr, n, gup_flags, pages);
3616 + if (unlikely(res <= 0))
3617 + return res;
3618 + return (res == n ? len : res * PAGE_SIZE) - *start;
3619 +@@ -1605,15 +1656,20 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
3620 + return 0;
3621 +
3622 + if (likely(iter_is_iovec(i))) {
3623 ++ unsigned int gup_flags = 0;
3624 + unsigned long addr;
3625 +
3626 ++ if (iov_iter_rw(i) != WRITE)
3627 ++ gup_flags |= FOLL_WRITE;
3628 ++ if (i->nofault)
3629 ++ gup_flags |= FOLL_NOFAULT;
3630 ++
3631 + addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
3632 + n = DIV_ROUND_UP(len, PAGE_SIZE);
3633 + p = get_pages_array(n);
3634 + if (!p)
3635 + return -ENOMEM;
3636 +- res = get_user_pages_fast(addr, n,
3637 +- iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
3638 ++ res = get_user_pages_fast(addr, n, gup_flags, p);
3639 + if (unlikely(res <= 0)) {
3640 + kvfree(p);
3641 + *pages = NULL;
3642 +diff --git a/mm/filemap.c b/mm/filemap.c
3643 +index 1293c3409e429..00e391e758801 100644
3644 +--- a/mm/filemap.c
3645 ++++ b/mm/filemap.c
3646 +@@ -90,7 +90,7 @@
3647 + * ->lock_page (filemap_fault, access_process_vm)
3648 + *
3649 + * ->i_rwsem (generic_perform_write)
3650 +- * ->mmap_lock (fault_in_pages_readable->do_page_fault)
3651 ++ * ->mmap_lock (fault_in_readable->do_page_fault)
3652 + *
3653 + * bdi->wb.list_lock
3654 + * sb_lock (fs/fs-writeback.c)
3655 +@@ -3760,7 +3760,7 @@ again:
3656 + * same page as we're writing to, without it being marked
3657 + * up-to-date.
3658 + */
3659 +- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
3660 ++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
3661 + status = -EFAULT;
3662 + break;
3663 + }
3664 +diff --git a/mm/gup.c b/mm/gup.c
3665 +index 52f08e3177e9f..ba2ab7a223f8e 100644
3666 +--- a/mm/gup.c
3667 ++++ b/mm/gup.c
3668 +@@ -943,6 +943,8 @@ static int faultin_page(struct vm_area_struct *vma,
3669 + /* mlock all present pages, but do not fault in new pages */
3670 + if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
3671 + return -ENOENT;
3672 ++ if (*flags & FOLL_NOFAULT)
3673 ++ return -EFAULT;
3674 + if (*flags & FOLL_WRITE)
3675 + fault_flags |= FAULT_FLAG_WRITE;
3676 + if (*flags & FOLL_REMOTE)
3677 +@@ -1681,6 +1683,122 @@ finish_or_fault:
3678 + }
3679 + #endif /* !CONFIG_MMU */
3680 +
3681 ++/**
3682 ++ * fault_in_writeable - fault in userspace address range for writing
3683 ++ * @uaddr: start of address range
3684 ++ * @size: size of address range
3685 ++ *
3686 ++ * Returns the number of bytes not faulted in (like copy_to_user() and
3687 ++ * copy_from_user()).
3688 ++ */
3689 ++size_t fault_in_writeable(char __user *uaddr, size_t size)
3690 ++{
3691 ++ char __user *start = uaddr, *end;
3692 ++
3693 ++ if (unlikely(size == 0))
3694 ++ return 0;
3695 ++ if (!PAGE_ALIGNED(uaddr)) {
3696 ++ if (unlikely(__put_user(0, uaddr) != 0))
3697 ++ return size;
3698 ++ uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
3699 ++ }
3700 ++ end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
3701 ++ if (unlikely(end < start))
3702 ++ end = NULL;
3703 ++ while (uaddr != end) {
3704 ++ if (unlikely(__put_user(0, uaddr) != 0))
3705 ++ goto out;
3706 ++ uaddr += PAGE_SIZE;
3707 ++ }
3708 ++
3709 ++out:
3710 ++ if (size > uaddr - start)
3711 ++ return size - (uaddr - start);
3712 ++ return 0;
3713 ++}
3714 ++EXPORT_SYMBOL(fault_in_writeable);
3715 ++
3716 ++/*
3717 ++ * fault_in_safe_writeable - fault in an address range for writing
3718 ++ * @uaddr: start of address range
3719 ++ * @size: length of address range
3720 ++ *
3721 ++ * Faults in an address range for writing. This is primarily useful when we
3722 ++ * already know that some or all of the pages in the address range aren't in
3723 ++ * memory.
3724 ++ *
3725 ++ * Unlike fault_in_writeable(), this function is non-destructive.
3726 ++ *
3727 ++ * Note that we don't pin or otherwise hold the pages referenced that we fault
3728 ++ * in. There's no guarantee that they'll stay in memory for any duration of
3729 ++ * time.
3730 ++ *
3731 ++ * Returns the number of bytes not faulted in, like copy_to_user() and
3732 ++ * copy_from_user().
3733 ++ */
3734 ++size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
3735 ++{
3736 ++ unsigned long start = (unsigned long)uaddr, end;
3737 ++ struct mm_struct *mm = current->mm;
3738 ++ bool unlocked = false;
3739 ++
3740 ++ if (unlikely(size == 0))
3741 ++ return 0;
3742 ++ end = PAGE_ALIGN(start + size);
3743 ++ if (end < start)
3744 ++ end = 0;
3745 ++
3746 ++ mmap_read_lock(mm);
3747 ++ do {
3748 ++ if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
3749 ++ break;
3750 ++ start = (start + PAGE_SIZE) & PAGE_MASK;
3751 ++ } while (start != end);
3752 ++ mmap_read_unlock(mm);
3753 ++
3754 ++ if (size > (unsigned long)uaddr - start)
3755 ++ return size - ((unsigned long)uaddr - start);
3756 ++ return 0;
3757 ++}
3758 ++EXPORT_SYMBOL(fault_in_safe_writeable);
3759 ++
3760 ++/**
3761 ++ * fault_in_readable - fault in userspace address range for reading
3762 ++ * @uaddr: start of user address range
3763 ++ * @size: size of user address range
3764 ++ *
3765 ++ * Returns the number of bytes not faulted in (like copy_to_user() and
3766 ++ * copy_from_user()).
3767 ++ */
3768 ++size_t fault_in_readable(const char __user *uaddr, size_t size)
3769 ++{
3770 ++ const char __user *start = uaddr, *end;
3771 ++ volatile char c;
3772 ++
3773 ++ if (unlikely(size == 0))
3774 ++ return 0;
3775 ++ if (!PAGE_ALIGNED(uaddr)) {
3776 ++ if (unlikely(__get_user(c, uaddr) != 0))
3777 ++ return size;
3778 ++ uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
3779 ++ }
3780 ++ end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
3781 ++ if (unlikely(end < start))
3782 ++ end = NULL;
3783 ++ while (uaddr != end) {
3784 ++ if (unlikely(__get_user(c, uaddr) != 0))
3785 ++ goto out;
3786 ++ uaddr += PAGE_SIZE;
3787 ++ }
3788 ++
3789 ++out:
3790 ++ (void)c;
3791 ++ if (size > uaddr - start)
3792 ++ return size - (uaddr - start);
3793 ++ return 0;
3794 ++}
3795 ++EXPORT_SYMBOL(fault_in_readable);
3796 ++
3797 + /**
3798 + * get_dump_page() - pin user page in memory while writing it to core dump
3799 + * @addr: user address
3800 +@@ -2733,7 +2851,7 @@ static int internal_get_user_pages_fast(unsigned long start,
3801 +
3802 + if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
3803 + FOLL_FORCE | FOLL_PIN | FOLL_GET |
3804 +- FOLL_FAST_ONLY)))
3805 ++ FOLL_FAST_ONLY | FOLL_NOFAULT)))
3806 + return -EINVAL;
3807 +
3808 + if (gup_flags & FOLL_PIN)
3809 +diff --git a/mm/kfence/core.c b/mm/kfence/core.c
3810 +index 86260e8f28302..66076d8742b78 100644
3811 +--- a/mm/kfence/core.c
3812 ++++ b/mm/kfence/core.c
3813 +@@ -528,6 +528,8 @@ static bool __init kfence_init_pool(void)
3814 + * enters __slab_free() slow-path.
3815 + */
3816 + for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
3817 ++ struct page *page = &pages[i];
3818 ++
3819 + if (!i || (i % 2))
3820 + continue;
3821 +
3822 +@@ -535,7 +537,11 @@ static bool __init kfence_init_pool(void)
3823 + if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
3824 + goto err;
3825 +
3826 +- __SetPageSlab(&pages[i]);
3827 ++ __SetPageSlab(page);
3828 ++#ifdef CONFIG_MEMCG
3829 ++ page->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
3830 ++ MEMCG_DATA_OBJCGS;
3831 ++#endif
3832 + }
3833 +
3834 + /*
3835 +@@ -911,6 +917,9 @@ void __kfence_free(void *addr)
3836 + {
3837 + struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
3838 +
3839 ++#ifdef CONFIG_MEMCG
3840 ++ KFENCE_WARN_ON(meta->objcg);
3841 ++#endif
3842 + /*
3843 + * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
3844 + * the object, as the object page may be recycled for other-typed
3845 +diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
3846 +index 92bf6eff6060d..600f2e2431d6d 100644
3847 +--- a/mm/kfence/kfence.h
3848 ++++ b/mm/kfence/kfence.h
3849 +@@ -89,6 +89,9 @@ struct kfence_metadata {
3850 + struct kfence_track free_track;
3851 + /* For updating alloc_covered on frees. */
3852 + u32 alloc_stack_hash;
3853 ++#ifdef CONFIG_MEMCG
3854 ++ struct obj_cgroup *objcg;
3855 ++#endif
3856 + };
3857 +
3858 + extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
3859 +diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
3860 +index 68d2cbf8331ac..ea61dfe19c869 100644
3861 +--- a/net/core/bpf_sk_storage.c
3862 ++++ b/net/core/bpf_sk_storage.c
3863 +@@ -929,7 +929,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
3864 + { offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
3865 + PTR_TO_BTF_ID_OR_NULL },
3866 + { offsetof(struct bpf_iter__bpf_sk_storage_map, value),
3867 +- PTR_TO_RDWR_BUF_OR_NULL },
3868 ++ PTR_TO_BUF | PTR_MAYBE_NULL },
3869 + },
3870 + .seq_info = &iter_seq_info,
3871 + };
3872 +diff --git a/net/core/filter.c b/net/core/filter.c
3873 +index cdd7e92db3030..821278b906b71 100644
3874 +--- a/net/core/filter.c
3875 ++++ b/net/core/filter.c
3876 +@@ -1713,7 +1713,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
3877 + .ret_type = RET_INTEGER,
3878 + .arg1_type = ARG_PTR_TO_CTX,
3879 + .arg2_type = ARG_ANYTHING,
3880 +- .arg3_type = ARG_PTR_TO_MEM,
3881 ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3882 + .arg4_type = ARG_CONST_SIZE,
3883 + .arg5_type = ARG_ANYTHING,
3884 + };
3885 +@@ -2018,9 +2018,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
3886 + .gpl_only = false,
3887 + .pkt_access = true,
3888 + .ret_type = RET_INTEGER,
3889 +- .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
3890 ++ .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
3891 + .arg2_type = ARG_CONST_SIZE_OR_ZERO,
3892 +- .arg3_type = ARG_PTR_TO_MEM_OR_NULL,
3893 ++ .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
3894 + .arg4_type = ARG_CONST_SIZE_OR_ZERO,
3895 + .arg5_type = ARG_ANYTHING,
3896 + };
3897 +@@ -2541,7 +2541,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = {
3898 + .gpl_only = false,
3899 + .ret_type = RET_INTEGER,
3900 + .arg1_type = ARG_ANYTHING,
3901 +- .arg2_type = ARG_PTR_TO_MEM_OR_NULL,
3902 ++ .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
3903 + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
3904 + .arg4_type = ARG_ANYTHING,
3905 + };
3906 +@@ -4177,7 +4177,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
3907 + .arg1_type = ARG_PTR_TO_CTX,
3908 + .arg2_type = ARG_CONST_MAP_PTR,
3909 + .arg3_type = ARG_ANYTHING,
3910 +- .arg4_type = ARG_PTR_TO_MEM,
3911 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3912 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3913 + };
3914 +
3915 +@@ -4191,7 +4191,7 @@ const struct bpf_func_proto bpf_skb_output_proto = {
3916 + .arg1_btf_id = &bpf_skb_output_btf_ids[0],
3917 + .arg2_type = ARG_CONST_MAP_PTR,
3918 + .arg3_type = ARG_ANYTHING,
3919 +- .arg4_type = ARG_PTR_TO_MEM,
3920 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3921 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3922 + };
3923 +
3924 +@@ -4374,7 +4374,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
3925 + .gpl_only = false,
3926 + .ret_type = RET_INTEGER,
3927 + .arg1_type = ARG_PTR_TO_CTX,
3928 +- .arg2_type = ARG_PTR_TO_MEM,
3929 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3930 + .arg3_type = ARG_CONST_SIZE,
3931 + .arg4_type = ARG_ANYTHING,
3932 + };
3933 +@@ -4400,7 +4400,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
3934 + .gpl_only = false,
3935 + .ret_type = RET_INTEGER,
3936 + .arg1_type = ARG_PTR_TO_CTX,
3937 +- .arg2_type = ARG_PTR_TO_MEM,
3938 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3939 + .arg3_type = ARG_CONST_SIZE,
3940 + };
3941 +
3942 +@@ -4570,7 +4570,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
3943 + .arg1_type = ARG_PTR_TO_CTX,
3944 + .arg2_type = ARG_CONST_MAP_PTR,
3945 + .arg3_type = ARG_ANYTHING,
3946 +- .arg4_type = ARG_PTR_TO_MEM,
3947 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3948 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3949 + };
3950 +
3951 +@@ -4584,7 +4584,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = {
3952 + .arg1_btf_id = &bpf_xdp_output_btf_ids[0],
3953 + .arg2_type = ARG_CONST_MAP_PTR,
3954 + .arg3_type = ARG_ANYTHING,
3955 +- .arg4_type = ARG_PTR_TO_MEM,
3956 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3957 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3958 + };
3959 +
3960 +@@ -5072,7 +5072,7 @@ const struct bpf_func_proto bpf_sk_setsockopt_proto = {
3961 + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
3962 + .arg2_type = ARG_ANYTHING,
3963 + .arg3_type = ARG_ANYTHING,
3964 +- .arg4_type = ARG_PTR_TO_MEM,
3965 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3966 + .arg5_type = ARG_CONST_SIZE,
3967 + };
3968 +
3969 +@@ -5106,7 +5106,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
3970 + .arg1_type = ARG_PTR_TO_CTX,
3971 + .arg2_type = ARG_ANYTHING,
3972 + .arg3_type = ARG_ANYTHING,
3973 +- .arg4_type = ARG_PTR_TO_MEM,
3974 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3975 + .arg5_type = ARG_CONST_SIZE,
3976 + };
3977 +
3978 +@@ -5140,7 +5140,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
3979 + .arg1_type = ARG_PTR_TO_CTX,
3980 + .arg2_type = ARG_ANYTHING,
3981 + .arg3_type = ARG_ANYTHING,
3982 +- .arg4_type = ARG_PTR_TO_MEM,
3983 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3984 + .arg5_type = ARG_CONST_SIZE,
3985 + };
3986 +
3987 +@@ -5315,7 +5315,7 @@ static const struct bpf_func_proto bpf_bind_proto = {
3988 + .gpl_only = false,
3989 + .ret_type = RET_INTEGER,
3990 + .arg1_type = ARG_PTR_TO_CTX,
3991 +- .arg2_type = ARG_PTR_TO_MEM,
3992 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3993 + .arg3_type = ARG_CONST_SIZE,
3994 + };
3995 +
3996 +@@ -5903,7 +5903,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
3997 + .ret_type = RET_INTEGER,
3998 + .arg1_type = ARG_PTR_TO_CTX,
3999 + .arg2_type = ARG_ANYTHING,
4000 +- .arg3_type = ARG_PTR_TO_MEM,
4001 ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4002 + .arg4_type = ARG_CONST_SIZE
4003 + };
4004 +
4005 +@@ -5913,7 +5913,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
4006 + .ret_type = RET_INTEGER,
4007 + .arg1_type = ARG_PTR_TO_CTX,
4008 + .arg2_type = ARG_ANYTHING,
4009 +- .arg3_type = ARG_PTR_TO_MEM,
4010 ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4011 + .arg4_type = ARG_CONST_SIZE
4012 + };
4013 +
4014 +@@ -5956,7 +5956,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
4015 + .ret_type = RET_INTEGER,
4016 + .arg1_type = ARG_PTR_TO_CTX,
4017 + .arg2_type = ARG_ANYTHING,
4018 +- .arg3_type = ARG_PTR_TO_MEM,
4019 ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4020 + .arg4_type = ARG_CONST_SIZE
4021 + };
4022 +
4023 +@@ -6044,7 +6044,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
4024 + .ret_type = RET_INTEGER,
4025 + .arg1_type = ARG_PTR_TO_CTX,
4026 + .arg2_type = ARG_ANYTHING,
4027 +- .arg3_type = ARG_PTR_TO_MEM,
4028 ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4029 + .arg4_type = ARG_CONST_SIZE
4030 + };
4031 +
4032 +@@ -6269,7 +6269,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
4033 + .pkt_access = true,
4034 + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
4035 + .arg1_type = ARG_PTR_TO_CTX,
4036 +- .arg2_type = ARG_PTR_TO_MEM,
4037 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4038 + .arg3_type = ARG_CONST_SIZE,
4039 + .arg4_type = ARG_ANYTHING,
4040 + .arg5_type = ARG_ANYTHING,
4041 +@@ -6288,7 +6288,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
4042 + .pkt_access = true,
4043 + .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4044 + .arg1_type = ARG_PTR_TO_CTX,
4045 +- .arg2_type = ARG_PTR_TO_MEM,
4046 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4047 + .arg3_type = ARG_CONST_SIZE,
4048 + .arg4_type = ARG_ANYTHING,
4049 + .arg5_type = ARG_ANYTHING,
4050 +@@ -6307,7 +6307,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
4051 + .pkt_access = true,
4052 + .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4053 + .arg1_type = ARG_PTR_TO_CTX,
4054 +- .arg2_type = ARG_PTR_TO_MEM,
4055 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4056 + .arg3_type = ARG_CONST_SIZE,
4057 + .arg4_type = ARG_ANYTHING,
4058 + .arg5_type = ARG_ANYTHING,
4059 +@@ -6344,7 +6344,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
4060 + .pkt_access = true,
4061 + .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4062 + .arg1_type = ARG_PTR_TO_CTX,
4063 +- .arg2_type = ARG_PTR_TO_MEM,
4064 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4065 + .arg3_type = ARG_CONST_SIZE,
4066 + .arg4_type = ARG_ANYTHING,
4067 + .arg5_type = ARG_ANYTHING,
4068 +@@ -6367,7 +6367,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
4069 + .pkt_access = true,
4070 + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
4071 + .arg1_type = ARG_PTR_TO_CTX,
4072 +- .arg2_type = ARG_PTR_TO_MEM,
4073 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4074 + .arg3_type = ARG_CONST_SIZE,
4075 + .arg4_type = ARG_ANYTHING,
4076 + .arg5_type = ARG_ANYTHING,
4077 +@@ -6390,7 +6390,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
4078 + .pkt_access = true,
4079 + .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4080 + .arg1_type = ARG_PTR_TO_CTX,
4081 +- .arg2_type = ARG_PTR_TO_MEM,
4082 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4083 + .arg3_type = ARG_CONST_SIZE,
4084 + .arg4_type = ARG_ANYTHING,
4085 + .arg5_type = ARG_ANYTHING,
4086 +@@ -6409,7 +6409,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
4087 + .gpl_only = false,
4088 + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
4089 + .arg1_type = ARG_PTR_TO_CTX,
4090 +- .arg2_type = ARG_PTR_TO_MEM,
4091 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4092 + .arg3_type = ARG_CONST_SIZE,
4093 + .arg4_type = ARG_ANYTHING,
4094 + .arg5_type = ARG_ANYTHING,
4095 +@@ -6428,7 +6428,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
4096 + .gpl_only = false,
4097 + .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4098 + .arg1_type = ARG_PTR_TO_CTX,
4099 +- .arg2_type = ARG_PTR_TO_MEM,
4100 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4101 + .arg3_type = ARG_CONST_SIZE,
4102 + .arg4_type = ARG_ANYTHING,
4103 + .arg5_type = ARG_ANYTHING,
4104 +@@ -6447,7 +6447,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
4105 + .gpl_only = false,
4106 + .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4107 + .arg1_type = ARG_PTR_TO_CTX,
4108 +- .arg2_type = ARG_PTR_TO_MEM,
4109 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4110 + .arg3_type = ARG_CONST_SIZE,
4111 + .arg4_type = ARG_ANYTHING,
4112 + .arg5_type = ARG_ANYTHING,
4113 +@@ -6769,9 +6769,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
4114 + .pkt_access = true,
4115 + .ret_type = RET_INTEGER,
4116 + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
4117 +- .arg2_type = ARG_PTR_TO_MEM,
4118 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4119 + .arg3_type = ARG_CONST_SIZE,
4120 +- .arg4_type = ARG_PTR_TO_MEM,
4121 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4122 + .arg5_type = ARG_CONST_SIZE,
4123 + };
4124 +
4125 +@@ -6838,9 +6838,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
4126 + .pkt_access = true,
4127 + .ret_type = RET_INTEGER,
4128 + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
4129 +- .arg2_type = ARG_PTR_TO_MEM,
4130 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4131 + .arg3_type = ARG_CONST_SIZE,
4132 +- .arg4_type = ARG_PTR_TO_MEM,
4133 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4134 + .arg5_type = ARG_CONST_SIZE,
4135 + };
4136 +
4137 +@@ -7069,7 +7069,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
4138 + .gpl_only = false,
4139 + .ret_type = RET_INTEGER,
4140 + .arg1_type = ARG_PTR_TO_CTX,
4141 +- .arg2_type = ARG_PTR_TO_MEM,
4142 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4143 + .arg3_type = ARG_CONST_SIZE,
4144 + .arg4_type = ARG_ANYTHING,
4145 + };
4146 +diff --git a/net/core/sock_map.c b/net/core/sock_map.c
4147 +index 8288b5382f08d..6351b6af7aca9 100644
4148 +--- a/net/core/sock_map.c
4149 ++++ b/net/core/sock_map.c
4150 +@@ -1575,7 +1575,7 @@ static struct bpf_iter_reg sock_map_iter_reg = {
4151 + .ctx_arg_info_size = 2,
4152 + .ctx_arg_info = {
4153 + { offsetof(struct bpf_iter__sockmap, key),
4154 +- PTR_TO_RDONLY_BUF_OR_NULL },
4155 ++ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
4156 + { offsetof(struct bpf_iter__sockmap, sk),
4157 + PTR_TO_BTF_ID_OR_NULL },
4158 + },
4159 +diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
4160 +index cf3acfa5a91d5..69455fe90ac3e 100644
4161 +--- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
4162 ++++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
4163 +@@ -7,6 +7,7 @@
4164 + #include "test_ksyms_btf.skel.h"
4165 + #include "test_ksyms_btf_null_check.skel.h"
4166 + #include "test_ksyms_weak.skel.h"
4167 ++#include "test_ksyms_btf_write_check.skel.h"
4168 +
4169 + static int duration;
4170 +
4171 +@@ -109,6 +110,16 @@ cleanup:
4172 + test_ksyms_weak__destroy(skel);
4173 + }
4174 +
4175 ++static void test_write_check(void)
4176 ++{
4177 ++ struct test_ksyms_btf_write_check *skel;
4178 ++
4179 ++ skel = test_ksyms_btf_write_check__open_and_load();
4180 ++ ASSERT_ERR_PTR(skel, "unexpected load of a prog writing to ksym memory\n");
4181 ++
4182 ++ test_ksyms_btf_write_check__destroy(skel);
4183 ++}
4184 ++
4185 + void test_ksyms_btf(void)
4186 + {
4187 + int percpu_datasec;
4188 +@@ -136,4 +147,7 @@ void test_ksyms_btf(void)
4189 +
4190 + if (test__start_subtest("weak_ksyms"))
4191 + test_weak_syms();
4192 ++
4193 ++ if (test__start_subtest("write_check"))
4194 ++ test_write_check();
4195 + }
4196 +diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c
4197 +new file mode 100644
4198 +index 0000000000000..2180c41cd890f
4199 +--- /dev/null
4200 ++++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c
4201 +@@ -0,0 +1,29 @@
4202 ++// SPDX-License-Identifier: GPL-2.0
4203 ++/* Copyright (c) 2021 Google */
4204 ++
4205 ++#include "vmlinux.h"
4206 ++
4207 ++#include <bpf/bpf_helpers.h>
4208 ++
4209 ++extern const int bpf_prog_active __ksym; /* int type global var. */
4210 ++
4211 ++SEC("raw_tp/sys_enter")
4212 ++int handler(const void *ctx)
4213 ++{
4214 ++ int *active;
4215 ++ __u32 cpu;
4216 ++
4217 ++ cpu = bpf_get_smp_processor_id();
4218 ++ active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
4219 ++ if (active) {
4220 ++ /* Kernel memory obtained from bpf_{per,this}_cpu_ptr
4221 ++ * is read-only, should _not_ pass verification.
4222 ++ */
4223 ++ /* WRITE_ONCE */
4224 ++ *(volatile int *)active = -1;
4225 ++ }
4226 ++
4227 ++ return 0;
4228 ++}
4229 ++
4230 ++char _license[] SEC("license") = "GPL";
4231 +diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
4232 +index 336a749673d19..2e701e7f69680 100644
4233 +--- a/tools/testing/selftests/bpf/verifier/calls.c
4234 ++++ b/tools/testing/selftests/bpf/verifier/calls.c
4235 +@@ -107,6 +107,25 @@
4236 + .result = REJECT,
4237 + .errstr = "R0 min value is outside of the allowed memory range",
4238 + },
4239 ++{
4240 ++ "calls: trigger reg2btf_ids[reg->type] for reg->type > __BPF_REG_TYPE_MAX",
4241 ++ .insns = {
4242 ++ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
4243 ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
4244 ++ BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0),
4245 ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
4246 ++ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
4247 ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
4248 ++ BPF_EXIT_INSN(),
4249 ++ },
4250 ++ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
4251 ++ .result = REJECT,
4252 ++ .errstr = "arg#0 pointer type STRUCT prog_test_ref_kfunc must point",
4253 ++ .fixup_kfunc_btf_id = {
4254 ++ { "bpf_kfunc_call_test_acquire", 3 },
4255 ++ { "bpf_kfunc_call_test_release", 5 },
4256 ++ },
4257 ++},
4258 + {
4259 + "calls: overlapping caller/callee",
4260 + .insns = {