Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:5.15 commit in: /
Date: Sun, 01 May 2022 17:03:20 +0000
Message-Id: 1651424578.aa3aade4f155b96481a44b6733e806c8181271cc.mpagano@gentoo
1 commit: aa3aade4f155b96481a44b6733e806c8181271cc
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Sun May 1 17:02:58 2022 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Sun May 1 17:02:58 2022 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=aa3aade4
7
8 Linux patch 5.15.37
9
10 Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>
11
12 0000_README | 4 +
13 1036_linux-5.15.37.patch | 4223 ++++++++++++++++++++++++++++++++++++++++++++++
14 2 files changed, 4227 insertions(+)
15
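Patches listed in 0000_README are applied in sequence on top of the vanilla kernel sources. A minimal sketch of applying this new patch by hand, assuming an unpacked vanilla linux-5.15.36 tree and GNU patch (the gentoo-sources ebuild normally handles this step automatically; the path below is a placeholder):

  cd linux-5.15.36
  # dry run first to confirm the patch applies cleanly
  patch -p1 --dry-run < /path/to/1036_linux-5.15.37.patch
  # apply for real; the top-level Makefile SUBLEVEL becomes 37
  patch -p1 < /path/to/1036_linux-5.15.37.patch
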
16 diff --git a/0000_README b/0000_README
17 index 0f44e39b..cb4266b1 100644
18 --- a/0000_README
19 +++ b/0000_README
20 @@ -187,6 +187,10 @@ Patch: 1035_linux-5.15.36.patch
21 From: http://www.kernel.org
22 Desc: Linux 5.15.36
23
24 +Patch: 1036_linux-5.15.37.patch
25 +From: http://www.kernel.org
26 +Desc: Linux 5.15.37
27 +
28 Patch: 1500_XATTR_USER_PREFIX.patch
29 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
30 Desc: Support for namespace user.pax.* on tmpfs.
31
32 diff --git a/1036_linux-5.15.37.patch b/1036_linux-5.15.37.patch
33 new file mode 100644
34 index 00000000..b9d4c0ea
35 --- /dev/null
36 +++ b/1036_linux-5.15.37.patch
37 @@ -0,0 +1,4223 @@
38 +diff --git a/Makefile b/Makefile
39 +index e0710f9837847..50b1688a4ca2c 100644
40 +--- a/Makefile
41 ++++ b/Makefile
42 +@@ -1,7 +1,7 @@
43 + # SPDX-License-Identifier: GPL-2.0
44 + VERSION = 5
45 + PATCHLEVEL = 15
46 +-SUBLEVEL = 36
47 ++SUBLEVEL = 37
48 + EXTRAVERSION =
49 + NAME = Trick or Treat
50 +
51 +diff --git a/arch/arm/boot/dts/socfpga.dtsi b/arch/arm/boot/dts/socfpga.dtsi
52 +index 0b021eef0b538..7c1d6423d7f8c 100644
53 +--- a/arch/arm/boot/dts/socfpga.dtsi
54 ++++ b/arch/arm/boot/dts/socfpga.dtsi
55 +@@ -782,7 +782,7 @@
56 + };
57 +
58 + qspi: spi@ff705000 {
59 +- compatible = "cdns,qspi-nor";
60 ++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
61 + #address-cells = <1>;
62 + #size-cells = <0>;
63 + reg = <0xff705000 0x1000>,
64 +diff --git a/arch/arm/boot/dts/socfpga_arria10.dtsi b/arch/arm/boot/dts/socfpga_arria10.dtsi
65 +index a574ea91d9d3f..3ba431dfa8c94 100644
66 +--- a/arch/arm/boot/dts/socfpga_arria10.dtsi
67 ++++ b/arch/arm/boot/dts/socfpga_arria10.dtsi
68 +@@ -756,7 +756,7 @@
69 + };
70 +
71 + qspi: spi@ff809000 {
72 +- compatible = "cdns,qspi-nor";
73 ++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
74 + #address-cells = <1>;
75 + #size-cells = <0>;
76 + reg = <0xff809000 0x100>,
77 +diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
78 +index d301ac0d406bf..3ec301bd08a91 100644
79 +--- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
80 ++++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
81 +@@ -594,7 +594,7 @@
82 + };
83 +
84 + qspi: spi@ff8d2000 {
85 +- compatible = "cdns,qspi-nor";
86 ++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
87 + #address-cells = <1>;
88 + #size-cells = <0>;
89 + reg = <0xff8d2000 0x100>,
90 +diff --git a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi
91 +index de1e98c99ec5b..f4270cf189962 100644
92 +--- a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi
93 ++++ b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi
94 +@@ -628,7 +628,7 @@
95 + };
96 +
97 + qspi: spi@ff8d2000 {
98 +- compatible = "cdns,qspi-nor";
99 ++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
100 + #address-cells = <1>;
101 + #size-cells = <0>;
102 + reg = <0xff8d2000 0x100>,
103 +diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
104 +index d89cf802d9aa7..6568823cf3063 100644
105 +--- a/arch/powerpc/kernel/kvm.c
106 ++++ b/arch/powerpc/kernel/kvm.c
107 +@@ -669,7 +669,8 @@ static void __init kvm_use_magic_page(void)
108 + on_each_cpu(kvm_map_magic_page, &features, 1);
109 +
110 + /* Quick self-test to see if the mapping works */
111 +- if (fault_in_pages_readable((const char *)KVM_MAGIC_PAGE, sizeof(u32))) {
112 ++ if (fault_in_readable((const char __user *)KVM_MAGIC_PAGE,
113 ++ sizeof(u32))) {
114 + kvm_patching_worked = false;
115 + return;
116 + }
117 +diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
118 +index f2da879264bcd..3e053e2fd6b69 100644
119 +--- a/arch/powerpc/kernel/signal_32.c
120 ++++ b/arch/powerpc/kernel/signal_32.c
121 +@@ -1048,7 +1048,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
122 + if (new_ctx == NULL)
123 + return 0;
124 + if (!access_ok(new_ctx, ctx_size) ||
125 +- fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
126 ++ fault_in_readable((char __user *)new_ctx, ctx_size))
127 + return -EFAULT;
128 +
129 + /*
130 +@@ -1239,7 +1239,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx,
131 + #endif
132 +
133 + if (!access_ok(ctx, sizeof(*ctx)) ||
134 +- fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx)))
135 ++ fault_in_readable((char __user *)ctx, sizeof(*ctx)))
136 + return -EFAULT;
137 +
138 + /*
139 +diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
140 +index bb9c077ac1322..d1e1fc0acbea3 100644
141 +--- a/arch/powerpc/kernel/signal_64.c
142 ++++ b/arch/powerpc/kernel/signal_64.c
143 +@@ -688,7 +688,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
144 + if (new_ctx == NULL)
145 + return 0;
146 + if (!access_ok(new_ctx, ctx_size) ||
147 +- fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
148 ++ fault_in_readable((char __user *)new_ctx, ctx_size))
149 + return -EFAULT;
150 +
151 + /*
152 +diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
153 +index 831b25c5e7058..7f71bd4dcd0d6 100644
154 +--- a/arch/x86/kernel/fpu/signal.c
155 ++++ b/arch/x86/kernel/fpu/signal.c
156 +@@ -205,7 +205,7 @@ retry:
157 + fpregs_unlock();
158 +
159 + if (ret) {
160 +- if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size))
161 ++ if (!fault_in_writeable(buf_fx, fpu_user_xstate_size))
162 + goto retry;
163 + return -EFAULT;
164 + }
165 +@@ -278,10 +278,9 @@ retry:
166 + if (ret != -EFAULT)
167 + return -EINVAL;
168 +
169 +- ret = fault_in_pages_readable(buf, size);
170 +- if (!ret)
171 ++ if (!fault_in_readable(buf, size))
172 + goto retry;
173 +- return ret;
174 ++ return -EFAULT;
175 + }
176 +
177 + /*
178 +diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
179 +index ab3e37aa1830c..f93cb989241cc 100644
180 +--- a/drivers/block/Kconfig
181 ++++ b/drivers/block/Kconfig
182 +@@ -33,6 +33,22 @@ config BLK_DEV_FD
183 + To compile this driver as a module, choose M here: the
184 + module will be called floppy.
185 +
186 ++config BLK_DEV_FD_RAWCMD
187 ++ bool "Support for raw floppy disk commands (DEPRECATED)"
188 ++ depends on BLK_DEV_FD
189 ++ help
190 ++ If you want to use actual physical floppies and expect to do
191 ++ special low-level hardware accesses to them (access and use
192 ++ non-standard formats, for example), then enable this.
193 ++
194 ++ Note that the code enabled by this option is rarely used and
195 ++ might be unstable or insecure, and distros should not enable it.
196 ++
197 ++ Note: FDRAWCMD is deprecated and will be removed from the kernel
198 ++ in the near future.
199 ++
200 ++ If unsure, say N.
201 ++
202 + config AMIGA_FLOPPY
203 + tristate "Amiga floppy support"
204 + depends on AMIGA
205 +diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
206 +index 0f58594c5a4d6..1c152b542a52d 100644
207 +--- a/drivers/block/floppy.c
208 ++++ b/drivers/block/floppy.c
209 +@@ -2984,6 +2984,8 @@ static const char *drive_name(int type, int drive)
210 + return "(null)";
211 + }
212 +
213 ++#ifdef CONFIG_BLK_DEV_FD_RAWCMD
214 ++
215 + /* raw commands */
216 + static void raw_cmd_done(int flag)
217 + {
218 +@@ -3183,6 +3185,35 @@ static int raw_cmd_ioctl(int cmd, void __user *param)
219 + return ret;
220 + }
221 +
222 ++static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
223 ++ void __user *param)
224 ++{
225 ++ int ret;
226 ++
227 ++ pr_warn_once("Note: FDRAWCMD is deprecated and will be removed from the kernel in the near future.\n");
228 ++
229 ++ if (type)
230 ++ return -EINVAL;
231 ++ if (lock_fdc(drive))
232 ++ return -EINTR;
233 ++ set_floppy(drive);
234 ++ ret = raw_cmd_ioctl(cmd, param);
235 ++ if (ret == -EINTR)
236 ++ return -EINTR;
237 ++ process_fd_request();
238 ++ return ret;
239 ++}
240 ++
241 ++#else /* CONFIG_BLK_DEV_FD_RAWCMD */
242 ++
243 ++static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
244 ++ void __user *param)
245 ++{
246 ++ return -EOPNOTSUPP;
247 ++}
248 ++
249 ++#endif
250 ++
251 + static int invalidate_drive(struct block_device *bdev)
252 + {
253 + /* invalidate the buffer track to force a reread */
254 +@@ -3371,7 +3402,6 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
255 + {
256 + int drive = (long)bdev->bd_disk->private_data;
257 + int type = ITYPE(drive_state[drive].fd_device);
258 +- int i;
259 + int ret;
260 + int size;
261 + union inparam {
262 +@@ -3522,16 +3552,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
263 + outparam = &write_errors[drive];
264 + break;
265 + case FDRAWCMD:
266 +- if (type)
267 +- return -EINVAL;
268 +- if (lock_fdc(drive))
269 +- return -EINTR;
270 +- set_floppy(drive);
271 +- i = raw_cmd_ioctl(cmd, (void __user *)param);
272 +- if (i == -EINTR)
273 +- return -EINTR;
274 +- process_fd_request();
275 +- return i;
276 ++ return floppy_raw_cmd_ioctl(type, drive, cmd, (void __user *)param);
277 + case FDTWADDLE:
278 + if (lock_fdc(drive))
279 + return -EINTR;
280 +diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
281 +index 21909642ee4ca..8fbb25913327c 100644
282 +--- a/drivers/gpu/drm/armada/armada_gem.c
283 ++++ b/drivers/gpu/drm/armada/armada_gem.c
284 +@@ -336,7 +336,7 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
285 + struct drm_armada_gem_pwrite *args = data;
286 + struct armada_gem_object *dobj;
287 + char __user *ptr;
288 +- int ret;
289 ++ int ret = 0;
290 +
291 + DRM_DEBUG_DRIVER("handle %u off %u size %u ptr 0x%llx\n",
292 + args->handle, args->offset, args->size, args->ptr);
293 +@@ -349,9 +349,8 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
294 + if (!access_ok(ptr, args->size))
295 + return -EFAULT;
296 +
297 +- ret = fault_in_pages_readable(ptr, args->size);
298 +- if (ret)
299 +- return ret;
300 ++ if (fault_in_readable(ptr, args->size))
301 ++ return -EFAULT;
302 +
303 + dobj = armada_gem_object_lookup(file, args->handle);
304 + if (dobj == NULL)
305 +diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c
306 +index 75680eecd2f7d..2714ba02b176b 100644
307 +--- a/drivers/spi/spi-cadence-quadspi.c
308 ++++ b/drivers/spi/spi-cadence-quadspi.c
309 +@@ -36,6 +36,7 @@
310 + /* Quirks */
311 + #define CQSPI_NEEDS_WR_DELAY BIT(0)
312 + #define CQSPI_DISABLE_DAC_MODE BIT(1)
313 ++#define CQSPI_NO_SUPPORT_WR_COMPLETION BIT(3)
314 +
315 + /* Capabilities */
316 + #define CQSPI_SUPPORTS_OCTAL BIT(0)
317 +@@ -83,6 +84,7 @@ struct cqspi_st {
318 + u32 wr_delay;
319 + bool use_direct_mode;
320 + struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT];
321 ++ bool wr_completion;
322 + };
323 +
324 + struct cqspi_driver_platdata {
325 +@@ -797,9 +799,11 @@ static int cqspi_write_setup(struct cqspi_flash_pdata *f_pdata,
326 + * polling on the controller's side. spinand and spi-nor will take
327 + * care of polling the status register.
328 + */
329 +- reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
330 +- reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL;
331 +- writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
332 ++ if (cqspi->wr_completion) {
333 ++ reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
334 ++ reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL;
335 ++ writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
336 ++ }
337 +
338 + reg = readl(reg_base + CQSPI_REG_SIZE);
339 + reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK;
340 +@@ -1532,6 +1536,10 @@ static int cqspi_probe(struct platform_device *pdev)
341 +
342 + cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk);
343 + master->max_speed_hz = cqspi->master_ref_clk_hz;
344 ++
345 ++ /* write completion is supported by default */
346 ++ cqspi->wr_completion = true;
347 ++
348 + ddata = of_device_get_match_data(dev);
349 + if (ddata) {
350 + if (ddata->quirks & CQSPI_NEEDS_WR_DELAY)
351 +@@ -1541,6 +1549,8 @@ static int cqspi_probe(struct platform_device *pdev)
352 + master->mode_bits |= SPI_RX_OCTAL | SPI_TX_OCTAL;
353 + if (!(ddata->quirks & CQSPI_DISABLE_DAC_MODE))
354 + cqspi->use_direct_mode = true;
355 ++ if (ddata->quirks & CQSPI_NO_SUPPORT_WR_COMPLETION)
356 ++ cqspi->wr_completion = false;
357 + }
358 +
359 + ret = devm_request_irq(dev, irq, cqspi_irq_handler, 0,
360 +@@ -1649,6 +1659,10 @@ static const struct cqspi_driver_platdata intel_lgm_qspi = {
361 + .quirks = CQSPI_DISABLE_DAC_MODE,
362 + };
363 +
364 ++static const struct cqspi_driver_platdata socfpga_qspi = {
365 ++ .quirks = CQSPI_NO_SUPPORT_WR_COMPLETION,
366 ++};
367 ++
368 + static const struct of_device_id cqspi_dt_ids[] = {
369 + {
370 + .compatible = "cdns,qspi-nor",
371 +@@ -1666,6 +1680,10 @@ static const struct of_device_id cqspi_dt_ids[] = {
372 + .compatible = "intel,lgm-qspi",
373 + .data = &intel_lgm_qspi,
374 + },
375 ++ {
376 ++ .compatible = "intel,socfpga-qspi",
377 ++ .data = (void *)&socfpga_qspi,
378 ++ },
379 + { /* end of table */ }
380 + };
381 +
382 +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
383 +index dc1e4d1b72914..ff578c934bbcf 100644
384 +--- a/fs/btrfs/file.c
385 ++++ b/fs/btrfs/file.c
386 +@@ -1709,7 +1709,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
387 + * Fault pages before locking them in prepare_pages
388 + * to avoid recursive lock
389 + */
390 +- if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
391 ++ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
392 + ret = -EFAULT;
393 + break;
394 + }
395 +@@ -1903,16 +1903,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
396 +
397 + static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
398 + {
399 ++ const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
400 + struct file *file = iocb->ki_filp;
401 + struct inode *inode = file_inode(file);
402 + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
403 + loff_t pos;
404 + ssize_t written = 0;
405 + ssize_t written_buffered;
406 ++ size_t prev_left = 0;
407 + loff_t endbyte;
408 + ssize_t err;
409 + unsigned int ilock_flags = 0;
410 +- struct iomap_dio *dio = NULL;
411 +
412 + if (iocb->ki_flags & IOCB_NOWAIT)
413 + ilock_flags |= BTRFS_ILOCK_TRY;
414 +@@ -1955,23 +1956,80 @@ relock:
415 + goto buffered;
416 + }
417 +
418 +- dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
419 +- 0);
420 ++ /*
421 ++ * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
422 ++ * calls generic_write_sync() (through iomap_dio_complete()), because
423 ++ * that results in calling fsync (btrfs_sync_file()) which will try to
424 ++ * lock the inode in exclusive/write mode.
425 ++ */
426 ++ if (is_sync_write)
427 ++ iocb->ki_flags &= ~IOCB_DSYNC;
428 +
429 +- btrfs_inode_unlock(inode, ilock_flags);
430 ++ /*
431 ++ * The iov_iter can be mapped to the same file range we are writing to.
432 ++ * If that's the case, then we will deadlock in the iomap code, because
433 ++ * it first calls our callback btrfs_dio_iomap_begin(), which will create
434 ++ * an ordered extent, and after that it will fault in the pages that the
435 ++ * iov_iter refers to. During the fault in we end up in the readahead
436 ++ * pages code (starting at btrfs_readahead()), which will lock the range,
437 ++ * find that ordered extent and then wait for it to complete (at
438 ++ * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
439 ++ * obviously the ordered extent can never complete as we didn't submit
440 ++ * yet the respective bio(s). This always happens when the buffer is
441 ++ * memory mapped to the same file range, since the iomap DIO code always
442 ++ * invalidates pages in the target file range (after starting and waiting
443 ++ * for any writeback).
444 ++ *
445 ++ * So here we disable page faults in the iov_iter and then retry if we
446 ++ * got -EFAULT, faulting in the pages before the retry.
447 ++ */
448 ++again:
449 ++ from->nofault = true;
450 ++ err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
451 ++ IOMAP_DIO_PARTIAL, written);
452 ++ from->nofault = false;
453 +
454 +- if (IS_ERR_OR_NULL(dio)) {
455 +- err = PTR_ERR_OR_ZERO(dio);
456 +- if (err < 0 && err != -ENOTBLK)
457 +- goto out;
458 +- } else {
459 +- written = iomap_dio_complete(dio);
460 ++ /* No increment (+=) because iomap returns a cumulative value. */
461 ++ if (err > 0)
462 ++ written = err;
463 ++
464 ++ if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
465 ++ const size_t left = iov_iter_count(from);
466 ++ /*
467 ++ * We have more data left to write. Try to fault in as many as
468 ++ * possible of the remainder pages and retry. We do this without
469 ++ * releasing and locking again the inode, to prevent races with
470 ++ * truncate.
471 ++ *
472 ++ * Also, in case the iov refers to pages in the file range of the
473 ++ * file we want to write to (due to a mmap), we could enter an
474 ++ * infinite loop if we retry after faulting the pages in, since
475 ++ * iomap will invalidate any pages in the range early on, before
476 ++ * it tries to fault in the pages of the iov. So we keep track of
477 ++ * how much was left of iov in the previous EFAULT and fallback
478 ++ * to buffered IO in case we haven't made any progress.
479 ++ */
480 ++ if (left == prev_left) {
481 ++ err = -ENOTBLK;
482 ++ } else {
483 ++ fault_in_iov_iter_readable(from, left);
484 ++ prev_left = left;
485 ++ goto again;
486 ++ }
487 + }
488 +
489 +- if (written < 0 || !iov_iter_count(from)) {
490 +- err = written;
491 ++ btrfs_inode_unlock(inode, ilock_flags);
492 ++
493 ++ /*
494 ++ * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
495 ++ * the fsync (call generic_write_sync()).
496 ++ */
497 ++ if (is_sync_write)
498 ++ iocb->ki_flags |= IOCB_DSYNC;
499 ++
500 ++ /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
501 ++ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
502 + goto out;
503 +- }
504 +
505 + buffered:
506 + pos = iocb->ki_pos;
507 +@@ -1996,7 +2054,7 @@ buffered:
508 + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
509 + endbyte >> PAGE_SHIFT);
510 + out:
511 +- return written ? written : err;
512 ++ return err < 0 ? err : written;
513 + }
514 +
515 + static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
516 +@@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
517 + static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
518 + {
519 + struct inode *inode = file_inode(iocb->ki_filp);
520 ++ size_t prev_left = 0;
521 ++ ssize_t read = 0;
522 + ssize_t ret;
523 +
524 + if (fsverity_active(inode))
525 +@@ -3668,9 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
526 + return 0;
527 +
528 + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
529 +- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
530 ++again:
531 ++ /*
532 ++ * This is similar to what we do for direct IO writes, see the comment
533 ++ * at btrfs_direct_write(), but we also disable page faults in addition
534 ++ * to disabling them only at the iov_iter level. This is because when
535 ++ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
536 ++ * which can still trigger page fault ins despite having set ->nofault
537 ++ * to true of our 'to' iov_iter.
538 ++ *
539 ++ * The difference to direct IO writes is that we deadlock when trying
540 ++ * to lock the extent range in the inode's tree during he page reads
541 ++ * triggered by the fault in (while for writes it is due to waiting for
542 ++ * our own ordered extent). This is because for direct IO reads,
543 ++ * btrfs_dio_iomap_begin() returns with the extent range locked, which
544 ++ * is only unlocked in the endio callback (end_bio_extent_readpage()).
545 ++ */
546 ++ pagefault_disable();
547 ++ to->nofault = true;
548 ++ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
549 ++ IOMAP_DIO_PARTIAL, read);
550 ++ to->nofault = false;
551 ++ pagefault_enable();
552 ++
553 ++ /* No increment (+=) because iomap returns a cumulative value. */
554 ++ if (ret > 0)
555 ++ read = ret;
556 ++
557 ++ if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
558 ++ const size_t left = iov_iter_count(to);
559 ++
560 ++ if (left == prev_left) {
561 ++ /*
562 ++ * We didn't make any progress since the last attempt,
563 ++ * fallback to a buffered read for the remainder of the
564 ++ * range. This is just to avoid any possibility of looping
565 ++ * for too long.
566 ++ */
567 ++ ret = read;
568 ++ } else {
569 ++ /*
570 ++ * We made some progress since the last retry or this is
571 ++ * the first time we are retrying. Fault in as many pages
572 ++ * as possible and retry.
573 ++ */
574 ++ fault_in_iov_iter_writeable(to, left);
575 ++ prev_left = left;
576 ++ goto again;
577 ++ }
578 ++ }
579 + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
580 +- return ret;
581 ++ return ret < 0 ? ret : read;
582 + }
583 +
584 + static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
585 +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
586 +index 6266a706bff7d..044d584c3467c 100644
587 +--- a/fs/btrfs/inode.c
588 ++++ b/fs/btrfs/inode.c
589 +@@ -7961,6 +7961,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
590 + }
591 +
592 + len = min(len, em->len - (start - em->start));
593 ++
594 ++ /*
595 ++ * If we have a NOWAIT request and the range contains multiple extents
596 ++ * (or a mix of extents and holes), then we return -EAGAIN to make the
597 ++ * caller fallback to a context where it can do a blocking (without
598 ++ * NOWAIT) request. This way we avoid doing partial IO and returning
599 ++ * success to the caller, which is not optimal for writes and for reads
600 ++ * it can result in unexpected behaviour for an application.
601 ++ *
602 ++ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
603 ++ * iomap_dio_rw(), we can end up returning less data then what the caller
604 ++ * asked for, resulting in an unexpected, and incorrect, short read.
605 ++ * That is, the caller asked to read N bytes and we return less than that,
606 ++ * which is wrong unless we are crossing EOF. This happens if we get a
607 ++ * page fault error when trying to fault in pages for the buffer that is
608 ++ * associated to the struct iov_iter passed to iomap_dio_rw(), and we
609 ++ * have previously submitted bios for other extents in the range, in
610 ++ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
611 ++ * those bios have completed by the time we get the page fault error,
612 ++ * which we return back to our caller - we should only return EIOCBQUEUED
613 ++ * after we have submitted bios for all the extents in the range.
614 ++ */
615 ++ if ((flags & IOMAP_NOWAIT) && len < length) {
616 ++ free_extent_map(em);
617 ++ ret = -EAGAIN;
618 ++ goto unlock_err;
619 ++ }
620 ++
621 + if (write) {
622 + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
623 + start, len);
624 +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
625 +index 6a863b3f6de03..bf53af8694f8e 100644
626 +--- a/fs/btrfs/ioctl.c
627 ++++ b/fs/btrfs/ioctl.c
628 +@@ -2258,9 +2258,8 @@ static noinline int search_ioctl(struct inode *inode,
629 + key.offset = sk->min_offset;
630 +
631 + while (1) {
632 +- ret = fault_in_pages_writeable(ubuf + sk_offset,
633 +- *buf_size - sk_offset);
634 +- if (ret)
635 ++ ret = -EFAULT;
636 ++ if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
637 + break;
638 +
639 + ret = btrfs_search_forward(root, &key, path, sk->min_transid);
640 +diff --git a/fs/erofs/data.c b/fs/erofs/data.c
641 +index 9db8297156527..16a41d0db55a3 100644
642 +--- a/fs/erofs/data.c
643 ++++ b/fs/erofs/data.c
644 +@@ -287,7 +287,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
645 +
646 + if (!err)
647 + return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
648 +- NULL, 0);
649 ++ NULL, 0, 0);
650 + if (err < 0)
651 + return err;
652 + }
653 +diff --git a/fs/ext4/file.c b/fs/ext4/file.c
654 +index ac0e11bbb4450..b25c1f8f7c4f1 100644
655 +--- a/fs/ext4/file.c
656 ++++ b/fs/ext4/file.c
657 +@@ -74,7 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
658 + return generic_file_read_iter(iocb, to);
659 + }
660 +
661 +- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
662 ++ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0);
663 + inode_unlock_shared(inode);
664 +
665 + file_accessed(iocb->ki_filp);
666 +@@ -566,7 +566,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
667 + if (ilock_shared)
668 + iomap_ops = &ext4_iomap_overwrite_ops;
669 + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
670 +- (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
671 ++ (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
672 ++ 0);
673 + if (ret == -ENOTBLK)
674 + ret = 0;
675 +
676 +diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
677 +index 0e14dc41ed4e6..8ef92719c6799 100644
678 +--- a/fs/f2fs/file.c
679 ++++ b/fs/f2fs/file.c
680 +@@ -4279,7 +4279,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
681 + size_t target_size = 0;
682 + int err;
683 +
684 +- if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
685 ++ if (fault_in_iov_iter_readable(from, iov_iter_count(from)))
686 + set_inode_flag(inode, FI_NO_PREALLOC);
687 +
688 + if ((iocb->ki_flags & IOCB_NOWAIT)) {
689 +diff --git a/fs/fuse/file.c b/fs/fuse/file.c
690 +index bc50a9fa84a0c..71e9e301e569d 100644
691 +--- a/fs/fuse/file.c
692 ++++ b/fs/fuse/file.c
693 +@@ -1164,7 +1164,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
694 +
695 + again:
696 + err = -EFAULT;
697 +- if (iov_iter_fault_in_readable(ii, bytes))
698 ++ if (fault_in_iov_iter_readable(ii, bytes))
699 + break;
700 +
701 + err = -ENOMEM;
702 +diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
703 +index bb9014ced702a..fbdb7a30470a3 100644
704 +--- a/fs/gfs2/bmap.c
705 ++++ b/fs/gfs2/bmap.c
706 +@@ -961,46 +961,6 @@ hole_found:
707 + goto out;
708 + }
709 +
710 +-static int gfs2_write_lock(struct inode *inode)
711 +-{
712 +- struct gfs2_inode *ip = GFS2_I(inode);
713 +- struct gfs2_sbd *sdp = GFS2_SB(inode);
714 +- int error;
715 +-
716 +- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
717 +- error = gfs2_glock_nq(&ip->i_gh);
718 +- if (error)
719 +- goto out_uninit;
720 +- if (&ip->i_inode == sdp->sd_rindex) {
721 +- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
722 +-
723 +- error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
724 +- GL_NOCACHE, &m_ip->i_gh);
725 +- if (error)
726 +- goto out_unlock;
727 +- }
728 +- return 0;
729 +-
730 +-out_unlock:
731 +- gfs2_glock_dq(&ip->i_gh);
732 +-out_uninit:
733 +- gfs2_holder_uninit(&ip->i_gh);
734 +- return error;
735 +-}
736 +-
737 +-static void gfs2_write_unlock(struct inode *inode)
738 +-{
739 +- struct gfs2_inode *ip = GFS2_I(inode);
740 +- struct gfs2_sbd *sdp = GFS2_SB(inode);
741 +-
742 +- if (&ip->i_inode == sdp->sd_rindex) {
743 +- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
744 +-
745 +- gfs2_glock_dq_uninit(&m_ip->i_gh);
746 +- }
747 +- gfs2_glock_dq_uninit(&ip->i_gh);
748 +-}
749 +-
750 + static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
751 + unsigned len)
752 + {
753 +@@ -1118,11 +1078,6 @@ out_qunlock:
754 + return ret;
755 + }
756 +
757 +-static inline bool gfs2_iomap_need_write_lock(unsigned flags)
758 +-{
759 +- return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT);
760 +-}
761 +-
762 + static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
763 + unsigned flags, struct iomap *iomap,
764 + struct iomap *srcmap)
765 +@@ -1135,12 +1090,6 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
766 + iomap->flags |= IOMAP_F_BUFFER_HEAD;
767 +
768 + trace_gfs2_iomap_start(ip, pos, length, flags);
769 +- if (gfs2_iomap_need_write_lock(flags)) {
770 +- ret = gfs2_write_lock(inode);
771 +- if (ret)
772 +- goto out;
773 +- }
774 +-
775 + ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
776 + if (ret)
777 + goto out_unlock;
778 +@@ -1168,10 +1117,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
779 + ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
780 +
781 + out_unlock:
782 +- if (ret && gfs2_iomap_need_write_lock(flags))
783 +- gfs2_write_unlock(inode);
784 + release_metapath(&mp);
785 +-out:
786 + trace_gfs2_iomap_end(ip, iomap, ret);
787 + return ret;
788 + }
789 +@@ -1219,15 +1165,11 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
790 + }
791 +
792 + if (unlikely(!written))
793 +- goto out_unlock;
794 ++ return 0;
795 +
796 + if (iomap->flags & IOMAP_F_SIZE_CHANGED)
797 + mark_inode_dirty(inode);
798 + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
799 +-
800 +-out_unlock:
801 +- if (gfs2_iomap_need_write_lock(flags))
802 +- gfs2_write_unlock(inode);
803 + return 0;
804 + }
805 +
806 +diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
807 +index 1c8b747072cba..247b8d95b5ef4 100644
808 +--- a/fs/gfs2/file.c
809 ++++ b/fs/gfs2/file.c
810 +@@ -777,27 +777,99 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
811 + return ret ? ret : ret1;
812 + }
813 +
814 ++static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
815 ++ size_t *prev_count,
816 ++ size_t *window_size)
817 ++{
818 ++ char __user *p = i->iov[0].iov_base + i->iov_offset;
819 ++ size_t count = iov_iter_count(i);
820 ++ int pages = 1;
821 ++
822 ++ if (likely(!count))
823 ++ return false;
824 ++ if (ret <= 0 && ret != -EFAULT)
825 ++ return false;
826 ++ if (!iter_is_iovec(i))
827 ++ return false;
828 ++
829 ++ if (*prev_count != count || !*window_size) {
830 ++ int pages, nr_dirtied;
831 ++
832 ++ pages = min_t(int, BIO_MAX_VECS,
833 ++ DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE));
834 ++ nr_dirtied = max(current->nr_dirtied_pause -
835 ++ current->nr_dirtied, 1);
836 ++ pages = min(pages, nr_dirtied);
837 ++ }
838 ++
839 ++ *prev_count = count;
840 ++ *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p);
841 ++ return true;
842 ++}
843 ++
844 + static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
845 + struct gfs2_holder *gh)
846 + {
847 + struct file *file = iocb->ki_filp;
848 + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
849 +- size_t count = iov_iter_count(to);
850 ++ size_t prev_count = 0, window_size = 0;
851 ++ size_t written = 0;
852 + ssize_t ret;
853 +
854 +- if (!count)
855 ++ /*
856 ++ * In this function, we disable page faults when we're holding the
857 ++ * inode glock while doing I/O. If a page fault occurs, we indicate
858 ++ * that the inode glock may be dropped, fault in the pages manually,
859 ++ * and retry.
860 ++ *
861 ++ * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
862 ++ * physical as well as manual page faults, and we need to disable both
863 ++ * kinds.
864 ++ *
865 ++ * For direct I/O, gfs2 takes the inode glock in deferred mode. This
866 ++ * locking mode is compatible with other deferred holders, so multiple
867 ++ * processes and nodes can do direct I/O to a file at the same time.
868 ++ * There's no guarantee that reads or writes will be atomic. Any
869 ++ * coordination among readers and writers needs to happen externally.
870 ++ */
871 ++
872 ++ if (!iov_iter_count(to))
873 + return 0; /* skip atime */
874 +
875 + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
876 ++retry:
877 + ret = gfs2_glock_nq(gh);
878 + if (ret)
879 + goto out_uninit;
880 ++retry_under_glock:
881 ++ pagefault_disable();
882 ++ to->nofault = true;
883 ++ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
884 ++ IOMAP_DIO_PARTIAL, written);
885 ++ to->nofault = false;
886 ++ pagefault_enable();
887 ++ if (ret > 0)
888 ++ written = ret;
889 ++
890 ++ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
891 ++ size_t leftover;
892 +
893 +- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
894 +- gfs2_glock_dq(gh);
895 ++ gfs2_holder_allow_demote(gh);
896 ++ leftover = fault_in_iov_iter_writeable(to, window_size);
897 ++ gfs2_holder_disallow_demote(gh);
898 ++ if (leftover != window_size) {
899 ++ if (!gfs2_holder_queued(gh))
900 ++ goto retry;
901 ++ goto retry_under_glock;
902 ++ }
903 ++ }
904 ++ if (gfs2_holder_queued(gh))
905 ++ gfs2_glock_dq(gh);
906 + out_uninit:
907 + gfs2_holder_uninit(gh);
908 +- return ret;
909 ++ if (ret < 0)
910 ++ return ret;
911 ++ return written;
912 + }
913 +
914 + static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
915 +@@ -806,10 +878,20 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
916 + struct file *file = iocb->ki_filp;
917 + struct inode *inode = file->f_mapping->host;
918 + struct gfs2_inode *ip = GFS2_I(inode);
919 +- size_t len = iov_iter_count(from);
920 +- loff_t offset = iocb->ki_pos;
921 ++ size_t prev_count = 0, window_size = 0;
922 ++ size_t read = 0;
923 + ssize_t ret;
924 +
925 ++ /*
926 ++ * In this function, we disable page faults when we're holding the
927 ++ * inode glock while doing I/O. If a page fault occurs, we indicate
928 ++ * that the inode glock may be dropped, fault in the pages manually,
929 ++ * and retry.
930 ++ *
931 ++ * For writes, iomap_dio_rw only triggers manual page faults, so we
932 ++ * don't need to disable physical ones.
933 ++ */
934 ++
935 + /*
936 + * Deferred lock, even if its a write, since we do no allocation on
937 + * this path. All we need to change is the atime, and this lock mode
938 +@@ -819,31 +901,62 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
939 + * VFS does.
940 + */
941 + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
942 ++retry:
943 + ret = gfs2_glock_nq(gh);
944 + if (ret)
945 + goto out_uninit;
946 +-
947 ++retry_under_glock:
948 + /* Silently fall back to buffered I/O when writing beyond EOF */
949 +- if (offset + len > i_size_read(&ip->i_inode))
950 ++ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
951 + goto out;
952 +
953 +- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
954 ++ from->nofault = true;
955 ++ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
956 ++ IOMAP_DIO_PARTIAL, read);
957 ++ from->nofault = false;
958 ++
959 + if (ret == -ENOTBLK)
960 + ret = 0;
961 ++ if (ret > 0)
962 ++ read = ret;
963 ++
964 ++ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
965 ++ size_t leftover;
966 ++
967 ++ gfs2_holder_allow_demote(gh);
968 ++ leftover = fault_in_iov_iter_readable(from, window_size);
969 ++ gfs2_holder_disallow_demote(gh);
970 ++ if (leftover != window_size) {
971 ++ if (!gfs2_holder_queued(gh))
972 ++ goto retry;
973 ++ goto retry_under_glock;
974 ++ }
975 ++ }
976 + out:
977 +- gfs2_glock_dq(gh);
978 ++ if (gfs2_holder_queued(gh))
979 ++ gfs2_glock_dq(gh);
980 + out_uninit:
981 + gfs2_holder_uninit(gh);
982 +- return ret;
983 ++ if (ret < 0)
984 ++ return ret;
985 ++ return read;
986 + }
987 +
988 + static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
989 + {
990 + struct gfs2_inode *ip;
991 + struct gfs2_holder gh;
992 ++ size_t prev_count = 0, window_size = 0;
993 + size_t written = 0;
994 + ssize_t ret;
995 +
996 ++ /*
997 ++ * In this function, we disable page faults when we're holding the
998 ++ * inode glock while doing I/O. If a page fault occurs, we indicate
999 ++ * that the inode glock may be dropped, fault in the pages manually,
1000 ++ * and retry.
1001 ++ */
1002 ++
1003 + if (iocb->ki_flags & IOCB_DIRECT) {
1004 + ret = gfs2_file_direct_read(iocb, to, &gh);
1005 + if (likely(ret != -ENOTBLK))
1006 +@@ -865,18 +978,118 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1007 + }
1008 + ip = GFS2_I(iocb->ki_filp->f_mapping->host);
1009 + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
1010 ++retry:
1011 + ret = gfs2_glock_nq(&gh);
1012 + if (ret)
1013 + goto out_uninit;
1014 ++retry_under_glock:
1015 ++ pagefault_disable();
1016 + ret = generic_file_read_iter(iocb, to);
1017 ++ pagefault_enable();
1018 + if (ret > 0)
1019 + written += ret;
1020 +- gfs2_glock_dq(&gh);
1021 ++
1022 ++ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
1023 ++ size_t leftover;
1024 ++
1025 ++ gfs2_holder_allow_demote(&gh);
1026 ++ leftover = fault_in_iov_iter_writeable(to, window_size);
1027 ++ gfs2_holder_disallow_demote(&gh);
1028 ++ if (leftover != window_size) {
1029 ++ if (!gfs2_holder_queued(&gh)) {
1030 ++ if (written)
1031 ++ goto out_uninit;
1032 ++ goto retry;
1033 ++ }
1034 ++ goto retry_under_glock;
1035 ++ }
1036 ++ }
1037 ++ if (gfs2_holder_queued(&gh))
1038 ++ gfs2_glock_dq(&gh);
1039 + out_uninit:
1040 + gfs2_holder_uninit(&gh);
1041 + return written ? written : ret;
1042 + }
1043 +
1044 ++static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
1045 ++ struct iov_iter *from,
1046 ++ struct gfs2_holder *gh)
1047 ++{
1048 ++ struct file *file = iocb->ki_filp;
1049 ++ struct inode *inode = file_inode(file);
1050 ++ struct gfs2_inode *ip = GFS2_I(inode);
1051 ++ struct gfs2_sbd *sdp = GFS2_SB(inode);
1052 ++ struct gfs2_holder *statfs_gh = NULL;
1053 ++ size_t prev_count = 0, window_size = 0;
1054 ++ size_t read = 0;
1055 ++ ssize_t ret;
1056 ++
1057 ++ /*
1058 ++ * In this function, we disable page faults when we're holding the
1059 ++ * inode glock while doing I/O. If a page fault occurs, we indicate
1060 ++ * that the inode glock may be dropped, fault in the pages manually,
1061 ++ * and retry.
1062 ++ */
1063 ++
1064 ++ if (inode == sdp->sd_rindex) {
1065 ++ statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
1066 ++ if (!statfs_gh)
1067 ++ return -ENOMEM;
1068 ++ }
1069 ++
1070 ++ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
1071 ++retry:
1072 ++ ret = gfs2_glock_nq(gh);
1073 ++ if (ret)
1074 ++ goto out_uninit;
1075 ++retry_under_glock:
1076 ++ if (inode == sdp->sd_rindex) {
1077 ++ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
1078 ++
1079 ++ ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
1080 ++ GL_NOCACHE, statfs_gh);
1081 ++ if (ret)
1082 ++ goto out_unlock;
1083 ++ }
1084 ++
1085 ++ current->backing_dev_info = inode_to_bdi(inode);
1086 ++ pagefault_disable();
1087 ++ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
1088 ++ pagefault_enable();
1089 ++ current->backing_dev_info = NULL;
1090 ++ if (ret > 0) {
1091 ++ iocb->ki_pos += ret;
1092 ++ read += ret;
1093 ++ }
1094 ++
1095 ++ if (inode == sdp->sd_rindex)
1096 ++ gfs2_glock_dq_uninit(statfs_gh);
1097 ++
1098 ++ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
1099 ++ size_t leftover;
1100 ++
1101 ++ gfs2_holder_allow_demote(gh);
1102 ++ leftover = fault_in_iov_iter_readable(from, window_size);
1103 ++ gfs2_holder_disallow_demote(gh);
1104 ++ if (leftover != window_size) {
1105 ++ if (!gfs2_holder_queued(gh)) {
1106 ++ if (read)
1107 ++ goto out_uninit;
1108 ++ goto retry;
1109 ++ }
1110 ++ goto retry_under_glock;
1111 ++ }
1112 ++ }
1113 ++out_unlock:
1114 ++ if (gfs2_holder_queued(gh))
1115 ++ gfs2_glock_dq(gh);
1116 ++out_uninit:
1117 ++ gfs2_holder_uninit(gh);
1118 ++ if (statfs_gh)
1119 ++ kfree(statfs_gh);
1120 ++ return read ? read : ret;
1121 ++}
1122 ++
1123 + /**
1124 + * gfs2_file_write_iter - Perform a write to a file
1125 + * @iocb: The io context
1126 +@@ -928,9 +1141,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1127 + goto out_unlock;
1128 +
1129 + iocb->ki_flags |= IOCB_DSYNC;
1130 +- current->backing_dev_info = inode_to_bdi(inode);
1131 +- buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
1132 +- current->backing_dev_info = NULL;
1133 ++ buffered = gfs2_file_buffered_write(iocb, from, &gh);
1134 + if (unlikely(buffered <= 0)) {
1135 + if (!ret)
1136 + ret = buffered;
1137 +@@ -944,7 +1155,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1138 + * the direct I/O range as we don't know if the buffered pages
1139 + * made it to disk.
1140 + */
1141 +- iocb->ki_pos += buffered;
1142 + ret2 = generic_write_sync(iocb, buffered);
1143 + invalidate_mapping_pages(mapping,
1144 + (iocb->ki_pos - buffered) >> PAGE_SHIFT,
1145 +@@ -952,13 +1162,9 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1146 + if (!ret || ret2 > 0)
1147 + ret += ret2;
1148 + } else {
1149 +- current->backing_dev_info = inode_to_bdi(inode);
1150 +- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
1151 +- current->backing_dev_info = NULL;
1152 +- if (likely(ret > 0)) {
1153 +- iocb->ki_pos += ret;
1154 ++ ret = gfs2_file_buffered_write(iocb, from, &gh);
1155 ++ if (likely(ret > 0))
1156 + ret = generic_write_sync(iocb, ret);
1157 +- }
1158 + }
1159 +
1160 + out_unlock:
1161 +diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
1162 +index 02cd0ae98208d..e85ef6b14777d 100644
1163 +--- a/fs/gfs2/glock.c
1164 ++++ b/fs/gfs2/glock.c
1165 +@@ -58,6 +58,7 @@ struct gfs2_glock_iter {
1166 + typedef void (*glock_examiner) (struct gfs2_glock * gl);
1167 +
1168 + static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
1169 ++static void __gfs2_glock_dq(struct gfs2_holder *gh);
1170 +
1171 + static struct dentry *gfs2_root;
1172 + static struct workqueue_struct *glock_workqueue;
1173 +@@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_glock *gl)
1174 +
1175 + if (gl->gl_state == LM_ST_UNLOCKED)
1176 + return 0;
1177 ++ /*
1178 ++ * Note that demote_ok is used for the lru process of disposing of
1179 ++ * glocks. For this purpose, we don't care if the glock's holders
1180 ++ * have the HIF_MAY_DEMOTE flag set or not. If someone is using
1181 ++ * them, don't demote.
1182 ++ */
1183 + if (!list_empty(&gl->gl_holders))
1184 + return 0;
1185 + if (glops->go_demote_ok)
1186 +@@ -301,46 +308,59 @@ void gfs2_glock_put(struct gfs2_glock *gl)
1187 + }
1188 +
1189 + /**
1190 +- * may_grant - check if its ok to grant a new lock
1191 ++ * may_grant - check if it's ok to grant a new lock
1192 + * @gl: The glock
1193 ++ * @current_gh: One of the current holders of @gl
1194 + * @gh: The lock request which we wish to grant
1195 + *
1196 +- * Returns: true if its ok to grant the lock
1197 ++ * With our current compatibility rules, if a glock has one or more active
1198 ++ * holders (HIF_HOLDER flag set), any of those holders can be passed in as
1199 ++ * @current_gh; they are all the same as far as compatibility with the new @gh
1200 ++ * goes.
1201 ++ *
1202 ++ * Returns true if it's ok to grant the lock.
1203 + */
1204 +
1205 +-static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
1206 +-{
1207 +- const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list);
1208 ++static inline bool may_grant(struct gfs2_glock *gl,
1209 ++ struct gfs2_holder *current_gh,
1210 ++ struct gfs2_holder *gh)
1211 ++{
1212 ++ if (current_gh) {
1213 ++ GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, &current_gh->gh_iflags));
1214 ++
1215 ++ switch(current_gh->gh_state) {
1216 ++ case LM_ST_EXCLUSIVE:
1217 ++ /*
1218 ++ * Here we make a special exception to grant holders
1219 ++ * who agree to share the EX lock with other holders
1220 ++ * who also have the bit set. If the original holder
1221 ++ * has the LM_FLAG_NODE_SCOPE bit set, we grant more
1222 ++ * holders with the bit set.
1223 ++ */
1224 ++ return gh->gh_state == LM_ST_EXCLUSIVE &&
1225 ++ (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) &&
1226 ++ (gh->gh_flags & LM_FLAG_NODE_SCOPE);
1227 +
1228 +- if (gh != gh_head) {
1229 +- /**
1230 +- * Here we make a special exception to grant holders who agree
1231 +- * to share the EX lock with other holders who also have the
1232 +- * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit
1233 +- * is set, we grant more holders with the bit set.
1234 +- */
1235 +- if (gh_head->gh_state == LM_ST_EXCLUSIVE &&
1236 +- (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) &&
1237 +- gh->gh_state == LM_ST_EXCLUSIVE &&
1238 +- (gh->gh_flags & LM_FLAG_NODE_SCOPE))
1239 +- return 1;
1240 +- if ((gh->gh_state == LM_ST_EXCLUSIVE ||
1241 +- gh_head->gh_state == LM_ST_EXCLUSIVE))
1242 +- return 0;
1243 ++ case LM_ST_SHARED:
1244 ++ case LM_ST_DEFERRED:
1245 ++ return gh->gh_state == current_gh->gh_state;
1246 ++
1247 ++ default:
1248 ++ return false;
1249 ++ }
1250 + }
1251 ++
1252 + if (gl->gl_state == gh->gh_state)
1253 +- return 1;
1254 ++ return true;
1255 + if (gh->gh_flags & GL_EXACT)
1256 +- return 0;
1257 ++ return false;
1258 + if (gl->gl_state == LM_ST_EXCLUSIVE) {
1259 +- if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
1260 +- return 1;
1261 +- if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
1262 +- return 1;
1263 ++ return gh->gh_state == LM_ST_SHARED ||
1264 ++ gh->gh_state == LM_ST_DEFERRED;
1265 + }
1266 +- if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
1267 +- return 1;
1268 +- return 0;
1269 ++ if (gh->gh_flags & LM_FLAG_ANY)
1270 ++ return gl->gl_state != LM_ST_UNLOCKED;
1271 ++ return false;
1272 + }
1273 +
1274 + static void gfs2_holder_wake(struct gfs2_holder *gh)
1275 +@@ -366,7 +386,7 @@ static void do_error(struct gfs2_glock *gl, const int ret)
1276 + struct gfs2_holder *gh, *tmp;
1277 +
1278 + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
1279 +- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1280 ++ if (!test_bit(HIF_WAIT, &gh->gh_iflags))
1281 + continue;
1282 + if (ret & LM_OUT_ERROR)
1283 + gh->gh_error = -EIO;
1284 +@@ -380,6 +400,78 @@ static void do_error(struct gfs2_glock *gl, const int ret)
1285 + }
1286 + }
1287 +
1288 ++/**
1289 ++ * demote_incompat_holders - demote incompatible demoteable holders
1290 ++ * @gl: the glock we want to promote
1291 ++ * @new_gh: the new holder to be promoted
1292 ++ */
1293 ++static void demote_incompat_holders(struct gfs2_glock *gl,
1294 ++ struct gfs2_holder *new_gh)
1295 ++{
1296 ++ struct gfs2_holder *gh;
1297 ++
1298 ++ /*
1299 ++ * Demote incompatible holders before we make ourselves eligible.
1300 ++ * (This holder may or may not allow auto-demoting, but we don't want
1301 ++ * to demote the new holder before it's even granted.)
1302 ++ */
1303 ++ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1304 ++ /*
1305 ++ * Since holders are at the front of the list, we stop when we
1306 ++ * find the first non-holder.
1307 ++ */
1308 ++ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
1309 ++ return;
1310 ++ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
1311 ++ !may_grant(gl, new_gh, gh)) {
1312 ++ /*
1313 ++ * We should not recurse into do_promote because
1314 ++ * __gfs2_glock_dq only calls handle_callback,
1315 ++ * gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
1316 ++ */
1317 ++ __gfs2_glock_dq(gh);
1318 ++ }
1319 ++ }
1320 ++}
1321 ++
1322 ++/**
1323 ++ * find_first_holder - find the first "holder" gh
1324 ++ * @gl: the glock
1325 ++ */
1326 ++
1327 ++static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
1328 ++{
1329 ++ struct gfs2_holder *gh;
1330 ++
1331 ++ if (!list_empty(&gl->gl_holders)) {
1332 ++ gh = list_first_entry(&gl->gl_holders, struct gfs2_holder,
1333 ++ gh_list);
1334 ++ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1335 ++ return gh;
1336 ++ }
1337 ++ return NULL;
1338 ++}
1339 ++
1340 ++/**
1341 ++ * find_first_strong_holder - find the first non-demoteable holder
1342 ++ * @gl: the glock
1343 ++ *
1344 ++ * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
1345 ++ */
1346 ++static inline struct gfs2_holder *
1347 ++find_first_strong_holder(struct gfs2_glock *gl)
1348 ++{
1349 ++ struct gfs2_holder *gh;
1350 ++
1351 ++ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1352 ++ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
1353 ++ return NULL;
1354 ++ if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
1355 ++ return gh;
1356 ++ }
1357 ++ return NULL;
1358 ++}
1359 ++
1360 + /**
1361 + * do_promote - promote as many requests as possible on the current queue
1362 + * @gl: The glock
1363 +@@ -393,14 +485,21 @@ __releases(&gl->gl_lockref.lock)
1364 + __acquires(&gl->gl_lockref.lock)
1365 + {
1366 + const struct gfs2_glock_operations *glops = gl->gl_ops;
1367 +- struct gfs2_holder *gh, *tmp;
1368 ++ struct gfs2_holder *gh, *tmp, *first_gh;
1369 ++ bool incompat_holders_demoted = false;
1370 + int ret;
1371 +
1372 + restart:
1373 ++ first_gh = find_first_strong_holder(gl);
1374 + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
1375 +- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1376 ++ if (!test_bit(HIF_WAIT, &gh->gh_iflags))
1377 + continue;
1378 +- if (may_grant(gl, gh)) {
1379 ++ if (may_grant(gl, first_gh, gh)) {
1380 ++ if (!incompat_holders_demoted) {
1381 ++ demote_incompat_holders(gl, first_gh);
1382 ++ incompat_holders_demoted = true;
1383 ++ first_gh = gh;
1384 ++ }
1385 + if (gh->gh_list.prev == &gl->gl_holders &&
1386 + glops->go_lock) {
1387 + spin_unlock(&gl->gl_lockref.lock);
1388 +@@ -426,6 +525,11 @@ restart:
1389 + gfs2_holder_wake(gh);
1390 + continue;
1391 + }
1392 ++ /*
1393 ++ * If we get here, it means we may not grant this holder for
1394 ++ * some reason. If this holder is the head of the list, it
1395 ++ * means we have a blocked holder at the head, so return 1.
1396 ++ */
1397 + if (gh->gh_list.prev == &gl->gl_holders)
1398 + return 1;
1399 + do_error(gl, 0);
1400 +@@ -722,23 +826,6 @@ out:
1401 + spin_lock(&gl->gl_lockref.lock);
1402 + }
1403 +
1404 +-/**
1405 +- * find_first_holder - find the first "holder" gh
1406 +- * @gl: the glock
1407 +- */
1408 +-
1409 +-static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
1410 +-{
1411 +- struct gfs2_holder *gh;
1412 +-
1413 +- if (!list_empty(&gl->gl_holders)) {
1414 +- gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
1415 +- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1416 +- return gh;
1417 +- }
1418 +- return NULL;
1419 +-}
1420 +-
1421 + /**
1422 + * run_queue - do all outstanding tasks related to a glock
1423 + * @gl: The glock in question
1424 +@@ -1354,15 +1441,20 @@ __acquires(&gl->gl_lockref.lock)
1425 + GLOCK_BUG_ON(gl, true);
1426 +
1427 + if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1428 +- if (test_bit(GLF_LOCK, &gl->gl_flags))
1429 +- try_futile = !may_grant(gl, gh);
1430 ++ if (test_bit(GLF_LOCK, &gl->gl_flags)) {
1431 ++ struct gfs2_holder *first_gh;
1432 ++
1433 ++ first_gh = find_first_strong_holder(gl);
1434 ++ try_futile = !may_grant(gl, first_gh, gh);
1435 ++ }
1436 + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
1437 + goto fail;
1438 + }
1439 +
1440 + list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
1441 + if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
1442 +- (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
1443 ++ (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
1444 ++ !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
1445 + goto trap_recursive;
1446 + if (try_futile &&
1447 + !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
1448 +@@ -1458,51 +1550,83 @@ int gfs2_glock_poll(struct gfs2_holder *gh)
1449 + return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
1450 + }
1451 +
1452 +-/**
1453 +- * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
1454 +- * @gh: the glock holder
1455 +- *
1456 +- */
1457 ++static inline bool needs_demote(struct gfs2_glock *gl)
1458 ++{
1459 ++ return (test_bit(GLF_DEMOTE, &gl->gl_flags) ||
1460 ++ test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags));
1461 ++}
1462 +
1463 +-void gfs2_glock_dq(struct gfs2_holder *gh)
1464 ++static void __gfs2_glock_dq(struct gfs2_holder *gh)
1465 + {
1466 + struct gfs2_glock *gl = gh->gh_gl;
1467 + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1468 + unsigned delay = 0;
1469 + int fast_path = 0;
1470 +
1471 +- spin_lock(&gl->gl_lockref.lock);
1472 + /*
1473 +- * If we're in the process of file system withdraw, we cannot just
1474 +- * dequeue any glocks until our journal is recovered, lest we
1475 +- * introduce file system corruption. We need two exceptions to this
1476 +- * rule: We need to allow unlocking of nondisk glocks and the glock
1477 +- * for our own journal that needs recovery.
1478 ++ * This while loop is similar to function demote_incompat_holders:
1479 ++ * If the glock is due to be demoted (which may be from another node
1480 ++ * or even if this holder is GL_NOCACHE), the weak holders are
1481 ++ * demoted as well, allowing the glock to be demoted.
1482 + */
1483 +- if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
1484 +- glock_blocked_by_withdraw(gl) &&
1485 +- gh->gh_gl != sdp->sd_jinode_gl) {
1486 +- sdp->sd_glock_dqs_held++;
1487 +- spin_unlock(&gl->gl_lockref.lock);
1488 +- might_sleep();
1489 +- wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
1490 +- TASK_UNINTERRUPTIBLE);
1491 +- spin_lock(&gl->gl_lockref.lock);
1492 +- }
1493 +- if (gh->gh_flags & GL_NOCACHE)
1494 +- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
1495 ++ while (gh) {
1496 ++ /*
1497 ++ * If we're in the process of file system withdraw, we cannot
1498 ++ * just dequeue any glocks until our journal is recovered, lest
1499 ++ * we introduce file system corruption. We need two exceptions
1500 ++ * to this rule: We need to allow unlocking of nondisk glocks
1501 ++ * and the glock for our own journal that needs recovery.
1502 ++ */
1503 ++ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
1504 ++ glock_blocked_by_withdraw(gl) &&
1505 ++ gh->gh_gl != sdp->sd_jinode_gl) {
1506 ++ sdp->sd_glock_dqs_held++;
1507 ++ spin_unlock(&gl->gl_lockref.lock);
1508 ++ might_sleep();
1509 ++ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
1510 ++ TASK_UNINTERRUPTIBLE);
1511 ++ spin_lock(&gl->gl_lockref.lock);
1512 ++ }
1513 +
1514 +- list_del_init(&gh->gh_list);
1515 +- clear_bit(HIF_HOLDER, &gh->gh_iflags);
1516 +- if (list_empty(&gl->gl_holders) &&
1517 +- !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
1518 +- !test_bit(GLF_DEMOTE, &gl->gl_flags))
1519 +- fast_path = 1;
1520 ++ /*
1521 ++ * This holder should not be cached, so mark it for demote.
1522 ++ * Note: this should be done before the check for needs_demote
1523 ++ * below.
1524 ++ */
1525 ++ if (gh->gh_flags & GL_NOCACHE)
1526 ++ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
1527 ++
1528 ++ list_del_init(&gh->gh_list);
1529 ++ clear_bit(HIF_HOLDER, &gh->gh_iflags);
1530 ++ trace_gfs2_glock_queue(gh, 0);
1531 ++
1532 ++ /*
1533 ++ * If there hasn't been a demote request we are done.
1534 ++ * (Let the remaining holders, if any, keep holding it.)
1535 ++ */
1536 ++ if (!needs_demote(gl)) {
1537 ++ if (list_empty(&gl->gl_holders))
1538 ++ fast_path = 1;
1539 ++ break;
1540 ++ }
1541 ++ /*
1542 ++ * If we have another strong holder (we cannot auto-demote)
1543 ++ * we are done. It keeps holding it until it is done.
1544 ++ */
1545 ++ if (find_first_strong_holder(gl))
1546 ++ break;
1547 ++
1548 ++ /*
1549 ++ * If we have a weak holder at the head of the list, it
1550 ++ * (and all others like it) must be auto-demoted. If there
1551 ++ * are no more weak holders, we exit the while loop.
1552 ++ */
1553 ++ gh = find_first_holder(gl);
1554 ++ }
1555 +
1556 + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
1557 + gfs2_glock_add_to_lru(gl);
1558 +
1559 +- trace_gfs2_glock_queue(gh, 0);
1560 + if (unlikely(!fast_path)) {
1561 + gl->gl_lockref.count++;
1562 + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
1563 +@@ -1511,6 +1635,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1564 + delay = gl->gl_hold_time;
1565 + __gfs2_glock_queue_work(gl, delay);
1566 + }
1567 ++}
1568 ++
1569 ++/**
1570 ++ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
1571 ++ * @gh: the glock holder
1572 ++ *
1573 ++ */
1574 ++void gfs2_glock_dq(struct gfs2_holder *gh)
1575 ++{
1576 ++ struct gfs2_glock *gl = gh->gh_gl;
1577 ++
1578 ++ spin_lock(&gl->gl_lockref.lock);
1579 ++ __gfs2_glock_dq(gh);
1580 + spin_unlock(&gl->gl_lockref.lock);
1581 + }
1582 +
1583 +@@ -1673,6 +1810,7 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1584 +
1585 + void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1586 + {
1587 ++ struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, };
1588 + unsigned long delay = 0;
1589 + unsigned long holdtime;
1590 + unsigned long now = jiffies;
1591 +@@ -1687,6 +1825,28 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1592 + if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1593 + delay = gl->gl_hold_time;
1594 + }
1595 ++ /*
1596 ++ * Note 1: We cannot call demote_incompat_holders from handle_callback
1597 ++ * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq ->
1598 ++ * handle_callback -> demote_incompat_holders -> gfs2_glock_dq
1599 ++ * Plus, we only want to demote the holders if the request comes from
1600 ++ * a remote cluster node because local holder conflicts are resolved
1601 ++ * elsewhere.
1602 ++ *
1603 ++ * Note 2: if a remote node wants this glock in EX mode, lock_dlm will
1604 ++ * request that we set our state to UNLOCKED. Here we mock up a holder
1605 ++ * to make it look like someone wants the lock EX locally. Any SH
1606 ++ * and DF requests should be able to share the lock without demoting.
1607 ++ *
1608 ++ * Note 3: We only want to demote the demoteable holders when there
1609 ++ * are no more strong holders. The demoteable holders might as well
1610 ++ * keep the glock until the last strong holder is done with it.
1611 ++ */
1612 ++ if (!find_first_strong_holder(gl)) {
1613 ++ if (state == LM_ST_UNLOCKED)
1614 ++ mock_gh.gh_state = LM_ST_EXCLUSIVE;
1615 ++ demote_incompat_holders(gl, &mock_gh);
1616 ++ }
1617 + handle_callback(gl, state, delay, true);
1618 + __gfs2_glock_queue_work(gl, delay);
1619 + spin_unlock(&gl->gl_lockref.lock);
1620 +@@ -2078,6 +2238,8 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
1621 + *p++ = 'H';
1622 + if (test_bit(HIF_WAIT, &iflags))
1623 + *p++ = 'W';
1624 ++ if (test_bit(HIF_MAY_DEMOTE, &iflags))
1625 ++ *p++ = 'D';
1626 + *p = 0;
1627 + return buf;
1628 + }
1629 +diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
1630 +index 31a8f2f649b52..9012487da4c69 100644
1631 +--- a/fs/gfs2/glock.h
1632 ++++ b/fs/gfs2/glock.h
1633 +@@ -150,6 +150,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
1634 + list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1635 + if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
1636 + break;
1637 ++ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
1638 ++ continue;
1639 + if (gh->gh_owner_pid == pid)
1640 + goto out;
1641 + }
1642 +@@ -325,6 +327,24 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
1643 + spin_unlock(&gl->gl_lockref.lock);
1644 + }
1645 +
1646 ++static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
1647 ++{
1648 ++ struct gfs2_glock *gl = gh->gh_gl;
1649 ++
1650 ++ spin_lock(&gl->gl_lockref.lock);
1651 ++ set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
1652 ++ spin_unlock(&gl->gl_lockref.lock);
1653 ++}
1654 ++
1655 ++static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh)
1656 ++{
1657 ++ struct gfs2_glock *gl = gh->gh_gl;
1658 ++
1659 ++ spin_lock(&gl->gl_lockref.lock);
1660 ++ clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
1661 ++ spin_unlock(&gl->gl_lockref.lock);
1662 ++}
1663 ++
1664 + extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
1665 + extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
1666 +
1667 +diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
1668 +index 0fe49770166ea..ca42d310fd4d6 100644
1669 +--- a/fs/gfs2/incore.h
1670 ++++ b/fs/gfs2/incore.h
1671 +@@ -252,6 +252,7 @@ struct gfs2_lkstats {
1672 +
1673 + enum {
1674 + /* States */
1675 ++ HIF_MAY_DEMOTE = 1,
1676 + HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
1677 + HIF_WAIT = 10,
1678 + };
1679 +@@ -386,9 +387,8 @@ struct gfs2_inode {
1680 + u64 i_generation;
1681 + u64 i_eattr;
1682 + unsigned long i_flags; /* GIF_... */
1683 +- struct gfs2_glock *i_gl; /* Move into i_gh? */
1684 ++ struct gfs2_glock *i_gl;
1685 + struct gfs2_holder i_iopen_gh;
1686 +- struct gfs2_holder i_gh; /* for prepare/commit_write only */
1687 + struct gfs2_qadata *i_qadata; /* quota allocation data */
1688 + struct gfs2_holder i_rgd_gh;
1689 + struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
1690 +diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
1691 +index 97119ec3b8503..fe10d8a30f6bd 100644
1692 +--- a/fs/iomap/buffered-io.c
1693 ++++ b/fs/iomap/buffered-io.c
1694 +@@ -757,7 +757,7 @@ again:
1695 + * same page as we're writing to, without it being marked
1696 + * up-to-date.
1697 + */
1698 +- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
1699 ++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
1700 + status = -EFAULT;
1701 + break;
1702 + }
1703 +diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
1704 +index 4ecd255e0511c..468dcbba45bcb 100644
1705 +--- a/fs/iomap/direct-io.c
1706 ++++ b/fs/iomap/direct-io.c
1707 +@@ -31,6 +31,7 @@ struct iomap_dio {
1708 + atomic_t ref;
1709 + unsigned flags;
1710 + int error;
1711 ++ size_t done_before;
1712 + bool wait_for_completion;
1713 +
1714 + union {
1715 +@@ -124,6 +125,9 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
1716 + if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
1717 + ret = generic_write_sync(iocb, ret);
1718 +
1719 ++ if (ret > 0)
1720 ++ ret += dio->done_before;
1721 ++
1722 + kfree(dio);
1723 +
1724 + return ret;
1725 +@@ -371,6 +375,8 @@ static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
1726 + loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
1727 +
1728 + dio->size += length;
1729 ++ if (!length)
1730 ++ return -EFAULT;
1731 + return length;
1732 + }
1733 +
1734 +@@ -402,6 +408,8 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
1735 + copied = copy_to_iter(inline_data, length, iter);
1736 + }
1737 + dio->size += copied;
1738 ++ if (!copied)
1739 ++ return -EFAULT;
1740 + return copied;
1741 + }
1742 +
1743 +@@ -446,13 +454,21 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
1744 + * may be pure data writes. In that case, we still need to do a full data sync
1745 + * completion.
1746 + *
1747 ++ * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
1748 ++ * __iomap_dio_rw can return a partial result if it encounters a non-resident
1749 ++ * page in @iter after preparing a transfer. In that case, the non-resident
1750 ++ * pages can be faulted in and the request resumed with @done_before set to the
1751 ++ * number of bytes previously transferred. The request will then complete with
1752 ++ * the correct total number of bytes transferred; this is essential for
1753 ++ * completing partial requests asynchronously.
1754 ++ *
1755 + * Returns -ENOTBLK in case of a page invalidation failure for
1756 + * writes. The caller needs to fall back to buffered I/O in this case.
1757 + */
1758 + struct iomap_dio *
1759 + __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1760 + const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
1761 +- unsigned int dio_flags)
1762 ++ unsigned int dio_flags, size_t done_before)
1763 + {
1764 + struct address_space *mapping = iocb->ki_filp->f_mapping;
1765 + struct inode *inode = file_inode(iocb->ki_filp);
1766 +@@ -482,6 +498,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1767 + dio->dops = dops;
1768 + dio->error = 0;
1769 + dio->flags = 0;
1770 ++ dio->done_before = done_before;
1771 +
1772 + dio->submit.iter = iter;
1773 + dio->submit.waiter = current;
1774 +@@ -577,6 +594,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1775 + if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
1776 + iov_iter_revert(iter, iomi.pos - dio->i_size);
1777 +
1778 ++ if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
1779 ++ if (!(iocb->ki_flags & IOCB_NOWAIT))
1780 ++ wait_for_completion = true;
1781 ++ ret = 0;
1782 ++ }
1783 ++
1784 + /* magic error code to fall back to buffered I/O */
1785 + if (ret == -ENOTBLK) {
1786 + wait_for_completion = true;
1787 +@@ -642,11 +665,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
1788 + ssize_t
1789 + iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1790 + const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
1791 +- unsigned int dio_flags)
1792 ++ unsigned int dio_flags, size_t done_before)
1793 + {
1794 + struct iomap_dio *dio;
1795 +
1796 +- dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags);
1797 ++ dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
1798 + if (IS_ERR_OR_NULL(dio))
1799 + return PTR_ERR_OR_ZERO(dio);
1800 + return iomap_dio_complete(dio);
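Illustrative aside (not part of the patch): the new done_before argument, together with IOMAP_DIO_PARTIAL, lets a filesystem resume a direct I/O request after faulting in the user buffer with the reworked fault_in helpers elsewhere in this patch. A minimal sketch of the caller pattern follows, assuming a hypothetical example_dio_write() and ignoring the pagefault_disable()/locking details a real filesystem needs.

static ssize_t example_dio_write(struct kiocb *iocb, struct iov_iter *from,
				 const struct iomap_ops *ops)
{
	size_t written = 0;
	ssize_t ret;

retry:
	/* With IOMAP_DIO_PARTIAL, hitting a non-resident page ends the
	 * request early with a partial result instead of failing it. */
	ret = iomap_dio_rw(iocb, from, ops, NULL, IOMAP_DIO_PARTIAL, written);
	if (ret > 0)
		written = ret;	/* includes done_before, so this is a total */

	if (ret == -EFAULT || (ret > 0 && iov_iter_count(from))) {
		size_t len = iov_iter_count(from);

		/* fault_in_iov_iter_readable() returns the number of bytes
		 * it could not fault in; retry if it made any progress. */
		if (fault_in_iov_iter_readable(from, len) != len)
			goto retry;
	}
	return written ? written : ret;
}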
1801 +diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
1802 +index ab4f3362466d0..a43adeacd930c 100644
1803 +--- a/fs/ntfs/file.c
1804 ++++ b/fs/ntfs/file.c
1805 +@@ -1829,7 +1829,7 @@ again:
1806 + * pages being swapped out between us bringing them into memory
1807 + * and doing the actual copying.
1808 + */
1809 +- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
1810 ++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
1811 + status = -EFAULT;
1812 + break;
1813 + }
1814 +diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
1815 +index 43b1451bff539..54b9599640ef4 100644
1816 +--- a/fs/ntfs3/file.c
1817 ++++ b/fs/ntfs3/file.c
1818 +@@ -989,7 +989,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
1819 + frame_vbo = pos & ~(frame_size - 1);
1820 + index = frame_vbo >> PAGE_SHIFT;
1821 +
1822 +- if (unlikely(iov_iter_fault_in_readable(from, bytes))) {
1823 ++ if (unlikely(fault_in_iov_iter_readable(from, bytes))) {
1824 + err = -EFAULT;
1825 + goto out;
1826 + }
1827 +diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
1828 +index 7aa943edfc02f..240eb932c014b 100644
1829 +--- a/fs/xfs/xfs_file.c
1830 ++++ b/fs/xfs/xfs_file.c
1831 +@@ -259,7 +259,7 @@ xfs_file_dio_read(
1832 + ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
1833 + if (ret)
1834 + return ret;
1835 +- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
1836 ++ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0);
1837 + xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1838 +
1839 + return ret;
1840 +@@ -569,7 +569,7 @@ xfs_file_dio_write_aligned(
1841 + }
1842 + trace_xfs_file_direct_write(iocb, from);
1843 + ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
1844 +- &xfs_dio_write_ops, 0);
1845 ++ &xfs_dio_write_ops, 0, 0);
1846 + out_unlock:
1847 + if (iolock)
1848 + xfs_iunlock(ip, iolock);
1849 +@@ -647,7 +647,7 @@ retry_exclusive:
1850 +
1851 + trace_xfs_file_direct_write(iocb, from);
1852 + ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
1853 +- &xfs_dio_write_ops, flags);
1854 ++ &xfs_dio_write_ops, flags, 0);
1855 +
1856 + /*
1857 + * Retry unaligned I/O with exclusive blocking semantics if the DIO
1858 +diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
1859 +index 807f33553a8eb..bced33b76beac 100644
1860 +--- a/fs/zonefs/super.c
1861 ++++ b/fs/zonefs/super.c
1862 +@@ -852,7 +852,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
1863 + ret = zonefs_file_dio_append(iocb, from);
1864 + else
1865 + ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
1866 +- &zonefs_write_dio_ops, 0);
1867 ++ &zonefs_write_dio_ops, 0, 0);
1868 + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
1869 + (ret > 0 || ret == -EIOCBQUEUED)) {
1870 + if (ret > 0)
1871 +@@ -987,7 +987,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1872 + }
1873 + file_accessed(iocb->ki_filp);
1874 + ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
1875 +- &zonefs_read_dio_ops, 0);
1876 ++ &zonefs_read_dio_ops, 0, 0);
1877 + } else {
1878 + ret = generic_file_read_iter(iocb, to);
1879 + if (ret == -EIO)
1880 +diff --git a/include/linux/bpf.h b/include/linux/bpf.h
1881 +index 15b690a0cecb0..c5c4b6f09e230 100644
1882 +--- a/include/linux/bpf.h
1883 ++++ b/include/linux/bpf.h
1884 +@@ -293,6 +293,34 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
1885 +
1886 + extern const struct bpf_map_ops bpf_map_offload_ops;
1887 +
1888 ++/* bpf_type_flag contains a set of flags that are applicable to the values of
1889 ++ * arg_type, ret_type and reg_type. For example, a pointer value may be null,
1890 ++ * or a memory is read-only. We classify types into two categories: base types
1891 ++ * and extended types. Extended types are base types combined with a type flag.
1892 ++ *
1893 ++ * Currently there are no more than 32 base types in arg_type, ret_type and
1894 ++ * reg_types.
1895 ++ */
1896 ++#define BPF_BASE_TYPE_BITS 8
1897 ++
1898 ++enum bpf_type_flag {
1899 ++ /* PTR may be NULL. */
1900 ++ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS),
1901 ++
1902 ++ /* MEM is read-only. When applied on bpf_arg, it indicates the arg is
1903 ++ * compatible with both mutable and immutable memory.
1904 ++ */
1905 ++ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS),
1906 ++
1907 ++ __BPF_TYPE_LAST_FLAG = MEM_RDONLY,
1908 ++};
1909 ++
1910 ++/* Max number of base types. */
1911 ++#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS)
1912 ++
1913 ++/* Max number of all types. */
1914 ++#define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1))
1915 ++
1916 + /* function argument constraints */
1917 + enum bpf_arg_type {
1918 + ARG_DONTCARE = 0, /* unused argument in helper function */
1919 +@@ -304,13 +332,11 @@ enum bpf_arg_type {
1920 + ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */
1921 + ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */
1922 + ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */
1923 +- ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */
1924 +
1925 + /* the following constraints used to prototype bpf_memcmp() and other
1926 + * functions that access data on eBPF program stack
1927 + */
1928 + ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */
1929 +- ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */
1930 + ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized,
1931 + * helper function must fill all bytes or clear
1932 + * them in error case.
1933 +@@ -320,42 +346,65 @@ enum bpf_arg_type {
1934 + ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */
1935 +
1936 + ARG_PTR_TO_CTX, /* pointer to context */
1937 +- ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */
1938 + ARG_ANYTHING, /* any (initialized) argument is ok */
1939 + ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */
1940 + ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */
1941 + ARG_PTR_TO_INT, /* pointer to int */
1942 + ARG_PTR_TO_LONG, /* pointer to long */
1943 + ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */
1944 +- ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */
1945 + ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */
1946 + ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */
1947 +- ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */
1948 + ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */
1949 + ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
1950 + ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */
1951 + ARG_PTR_TO_FUNC, /* pointer to a bpf program function */
1952 +- ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */
1953 ++ ARG_PTR_TO_STACK, /* pointer to stack */
1954 + ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */
1955 + ARG_PTR_TO_TIMER, /* pointer to bpf_timer */
1956 + __BPF_ARG_TYPE_MAX,
1957 ++
1958 ++ /* Extended arg_types. */
1959 ++ ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE,
1960 ++ ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM,
1961 ++ ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX,
1962 ++ ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET,
1963 ++ ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM,
1964 ++ ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK,
1965 ++
1966 ++ /* This must be the last entry. Its purpose is to ensure the enum is
1967 ++ * wide enough to hold the higher bits reserved for bpf_type_flag.
1968 ++ */
1969 ++ __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT,
1970 + };
1971 ++static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
1972 +
1973 + /* type of values returned from helper functions */
1974 + enum bpf_return_type {
1975 + RET_INTEGER, /* function returns integer */
1976 + RET_VOID, /* function doesn't return anything */
1977 + RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */
1978 +- RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */
1979 +- RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */
1980 +- RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */
1981 +- RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */
1982 +- RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */
1983 +- RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */
1984 +- RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */
1985 ++ RET_PTR_TO_SOCKET, /* returns a pointer to a socket */
1986 ++ RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */
1987 ++ RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */
1988 ++ RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */
1989 + RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */
1990 + RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */
1991 ++ __BPF_RET_TYPE_MAX,
1992 ++
1993 ++ /* Extended ret_types. */
1994 ++ RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE,
1995 ++ RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET,
1996 ++ RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK,
1997 ++ RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON,
1998 ++ RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM,
1999 ++ RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID,
2000 ++
2001 ++ /* This must be the last entry. Its purpose is to ensure the enum is
2002 ++ * wide enough to hold the higher bits reserved for bpf_type_flag.
2003 ++ */
2004 ++ __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT,
2005 + };
2006 ++static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
2007 +
2008 + /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
2009 + * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
2010 +@@ -417,18 +466,15 @@ enum bpf_reg_type {
2011 + PTR_TO_CTX, /* reg points to bpf_context */
2012 + CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
2013 + PTR_TO_MAP_VALUE, /* reg points to map element value */
2014 +- PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
2015 ++ PTR_TO_MAP_KEY, /* reg points to a map element key */
2016 + PTR_TO_STACK, /* reg == frame_pointer + offset */
2017 + PTR_TO_PACKET_META, /* skb->data - meta_len */
2018 + PTR_TO_PACKET, /* reg points to skb->data */
2019 + PTR_TO_PACKET_END, /* skb->data + headlen */
2020 + PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */
2021 + PTR_TO_SOCKET, /* reg points to struct bpf_sock */
2022 +- PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */
2023 + PTR_TO_SOCK_COMMON, /* reg points to sock_common */
2024 +- PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
2025 + PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */
2026 +- PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
2027 + PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */
2028 + PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */
2029 + /* PTR_TO_BTF_ID points to a kernel struct that does not need
2030 +@@ -446,18 +492,25 @@ enum bpf_reg_type {
2031 + * been checked for null. Used primarily to inform the verifier
2032 + * an explicit null check is required for this struct.
2033 + */
2034 +- PTR_TO_BTF_ID_OR_NULL,
2035 + PTR_TO_MEM, /* reg points to valid memory region */
2036 +- PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */
2037 +- PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */
2038 +- PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */
2039 +- PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */
2040 +- PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
2041 ++ PTR_TO_BUF, /* reg points to a read/write buffer */
2042 + PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */
2043 + PTR_TO_FUNC, /* reg points to a bpf program function */
2044 +- PTR_TO_MAP_KEY, /* reg points to a map element key */
2045 + __BPF_REG_TYPE_MAX,
2046 ++
2047 ++ /* Extended reg_types. */
2048 ++ PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE,
2049 ++ PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET,
2050 ++ PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON,
2051 ++ PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK,
2052 ++ PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID,
2053 ++
2054 ++ /* This must be the last entry. Its purpose is to ensure the enum is
2055 ++ * wide enough to hold the higher bits reserved for bpf_type_flag.
2056 ++ */
2057 ++ __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT,
2058 + };
2059 ++static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
2060 +
2061 + /* The information passed from prog-specific *_is_valid_access
2062 + * back to the verifier.
2063 +diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
2064 +index 364550dd19c4a..bb1cc3fbc4bab 100644
2065 +--- a/include/linux/bpf_verifier.h
2066 ++++ b/include/linux/bpf_verifier.h
2067 +@@ -18,6 +18,8 @@
2068 + * that converting umax_value to int cannot overflow.
2069 + */
2070 + #define BPF_MAX_VAR_SIZ (1 << 29)
2071 ++/* size of type_str_buf in bpf_verifier. */
2072 ++#define TYPE_STR_BUF_LEN 64
2073 +
2074 + /* Liveness marks, used for registers and spilled-regs (in stack slots).
2075 + * Read marks propagate upwards until they find a write mark; they record that
2076 +@@ -474,6 +476,8 @@ struct bpf_verifier_env {
2077 + /* longest register parentage chain walked for liveness marking */
2078 + u32 longest_mark_read_walk;
2079 + bpfptr_t fd_array;
2080 ++ /* buffer used in reg_type_str() to generate reg_type string */
2081 ++ char type_str_buf[TYPE_STR_BUF_LEN];
2082 + };
2083 +
2084 + __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
2085 +@@ -535,4 +539,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
2086 + u32 btf_id,
2087 + struct bpf_attach_target_info *tgt_info);
2088 +
2089 ++#define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0)
2090 ++
2091 ++/* extract base type from bpf_{arg, return, reg}_type. */
2092 ++static inline u32 base_type(u32 type)
2093 ++{
2094 ++ return type & BPF_BASE_TYPE_MASK;
2095 ++}
2096 ++
2097 ++/* extract flags from an extended type. See bpf_type_flag in bpf.h. */
2098 ++static inline u32 type_flag(u32 type)
2099 ++{
2100 ++ return type & ~BPF_BASE_TYPE_MASK;
2101 ++}
2102 ++
2103 + #endif /* _LINUX_BPF_VERIFIER_H */
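Illustrative aside (not part of the patch): with BPF_BASE_TYPE_BITS == 8, the low eight bits of an arg/ret/reg type hold the base type and the flags live above them, which is all base_type() and type_flag() extract. A small stand-alone sketch of the decomposition, using a made-up numeric value in place of the real enum constant:

#include <stdio.h>
#include <stdint.h>

#define BPF_BASE_TYPE_BITS	8
#define BPF_BASE_TYPE_MASK	((1u << BPF_BASE_TYPE_BITS) - 1)
#define PTR_MAYBE_NULL		(1u << (0 + BPF_BASE_TYPE_BITS))
#define MEM_RDONLY		(1u << (1 + BPF_BASE_TYPE_BITS))

/* Mirrors base_type()/type_flag() from the bpf_verifier.h hunk above. */
static uint32_t base_type(uint32_t t) { return t & BPF_BASE_TYPE_MASK; }
static uint32_t type_flag(uint32_t t) { return t & ~BPF_BASE_TYPE_MASK; }

int main(void)
{
	uint32_t ptr_to_mem = 20;	/* stand-in for the real enum value */
	uint32_t t = ptr_to_mem | PTR_MAYBE_NULL | MEM_RDONLY;

	printf("base=%u flags=0x%x\n", base_type(t), type_flag(t));

	t &= ~PTR_MAYBE_NULL;		/* what mark_ptr_not_null_reg() now does */
	printf("may_be_null=%d rdonly=%d\n",
	       !!(t & PTR_MAYBE_NULL), !!(t & MEM_RDONLY));
	return 0;
}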
2104 +diff --git a/include/linux/iomap.h b/include/linux/iomap.h
2105 +index 24f8489583ca7..829f2325ecbab 100644
2106 +--- a/include/linux/iomap.h
2107 ++++ b/include/linux/iomap.h
2108 +@@ -330,12 +330,19 @@ struct iomap_dio_ops {
2109 + */
2110 + #define IOMAP_DIO_OVERWRITE_ONLY (1 << 1)
2111 +
2112 ++/*
2113 ++ * When a page fault occurs, return a partial synchronous result and allow
2114 ++ * the caller to retry the rest of the operation after dealing with the page
2115 ++ * fault.
2116 ++ */
2117 ++#define IOMAP_DIO_PARTIAL (1 << 2)
2118 ++
2119 + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
2120 + const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
2121 +- unsigned int dio_flags);
2122 ++ unsigned int dio_flags, size_t done_before);
2123 + struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
2124 + const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
2125 +- unsigned int dio_flags);
2126 ++ unsigned int dio_flags, size_t done_before);
2127 + ssize_t iomap_dio_complete(struct iomap_dio *dio);
2128 + int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
2129 +
2130 +diff --git a/include/linux/mm.h b/include/linux/mm.h
2131 +index 90c2d7f3c7a88..04345ff97f8ca 100644
2132 +--- a/include/linux/mm.h
2133 ++++ b/include/linux/mm.h
2134 +@@ -2858,7 +2858,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
2135 + #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
2136 + #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
2137 + * and return without waiting upon it */
2138 +-#define FOLL_POPULATE 0x40 /* fault in page */
2139 ++#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */
2140 ++#define FOLL_NOFAULT 0x80 /* do not fault in pages */
2141 + #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
2142 + #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
2143 + #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
2144 +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
2145 +index 62db6b0176b95..2f7dd14083d94 100644
2146 +--- a/include/linux/pagemap.h
2147 ++++ b/include/linux/pagemap.h
2148 +@@ -733,61 +733,11 @@ int wait_on_page_private_2_killable(struct page *page);
2149 + extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
2150 +
2151 + /*
2152 +- * Fault everything in given userspace address range in.
2153 ++ * Fault in userspace address range.
2154 + */
2155 +-static inline int fault_in_pages_writeable(char __user *uaddr, size_t size)
2156 +-{
2157 +- char __user *end = uaddr + size - 1;
2158 +-
2159 +- if (unlikely(size == 0))
2160 +- return 0;
2161 +-
2162 +- if (unlikely(uaddr > end))
2163 +- return -EFAULT;
2164 +- /*
2165 +- * Writing zeroes into userspace here is OK, because we know that if
2166 +- * the zero gets there, we'll be overwriting it.
2167 +- */
2168 +- do {
2169 +- if (unlikely(__put_user(0, uaddr) != 0))
2170 +- return -EFAULT;
2171 +- uaddr += PAGE_SIZE;
2172 +- } while (uaddr <= end);
2173 +-
2174 +- /* Check whether the range spilled into the next page. */
2175 +- if (((unsigned long)uaddr & PAGE_MASK) ==
2176 +- ((unsigned long)end & PAGE_MASK))
2177 +- return __put_user(0, end);
2178 +-
2179 +- return 0;
2180 +-}
2181 +-
2182 +-static inline int fault_in_pages_readable(const char __user *uaddr, size_t size)
2183 +-{
2184 +- volatile char c;
2185 +- const char __user *end = uaddr + size - 1;
2186 +-
2187 +- if (unlikely(size == 0))
2188 +- return 0;
2189 +-
2190 +- if (unlikely(uaddr > end))
2191 +- return -EFAULT;
2192 +-
2193 +- do {
2194 +- if (unlikely(__get_user(c, uaddr) != 0))
2195 +- return -EFAULT;
2196 +- uaddr += PAGE_SIZE;
2197 +- } while (uaddr <= end);
2198 +-
2199 +- /* Check whether the range spilled into the next page. */
2200 +- if (((unsigned long)uaddr & PAGE_MASK) ==
2201 +- ((unsigned long)end & PAGE_MASK)) {
2202 +- return __get_user(c, end);
2203 +- }
2204 +-
2205 +- (void)c;
2206 +- return 0;
2207 +-}
2208 ++size_t fault_in_writeable(char __user *uaddr, size_t size);
2209 ++size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
2210 ++size_t fault_in_readable(const char __user *uaddr, size_t size);
2211 +
2212 + int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
2213 + pgoff_t index, gfp_t gfp_mask);
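For orientation (not part of the patch): the replacement helpers invert the old calling convention. Instead of returning 0 or -EFAULT, they return the number of bytes that could not be faulted in, with 0 meaning complete success. A small illustrative fragment, with ubuf and len as placeholder variables:

	size_t left;

	left = fault_in_readable(ubuf, len);
	if (left == len)
		return -EFAULT;		/* nothing could be faulted in */
	/* At least (len - left) leading bytes are now accessible; callers
	 * that need the whole range, like the iomap and ntfs hunks above,
	 * simply treat any non-zero result as -EFAULT. */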
2214 +diff --git a/include/linux/uio.h b/include/linux/uio.h
2215 +index 207101a9c5c32..6350354f97e90 100644
2216 +--- a/include/linux/uio.h
2217 ++++ b/include/linux/uio.h
2218 +@@ -35,6 +35,7 @@ struct iov_iter_state {
2219 +
2220 + struct iov_iter {
2221 + u8 iter_type;
2222 ++ bool nofault;
2223 + bool data_source;
2224 + size_t iov_offset;
2225 + size_t count;
2226 +@@ -133,7 +134,8 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
2227 + size_t bytes, struct iov_iter *i);
2228 + void iov_iter_advance(struct iov_iter *i, size_t bytes);
2229 + void iov_iter_revert(struct iov_iter *i, size_t bytes);
2230 +-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes);
2231 ++size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
2232 ++size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
2233 + size_t iov_iter_single_seg_count(const struct iov_iter *i);
2234 + size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
2235 + struct iov_iter *i);
2236 +diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
2237 +index 09406b0e215e1..40df35088cdbd 100644
2238 +--- a/kernel/bpf/btf.c
2239 ++++ b/kernel/bpf/btf.c
2240 +@@ -4800,10 +4800,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
2241 + /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
2242 + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
2243 + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
2244 ++ u32 type, flag;
2245 +
2246 +- if (ctx_arg_info->offset == off &&
2247 +- (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL ||
2248 +- ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) {
2249 ++ type = base_type(ctx_arg_info->reg_type);
2250 ++ flag = type_flag(ctx_arg_info->reg_type);
2251 ++ if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
2252 ++ (flag & PTR_MAYBE_NULL)) {
2253 + info->reg_type = ctx_arg_info->reg_type;
2254 + return true;
2255 + }
2256 +@@ -5508,9 +5510,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
2257 + if (reg->type == PTR_TO_BTF_ID) {
2258 + reg_btf = reg->btf;
2259 + reg_ref_id = reg->btf_id;
2260 +- } else if (reg2btf_ids[reg->type]) {
2261 ++ } else if (reg2btf_ids[base_type(reg->type)]) {
2262 + reg_btf = btf_vmlinux;
2263 +- reg_ref_id = *reg2btf_ids[reg->type];
2264 ++ reg_ref_id = *reg2btf_ids[base_type(reg->type)];
2265 + } else {
2266 + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
2267 + func_name, i,
2268 +@@ -5717,7 +5719,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
2269 + return -EINVAL;
2270 + }
2271 +
2272 +- reg->type = PTR_TO_MEM_OR_NULL;
2273 ++ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
2274 + reg->id = ++env->id_gen;
2275 +
2276 + continue;
2277 +@@ -6229,7 +6231,7 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
2278 + .func = bpf_btf_find_by_name_kind,
2279 + .gpl_only = false,
2280 + .ret_type = RET_INTEGER,
2281 +- .arg1_type = ARG_PTR_TO_MEM,
2282 ++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2283 + .arg2_type = ARG_CONST_SIZE,
2284 + .arg3_type = ARG_ANYTHING,
2285 + .arg4_type = ARG_ANYTHING,
2286 +diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
2287 +index 7dbd68195a2b0..fe053ffd89329 100644
2288 +--- a/kernel/bpf/cgroup.c
2289 ++++ b/kernel/bpf/cgroup.c
2290 +@@ -1753,7 +1753,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
2291 + .gpl_only = false,
2292 + .ret_type = RET_INTEGER,
2293 + .arg1_type = ARG_PTR_TO_CTX,
2294 +- .arg2_type = ARG_PTR_TO_MEM,
2295 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2296 + .arg3_type = ARG_CONST_SIZE,
2297 + };
2298 +
2299 +diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
2300 +index 6f600cc95ccda..a711ffe238932 100644
2301 +--- a/kernel/bpf/helpers.c
2302 ++++ b/kernel/bpf/helpers.c
2303 +@@ -530,7 +530,7 @@ const struct bpf_func_proto bpf_strtol_proto = {
2304 + .func = bpf_strtol,
2305 + .gpl_only = false,
2306 + .ret_type = RET_INTEGER,
2307 +- .arg1_type = ARG_PTR_TO_MEM,
2308 ++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2309 + .arg2_type = ARG_CONST_SIZE,
2310 + .arg3_type = ARG_ANYTHING,
2311 + .arg4_type = ARG_PTR_TO_LONG,
2312 +@@ -558,7 +558,7 @@ const struct bpf_func_proto bpf_strtoul_proto = {
2313 + .func = bpf_strtoul,
2314 + .gpl_only = false,
2315 + .ret_type = RET_INTEGER,
2316 +- .arg1_type = ARG_PTR_TO_MEM,
2317 ++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2318 + .arg2_type = ARG_CONST_SIZE,
2319 + .arg3_type = ARG_ANYTHING,
2320 + .arg4_type = ARG_PTR_TO_LONG,
2321 +@@ -630,7 +630,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
2322 + .arg1_type = ARG_PTR_TO_CTX,
2323 + .arg2_type = ARG_CONST_MAP_PTR,
2324 + .arg3_type = ARG_ANYTHING,
2325 +- .arg4_type = ARG_PTR_TO_MEM,
2326 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2327 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
2328 + };
2329 +
2330 +@@ -667,7 +667,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
2331 + const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
2332 + .func = bpf_per_cpu_ptr,
2333 + .gpl_only = false,
2334 +- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
2335 ++ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
2336 + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
2337 + .arg2_type = ARG_ANYTHING,
2338 + };
2339 +@@ -680,7 +680,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
2340 + const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
2341 + .func = bpf_this_cpu_ptr,
2342 + .gpl_only = false,
2343 +- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID,
2344 ++ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
2345 + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
2346 + };
2347 +
2348 +@@ -1013,7 +1013,7 @@ const struct bpf_func_proto bpf_snprintf_proto = {
2349 + .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
2350 + .arg2_type = ARG_CONST_SIZE_OR_ZERO,
2351 + .arg3_type = ARG_PTR_TO_CONST_STR,
2352 +- .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
2353 ++ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
2354 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
2355 + };
2356 +
2357 +diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
2358 +index 6a9542af4212a..b0fa190b09790 100644
2359 +--- a/kernel/bpf/map_iter.c
2360 ++++ b/kernel/bpf/map_iter.c
2361 +@@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = {
2362 + .ctx_arg_info_size = 2,
2363 + .ctx_arg_info = {
2364 + { offsetof(struct bpf_iter__bpf_map_elem, key),
2365 +- PTR_TO_RDONLY_BUF_OR_NULL },
2366 ++ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
2367 + { offsetof(struct bpf_iter__bpf_map_elem, value),
2368 +- PTR_TO_RDWR_BUF_OR_NULL },
2369 ++ PTR_TO_BUF | PTR_MAYBE_NULL },
2370 + },
2371 + };
2372 +
2373 +diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
2374 +index f1c51c45667d3..710ba9de12ce4 100644
2375 +--- a/kernel/bpf/ringbuf.c
2376 ++++ b/kernel/bpf/ringbuf.c
2377 +@@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = {
2378 + .func = bpf_ringbuf_output,
2379 + .ret_type = RET_INTEGER,
2380 + .arg1_type = ARG_CONST_MAP_PTR,
2381 +- .arg2_type = ARG_PTR_TO_MEM,
2382 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2383 + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
2384 + .arg4_type = ARG_ANYTHING,
2385 + };
2386 +diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
2387 +index 42490c39dfbf5..48e02a725563f 100644
2388 +--- a/kernel/bpf/syscall.c
2389 ++++ b/kernel/bpf/syscall.c
2390 +@@ -4753,7 +4753,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
2391 + .gpl_only = false,
2392 + .ret_type = RET_INTEGER,
2393 + .arg1_type = ARG_ANYTHING,
2394 +- .arg2_type = ARG_PTR_TO_MEM,
2395 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2396 + .arg3_type = ARG_CONST_SIZE,
2397 + };
2398 +
2399 +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
2400 +index 670721e39c0e8..d2b119b4fbe74 100644
2401 +--- a/kernel/bpf/verifier.c
2402 ++++ b/kernel/bpf/verifier.c
2403 +@@ -445,18 +445,6 @@ static bool reg_type_not_null(enum bpf_reg_type type)
2404 + type == PTR_TO_SOCK_COMMON;
2405 + }
2406 +
2407 +-static bool reg_type_may_be_null(enum bpf_reg_type type)
2408 +-{
2409 +- return type == PTR_TO_MAP_VALUE_OR_NULL ||
2410 +- type == PTR_TO_SOCKET_OR_NULL ||
2411 +- type == PTR_TO_SOCK_COMMON_OR_NULL ||
2412 +- type == PTR_TO_TCP_SOCK_OR_NULL ||
2413 +- type == PTR_TO_BTF_ID_OR_NULL ||
2414 +- type == PTR_TO_MEM_OR_NULL ||
2415 +- type == PTR_TO_RDONLY_BUF_OR_NULL ||
2416 +- type == PTR_TO_RDWR_BUF_OR_NULL;
2417 +-}
2418 +-
2419 + static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
2420 + {
2421 + return reg->type == PTR_TO_MAP_VALUE &&
2422 +@@ -465,12 +453,14 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
2423 +
2424 + static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
2425 + {
2426 +- return type == PTR_TO_SOCKET ||
2427 +- type == PTR_TO_SOCKET_OR_NULL ||
2428 +- type == PTR_TO_TCP_SOCK ||
2429 +- type == PTR_TO_TCP_SOCK_OR_NULL ||
2430 +- type == PTR_TO_MEM ||
2431 +- type == PTR_TO_MEM_OR_NULL;
2432 ++ return base_type(type) == PTR_TO_SOCKET ||
2433 ++ base_type(type) == PTR_TO_TCP_SOCK ||
2434 ++ base_type(type) == PTR_TO_MEM;
2435 ++}
2436 ++
2437 ++static bool type_is_rdonly_mem(u32 type)
2438 ++{
2439 ++ return type & MEM_RDONLY;
2440 + }
2441 +
2442 + static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
2443 +@@ -478,14 +468,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
2444 + return type == ARG_PTR_TO_SOCK_COMMON;
2445 + }
2446 +
2447 +-static bool arg_type_may_be_null(enum bpf_arg_type type)
2448 ++static bool type_may_be_null(u32 type)
2449 + {
2450 +- return type == ARG_PTR_TO_MAP_VALUE_OR_NULL ||
2451 +- type == ARG_PTR_TO_MEM_OR_NULL ||
2452 +- type == ARG_PTR_TO_CTX_OR_NULL ||
2453 +- type == ARG_PTR_TO_SOCKET_OR_NULL ||
2454 +- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
2455 +- type == ARG_PTR_TO_STACK_OR_NULL;
2456 ++ return type & PTR_MAYBE_NULL;
2457 + }
2458 +
2459 + /* Determine whether the function releases some resources allocated by another
2460 +@@ -545,39 +530,54 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
2461 + insn->imm == BPF_CMPXCHG;
2462 + }
2463 +
2464 +-/* string representation of 'enum bpf_reg_type' */
2465 +-static const char * const reg_type_str[] = {
2466 +- [NOT_INIT] = "?",
2467 +- [SCALAR_VALUE] = "inv",
2468 +- [PTR_TO_CTX] = "ctx",
2469 +- [CONST_PTR_TO_MAP] = "map_ptr",
2470 +- [PTR_TO_MAP_VALUE] = "map_value",
2471 +- [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
2472 +- [PTR_TO_STACK] = "fp",
2473 +- [PTR_TO_PACKET] = "pkt",
2474 +- [PTR_TO_PACKET_META] = "pkt_meta",
2475 +- [PTR_TO_PACKET_END] = "pkt_end",
2476 +- [PTR_TO_FLOW_KEYS] = "flow_keys",
2477 +- [PTR_TO_SOCKET] = "sock",
2478 +- [PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
2479 +- [PTR_TO_SOCK_COMMON] = "sock_common",
2480 +- [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
2481 +- [PTR_TO_TCP_SOCK] = "tcp_sock",
2482 +- [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
2483 +- [PTR_TO_TP_BUFFER] = "tp_buffer",
2484 +- [PTR_TO_XDP_SOCK] = "xdp_sock",
2485 +- [PTR_TO_BTF_ID] = "ptr_",
2486 +- [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
2487 +- [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
2488 +- [PTR_TO_MEM] = "mem",
2489 +- [PTR_TO_MEM_OR_NULL] = "mem_or_null",
2490 +- [PTR_TO_RDONLY_BUF] = "rdonly_buf",
2491 +- [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
2492 +- [PTR_TO_RDWR_BUF] = "rdwr_buf",
2493 +- [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
2494 +- [PTR_TO_FUNC] = "func",
2495 +- [PTR_TO_MAP_KEY] = "map_key",
2496 +-};
2497 ++/* string representation of 'enum bpf_reg_type'
2498 ++ *
2499 ++ * Note that reg_type_str() can not appear more than once in a single verbose()
2500 ++ * statement.
2501 ++ */
2502 ++static const char *reg_type_str(struct bpf_verifier_env *env,
2503 ++ enum bpf_reg_type type)
2504 ++{
2505 ++ char postfix[16] = {0}, prefix[16] = {0};
2506 ++ static const char * const str[] = {
2507 ++ [NOT_INIT] = "?",
2508 ++ [SCALAR_VALUE] = "inv",
2509 ++ [PTR_TO_CTX] = "ctx",
2510 ++ [CONST_PTR_TO_MAP] = "map_ptr",
2511 ++ [PTR_TO_MAP_VALUE] = "map_value",
2512 ++ [PTR_TO_STACK] = "fp",
2513 ++ [PTR_TO_PACKET] = "pkt",
2514 ++ [PTR_TO_PACKET_META] = "pkt_meta",
2515 ++ [PTR_TO_PACKET_END] = "pkt_end",
2516 ++ [PTR_TO_FLOW_KEYS] = "flow_keys",
2517 ++ [PTR_TO_SOCKET] = "sock",
2518 ++ [PTR_TO_SOCK_COMMON] = "sock_common",
2519 ++ [PTR_TO_TCP_SOCK] = "tcp_sock",
2520 ++ [PTR_TO_TP_BUFFER] = "tp_buffer",
2521 ++ [PTR_TO_XDP_SOCK] = "xdp_sock",
2522 ++ [PTR_TO_BTF_ID] = "ptr_",
2523 ++ [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
2524 ++ [PTR_TO_MEM] = "mem",
2525 ++ [PTR_TO_BUF] = "buf",
2526 ++ [PTR_TO_FUNC] = "func",
2527 ++ [PTR_TO_MAP_KEY] = "map_key",
2528 ++ };
2529 ++
2530 ++ if (type & PTR_MAYBE_NULL) {
2531 ++ if (base_type(type) == PTR_TO_BTF_ID ||
2532 ++ base_type(type) == PTR_TO_PERCPU_BTF_ID)
2533 ++ strncpy(postfix, "or_null_", 16);
2534 ++ else
2535 ++ strncpy(postfix, "_or_null", 16);
2536 ++ }
2537 ++
2538 ++ if (type & MEM_RDONLY)
2539 ++ strncpy(prefix, "rdonly_", 16);
2540 ++
2541 ++ snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
2542 ++ prefix, str[base_type(type)], postfix);
2543 ++ return env->type_str_buf;
2544 ++}
2545 +
2546 + static char slot_type_char[] = {
2547 + [STACK_INVALID] = '?',
2548 +@@ -628,7 +628,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
2549 + continue;
2550 + verbose(env, " R%d", i);
2551 + print_liveness(env, reg->live);
2552 +- verbose(env, "=%s", reg_type_str[t]);
2553 ++ verbose(env, "=%s", reg_type_str(env, t));
2554 + if (t == SCALAR_VALUE && reg->precise)
2555 + verbose(env, "P");
2556 + if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
2557 +@@ -636,9 +636,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
2558 + /* reg->off should be 0 for SCALAR_VALUE */
2559 + verbose(env, "%lld", reg->var_off.value + reg->off);
2560 + } else {
2561 +- if (t == PTR_TO_BTF_ID ||
2562 +- t == PTR_TO_BTF_ID_OR_NULL ||
2563 +- t == PTR_TO_PERCPU_BTF_ID)
2564 ++ if (base_type(t) == PTR_TO_BTF_ID ||
2565 ++ base_type(t) == PTR_TO_PERCPU_BTF_ID)
2566 + verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
2567 + verbose(env, "(id=%d", reg->id);
2568 + if (reg_type_may_be_refcounted_or_null(t))
2569 +@@ -647,10 +646,9 @@ static void print_verifier_state(struct bpf_verifier_env *env,
2570 + verbose(env, ",off=%d", reg->off);
2571 + if (type_is_pkt_pointer(t))
2572 + verbose(env, ",r=%d", reg->range);
2573 +- else if (t == CONST_PTR_TO_MAP ||
2574 +- t == PTR_TO_MAP_KEY ||
2575 +- t == PTR_TO_MAP_VALUE ||
2576 +- t == PTR_TO_MAP_VALUE_OR_NULL)
2577 ++ else if (base_type(t) == CONST_PTR_TO_MAP ||
2578 ++ base_type(t) == PTR_TO_MAP_KEY ||
2579 ++ base_type(t) == PTR_TO_MAP_VALUE)
2580 + verbose(env, ",ks=%d,vs=%d",
2581 + reg->map_ptr->key_size,
2582 + reg->map_ptr->value_size);
2583 +@@ -720,7 +718,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
2584 + if (state->stack[i].slot_type[0] == STACK_SPILL) {
2585 + reg = &state->stack[i].spilled_ptr;
2586 + t = reg->type;
2587 +- verbose(env, "=%s", reg_type_str[t]);
2588 ++ verbose(env, "=%s", reg_type_str(env, t));
2589 + if (t == SCALAR_VALUE && reg->precise)
2590 + verbose(env, "P");
2591 + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
2592 +@@ -1133,8 +1131,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
2593 +
2594 + static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
2595 + {
2596 +- switch (reg->type) {
2597 +- case PTR_TO_MAP_VALUE_OR_NULL: {
2598 ++ if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
2599 + const struct bpf_map *map = reg->map_ptr;
2600 +
2601 + if (map->inner_map_meta) {
2602 +@@ -1153,32 +1150,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
2603 + } else {
2604 + reg->type = PTR_TO_MAP_VALUE;
2605 + }
2606 +- break;
2607 +- }
2608 +- case PTR_TO_SOCKET_OR_NULL:
2609 +- reg->type = PTR_TO_SOCKET;
2610 +- break;
2611 +- case PTR_TO_SOCK_COMMON_OR_NULL:
2612 +- reg->type = PTR_TO_SOCK_COMMON;
2613 +- break;
2614 +- case PTR_TO_TCP_SOCK_OR_NULL:
2615 +- reg->type = PTR_TO_TCP_SOCK;
2616 +- break;
2617 +- case PTR_TO_BTF_ID_OR_NULL:
2618 +- reg->type = PTR_TO_BTF_ID;
2619 +- break;
2620 +- case PTR_TO_MEM_OR_NULL:
2621 +- reg->type = PTR_TO_MEM;
2622 +- break;
2623 +- case PTR_TO_RDONLY_BUF_OR_NULL:
2624 +- reg->type = PTR_TO_RDONLY_BUF;
2625 +- break;
2626 +- case PTR_TO_RDWR_BUF_OR_NULL:
2627 +- reg->type = PTR_TO_RDWR_BUF;
2628 +- break;
2629 +- default:
2630 +- WARN_ONCE(1, "unknown nullable register type");
2631 ++ return;
2632 + }
2633 ++
2634 ++ reg->type &= ~PTR_MAYBE_NULL;
2635 + }
2636 +
2637 + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
2638 +@@ -1906,7 +1881,7 @@ static int mark_reg_read(struct bpf_verifier_env *env,
2639 + break;
2640 + if (parent->live & REG_LIVE_DONE) {
2641 + verbose(env, "verifier BUG type %s var_off %lld off %d\n",
2642 +- reg_type_str[parent->type],
2643 ++ reg_type_str(env, parent->type),
2644 + parent->var_off.value, parent->off);
2645 + return -EFAULT;
2646 + }
2647 +@@ -2564,9 +2539,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
2648 +
2649 + static bool is_spillable_regtype(enum bpf_reg_type type)
2650 + {
2651 +- switch (type) {
2652 ++ switch (base_type(type)) {
2653 + case PTR_TO_MAP_VALUE:
2654 +- case PTR_TO_MAP_VALUE_OR_NULL:
2655 + case PTR_TO_STACK:
2656 + case PTR_TO_CTX:
2657 + case PTR_TO_PACKET:
2658 +@@ -2575,21 +2549,13 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
2659 + case PTR_TO_FLOW_KEYS:
2660 + case CONST_PTR_TO_MAP:
2661 + case PTR_TO_SOCKET:
2662 +- case PTR_TO_SOCKET_OR_NULL:
2663 + case PTR_TO_SOCK_COMMON:
2664 +- case PTR_TO_SOCK_COMMON_OR_NULL:
2665 + case PTR_TO_TCP_SOCK:
2666 +- case PTR_TO_TCP_SOCK_OR_NULL:
2667 + case PTR_TO_XDP_SOCK:
2668 + case PTR_TO_BTF_ID:
2669 +- case PTR_TO_BTF_ID_OR_NULL:
2670 +- case PTR_TO_RDONLY_BUF:
2671 +- case PTR_TO_RDONLY_BUF_OR_NULL:
2672 +- case PTR_TO_RDWR_BUF:
2673 +- case PTR_TO_RDWR_BUF_OR_NULL:
2674 ++ case PTR_TO_BUF:
2675 + case PTR_TO_PERCPU_BTF_ID:
2676 + case PTR_TO_MEM:
2677 +- case PTR_TO_MEM_OR_NULL:
2678 + case PTR_TO_FUNC:
2679 + case PTR_TO_MAP_KEY:
2680 + return true;
2681 +@@ -3405,7 +3371,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
2682 + */
2683 + *reg_type = info.reg_type;
2684 +
2685 +- if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) {
2686 ++ if (base_type(*reg_type) == PTR_TO_BTF_ID) {
2687 + *btf = info.btf;
2688 + *btf_id = info.btf_id;
2689 + } else {
2690 +@@ -3473,7 +3439,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
2691 + }
2692 +
2693 + verbose(env, "R%d invalid %s access off=%d size=%d\n",
2694 +- regno, reg_type_str[reg->type], off, size);
2695 ++ regno, reg_type_str(env, reg->type), off, size);
2696 +
2697 + return -EACCES;
2698 + }
2699 +@@ -4200,15 +4166,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2700 + mark_reg_unknown(env, regs, value_regno);
2701 + }
2702 + }
2703 +- } else if (reg->type == PTR_TO_MEM) {
2704 ++ } else if (base_type(reg->type) == PTR_TO_MEM) {
2705 ++ bool rdonly_mem = type_is_rdonly_mem(reg->type);
2706 ++
2707 ++ if (type_may_be_null(reg->type)) {
2708 ++ verbose(env, "R%d invalid mem access '%s'\n", regno,
2709 ++ reg_type_str(env, reg->type));
2710 ++ return -EACCES;
2711 ++ }
2712 ++
2713 ++ if (t == BPF_WRITE && rdonly_mem) {
2714 ++ verbose(env, "R%d cannot write into %s\n",
2715 ++ regno, reg_type_str(env, reg->type));
2716 ++ return -EACCES;
2717 ++ }
2718 ++
2719 + if (t == BPF_WRITE && value_regno >= 0 &&
2720 + is_pointer_value(env, value_regno)) {
2721 + verbose(env, "R%d leaks addr into mem\n", value_regno);
2722 + return -EACCES;
2723 + }
2724 ++
2725 + err = check_mem_region_access(env, regno, off, size,
2726 + reg->mem_size, false);
2727 +- if (!err && t == BPF_READ && value_regno >= 0)
2728 ++ if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
2729 + mark_reg_unknown(env, regs, value_regno);
2730 + } else if (reg->type == PTR_TO_CTX) {
2731 + enum bpf_reg_type reg_type = SCALAR_VALUE;
2732 +@@ -4238,7 +4219,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2733 + } else {
2734 + mark_reg_known_zero(env, regs,
2735 + value_regno);
2736 +- if (reg_type_may_be_null(reg_type))
2737 ++ if (type_may_be_null(reg_type))
2738 + regs[value_regno].id = ++env->id_gen;
2739 + /* A load of ctx field could have different
2740 + * actual load size with the one encoded in the
2741 +@@ -4246,8 +4227,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2742 + * a sub-register.
2743 + */
2744 + regs[value_regno].subreg_def = DEF_NOT_SUBREG;
2745 +- if (reg_type == PTR_TO_BTF_ID ||
2746 +- reg_type == PTR_TO_BTF_ID_OR_NULL) {
2747 ++ if (base_type(reg_type) == PTR_TO_BTF_ID) {
2748 + regs[value_regno].btf = btf;
2749 + regs[value_regno].btf_id = btf_id;
2750 + }
2751 +@@ -4300,7 +4280,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2752 + } else if (type_is_sk_pointer(reg->type)) {
2753 + if (t == BPF_WRITE) {
2754 + verbose(env, "R%d cannot write into %s\n",
2755 +- regno, reg_type_str[reg->type]);
2756 ++ regno, reg_type_str(env, reg->type));
2757 + return -EACCES;
2758 + }
2759 + err = check_sock_access(env, insn_idx, regno, off, size, t);
2760 +@@ -4316,26 +4296,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2761 + } else if (reg->type == CONST_PTR_TO_MAP) {
2762 + err = check_ptr_to_map_access(env, regs, regno, off, size, t,
2763 + value_regno);
2764 +- } else if (reg->type == PTR_TO_RDONLY_BUF) {
2765 +- if (t == BPF_WRITE) {
2766 +- verbose(env, "R%d cannot write into %s\n",
2767 +- regno, reg_type_str[reg->type]);
2768 +- return -EACCES;
2769 ++ } else if (base_type(reg->type) == PTR_TO_BUF) {
2770 ++ bool rdonly_mem = type_is_rdonly_mem(reg->type);
2771 ++ const char *buf_info;
2772 ++ u32 *max_access;
2773 ++
2774 ++ if (rdonly_mem) {
2775 ++ if (t == BPF_WRITE) {
2776 ++ verbose(env, "R%d cannot write into %s\n",
2777 ++ regno, reg_type_str(env, reg->type));
2778 ++ return -EACCES;
2779 ++ }
2780 ++ buf_info = "rdonly";
2781 ++ max_access = &env->prog->aux->max_rdonly_access;
2782 ++ } else {
2783 ++ buf_info = "rdwr";
2784 ++ max_access = &env->prog->aux->max_rdwr_access;
2785 + }
2786 ++
2787 + err = check_buffer_access(env, reg, regno, off, size, false,
2788 +- "rdonly",
2789 +- &env->prog->aux->max_rdonly_access);
2790 +- if (!err && value_regno >= 0)
2791 +- mark_reg_unknown(env, regs, value_regno);
2792 +- } else if (reg->type == PTR_TO_RDWR_BUF) {
2793 +- err = check_buffer_access(env, reg, regno, off, size, false,
2794 +- "rdwr",
2795 +- &env->prog->aux->max_rdwr_access);
2796 +- if (!err && t == BPF_READ && value_regno >= 0)
2797 ++ buf_info, max_access);
2798 ++
2799 ++ if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
2800 + mark_reg_unknown(env, regs, value_regno);
2801 + } else {
2802 + verbose(env, "R%d invalid mem access '%s'\n", regno,
2803 +- reg_type_str[reg->type]);
2804 ++ reg_type_str(env, reg->type));
2805 + return -EACCES;
2806 + }
2807 +
2808 +@@ -4409,7 +4395,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
2809 + is_sk_reg(env, insn->dst_reg)) {
2810 + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
2811 + insn->dst_reg,
2812 +- reg_type_str[reg_state(env, insn->dst_reg)->type]);
2813 ++ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
2814 + return -EACCES;
2815 + }
2816 +
2817 +@@ -4592,8 +4578,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
2818 + struct bpf_call_arg_meta *meta)
2819 + {
2820 + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
2821 ++ const char *buf_info;
2822 ++ u32 *max_access;
2823 +
2824 +- switch (reg->type) {
2825 ++ switch (base_type(reg->type)) {
2826 + case PTR_TO_PACKET:
2827 + case PTR_TO_PACKET_META:
2828 + return check_packet_access(env, regno, reg->off, access_size,
2829 +@@ -4612,18 +4600,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
2830 + return check_mem_region_access(env, regno, reg->off,
2831 + access_size, reg->mem_size,
2832 + zero_size_allowed);
2833 +- case PTR_TO_RDONLY_BUF:
2834 +- if (meta && meta->raw_mode)
2835 +- return -EACCES;
2836 +- return check_buffer_access(env, reg, regno, reg->off,
2837 +- access_size, zero_size_allowed,
2838 +- "rdonly",
2839 +- &env->prog->aux->max_rdonly_access);
2840 +- case PTR_TO_RDWR_BUF:
2841 ++ case PTR_TO_BUF:
2842 ++ if (type_is_rdonly_mem(reg->type)) {
2843 ++ if (meta && meta->raw_mode)
2844 ++ return -EACCES;
2845 ++
2846 ++ buf_info = "rdonly";
2847 ++ max_access = &env->prog->aux->max_rdonly_access;
2848 ++ } else {
2849 ++ buf_info = "rdwr";
2850 ++ max_access = &env->prog->aux->max_rdwr_access;
2851 ++ }
2852 + return check_buffer_access(env, reg, regno, reg->off,
2853 + access_size, zero_size_allowed,
2854 +- "rdwr",
2855 +- &env->prog->aux->max_rdwr_access);
2856 ++ buf_info, max_access);
2857 + case PTR_TO_STACK:
2858 + return check_stack_range_initialized(
2859 + env,
2860 +@@ -4635,9 +4625,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
2861 + register_is_null(reg))
2862 + return 0;
2863 +
2864 +- verbose(env, "R%d type=%s expected=%s\n", regno,
2865 +- reg_type_str[reg->type],
2866 +- reg_type_str[PTR_TO_STACK]);
2867 ++ verbose(env, "R%d type=%s ", regno,
2868 ++ reg_type_str(env, reg->type));
2869 ++ verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
2870 + return -EACCES;
2871 + }
2872 + }
2873 +@@ -4648,7 +4638,7 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
2874 + if (register_is_null(reg))
2875 + return 0;
2876 +
2877 +- if (reg_type_may_be_null(reg->type)) {
2878 ++ if (type_may_be_null(reg->type)) {
2879 + /* Assuming that the register contains a value check if the memory
2880 + * access is safe. Temporarily save and restore the register's state as
2881 + * the conversion shouldn't be visible to a caller.
2882 +@@ -4796,9 +4786,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
2883 +
2884 + static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
2885 + {
2886 +- return type == ARG_PTR_TO_MEM ||
2887 +- type == ARG_PTR_TO_MEM_OR_NULL ||
2888 +- type == ARG_PTR_TO_UNINIT_MEM;
2889 ++ return base_type(type) == ARG_PTR_TO_MEM ||
2890 ++ base_type(type) == ARG_PTR_TO_UNINIT_MEM;
2891 + }
2892 +
2893 + static bool arg_type_is_mem_size(enum bpf_arg_type type)
2894 +@@ -4900,8 +4889,7 @@ static const struct bpf_reg_types mem_types = {
2895 + PTR_TO_MAP_KEY,
2896 + PTR_TO_MAP_VALUE,
2897 + PTR_TO_MEM,
2898 +- PTR_TO_RDONLY_BUF,
2899 +- PTR_TO_RDWR_BUF,
2900 ++ PTR_TO_BUF,
2901 + },
2902 + };
2903 +
2904 +@@ -4932,31 +4920,26 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
2905 + [ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
2906 + [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
2907 + [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types,
2908 +- [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types,
2909 + [ARG_CONST_SIZE] = &scalar_types,
2910 + [ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
2911 + [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
2912 + [ARG_CONST_MAP_PTR] = &const_map_ptr_types,
2913 + [ARG_PTR_TO_CTX] = &context_types,
2914 +- [ARG_PTR_TO_CTX_OR_NULL] = &context_types,
2915 + [ARG_PTR_TO_SOCK_COMMON] = &sock_types,
2916 + #ifdef CONFIG_NET
2917 + [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
2918 + #endif
2919 + [ARG_PTR_TO_SOCKET] = &fullsock_types,
2920 +- [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types,
2921 + [ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
2922 + [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
2923 + [ARG_PTR_TO_MEM] = &mem_types,
2924 +- [ARG_PTR_TO_MEM_OR_NULL] = &mem_types,
2925 + [ARG_PTR_TO_UNINIT_MEM] = &mem_types,
2926 + [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
2927 +- [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types,
2928 + [ARG_PTR_TO_INT] = &int_ptr_types,
2929 + [ARG_PTR_TO_LONG] = &int_ptr_types,
2930 + [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
2931 + [ARG_PTR_TO_FUNC] = &func_ptr_types,
2932 +- [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
2933 ++ [ARG_PTR_TO_STACK] = &stack_ptr_types,
2934 + [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
2935 + [ARG_PTR_TO_TIMER] = &timer_types,
2936 + };
2937 +@@ -4970,12 +4953,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
2938 + const struct bpf_reg_types *compatible;
2939 + int i, j;
2940 +
2941 +- compatible = compatible_reg_types[arg_type];
2942 ++ compatible = compatible_reg_types[base_type(arg_type)];
2943 + if (!compatible) {
2944 + verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
2945 + return -EFAULT;
2946 + }
2947 +
2948 ++ /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY,
2949 ++ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY
2950 ++ *
2951 ++ * Same for MAYBE_NULL:
2952 ++ *
2953 ++ * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
2954 ++ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
2955 ++ *
2956 ++ * Therefore we fold these flags depending on the arg_type before comparison.
2957 ++ */
2958 ++ if (arg_type & MEM_RDONLY)
2959 ++ type &= ~MEM_RDONLY;
2960 ++ if (arg_type & PTR_MAYBE_NULL)
2961 ++ type &= ~PTR_MAYBE_NULL;
2962 ++
2963 + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
2964 + expected = compatible->types[i];
2965 + if (expected == NOT_INIT)
2966 +@@ -4985,14 +4983,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
2967 + goto found;
2968 + }
2969 +
2970 +- verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]);
2971 ++ verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
2972 + for (j = 0; j + 1 < i; j++)
2973 +- verbose(env, "%s, ", reg_type_str[compatible->types[j]]);
2974 +- verbose(env, "%s\n", reg_type_str[compatible->types[j]]);
2975 ++ verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
2976 ++ verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
2977 + return -EACCES;
2978 +
2979 + found:
2980 +- if (type == PTR_TO_BTF_ID) {
2981 ++ if (reg->type == PTR_TO_BTF_ID) {
2982 + if (!arg_btf_id) {
2983 + if (!compatible->btf_id) {
2984 + verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
2985 +@@ -5051,15 +5049,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
2986 + return -EACCES;
2987 + }
2988 +
2989 +- if (arg_type == ARG_PTR_TO_MAP_VALUE ||
2990 +- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
2991 +- arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
2992 ++ if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
2993 ++ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
2994 + err = resolve_map_arg_type(env, meta, &arg_type);
2995 + if (err)
2996 + return err;
2997 + }
2998 +
2999 +- if (register_is_null(reg) && arg_type_may_be_null(arg_type))
3000 ++ if (register_is_null(reg) && type_may_be_null(arg_type))
3001 + /* A NULL register has a SCALAR_VALUE type, so skip
3002 + * type checking.
3003 + */
3004 +@@ -5128,10 +5125,11 @@ skip_type_check:
3005 + err = check_helper_mem_access(env, regno,
3006 + meta->map_ptr->key_size, false,
3007 + NULL);
3008 +- } else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
3009 +- (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
3010 +- !register_is_null(reg)) ||
3011 +- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
3012 ++ } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
3013 ++ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
3014 ++ if (type_may_be_null(arg_type) && register_is_null(reg))
3015 ++ return 0;
3016 ++
3017 + /* bpf_map_xxx(..., map_ptr, ..., value) call:
3018 + * check [value, value + map->value_size) validity
3019 + */
3020 +@@ -6206,6 +6204,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
3021 + int *insn_idx_p)
3022 + {
3023 + const struct bpf_func_proto *fn = NULL;
3024 ++ enum bpf_return_type ret_type;
3025 ++ enum bpf_type_flag ret_flag;
3026 + struct bpf_reg_state *regs;
3027 + struct bpf_call_arg_meta meta;
3028 + int insn_idx = *insn_idx_p;
3029 +@@ -6339,13 +6339,14 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
3030 + regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
3031 +
3032 + /* update return register (already marked as written above) */
3033 +- if (fn->ret_type == RET_INTEGER) {
3034 ++ ret_type = fn->ret_type;
3035 ++ ret_flag = type_flag(fn->ret_type);
3036 ++ if (ret_type == RET_INTEGER) {
3037 + /* sets type to SCALAR_VALUE */
3038 + mark_reg_unknown(env, regs, BPF_REG_0);
3039 +- } else if (fn->ret_type == RET_VOID) {
3040 ++ } else if (ret_type == RET_VOID) {
3041 + regs[BPF_REG_0].type = NOT_INIT;
3042 +- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
3043 +- fn->ret_type == RET_PTR_TO_MAP_VALUE) {
3044 ++ } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) {
3045 + /* There is no offset yet applied, variable or fixed */
3046 + mark_reg_known_zero(env, regs, BPF_REG_0);
3047 + /* remember map_ptr, so that check_map_access()
3048 +@@ -6359,28 +6360,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
3049 + }
3050 + regs[BPF_REG_0].map_ptr = meta.map_ptr;
3051 + regs[BPF_REG_0].map_uid = meta.map_uid;
3052 +- if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
3053 +- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
3054 +- if (map_value_has_spin_lock(meta.map_ptr))
3055 +- regs[BPF_REG_0].id = ++env->id_gen;
3056 +- } else {
3057 +- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
3058 ++ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
3059 ++ if (!type_may_be_null(ret_type) &&
3060 ++ map_value_has_spin_lock(meta.map_ptr)) {
3061 ++ regs[BPF_REG_0].id = ++env->id_gen;
3062 + }
3063 +- } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
3064 ++ } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) {
3065 + mark_reg_known_zero(env, regs, BPF_REG_0);
3066 +- regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
3067 +- } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
3068 ++ regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
3069 ++ } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) {
3070 + mark_reg_known_zero(env, regs, BPF_REG_0);
3071 +- regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
3072 +- } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
3073 ++ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
3074 ++ } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) {
3075 + mark_reg_known_zero(env, regs, BPF_REG_0);
3076 +- regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
3077 +- } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
3078 ++ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
3079 ++ } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
3080 + mark_reg_known_zero(env, regs, BPF_REG_0);
3081 +- regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
3082 ++ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
3083 + regs[BPF_REG_0].mem_size = meta.mem_size;
3084 +- } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
3085 +- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
3086 ++ } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
3087 + const struct btf_type *t;
3088 +
3089 + mark_reg_known_zero(env, regs, BPF_REG_0);
3090 +@@ -6398,29 +6396,30 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
3091 + tname, PTR_ERR(ret));
3092 + return -EINVAL;
3093 + }
3094 +- regs[BPF_REG_0].type =
3095 +- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
3096 +- PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
3097 ++ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
3098 + regs[BPF_REG_0].mem_size = tsize;
3099 + } else {
3100 +- regs[BPF_REG_0].type =
3101 +- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
3102 +- PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
3103 ++ /* MEM_RDONLY may be carried from ret_flag, but it
3104 ++ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
3105 ++ * it will confuse the check of PTR_TO_BTF_ID in
3106 ++ * check_mem_access().
3107 ++ */
3108 ++ ret_flag &= ~MEM_RDONLY;
3109 ++
3110 ++ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
3111 + regs[BPF_REG_0].btf = meta.ret_btf;
3112 + regs[BPF_REG_0].btf_id = meta.ret_btf_id;
3113 + }
3114 +- } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL ||
3115 +- fn->ret_type == RET_PTR_TO_BTF_ID) {
3116 ++ } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
3117 + int ret_btf_id;
3118 +
3119 + mark_reg_known_zero(env, regs, BPF_REG_0);
3120 +- regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ?
3121 +- PTR_TO_BTF_ID :
3122 +- PTR_TO_BTF_ID_OR_NULL;
3123 ++ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
3124 + ret_btf_id = *fn->ret_btf_id;
3125 + if (ret_btf_id == 0) {
3126 +- verbose(env, "invalid return type %d of func %s#%d\n",
3127 +- fn->ret_type, func_id_name(func_id), func_id);
3128 ++ verbose(env, "invalid return type %u of func %s#%d\n",
3129 ++ base_type(ret_type), func_id_name(func_id),
3130 ++ func_id);
3131 + return -EINVAL;
3132 + }
3133 + /* current BPF helper definitions are only coming from
3134 +@@ -6429,12 +6428,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
3135 + regs[BPF_REG_0].btf = btf_vmlinux;
3136 + regs[BPF_REG_0].btf_id = ret_btf_id;
3137 + } else {
3138 +- verbose(env, "unknown return type %d of func %s#%d\n",
3139 +- fn->ret_type, func_id_name(func_id), func_id);
3140 ++ verbose(env, "unknown return type %u of func %s#%d\n",
3141 ++ base_type(ret_type), func_id_name(func_id), func_id);
3142 + return -EINVAL;
3143 + }
3144 +
3145 +- if (reg_type_may_be_null(regs[BPF_REG_0].type))
3146 ++ if (type_may_be_null(regs[BPF_REG_0].type))
3147 + regs[BPF_REG_0].id = ++env->id_gen;
3148 +
3149 + if (is_ptr_cast_function(func_id)) {
3150 +@@ -6633,25 +6632,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
3151 +
3152 + if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
3153 + verbose(env, "math between %s pointer and %lld is not allowed\n",
3154 +- reg_type_str[type], val);
3155 ++ reg_type_str(env, type), val);
3156 + return false;
3157 + }
3158 +
3159 + if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
3160 + verbose(env, "%s pointer offset %d is not allowed\n",
3161 +- reg_type_str[type], reg->off);
3162 ++ reg_type_str(env, type), reg->off);
3163 + return false;
3164 + }
3165 +
3166 + if (smin == S64_MIN) {
3167 + verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
3168 +- reg_type_str[type]);
3169 ++ reg_type_str(env, type));
3170 + return false;
3171 + }
3172 +
3173 + if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
3174 + verbose(env, "value %lld makes %s pointer be out of bounds\n",
3175 +- smin, reg_type_str[type]);
3176 ++ smin, reg_type_str(env, type));
3177 + return false;
3178 + }
3179 +
3180 +@@ -7028,11 +7027,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
3181 + return -EACCES;
3182 + }
3183 +
3184 +- switch (ptr_reg->type) {
3185 +- case PTR_TO_MAP_VALUE_OR_NULL:
3186 ++ if (ptr_reg->type & PTR_MAYBE_NULL) {
3187 + verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
3188 +- dst, reg_type_str[ptr_reg->type]);
3189 ++ dst, reg_type_str(env, ptr_reg->type));
3190 + return -EACCES;
3191 ++ }
3192 ++
3193 ++ switch (base_type(ptr_reg->type)) {
3194 + case CONST_PTR_TO_MAP:
3195 + /* smin_val represents the known value */
3196 + if (known && smin_val == 0 && opcode == BPF_ADD)
3197 +@@ -7045,10 +7046,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
3198 + case PTR_TO_XDP_SOCK:
3199 + reject:
3200 + verbose(env, "R%d pointer arithmetic on %s prohibited\n",
3201 +- dst, reg_type_str[ptr_reg->type]);
3202 ++ dst, reg_type_str(env, ptr_reg->type));
3203 + return -EACCES;
3204 + default:
3205 +- if (reg_type_may_be_null(ptr_reg->type))
3206 ++ if (type_may_be_null(ptr_reg->type))
3207 + goto reject;
3208 + break;
3209 + }
3210 +@@ -8770,7 +8771,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
3211 + struct bpf_reg_state *reg, u32 id,
3212 + bool is_null)
3213 + {
3214 +- if (reg_type_may_be_null(reg->type) && reg->id == id &&
3215 ++ if (type_may_be_null(reg->type) && reg->id == id &&
3216 + !WARN_ON_ONCE(!reg->id)) {
3217 + if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
3218 + !tnum_equals_const(reg->var_off, 0) ||
3219 +@@ -9148,7 +9149,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
3220 + */
3221 + if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
3222 + insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
3223 +- reg_type_may_be_null(dst_reg->type)) {
3224 ++ type_may_be_null(dst_reg->type)) {
3225 + /* Mark all identical registers in each branch as either
3226 + * safe or unknown depending R == 0 or R != 0 conditional.
3227 + */
3228 +@@ -9207,7 +9208,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
3229 +
3230 + if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
3231 + dst_reg->type = aux->btf_var.reg_type;
3232 +- switch (dst_reg->type) {
3233 ++ switch (base_type(dst_reg->type)) {
3234 + case PTR_TO_MEM:
3235 + dst_reg->mem_size = aux->btf_var.mem_size;
3236 + break;
3237 +@@ -9404,7 +9405,7 @@ static int check_return_code(struct bpf_verifier_env *env)
3238 + /* enforce return zero from async callbacks like timer */
3239 + if (reg->type != SCALAR_VALUE) {
3240 + verbose(env, "In async callback the register R0 is not a known value (%s)\n",
3241 +- reg_type_str[reg->type]);
3242 ++ reg_type_str(env, reg->type));
3243 + return -EINVAL;
3244 + }
3245 +
3246 +@@ -9418,7 +9419,7 @@ static int check_return_code(struct bpf_verifier_env *env)
3247 + if (is_subprog) {
3248 + if (reg->type != SCALAR_VALUE) {
3249 + verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
3250 +- reg_type_str[reg->type]);
3251 ++ reg_type_str(env, reg->type));
3252 + return -EINVAL;
3253 + }
3254 + return 0;
3255 +@@ -9482,7 +9483,7 @@ static int check_return_code(struct bpf_verifier_env *env)
3256 +
3257 + if (reg->type != SCALAR_VALUE) {
3258 + verbose(env, "At program exit the register R0 is not a known value (%s)\n",
3259 +- reg_type_str[reg->type]);
3260 ++ reg_type_str(env, reg->type));
3261 + return -EINVAL;
3262 + }
3263 +
3264 +@@ -10263,7 +10264,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
3265 + return true;
3266 + if (rcur->type == NOT_INIT)
3267 + return false;
3268 +- switch (rold->type) {
3269 ++ switch (base_type(rold->type)) {
3270 + case SCALAR_VALUE:
3271 + if (env->explore_alu_limits)
3272 + return false;
3273 +@@ -10285,6 +10286,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
3274 + }
3275 + case PTR_TO_MAP_KEY:
3276 + case PTR_TO_MAP_VALUE:
3277 ++ /* a PTR_TO_MAP_VALUE could be safe to use as a
3278 ++ * PTR_TO_MAP_VALUE_OR_NULL into the same map.
3279 ++ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
3280 ++ * checked, doing so could have affected others with the same
3281 ++ * id, and we can't check for that because we lost the id when
3282 ++ * we converted to a PTR_TO_MAP_VALUE.
3283 ++ */
3284 ++ if (type_may_be_null(rold->type)) {
3285 ++ if (!type_may_be_null(rcur->type))
3286 ++ return false;
3287 ++ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
3288 ++ return false;
3289 ++ /* Check our ids match any regs they're supposed to */
3290 ++ return check_ids(rold->id, rcur->id, idmap);
3291 ++ }
3292 ++
3293 + /* If the new min/max/var_off satisfy the old ones and
3294 + * everything else matches, we are OK.
3295 + * 'id' is not compared, since it's only used for maps with
3296 +@@ -10296,20 +10313,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
3297 + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
3298 + range_within(rold, rcur) &&
3299 + tnum_in(rold->var_off, rcur->var_off);
3300 +- case PTR_TO_MAP_VALUE_OR_NULL:
3301 +- /* a PTR_TO_MAP_VALUE could be safe to use as a
3302 +- * PTR_TO_MAP_VALUE_OR_NULL into the same map.
3303 +- * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
3304 +- * checked, doing so could have affected others with the same
3305 +- * id, and we can't check for that because we lost the id when
3306 +- * we converted to a PTR_TO_MAP_VALUE.
3307 +- */
3308 +- if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
3309 +- return false;
3310 +- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
3311 +- return false;
3312 +- /* Check our ids match any regs they're supposed to */
3313 +- return check_ids(rold->id, rcur->id, idmap);
3314 + case PTR_TO_PACKET_META:
3315 + case PTR_TO_PACKET:
3316 + if (rcur->type != rold->type)
3317 +@@ -10338,11 +10341,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
3318 + case PTR_TO_PACKET_END:
3319 + case PTR_TO_FLOW_KEYS:
3320 + case PTR_TO_SOCKET:
3321 +- case PTR_TO_SOCKET_OR_NULL:
3322 + case PTR_TO_SOCK_COMMON:
3323 +- case PTR_TO_SOCK_COMMON_OR_NULL:
3324 + case PTR_TO_TCP_SOCK:
3325 +- case PTR_TO_TCP_SOCK_OR_NULL:
3326 + case PTR_TO_XDP_SOCK:
3327 + /* Only valid matches are exact, which memcmp() above
3328 + * would have accepted
3329 +@@ -10868,17 +10868,13 @@ next:
3330 + /* Return true if it's OK to have the same insn return a different type. */
3331 + static bool reg_type_mismatch_ok(enum bpf_reg_type type)
3332 + {
3333 +- switch (type) {
3334 ++ switch (base_type(type)) {
3335 + case PTR_TO_CTX:
3336 + case PTR_TO_SOCKET:
3337 +- case PTR_TO_SOCKET_OR_NULL:
3338 + case PTR_TO_SOCK_COMMON:
3339 +- case PTR_TO_SOCK_COMMON_OR_NULL:
3340 + case PTR_TO_TCP_SOCK:
3341 +- case PTR_TO_TCP_SOCK_OR_NULL:
3342 + case PTR_TO_XDP_SOCK:
3343 + case PTR_TO_BTF_ID:
3344 +- case PTR_TO_BTF_ID_OR_NULL:
3345 + return false;
3346 + default:
3347 + return true;
3348 +@@ -11102,7 +11098,7 @@ static int do_check(struct bpf_verifier_env *env)
3349 + if (is_ctx_reg(env, insn->dst_reg)) {
3350 + verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
3351 + insn->dst_reg,
3352 +- reg_type_str[reg_state(env, insn->dst_reg)->type]);
3353 ++ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
3354 + return -EACCES;
3355 + }
3356 +
3357 +@@ -11353,7 +11349,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
3358 + err = -EINVAL;
3359 + goto err_put;
3360 + }
3361 +- aux->btf_var.reg_type = PTR_TO_MEM;
3362 ++ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
3363 + aux->btf_var.mem_size = tsize;
3364 + } else {
3365 + aux->btf_var.reg_type = PTR_TO_BTF_ID;
3366 +@@ -13175,7 +13171,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
3367 + mark_reg_known_zero(env, regs, i);
3368 + else if (regs[i].type == SCALAR_VALUE)
3369 + mark_reg_unknown(env, regs, i);
3370 +- else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
3371 ++ else if (base_type(regs[i].type) == PTR_TO_MEM) {
3372 + const u32 mem_size = regs[i].mem_size;
3373 +
3374 + mark_reg_known_zero(env, regs, i);
3375 +diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
3376 +index 5a18b861fcf75..c289010b0964e 100644
3377 +--- a/kernel/trace/bpf_trace.c
3378 ++++ b/kernel/trace/bpf_trace.c
3379 +@@ -345,7 +345,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
3380 + .gpl_only = true,
3381 + .ret_type = RET_INTEGER,
3382 + .arg1_type = ARG_ANYTHING,
3383 +- .arg2_type = ARG_PTR_TO_MEM,
3384 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3385 + .arg3_type = ARG_CONST_SIZE,
3386 + };
3387 +
3388 +@@ -394,7 +394,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
3389 + .func = bpf_trace_printk,
3390 + .gpl_only = true,
3391 + .ret_type = RET_INTEGER,
3392 +- .arg1_type = ARG_PTR_TO_MEM,
3393 ++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3394 + .arg2_type = ARG_CONST_SIZE,
3395 + };
3396 +
3397 +@@ -446,9 +446,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = {
3398 + .ret_type = RET_INTEGER,
3399 + .arg1_type = ARG_PTR_TO_BTF_ID,
3400 + .arg1_btf_id = &btf_seq_file_ids[0],
3401 +- .arg2_type = ARG_PTR_TO_MEM,
3402 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3403 + .arg3_type = ARG_CONST_SIZE,
3404 +- .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
3405 ++ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
3406 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3407 + };
3408 +
3409 +@@ -463,7 +463,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = {
3410 + .ret_type = RET_INTEGER,
3411 + .arg1_type = ARG_PTR_TO_BTF_ID,
3412 + .arg1_btf_id = &btf_seq_file_ids[0],
3413 +- .arg2_type = ARG_PTR_TO_MEM,
3414 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3415 + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
3416 + };
3417 +
3418 +@@ -487,7 +487,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = {
3419 + .ret_type = RET_INTEGER,
3420 + .arg1_type = ARG_PTR_TO_BTF_ID,
3421 + .arg1_btf_id = &btf_seq_file_ids[0],
3422 +- .arg2_type = ARG_PTR_TO_MEM,
3423 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3424 + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
3425 + .arg4_type = ARG_ANYTHING,
3426 + };
3427 +@@ -648,7 +648,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
3428 + .arg1_type = ARG_PTR_TO_CTX,
3429 + .arg2_type = ARG_CONST_MAP_PTR,
3430 + .arg3_type = ARG_ANYTHING,
3431 +- .arg4_type = ARG_PTR_TO_MEM,
3432 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3433 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3434 + };
3435 +
3436 +@@ -958,7 +958,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
3437 + .ret_type = RET_INTEGER,
3438 + .arg1_type = ARG_PTR_TO_MEM,
3439 + .arg2_type = ARG_CONST_SIZE,
3440 +- .arg3_type = ARG_PTR_TO_MEM,
3441 ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3442 + .arg4_type = ARG_CONST_SIZE,
3443 + .arg5_type = ARG_ANYTHING,
3444 + };
3445 +@@ -1207,7 +1207,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
3446 + .arg1_type = ARG_PTR_TO_CTX,
3447 + .arg2_type = ARG_CONST_MAP_PTR,
3448 + .arg3_type = ARG_ANYTHING,
3449 +- .arg4_type = ARG_PTR_TO_MEM,
3450 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3451 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3452 + };
3453 +
3454 +@@ -1429,7 +1429,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
3455 + .arg1_type = ARG_PTR_TO_CTX,
3456 + .arg2_type = ARG_CONST_MAP_PTR,
3457 + .arg3_type = ARG_ANYTHING,
3458 +- .arg4_type = ARG_PTR_TO_MEM,
3459 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3460 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3461 + };
3462 +
3463 +@@ -1483,7 +1483,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
3464 + .gpl_only = true,
3465 + .ret_type = RET_INTEGER,
3466 + .arg1_type = ARG_PTR_TO_CTX,
3467 +- .arg2_type = ARG_PTR_TO_MEM,
3468 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3469 + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
3470 + .arg4_type = ARG_ANYTHING,
3471 + };
3472 +diff --git a/lib/iov_iter.c b/lib/iov_iter.c
3473 +index c5b2f0f4b8a84..6d146f77601d7 100644
3474 +--- a/lib/iov_iter.c
3475 ++++ b/lib/iov_iter.c
3476 +@@ -191,7 +191,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
3477 + buf = iov->iov_base + skip;
3478 + copy = min(bytes, iov->iov_len - skip);
3479 +
3480 +- if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
3481 ++ if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
3482 + kaddr = kmap_atomic(page);
3483 + from = kaddr + offset;
3484 +
3485 +@@ -275,7 +275,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
3486 + buf = iov->iov_base + skip;
3487 + copy = min(bytes, iov->iov_len - skip);
3488 +
3489 +- if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
3490 ++ if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
3491 + kaddr = kmap_atomic(page);
3492 + to = kaddr + offset;
3493 +
3494 +@@ -431,35 +431,81 @@ out:
3495 + }
3496 +
3497 + /*
3498 ++ * fault_in_iov_iter_readable - fault in iov iterator for reading
3499 ++ * @i: iterator
3500 ++ * @size: maximum length
3501 ++ *
3502 + * Fault in one or more iovecs of the given iov_iter, to a maximum length of
3503 +- * bytes. For each iovec, fault in each page that constitutes the iovec.
3504 ++ * @size. For each iovec, fault in each page that constitutes the iovec.
3505 ++ *
3506 ++ * Returns the number of bytes not faulted in (like copy_to_user() and
3507 ++ * copy_from_user()).
3508 + *
3509 +- * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
3510 +- * because it is an invalid address).
3511 ++ * Always returns 0 for non-userspace iterators.
3512 + */
3513 +-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
3514 ++size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
3515 + {
3516 + if (iter_is_iovec(i)) {
3517 ++ size_t count = min(size, iov_iter_count(i));
3518 + const struct iovec *p;
3519 + size_t skip;
3520 +
3521 +- if (bytes > i->count)
3522 +- bytes = i->count;
3523 +- for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
3524 +- size_t len = min(bytes, p->iov_len - skip);
3525 +- int err;
3526 ++ size -= count;
3527 ++ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
3528 ++ size_t len = min(count, p->iov_len - skip);
3529 ++ size_t ret;
3530 +
3531 + if (unlikely(!len))
3532 + continue;
3533 +- err = fault_in_pages_readable(p->iov_base + skip, len);
3534 +- if (unlikely(err))
3535 +- return err;
3536 +- bytes -= len;
3537 ++ ret = fault_in_readable(p->iov_base + skip, len);
3538 ++ count -= len - ret;
3539 ++ if (ret)
3540 ++ break;
3541 + }
3542 ++ return count + size;
3543 + }
3544 + return 0;
3545 + }
3546 +-EXPORT_SYMBOL(iov_iter_fault_in_readable);
3547 ++EXPORT_SYMBOL(fault_in_iov_iter_readable);
3548 ++
3549 ++/*
3550 ++ * fault_in_iov_iter_writeable - fault in iov iterator for writing
3551 ++ * @i: iterator
3552 ++ * @size: maximum length
3553 ++ *
3554 ++ * Faults in the iterator using get_user_pages(), i.e., without triggering
3555 ++ * hardware page faults. This is primarily useful when we already know that
3556 ++ * some or all of the pages in @i aren't in memory.
3557 ++ *
3558 ++ * Returns the number of bytes not faulted in, like copy_to_user() and
3559 ++ * copy_from_user().
3560 ++ *
3561 ++ * Always returns 0 for non-user-space iterators.
3562 ++ */
3563 ++size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
3564 ++{
3565 ++ if (iter_is_iovec(i)) {
3566 ++ size_t count = min(size, iov_iter_count(i));
3567 ++ const struct iovec *p;
3568 ++ size_t skip;
3569 ++
3570 ++ size -= count;
3571 ++ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
3572 ++ size_t len = min(count, p->iov_len - skip);
3573 ++ size_t ret;
3574 ++
3575 ++ if (unlikely(!len))
3576 ++ continue;
3577 ++ ret = fault_in_safe_writeable(p->iov_base + skip, len);
3578 ++ count -= len - ret;
3579 ++ if (ret)
3580 ++ break;
3581 ++ }
3582 ++ return count + size;
3583 ++ }
3584 ++ return 0;
3585 ++}
3586 ++EXPORT_SYMBOL(fault_in_iov_iter_writeable);
3587 +
3588 + void iov_iter_init(struct iov_iter *i, unsigned int direction,
3589 + const struct iovec *iov, unsigned long nr_segs,
3590 +@@ -468,6 +514,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
3591 + WARN_ON(direction & ~(READ | WRITE));
3592 + *i = (struct iov_iter) {
3593 + .iter_type = ITER_IOVEC,
3594 ++ .nofault = false,
3595 + .data_source = direction,
3596 + .iov = iov,
3597 + .nr_segs = nr_segs,
3598 +@@ -1483,13 +1530,17 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
3599 + return 0;
3600 +
3601 + if (likely(iter_is_iovec(i))) {
3602 ++ unsigned int gup_flags = 0;
3603 + unsigned long addr;
3604 +
3605 ++ if (iov_iter_rw(i) != WRITE)
3606 ++ gup_flags |= FOLL_WRITE;
3607 ++ if (i->nofault)
3608 ++ gup_flags |= FOLL_NOFAULT;
3609 ++
3610 + addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
3611 + n = DIV_ROUND_UP(len, PAGE_SIZE);
3612 +- res = get_user_pages_fast(addr, n,
3613 +- iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
3614 +- pages);
3615 ++ res = get_user_pages_fast(addr, n, gup_flags, pages);
3616 + if (unlikely(res <= 0))
3617 + return res;
3618 + return (res == n ? len : res * PAGE_SIZE) - *start;
3619 +@@ -1605,15 +1656,20 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
3620 + return 0;
3621 +
3622 + if (likely(iter_is_iovec(i))) {
3623 ++ unsigned int gup_flags = 0;
3624 + unsigned long addr;
3625 +
3626 ++ if (iov_iter_rw(i) != WRITE)
3627 ++ gup_flags |= FOLL_WRITE;
3628 ++ if (i->nofault)
3629 ++ gup_flags |= FOLL_NOFAULT;
3630 ++
3631 + addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
3632 + n = DIV_ROUND_UP(len, PAGE_SIZE);
3633 + p = get_pages_array(n);
3634 + if (!p)
3635 + return -ENOMEM;
3636 +- res = get_user_pages_fast(addr, n,
3637 +- iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
3638 ++ res = get_user_pages_fast(addr, n, gup_flags, p);
3639 + if (unlikely(res <= 0)) {
3640 + kvfree(p);
3641 + *pages = NULL;
3642 +diff --git a/mm/filemap.c b/mm/filemap.c
3643 +index 1293c3409e429..00e391e758801 100644
3644 +--- a/mm/filemap.c
3645 ++++ b/mm/filemap.c
3646 +@@ -90,7 +90,7 @@
3647 + * ->lock_page (filemap_fault, access_process_vm)
3648 + *
3649 + * ->i_rwsem (generic_perform_write)
3650 +- * ->mmap_lock (fault_in_pages_readable->do_page_fault)
3651 ++ * ->mmap_lock (fault_in_readable->do_page_fault)
3652 + *
3653 + * bdi->wb.list_lock
3654 + * sb_lock (fs/fs-writeback.c)
3655 +@@ -3760,7 +3760,7 @@ again:
3656 + * same page as we're writing to, without it being marked
3657 + * up-to-date.
3658 + */
3659 +- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
3660 ++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
3661 + status = -EFAULT;
3662 + break;
3663 + }
3664 +diff --git a/mm/gup.c b/mm/gup.c
3665 +index 52f08e3177e9f..ba2ab7a223f8e 100644
3666 +--- a/mm/gup.c
3667 ++++ b/mm/gup.c
3668 +@@ -943,6 +943,8 @@ static int faultin_page(struct vm_area_struct *vma,
3669 + /* mlock all present pages, but do not fault in new pages */
3670 + if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
3671 + return -ENOENT;
3672 ++ if (*flags & FOLL_NOFAULT)
3673 ++ return -EFAULT;
3674 + if (*flags & FOLL_WRITE)
3675 + fault_flags |= FAULT_FLAG_WRITE;
3676 + if (*flags & FOLL_REMOTE)
3677 +@@ -1681,6 +1683,122 @@ finish_or_fault:
3678 + }
3679 + #endif /* !CONFIG_MMU */
3680 +
3681 ++/**
3682 ++ * fault_in_writeable - fault in userspace address range for writing
3683 ++ * @uaddr: start of address range
3684 ++ * @size: size of address range
3685 ++ *
3686 ++ * Returns the number of bytes not faulted in (like copy_to_user() and
3687 ++ * copy_from_user()).
3688 ++ */
3689 ++size_t fault_in_writeable(char __user *uaddr, size_t size)
3690 ++{
3691 ++ char __user *start = uaddr, *end;
3692 ++
3693 ++ if (unlikely(size == 0))
3694 ++ return 0;
3695 ++ if (!PAGE_ALIGNED(uaddr)) {
3696 ++ if (unlikely(__put_user(0, uaddr) != 0))
3697 ++ return size;
3698 ++ uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
3699 ++ }
3700 ++ end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
3701 ++ if (unlikely(end < start))
3702 ++ end = NULL;
3703 ++ while (uaddr != end) {
3704 ++ if (unlikely(__put_user(0, uaddr) != 0))
3705 ++ goto out;
3706 ++ uaddr += PAGE_SIZE;
3707 ++ }
3708 ++
3709 ++out:
3710 ++ if (size > uaddr - start)
3711 ++ return size - (uaddr - start);
3712 ++ return 0;
3713 ++}
3714 ++EXPORT_SYMBOL(fault_in_writeable);
3715 ++
3716 ++/*
3717 ++ * fault_in_safe_writeable - fault in an address range for writing
3718 ++ * @uaddr: start of address range
3719 ++ * @size: length of address range
3720 ++ *
3721 ++ * Faults in an address range for writing. This is primarily useful when we
3722 ++ * already know that some or all of the pages in the address range aren't in
3723 ++ * memory.
3724 ++ *
3725 ++ * Unlike fault_in_writeable(), this function is non-destructive.
3726 ++ *
3727 ++ * Note that we don't pin or otherwise hold the pages referenced that we fault
3728 ++ * in. There's no guarantee that they'll stay in memory for any duration of
3729 ++ * time.
3730 ++ *
3731 ++ * Returns the number of bytes not faulted in, like copy_to_user() and
3732 ++ * copy_from_user().
3733 ++ */
3734 ++size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
3735 ++{
3736 ++ unsigned long start = (unsigned long)uaddr, end;
3737 ++ struct mm_struct *mm = current->mm;
3738 ++ bool unlocked = false;
3739 ++
3740 ++ if (unlikely(size == 0))
3741 ++ return 0;
3742 ++ end = PAGE_ALIGN(start + size);
3743 ++ if (end < start)
3744 ++ end = 0;
3745 ++
3746 ++ mmap_read_lock(mm);
3747 ++ do {
3748 ++ if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
3749 ++ break;
3750 ++ start = (start + PAGE_SIZE) & PAGE_MASK;
3751 ++ } while (start != end);
3752 ++ mmap_read_unlock(mm);
3753 ++
3754 ++ if (size > (unsigned long)uaddr - start)
3755 ++ return size - ((unsigned long)uaddr - start);
3756 ++ return 0;
3757 ++}
3758 ++EXPORT_SYMBOL(fault_in_safe_writeable);
3759 ++
3760 ++/**
3761 ++ * fault_in_readable - fault in userspace address range for reading
3762 ++ * @uaddr: start of user address range
3763 ++ * @size: size of user address range
3764 ++ *
3765 ++ * Returns the number of bytes not faulted in (like copy_to_user() and
3766 ++ * copy_from_user()).
3767 ++ */
3768 ++size_t fault_in_readable(const char __user *uaddr, size_t size)
3769 ++{
3770 ++ const char __user *start = uaddr, *end;
3771 ++ volatile char c;
3772 ++
3773 ++ if (unlikely(size == 0))
3774 ++ return 0;
3775 ++ if (!PAGE_ALIGNED(uaddr)) {
3776 ++ if (unlikely(__get_user(c, uaddr) != 0))
3777 ++ return size;
3778 ++ uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
3779 ++ }
3780 ++ end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
3781 ++ if (unlikely(end < start))
3782 ++ end = NULL;
3783 ++ while (uaddr != end) {
3784 ++ if (unlikely(__get_user(c, uaddr) != 0))
3785 ++ goto out;
3786 ++ uaddr += PAGE_SIZE;
3787 ++ }
3788 ++
3789 ++out:
3790 ++ (void)c;
3791 ++ if (size > uaddr - start)
3792 ++ return size - (uaddr - start);
3793 ++ return 0;
3794 ++}
3795 ++EXPORT_SYMBOL(fault_in_readable);
3796 ++
3797 + /**
3798 + * get_dump_page() - pin user page in memory while writing it to core dump
3799 + * @addr: user address
3800 +@@ -2733,7 +2851,7 @@ static int internal_get_user_pages_fast(unsigned long start,
3801 +
3802 + if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
3803 + FOLL_FORCE | FOLL_PIN | FOLL_GET |
3804 +- FOLL_FAST_ONLY)))
3805 ++ FOLL_FAST_ONLY | FOLL_NOFAULT)))
3806 + return -EINVAL;
3807 +
3808 + if (gup_flags & FOLL_PIN)
3809 +diff --git a/mm/kfence/core.c b/mm/kfence/core.c
3810 +index 86260e8f28302..66076d8742b78 100644
3811 +--- a/mm/kfence/core.c
3812 ++++ b/mm/kfence/core.c
3813 +@@ -528,6 +528,8 @@ static bool __init kfence_init_pool(void)
3814 + * enters __slab_free() slow-path.
3815 + */
3816 + for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
3817 ++ struct page *page = &pages[i];
3818 ++
3819 + if (!i || (i % 2))
3820 + continue;
3821 +
3822 +@@ -535,7 +537,11 @@ static bool __init kfence_init_pool(void)
3823 + if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
3824 + goto err;
3825 +
3826 +- __SetPageSlab(&pages[i]);
3827 ++ __SetPageSlab(page);
3828 ++#ifdef CONFIG_MEMCG
3829 ++ page->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
3830 ++ MEMCG_DATA_OBJCGS;
3831 ++#endif
3832 + }
3833 +
3834 + /*
3835 +@@ -911,6 +917,9 @@ void __kfence_free(void *addr)
3836 + {
3837 + struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
3838 +
3839 ++#ifdef CONFIG_MEMCG
3840 ++ KFENCE_WARN_ON(meta->objcg);
3841 ++#endif
3842 + /*
3843 + * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
3844 + * the object, as the object page may be recycled for other-typed
3845 +diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
3846 +index 92bf6eff6060d..600f2e2431d6d 100644
3847 +--- a/mm/kfence/kfence.h
3848 ++++ b/mm/kfence/kfence.h
3849 +@@ -89,6 +89,9 @@ struct kfence_metadata {
3850 + struct kfence_track free_track;
3851 + /* For updating alloc_covered on frees. */
3852 + u32 alloc_stack_hash;
3853 ++#ifdef CONFIG_MEMCG
3854 ++ struct obj_cgroup *objcg;
3855 ++#endif
3856 + };
3857 +
3858 + extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
3859 +diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
3860 +index 68d2cbf8331ac..ea61dfe19c869 100644
3861 +--- a/net/core/bpf_sk_storage.c
3862 ++++ b/net/core/bpf_sk_storage.c
3863 +@@ -929,7 +929,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
3864 + { offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
3865 + PTR_TO_BTF_ID_OR_NULL },
3866 + { offsetof(struct bpf_iter__bpf_sk_storage_map, value),
3867 +- PTR_TO_RDWR_BUF_OR_NULL },
3868 ++ PTR_TO_BUF | PTR_MAYBE_NULL },
3869 + },
3870 + .seq_info = &iter_seq_info,
3871 + };
3872 +diff --git a/net/core/filter.c b/net/core/filter.c
3873 +index cdd7e92db3030..821278b906b71 100644
3874 +--- a/net/core/filter.c
3875 ++++ b/net/core/filter.c
3876 +@@ -1713,7 +1713,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
3877 + .ret_type = RET_INTEGER,
3878 + .arg1_type = ARG_PTR_TO_CTX,
3879 + .arg2_type = ARG_ANYTHING,
3880 +- .arg3_type = ARG_PTR_TO_MEM,
3881 ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3882 + .arg4_type = ARG_CONST_SIZE,
3883 + .arg5_type = ARG_ANYTHING,
3884 + };
3885 +@@ -2018,9 +2018,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
3886 + .gpl_only = false,
3887 + .pkt_access = true,
3888 + .ret_type = RET_INTEGER,
3889 +- .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
3890 ++ .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
3891 + .arg2_type = ARG_CONST_SIZE_OR_ZERO,
3892 +- .arg3_type = ARG_PTR_TO_MEM_OR_NULL,
3893 ++ .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
3894 + .arg4_type = ARG_CONST_SIZE_OR_ZERO,
3895 + .arg5_type = ARG_ANYTHING,
3896 + };
3897 +@@ -2541,7 +2541,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = {
3898 + .gpl_only = false,
3899 + .ret_type = RET_INTEGER,
3900 + .arg1_type = ARG_ANYTHING,
3901 +- .arg2_type = ARG_PTR_TO_MEM_OR_NULL,
3902 ++ .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
3903 + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
3904 + .arg4_type = ARG_ANYTHING,
3905 + };
3906 +@@ -4177,7 +4177,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
3907 + .arg1_type = ARG_PTR_TO_CTX,
3908 + .arg2_type = ARG_CONST_MAP_PTR,
3909 + .arg3_type = ARG_ANYTHING,
3910 +- .arg4_type = ARG_PTR_TO_MEM,
3911 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3912 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3913 + };
3914 +
3915 +@@ -4191,7 +4191,7 @@ const struct bpf_func_proto bpf_skb_output_proto = {
3916 + .arg1_btf_id = &bpf_skb_output_btf_ids[0],
3917 + .arg2_type = ARG_CONST_MAP_PTR,
3918 + .arg3_type = ARG_ANYTHING,
3919 +- .arg4_type = ARG_PTR_TO_MEM,
3920 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3921 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3922 + };
3923 +
3924 +@@ -4374,7 +4374,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
3925 + .gpl_only = false,
3926 + .ret_type = RET_INTEGER,
3927 + .arg1_type = ARG_PTR_TO_CTX,
3928 +- .arg2_type = ARG_PTR_TO_MEM,
3929 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3930 + .arg3_type = ARG_CONST_SIZE,
3931 + .arg4_type = ARG_ANYTHING,
3932 + };
3933 +@@ -4400,7 +4400,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
3934 + .gpl_only = false,
3935 + .ret_type = RET_INTEGER,
3936 + .arg1_type = ARG_PTR_TO_CTX,
3937 +- .arg2_type = ARG_PTR_TO_MEM,
3938 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3939 + .arg3_type = ARG_CONST_SIZE,
3940 + };
3941 +
3942 +@@ -4570,7 +4570,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
3943 + .arg1_type = ARG_PTR_TO_CTX,
3944 + .arg2_type = ARG_CONST_MAP_PTR,
3945 + .arg3_type = ARG_ANYTHING,
3946 +- .arg4_type = ARG_PTR_TO_MEM,
3947 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3948 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3949 + };
3950 +
3951 +@@ -4584,7 +4584,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = {
3952 + .arg1_btf_id = &bpf_xdp_output_btf_ids[0],
3953 + .arg2_type = ARG_CONST_MAP_PTR,
3954 + .arg3_type = ARG_ANYTHING,
3955 +- .arg4_type = ARG_PTR_TO_MEM,
3956 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3957 + .arg5_type = ARG_CONST_SIZE_OR_ZERO,
3958 + };
3959 +
3960 +@@ -5072,7 +5072,7 @@ const struct bpf_func_proto bpf_sk_setsockopt_proto = {
3961 + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
3962 + .arg2_type = ARG_ANYTHING,
3963 + .arg3_type = ARG_ANYTHING,
3964 +- .arg4_type = ARG_PTR_TO_MEM,
3965 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3966 + .arg5_type = ARG_CONST_SIZE,
3967 + };
3968 +
3969 +@@ -5106,7 +5106,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
3970 + .arg1_type = ARG_PTR_TO_CTX,
3971 + .arg2_type = ARG_ANYTHING,
3972 + .arg3_type = ARG_ANYTHING,
3973 +- .arg4_type = ARG_PTR_TO_MEM,
3974 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3975 + .arg5_type = ARG_CONST_SIZE,
3976 + };
3977 +
3978 +@@ -5140,7 +5140,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
3979 + .arg1_type = ARG_PTR_TO_CTX,
3980 + .arg2_type = ARG_ANYTHING,
3981 + .arg3_type = ARG_ANYTHING,
3982 +- .arg4_type = ARG_PTR_TO_MEM,
3983 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3984 + .arg5_type = ARG_CONST_SIZE,
3985 + };
3986 +
3987 +@@ -5315,7 +5315,7 @@ static const struct bpf_func_proto bpf_bind_proto = {
3988 + .gpl_only = false,
3989 + .ret_type = RET_INTEGER,
3990 + .arg1_type = ARG_PTR_TO_CTX,
3991 +- .arg2_type = ARG_PTR_TO_MEM,
3992 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
3993 + .arg3_type = ARG_CONST_SIZE,
3994 + };
3995 +
3996 +@@ -5903,7 +5903,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
3997 + .ret_type = RET_INTEGER,
3998 + .arg1_type = ARG_PTR_TO_CTX,
3999 + .arg2_type = ARG_ANYTHING,
4000 +- .arg3_type = ARG_PTR_TO_MEM,
4001 ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4002 + .arg4_type = ARG_CONST_SIZE
4003 + };
4004 +
4005 +@@ -5913,7 +5913,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
4006 + .ret_type = RET_INTEGER,
4007 + .arg1_type = ARG_PTR_TO_CTX,
4008 + .arg2_type = ARG_ANYTHING,
4009 +- .arg3_type = ARG_PTR_TO_MEM,
4010 ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4011 + .arg4_type = ARG_CONST_SIZE
4012 + };
4013 +
4014 +@@ -5956,7 +5956,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
4015 + .ret_type = RET_INTEGER,
4016 + .arg1_type = ARG_PTR_TO_CTX,
4017 + .arg2_type = ARG_ANYTHING,
4018 +- .arg3_type = ARG_PTR_TO_MEM,
4019 ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4020 + .arg4_type = ARG_CONST_SIZE
4021 + };
4022 +
4023 +@@ -6044,7 +6044,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
4024 + .ret_type = RET_INTEGER,
4025 + .arg1_type = ARG_PTR_TO_CTX,
4026 + .arg2_type = ARG_ANYTHING,
4027 +- .arg3_type = ARG_PTR_TO_MEM,
4028 ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4029 + .arg4_type = ARG_CONST_SIZE
4030 + };
4031 +
4032 +@@ -6269,7 +6269,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
4033 + .pkt_access = true,
4034 + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
4035 + .arg1_type = ARG_PTR_TO_CTX,
4036 +- .arg2_type = ARG_PTR_TO_MEM,
4037 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4038 + .arg3_type = ARG_CONST_SIZE,
4039 + .arg4_type = ARG_ANYTHING,
4040 + .arg5_type = ARG_ANYTHING,
4041 +@@ -6288,7 +6288,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
4042 + .pkt_access = true,
4043 + .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4044 + .arg1_type = ARG_PTR_TO_CTX,
4045 +- .arg2_type = ARG_PTR_TO_MEM,
4046 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4047 + .arg3_type = ARG_CONST_SIZE,
4048 + .arg4_type = ARG_ANYTHING,
4049 + .arg5_type = ARG_ANYTHING,
4050 +@@ -6307,7 +6307,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
4051 + .pkt_access = true,
4052 + .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4053 + .arg1_type = ARG_PTR_TO_CTX,
4054 +- .arg2_type = ARG_PTR_TO_MEM,
4055 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4056 + .arg3_type = ARG_CONST_SIZE,
4057 + .arg4_type = ARG_ANYTHING,
4058 + .arg5_type = ARG_ANYTHING,
4059 +@@ -6344,7 +6344,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
4060 + .pkt_access = true,
4061 + .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4062 + .arg1_type = ARG_PTR_TO_CTX,
4063 +- .arg2_type = ARG_PTR_TO_MEM,
4064 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4065 + .arg3_type = ARG_CONST_SIZE,
4066 + .arg4_type = ARG_ANYTHING,
4067 + .arg5_type = ARG_ANYTHING,
4068 +@@ -6367,7 +6367,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
4069 + .pkt_access = true,
4070 + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
4071 + .arg1_type = ARG_PTR_TO_CTX,
4072 +- .arg2_type = ARG_PTR_TO_MEM,
4073 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4074 + .arg3_type = ARG_CONST_SIZE,
4075 + .arg4_type = ARG_ANYTHING,
4076 + .arg5_type = ARG_ANYTHING,
4077 +@@ -6390,7 +6390,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
4078 + .pkt_access = true,
4079 + .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4080 + .arg1_type = ARG_PTR_TO_CTX,
4081 +- .arg2_type = ARG_PTR_TO_MEM,
4082 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4083 + .arg3_type = ARG_CONST_SIZE,
4084 + .arg4_type = ARG_ANYTHING,
4085 + .arg5_type = ARG_ANYTHING,
4086 +@@ -6409,7 +6409,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
4087 + .gpl_only = false,
4088 + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
4089 + .arg1_type = ARG_PTR_TO_CTX,
4090 +- .arg2_type = ARG_PTR_TO_MEM,
4091 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4092 + .arg3_type = ARG_CONST_SIZE,
4093 + .arg4_type = ARG_ANYTHING,
4094 + .arg5_type = ARG_ANYTHING,
4095 +@@ -6428,7 +6428,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
4096 + .gpl_only = false,
4097 + .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4098 + .arg1_type = ARG_PTR_TO_CTX,
4099 +- .arg2_type = ARG_PTR_TO_MEM,
4100 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4101 + .arg3_type = ARG_CONST_SIZE,
4102 + .arg4_type = ARG_ANYTHING,
4103 + .arg5_type = ARG_ANYTHING,
4104 +@@ -6447,7 +6447,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
4105 + .gpl_only = false,
4106 + .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4107 + .arg1_type = ARG_PTR_TO_CTX,
4108 +- .arg2_type = ARG_PTR_TO_MEM,
4109 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4110 + .arg3_type = ARG_CONST_SIZE,
4111 + .arg4_type = ARG_ANYTHING,
4112 + .arg5_type = ARG_ANYTHING,
4113 +@@ -6769,9 +6769,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
4114 + .pkt_access = true,
4115 + .ret_type = RET_INTEGER,
4116 + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
4117 +- .arg2_type = ARG_PTR_TO_MEM,
4118 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4119 + .arg3_type = ARG_CONST_SIZE,
4120 +- .arg4_type = ARG_PTR_TO_MEM,
4121 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4122 + .arg5_type = ARG_CONST_SIZE,
4123 + };
4124 +
4125 +@@ -6838,9 +6838,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
4126 + .pkt_access = true,
4127 + .ret_type = RET_INTEGER,
4128 + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
4129 +- .arg2_type = ARG_PTR_TO_MEM,
4130 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4131 + .arg3_type = ARG_CONST_SIZE,
4132 +- .arg4_type = ARG_PTR_TO_MEM,
4133 ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4134 + .arg5_type = ARG_CONST_SIZE,
4135 + };
4136 +
4137 +@@ -7069,7 +7069,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
4138 + .gpl_only = false,
4139 + .ret_type = RET_INTEGER,
4140 + .arg1_type = ARG_PTR_TO_CTX,
4141 +- .arg2_type = ARG_PTR_TO_MEM,
4142 ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4143 + .arg3_type = ARG_CONST_SIZE,
4144 + .arg4_type = ARG_ANYTHING,
4145 + };
4146 +diff --git a/net/core/sock_map.c b/net/core/sock_map.c
4147 +index 8288b5382f08d..6351b6af7aca9 100644
4148 +--- a/net/core/sock_map.c
4149 ++++ b/net/core/sock_map.c
4150 +@@ -1575,7 +1575,7 @@ static struct bpf_iter_reg sock_map_iter_reg = {
4151 + .ctx_arg_info_size = 2,
4152 + .ctx_arg_info = {
4153 + { offsetof(struct bpf_iter__sockmap, key),
4154 +- PTR_TO_RDONLY_BUF_OR_NULL },
4155 ++ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
4156 + { offsetof(struct bpf_iter__sockmap, sk),
4157 + PTR_TO_BTF_ID_OR_NULL },
4158 + },
4159 +diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
4160 +index cf3acfa5a91d5..69455fe90ac3e 100644
4161 +--- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
4162 ++++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
4163 +@@ -7,6 +7,7 @@
4164 + #include "test_ksyms_btf.skel.h"
4165 + #include "test_ksyms_btf_null_check.skel.h"
4166 + #include "test_ksyms_weak.skel.h"
4167 ++#include "test_ksyms_btf_write_check.skel.h"
4168 +
4169 + static int duration;
4170 +
4171 +@@ -109,6 +110,16 @@ cleanup:
4172 + test_ksyms_weak__destroy(skel);
4173 + }
4174 +
4175 ++static void test_write_check(void)
4176 ++{
4177 ++ struct test_ksyms_btf_write_check *skel;
4178 ++
4179 ++ skel = test_ksyms_btf_write_check__open_and_load();
4180 ++ ASSERT_ERR_PTR(skel, "unexpected load of a prog writing to ksym memory\n");
4181 ++
4182 ++ test_ksyms_btf_write_check__destroy(skel);
4183 ++}
4184 ++
4185 + void test_ksyms_btf(void)
4186 + {
4187 + int percpu_datasec;
4188 +@@ -136,4 +147,7 @@ void test_ksyms_btf(void)
4189 +
4190 + if (test__start_subtest("weak_ksyms"))
4191 + test_weak_syms();
4192 ++
4193 ++ if (test__start_subtest("write_check"))
4194 ++ test_write_check();
4195 + }
4196 +diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c
4197 +new file mode 100644
4198 +index 0000000000000..2180c41cd890f
4199 +--- /dev/null
4200 ++++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c
4201 +@@ -0,0 +1,29 @@
4202 ++// SPDX-License-Identifier: GPL-2.0
4203 ++/* Copyright (c) 2021 Google */
4204 ++
4205 ++#include "vmlinux.h"
4206 ++
4207 ++#include <bpf/bpf_helpers.h>
4208 ++
4209 ++extern const int bpf_prog_active __ksym; /* int type global var. */
4210 ++
4211 ++SEC("raw_tp/sys_enter")
4212 ++int handler(const void *ctx)
4213 ++{
4214 ++ int *active;
4215 ++ __u32 cpu;
4216 ++
4217 ++ cpu = bpf_get_smp_processor_id();
4218 ++ active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
4219 ++ if (active) {
4220 ++ /* Kernel memory obtained from bpf_{per,this}_cpu_ptr
4221 ++ * is read-only, should _not_ pass verification.
4222 ++ */
4223 ++ /* WRITE_ONCE */
4224 ++ *(volatile int *)active = -1;
4225 ++ }
4226 ++
4227 ++ return 0;
4228 ++}
4229 ++
4230 ++char _license[] SEC("license") = "GPL";
4231 +diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
4232 +index 336a749673d19..2e701e7f69680 100644
4233 +--- a/tools/testing/selftests/bpf/verifier/calls.c
4234 ++++ b/tools/testing/selftests/bpf/verifier/calls.c
4235 +@@ -107,6 +107,25 @@
4236 + .result = REJECT,
4237 + .errstr = "R0 min value is outside of the allowed memory range",
4238 + },
4239 ++{
4240 ++ "calls: trigger reg2btf_ids[reg->type] for reg->type > __BPF_REG_TYPE_MAX",
4241 ++ .insns = {
4242 ++ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
4243 ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
4244 ++ BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0),
4245 ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
4246 ++ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
4247 ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
4248 ++ BPF_EXIT_INSN(),
4249 ++ },
4250 ++ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
4251 ++ .result = REJECT,
4252 ++ .errstr = "arg#0 pointer type STRUCT prog_test_ref_kfunc must point",
4253 ++ .fixup_kfunc_btf_id = {
4254 ++ { "bpf_kfunc_call_test_acquire", 3 },
4255 ++ { "bpf_kfunc_call_test_release", 5 },
4256 ++ },
4257 ++},
4258 + {
4259 + "calls: overlapping caller/callee",
4260 + .insns = {