commit:     aa3aade4f155b96481a44b6733e806c8181271cc
Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
AuthorDate: Sun May  1 17:02:58 2022 +0000
Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
CommitDate: Sun May  1 17:02:58 2022 +0000
URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=aa3aade4

Linux patch 5.15.37

Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>

 0000_README              |    4 +
 1036_linux-5.15.37.patch | 4223 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 4227 insertions(+)

diff --git a/0000_README b/0000_README
index 0f44e39b..cb4266b1 100644
--- a/0000_README
+++ b/0000_README
@@ -187,6 +187,10 @@ Patch: 1035_linux-5.15.36.patch
From: http://www.kernel.org
Desc: Linux 5.15.36

+Patch: 1036_linux-5.15.37.patch
+From: http://www.kernel.org
+Desc: Linux 5.15.37
+
Patch: 1500_XATTR_USER_PREFIX.patch
From: https://bugs.gentoo.org/show_bug.cgi?id=470644
Desc: Support for namespace user.pax.* on tmpfs.

diff --git a/1036_linux-5.15.37.patch b/1036_linux-5.15.37.patch
new file mode 100644
index 00000000..b9d4c0ea
--- /dev/null
+++ b/1036_linux-5.15.37.patch
@@ -0,0 +1,4223 @@
+diff --git a/Makefile b/Makefile
+index e0710f9837847..50b1688a4ca2c 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,7 +1,7 @@
+ # SPDX-License-Identifier: GPL-2.0
+ VERSION = 5
+ PATCHLEVEL = 15
+-SUBLEVEL = 36
++SUBLEVEL = 37
+ EXTRAVERSION =
+ NAME = Trick or Treat
+
+diff --git a/arch/arm/boot/dts/socfpga.dtsi b/arch/arm/boot/dts/socfpga.dtsi
+index 0b021eef0b538..7c1d6423d7f8c 100644
+--- a/arch/arm/boot/dts/socfpga.dtsi
++++ b/arch/arm/boot/dts/socfpga.dtsi
+@@ -782,7 +782,7 @@
+ };
+
+ qspi: spi@ff705000 {
+- compatible = "cdns,qspi-nor";
++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0xff705000 0x1000>,
+diff --git a/arch/arm/boot/dts/socfpga_arria10.dtsi b/arch/arm/boot/dts/socfpga_arria10.dtsi
+index a574ea91d9d3f..3ba431dfa8c94 100644
+--- a/arch/arm/boot/dts/socfpga_arria10.dtsi
++++ b/arch/arm/boot/dts/socfpga_arria10.dtsi
+@@ -756,7 +756,7 @@
+ };
+
+ qspi: spi@ff809000 {
+- compatible = "cdns,qspi-nor";
++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0xff809000 0x100>,
+diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
+index d301ac0d406bf..3ec301bd08a91 100644
+--- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
++++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
+@@ -594,7 +594,7 @@
+ };
+
+ qspi: spi@ff8d2000 {
+- compatible = "cdns,qspi-nor";
++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0xff8d2000 0x100>,
+diff --git a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi
+index de1e98c99ec5b..f4270cf189962 100644
+--- a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi
++++ b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi
+@@ -628,7 +628,7 @@
+ };
+
+ qspi: spi@ff8d2000 {
+- compatible = "cdns,qspi-nor";
++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0xff8d2000 0x100>,
+diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
+index d89cf802d9aa7..6568823cf3063 100644
+--- a/arch/powerpc/kernel/kvm.c
++++ b/arch/powerpc/kernel/kvm.c
+@@ -669,7 +669,8 @@ static void __init kvm_use_magic_page(void)
+ on_each_cpu(kvm_map_magic_page, &features, 1);
+
+ /* Quick self-test to see if the mapping works */
+- if (fault_in_pages_readable((const char *)KVM_MAGIC_PAGE, sizeof(u32))) {
++ if (fault_in_readable((const char __user *)KVM_MAGIC_PAGE,
++ sizeof(u32))) {
+ kvm_patching_worked = false;
+ return;
+ }
+diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
+index f2da879264bcd..3e053e2fd6b69 100644
+--- a/arch/powerpc/kernel/signal_32.c
++++ b/arch/powerpc/kernel/signal_32.c
+@@ -1048,7 +1048,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
+ if (new_ctx == NULL)
+ return 0;
+ if (!access_ok(new_ctx, ctx_size) ||
+- fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
++ fault_in_readable((char __user *)new_ctx, ctx_size))
+ return -EFAULT;
+
+ /*
+@@ -1239,7 +1239,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx,
+ #endif
+
+ if (!access_ok(ctx, sizeof(*ctx)) ||
+- fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx)))
++ fault_in_readable((char __user *)ctx, sizeof(*ctx)))
+ return -EFAULT;
+
+ /*
+diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
+index bb9c077ac1322..d1e1fc0acbea3 100644
+--- a/arch/powerpc/kernel/signal_64.c
++++ b/arch/powerpc/kernel/signal_64.c
+@@ -688,7 +688,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
+ if (new_ctx == NULL)
+ return 0;
+ if (!access_ok(new_ctx, ctx_size) ||
+- fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
++ fault_in_readable((char __user *)new_ctx, ctx_size))
+ return -EFAULT;
+
+ /*
+diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
+index 831b25c5e7058..7f71bd4dcd0d6 100644
+--- a/arch/x86/kernel/fpu/signal.c
++++ b/arch/x86/kernel/fpu/signal.c
+@@ -205,7 +205,7 @@ retry:
+ fpregs_unlock();
+
+ if (ret) {
+- if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size))
++ if (!fault_in_writeable(buf_fx, fpu_user_xstate_size))
+ goto retry;
+ return -EFAULT;
+ }
+@@ -278,10 +278,9 @@ retry:
+ if (ret != -EFAULT)
+ return -EINVAL;
+
+- ret = fault_in_pages_readable(buf, size);
+- if (!ret)
++ if (!fault_in_readable(buf, size))
+ goto retry;
+- return ret;
++ return -EFAULT;
+ }
+
+ /*
+diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
+index ab3e37aa1830c..f93cb989241cc 100644
+--- a/drivers/block/Kconfig
++++ b/drivers/block/Kconfig
+@@ -33,6 +33,22 @@ config BLK_DEV_FD
+ To compile this driver as a module, choose M here: the
+ module will be called floppy.
+
++config BLK_DEV_FD_RAWCMD
++ bool "Support for raw floppy disk commands (DEPRECATED)"
++ depends on BLK_DEV_FD
++ help
++ If you want to use actual physical floppies and expect to do
++ special low-level hardware accesses to them (access and use
++ non-standard formats, for example), then enable this.
++
++ Note that the code enabled by this option is rarely used and
++ might be unstable or insecure, and distros should not enable it.
++
++ Note: FDRAWCMD is deprecated and will be removed from the kernel
++ in the near future.
++
++ If unsure, say N.
++
+ config AMIGA_FLOPPY
+ tristate "Amiga floppy support"
+ depends on AMIGA
+diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
+index 0f58594c5a4d6..1c152b542a52d 100644
+--- a/drivers/block/floppy.c
++++ b/drivers/block/floppy.c
+@@ -2984,6 +2984,8 @@ static const char *drive_name(int type, int drive)
+ return "(null)";
+ }
+
++#ifdef CONFIG_BLK_DEV_FD_RAWCMD
++
+ /* raw commands */
+ static void raw_cmd_done(int flag)
+ {
+@@ -3183,6 +3185,35 @@ static int raw_cmd_ioctl(int cmd, void __user *param)
+ return ret;
+ }
+
++static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
++ void __user *param)
++{
++ int ret;
++
++ pr_warn_once("Note: FDRAWCMD is deprecated and will be removed from the kernel in the near future.\n");
++
++ if (type)
++ return -EINVAL;
++ if (lock_fdc(drive))
++ return -EINTR;
++ set_floppy(drive);
++ ret = raw_cmd_ioctl(cmd, param);
++ if (ret == -EINTR)
++ return -EINTR;
++ process_fd_request();
++ return ret;
++}
++
++#else /* CONFIG_BLK_DEV_FD_RAWCMD */
++
++static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
++ void __user *param)
++{
++ return -EOPNOTSUPP;
++}
++
++#endif
++
+ static int invalidate_drive(struct block_device *bdev)
+ {
+ /* invalidate the buffer track to force a reread */
+@@ -3371,7 +3402,6 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
+ {
+ int drive = (long)bdev->bd_disk->private_data;
+ int type = ITYPE(drive_state[drive].fd_device);
+- int i;
+ int ret;
+ int size;
+ union inparam {
+@@ -3522,16 +3552,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
+ outparam = &write_errors[drive];
+ break;
+ case FDRAWCMD:
+- if (type)
+- return -EINVAL;
+- if (lock_fdc(drive))
+- return -EINTR;
+- set_floppy(drive);
+- i = raw_cmd_ioctl(cmd, (void __user *)param);
+- if (i == -EINTR)
+- return -EINTR;
+- process_fd_request();
+- return i;
++ return floppy_raw_cmd_ioctl(type, drive, cmd, (void __user *)param);
+ case FDTWADDLE:
+ if (lock_fdc(drive))
+ return -EINTR;
+diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
+index 21909642ee4ca..8fbb25913327c 100644
+--- a/drivers/gpu/drm/armada/armada_gem.c
++++ b/drivers/gpu/drm/armada/armada_gem.c
+@@ -336,7 +336,7 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
+ struct drm_armada_gem_pwrite *args = data;
+ struct armada_gem_object *dobj;
+ char __user *ptr;
+- int ret;
++ int ret = 0;
+
+ DRM_DEBUG_DRIVER("handle %u off %u size %u ptr 0x%llx\n",
+ args->handle, args->offset, args->size, args->ptr);
+@@ -349,9 +349,8 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
+ if (!access_ok(ptr, args->size))
+ return -EFAULT;
+
+- ret = fault_in_pages_readable(ptr, args->size);
+- if (ret)
+- return ret;
++ if (fault_in_readable(ptr, args->size))
++ return -EFAULT;
+
+ dobj = armada_gem_object_lookup(file, args->handle);
+ if (dobj == NULL)
+diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c
+index 75680eecd2f7d..2714ba02b176b 100644
+--- a/drivers/spi/spi-cadence-quadspi.c
++++ b/drivers/spi/spi-cadence-quadspi.c
+@@ -36,6 +36,7 @@
+ /* Quirks */
+ #define CQSPI_NEEDS_WR_DELAY BIT(0)
+ #define CQSPI_DISABLE_DAC_MODE BIT(1)
++#define CQSPI_NO_SUPPORT_WR_COMPLETION BIT(3)
+
+ /* Capabilities */
+ #define CQSPI_SUPPORTS_OCTAL BIT(0)
+@@ -83,6 +84,7 @@ struct cqspi_st {
+ u32 wr_delay;
+ bool use_direct_mode;
+ struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT];
++ bool wr_completion;
+ };
+
+ struct cqspi_driver_platdata {
+@@ -797,9 +799,11 @@ static int cqspi_write_setup(struct cqspi_flash_pdata *f_pdata,
+ * polling on the controller's side. spinand and spi-nor will take
+ * care of polling the status register.
+ */
+- reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
+- reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL;
+- writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
++ if (cqspi->wr_completion) {
++ reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
++ reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL;
++ writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
++ }
+
+ reg = readl(reg_base + CQSPI_REG_SIZE);
+ reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK;
+@@ -1532,6 +1536,10 @@ static int cqspi_probe(struct platform_device *pdev)
+
+ cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk);
+ master->max_speed_hz = cqspi->master_ref_clk_hz;
++
++ /* write completion is supported by default */
++ cqspi->wr_completion = true;
++
+ ddata = of_device_get_match_data(dev);
+ if (ddata) {
+ if (ddata->quirks & CQSPI_NEEDS_WR_DELAY)
+@@ -1541,6 +1549,8 @@ static int cqspi_probe(struct platform_device *pdev)
+ master->mode_bits |= SPI_RX_OCTAL | SPI_TX_OCTAL;
+ if (!(ddata->quirks & CQSPI_DISABLE_DAC_MODE))
+ cqspi->use_direct_mode = true;
++ if (ddata->quirks & CQSPI_NO_SUPPORT_WR_COMPLETION)
++ cqspi->wr_completion = false;
+ }
+
+ ret = devm_request_irq(dev, irq, cqspi_irq_handler, 0,
+@@ -1649,6 +1659,10 @@ static const struct cqspi_driver_platdata intel_lgm_qspi = {
+ .quirks = CQSPI_DISABLE_DAC_MODE,
+ };
+
++static const struct cqspi_driver_platdata socfpga_qspi = {
++ .quirks = CQSPI_NO_SUPPORT_WR_COMPLETION,
++};
++
+ static const struct of_device_id cqspi_dt_ids[] = {
+ {
+ .compatible = "cdns,qspi-nor",
+@@ -1666,6 +1680,10 @@ static const struct of_device_id cqspi_dt_ids[] = {
+ .compatible = "intel,lgm-qspi",
+ .data = &intel_lgm_qspi,
+ },
++ {
++ .compatible = "intel,socfpga-qspi",
++ .data = (void *)&socfpga_qspi,
++ },
+ { /* end of table */ }
+ };
+
+diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
+index dc1e4d1b72914..ff578c934bbcf 100644
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -1709,7 +1709,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
+ * Fault pages before locking them in prepare_pages
+ * to avoid recursive lock
+ */
+- if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
++ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
+ ret = -EFAULT;
+ break;
+ }
+@@ -1903,16 +1903,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+
+ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+ {
++ const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ loff_t pos;
+ ssize_t written = 0;
+ ssize_t written_buffered;
++ size_t prev_left = 0;
+ loff_t endbyte;
+ ssize_t err;
+ unsigned int ilock_flags = 0;
+- struct iomap_dio *dio = NULL;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ ilock_flags |= BTRFS_ILOCK_TRY;
+@@ -1955,23 +1956,80 @@ relock:
+ goto buffered;
+ }
+
+- dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+- 0);
++ /*
++ * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
++ * calls generic_write_sync() (through iomap_dio_complete()), because
++ * that results in calling fsync (btrfs_sync_file()) which will try to
++ * lock the inode in exclusive/write mode.
++ */
++ if (is_sync_write)
++ iocb->ki_flags &= ~IOCB_DSYNC;
+
+- btrfs_inode_unlock(inode, ilock_flags);
++ /*
++ * The iov_iter can be mapped to the same file range we are writing to.
++ * If that's the case, then we will deadlock in the iomap code, because
++ * it first calls our callback btrfs_dio_iomap_begin(), which will create
++ * an ordered extent, and after that it will fault in the pages that the
++ * iov_iter refers to. During the fault in we end up in the readahead
++ * pages code (starting at btrfs_readahead()), which will lock the range,
++ * find that ordered extent and then wait for it to complete (at
++ * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
++ * obviously the ordered extent can never complete as we didn't submit
++ * yet the respective bio(s). This always happens when the buffer is
++ * memory mapped to the same file range, since the iomap DIO code always
++ * invalidates pages in the target file range (after starting and waiting
++ * for any writeback).
++ *
++ * So here we disable page faults in the iov_iter and then retry if we
++ * got -EFAULT, faulting in the pages before the retry.
++ */
++again:
++ from->nofault = true;
++ err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
++ IOMAP_DIO_PARTIAL, written);
++ from->nofault = false;
+
+- if (IS_ERR_OR_NULL(dio)) {
+- err = PTR_ERR_OR_ZERO(dio);
+- if (err < 0 && err != -ENOTBLK)
+- goto out;
+- } else {
+- written = iomap_dio_complete(dio);
++ /* No increment (+=) because iomap returns a cumulative value. */
++ if (err > 0)
++ written = err;
++
++ if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
++ const size_t left = iov_iter_count(from);
++ /*
++ * We have more data left to write. Try to fault in as many as
++ * possible of the remainder pages and retry. We do this without
++ * releasing and locking again the inode, to prevent races with
++ * truncate.
++ *
++ * Also, in case the iov refers to pages in the file range of the
++ * file we want to write to (due to a mmap), we could enter an
++ * infinite loop if we retry after faulting the pages in, since
++ * iomap will invalidate any pages in the range early on, before
++ * it tries to fault in the pages of the iov. So we keep track of
++ * how much was left of iov in the previous EFAULT and fallback
++ * to buffered IO in case we haven't made any progress.
++ */
++ if (left == prev_left) {
++ err = -ENOTBLK;
++ } else {
++ fault_in_iov_iter_readable(from, left);
++ prev_left = left;
++ goto again;
++ }
+ }
+
+- if (written < 0 || !iov_iter_count(from)) {
+- err = written;
++ btrfs_inode_unlock(inode, ilock_flags);
++
++ /*
++ * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
++ * the fsync (call generic_write_sync()).
++ */
++ if (is_sync_write)
++ iocb->ki_flags |= IOCB_DSYNC;
++
++ /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
++ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
+ goto out;
+- }
+
+ buffered:
+ pos = iocb->ki_pos;
+@@ -1996,7 +2054,7 @@ buffered:
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+ out:
+- return written ? written : err;
++ return err < 0 ? err : written;
+ }
+
+ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
+@@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
+ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+ {
+ struct inode *inode = file_inode(iocb->ki_filp);
++ size_t prev_left = 0;
++ ssize_t read = 0;
+ ssize_t ret;
+
+ if (fsverity_active(inode))
+@@ -3668,9 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+ return 0;
+
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
++again:
++ /*
++ * This is similar to what we do for direct IO writes, see the comment
++ * at btrfs_direct_write(), but we also disable page faults in addition
++ * to disabling them only at the iov_iter level. This is because when
++ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
++ * which can still trigger page fault ins despite having set ->nofault
++ * to true of our 'to' iov_iter.
++ *
++ * The difference to direct IO writes is that we deadlock when trying
++ * to lock the extent range in the inode's tree during he page reads
++ * triggered by the fault in (while for writes it is due to waiting for
++ * our own ordered extent). This is because for direct IO reads,
++ * btrfs_dio_iomap_begin() returns with the extent range locked, which
++ * is only unlocked in the endio callback (end_bio_extent_readpage()).
++ */
++ pagefault_disable();
++ to->nofault = true;
++ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
++ IOMAP_DIO_PARTIAL, read);
++ to->nofault = false;
++ pagefault_enable();
++
++ /* No increment (+=) because iomap returns a cumulative value. */
++ if (ret > 0)
++ read = ret;
++
++ if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
++ const size_t left = iov_iter_count(to);
++
++ if (left == prev_left) {
++ /*
++ * We didn't make any progress since the last attempt,
++ * fallback to a buffered read for the remainder of the
++ * range. This is just to avoid any possibility of looping
++ * for too long.
++ */
++ ret = read;
++ } else {
++ /*
++ * We made some progress since the last retry or this is
++ * the first time we are retrying. Fault in as many pages
++ * as possible and retry.
++ */
++ fault_in_iov_iter_writeable(to, left);
++ prev_left = left;
++ goto again;
++ }
++ }
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+- return ret;
++ return ret < 0 ? ret : read;
+ }
+
+ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 6266a706bff7d..044d584c3467c 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -7961,6 +7961,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ }
+
+ len = min(len, em->len - (start - em->start));
++
++ /*
++ * If we have a NOWAIT request and the range contains multiple extents
++ * (or a mix of extents and holes), then we return -EAGAIN to make the
++ * caller fallback to a context where it can do a blocking (without
++ * NOWAIT) request. This way we avoid doing partial IO and returning
++ * success to the caller, which is not optimal for writes and for reads
++ * it can result in unexpected behaviour for an application.
++ *
++ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
++ * iomap_dio_rw(), we can end up returning less data then what the caller
++ * asked for, resulting in an unexpected, and incorrect, short read.
++ * That is, the caller asked to read N bytes and we return less than that,
++ * which is wrong unless we are crossing EOF. This happens if we get a
++ * page fault error when trying to fault in pages for the buffer that is
++ * associated to the struct iov_iter passed to iomap_dio_rw(), and we
++ * have previously submitted bios for other extents in the range, in
++ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
++ * those bios have completed by the time we get the page fault error,
++ * which we return back to our caller - we should only return EIOCBQUEUED
++ * after we have submitted bios for all the extents in the range.
++ */
++ if ((flags & IOMAP_NOWAIT) && len < length) {
++ free_extent_map(em);
++ ret = -EAGAIN;
++ goto unlock_err;
++ }
++
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, len);
+diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
+index 6a863b3f6de03..bf53af8694f8e 100644
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -2258,9 +2258,8 @@ static noinline int search_ioctl(struct inode *inode,
+ key.offset = sk->min_offset;
+
+ while (1) {
+- ret = fault_in_pages_writeable(ubuf + sk_offset,
+- *buf_size - sk_offset);
+- if (ret)
++ ret = -EFAULT;
++ if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
+ break;
+
+ ret = btrfs_search_forward(root, &key, path, sk->min_transid);
+diff --git a/fs/erofs/data.c b/fs/erofs/data.c
+index 9db8297156527..16a41d0db55a3 100644
+--- a/fs/erofs/data.c
++++ b/fs/erofs/data.c
+@@ -287,7 +287,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+
+ if (!err)
+ return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
+- NULL, 0);
++ NULL, 0, 0);
+ if (err < 0)
+ return err;
+ }
+diff --git a/fs/ext4/file.c b/fs/ext4/file.c
+index ac0e11bbb4450..b25c1f8f7c4f1 100644
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -74,7 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
+ return generic_file_read_iter(iocb, to);
+ }
+
+- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
++ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0);
+ inode_unlock_shared(inode);
+
+ file_accessed(iocb->ki_filp);
+@@ -566,7 +566,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ if (ilock_shared)
+ iomap_ops = &ext4_iomap_overwrite_ops;
+ ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
+- (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
++ (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
++ 0);
+ if (ret == -ENOTBLK)
+ ret = 0;
+
+diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
+index 0e14dc41ed4e6..8ef92719c6799 100644
+--- a/fs/f2fs/file.c
++++ b/fs/f2fs/file.c
+@@ -4279,7 +4279,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ size_t target_size = 0;
+ int err;
+
+- if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
++ if (fault_in_iov_iter_readable(from, iov_iter_count(from)))
+ set_inode_flag(inode, FI_NO_PREALLOC);
+
+ if ((iocb->ki_flags & IOCB_NOWAIT)) {
+diff --git a/fs/fuse/file.c b/fs/fuse/file.c
+index bc50a9fa84a0c..71e9e301e569d 100644
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -1164,7 +1164,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
+
+ again:
+ err = -EFAULT;
+- if (iov_iter_fault_in_readable(ii, bytes))
++ if (fault_in_iov_iter_readable(ii, bytes))
+ break;
+
+ err = -ENOMEM;
+diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
+index bb9014ced702a..fbdb7a30470a3 100644
+--- a/fs/gfs2/bmap.c
++++ b/fs/gfs2/bmap.c
+@@ -961,46 +961,6 @@ hole_found:
+ goto out;
+ }
+
+-static int gfs2_write_lock(struct inode *inode)
+-{
+- struct gfs2_inode *ip = GFS2_I(inode);
+- struct gfs2_sbd *sdp = GFS2_SB(inode);
+- int error;
+-
+- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+- error = gfs2_glock_nq(&ip->i_gh);
+- if (error)
+- goto out_uninit;
+- if (&ip->i_inode == sdp->sd_rindex) {
+- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+-
+- error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+- GL_NOCACHE, &m_ip->i_gh);
+- if (error)
+- goto out_unlock;
+- }
+- return 0;
+-
+-out_unlock:
+- gfs2_glock_dq(&ip->i_gh);
+-out_uninit:
+- gfs2_holder_uninit(&ip->i_gh);
+- return error;
+-}
+-
+-static void gfs2_write_unlock(struct inode *inode)
+-{
+- struct gfs2_inode *ip = GFS2_I(inode);
+- struct gfs2_sbd *sdp = GFS2_SB(inode);
+-
+- if (&ip->i_inode == sdp->sd_rindex) {
+- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+-
+- gfs2_glock_dq_uninit(&m_ip->i_gh);
+- }
+- gfs2_glock_dq_uninit(&ip->i_gh);
+-}
+-
+ static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
+ unsigned len)
+ {
+@@ -1118,11 +1078,6 @@ out_qunlock:
+ return ret;
+ }
+
+-static inline bool gfs2_iomap_need_write_lock(unsigned flags)
+-{
+- return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT);
+-}
+-
+ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ unsigned flags, struct iomap *iomap,
+ struct iomap *srcmap)
+@@ -1135,12 +1090,6 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ iomap->flags |= IOMAP_F_BUFFER_HEAD;
+
+ trace_gfs2_iomap_start(ip, pos, length, flags);
+- if (gfs2_iomap_need_write_lock(flags)) {
+- ret = gfs2_write_lock(inode);
+- if (ret)
+- goto out;
+- }
+-
+ ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+ if (ret)
+ goto out_unlock;
+@@ -1168,10 +1117,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
+
+ out_unlock:
+- if (ret && gfs2_iomap_need_write_lock(flags))
+- gfs2_write_unlock(inode);
+ release_metapath(&mp);
+-out:
+ trace_gfs2_iomap_end(ip, iomap, ret);
+ return ret;
+ }
+@@ -1219,15 +1165,11 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ }
+
+ if (unlikely(!written))
+- goto out_unlock;
++ return 0;
+
+ if (iomap->flags & IOMAP_F_SIZE_CHANGED)
+ mark_inode_dirty(inode);
+ set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
+-
+-out_unlock:
+- if (gfs2_iomap_need_write_lock(flags))
+- gfs2_write_unlock(inode);
+ return 0;
+ }
+
+diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
+index 1c8b747072cba..247b8d95b5ef4 100644
+--- a/fs/gfs2/file.c
++++ b/fs/gfs2/file.c
+@@ -777,27 +777,99 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
+ return ret ? ret : ret1;
+ }
+
++static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
++ size_t *prev_count,
++ size_t *window_size)
++{
++ char __user *p = i->iov[0].iov_base + i->iov_offset;
++ size_t count = iov_iter_count(i);
++ int pages = 1;
++
++ if (likely(!count))
++ return false;
++ if (ret <= 0 && ret != -EFAULT)
++ return false;
++ if (!iter_is_iovec(i))
++ return false;
++
++ if (*prev_count != count || !*window_size) {
++ int pages, nr_dirtied;
++
++ pages = min_t(int, BIO_MAX_VECS,
++ DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE));
++ nr_dirtied = max(current->nr_dirtied_pause -
++ current->nr_dirtied, 1);
++ pages = min(pages, nr_dirtied);
++ }
++
++ *prev_count = count;
++ *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p);
++ return true;
++}
++
+ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
+ struct gfs2_holder *gh)
+ {
+ struct file *file = iocb->ki_filp;
+ struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+- size_t count = iov_iter_count(to);
++ size_t prev_count = 0, window_size = 0;
++ size_t written = 0;
+ ssize_t ret;
+
+- if (!count)
++ /*
++ * In this function, we disable page faults when we're holding the
++ * inode glock while doing I/O. If a page fault occurs, we indicate
++ * that the inode glock may be dropped, fault in the pages manually,
++ * and retry.
++ *
++ * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
++ * physical as well as manual page faults, and we need to disable both
++ * kinds.
++ *
++ * For direct I/O, gfs2 takes the inode glock in deferred mode. This
++ * locking mode is compatible with other deferred holders, so multiple
++ * processes and nodes can do direct I/O to a file at the same time.
++ * There's no guarantee that reads or writes will be atomic. Any
++ * coordination among readers and writers needs to happen externally.
++ */
++
++ if (!iov_iter_count(to))
+ return 0; /* skip atime */
+
+ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
++retry:
+ ret = gfs2_glock_nq(gh);
+ if (ret)
+ goto out_uninit;
++retry_under_glock:
++ pagefault_disable();
++ to->nofault = true;
++ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
++ IOMAP_DIO_PARTIAL, written);
++ to->nofault = false;
++ pagefault_enable();
++ if (ret > 0)
++ written = ret;
++
++ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
++ size_t leftover;
+
+- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
+- gfs2_glock_dq(gh);
++ gfs2_holder_allow_demote(gh);
++ leftover = fault_in_iov_iter_writeable(to, window_size);
++ gfs2_holder_disallow_demote(gh);
++ if (leftover != window_size) {
++ if (!gfs2_holder_queued(gh))
++ goto retry;
++ goto retry_under_glock;
++ }
++ }
++ if (gfs2_holder_queued(gh))
++ gfs2_glock_dq(gh);
+ out_uninit:
+ gfs2_holder_uninit(gh);
+- return ret;
++ if (ret < 0)
++ return ret;
++ return written;
+ }
+
+ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
+@@ -806,10 +878,20 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+- size_t len = iov_iter_count(from);
+- loff_t offset = iocb->ki_pos;
++ size_t prev_count = 0, window_size = 0;
++ size_t read = 0;
+ ssize_t ret;
+
++ /*
++ * In this function, we disable page faults when we're holding the
++ * inode glock while doing I/O. If a page fault occurs, we indicate
++ * that the inode glock may be dropped, fault in the pages manually,
++ * and retry.
++ *
++ * For writes, iomap_dio_rw only triggers manual page faults, so we
++ * don't need to disable physical ones.
++ */
++
+ /*
+ * Deferred lock, even if its a write, since we do no allocation on
+ * this path. All we need to change is the atime, and this lock mode
+@@ -819,31 +901,62 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
+ * VFS does.
+ */
+ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
++retry:
+ ret = gfs2_glock_nq(gh);
+ if (ret)
+ goto out_uninit;
+-
++retry_under_glock:
+ /* Silently fall back to buffered I/O when writing beyond EOF */
+- if (offset + len > i_size_read(&ip->i_inode))
++ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
+ goto out;
+
+- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
++ from->nofault = true;
++ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
++ IOMAP_DIO_PARTIAL, read);
++ from->nofault = false;
++
+ if (ret == -ENOTBLK)
+ ret = 0;
++ if (ret > 0)
++ read = ret;
++
++ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
++ size_t leftover;
++
++ gfs2_holder_allow_demote(gh);
++ leftover = fault_in_iov_iter_readable(from, window_size);
++ gfs2_holder_disallow_demote(gh);
++ if (leftover != window_size) {
++ if (!gfs2_holder_queued(gh))
++ goto retry;
++ goto retry_under_glock;
++ }
++ }
+ out:
+- gfs2_glock_dq(gh);
++ if (gfs2_holder_queued(gh))
++ gfs2_glock_dq(gh);
+ out_uninit:
+ gfs2_holder_uninit(gh);
+- return ret;
++ if (ret < 0)
++ return ret;
++ return read;
+ }
+
+ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+ {
+ struct gfs2_inode *ip;
+ struct gfs2_holder gh;
++ size_t prev_count = 0, window_size = 0;
+ size_t written = 0;
+ ssize_t ret;
+
++ /*
++ * In this function, we disable page faults when we're holding the
++ * inode glock while doing I/O. If a page fault occurs, we indicate
++ * that the inode glock may be dropped, fault in the pages manually,
++ * and retry.
++ */
++
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = gfs2_file_direct_read(iocb, to, &gh);
+ if (likely(ret != -ENOTBLK))
+@@ -865,18 +978,118 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+ }
+ ip = GFS2_I(iocb->ki_filp->f_mapping->host);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
++retry:
+ ret = gfs2_glock_nq(&gh);
+ if (ret)
+ goto out_uninit;
++retry_under_glock:
++ pagefault_disable();
+ ret = generic_file_read_iter(iocb, to);
++ pagefault_enable();
+ if (ret > 0)
+ written += ret;
+- gfs2_glock_dq(&gh);
++
++ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
++ size_t leftover;
++
++ gfs2_holder_allow_demote(&gh);
++ leftover = fault_in_iov_iter_writeable(to, window_size);
++ gfs2_holder_disallow_demote(&gh);
++ if (leftover != window_size) {
++ if (!gfs2_holder_queued(&gh)) {
++ if (written)
++ goto out_uninit;
++ goto retry;
++ }
++ goto retry_under_glock;
++ }
++ }
++ if (gfs2_holder_queued(&gh))
++ gfs2_glock_dq(&gh);
+ out_uninit:
+ gfs2_holder_uninit(&gh);
+ return written ? written : ret;
+ }
+
++static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
++ struct iov_iter *from,
++ struct gfs2_holder *gh)
++{
++ struct file *file = iocb->ki_filp;
++ struct inode *inode = file_inode(file);
++ struct gfs2_inode *ip = GFS2_I(inode);
++ struct gfs2_sbd *sdp = GFS2_SB(inode);
++ struct gfs2_holder *statfs_gh = NULL;
++ size_t prev_count = 0, window_size = 0;
++ size_t read = 0;
++ ssize_t ret;
++
++ /*
++ * In this function, we disable page faults when we're holding the
++ * inode glock while doing I/O. If a page fault occurs, we indicate
++ * that the inode glock may be dropped, fault in the pages manually,
++ * and retry.
++ */
++
++ if (inode == sdp->sd_rindex) {
++ statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
++ if (!statfs_gh)
++ return -ENOMEM;
++ }
++
++ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
++retry:
++ ret = gfs2_glock_nq(gh);
++ if (ret)
++ goto out_uninit;
++retry_under_glock:
++ if (inode == sdp->sd_rindex) {
++ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
++
++ ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
++ GL_NOCACHE, statfs_gh);
++ if (ret)
++ goto out_unlock;
++ }
++
++ current->backing_dev_info = inode_to_bdi(inode);
++ pagefault_disable();
++ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
++ pagefault_enable();
++ current->backing_dev_info = NULL;
++ if (ret > 0) {
++ iocb->ki_pos += ret;
++ read += ret;
++ }
++
++ if (inode == sdp->sd_rindex)
++ gfs2_glock_dq_uninit(statfs_gh);
++
++ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
++ size_t leftover;
++
++ gfs2_holder_allow_demote(gh);
++ leftover = fault_in_iov_iter_readable(from, window_size);
++ gfs2_holder_disallow_demote(gh);
++ if (leftover != window_size) {
++ if (!gfs2_holder_queued(gh)) {
++ if (read)
++ goto out_uninit;
++ goto retry;
++ }
++ goto retry_under_glock;
++ }
++ }
++out_unlock:
++ if (gfs2_holder_queued(gh))
++ gfs2_glock_dq(gh);
++out_uninit:
++ gfs2_holder_uninit(gh);
++ if (statfs_gh)
++ kfree(statfs_gh);
++ return read ? read : ret;
++}
++
+ /**
+ * gfs2_file_write_iter - Perform a write to a file
+ * @iocb: The io context
+@@ -928,9 +1141,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ goto out_unlock;
+
+ iocb->ki_flags |= IOCB_DSYNC;
+- current->backing_dev_info = inode_to_bdi(inode);
+- buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+- current->backing_dev_info = NULL;
++ buffered = gfs2_file_buffered_write(iocb, from, &gh);
+ if (unlikely(buffered <= 0)) {
+ if (!ret)
+ ret = buffered;
+@@ -944,7 +1155,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ * the direct I/O range as we don't know if the buffered pages
+ * made it to disk.
+ */
+- iocb->ki_pos += buffered;
+ ret2 = generic_write_sync(iocb, buffered);
+ invalidate_mapping_pages(mapping,
+ (iocb->ki_pos - buffered) >> PAGE_SHIFT,
+@@ -952,13 +1162,9 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ if (!ret || ret2 > 0)
+ ret += ret2;
+ } else {
+- current->backing_dev_info = inode_to_bdi(inode);
+- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+- current->backing_dev_info = NULL;
+- if (likely(ret > 0)) {
+- iocb->ki_pos += ret;
++ ret = gfs2_file_buffered_write(iocb, from, &gh);
++ if (likely(ret > 0))
+ ret = generic_write_sync(iocb, ret);
+- }
+ }
+
+ out_unlock:
+diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
+index 02cd0ae98208d..e85ef6b14777d 100644
+--- a/fs/gfs2/glock.c
++++ b/fs/gfs2/glock.c
+@@ -58,6 +58,7 @@ struct gfs2_glock_iter {
+ typedef void (*glock_examiner) (struct gfs2_glock * gl);
+
+ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
++static void __gfs2_glock_dq(struct gfs2_holder *gh);
+
+ static struct dentry *gfs2_root;
+ static struct workqueue_struct *glock_workqueue;
+@@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_glock *gl)
+
+ if (gl->gl_state == LM_ST_UNLOCKED)
+ return 0;
++ /*
++ * Note that demote_ok is used for the lru process of disposing of
++ * glocks. For this purpose, we don't care if the glock's holders
++ * have the HIF_MAY_DEMOTE flag set or not. If someone is using
++ * them, don't demote.
++ */
+ if (!list_empty(&gl->gl_holders))
+ return 0;
+ if (glops->go_demote_ok)
+@@ -301,46 +308,59 @@ void gfs2_glock_put(struct gfs2_glock *gl)
+ }
+
+ /**
+- * may_grant - check if its ok to grant a new lock
++ * may_grant - check if it's ok to grant a new lock
+ * @gl: The glock
++ * @current_gh: One of the current holders of @gl
+ * @gh: The lock request which we wish to grant
+ *
+- * Returns: true if its ok to grant the lock
++ * With our current compatibility rules, if a glock has one or more active
++ * holders (HIF_HOLDER flag set), any of those holders can be passed in as
++ * @current_gh; they are all the same as far as compatibility with the new @gh
++ * goes.
++ *
++ * Returns true if it's ok to grant the lock.
+ */
+
+-static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
+-{
+- const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list);
++static inline bool may_grant(struct gfs2_glock *gl,
++ struct gfs2_holder *current_gh,
++ struct gfs2_holder *gh)
++{
++ if (current_gh) {
++ GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, &current_gh->gh_iflags));
++
++ switch(current_gh->gh_state) {
++ case LM_ST_EXCLUSIVE:
++ /*
++ * Here we make a special exception to grant holders
++ * who agree to share the EX lock with other holders
++ * who also have the bit set. If the original holder
++ * has the LM_FLAG_NODE_SCOPE bit set, we grant more
++ * holders with the bit set.
++ */
++ return gh->gh_state == LM_ST_EXCLUSIVE &&
++ (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) &&
++ (gh->gh_flags & LM_FLAG_NODE_SCOPE);
+
+- if (gh != gh_head) {
+- /**
+- * Here we make a special exception to grant holders who agree
+- * to share the EX lock with other holders who also have the
+- * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit
+- * is set, we grant more holders with the bit set.
+- */
+- if (gh_head->gh_state == LM_ST_EXCLUSIVE &&
+- (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) &&
+- gh->gh_state == LM_ST_EXCLUSIVE &&
+- (gh->gh_flags & LM_FLAG_NODE_SCOPE))
+- return 1;
+- if ((gh->gh_state == LM_ST_EXCLUSIVE ||
+- gh_head->gh_state == LM_ST_EXCLUSIVE))
+- return 0;
++ case LM_ST_SHARED:
++ case LM_ST_DEFERRED:
++ return gh->gh_state == current_gh->gh_state;
++
++ default:
++ return false;
++ }
+ }
++
+ if (gl->gl_state == gh->gh_state)
+- return 1;
++ return true;
+ if (gh->gh_flags & GL_EXACT)
+- return 0;
++ return false;
+ if (gl->gl_state == LM_ST_EXCLUSIVE) {
+- if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
+- return 1;
+- if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
+- return 1;
++ return gh->gh_state == LM_ST_SHARED ||
++ gh->gh_state == LM_ST_DEFERRED;
+ }
+- if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
+- return 1;
+- return 0;
++ if (gh->gh_flags & LM_FLAG_ANY)
++ return gl->gl_state != LM_ST_UNLOCKED;
++ return false;
+ }
+
+ static void gfs2_holder_wake(struct gfs2_holder *gh)
+@@ -366,7 +386,7 @@ static void do_error(struct gfs2_glock *gl, const int ret)
+ struct gfs2_holder *gh, *tmp;
+
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
++ if (!test_bit(HIF_WAIT, &gh->gh_iflags))
+ continue;
+ if (ret & LM_OUT_ERROR)
+ gh->gh_error = -EIO;
+@@ -380,6 +400,78 @@ static void do_error(struct gfs2_glock *gl, const int ret)
+ }
+ }
+
++/**
++ * demote_incompat_holders - demote incompatible demoteable holders
++ * @gl: the glock we want to promote
++ * @new_gh: the new holder to be promoted
++ */
++static void demote_incompat_holders(struct gfs2_glock *gl,
++ struct gfs2_holder *new_gh)
++{
++ struct gfs2_holder *gh;
++
++ /*
++ * Demote incompatible holders before we make ourselves eligible.
++ * (This holder may or may not allow auto-demoting, but we don't want
++ * to demote the new holder before it's even granted.)
++ */
++ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
++ /*
++ * Since holders are at the front of the list, we stop when we
++ * find the first non-holder.
++ */
++ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
++ return;
++ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
++ !may_grant(gl, new_gh, gh)) {
++ /*
++ * We should not recurse into do_promote because
++ * __gfs2_glock_dq only calls handle_callback,
++ * gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
++ */
++ __gfs2_glock_dq(gh);
++ }
++ }
++}
++
++/**
++ * find_first_holder - find the first "holder" gh
++ * @gl: the glock
++ */
++
++static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
++{
++ struct gfs2_holder *gh;
++
++ if (!list_empty(&gl->gl_holders)) {
++ gh = list_first_entry(&gl->gl_holders, struct gfs2_holder,
++ gh_list);
++ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
++ return gh;
++ }
++ return NULL;
++}
++
++/**
++ * find_first_strong_holder - find the first non-demoteable holder
++ * @gl: the glock
++ *
++ * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
++ */
++static inline struct gfs2_holder *
++find_first_strong_holder(struct gfs2_glock *gl)
++{
++ struct gfs2_holder *gh;
++
++ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
++ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
++ return NULL;
++ if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
++ return gh;
++ }
++ return NULL;
++}
++
+ /**
+ * do_promote - promote as many requests as possible on the current queue
+ * @gl: The glock
+@@ -393,14 +485,21 @@ __releases(&gl->gl_lockref.lock)
+ __acquires(&gl->gl_lockref.lock)
+ {
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+- struct gfs2_holder *gh, *tmp;
++ struct gfs2_holder *gh, *tmp, *first_gh;
++ bool incompat_holders_demoted = false;
+ int ret;
+
+ restart:
++ first_gh = find_first_strong_holder(gl);
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
++ if (!test_bit(HIF_WAIT, &gh->gh_iflags))
+ continue;
+- if (may_grant(gl, gh)) {
++ if (may_grant(gl, first_gh, gh)) {
++ if (!incompat_holders_demoted) {
++ demote_incompat_holders(gl, first_gh);
++ incompat_holders_demoted = true;
++ first_gh = gh;
++ }
+ if (gh->gh_list.prev == &gl->gl_holders &&
+ glops->go_lock) {
+ spin_unlock(&gl->gl_lockref.lock);
+@@ -426,6 +525,11 @@ restart:
+ gfs2_holder_wake(gh);
+ continue;
+ }
++ /*
++ * If we get here, it means we may not grant this holder for
++ * some reason. If this holder is the head of the list, it
++ * means we have a blocked holder at the head, so return 1.
++ */
+ if (gh->gh_list.prev == &gl->gl_holders)
+ return 1;
+ do_error(gl, 0);
+@@ -722,23 +826,6 @@ out:
+ spin_lock(&gl->gl_lockref.lock);
+ }
+
+-/**
+- * find_first_holder - find the first "holder" gh
+- * @gl: the glock
+- */
+-
+-static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+-{
+- struct gfs2_holder *gh;
+-
+- if (!list_empty(&gl->gl_holders)) {
+- gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
+- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+- return gh;
+- }
+- return NULL;
+-}
+-
+ /**
+ * run_queue - do all outstanding tasks related to a glock
+ * @gl: The glock in question
+@@ -1354,15 +1441,20 @@ __acquires(&gl->gl_lockref.lock)
+ GLOCK_BUG_ON(gl, true);
+
+ if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+- if (test_bit(GLF_LOCK, &gl->gl_flags))
+- try_futile = !may_grant(gl, gh);
++ if (test_bit(GLF_LOCK, &gl->gl_flags)) {
++ struct gfs2_holder *first_gh;
++
++ first_gh = find_first_strong_holder(gl);
++ try_futile = !may_grant(gl, first_gh, gh);
++ }
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+ goto fail;
+ }
+
+ list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
+ if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
+- (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
++ (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
++ !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
+ goto trap_recursive;
+ if (try_futile &&
+ !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
+@@ -1458,51 +1550,83 @@ int gfs2_glock_poll(struct gfs2_holder *gh)
+ return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
+ }
+
+-/**
+- * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+- * @gh: the glock holder
+- *
+- */
++static inline bool needs_demote(struct gfs2_glock *gl)
++{
++ return (test_bit(GLF_DEMOTE, &gl->gl_flags) ||
++ test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags));
++}
+
+-void gfs2_glock_dq(struct gfs2_holder *gh)
++static void __gfs2_glock_dq(struct gfs2_holder *gh)
+ {
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ unsigned delay = 0;
+ int fast_path = 0;
+
+- spin_lock(&gl->gl_lockref.lock);
+ /*
+- * If we're in the process of file system withdraw, we cannot just
+- * dequeue any glocks until our journal is recovered, lest we
+- * introduce file system corruption. We need two exceptions to this
+- * rule: We need to allow unlocking of nondisk glocks and the glock
+- * for our own journal that needs recovery.
++ * This while loop is similar to function demote_incompat_holders:
++ * If the glock is due to be demoted (which may be from another node
++ * or even if this holder is GL_NOCACHE), the weak holders are
++ * demoted as well, allowing the glock to be demoted.
+ */
+- if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
+- glock_blocked_by_withdraw(gl) &&
+- gh->gh_gl != sdp->sd_jinode_gl) {
+- sdp->sd_glock_dqs_held++;
+- spin_unlock(&gl->gl_lockref.lock);
+- might_sleep();
+- wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
+- TASK_UNINTERRUPTIBLE);
+- spin_lock(&gl->gl_lockref.lock);
+- }
+- if (gh->gh_flags & GL_NOCACHE)
+- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
++ while (gh) {
++ /*
++ * If we're in the process of file system withdraw, we cannot
++ * just dequeue any glocks until our journal is recovered, lest
++ * we introduce file system corruption. We need two exceptions
++ * to this rule: We need to allow unlocking of nondisk glocks
++ * and the glock for our own journal that needs recovery.
++ */
++ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
++ glock_blocked_by_withdraw(gl) &&
++ gh->gh_gl != sdp->sd_jinode_gl) {
++ sdp->sd_glock_dqs_held++;
++ spin_unlock(&gl->gl_lockref.lock);
++ might_sleep();
++ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
++ TASK_UNINTERRUPTIBLE);
++ spin_lock(&gl->gl_lockref.lock);
++ }
+
+- list_del_init(&gh->gh_list);
1515 |
+- clear_bit(HIF_HOLDER, &gh->gh_iflags); |
1516 |
+- if (list_empty(&gl->gl_holders) && |
1517 |
+- !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && |
1518 |
+- !test_bit(GLF_DEMOTE, &gl->gl_flags)) |
1519 |
+- fast_path = 1; |
1520 |
++ /* |
1521 |
++ * This holder should not be cached, so mark it for demote. |
1522 |
++ * Note: this should be done before the check for needs_demote |
1523 |
++ * below. |
1524 |
++ */ |
1525 |
++ if (gh->gh_flags & GL_NOCACHE) |
1526 |
++ handle_callback(gl, LM_ST_UNLOCKED, 0, false); |
1527 |
++ |
1528 |
++ list_del_init(&gh->gh_list); |
1529 |
++ clear_bit(HIF_HOLDER, &gh->gh_iflags); |
1530 |
++ trace_gfs2_glock_queue(gh, 0); |
1531 |
++ |
1532 |
++ /* |
1533 |
++ * If there hasn't been a demote request we are done. |
1534 |
++ * (Let the remaining holders, if any, keep holding it.) |
1535 |
++ */ |
1536 |
++ if (!needs_demote(gl)) { |
1537 |
++ if (list_empty(&gl->gl_holders)) |
1538 |
++ fast_path = 1; |
1539 |
++ break; |
1540 |
++ } |
1541 |
++ /* |
1542 |
++ * If we have another strong holder (we cannot auto-demote) |
1543 |
++ * we are done. It keeps holding it until it is done. |
1544 |
++ */ |
1545 |
++ if (find_first_strong_holder(gl)) |
1546 |
++ break; |
1547 |
++ |
1548 |
++ /* |
1549 |
++ * If we have a weak holder at the head of the list, it |
1550 |
++ * (and all others like it) must be auto-demoted. If there |
1551 |
++ * are no more weak holders, we exit the while loop. |
1552 |
++ */ |
1553 |
++ gh = find_first_holder(gl); |
1554 |
++ } |
1555 |
+ |
1556 |
+ if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) |
1557 |
+ gfs2_glock_add_to_lru(gl); |
1558 |
+ |
1559 |
+- trace_gfs2_glock_queue(gh, 0); |
1560 |
+ if (unlikely(!fast_path)) { |
1561 |
+ gl->gl_lockref.count++; |
1562 |
+ if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && |
1563 |
+@@ -1511,6 +1635,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh) |
1564 |
+ delay = gl->gl_hold_time; |
1565 |
+ __gfs2_glock_queue_work(gl, delay); |
1566 |
+ } |
1567 |
++} |
1568 |
++ |
1569 |
++/** |
1570 |
++ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) |
1571 |
++ * @gh: the glock holder |
1572 |
++ * |
1573 |
++ */ |
1574 |
++void gfs2_glock_dq(struct gfs2_holder *gh) |
1575 |
++{ |
1576 |
++ struct gfs2_glock *gl = gh->gh_gl; |
1577 |
++ |
1578 |
++ spin_lock(&gl->gl_lockref.lock); |
1579 |
++ __gfs2_glock_dq(gh); |
1580 |
+ spin_unlock(&gl->gl_lockref.lock); |
1581 |
+ } |
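Splitting the dequeue path into gfs2_glock_dq() and __gfs2_glock_dq() follows the usual kernel naming idiom: the double-underscore variant assumes gl_lockref.lock is already held, so code paths that already hold the lock can reuse it without retaking the spinlock. A toy analogue with a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int holders = 2;

/* Caller must hold 'lock'; safe to call repeatedly from a loop that
 * keeps the lock held, like the while (gh) loop above. */
static void __release_one(void)
{
	holders--;
	printf("holders now %d\n", holders);
}

/* Public entry point: take the lock, do the work, drop the lock. */
static void release_one(void)
{
	pthread_mutex_lock(&lock);
	__release_one();
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	release_one();
	release_one();
	return 0;
}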
1582 |
+ |
1583 |
+@@ -1673,6 +1810,7 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) |
1584 |
+ |
1585 |
+ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) |
1586 |
+ { |
1587 |
++ struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, }; |
1588 |
+ unsigned long delay = 0; |
1589 |
+ unsigned long holdtime; |
1590 |
+ unsigned long now = jiffies; |
1591 |
+@@ -1687,6 +1825,28 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) |
1592 |
+ if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) |
1593 |
+ delay = gl->gl_hold_time; |
1594 |
+ } |
1595 |
++ /* |
1596 |
++ * Note 1: We cannot call demote_incompat_holders from handle_callback |
1597 |
++ * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq -> |
1598 |
++ * handle_callback -> demote_incompat_holders -> gfs2_glock_dq |
1599 |
++ * Plus, we only want to demote the holders if the request comes from |
1600 |
++ * a remote cluster node because local holder conflicts are resolved |
1601 |
++ * elsewhere. |
1602 |
++ * |
1603 |
++ * Note 2: if a remote node wants this glock in EX mode, lock_dlm will |
1604 |
++ * request that we set our state to UNLOCKED. Here we mock up a holder |
1605 |
++ * to make it look like someone wants the lock EX locally. Any SH |
1606 |
++ * and DF requests should be able to share the lock without demoting. |
1607 |
++ * |
1608 |
++ * Note 3: We only want to demote the demoteable holders when there |
1609 |
++ * are no more strong holders. The demoteable holders might as well |
1610 |
++ * keep the glock until the last strong holder is done with it. |
1611 |
++ */ |
1612 |
++ if (!find_first_strong_holder(gl)) { |
1613 |
++ if (state == LM_ST_UNLOCKED) |
1614 |
++ mock_gh.gh_state = LM_ST_EXCLUSIVE; |
1615 |
++ demote_incompat_holders(gl, &mock_gh); |
1616 |
++ } |
1617 |
+ handle_callback(gl, state, delay, true); |
1618 |
+ __gfs2_glock_queue_work(gl, delay); |
1619 |
+ spin_unlock(&gl->gl_lockref.lock); |
1620 |
+@@ -2078,6 +2238,8 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) |
1621 |
+ *p++ = 'H'; |
1622 |
+ if (test_bit(HIF_WAIT, &iflags)) |
1623 |
+ *p++ = 'W'; |
1624 |
++ if (test_bit(HIF_MAY_DEMOTE, &iflags)) |
1625 |
++ *p++ = 'D'; |
1626 |
+ *p = 0; |
1627 |
+ return buf; |
1628 |
+ } |
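The new 'D' character surfaces HIF_MAY_DEMOTE in the holder dump alongside 'H' (holder) and 'W' (wait). A runnable sketch of the same append-one-char-per-flag builder, with illustrative bit values rather than the real HIF_* numbers:

#include <stdio.h>

static const char *hflags2str(char *buf, unsigned long iflags)
{
	char *p = buf;

	if (iflags & (1 << 0)) *p++ = 'H';	/* HIF_HOLDER */
	if (iflags & (1 << 1)) *p++ = 'W';	/* HIF_WAIT */
	if (iflags & (1 << 2)) *p++ = 'D';	/* HIF_MAY_DEMOTE */
	*p = 0;
	return buf;
}

int main(void)
{
	char buf[8];

	printf("%s\n", hflags2str(buf, 0x7));	/* prints HWD */
	return 0;
}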
1629 |
+diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h |
1630 |
+index 31a8f2f649b52..9012487da4c69 100644 |
1631 |
+--- a/fs/gfs2/glock.h |
1632 |
++++ b/fs/gfs2/glock.h |
1633 |
+@@ -150,6 +150,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock * |
1634 |
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) { |
1635 |
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) |
1636 |
+ break; |
1637 |
++ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags)) |
1638 |
++ continue; |
1639 |
+ if (gh->gh_owner_pid == pid) |
1640 |
+ goto out; |
1641 |
+ } |
1642 |
+@@ -325,6 +327,24 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object) |
1643 |
+ spin_unlock(&gl->gl_lockref.lock); |
1644 |
+ } |
1645 |
+ |
1646 |
++static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh) |
1647 |
++{ |
1648 |
++ struct gfs2_glock *gl = gh->gh_gl; |
1649 |
++ |
1650 |
++ spin_lock(&gl->gl_lockref.lock); |
1651 |
++ set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags); |
1652 |
++ spin_unlock(&gl->gl_lockref.lock); |
1653 |
++} |
1654 |
++ |
1655 |
++static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh) |
1656 |
++{ |
1657 |
++ struct gfs2_glock *gl = gh->gh_gl; |
1658 |
++ |
1659 |
++ spin_lock(&gl->gl_lockref.lock); |
1660 |
++ clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags); |
1661 |
++ spin_unlock(&gl->gl_lockref.lock); |
1662 |
++} |
1663 |
++ |
1664 |
+ extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); |
1665 |
+ extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); |
1666 |
+ |
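gfs2_holder_allow_demote() and gfs2_holder_disallow_demote() are expected to bracket code that may block on page faults while a glock is held (the gfs2 read/write paths that call them are not part of this hunk). A hedged userspace model of the bracket, with made-up types:

#include <stdbool.h>
#include <stdio.h>

struct holder { bool may_demote; bool still_held; };

static void allow_demote(struct holder *h)    { h->may_demote = true; }
static void disallow_demote(struct holder *h) { h->may_demote = false; }

int main(void)
{
	struct holder h = { .may_demote = false, .still_held = true };

	allow_demote(&h);
	/* ... fault user pages in; a remote node may demote the glock
	 * while may_demote is set ... */
	disallow_demote(&h);

	if (h.still_held)
		printf("copy can proceed under the glock\n");
	else
		printf("glock was demoted: re-queue the holder and retry\n");
	return 0;
}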
1667 |
+diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h |
1668 |
+index 0fe49770166ea..ca42d310fd4d6 100644 |
1669 |
+--- a/fs/gfs2/incore.h |
1670 |
++++ b/fs/gfs2/incore.h |
1671 |
+@@ -252,6 +252,7 @@ struct gfs2_lkstats { |
1672 |
+ |
1673 |
+ enum { |
1674 |
+ /* States */ |
1675 |
++ HIF_MAY_DEMOTE = 1, |
1676 |
+ HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ |
1677 |
+ HIF_WAIT = 10, |
1678 |
+ }; |
1679 |
+@@ -386,9 +387,8 @@ struct gfs2_inode { |
1680 |
+ u64 i_generation; |
1681 |
+ u64 i_eattr; |
1682 |
+ unsigned long i_flags; /* GIF_... */ |
1683 |
+- struct gfs2_glock *i_gl; /* Move into i_gh? */ |
1684 |
++ struct gfs2_glock *i_gl; |
1685 |
+ struct gfs2_holder i_iopen_gh; |
1686 |
+- struct gfs2_holder i_gh; /* for prepare/commit_write only */ |
1687 |
+ struct gfs2_qadata *i_qadata; /* quota allocation data */ |
1688 |
+ struct gfs2_holder i_rgd_gh; |
1689 |
+ struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */ |
1690 |
+diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c |
1691 |
+index 97119ec3b8503..fe10d8a30f6bd 100644 |
1692 |
+--- a/fs/iomap/buffered-io.c |
1693 |
++++ b/fs/iomap/buffered-io.c |
1694 |
+@@ -757,7 +757,7 @@ again: |
1695 |
+ * same page as we're writing to, without it being marked |
1696 |
+ * up-to-date. |
1697 |
+ */ |
1698 |
+- if (unlikely(iov_iter_fault_in_readable(i, bytes))) { |
1699 |
++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) { |
1700 |
+ status = -EFAULT; |
1701 |
+ break; |
1702 |
+ } |
1703 |
+diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c |
1704 |
+index 4ecd255e0511c..468dcbba45bcb 100644 |
1705 |
+--- a/fs/iomap/direct-io.c |
1706 |
++++ b/fs/iomap/direct-io.c |
1707 |
+@@ -31,6 +31,7 @@ struct iomap_dio { |
1708 |
+ atomic_t ref; |
1709 |
+ unsigned flags; |
1710 |
+ int error; |
1711 |
++ size_t done_before; |
1712 |
+ bool wait_for_completion; |
1713 |
+ |
1714 |
+ union { |
1715 |
+@@ -124,6 +125,9 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) |
1716 |
+ if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) |
1717 |
+ ret = generic_write_sync(iocb, ret); |
1718 |
+ |
1719 |
++ if (ret > 0) |
1720 |
++ ret += dio->done_before; |
1721 |
++ |
1722 |
+ kfree(dio); |
1723 |
+ |
1724 |
+ return ret; |
1725 |
+@@ -371,6 +375,8 @@ static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, |
1726 |
+ loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); |
1727 |
+ |
1728 |
+ dio->size += length; |
1729 |
++ if (!length) |
1730 |
++ return -EFAULT; |
1731 |
+ return length; |
1732 |
+ } |
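The two-line additions here and in iomap_dio_inline_iter() encode a convention: with the iterator in no-fault mode, a copy primitive that makes no progress has hit a non-resident page, and returning -EFAULT (after dio->size is updated) hands control to the IOMAP_DIO_PARTIAL machinery. A small runnable model of that accounting:

#include <stdio.h>

#define EFAULT 14

/* Bytes copied are always accounted first; zero progress then turns
 * into -EFAULT for the caller's retry logic, as in the hunk above. */
static long account(unsigned long copied, unsigned long *dio_size)
{
	*dio_size += copied;
	return copied ? (long)copied : -EFAULT;
}

int main(void)
{
	unsigned long size = 0;

	printf("progress: %ld\n", account(512, &size));	/* 512 */
	printf("fault:    %ld\n", account(0, &size));	/* -14 */
	printf("dio->size = %lu\n", size);
	return 0;
}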
1733 |
+ |
1734 |
+@@ -402,6 +408,8 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, |
1735 |
+ copied = copy_to_iter(inline_data, length, iter); |
1736 |
+ } |
1737 |
+ dio->size += copied; |
1738 |
++ if (!copied) |
1739 |
++ return -EFAULT; |
1740 |
+ return copied; |
1741 |
+ } |
1742 |
+ |
1743 |
+@@ -446,13 +454,21 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter, |
1744 |
+ * may be pure data writes. In that case, we still need to do a full data sync |
1745 |
+ * completion. |
1746 |
+ * |
1747 |
++ * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL, |
1748 |
++ * __iomap_dio_rw can return a partial result if it encounters a non-resident |
1749 |
++ * page in @iter after preparing a transfer. In that case, the non-resident |
1750 |
++ * pages can be faulted in and the request resumed with @done_before set to the |
1751 |
++ * number of bytes previously transferred. The request will then complete with |
1752 |
++ * the correct total number of bytes transferred; this is essential for |
1753 |
++ * completing partial requests asynchronously. |
1754 |
++ * |
1755 |
+ * Returns -ENOTBLK in case of a page invalidation failure for |
1756 |
+ * writes. The caller needs to fall back to buffered I/O in this case. |
1757 |
+ */ |
1758 |
+ struct iomap_dio * |
1759 |
+ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, |
1760 |
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops, |
1761 |
+- unsigned int dio_flags) |
1762 |
++ unsigned int dio_flags, size_t done_before) |
1763 |
+ { |
1764 |
+ struct address_space *mapping = iocb->ki_filp->f_mapping; |
1765 |
+ struct inode *inode = file_inode(iocb->ki_filp); |
1766 |
+@@ -482,6 +498,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, |
1767 |
+ dio->dops = dops; |
1768 |
+ dio->error = 0; |
1769 |
+ dio->flags = 0; |
1770 |
++ dio->done_before = done_before; |
1771 |
+ |
1772 |
+ dio->submit.iter = iter; |
1773 |
+ dio->submit.waiter = current; |
1774 |
+@@ -577,6 +594,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, |
1775 |
+ if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size) |
1776 |
+ iov_iter_revert(iter, iomi.pos - dio->i_size); |
1777 |
+ |
1778 |
++ if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) { |
1779 |
++ if (!(iocb->ki_flags & IOCB_NOWAIT)) |
1780 |
++ wait_for_completion = true; |
1781 |
++ ret = 0; |
1782 |
++ } |
1783 |
++ |
1784 |
+ /* magic error code to fall back to buffered I/O */ |
1785 |
+ if (ret == -ENOTBLK) { |
1786 |
+ wait_for_completion = true; |
1787 |
+@@ -642,11 +665,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw); |
1788 |
+ ssize_t |
1789 |
+ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, |
1790 |
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops, |
1791 |
+- unsigned int dio_flags) |
1792 |
++ unsigned int dio_flags, size_t done_before) |
1793 |
+ { |
1794 |
+ struct iomap_dio *dio; |
1795 |
+ |
1796 |
+- dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags); |
1797 |
++ dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before); |
1798 |
+ if (IS_ERR_OR_NULL(dio)) |
1799 |
+ return PTR_ERR_OR_ZERO(dio); |
1800 |
+ return iomap_dio_complete(dio); |
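How a filesystem is meant to drive the new done_before argument is easiest to see from the caller side. The sketch below is a hedged reconstruction modeled loosely on the gfs2 direct-read path that motivated this API (dio_read_with_faults is a made-up name; the helpers it calls are the ones added by this patch): on a partial return, fault the next window in and resume, passing the bytes already reported so the final return value covers the whole request.

static ssize_t dio_read_with_faults(struct kiocb *iocb, struct iov_iter *to,
				    const struct iomap_ops *ops)
{
	size_t read = 0;	/* bytes reported by previous rounds */
	ssize_t ret;

retry:
	pagefault_disable();
	to->nofault = true;
	ret = iomap_dio_rw(iocb, to, ops, NULL, IOMAP_DIO_PARTIAL, read);
	to->nofault = false;
	pagefault_enable();

	if (ret > 0)
		read = ret;	/* already includes done_before */
	if ((ret > 0 || ret == -EFAULT) && iov_iter_count(to) &&
	    fault_in_iov_iter_writeable(to, PAGE_SIZE) == 0)
		goto retry;
	return ret;
}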
1801 |
+diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c |
1802 |
+index ab4f3362466d0..a43adeacd930c 100644 |
1803 |
+--- a/fs/ntfs/file.c |
1804 |
++++ b/fs/ntfs/file.c |
1805 |
+@@ -1829,7 +1829,7 @@ again: |
1806 |
+ * pages being swapped out between us bringing them into memory |
1807 |
+ * and doing the actual copying. |
1808 |
+ */ |
1809 |
+- if (unlikely(iov_iter_fault_in_readable(i, bytes))) { |
1810 |
++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) { |
1811 |
+ status = -EFAULT; |
1812 |
+ break; |
1813 |
+ } |
1814 |
+diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c |
1815 |
+index 43b1451bff539..54b9599640ef4 100644 |
1816 |
+--- a/fs/ntfs3/file.c |
1817 |
++++ b/fs/ntfs3/file.c |
1818 |
+@@ -989,7 +989,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) |
1819 |
+ frame_vbo = pos & ~(frame_size - 1); |
1820 |
+ index = frame_vbo >> PAGE_SHIFT; |
1821 |
+ |
1822 |
+- if (unlikely(iov_iter_fault_in_readable(from, bytes))) { |
1823 |
++ if (unlikely(fault_in_iov_iter_readable(from, bytes))) { |
1824 |
+ err = -EFAULT; |
1825 |
+ goto out; |
1826 |
+ } |
1827 |
+diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c |
1828 |
+index 7aa943edfc02f..240eb932c014b 100644 |
1829 |
+--- a/fs/xfs/xfs_file.c |
1830 |
++++ b/fs/xfs/xfs_file.c |
1831 |
+@@ -259,7 +259,7 @@ xfs_file_dio_read( |
1832 |
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); |
1833 |
+ if (ret) |
1834 |
+ return ret; |
1835 |
+- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0); |
1836 |
++ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0); |
1837 |
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
1838 |
+ |
1839 |
+ return ret; |
1840 |
+@@ -569,7 +569,7 @@ xfs_file_dio_write_aligned( |
1841 |
+ } |
1842 |
+ trace_xfs_file_direct_write(iocb, from); |
1843 |
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, |
1844 |
+- &xfs_dio_write_ops, 0); |
1845 |
++ &xfs_dio_write_ops, 0, 0); |
1846 |
+ out_unlock: |
1847 |
+ if (iolock) |
1848 |
+ xfs_iunlock(ip, iolock); |
1849 |
+@@ -647,7 +647,7 @@ retry_exclusive: |
1850 |
+ |
1851 |
+ trace_xfs_file_direct_write(iocb, from); |
1852 |
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, |
1853 |
+- &xfs_dio_write_ops, flags); |
1854 |
++ &xfs_dio_write_ops, flags, 0); |
1855 |
+ |
1856 |
+ /* |
1857 |
+ * Retry unaligned I/O with exclusive blocking semantics if the DIO |
1858 |
+diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c |
1859 |
+index 807f33553a8eb..bced33b76beac 100644 |
1860 |
+--- a/fs/zonefs/super.c |
1861 |
++++ b/fs/zonefs/super.c |
1862 |
+@@ -852,7 +852,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) |
1863 |
+ ret = zonefs_file_dio_append(iocb, from); |
1864 |
+ else |
1865 |
+ ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, |
1866 |
+- &zonefs_write_dio_ops, 0); |
1867 |
++ &zonefs_write_dio_ops, 0, 0); |
1868 |
+ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && |
1869 |
+ (ret > 0 || ret == -EIOCBQUEUED)) { |
1870 |
+ if (ret > 0) |
1871 |
+@@ -987,7 +987,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) |
1872 |
+ } |
1873 |
+ file_accessed(iocb->ki_filp); |
1874 |
+ ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, |
1875 |
+- &zonefs_read_dio_ops, 0); |
1876 |
++ &zonefs_read_dio_ops, 0, 0); |
1877 |
+ } else { |
1878 |
+ ret = generic_file_read_iter(iocb, to); |
1879 |
+ if (ret == -EIO) |
1880 |
+diff --git a/include/linux/bpf.h b/include/linux/bpf.h |
1881 |
+index 15b690a0cecb0..c5c4b6f09e230 100644 |
1882 |
+--- a/include/linux/bpf.h |
1883 |
++++ b/include/linux/bpf.h |
1884 |
+@@ -293,6 +293,34 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, |
1885 |
+ |
1886 |
+ extern const struct bpf_map_ops bpf_map_offload_ops; |
1887 |
+ |
1888 |
++/* bpf_type_flag contains a set of flags that are applicable to the values of |
1889 |
++ * arg_type, ret_type and reg_type. For example, a pointer value may be null, |
1890 |
++ * or a memory region may be read-only. We classify types into two categories: base types |
1891 |
++ * and extended types. Extended types are base types combined with a type flag. |
1892 |
++ * |
1893 |
++ * Currently there are no more than 32 base types in arg_type, ret_type and |
1894 |
++ * reg_type. |
1895 |
++ */ |
1896 |
++#define BPF_BASE_TYPE_BITS 8 |
1897 |
++ |
1898 |
++enum bpf_type_flag { |
1899 |
++ /* PTR may be NULL. */ |
1900 |
++ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), |
1901 |
++ |
1902 |
++ /* MEM is read-only. When applied to bpf_arg, it indicates the arg is |
1903 |
++ * compatible with both mutable and immutable memory. |
1904 |
++ */ |
1905 |
++ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), |
1906 |
++ |
1907 |
++ __BPF_TYPE_LAST_FLAG = MEM_RDONLY, |
1908 |
++}; |
1909 |
++ |
1910 |
++/* Max number of base types. */ |
1911 |
++#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) |
1912 |
++ |
1913 |
++/* Max number of all types. */ |
1914 |
++#define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1)) |
1915 |
++ |
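The whole refactor below (arg, ret and reg types alike) rests on this one split: the low BPF_BASE_TYPE_BITS carry the base type, everything above is flags. A runnable toy of the encoding (the ARG_PTR_TO_MEM value is illustrative; only the bit layout matches the patch):

#include <assert.h>
#include <stdio.h>

#define BASE_TYPE_BITS	8
#define BASE_TYPE_MASK	((1u << BASE_TYPE_BITS) - 1)

enum {
	PTR_MAYBE_NULL	= 1u << BASE_TYPE_BITS,
	MEM_RDONLY	= 1u << (BASE_TYPE_BITS + 1),
};

enum { ARG_PTR_TO_MEM = 5 };	/* illustrative, not the real value */

static unsigned base_type(unsigned t) { return t & BASE_TYPE_MASK; }
static unsigned type_flag(unsigned t) { return t & ~BASE_TYPE_MASK; }

int main(void)
{
	unsigned arg = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY;

	assert(base_type(arg) == ARG_PTR_TO_MEM);
	assert(type_flag(arg) == (PTR_MAYBE_NULL | MEM_RDONLY));
	printf("base=%u flags=0x%x\n", base_type(arg), type_flag(arg));
	return 0;
}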
1916 |
+ /* function argument constraints */ |
1917 |
+ enum bpf_arg_type { |
1918 |
+ ARG_DONTCARE = 0, /* unused argument in helper function */ |
1919 |
+@@ -304,13 +332,11 @@ enum bpf_arg_type { |
1920 |
+ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ |
1921 |
+ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ |
1922 |
+ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ |
1923 |
+- ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ |
1924 |
+ |
1925 |
+ /* the following constraints used to prototype bpf_memcmp() and other |
1926 |
+ * functions that access data on eBPF program stack |
1927 |
+ */ |
1928 |
+ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ |
1929 |
+- ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ |
1930 |
+ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, |
1931 |
+ * helper function must fill all bytes or clear |
1932 |
+ * them in error case. |
1933 |
+@@ -320,42 +346,65 @@ enum bpf_arg_type { |
1934 |
+ ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ |
1935 |
+ |
1936 |
+ ARG_PTR_TO_CTX, /* pointer to context */ |
1937 |
+- ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */ |
1938 |
+ ARG_ANYTHING, /* any (initialized) argument is ok */ |
1939 |
+ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ |
1940 |
+ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ |
1941 |
+ ARG_PTR_TO_INT, /* pointer to int */ |
1942 |
+ ARG_PTR_TO_LONG, /* pointer to long */ |
1943 |
+ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ |
1944 |
+- ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ |
1945 |
+ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ |
1946 |
+ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ |
1947 |
+- ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ |
1948 |
+ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ |
1949 |
+ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ |
1950 |
+ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ |
1951 |
+ ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ |
1952 |
+- ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ |
1953 |
++ ARG_PTR_TO_STACK, /* pointer to stack */ |
1954 |
+ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ |
1955 |
+ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ |
1956 |
+ __BPF_ARG_TYPE_MAX, |
1957 |
++ |
1958 |
++ /* Extended arg_types. */ |
1959 |
++ ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, |
1960 |
++ ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, |
1961 |
++ ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, |
1962 |
++ ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, |
1963 |
++ ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, |
1964 |
++ ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, |
1965 |
++ |
1966 |
++ /* This must be the last entry. Its purpose is to ensure the enum is |
1967 |
++ * wide enough to hold the higher bits reserved for bpf_type_flag. |
1968 |
++ */ |
1969 |
++ __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT, |
1970 |
+ }; |
1971 |
++static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); |
1972 |
+ |
1973 |
+ /* type of values returned from helper functions */ |
1974 |
+ enum bpf_return_type { |
1975 |
+ RET_INTEGER, /* function returns integer */ |
1976 |
+ RET_VOID, /* function doesn't return anything */ |
1977 |
+ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ |
1978 |
+- RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ |
1979 |
+- RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ |
1980 |
+- RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ |
1981 |
+- RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ |
1982 |
+- RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ |
1983 |
+- RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ |
1984 |
+- RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ |
1985 |
++ RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ |
1986 |
++ RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ |
1987 |
++ RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ |
1988 |
++ RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ |
1989 |
+ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ |
1990 |
+ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ |
1991 |
++ __BPF_RET_TYPE_MAX, |
1992 |
++ |
1993 |
++ /* Extended ret_types. */ |
1994 |
++ RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, |
1995 |
++ RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, |
1996 |
++ RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, |
1997 |
++ RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, |
1998 |
++ RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, |
1999 |
++ RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, |
2000 |
++ |
2001 |
++ /* This must be the last entry. Its purpose is to ensure the enum is |
2002 |
++ * wide enough to hold the higher bits reserved for bpf_type_flag. |
2003 |
++ */ |
2004 |
++ __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT, |
2005 |
+ }; |
2006 |
++static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); |
2007 |
+ |
2008 |
+ /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs |
2009 |
+ * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL |
2010 |
+@@ -417,18 +466,15 @@ enum bpf_reg_type { |
2011 |
+ PTR_TO_CTX, /* reg points to bpf_context */ |
2012 |
+ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ |
2013 |
+ PTR_TO_MAP_VALUE, /* reg points to map element value */ |
2014 |
+- PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ |
2015 |
++ PTR_TO_MAP_KEY, /* reg points to a map element key */ |
2016 |
+ PTR_TO_STACK, /* reg == frame_pointer + offset */ |
2017 |
+ PTR_TO_PACKET_META, /* skb->data - meta_len */ |
2018 |
+ PTR_TO_PACKET, /* reg points to skb->data */ |
2019 |
+ PTR_TO_PACKET_END, /* skb->data + headlen */ |
2020 |
+ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ |
2021 |
+ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ |
2022 |
+- PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ |
2023 |
+ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ |
2024 |
+- PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ |
2025 |
+ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ |
2026 |
+- PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ |
2027 |
+ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ |
2028 |
+ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ |
2029 |
+ /* PTR_TO_BTF_ID points to a kernel struct that does not need |
2030 |
+@@ -446,18 +492,25 @@ enum bpf_reg_type { |
2031 |
+ * been checked for null. Used primarily to inform the verifier |
2032 |
+ * an explicit null check is required for this struct. |
2033 |
+ */ |
2034 |
+- PTR_TO_BTF_ID_OR_NULL, |
2035 |
+ PTR_TO_MEM, /* reg points to valid memory region */ |
2036 |
+- PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ |
2037 |
+- PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ |
2038 |
+- PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ |
2039 |
+- PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ |
2040 |
+- PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ |
2041 |
++ PTR_TO_BUF, /* reg points to a read/write buffer */ |
2042 |
+ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ |
2043 |
+ PTR_TO_FUNC, /* reg points to a bpf program function */ |
2044 |
+- PTR_TO_MAP_KEY, /* reg points to a map element key */ |
2045 |
+ __BPF_REG_TYPE_MAX, |
2046 |
++ |
2047 |
++ /* Extended reg_types. */ |
2048 |
++ PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, |
2049 |
++ PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, |
2050 |
++ PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, |
2051 |
++ PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, |
2052 |
++ PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, |
2053 |
++ |
2054 |
++ /* This must be the last entry. Its purpose is to ensure the enum is |
2055 |
++ * wide enough to hold the higher bits reserved for bpf_type_flag. |
2056 |
++ */ |
2057 |
++ __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT, |
2058 |
+ }; |
2059 |
++static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); |
2060 |
+ |
2061 |
+ /* The information passed from prog-specific *_is_valid_access |
2062 |
+ * back to the verifier. |
2063 |
+diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h |
2064 |
+index 364550dd19c4a..bb1cc3fbc4bab 100644 |
2065 |
+--- a/include/linux/bpf_verifier.h |
2066 |
++++ b/include/linux/bpf_verifier.h |
2067 |
+@@ -18,6 +18,8 @@ |
2068 |
+ * that converting umax_value to int cannot overflow. |
2069 |
+ */ |
2070 |
+ #define BPF_MAX_VAR_SIZ (1 << 29) |
2071 |
++/* size of type_str_buf in bpf_verifier. */ |
2072 |
++#define TYPE_STR_BUF_LEN 64 |
2073 |
+ |
2074 |
+ /* Liveness marks, used for registers and spilled-regs (in stack slots). |
2075 |
+ * Read marks propagate upwards until they find a write mark; they record that |
2076 |
+@@ -474,6 +476,8 @@ struct bpf_verifier_env { |
2077 |
+ /* longest register parentage chain walked for liveness marking */ |
2078 |
+ u32 longest_mark_read_walk; |
2079 |
+ bpfptr_t fd_array; |
2080 |
++ /* buffer used in reg_type_str() to generate reg_type string */ |
2081 |
++ char type_str_buf[TYPE_STR_BUF_LEN]; |
2082 |
+ }; |
2083 |
+ |
2084 |
+ __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, |
2085 |
+@@ -535,4 +539,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, |
2086 |
+ u32 btf_id, |
2087 |
+ struct bpf_attach_target_info *tgt_info); |
2088 |
+ |
2089 |
++#define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) |
2090 |
++ |
2091 |
++/* extract base type from bpf_{arg, return, reg}_type. */ |
2092 |
++static inline u32 base_type(u32 type) |
2093 |
++{ |
2094 |
++ return type & BPF_BASE_TYPE_MASK; |
2095 |
++} |
2096 |
++ |
2097 |
++/* extract flags from an extended type. See bpf_type_flag in bpf.h. */ |
2098 |
++static inline u32 type_flag(u32 type) |
2099 |
++{ |
2100 |
++ return type & ~BPF_BASE_TYPE_MASK; |
2101 |
++} |
2102 |
++ |
2103 |
+ #endif /* _LINUX_BPF_VERIFIER_H */ |
2104 |
+diff --git a/include/linux/iomap.h b/include/linux/iomap.h |
2105 |
+index 24f8489583ca7..829f2325ecbab 100644 |
2106 |
+--- a/include/linux/iomap.h |
2107 |
++++ b/include/linux/iomap.h |
2108 |
+@@ -330,12 +330,19 @@ struct iomap_dio_ops { |
2109 |
+ */ |
2110 |
+ #define IOMAP_DIO_OVERWRITE_ONLY (1 << 1) |
2111 |
+ |
2112 |
++/* |
2113 |
++ * When a page fault occurs, return a partial synchronous result and allow |
2114 |
++ * the caller to retry the rest of the operation after dealing with the page |
2115 |
++ * fault. |
2116 |
++ */ |
2117 |
++#define IOMAP_DIO_PARTIAL (1 << 2) |
2118 |
++ |
2119 |
+ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, |
2120 |
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops, |
2121 |
+- unsigned int dio_flags); |
2122 |
++ unsigned int dio_flags, size_t done_before); |
2123 |
+ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, |
2124 |
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops, |
2125 |
+- unsigned int dio_flags); |
2126 |
++ unsigned int dio_flags, size_t done_before); |
2127 |
+ ssize_t iomap_dio_complete(struct iomap_dio *dio); |
2128 |
+ int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); |
2129 |
+ |
2130 |
+diff --git a/include/linux/mm.h b/include/linux/mm.h |
2131 |
+index 90c2d7f3c7a88..04345ff97f8ca 100644 |
2132 |
+--- a/include/linux/mm.h |
2133 |
++++ b/include/linux/mm.h |
2134 |
+@@ -2858,7 +2858,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, |
2135 |
+ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ |
2136 |
+ #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO |
2137 |
+ * and return without waiting upon it */ |
2138 |
+-#define FOLL_POPULATE 0x40 /* fault in page */ |
2139 |
++#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */ |
2140 |
++#define FOLL_NOFAULT 0x80 /* do not fault in pages */ |
2141 |
+ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ |
2142 |
+ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ |
2143 |
+ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ |
2144 |
+diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h |
2145 |
+index 62db6b0176b95..2f7dd14083d94 100644 |
2146 |
+--- a/include/linux/pagemap.h |
2147 |
++++ b/include/linux/pagemap.h |
2148 |
+@@ -733,61 +733,11 @@ int wait_on_page_private_2_killable(struct page *page); |
2149 |
+ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); |
2150 |
+ |
2151 |
+ /* |
2152 |
+- * Fault everything in given userspace address range in. |
2153 |
++ * Fault in userspace address range. |
2154 |
+ */ |
2155 |
+-static inline int fault_in_pages_writeable(char __user *uaddr, size_t size) |
2156 |
+-{ |
2157 |
+- char __user *end = uaddr + size - 1; |
2158 |
+- |
2159 |
+- if (unlikely(size == 0)) |
2160 |
+- return 0; |
2161 |
+- |
2162 |
+- if (unlikely(uaddr > end)) |
2163 |
+- return -EFAULT; |
2164 |
+- /* |
2165 |
+- * Writing zeroes into userspace here is OK, because we know that if |
2166 |
+- * the zero gets there, we'll be overwriting it. |
2167 |
+- */ |
2168 |
+- do { |
2169 |
+- if (unlikely(__put_user(0, uaddr) != 0)) |
2170 |
+- return -EFAULT; |
2171 |
+- uaddr += PAGE_SIZE; |
2172 |
+- } while (uaddr <= end); |
2173 |
+- |
2174 |
+- /* Check whether the range spilled into the next page. */ |
2175 |
+- if (((unsigned long)uaddr & PAGE_MASK) == |
2176 |
+- ((unsigned long)end & PAGE_MASK)) |
2177 |
+- return __put_user(0, end); |
2178 |
+- |
2179 |
+- return 0; |
2180 |
+-} |
2181 |
+- |
2182 |
+-static inline int fault_in_pages_readable(const char __user *uaddr, size_t size) |
2183 |
+-{ |
2184 |
+- volatile char c; |
2185 |
+- const char __user *end = uaddr + size - 1; |
2186 |
+- |
2187 |
+- if (unlikely(size == 0)) |
2188 |
+- return 0; |
2189 |
+- |
2190 |
+- if (unlikely(uaddr > end)) |
2191 |
+- return -EFAULT; |
2192 |
+- |
2193 |
+- do { |
2194 |
+- if (unlikely(__get_user(c, uaddr) != 0)) |
2195 |
+- return -EFAULT; |
2196 |
+- uaddr += PAGE_SIZE; |
2197 |
+- } while (uaddr <= end); |
2198 |
+- |
2199 |
+- /* Check whether the range spilled into the next page. */ |
2200 |
+- if (((unsigned long)uaddr & PAGE_MASK) == |
2201 |
+- ((unsigned long)end & PAGE_MASK)) { |
2202 |
+- return __get_user(c, end); |
2203 |
+- } |
2204 |
+- |
2205 |
+- (void)c; |
2206 |
+- return 0; |
2207 |
+-} |
2208 |
++size_t fault_in_writeable(char __user *uaddr, size_t size); |
2209 |
++size_t fault_in_safe_writeable(const char __user *uaddr, size_t size); |
2210 |
++size_t fault_in_readable(const char __user *uaddr, size_t size); |
2211 |
+ |
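Note the changed return convention: the old fault_in_pages_*() helpers returned 0 or -EFAULT, while the new fault_in_*() family returns the number of bytes that could not be faulted in, so 0 still reads as success at unchanged call sites like the ones patched in fs/iomap, fs/ntfs and fs/ntfs3 above. A userspace model of that convention, with a made-up residency limit:

#include <stdio.h>

/* Returns bytes NOT faulted in: 0 means full success, a partial count
 * means the caller can still make some progress before retrying. */
static size_t fault_in_model(size_t requested, size_t resident)
{
	return requested > resident ? requested - resident : 0;
}

int main(void)
{
	printf("fully resident: %zu left\n", fault_in_model(4096, 8192));
	printf("half resident:  %zu left\n", fault_in_model(4096, 2048));
	return 0;
}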
2212 |
+ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, |
2213 |
+ pgoff_t index, gfp_t gfp_mask); |
2214 |
+diff --git a/include/linux/uio.h b/include/linux/uio.h |
2215 |
+index 207101a9c5c32..6350354f97e90 100644 |
2216 |
+--- a/include/linux/uio.h |
2217 |
++++ b/include/linux/uio.h |
2218 |
+@@ -35,6 +35,7 @@ struct iov_iter_state { |
2219 |
+ |
2220 |
+ struct iov_iter { |
2221 |
+ u8 iter_type; |
2222 |
++ bool nofault; |
2223 |
+ bool data_source; |
2224 |
+ size_t iov_offset; |
2225 |
+ size_t count; |
2226 |
+@@ -133,7 +134,8 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, |
2227 |
+ size_t bytes, struct iov_iter *i); |
2228 |
+ void iov_iter_advance(struct iov_iter *i, size_t bytes); |
2229 |
+ void iov_iter_revert(struct iov_iter *i, size_t bytes); |
2230 |
+-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes); |
2231 |
++size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); |
2232 |
++size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); |
2233 |
+ size_t iov_iter_single_seg_count(const struct iov_iter *i); |
2234 |
+ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, |
2235 |
+ struct iov_iter *i); |
2236 |
+diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c |
2237 |
+index 09406b0e215e1..40df35088cdbd 100644 |
2238 |
+--- a/kernel/bpf/btf.c |
2239 |
++++ b/kernel/bpf/btf.c |
2240 |
+@@ -4800,10 +4800,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, |
2241 |
+ /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ |
2242 |
+ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { |
2243 |
+ const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; |
2244 |
++ u32 type, flag; |
2245 |
+ |
2246 |
+- if (ctx_arg_info->offset == off && |
2247 |
+- (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || |
2248 |
+- ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { |
2249 |
++ type = base_type(ctx_arg_info->reg_type); |
2250 |
++ flag = type_flag(ctx_arg_info->reg_type); |
2251 |
++ if (ctx_arg_info->offset == off && type == PTR_TO_BUF && |
2252 |
++ (flag & PTR_MAYBE_NULL)) { |
2253 |
+ info->reg_type = ctx_arg_info->reg_type; |
2254 |
+ return true; |
2255 |
+ } |
2256 |
+@@ -5508,9 +5510,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, |
2257 |
+ if (reg->type == PTR_TO_BTF_ID) { |
2258 |
+ reg_btf = reg->btf; |
2259 |
+ reg_ref_id = reg->btf_id; |
2260 |
+- } else if (reg2btf_ids[reg->type]) { |
2261 |
++ } else if (reg2btf_ids[base_type(reg->type)]) { |
2262 |
+ reg_btf = btf_vmlinux; |
2263 |
+- reg_ref_id = *reg2btf_ids[reg->type]; |
2264 |
++ reg_ref_id = *reg2btf_ids[base_type(reg->type)]; |
2265 |
+ } else { |
2266 |
+ bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n", |
2267 |
+ func_name, i, |
2268 |
+@@ -5717,7 +5719,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, |
2269 |
+ return -EINVAL; |
2270 |
+ } |
2271 |
+ |
2272 |
+- reg->type = PTR_TO_MEM_OR_NULL; |
2273 |
++ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL; |
2274 |
+ reg->id = ++env->id_gen; |
2275 |
+ |
2276 |
+ continue; |
2277 |
+@@ -6229,7 +6231,7 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { |
2278 |
+ .func = bpf_btf_find_by_name_kind, |
2279 |
+ .gpl_only = false, |
2280 |
+ .ret_type = RET_INTEGER, |
2281 |
+- .arg1_type = ARG_PTR_TO_MEM, |
2282 |
++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
2283 |
+ .arg2_type = ARG_CONST_SIZE, |
2284 |
+ .arg3_type = ARG_ANYTHING, |
2285 |
+ .arg4_type = ARG_ANYTHING, |
2286 |
+diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c |
2287 |
+index 7dbd68195a2b0..fe053ffd89329 100644 |
2288 |
+--- a/kernel/bpf/cgroup.c |
2289 |
++++ b/kernel/bpf/cgroup.c |
2290 |
+@@ -1753,7 +1753,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { |
2291 |
+ .gpl_only = false, |
2292 |
+ .ret_type = RET_INTEGER, |
2293 |
+ .arg1_type = ARG_PTR_TO_CTX, |
2294 |
+- .arg2_type = ARG_PTR_TO_MEM, |
2295 |
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
2296 |
+ .arg3_type = ARG_CONST_SIZE, |
2297 |
+ }; |
2298 |
+ |
2299 |
+diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c |
2300 |
+index 6f600cc95ccda..a711ffe238932 100644 |
2301 |
+--- a/kernel/bpf/helpers.c |
2302 |
++++ b/kernel/bpf/helpers.c |
2303 |
+@@ -530,7 +530,7 @@ const struct bpf_func_proto bpf_strtol_proto = { |
2304 |
+ .func = bpf_strtol, |
2305 |
+ .gpl_only = false, |
2306 |
+ .ret_type = RET_INTEGER, |
2307 |
+- .arg1_type = ARG_PTR_TO_MEM, |
2308 |
++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
2309 |
+ .arg2_type = ARG_CONST_SIZE, |
2310 |
+ .arg3_type = ARG_ANYTHING, |
2311 |
+ .arg4_type = ARG_PTR_TO_LONG, |
2312 |
+@@ -558,7 +558,7 @@ const struct bpf_func_proto bpf_strtoul_proto = { |
2313 |
+ .func = bpf_strtoul, |
2314 |
+ .gpl_only = false, |
2315 |
+ .ret_type = RET_INTEGER, |
2316 |
+- .arg1_type = ARG_PTR_TO_MEM, |
2317 |
++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
2318 |
+ .arg2_type = ARG_CONST_SIZE, |
2319 |
+ .arg3_type = ARG_ANYTHING, |
2320 |
+ .arg4_type = ARG_PTR_TO_LONG, |
2321 |
+@@ -630,7 +630,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = { |
2322 |
+ .arg1_type = ARG_PTR_TO_CTX, |
2323 |
+ .arg2_type = ARG_CONST_MAP_PTR, |
2324 |
+ .arg3_type = ARG_ANYTHING, |
2325 |
+- .arg4_type = ARG_PTR_TO_MEM, |
2326 |
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
2327 |
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO, |
2328 |
+ }; |
2329 |
+ |
2330 |
+@@ -667,7 +667,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) |
2331 |
+ const struct bpf_func_proto bpf_per_cpu_ptr_proto = { |
2332 |
+ .func = bpf_per_cpu_ptr, |
2333 |
+ .gpl_only = false, |
2334 |
+- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, |
2335 |
++ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, |
2336 |
+ .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, |
2337 |
+ .arg2_type = ARG_ANYTHING, |
2338 |
+ }; |
2339 |
+@@ -680,7 +680,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) |
2340 |
+ const struct bpf_func_proto bpf_this_cpu_ptr_proto = { |
2341 |
+ .func = bpf_this_cpu_ptr, |
2342 |
+ .gpl_only = false, |
2343 |
+- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, |
2344 |
++ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, |
2345 |
+ .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, |
2346 |
+ }; |
2347 |
+ |
2348 |
+@@ -1013,7 +1013,7 @@ const struct bpf_func_proto bpf_snprintf_proto = { |
2349 |
+ .arg1_type = ARG_PTR_TO_MEM_OR_NULL, |
2350 |
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO, |
2351 |
+ .arg3_type = ARG_PTR_TO_CONST_STR, |
2352 |
+- .arg4_type = ARG_PTR_TO_MEM_OR_NULL, |
2353 |
++ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, |
2354 |
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO, |
2355 |
+ }; |
2356 |
+ |
2357 |
+diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c |
2358 |
+index 6a9542af4212a..b0fa190b09790 100644 |
2359 |
+--- a/kernel/bpf/map_iter.c |
2360 |
++++ b/kernel/bpf/map_iter.c |
2361 |
+@@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { |
2362 |
+ .ctx_arg_info_size = 2, |
2363 |
+ .ctx_arg_info = { |
2364 |
+ { offsetof(struct bpf_iter__bpf_map_elem, key), |
2365 |
+- PTR_TO_RDONLY_BUF_OR_NULL }, |
2366 |
++ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, |
2367 |
+ { offsetof(struct bpf_iter__bpf_map_elem, value), |
2368 |
+- PTR_TO_RDWR_BUF_OR_NULL }, |
2369 |
++ PTR_TO_BUF | PTR_MAYBE_NULL }, |
2370 |
+ }, |
2371 |
+ }; |
2372 |
+ |
2373 |
+diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c |
2374 |
+index f1c51c45667d3..710ba9de12ce4 100644 |
2375 |
+--- a/kernel/bpf/ringbuf.c |
2376 |
++++ b/kernel/bpf/ringbuf.c |
2377 |
+@@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = { |
2378 |
+ .func = bpf_ringbuf_output, |
2379 |
+ .ret_type = RET_INTEGER, |
2380 |
+ .arg1_type = ARG_CONST_MAP_PTR, |
2381 |
+- .arg2_type = ARG_PTR_TO_MEM, |
2382 |
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
2383 |
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO, |
2384 |
+ .arg4_type = ARG_ANYTHING, |
2385 |
+ }; |
2386 |
+diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c |
2387 |
+index 42490c39dfbf5..48e02a725563f 100644 |
2388 |
+--- a/kernel/bpf/syscall.c |
2389 |
++++ b/kernel/bpf/syscall.c |
2390 |
+@@ -4753,7 +4753,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = { |
2391 |
+ .gpl_only = false, |
2392 |
+ .ret_type = RET_INTEGER, |
2393 |
+ .arg1_type = ARG_ANYTHING, |
2394 |
+- .arg2_type = ARG_PTR_TO_MEM, |
2395 |
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
2396 |
+ .arg3_type = ARG_CONST_SIZE, |
2397 |
+ }; |
2398 |
+ |
2399 |
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c |
2400 |
+index 670721e39c0e8..d2b119b4fbe74 100644 |
2401 |
+--- a/kernel/bpf/verifier.c |
2402 |
++++ b/kernel/bpf/verifier.c |
2403 |
+@@ -445,18 +445,6 @@ static bool reg_type_not_null(enum bpf_reg_type type) |
2404 |
+ type == PTR_TO_SOCK_COMMON; |
2405 |
+ } |
2406 |
+ |
2407 |
+-static bool reg_type_may_be_null(enum bpf_reg_type type) |
2408 |
+-{ |
2409 |
+- return type == PTR_TO_MAP_VALUE_OR_NULL || |
2410 |
+- type == PTR_TO_SOCKET_OR_NULL || |
2411 |
+- type == PTR_TO_SOCK_COMMON_OR_NULL || |
2412 |
+- type == PTR_TO_TCP_SOCK_OR_NULL || |
2413 |
+- type == PTR_TO_BTF_ID_OR_NULL || |
2414 |
+- type == PTR_TO_MEM_OR_NULL || |
2415 |
+- type == PTR_TO_RDONLY_BUF_OR_NULL || |
2416 |
+- type == PTR_TO_RDWR_BUF_OR_NULL; |
2417 |
+-} |
2418 |
+- |
2419 |
+ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) |
2420 |
+ { |
2421 |
+ return reg->type == PTR_TO_MAP_VALUE && |
2422 |
+@@ -465,12 +453,14 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) |
2423 |
+ |
2424 |
+ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) |
2425 |
+ { |
2426 |
+- return type == PTR_TO_SOCKET || |
2427 |
+- type == PTR_TO_SOCKET_OR_NULL || |
2428 |
+- type == PTR_TO_TCP_SOCK || |
2429 |
+- type == PTR_TO_TCP_SOCK_OR_NULL || |
2430 |
+- type == PTR_TO_MEM || |
2431 |
+- type == PTR_TO_MEM_OR_NULL; |
2432 |
++ return base_type(type) == PTR_TO_SOCKET || |
2433 |
++ base_type(type) == PTR_TO_TCP_SOCK || |
2434 |
++ base_type(type) == PTR_TO_MEM; |
2435 |
++} |
2436 |
++ |
2437 |
++static bool type_is_rdonly_mem(u32 type) |
2438 |
++{ |
2439 |
++ return type & MEM_RDONLY; |
2440 |
+ } |
2441 |
+ |
2442 |
+ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) |
2443 |
+@@ -478,14 +468,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) |
2444 |
+ return type == ARG_PTR_TO_SOCK_COMMON; |
2445 |
+ } |
2446 |
+ |
2447 |
+-static bool arg_type_may_be_null(enum bpf_arg_type type) |
2448 |
++static bool type_may_be_null(u32 type) |
2449 |
+ { |
2450 |
+- return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || |
2451 |
+- type == ARG_PTR_TO_MEM_OR_NULL || |
2452 |
+- type == ARG_PTR_TO_CTX_OR_NULL || |
2453 |
+- type == ARG_PTR_TO_SOCKET_OR_NULL || |
2454 |
+- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || |
2455 |
+- type == ARG_PTR_TO_STACK_OR_NULL; |
2456 |
++ return type & PTR_MAYBE_NULL; |
2457 |
+ } |
2458 |
+ |
2459 |
+ /* Determine whether the function releases some resources allocated by another |
2460 |
+@@ -545,39 +530,54 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) |
2461 |
+ insn->imm == BPF_CMPXCHG; |
2462 |
+ } |
2463 |
+ |
2464 |
+-/* string representation of 'enum bpf_reg_type' */ |
2465 |
+-static const char * const reg_type_str[] = { |
2466 |
+- [NOT_INIT] = "?", |
2467 |
+- [SCALAR_VALUE] = "inv", |
2468 |
+- [PTR_TO_CTX] = "ctx", |
2469 |
+- [CONST_PTR_TO_MAP] = "map_ptr", |
2470 |
+- [PTR_TO_MAP_VALUE] = "map_value", |
2471 |
+- [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", |
2472 |
+- [PTR_TO_STACK] = "fp", |
2473 |
+- [PTR_TO_PACKET] = "pkt", |
2474 |
+- [PTR_TO_PACKET_META] = "pkt_meta", |
2475 |
+- [PTR_TO_PACKET_END] = "pkt_end", |
2476 |
+- [PTR_TO_FLOW_KEYS] = "flow_keys", |
2477 |
+- [PTR_TO_SOCKET] = "sock", |
2478 |
+- [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", |
2479 |
+- [PTR_TO_SOCK_COMMON] = "sock_common", |
2480 |
+- [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", |
2481 |
+- [PTR_TO_TCP_SOCK] = "tcp_sock", |
2482 |
+- [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", |
2483 |
+- [PTR_TO_TP_BUFFER] = "tp_buffer", |
2484 |
+- [PTR_TO_XDP_SOCK] = "xdp_sock", |
2485 |
+- [PTR_TO_BTF_ID] = "ptr_", |
2486 |
+- [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", |
2487 |
+- [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", |
2488 |
+- [PTR_TO_MEM] = "mem", |
2489 |
+- [PTR_TO_MEM_OR_NULL] = "mem_or_null", |
2490 |
+- [PTR_TO_RDONLY_BUF] = "rdonly_buf", |
2491 |
+- [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", |
2492 |
+- [PTR_TO_RDWR_BUF] = "rdwr_buf", |
2493 |
+- [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", |
2494 |
+- [PTR_TO_FUNC] = "func", |
2495 |
+- [PTR_TO_MAP_KEY] = "map_key", |
2496 |
+-}; |
2497 |
++/* string representation of 'enum bpf_reg_type' |
2498 |
++ * |
2499 |
++ * Note that reg_type_str() can not appear more than once in a single verbose() |
2500 |
++ * statement. |
2501 |
++ */ |
2502 |
++static const char *reg_type_str(struct bpf_verifier_env *env, |
2503 |
++ enum bpf_reg_type type) |
2504 |
++{ |
2505 |
++ char postfix[16] = {0}, prefix[16] = {0}; |
2506 |
++ static const char * const str[] = { |
2507 |
++ [NOT_INIT] = "?", |
2508 |
++ [SCALAR_VALUE] = "inv", |
2509 |
++ [PTR_TO_CTX] = "ctx", |
2510 |
++ [CONST_PTR_TO_MAP] = "map_ptr", |
2511 |
++ [PTR_TO_MAP_VALUE] = "map_value", |
2512 |
++ [PTR_TO_STACK] = "fp", |
2513 |
++ [PTR_TO_PACKET] = "pkt", |
2514 |
++ [PTR_TO_PACKET_META] = "pkt_meta", |
2515 |
++ [PTR_TO_PACKET_END] = "pkt_end", |
2516 |
++ [PTR_TO_FLOW_KEYS] = "flow_keys", |
2517 |
++ [PTR_TO_SOCKET] = "sock", |
2518 |
++ [PTR_TO_SOCK_COMMON] = "sock_common", |
2519 |
++ [PTR_TO_TCP_SOCK] = "tcp_sock", |
2520 |
++ [PTR_TO_TP_BUFFER] = "tp_buffer", |
2521 |
++ [PTR_TO_XDP_SOCK] = "xdp_sock", |
2522 |
++ [PTR_TO_BTF_ID] = "ptr_", |
2523 |
++ [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", |
2524 |
++ [PTR_TO_MEM] = "mem", |
2525 |
++ [PTR_TO_BUF] = "buf", |
2526 |
++ [PTR_TO_FUNC] = "func", |
2527 |
++ [PTR_TO_MAP_KEY] = "map_key", |
2528 |
++ }; |
2529 |
++ |
2530 |
++ if (type & PTR_MAYBE_NULL) { |
2531 |
++ if (base_type(type) == PTR_TO_BTF_ID || |
2532 |
++ base_type(type) == PTR_TO_PERCPU_BTF_ID) |
2533 |
++ strncpy(postfix, "or_null_", 16); |
2534 |
++ else |
2535 |
++ strncpy(postfix, "_or_null", 16); |
2536 |
++ } |
2537 |
++ |
2538 |
++ if (type & MEM_RDONLY) |
2539 |
++ strncpy(prefix, "rdonly_", 16); |
2540 |
++ |
2541 |
++ snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", |
2542 |
++ prefix, str[base_type(type)], postfix); |
2543 |
++ return env->type_str_buf; |
2544 |
++} |
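With the flags factored out, one table plus a prefix and a postfix buffer covers every combination the old enum spelled out by hand; PTR_TO_BUF | MEM_RDONLY | PTR_MAYBE_NULL renders as "rdonly_buf_or_null", matching the retired PTR_TO_RDONLY_BUF_OR_NULL string. A runnable sketch of the composition:

#include <stdio.h>

static const char *type_str(char *buf, size_t len, const char *base,
			    int rdonly, int maybe_null)
{
	snprintf(buf, len, "%s%s%s", rdonly ? "rdonly_" : "",
		 base, maybe_null ? "_or_null" : "");
	return buf;
}

int main(void)
{
	char buf[64];

	puts(type_str(buf, sizeof(buf), "buf", 1, 1));	/* rdonly_buf_or_null */
	return 0;
}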
2545 |
+ |
2546 |
+ static char slot_type_char[] = { |
2547 |
+ [STACK_INVALID] = '?', |
2548 |
+@@ -628,7 +628,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, |
2549 |
+ continue; |
2550 |
+ verbose(env, " R%d", i); |
2551 |
+ print_liveness(env, reg->live); |
2552 |
+- verbose(env, "=%s", reg_type_str[t]); |
2553 |
++ verbose(env, "=%s", reg_type_str(env, t)); |
2554 |
+ if (t == SCALAR_VALUE && reg->precise) |
2555 |
+ verbose(env, "P"); |
2556 |
+ if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && |
2557 |
+@@ -636,9 +636,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, |
2558 |
+ /* reg->off should be 0 for SCALAR_VALUE */ |
2559 |
+ verbose(env, "%lld", reg->var_off.value + reg->off); |
2560 |
+ } else { |
2561 |
+- if (t == PTR_TO_BTF_ID || |
2562 |
+- t == PTR_TO_BTF_ID_OR_NULL || |
2563 |
+- t == PTR_TO_PERCPU_BTF_ID) |
2564 |
++ if (base_type(t) == PTR_TO_BTF_ID || |
2565 |
++ base_type(t) == PTR_TO_PERCPU_BTF_ID) |
2566 |
+ verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); |
2567 |
+ verbose(env, "(id=%d", reg->id); |
2568 |
+ if (reg_type_may_be_refcounted_or_null(t)) |
2569 |
+@@ -647,10 +646,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, |
2570 |
+ verbose(env, ",off=%d", reg->off); |
2571 |
+ if (type_is_pkt_pointer(t)) |
2572 |
+ verbose(env, ",r=%d", reg->range); |
2573 |
+- else if (t == CONST_PTR_TO_MAP || |
2574 |
+- t == PTR_TO_MAP_KEY || |
2575 |
+- t == PTR_TO_MAP_VALUE || |
2576 |
+- t == PTR_TO_MAP_VALUE_OR_NULL) |
2577 |
++ else if (base_type(t) == CONST_PTR_TO_MAP || |
2578 |
++ base_type(t) == PTR_TO_MAP_KEY || |
2579 |
++ base_type(t) == PTR_TO_MAP_VALUE) |
2580 |
+ verbose(env, ",ks=%d,vs=%d", |
2581 |
+ reg->map_ptr->key_size, |
2582 |
+ reg->map_ptr->value_size); |
2583 |
+@@ -720,7 +718,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, |
2584 |
+ if (state->stack[i].slot_type[0] == STACK_SPILL) { |
2585 |
+ reg = &state->stack[i].spilled_ptr; |
2586 |
+ t = reg->type; |
2587 |
+- verbose(env, "=%s", reg_type_str[t]); |
2588 |
++ verbose(env, "=%s", reg_type_str(env, t)); |
2589 |
+ if (t == SCALAR_VALUE && reg->precise) |
2590 |
+ verbose(env, "P"); |
2591 |
+ if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) |
2592 |
+@@ -1133,8 +1131,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, |
2593 |
+ |
2594 |
+ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) |
2595 |
+ { |
2596 |
+- switch (reg->type) { |
2597 |
+- case PTR_TO_MAP_VALUE_OR_NULL: { |
2598 |
++ if (base_type(reg->type) == PTR_TO_MAP_VALUE) { |
2599 |
+ const struct bpf_map *map = reg->map_ptr; |
2600 |
+ |
2601 |
+ if (map->inner_map_meta) { |
2602 |
+@@ -1153,32 +1150,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) |
2603 |
+ } else { |
2604 |
+ reg->type = PTR_TO_MAP_VALUE; |
2605 |
+ } |
2606 |
+- break; |
2607 |
+- } |
2608 |
+- case PTR_TO_SOCKET_OR_NULL: |
2609 |
+- reg->type = PTR_TO_SOCKET; |
2610 |
+- break; |
2611 |
+- case PTR_TO_SOCK_COMMON_OR_NULL: |
2612 |
+- reg->type = PTR_TO_SOCK_COMMON; |
2613 |
+- break; |
2614 |
+- case PTR_TO_TCP_SOCK_OR_NULL: |
2615 |
+- reg->type = PTR_TO_TCP_SOCK; |
2616 |
+- break; |
2617 |
+- case PTR_TO_BTF_ID_OR_NULL: |
2618 |
+- reg->type = PTR_TO_BTF_ID; |
2619 |
+- break; |
2620 |
+- case PTR_TO_MEM_OR_NULL: |
2621 |
+- reg->type = PTR_TO_MEM; |
2622 |
+- break; |
2623 |
+- case PTR_TO_RDONLY_BUF_OR_NULL: |
2624 |
+- reg->type = PTR_TO_RDONLY_BUF; |
2625 |
+- break; |
2626 |
+- case PTR_TO_RDWR_BUF_OR_NULL: |
2627 |
+- reg->type = PTR_TO_RDWR_BUF; |
2628 |
+- break; |
2629 |
+- default: |
2630 |
+- WARN_ONCE(1, "unknown nullable register type"); |
2631 |
++ return; |
2632 |
+ } |
2633 |
++ |
2634 |
++ reg->type &= ~PTR_MAYBE_NULL; |
2635 |
+ } |
2636 |
+ |
2637 |
+ static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) |
2638 |
+@@ -1906,7 +1881,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, |
2639 |
+ break; |
2640 |
+ if (parent->live & REG_LIVE_DONE) { |
2641 |
+ verbose(env, "verifier BUG type %s var_off %lld off %d\n", |
2642 |
+- reg_type_str[parent->type], |
2643 |
++ reg_type_str(env, parent->type), |
2644 |
+ parent->var_off.value, parent->off); |
2645 |
+ return -EFAULT; |
2646 |
+ } |
2647 |
+@@ -2564,9 +2539,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) |
2648 |
+ |
2649 |
+ static bool is_spillable_regtype(enum bpf_reg_type type) |
2650 |
+ { |
2651 |
+- switch (type) { |
2652 |
++ switch (base_type(type)) { |
2653 |
+ case PTR_TO_MAP_VALUE: |
2654 |
+- case PTR_TO_MAP_VALUE_OR_NULL: |
2655 |
+ case PTR_TO_STACK: |
2656 |
+ case PTR_TO_CTX: |
2657 |
+ case PTR_TO_PACKET: |
2658 |
+@@ -2575,21 +2549,13 @@ static bool is_spillable_regtype(enum bpf_reg_type type) |
2659 |
+ case PTR_TO_FLOW_KEYS: |
2660 |
+ case CONST_PTR_TO_MAP: |
2661 |
+ case PTR_TO_SOCKET: |
2662 |
+- case PTR_TO_SOCKET_OR_NULL: |
2663 |
+ case PTR_TO_SOCK_COMMON: |
2664 |
+- case PTR_TO_SOCK_COMMON_OR_NULL: |
2665 |
+ case PTR_TO_TCP_SOCK: |
2666 |
+- case PTR_TO_TCP_SOCK_OR_NULL: |
2667 |
+ case PTR_TO_XDP_SOCK: |
2668 |
+ case PTR_TO_BTF_ID: |
2669 |
+- case PTR_TO_BTF_ID_OR_NULL: |
2670 |
+- case PTR_TO_RDONLY_BUF: |
2671 |
+- case PTR_TO_RDONLY_BUF_OR_NULL: |
2672 |
+- case PTR_TO_RDWR_BUF: |
2673 |
+- case PTR_TO_RDWR_BUF_OR_NULL: |
2674 |
++ case PTR_TO_BUF: |
2675 |
+ case PTR_TO_PERCPU_BTF_ID: |
2676 |
+ case PTR_TO_MEM: |
2677 |
+- case PTR_TO_MEM_OR_NULL: |
2678 |
+ case PTR_TO_FUNC: |
2679 |
+ case PTR_TO_MAP_KEY: |
2680 |
+ return true; |
2681 |
+@@ -3405,7 +3371,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, |
2682 |
+ */ |
2683 |
+ *reg_type = info.reg_type; |
2684 |
+ |
2685 |
+- if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) { |
2686 |
++ if (base_type(*reg_type) == PTR_TO_BTF_ID) { |
2687 |
+ *btf = info.btf; |
2688 |
+ *btf_id = info.btf_id; |
2689 |
+ } else { |
2690 |
+@@ -3473,7 +3439,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, |
2691 |
+ } |
2692 |
+ |
2693 |
+ verbose(env, "R%d invalid %s access off=%d size=%d\n", |
2694 |
+- regno, reg_type_str[reg->type], off, size); |
2695 |
++ regno, reg_type_str(env, reg->type), off, size); |
2696 |
+ |
2697 |
+ return -EACCES; |
2698 |
+ } |
2699 |
+@@ -4200,15 +4166,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn |
2700 |
+ mark_reg_unknown(env, regs, value_regno); |
2701 |
+ } |
2702 |
+ } |
2703 |
+- } else if (reg->type == PTR_TO_MEM) { |
2704 |
++ } else if (base_type(reg->type) == PTR_TO_MEM) { |
2705 |
++ bool rdonly_mem = type_is_rdonly_mem(reg->type); |
2706 |
++ |
2707 |
++ if (type_may_be_null(reg->type)) { |
2708 |
++ verbose(env, "R%d invalid mem access '%s'\n", regno, |
2709 |
++ reg_type_str(env, reg->type)); |
2710 |
++ return -EACCES; |
2711 |
++ } |
2712 |
++ |
2713 |
++ if (t == BPF_WRITE && rdonly_mem) { |
2714 |
++ verbose(env, "R%d cannot write into %s\n", |
2715 |
++ regno, reg_type_str(env, reg->type)); |
2716 |
++ return -EACCES; |
2717 |
++ } |
2718 |
++ |
2719 |
+ if (t == BPF_WRITE && value_regno >= 0 && |
2720 |
+ is_pointer_value(env, value_regno)) { |
2721 |
+ verbose(env, "R%d leaks addr into mem\n", value_regno); |
2722 |
+ return -EACCES; |
2723 |
+ } |
2724 |
++ |
2725 |
+ err = check_mem_region_access(env, regno, off, size, |
2726 |
+ reg->mem_size, false); |
2727 |
+- if (!err && t == BPF_READ && value_regno >= 0) |
2728 |
++ if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) |
2729 |
+ mark_reg_unknown(env, regs, value_regno); |
2730 |
+ } else if (reg->type == PTR_TO_CTX) { |
2731 |
+ enum bpf_reg_type reg_type = SCALAR_VALUE; |
2732 |
+@@ -4238,7 +4219,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn |
2733 |
+ } else { |
2734 |
+ mark_reg_known_zero(env, regs, |
2735 |
+ value_regno); |
2736 |
+- if (reg_type_may_be_null(reg_type)) |
2737 |
++ if (type_may_be_null(reg_type)) |
2738 |
+ regs[value_regno].id = ++env->id_gen; |
2739 |
+ /* A load of ctx field could have different |
2740 |
+ * actual load size with the one encoded in the |
2741 |
+@@ -4246,8 +4227,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn |
2742 |
+ * a sub-register. |
2743 |
+ */ |
2744 |
+ regs[value_regno].subreg_def = DEF_NOT_SUBREG; |
2745 |
+- if (reg_type == PTR_TO_BTF_ID || |
2746 |
+- reg_type == PTR_TO_BTF_ID_OR_NULL) { |
2747 |
++ if (base_type(reg_type) == PTR_TO_BTF_ID) { |
2748 |
+ regs[value_regno].btf = btf; |
2749 |
+ regs[value_regno].btf_id = btf_id; |
2750 |
+ } |
2751 |
+@@ -4300,7 +4280,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn |
2752 |
+ } else if (type_is_sk_pointer(reg->type)) { |
2753 |
+ if (t == BPF_WRITE) { |
2754 |
+ verbose(env, "R%d cannot write into %s\n", |
2755 |
+- regno, reg_type_str[reg->type]); |
2756 |
++ regno, reg_type_str(env, reg->type)); |
2757 |
+ return -EACCES; |
2758 |
+ } |
2759 |
+ err = check_sock_access(env, insn_idx, regno, off, size, t); |
2760 |
+@@ -4316,26 +4296,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn |
2761 |
+ } else if (reg->type == CONST_PTR_TO_MAP) { |
2762 |
+ err = check_ptr_to_map_access(env, regs, regno, off, size, t, |
2763 |
+ value_regno); |
2764 |
+- } else if (reg->type == PTR_TO_RDONLY_BUF) { |
2765 |
+- if (t == BPF_WRITE) { |
2766 |
+- verbose(env, "R%d cannot write into %s\n", |
2767 |
+- regno, reg_type_str[reg->type]); |
2768 |
+- return -EACCES; |
2769 |
++ } else if (base_type(reg->type) == PTR_TO_BUF) { |
2770 |
++ bool rdonly_mem = type_is_rdonly_mem(reg->type); |
2771 |
++ const char *buf_info; |
2772 |
++ u32 *max_access; |
2773 |
++ |
2774 |
++ if (rdonly_mem) { |
2775 |
++ if (t == BPF_WRITE) { |
2776 |
++ verbose(env, "R%d cannot write into %s\n", |
2777 |
++ regno, reg_type_str(env, reg->type)); |
2778 |
++ return -EACCES; |
2779 |
++ } |
2780 |
++ buf_info = "rdonly"; |
2781 |
++ max_access = &env->prog->aux->max_rdonly_access; |
2782 |
++ } else { |
2783 |
++ buf_info = "rdwr"; |
2784 |
++ max_access = &env->prog->aux->max_rdwr_access; |
2785 |
+ } |
2786 |
++ |
2787 |
+ err = check_buffer_access(env, reg, regno, off, size, false, |
2788 |
+- "rdonly", |
2789 |
+- &env->prog->aux->max_rdonly_access); |
2790 |
+- if (!err && value_regno >= 0) |
2791 |
+- mark_reg_unknown(env, regs, value_regno); |
2792 |
+- } else if (reg->type == PTR_TO_RDWR_BUF) { |
2793 |
+- err = check_buffer_access(env, reg, regno, off, size, false, |
2794 |
+- "rdwr", |
2795 |
+- &env->prog->aux->max_rdwr_access); |
2796 |
+- if (!err && t == BPF_READ && value_regno >= 0) |
2797 |
++ buf_info, max_access); |
2798 |
++ |
2799 |
++ if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) |
2800 |
+ mark_reg_unknown(env, regs, value_regno); |
2801 |
+ } else { |
2802 |
+ verbose(env, "R%d invalid mem access '%s'\n", regno, |
2803 |
+- reg_type_str[reg->type]); |
2804 |
++ reg_type_str(env, reg->type)); |
2805 |
+ return -EACCES; |
2806 |
+ } |
2807 |
+ |
2808 |
+@@ -4409,7 +4395,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i |
2809 |
+ is_sk_reg(env, insn->dst_reg)) { |
2810 |
+ verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", |
2811 |
+ insn->dst_reg, |
2812 |
+- reg_type_str[reg_state(env, insn->dst_reg)->type]); |
2813 |
++ reg_type_str(env, reg_state(env, insn->dst_reg)->type)); |
2814 |
+ return -EACCES; |
2815 |
+ } |
2816 |
+ |
2817 |
+@@ -4592,8 +4578,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, |
2818 |
+ struct bpf_call_arg_meta *meta) |
2819 |
+ { |
2820 |
+ struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; |
2821 |
++ const char *buf_info; |
2822 |
++ u32 *max_access; |
2823 |
+ |
2824 |
+- switch (reg->type) { |
2825 |
++ switch (base_type(reg->type)) { |
2826 |
+ case PTR_TO_PACKET: |
2827 |
+ case PTR_TO_PACKET_META: |
2828 |
+ return check_packet_access(env, regno, reg->off, access_size, |
2829 |
+@@ -4612,18 +4600,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, |
2830 |
+ return check_mem_region_access(env, regno, reg->off, |
2831 |
+ access_size, reg->mem_size, |
2832 |
+ zero_size_allowed); |
2833 |
+- case PTR_TO_RDONLY_BUF: |
2834 |
+- if (meta && meta->raw_mode) |
2835 |
+- return -EACCES; |
2836 |
+- return check_buffer_access(env, reg, regno, reg->off, |
2837 |
+- access_size, zero_size_allowed, |
2838 |
+- "rdonly", |
2839 |
+- &env->prog->aux->max_rdonly_access); |
2840 |
+- case PTR_TO_RDWR_BUF: |
2841 |
++ case PTR_TO_BUF: |
2842 |
++ if (type_is_rdonly_mem(reg->type)) { |
2843 |
++ if (meta && meta->raw_mode) |
2844 |
++ return -EACCES; |
2845 |
++ |
2846 |
++ buf_info = "rdonly"; |
2847 |
++ max_access = &env->prog->aux->max_rdonly_access; |
2848 |
++ } else { |
2849 |
++ buf_info = "rdwr"; |
2850 |
++ max_access = &env->prog->aux->max_rdwr_access; |
2851 |
++ } |
2852 |
+ return check_buffer_access(env, reg, regno, reg->off, |
2853 |
+ access_size, zero_size_allowed, |
2854 |
+- "rdwr", |
2855 |
+- &env->prog->aux->max_rdwr_access); |
2856 |
++ buf_info, max_access); |
2857 |
+ case PTR_TO_STACK: |
2858 |
+ return check_stack_range_initialized( |
2859 |
+ env, |
2860 |
+@@ -4635,9 +4625,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, |
2861 |
+ register_is_null(reg)) |
2862 |
+ return 0; |
2863 |
+ |
2864 |
+- verbose(env, "R%d type=%s expected=%s\n", regno, |
2865 |
+- reg_type_str[reg->type], |
2866 |
+- reg_type_str[PTR_TO_STACK]); |
2867 |
++ verbose(env, "R%d type=%s ", regno, |
2868 |
++ reg_type_str(env, reg->type)); |
2869 |
++ verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); |
2870 |
+ return -EACCES; |
2871 |
+ } |
2872 |
+ } |
2873 |
+@@ -4648,7 +4638,7 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, |
2874 |
+ if (register_is_null(reg)) |
2875 |
+ return 0; |
2876 |
+ |
2877 |
+- if (reg_type_may_be_null(reg->type)) { |
2878 |
++ if (type_may_be_null(reg->type)) { |
2879 |
+ /* Assuming that the register contains a value check if the memory |
2880 |
+ * access is safe. Temporarily save and restore the register's state as |
2881 |
+ * the conversion shouldn't be visible to a caller. |
2882 |
+@@ -4796,9 +4786,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, |
2883 |
+ |
2884 |
+ static bool arg_type_is_mem_ptr(enum bpf_arg_type type) |
2885 |
+ { |
2886 |
+- return type == ARG_PTR_TO_MEM || |
2887 |
+- type == ARG_PTR_TO_MEM_OR_NULL || |
2888 |
+- type == ARG_PTR_TO_UNINIT_MEM; |
2889 |
++ return base_type(type) == ARG_PTR_TO_MEM || |
2890 |
++ base_type(type) == ARG_PTR_TO_UNINIT_MEM; |
2891 |
+ } |
2892 |
+ |
2893 |
+ static bool arg_type_is_mem_size(enum bpf_arg_type type) |
2894 |
+@@ -4900,8 +4889,7 @@ static const struct bpf_reg_types mem_types = { |
2895 |
+ PTR_TO_MAP_KEY, |
2896 |
+ PTR_TO_MAP_VALUE, |
2897 |
+ PTR_TO_MEM, |
2898 |
+- PTR_TO_RDONLY_BUF, |
2899 |
+- PTR_TO_RDWR_BUF, |
2900 |
++ PTR_TO_BUF, |
2901 |
+ }, |
2902 |
+ }; |
2903 |
+ |
2904 |
+@@ -4932,31 +4920,26 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { |
2905 |
+ [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, |
2906 |
+ [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, |
2907 |
+ [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, |
2908 |
+- [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, |
2909 |
+ [ARG_CONST_SIZE] = &scalar_types, |
2910 |
+ [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, |
2911 |
+ [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, |
2912 |
+ [ARG_CONST_MAP_PTR] = &const_map_ptr_types, |
2913 |
+ [ARG_PTR_TO_CTX] = &context_types, |
2914 |
+- [ARG_PTR_TO_CTX_OR_NULL] = &context_types, |
2915 |
+ [ARG_PTR_TO_SOCK_COMMON] = &sock_types, |
2916 |
+ #ifdef CONFIG_NET |
2917 |
+ [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, |
2918 |
+ #endif |
2919 |
+ [ARG_PTR_TO_SOCKET] = &fullsock_types, |
2920 |
+- [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, |
2921 |
+ [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, |
2922 |
+ [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, |
2923 |
+ [ARG_PTR_TO_MEM] = &mem_types, |
2924 |
+- [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, |
2925 |
+ [ARG_PTR_TO_UNINIT_MEM] = &mem_types, |
2926 |
+ [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, |
2927 |
+- [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, |
2928 |
+ [ARG_PTR_TO_INT] = &int_ptr_types, |
2929 |
+ [ARG_PTR_TO_LONG] = &int_ptr_types, |
2930 |
+ [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, |
2931 |
+ [ARG_PTR_TO_FUNC] = &func_ptr_types, |
2932 |
+- [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, |
2933 |
++ [ARG_PTR_TO_STACK] = &stack_ptr_types, |
2934 |
+ [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, |
2935 |
+ [ARG_PTR_TO_TIMER] = &timer_types, |
2936 |
+ }; |
2937 |
+@@ -4970,12 +4953,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, |
2938 |
+ const struct bpf_reg_types *compatible; |
2939 |
+ int i, j; |
2940 |
+ |
2941 |
+- compatible = compatible_reg_types[arg_type]; |
2942 |
++ compatible = compatible_reg_types[base_type(arg_type)]; |
2943 |
+ if (!compatible) { |
2944 |
+ verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); |
2945 |
+ return -EFAULT; |
2946 |
+ } |
2947 |
+ |
2948 |
++ /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY, |
2949 |
++ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY |
2950 |
++ * |
2951 |
++ * Same for MAYBE_NULL: |
2952 |
++ * |
2953 |
++ * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, |
2954 |
++ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL |
2955 |
++ * |
2956 |
++ * Therefore we fold these flags depending on the arg_type before comparison. |
2957 |
++ */ |
2958 |
++ if (arg_type & MEM_RDONLY) |
2959 |
++ type &= ~MEM_RDONLY; |
2960 |
++ if (arg_type & PTR_MAYBE_NULL) |
2961 |
++ type &= ~PTR_MAYBE_NULL; |
2962 |
++ |
2963 |
+ for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { |
2964 |
+ expected = compatible->types[i]; |
2965 |
+ if (expected == NOT_INIT) |
2966 |
+@@ -4985,14 +4983,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, |
2967 |
+ goto found; |
2968 |
+ } |
2969 |
+ |
2970 |
+- verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); |
2971 |
++ verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); |
2972 |
+ for (j = 0; j + 1 < i; j++) |
2973 |
+- verbose(env, "%s, ", reg_type_str[compatible->types[j]]); |
2974 |
+- verbose(env, "%s\n", reg_type_str[compatible->types[j]]); |
2975 |
++ verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); |
2976 |
++ verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); |
2977 |
+ return -EACCES; |
2978 |
+ |
2979 |
+ found: |
2980 |
+- if (type == PTR_TO_BTF_ID) { |
2981 |
++ if (reg->type == PTR_TO_BTF_ID) { |
2982 |
+ if (!arg_btf_id) { |
2983 |
+ if (!compatible->btf_id) { |
2984 |
+ verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); |
2985 |
+@@ -5051,15 +5049,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, |
2986 |
+ return -EACCES; |
2987 |
+ } |
2988 |
+ |
2989 |
+- if (arg_type == ARG_PTR_TO_MAP_VALUE || |
2990 |
+- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || |
2991 |
+- arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { |
2992 |
++ if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || |
2993 |
++ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { |
2994 |
+ err = resolve_map_arg_type(env, meta, &arg_type); |
2995 |
+ if (err) |
2996 |
+ return err; |
2997 |
+ } |
2998 |
+ |
2999 |
+- if (register_is_null(reg) && arg_type_may_be_null(arg_type)) |
3000 |
++ if (register_is_null(reg) && type_may_be_null(arg_type)) |
3001 |
+ /* A NULL register has a SCALAR_VALUE type, so skip |
3002 |
+ * type checking. |
3003 |
+ */ |
3004 |
+@@ -5128,10 +5125,11 @@ skip_type_check: |
3005 |
+ err = check_helper_mem_access(env, regno, |
3006 |
+ meta->map_ptr->key_size, false, |
3007 |
+ NULL); |
3008 |
+- } else if (arg_type == ARG_PTR_TO_MAP_VALUE || |
3009 |
+- (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && |
3010 |
+- !register_is_null(reg)) || |
3011 |
+- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { |
3012 |
++ } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || |
3013 |
++ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { |
3014 |
++ if (type_may_be_null(arg_type) && register_is_null(reg)) |
3015 |
++ return 0; |
3016 |
++ |
3017 |
+ /* bpf_map_xxx(..., map_ptr, ..., value) call: |
3018 |
+ * check [value, value + map->value_size) validity |
3019 |
+ */ |
3020 |
+@@ -6206,6 +6204,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn |
3021 |
+ int *insn_idx_p) |
3022 |
+ { |
3023 |
+ const struct bpf_func_proto *fn = NULL; |
3024 |
++ enum bpf_return_type ret_type; |
3025 |
++ enum bpf_type_flag ret_flag; |
3026 |
+ struct bpf_reg_state *regs; |
3027 |
+ struct bpf_call_arg_meta meta; |
3028 |
+ int insn_idx = *insn_idx_p; |
3029 |
+@@ -6339,13 +6339,14 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn |
3030 |
+ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; |
3031 |
+ |
3032 |
+ /* update return register (already marked as written above) */ |
3033 |
+- if (fn->ret_type == RET_INTEGER) { |
3034 |
++ ret_type = fn->ret_type; |
3035 |
++ ret_flag = type_flag(fn->ret_type); |
3036 |
++ if (ret_type == RET_INTEGER) { |
3037 |
+ /* sets type to SCALAR_VALUE */ |
3038 |
+ mark_reg_unknown(env, regs, BPF_REG_0); |
3039 |
+- } else if (fn->ret_type == RET_VOID) { |
3040 |
++ } else if (ret_type == RET_VOID) { |
3041 |
+ regs[BPF_REG_0].type = NOT_INIT; |
3042 |
+- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || |
3043 |
+- fn->ret_type == RET_PTR_TO_MAP_VALUE) { |
3044 |
++ } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) { |
3045 |
+ /* There is no offset yet applied, variable or fixed */ |
3046 |
+ mark_reg_known_zero(env, regs, BPF_REG_0); |
3047 |
+ /* remember map_ptr, so that check_map_access() |
3048 |
+@@ -6359,28 +6360,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn |
3049 |
+ } |
3050 |
+ regs[BPF_REG_0].map_ptr = meta.map_ptr; |
3051 |
+ regs[BPF_REG_0].map_uid = meta.map_uid; |
3052 |
+- if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { |
3053 |
+- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; |
3054 |
+- if (map_value_has_spin_lock(meta.map_ptr)) |
3055 |
+- regs[BPF_REG_0].id = ++env->id_gen; |
3056 |
+- } else { |
3057 |
+- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; |
3058 |
++ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; |
3059 |
++ if (!type_may_be_null(ret_type) && |
3060 |
++ map_value_has_spin_lock(meta.map_ptr)) { |
3061 |
++ regs[BPF_REG_0].id = ++env->id_gen; |
3062 |
+ } |
3063 |
+- } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { |
3064 |
++ } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { |
3065 |
+ mark_reg_known_zero(env, regs, BPF_REG_0); |
3066 |
+- regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; |
3067 |
+- } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { |
3068 |
++ regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; |
3069 |
++ } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { |
3070 |
+ mark_reg_known_zero(env, regs, BPF_REG_0); |
3071 |
+- regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; |
3072 |
+- } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { |
3073 |
++ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; |
3074 |
++ } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { |
3075 |
+ mark_reg_known_zero(env, regs, BPF_REG_0); |
3076 |
+- regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; |
3077 |
+- } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { |
3078 |
++ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; |
3079 |
++ } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { |
3080 |
+ mark_reg_known_zero(env, regs, BPF_REG_0); |
3081 |
+- regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; |
3082 |
++ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; |
3083 |
+ regs[BPF_REG_0].mem_size = meta.mem_size; |
3084 |
+- } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || |
3085 |
+- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { |
3086 |
++ } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { |
3087 |
+ const struct btf_type *t; |
3088 |
+ |
3089 |
+ mark_reg_known_zero(env, regs, BPF_REG_0); |
3090 |
+@@ -6398,29 +6396,30 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn |
3091 |
+ tname, PTR_ERR(ret)); |
3092 |
+ return -EINVAL; |
3093 |
+ } |
3094 |
+- regs[BPF_REG_0].type = |
3095 |
+- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? |
3096 |
+- PTR_TO_MEM : PTR_TO_MEM_OR_NULL; |
3097 |
++ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; |
3098 |
+ regs[BPF_REG_0].mem_size = tsize; |
3099 |
+ } else { |
3100 |
+- regs[BPF_REG_0].type = |
3101 |
+- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? |
3102 |
+- PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; |
3103 |
++ /* MEM_RDONLY may be carried from ret_flag, but it |
3104 |
++ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise |
3105 |
++ * it will confuse the check of PTR_TO_BTF_ID in |
3106 |
++ * check_mem_access(). |
3107 |
++ */ |
3108 |
++ ret_flag &= ~MEM_RDONLY; |
3109 |
++ |
3110 |
++ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; |
3111 |
+ regs[BPF_REG_0].btf = meta.ret_btf; |
3112 |
+ regs[BPF_REG_0].btf_id = meta.ret_btf_id; |
3113 |
+ } |
3114 |
+- } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || |
3115 |
+- fn->ret_type == RET_PTR_TO_BTF_ID) { |
3116 |
++ } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { |
3117 |
+ int ret_btf_id; |
3118 |
+ |
3119 |
+ mark_reg_known_zero(env, regs, BPF_REG_0); |
3120 |
+- regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ? |
3121 |
+- PTR_TO_BTF_ID : |
3122 |
+- PTR_TO_BTF_ID_OR_NULL; |
3123 |
++ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; |
3124 |
+ ret_btf_id = *fn->ret_btf_id; |
3125 |
+ if (ret_btf_id == 0) { |
3126 |
+- verbose(env, "invalid return type %d of func %s#%d\n", |
3127 |
+- fn->ret_type, func_id_name(func_id), func_id); |
3128 |
++ verbose(env, "invalid return type %u of func %s#%d\n", |
3129 |
++ base_type(ret_type), func_id_name(func_id), |
3130 |
++ func_id); |
3131 |
+ return -EINVAL; |
3132 |
+ } |
3133 |
+ /* current BPF helper definitions are only coming from |
3134 |
+@@ -6429,12 +6428,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn |
3135 |
+ regs[BPF_REG_0].btf = btf_vmlinux; |
3136 |
+ regs[BPF_REG_0].btf_id = ret_btf_id; |
3137 |
+ } else { |
3138 |
+- verbose(env, "unknown return type %d of func %s#%d\n", |
3139 |
+- fn->ret_type, func_id_name(func_id), func_id); |
3140 |
++ verbose(env, "unknown return type %u of func %s#%d\n", |
3141 |
++ base_type(ret_type), func_id_name(func_id), func_id); |
3142 |
+ return -EINVAL; |
3143 |
+ } |
3144 |
+ |
3145 |
+- if (reg_type_may_be_null(regs[BPF_REG_0].type)) |
3146 |
++ if (type_may_be_null(regs[BPF_REG_0].type)) |
3147 |
+ regs[BPF_REG_0].id = ++env->id_gen; |
3148 |
+ |
3149 |
+ if (is_ptr_cast_function(func_id)) { |
3150 |
+@@ -6633,25 +6632,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, |
3151 |
+ |
3152 |
+ if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { |
3153 |
+ verbose(env, "math between %s pointer and %lld is not allowed\n", |
3154 |
+- reg_type_str[type], val); |
3155 |
++ reg_type_str(env, type), val); |
3156 |
+ return false; |
3157 |
+ } |
3158 |
+ |
3159 |
+ if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { |
3160 |
+ verbose(env, "%s pointer offset %d is not allowed\n", |
3161 |
+- reg_type_str[type], reg->off); |
3162 |
++ reg_type_str(env, type), reg->off); |
3163 |
+ return false; |
3164 |
+ } |
3165 |
+ |
3166 |
+ if (smin == S64_MIN) { |
3167 |
+ verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", |
3168 |
+- reg_type_str[type]); |
3169 |
++ reg_type_str(env, type)); |
3170 |
+ return false; |
3171 |
+ } |
3172 |
+ |
3173 |
+ if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { |
3174 |
+ verbose(env, "value %lld makes %s pointer be out of bounds\n", |
3175 |
+- smin, reg_type_str[type]); |
3176 |
++ smin, reg_type_str(env, type)); |
3177 |
+ return false; |
3178 |
+ } |
3179 |
+ |
3180 |
+@@ -7028,11 +7027,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, |
3181 |
+ return -EACCES; |
3182 |
+ } |
3183 |
+ |
3184 |
+- switch (ptr_reg->type) { |
3185 |
+- case PTR_TO_MAP_VALUE_OR_NULL: |
3186 |
++ if (ptr_reg->type & PTR_MAYBE_NULL) { |
3187 |
+ verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", |
3188 |
+- dst, reg_type_str[ptr_reg->type]); |
3189 |
++ dst, reg_type_str(env, ptr_reg->type)); |
3190 |
+ return -EACCES; |
3191 |
++ } |
3192 |
++ |
3193 |
++ switch (base_type(ptr_reg->type)) { |
3194 |
+ case CONST_PTR_TO_MAP: |
3195 |
+ /* smin_val represents the known value */ |
3196 |
+ if (known && smin_val == 0 && opcode == BPF_ADD) |
3197 |
+@@ -7045,10 +7046,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, |
3198 |
+ case PTR_TO_XDP_SOCK: |
3199 |
+ reject: |
3200 |
+ verbose(env, "R%d pointer arithmetic on %s prohibited\n", |
3201 |
+- dst, reg_type_str[ptr_reg->type]); |
3202 |
++ dst, reg_type_str(env, ptr_reg->type)); |
3203 |
+ return -EACCES; |
3204 |
+ default: |
3205 |
+- if (reg_type_may_be_null(ptr_reg->type)) |
3206 |
++ if (type_may_be_null(ptr_reg->type)) |
3207 |
+ goto reject; |
3208 |
+ break; |
3209 |
+ } |
3210 |
+@@ -8770,7 +8771,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, |
3211 |
+ struct bpf_reg_state *reg, u32 id, |
3212 |
+ bool is_null) |
3213 |
+ { |
3214 |
+- if (reg_type_may_be_null(reg->type) && reg->id == id && |
3215 |
++ if (type_may_be_null(reg->type) && reg->id == id && |
3216 |
+ !WARN_ON_ONCE(!reg->id)) { |
3217 |
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || |
3218 |
+ !tnum_equals_const(reg->var_off, 0) || |
3219 |
+@@ -9148,7 +9149,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, |
3220 |
+ */ |
3221 |
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && |
3222 |
+ insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && |
3223 |
+- reg_type_may_be_null(dst_reg->type)) { |
3224 |
++ type_may_be_null(dst_reg->type)) { |
3225 |
+ /* Mark all identical registers in each branch as either |
3226 |
+ * safe or unknown depending R == 0 or R != 0 conditional. |
3227 |
+ */ |
3228 |
+@@ -9207,7 +9208,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) |
3229 |
+ |
3230 |
+ if (insn->src_reg == BPF_PSEUDO_BTF_ID) { |
3231 |
+ dst_reg->type = aux->btf_var.reg_type; |
3232 |
+- switch (dst_reg->type) { |
3233 |
++ switch (base_type(dst_reg->type)) { |
3234 |
+ case PTR_TO_MEM: |
3235 |
+ dst_reg->mem_size = aux->btf_var.mem_size; |
3236 |
+ break; |
3237 |
+@@ -9404,7 +9405,7 @@ static int check_return_code(struct bpf_verifier_env *env) |
3238 |
+ /* enforce return zero from async callbacks like timer */ |
3239 |
+ if (reg->type != SCALAR_VALUE) { |
3240 |
+ verbose(env, "In async callback the register R0 is not a known value (%s)\n", |
3241 |
+- reg_type_str[reg->type]); |
3242 |
++ reg_type_str(env, reg->type)); |
3243 |
+ return -EINVAL; |
3244 |
+ } |
3245 |
+ |
3246 |
+@@ -9418,7 +9419,7 @@ static int check_return_code(struct bpf_verifier_env *env) |
3247 |
+ if (is_subprog) { |
3248 |
+ if (reg->type != SCALAR_VALUE) { |
3249 |
+ verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", |
3250 |
+- reg_type_str[reg->type]); |
3251 |
++ reg_type_str(env, reg->type)); |
3252 |
+ return -EINVAL; |
3253 |
+ } |
3254 |
+ return 0; |
3255 |
+@@ -9482,7 +9483,7 @@ static int check_return_code(struct bpf_verifier_env *env) |
3256 |
+ |
3257 |
+ if (reg->type != SCALAR_VALUE) { |
3258 |
+ verbose(env, "At program exit the register R0 is not a known value (%s)\n", |
3259 |
+- reg_type_str[reg->type]); |
3260 |
++ reg_type_str(env, reg->type)); |
3261 |
+ return -EINVAL; |
3262 |
+ } |
3263 |
+ |
3264 |
+@@ -10263,7 +10264,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, |
3265 |
+ return true; |
3266 |
+ if (rcur->type == NOT_INIT) |
3267 |
+ return false; |
3268 |
+- switch (rold->type) { |
3269 |
++ switch (base_type(rold->type)) { |
3270 |
+ case SCALAR_VALUE: |
3271 |
+ if (env->explore_alu_limits) |
3272 |
+ return false; |
3273 |
+@@ -10285,6 +10286,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, |
3274 |
+ } |
3275 |
+ case PTR_TO_MAP_KEY: |
3276 |
+ case PTR_TO_MAP_VALUE: |
3277 |
++ /* a PTR_TO_MAP_VALUE could be safe to use as a |
3278 |
++ * PTR_TO_MAP_VALUE_OR_NULL into the same map. |
3279 |
++ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- |
3280 |
++ * checked, doing so could have affected others with the same |
3281 |
++ * id, and we can't check for that because we lost the id when |
3282 |
++ * we converted to a PTR_TO_MAP_VALUE. |
3283 |
++ */ |
3284 |
++ if (type_may_be_null(rold->type)) { |
3285 |
++ if (!type_may_be_null(rcur->type)) |
3286 |
++ return false; |
3287 |
++ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) |
3288 |
++ return false; |
3289 |
++ /* Check our ids match any regs they're supposed to */ |
3290 |
++ return check_ids(rold->id, rcur->id, idmap); |
3291 |
++ } |
3292 |
++ |
3293 |
+ /* If the new min/max/var_off satisfy the old ones and |
3294 |
+ * everything else matches, we are OK. |
3295 |
+ * 'id' is not compared, since it's only used for maps with |
3296 |
+@@ -10296,20 +10313,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, |
3297 |
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && |
3298 |
+ range_within(rold, rcur) && |
3299 |
+ tnum_in(rold->var_off, rcur->var_off); |
3300 |
+- case PTR_TO_MAP_VALUE_OR_NULL: |
3301 |
+- /* a PTR_TO_MAP_VALUE could be safe to use as a |
3302 |
+- * PTR_TO_MAP_VALUE_OR_NULL into the same map. |
3303 |
+- * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- |
3304 |
+- * checked, doing so could have affected others with the same |
3305 |
+- * id, and we can't check for that because we lost the id when |
3306 |
+- * we converted to a PTR_TO_MAP_VALUE. |
3307 |
+- */ |
3308 |
+- if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL) |
3309 |
+- return false; |
3310 |
+- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) |
3311 |
+- return false; |
3312 |
+- /* Check our ids match any regs they're supposed to */ |
3313 |
+- return check_ids(rold->id, rcur->id, idmap); |
3314 |
+ case PTR_TO_PACKET_META: |
3315 |
+ case PTR_TO_PACKET: |
3316 |
+ if (rcur->type != rold->type) |
3317 |
+@@ -10338,11 +10341,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, |
3318 |
+ case PTR_TO_PACKET_END: |
3319 |
+ case PTR_TO_FLOW_KEYS: |
3320 |
+ case PTR_TO_SOCKET: |
3321 |
+- case PTR_TO_SOCKET_OR_NULL: |
3322 |
+ case PTR_TO_SOCK_COMMON: |
3323 |
+- case PTR_TO_SOCK_COMMON_OR_NULL: |
3324 |
+ case PTR_TO_TCP_SOCK: |
3325 |
+- case PTR_TO_TCP_SOCK_OR_NULL: |
3326 |
+ case PTR_TO_XDP_SOCK: |
3327 |
+ /* Only valid matches are exact, which memcmp() above |
3328 |
+ * would have accepted |
3329 |
+@@ -10868,17 +10868,13 @@ next: |
3330 |
+ /* Return true if it's OK to have the same insn return a different type. */ |
3331 |
+ static bool reg_type_mismatch_ok(enum bpf_reg_type type) |
3332 |
+ { |
3333 |
+- switch (type) { |
3334 |
++ switch (base_type(type)) { |
3335 |
+ case PTR_TO_CTX: |
3336 |
+ case PTR_TO_SOCKET: |
3337 |
+- case PTR_TO_SOCKET_OR_NULL: |
3338 |
+ case PTR_TO_SOCK_COMMON: |
3339 |
+- case PTR_TO_SOCK_COMMON_OR_NULL: |
3340 |
+ case PTR_TO_TCP_SOCK: |
3341 |
+- case PTR_TO_TCP_SOCK_OR_NULL: |
3342 |
+ case PTR_TO_XDP_SOCK: |
3343 |
+ case PTR_TO_BTF_ID: |
3344 |
+- case PTR_TO_BTF_ID_OR_NULL: |
3345 |
+ return false; |
3346 |
+ default: |
3347 |
+ return true; |
3348 |
+@@ -11102,7 +11098,7 @@ static int do_check(struct bpf_verifier_env *env) |
3349 |
+ if (is_ctx_reg(env, insn->dst_reg)) { |
3350 |
+ verbose(env, "BPF_ST stores into R%d %s is not allowed\n", |
3351 |
+ insn->dst_reg, |
3352 |
+- reg_type_str[reg_state(env, insn->dst_reg)->type]); |
3353 |
++ reg_type_str(env, reg_state(env, insn->dst_reg)->type)); |
3354 |
+ return -EACCES; |
3355 |
+ } |
3356 |
+ |
3357 |
+@@ -11353,7 +11349,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, |
3358 |
+ err = -EINVAL; |
3359 |
+ goto err_put; |
3360 |
+ } |
3361 |
+- aux->btf_var.reg_type = PTR_TO_MEM; |
3362 |
++ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; |
3363 |
+ aux->btf_var.mem_size = tsize; |
3364 |
+ } else { |
3365 |
+ aux->btf_var.reg_type = PTR_TO_BTF_ID; |
3366 |
+@@ -13175,7 +13171,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) |
3367 |
+ mark_reg_known_zero(env, regs, i); |
3368 |
+ else if (regs[i].type == SCALAR_VALUE) |
3369 |
+ mark_reg_unknown(env, regs, i); |
3370 |
+- else if (regs[i].type == PTR_TO_MEM_OR_NULL) { |
3371 |
++ else if (base_type(regs[i].type) == PTR_TO_MEM) { |
3372 |
+ const u32 mem_size = regs[i].mem_size; |
3373 |
+ |
3374 |
+ mark_reg_known_zero(env, regs, i); |
3375 |
+diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
+index 5a18b861fcf75..c289010b0964e 100644
+--- a/kernel/trace/bpf_trace.c
++++ b/kernel/trace/bpf_trace.c
+@@ -345,7 +345,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+- .arg2_type = ARG_PTR_TO_MEM,
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ };
+
+@@ -394,7 +394,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
+ .func = bpf_trace_printk,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+- .arg1_type = ARG_PTR_TO_MEM,
++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE,
+ };
+
+@@ -446,9 +446,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = {
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_seq_file_ids[0],
+- .arg2_type = ARG_PTR_TO_MEM,
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+- .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
++ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ };
+
+@@ -463,7 +463,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = {
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_seq_file_ids[0],
+- .arg2_type = ARG_PTR_TO_MEM,
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ };
+
+@@ -487,7 +487,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = {
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_seq_file_ids[0],
+- .arg2_type = ARG_PTR_TO_MEM,
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ };
+@@ -648,7 +648,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+- .arg4_type = ARG_PTR_TO_MEM,
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ };
+
+@@ -958,7 +958,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+- .arg3_type = ARG_PTR_TO_MEM,
++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+ };
+@@ -1207,7 +1207,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+- .arg4_type = ARG_PTR_TO_MEM,
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ };
+
+@@ -1429,7 +1429,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+- .arg4_type = ARG_PTR_TO_MEM,
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ };
+
+@@ -1483,7 +1483,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+- .arg2_type = ARG_PTR_TO_MEM,
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ };
+diff --git a/lib/iov_iter.c b/lib/iov_iter.c
+index c5b2f0f4b8a84..6d146f77601d7 100644
+--- a/lib/iov_iter.c
++++ b/lib/iov_iter.c
+@@ -191,7 +191,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
+ buf = iov->iov_base + skip;
+ copy = min(bytes, iov->iov_len - skip);
+
+- if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
++ if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
+ kaddr = kmap_atomic(page);
+ from = kaddr + offset;
+
+@@ -275,7 +275,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
+ buf = iov->iov_base + skip;
+ copy = min(bytes, iov->iov_len - skip);
+
+- if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
++ if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
+ kaddr = kmap_atomic(page);
+ to = kaddr + offset;
+
+@@ -431,35 +431,81 @@ out:
+ }
+
+ /*
++ * fault_in_iov_iter_readable - fault in iov iterator for reading
++ * @i: iterator
++ * @size: maximum length
++ *
+ * Fault in one or more iovecs of the given iov_iter, to a maximum length of
+- * bytes. For each iovec, fault in each page that constitutes the iovec.
++ * @size. For each iovec, fault in each page that constitutes the iovec.
++ *
++ * Returns the number of bytes not faulted in (like copy_to_user() and
++ * copy_from_user()).
+ *
+- * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
+- * because it is an invalid address).
++ * Always returns 0 for non-userspace iterators.
+ */
+-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
++size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
+ {
+ if (iter_is_iovec(i)) {
++ size_t count = min(size, iov_iter_count(i));
+ const struct iovec *p;
+ size_t skip;
+
+- if (bytes > i->count)
+- bytes = i->count;
+- for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
+- size_t len = min(bytes, p->iov_len - skip);
+- int err;
++ size -= count;
++ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
++ size_t len = min(count, p->iov_len - skip);
++ size_t ret;
+
+ if (unlikely(!len))
+ continue;
+- err = fault_in_pages_readable(p->iov_base + skip, len);
+- if (unlikely(err))
+- return err;
+- bytes -= len;
++ ret = fault_in_readable(p->iov_base + skip, len);
++ count -= len - ret;
++ if (ret)
++ break;
+ }
++ return count + size;
+ }
+ return 0;
+ }
+-EXPORT_SYMBOL(iov_iter_fault_in_readable);
++EXPORT_SYMBOL(fault_in_iov_iter_readable);
++
++/*
++ * fault_in_iov_iter_writeable - fault in iov iterator for writing
++ * @i: iterator
++ * @size: maximum length
++ *
++ * Faults in the iterator using get_user_pages(), i.e., without triggering
++ * hardware page faults. This is primarily useful when we already know that
++ * some or all of the pages in @i aren't in memory.
++ *
++ * Returns the number of bytes not faulted in, like copy_to_user() and
++ * copy_from_user().
++ *
++ * Always returns 0 for non-user-space iterators.
++ */
++size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
++{
++ if (iter_is_iovec(i)) {
++ size_t count = min(size, iov_iter_count(i));
++ const struct iovec *p;
++ size_t skip;
++
++ size -= count;
++ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
++ size_t len = min(count, p->iov_len - skip);
++ size_t ret;
++
++ if (unlikely(!len))
++ continue;
++ ret = fault_in_safe_writeable(p->iov_base + skip, len);
++ count -= len - ret;
++ if (ret)
++ break;
++ }
++ return count + size;
++ }
++ return 0;
++}
++EXPORT_SYMBOL(fault_in_iov_iter_writeable);
+
+ void iov_iter_init(struct iov_iter *i, unsigned int direction,
+ const struct iovec *iov, unsigned long nr_segs,
+@@ -468,6 +514,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
+ WARN_ON(direction & ~(READ | WRITE));
+ *i = (struct iov_iter) {
+ .iter_type = ITER_IOVEC,
++ .nofault = false,
+ .data_source = direction,
+ .iov = iov,
+ .nr_segs = nr_segs,
+@@ -1483,13 +1530,17 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
+ return 0;
+
+ if (likely(iter_is_iovec(i))) {
++ unsigned int gup_flags = 0;
+ unsigned long addr;
+
++ if (iov_iter_rw(i) != WRITE)
++ gup_flags |= FOLL_WRITE;
++ if (i->nofault)
++ gup_flags |= FOLL_NOFAULT;
++
+ addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
+ n = DIV_ROUND_UP(len, PAGE_SIZE);
+- res = get_user_pages_fast(addr, n,
+- iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
+- pages);
++ res = get_user_pages_fast(addr, n, gup_flags, pages);
+ if (unlikely(res <= 0))
+ return res;
+ return (res == n ? len : res * PAGE_SIZE) - *start;
+@@ -1605,15 +1656,20 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
+ return 0;
+
+ if (likely(iter_is_iovec(i))) {
++ unsigned int gup_flags = 0;
+ unsigned long addr;
+
++ if (iov_iter_rw(i) != WRITE)
++ gup_flags |= FOLL_WRITE;
++ if (i->nofault)
++ gup_flags |= FOLL_NOFAULT;
++
+ addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
+ n = DIV_ROUND_UP(len, PAGE_SIZE);
+ p = get_pages_array(n);
+ if (!p)
+ return -ENOMEM;
+- res = get_user_pages_fast(addr, n,
+- iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
++ res = get_user_pages_fast(addr, n, gup_flags, p);
+ if (unlikely(res <= 0)) {
+ kvfree(p);
+ *pages = NULL;
+diff --git a/mm/filemap.c b/mm/filemap.c
+index 1293c3409e429..00e391e758801 100644
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -90,7 +90,7 @@
+ * ->lock_page (filemap_fault, access_process_vm)
+ *
+ * ->i_rwsem (generic_perform_write)
+- * ->mmap_lock (fault_in_pages_readable->do_page_fault)
++ * ->mmap_lock (fault_in_readable->do_page_fault)
+ *
+ * bdi->wb.list_lock
+ * sb_lock (fs/fs-writeback.c)
+@@ -3760,7 +3760,7 @@ again:
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ */
+- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
+diff --git a/mm/gup.c b/mm/gup.c
+index 52f08e3177e9f..ba2ab7a223f8e 100644
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -943,6 +943,8 @@ static int faultin_page(struct vm_area_struct *vma,
+ /* mlock all present pages, but do not fault in new pages */
+ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+ return -ENOENT;
++ if (*flags & FOLL_NOFAULT)
++ return -EFAULT;
+ if (*flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (*flags & FOLL_REMOTE)
+@@ -1681,6 +1683,122 @@ finish_or_fault:
+ }
+ #endif /* !CONFIG_MMU */
+
++/**
++ * fault_in_writeable - fault in userspace address range for writing
++ * @uaddr: start of address range
++ * @size: size of address range
++ *
++ * Returns the number of bytes not faulted in (like copy_to_user() and
++ * copy_from_user()).
++ */
++size_t fault_in_writeable(char __user *uaddr, size_t size)
++{
++ char __user *start = uaddr, *end;
++
++ if (unlikely(size == 0))
++ return 0;
++ if (!PAGE_ALIGNED(uaddr)) {
++ if (unlikely(__put_user(0, uaddr) != 0))
++ return size;
++ uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
++ }
++ end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
++ if (unlikely(end < start))
++ end = NULL;
++ while (uaddr != end) {
++ if (unlikely(__put_user(0, uaddr) != 0))
++ goto out;
++ uaddr += PAGE_SIZE;
++ }
++
++out:
++ if (size > uaddr - start)
++ return size - (uaddr - start);
++ return 0;
++}
++EXPORT_SYMBOL(fault_in_writeable);
++
++/*
++ * fault_in_safe_writeable - fault in an address range for writing
++ * @uaddr: start of address range
++ * @size: length of address range
++ *
++ * Faults in an address range for writing. This is primarily useful when we
++ * already know that some or all of the pages in the address range aren't in
++ * memory.
++ *
++ * Unlike fault_in_writeable(), this function is non-destructive.
++ *
++ * Note that we don't pin or otherwise hold the pages referenced that we fault
++ * in. There's no guarantee that they'll stay in memory for any duration of
++ * time.
++ *
++ * Returns the number of bytes not faulted in, like copy_to_user() and
++ * copy_from_user().
++ */
++size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
++{
++ unsigned long start = (unsigned long)uaddr, end;
++ struct mm_struct *mm = current->mm;
++ bool unlocked = false;
++
++ if (unlikely(size == 0))
++ return 0;
++ end = PAGE_ALIGN(start + size);
++ if (end < start)
++ end = 0;
++
++ mmap_read_lock(mm);
++ do {
++ if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
++ break;
++ start = (start + PAGE_SIZE) & PAGE_MASK;
++ } while (start != end);
++ mmap_read_unlock(mm);
++
++ if (size > (unsigned long)uaddr - start)
++ return size - ((unsigned long)uaddr - start);
++ return 0;
++}
++EXPORT_SYMBOL(fault_in_safe_writeable);
++
++/**
++ * fault_in_readable - fault in userspace address range for reading
++ * @uaddr: start of user address range
++ * @size: size of user address range
++ *
++ * Returns the number of bytes not faulted in (like copy_to_user() and
++ * copy_from_user()).
++ */
++size_t fault_in_readable(const char __user *uaddr, size_t size)
++{
++ const char __user *start = uaddr, *end;
++ volatile char c;
++
++ if (unlikely(size == 0))
++ return 0;
++ if (!PAGE_ALIGNED(uaddr)) {
++ if (unlikely(__get_user(c, uaddr) != 0))
++ return size;
++ uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
++ }
++ end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
++ if (unlikely(end < start))
++ end = NULL;
++ while (uaddr != end) {
++ if (unlikely(__get_user(c, uaddr) != 0))
++ goto out;
++ uaddr += PAGE_SIZE;
++ }
++
++out:
++ (void)c;
++ if (size > uaddr - start)
++ return size - (uaddr - start);
++ return 0;
++}
++EXPORT_SYMBOL(fault_in_readable);
++
+ /**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+@@ -2733,7 +2851,7 @@ static int internal_get_user_pages_fast(unsigned long start,
+
+ if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
+ FOLL_FORCE | FOLL_PIN | FOLL_GET |
+- FOLL_FAST_ONLY)))
++ FOLL_FAST_ONLY | FOLL_NOFAULT)))
+ return -EINVAL;
+
+ if (gup_flags & FOLL_PIN)
+diff --git a/mm/kfence/core.c b/mm/kfence/core.c
+index 86260e8f28302..66076d8742b78 100644
+--- a/mm/kfence/core.c
++++ b/mm/kfence/core.c
+@@ -528,6 +528,8 @@ static bool __init kfence_init_pool(void)
+ * enters __slab_free() slow-path.
+ */
+ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
++ struct page *page = &pages[i];
++
+ if (!i || (i % 2))
+ continue;
+
+@@ -535,7 +537,11 @@ static bool __init kfence_init_pool(void)
+ if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
+ goto err;
+
+- __SetPageSlab(&pages[i]);
++ __SetPageSlab(page);
++#ifdef CONFIG_MEMCG
++ page->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
++ MEMCG_DATA_OBJCGS;
++#endif
+ }
+
+ /*
+@@ -911,6 +917,9 @@ void __kfence_free(void *addr)
+ {
+ struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
++#ifdef CONFIG_MEMCG
++ KFENCE_WARN_ON(meta->objcg);
++#endif
+ /*
+ * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
+ * the object, as the object page may be recycled for other-typed
+diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
+index 92bf6eff6060d..600f2e2431d6d 100644
+--- a/mm/kfence/kfence.h
++++ b/mm/kfence/kfence.h
+@@ -89,6 +89,9 @@ struct kfence_metadata {
+ struct kfence_track free_track;
+ /* For updating alloc_covered on frees. */
+ u32 alloc_stack_hash;
++#ifdef CONFIG_MEMCG
++ struct obj_cgroup *objcg;
++#endif
+ };
+
+ extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
+index 68d2cbf8331ac..ea61dfe19c869 100644
+--- a/net/core/bpf_sk_storage.c
++++ b/net/core/bpf_sk_storage.c
+@@ -929,7 +929,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
+ { offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
+ PTR_TO_BTF_ID_OR_NULL },
+ { offsetof(struct bpf_iter__bpf_sk_storage_map, value),
+- PTR_TO_RDWR_BUF_OR_NULL },
++ PTR_TO_BUF | PTR_MAYBE_NULL },
+ },
+ .seq_info = &iter_seq_info,
+ };
+diff --git a/net/core/filter.c b/net/core/filter.c
+index cdd7e92db3030..821278b906b71 100644
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -1713,7 +1713,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+- .arg3_type = ARG_PTR_TO_MEM,
++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+ };
+@@ -2018,9 +2018,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+- .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
++ .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+- .arg3_type = ARG_PTR_TO_MEM_OR_NULL,
++ .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg5_type = ARG_ANYTHING,
+ };
+@@ -2541,7 +2541,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = {
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+- .arg2_type = ARG_PTR_TO_MEM_OR_NULL,
++ .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ };
+@@ -4177,7 +4177,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+- .arg4_type = ARG_PTR_TO_MEM,
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ };
+
+@@ -4191,7 +4191,7 @@ const struct bpf_func_proto bpf_skb_output_proto = {
+ .arg1_btf_id = &bpf_skb_output_btf_ids[0],
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+- .arg4_type = ARG_PTR_TO_MEM,
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ };
+
+@@ -4374,7 +4374,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+- .arg2_type = ARG_PTR_TO_MEM,
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ };
+@@ -4400,7 +4400,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+- .arg2_type = ARG_PTR_TO_MEM,
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ };
+
+@@ -4570,7 +4570,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+- .arg4_type = ARG_PTR_TO_MEM,
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ };
+
+@@ -4584,7 +4584,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = {
+ .arg1_btf_id = &bpf_xdp_output_btf_ids[0],
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+- .arg4_type = ARG_PTR_TO_MEM,
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ };
+
+@@ -5072,7 +5072,7 @@ const struct bpf_func_proto bpf_sk_setsockopt_proto = {
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+- .arg4_type = ARG_PTR_TO_MEM,
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+ };
+
+@@ -5106,7 +5106,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+- .arg4_type = ARG_PTR_TO_MEM,
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+ };
+
+@@ -5140,7 +5140,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+- .arg4_type = ARG_PTR_TO_MEM,
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+ };
+
+@@ -5315,7 +5315,7 @@ static const struct bpf_func_proto bpf_bind_proto = {
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+- .arg2_type = ARG_PTR_TO_MEM,
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ };
+
+@@ -5903,7 +5903,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+- .arg3_type = ARG_PTR_TO_MEM,
++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE
+ };
+
+@@ -5913,7 +5913,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+- .arg3_type = ARG_PTR_TO_MEM,
++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE
+ };
+
+@@ -5956,7 +5956,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+- .arg3_type = ARG_PTR_TO_MEM,
++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE
+ };
+
+@@ -6044,7 +6044,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+- .arg3_type = ARG_PTR_TO_MEM,
++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE
+ };
+
+@@ -6269,7 +6269,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+- .arg2_type = ARG_PTR_TO_MEM,
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+@@ -6288,7 +6288,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+- .arg2_type = ARG_PTR_TO_MEM,
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+@@ -6307,7 +6307,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+- .arg2_type = ARG_PTR_TO_MEM,
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+@@ -6344,7 +6344,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+- .arg2_type = ARG_PTR_TO_MEM,
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+@@ -6367,7 +6367,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+- .arg2_type = ARG_PTR_TO_MEM, |
4073 |
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
4074 |
+ .arg3_type = ARG_CONST_SIZE, |
4075 |
+ .arg4_type = ARG_ANYTHING, |
4076 |
+ .arg5_type = ARG_ANYTHING, |
4077 |
+@@ -6390,7 +6390,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { |
4078 |
+ .pkt_access = true, |
4079 |
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL, |
4080 |
+ .arg1_type = ARG_PTR_TO_CTX, |
4081 |
+- .arg2_type = ARG_PTR_TO_MEM, |
4082 |
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
4083 |
+ .arg3_type = ARG_CONST_SIZE, |
4084 |
+ .arg4_type = ARG_ANYTHING, |
4085 |
+ .arg5_type = ARG_ANYTHING, |
4086 |
+@@ -6409,7 +6409,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { |
4087 |
+ .gpl_only = false, |
4088 |
+ .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, |
4089 |
+ .arg1_type = ARG_PTR_TO_CTX, |
4090 |
+- .arg2_type = ARG_PTR_TO_MEM, |
4091 |
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
4092 |
+ .arg3_type = ARG_CONST_SIZE, |
4093 |
+ .arg4_type = ARG_ANYTHING, |
4094 |
+ .arg5_type = ARG_ANYTHING, |
4095 |
+@@ -6428,7 +6428,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { |
4096 |
+ .gpl_only = false, |
4097 |
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL, |
4098 |
+ .arg1_type = ARG_PTR_TO_CTX, |
4099 |
+- .arg2_type = ARG_PTR_TO_MEM, |
4100 |
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
4101 |
+ .arg3_type = ARG_CONST_SIZE, |
4102 |
+ .arg4_type = ARG_ANYTHING, |
4103 |
+ .arg5_type = ARG_ANYTHING, |
4104 |
+@@ -6447,7 +6447,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { |
4105 |
+ .gpl_only = false, |
4106 |
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL, |
4107 |
+ .arg1_type = ARG_PTR_TO_CTX, |
4108 |
+- .arg2_type = ARG_PTR_TO_MEM, |
4109 |
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
4110 |
+ .arg3_type = ARG_CONST_SIZE, |
4111 |
+ .arg4_type = ARG_ANYTHING, |
4112 |
+ .arg5_type = ARG_ANYTHING, |
4113 |
+@@ -6769,9 +6769,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { |
4114 |
+ .pkt_access = true, |
4115 |
+ .ret_type = RET_INTEGER, |
4116 |
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, |
4117 |
+- .arg2_type = ARG_PTR_TO_MEM, |
4118 |
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
4119 |
+ .arg3_type = ARG_CONST_SIZE, |
4120 |
+- .arg4_type = ARG_PTR_TO_MEM, |
4121 |
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
4122 |
+ .arg5_type = ARG_CONST_SIZE, |
4123 |
+ }; |
4124 |
+ |
4125 |
+@@ -6838,9 +6838,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { |
4126 |
+ .pkt_access = true, |
4127 |
+ .ret_type = RET_INTEGER, |
4128 |
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, |
4129 |
+- .arg2_type = ARG_PTR_TO_MEM, |
4130 |
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
4131 |
+ .arg3_type = ARG_CONST_SIZE, |
4132 |
+- .arg4_type = ARG_PTR_TO_MEM, |
4133 |
++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
4134 |
+ .arg5_type = ARG_CONST_SIZE, |
4135 |
+ }; |
4136 |
+ |
4137 |
+@@ -7069,7 +7069,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { |
4138 |
+ .gpl_only = false, |
4139 |
+ .ret_type = RET_INTEGER, |
4140 |
+ .arg1_type = ARG_PTR_TO_CTX, |
4141 |
+- .arg2_type = ARG_PTR_TO_MEM, |
4142 |
++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, |
4143 |
+ .arg3_type = ARG_CONST_SIZE, |
4144 |
+ .arg4_type = ARG_ANYTHING, |
4145 |
+ }; |
4146 |
+diff --git a/net/core/sock_map.c b/net/core/sock_map.c |
4147 |
+index 8288b5382f08d..6351b6af7aca9 100644 |
4148 |
+--- a/net/core/sock_map.c |
4149 |
++++ b/net/core/sock_map.c |
4150 |
+@@ -1575,7 +1575,7 @@ static struct bpf_iter_reg sock_map_iter_reg = { |
4151 |
+ .ctx_arg_info_size = 2, |
4152 |
+ .ctx_arg_info = { |
4153 |
+ { offsetof(struct bpf_iter__sockmap, key), |
4154 |
+- PTR_TO_RDONLY_BUF_OR_NULL }, |
4155 |
++ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, |
4156 |
+ { offsetof(struct bpf_iter__sockmap, sk), |
4157 |
+ PTR_TO_BTF_ID_OR_NULL }, |
4158 |
+ }, |
4159 |
+diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c |
4160 |
+index cf3acfa5a91d5..69455fe90ac3e 100644 |
4161 |
+--- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c |
4162 |
++++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c |
4163 |
+@@ -7,6 +7,7 @@ |
4164 |
+ #include "test_ksyms_btf.skel.h" |
4165 |
+ #include "test_ksyms_btf_null_check.skel.h" |
4166 |
+ #include "test_ksyms_weak.skel.h" |
4167 |
++#include "test_ksyms_btf_write_check.skel.h" |
4168 |
+ |
4169 |
+ static int duration; |
4170 |
+ |
4171 |
+@@ -109,6 +110,16 @@ cleanup: |
4172 |
+ test_ksyms_weak__destroy(skel); |
4173 |
+ } |
4174 |
+ |
4175 |
++static void test_write_check(void) |
4176 |
++{ |
4177 |
++ struct test_ksyms_btf_write_check *skel; |
4178 |
++ |
4179 |
++ skel = test_ksyms_btf_write_check__open_and_load(); |
4180 |
++ ASSERT_ERR_PTR(skel, "unexpected load of a prog writing to ksym memory\n"); |
4181 |
++ |
4182 |
++ test_ksyms_btf_write_check__destroy(skel); |
4183 |
++} |
4184 |
++ |
4185 |
+ void test_ksyms_btf(void) |
4186 |
+ { |
4187 |
+ int percpu_datasec; |
4188 |
+@@ -136,4 +147,7 @@ void test_ksyms_btf(void) |
4189 |
+ |
4190 |
+ if (test__start_subtest("weak_ksyms")) |
4191 |
+ test_weak_syms(); |
4192 |
++ |
4193 |
++ if (test__start_subtest("write_check")) |
4194 |
++ test_write_check(); |
4195 |
+ } |
4196 |
+diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c |
4197 |
+new file mode 100644 |
4198 |
+index 0000000000000..2180c41cd890f |
4199 |
+--- /dev/null |
4200 |
++++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c |
4201 |
+@@ -0,0 +1,29 @@ |
4202 |
++// SPDX-License-Identifier: GPL-2.0 |
4203 |
++/* Copyright (c) 2021 Google */ |
4204 |
++ |
4205 |
++#include "vmlinux.h" |
4206 |
++ |
4207 |
++#include <bpf/bpf_helpers.h> |
4208 |
++ |
4209 |
++extern const int bpf_prog_active __ksym; /* int type global var. */ |
4210 |
++ |
4211 |
++SEC("raw_tp/sys_enter") |
4212 |
++int handler(const void *ctx) |
4213 |
++{ |
4214 |
++ int *active; |
4215 |
++ __u32 cpu; |
4216 |
++ |
4217 |
++ cpu = bpf_get_smp_processor_id(); |
4218 |
++ active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); |
4219 |
++ if (active) { |
4220 |
++ /* Kernel memory obtained from bpf_{per,this}_cpu_ptr |
4221 |
++ * is read-only, should _not_ pass verification. |
4222 |
++ */ |
4223 |
++ /* WRITE_ONCE */ |
4224 |
++ *(volatile int *)active = -1; |
4225 |
++ } |
4226 |
++ |
4227 |
++ return 0; |
4228 |
++} |
4229 |
++ |
4230 |
++char _license[] SEC("license") = "GPL"; |
4231 |
+diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c |
4232 |
+index 336a749673d19..2e701e7f69680 100644 |
4233 |
+--- a/tools/testing/selftests/bpf/verifier/calls.c |
4234 |
++++ b/tools/testing/selftests/bpf/verifier/calls.c |
4235 |
+@@ -107,6 +107,25 @@ |
4236 |
+ .result = REJECT, |
4237 |
+ .errstr = "R0 min value is outside of the allowed memory range", |
4238 |
+ }, |
4239 |
++{ |
4240 |
++ "calls: trigger reg2btf_ids[reg->type] for reg->type > __BPF_REG_TYPE_MAX", |
4241 |
++ .insns = { |
4242 |
++ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), |
4243 |
++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), |
4244 |
++ BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0), |
4245 |
++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), |
4246 |
++ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), |
4247 |
++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), |
4248 |
++ BPF_EXIT_INSN(), |
4249 |
++ }, |
4250 |
++ .prog_type = BPF_PROG_TYPE_SCHED_CLS, |
4251 |
++ .result = REJECT, |
4252 |
++ .errstr = "arg#0 pointer type STRUCT prog_test_ref_kfunc must point", |
4253 |
++ .fixup_kfunc_btf_id = { |
4254 |
++ { "bpf_kfunc_call_test_acquire", 3 }, |
4255 |
++ { "bpf_kfunc_call_test_release", 5 }, |
4256 |
++ }, |
4257 |
++}, |
4258 |
+ { |
4259 |
+ "calls: overlapping caller/callee", |
4260 |
+ .insns = { |