Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.14 commit in: /
Date: Tue, 02 Jan 2018 20:14:57
Message-Id: 1514924083.042e4ad420014bdd293720b69e29f4e6be0dda65.mpagano@gentoo
1 commit: 042e4ad420014bdd293720b69e29f4e6be0dda65
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Tue Jan 2 20:14:43 2018 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Tue Jan 2 20:14:43 2018 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=042e4ad4
7
8 Linux patch 4.14.11
9
10 0000_README | 4 +
11 1010_linux-4.14.11.patch | 7283 ++++++++++++++++++++++++++++++++++++++++++++++
12 2 files changed, 7287 insertions(+)
13
14 diff --git a/0000_README b/0000_README
15 index 86c72af..c1861d7 100644
16 --- a/0000_README
17 +++ b/0000_README
18 @@ -83,6 +83,10 @@ Patch: 1009_linux-4.14.10.patch
19 From: http://www.kernel.org
20 Desc: Linux 4.14.10
21
22 +Patch: 1010_linux-4.14.11.patch
23 +From: http://www.kernel.org
24 +Desc: Linux 4.14.11
25 +
26 Patch: 1500_XATTR_USER_PREFIX.patch
27 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
28 Desc: Support for namespace user.pax.* on tmpfs.
29
30 diff --git a/1010_linux-4.14.11.patch b/1010_linux-4.14.11.patch
31 new file mode 100644
32 index 0000000..9febb2b
33 --- /dev/null
34 +++ b/1010_linux-4.14.11.patch
35 @@ -0,0 +1,7283 @@
36 +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
37 +index 05496622b4ef..520fdec15bbb 100644
38 +--- a/Documentation/admin-guide/kernel-parameters.txt
39 ++++ b/Documentation/admin-guide/kernel-parameters.txt
40 +@@ -2685,6 +2685,8 @@
41 + steal time is computed, but won't influence scheduler
42 + behaviour
43 +
44 ++ nopti [X86-64] Disable kernel page table isolation
45 ++
46 + nolapic [X86-32,APIC] Do not enable or use the local APIC.
47 +
48 + nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
49 +@@ -3253,6 +3255,12 @@
50 + pt. [PARIDE]
51 + See Documentation/blockdev/paride.txt.
52 +
53 ++ pti= [X86_64]
54 ++ Control user/kernel address space isolation:
55 ++ on - enable
56 ++ off - disable
57 ++ auto - default setting
58 ++
59 + pty.legacy_count=
60 + [KNL] Number of legacy pty's. Overwrites compiled-in
61 + default number.
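[Editor's note: the two hunks above only document the new "nopti" and "pti=on|off|auto" boot strings. As a rough, standalone C sketch of how such strings could be picked out of a command line — the helper name and return convention here are hypothetical, not the kernel's real early-param code:]

#include <stdio.h>
#include <string.h>

/* Hypothetical helper: 1 = force PTI on, 0 = force off, -1 = auto. */
static int pti_mode_from_cmdline(const char *cmdline)
{
	const char *p;

	if (strstr(cmdline, "nopti"))
		return 0;
	p = strstr(cmdline, "pti=");
	if (!p)
		return -1;                       /* auto (CPU-dependent default) */
	p += strlen("pti=");
	if (!strncmp(p, "on", 2))
		return 1;
	if (!strncmp(p, "off", 3))
		return 0;
	return -1;                               /* "auto" or anything else */
}

int main(void)
{
	printf("%d\n", pti_mode_from_cmdline("root=/dev/sda1 pti=off")); /* 0 */
	printf("%d\n", pti_mode_from_cmdline("root=/dev/sda1 nopti"));   /* 0 */
	printf("%d\n", pti_mode_from_cmdline("root=/dev/sda1 pti=on"));  /* 1 */
	printf("%d\n", pti_mode_from_cmdline("quiet"));                  /* -1 */
	return 0;
}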
62 +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
63 +index 51101708a03a..ad41b3813f0a 100644
64 +--- a/Documentation/x86/x86_64/mm.txt
65 ++++ b/Documentation/x86/x86_64/mm.txt
66 +@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
67 + ... unused hole ...
68 + ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
69 + ... unused hole ...
70 ++fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
71 + fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
72 + ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
73 + ... unused hole ...
74 +@@ -29,8 +30,8 @@ Virtual memory map with 5 level page tables:
75 + hole caused by [56:63] sign extension
76 + ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
77 + ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
78 +-ff90000000000000 - ff91ffffffffffff (=49 bits) hole
79 +-ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
80 ++ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
81 ++ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
82 + ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
83 + ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
84 + ... unused hole ...
85 +diff --git a/Makefile b/Makefile
86 +index 9edfb78836a9..655887067dc7 100644
87 +--- a/Makefile
88 ++++ b/Makefile
89 +@@ -1,7 +1,7 @@
90 + # SPDX-License-Identifier: GPL-2.0
91 + VERSION = 4
92 + PATCHLEVEL = 14
93 +-SUBLEVEL = 10
94 ++SUBLEVEL = 11
95 + EXTRAVERSION =
96 + NAME = Petit Gorille
97 +
98 +@@ -802,6 +802,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
99 + # disable invalid "can't wrap" optimizations for signed / pointers
100 + KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
101 +
102 ++# Make sure -fstack-check isn't enabled (like gentoo apparently did)
103 ++KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,)
104 ++
105 + # conserve stack if available
106 + KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)
107 +
108 +diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S
109 +index e5547b22cd18..0ddbbb031822 100644
110 +--- a/arch/sparc/lib/hweight.S
111 ++++ b/arch/sparc/lib/hweight.S
112 +@@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32)
113 + .previous
114 +
115 + ENTRY(__arch_hweight64)
116 +- sethi %hi(__sw_hweight16), %g1
117 +- jmpl %g1 + %lo(__sw_hweight16), %g0
118 ++ sethi %hi(__sw_hweight64), %g1
119 ++ jmpl %g1 + %lo(__sw_hweight64), %g0
120 + nop
121 + ENDPROC(__arch_hweight64)
122 + EXPORT_SYMBOL(__arch_hweight64)
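[Editor's note: the one-line sparc fix above matters because __arch_hweight64 was tail-calling the 16-bit software fallback, which silently drops the upper 48 bits. A small user-space C illustration of the difference; the sw_hweight* names below just stand in for a 16-bit vs. 64-bit population count and are not the kernel's symbols:]

#include <stdint.h>
#include <stdio.h>

static unsigned int sw_hweight16(uint64_t w)   /* what the buggy path computed */
{
	return (unsigned int)__builtin_popcount((uint16_t)w);
}

static unsigned int sw_hweight64(uint64_t w)   /* what the fixed path computes */
{
	return (unsigned int)__builtin_popcountll(w);
}

int main(void)
{
	uint64_t mask = (1ULL << 40) | 0x3;        /* bits 0, 1 and 40 set */

	printf("hweight16 path: %u\n", sw_hweight16(mask)); /* 2 -- bit 40 lost */
	printf("hweight64 path: %u\n", sw_hweight64(mask)); /* 3 -- correct */
	return 0;
}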
123 +diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
124 +index 972319ff5b01..e691ff734cb5 100644
125 +--- a/arch/x86/boot/compressed/pagetable.c
126 ++++ b/arch/x86/boot/compressed/pagetable.c
127 +@@ -23,6 +23,9 @@
128 + */
129 + #undef CONFIG_AMD_MEM_ENCRYPT
130 +
131 ++/* No PAGE_TABLE_ISOLATION support needed either: */
132 ++#undef CONFIG_PAGE_TABLE_ISOLATION
133 ++
134 + #include "misc.h"
135 +
136 + /* These actually do the work of building the kernel identity maps. */
137 +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
138 +index 3fd8bc560fae..45a63e00a6af 100644
139 +--- a/arch/x86/entry/calling.h
140 ++++ b/arch/x86/entry/calling.h
141 +@@ -1,6 +1,11 @@
142 + /* SPDX-License-Identifier: GPL-2.0 */
143 + #include <linux/jump_label.h>
144 + #include <asm/unwind_hints.h>
145 ++#include <asm/cpufeatures.h>
146 ++#include <asm/page_types.h>
147 ++#include <asm/percpu.h>
148 ++#include <asm/asm-offsets.h>
149 ++#include <asm/processor-flags.h>
150 +
151 + /*
152 +
153 +@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
154 + #endif
155 + .endm
156 +
157 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
158 ++
159 ++/*
160 ++ * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
161 ++ * halves:
162 ++ */
163 ++#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
164 ++#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
165 ++
166 ++.macro SET_NOFLUSH_BIT reg:req
167 ++ bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
168 ++.endm
169 ++
170 ++.macro ADJUST_KERNEL_CR3 reg:req
171 ++ ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
172 ++ /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
173 ++ andq $(~PTI_SWITCH_MASK), \reg
174 ++.endm
175 ++
176 ++.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
177 ++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
178 ++ mov %cr3, \scratch_reg
179 ++ ADJUST_KERNEL_CR3 \scratch_reg
180 ++ mov \scratch_reg, %cr3
181 ++.Lend_\@:
182 ++.endm
183 ++
184 ++#define THIS_CPU_user_pcid_flush_mask \
185 ++ PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
186 ++
187 ++.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
188 ++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
189 ++ mov %cr3, \scratch_reg
190 ++
191 ++ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
192 ++
193 ++ /*
194 ++ * Test if the ASID needs a flush.
195 ++ */
196 ++ movq \scratch_reg, \scratch_reg2
197 ++ andq $(0x7FF), \scratch_reg /* mask ASID */
198 ++ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
199 ++ jnc .Lnoflush_\@
200 ++
201 ++ /* Flush needed, clear the bit */
202 ++ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
203 ++ movq \scratch_reg2, \scratch_reg
204 ++ jmp .Lwrcr3_\@
205 ++
206 ++.Lnoflush_\@:
207 ++ movq \scratch_reg2, \scratch_reg
208 ++ SET_NOFLUSH_BIT \scratch_reg
209 ++
210 ++.Lwrcr3_\@:
211 ++ /* Flip the PGD and ASID to the user version */
212 ++ orq $(PTI_SWITCH_MASK), \scratch_reg
213 ++ mov \scratch_reg, %cr3
214 ++.Lend_\@:
215 ++.endm
216 ++
217 ++.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
218 ++ pushq %rax
219 ++ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
220 ++ popq %rax
221 ++.endm
222 ++
223 ++.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
224 ++ ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
225 ++ movq %cr3, \scratch_reg
226 ++ movq \scratch_reg, \save_reg
227 ++ /*
228 ++ * Is the "switch mask" all zero? That means that both of
229 ++ * these are zero:
230 ++ *
231 ++ * 1. The user/kernel PCID bit, and
232 ++ * 2. The user/kernel "bit" that points CR3 to the
233 ++ * bottom half of the 8k PGD
234 ++ *
235 ++ * That indicates a kernel CR3 value, not a user CR3.
236 ++ */
237 ++ testq $(PTI_SWITCH_MASK), \scratch_reg
238 ++ jz .Ldone_\@
239 ++
240 ++ ADJUST_KERNEL_CR3 \scratch_reg
241 ++ movq \scratch_reg, %cr3
242 ++
243 ++.Ldone_\@:
244 ++.endm
245 ++
246 ++.macro RESTORE_CR3 scratch_reg:req save_reg:req
247 ++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
248 ++
249 ++ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
250 ++
251 ++ /*
252 ++ * KERNEL pages can always resume with NOFLUSH as we do
253 ++ * explicit flushes.
254 ++ */
255 ++ bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
256 ++ jnc .Lnoflush_\@
257 ++
258 ++ /*
259 ++ * Check if there's a pending flush for the user ASID we're
260 ++ * about to set.
261 ++ */
262 ++ movq \save_reg, \scratch_reg
263 ++ andq $(0x7FF), \scratch_reg
264 ++ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
265 ++ jnc .Lnoflush_\@
266 ++
267 ++ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
268 ++ jmp .Lwrcr3_\@
269 ++
270 ++.Lnoflush_\@:
271 ++ SET_NOFLUSH_BIT \save_reg
272 ++
273 ++.Lwrcr3_\@:
274 ++ /*
275 ++ * The CR3 write could be avoided when not changing its value,
276 ++ * but would require a CR3 read *and* a scratch register.
277 ++ */
278 ++ movq \save_reg, %cr3
279 ++.Lend_\@:
280 ++.endm
281 ++
282 ++#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
283 ++
284 ++.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
285 ++.endm
286 ++.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
287 ++.endm
288 ++.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
289 ++.endm
290 ++.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
291 ++.endm
292 ++.macro RESTORE_CR3 scratch_reg:req save_reg:req
293 ++.endm
294 ++
295 ++#endif
296 ++
297 + #endif /* CONFIG_X86_64 */
298 +
299 + /*
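[Editor's note: the SWITCH_TO_*_CR3 macros added above only ever touch two low bits of CR3 — bit 12 selects the kernel vs. user half of the 8k PGD and bit 11 selects the kernel vs. user PCID — plus the no-flush bit 63. A rough C model of that arithmetic, outside the patch, using the bit positions shown in the hunk (PAGE_SHIFT = 12, X86_CR3_PTI_SWITCH_BIT = 11, X86_CR3_PCID_NOFLUSH_BIT = 63); the real macros additionally gate everything on X86_FEATURE_PTI/PCID alternatives:]

#include <stdint.h>
#include <stdio.h>

#define PTI_SWITCH_PGTABLES_MASK  (1ULL << 12)  /* PAGE_SHIFT: user PGD half */
#define PTI_SWITCH_PCID_MASK      (1ULL << 11)  /* X86_CR3_PTI_SWITCH_BIT: user PCID */
#define PTI_SWITCH_MASK  (PTI_SWITCH_PGTABLES_MASK | PTI_SWITCH_PCID_MASK)
#define CR3_NOFLUSH_BIT           (1ULL << 63)  /* X86_CR3_PCID_NOFLUSH_BIT */

static uint64_t adjust_kernel_cr3(uint64_t cr3)
{
	/* Clear the user PGD-half and user-PCID bits: point at kernel tables. */
	return cr3 & ~PTI_SWITCH_MASK;
}

static uint64_t switch_to_user_cr3(uint64_t cr3, int asid_needs_flush)
{
	/* Flip to the user PGD half and user PCID ... */
	cr3 |= PTI_SWITCH_MASK;
	/* ... and skip the implicit TLB flush if this ASID is still clean. */
	if (!asid_needs_flush)
		cr3 |= CR3_NOFLUSH_BIT;
	return cr3;
}

int main(void)
{
	uint64_t kernel_cr3 = 0x1234000ULL | 1;     /* made-up PGD phys | kPCID 1 */

	printf("kernel CR3: %#llx\n", (unsigned long long)kernel_cr3);
	printf("user CR3  : %#llx\n",
	       (unsigned long long)switch_to_user_cr3(kernel_cr3, 0));
	printf("back again: %#llx\n",
	       (unsigned long long)adjust_kernel_cr3(switch_to_user_cr3(kernel_cr3, 1)));
	return 0;
}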
300 +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
301 +index 22c891c3b78d..dd696b966e58 100644
302 +--- a/arch/x86/entry/entry_64.S
303 ++++ b/arch/x86/entry/entry_64.S
304 +@@ -23,7 +23,6 @@
305 + #include <asm/segment.h>
306 + #include <asm/cache.h>
307 + #include <asm/errno.h>
308 +-#include "calling.h"
309 + #include <asm/asm-offsets.h>
310 + #include <asm/msr.h>
311 + #include <asm/unistd.h>
312 +@@ -40,6 +39,8 @@
313 + #include <asm/frame.h>
314 + #include <linux/err.h>
315 +
316 ++#include "calling.h"
317 ++
318 + .code64
319 + .section .entry.text, "ax"
320 +
321 +@@ -164,6 +165,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
322 + /* Stash the user RSP. */
323 + movq %rsp, RSP_SCRATCH
324 +
325 ++ /* Note: using %rsp as a scratch reg. */
326 ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
327 ++
328 + /* Load the top of the task stack into RSP */
329 + movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
330 +
331 +@@ -203,6 +207,10 @@ ENTRY(entry_SYSCALL_64)
332 + */
333 +
334 + swapgs
335 ++ /*
336 ++ * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
337 ++ * is not required to switch CR3.
338 ++ */
339 + movq %rsp, PER_CPU_VAR(rsp_scratch)
340 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
341 +
342 +@@ -399,6 +407,7 @@ syscall_return_via_sysret:
343 + * We are on the trampoline stack. All regs except RDI are live.
344 + * We can do future final exit work right here.
345 + */
346 ++ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
347 +
348 + popq %rdi
349 + popq %rsp
350 +@@ -736,6 +745,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
351 + * We can do future final exit work right here.
352 + */
353 +
354 ++ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
355 ++
356 + /* Restore RDI. */
357 + popq %rdi
358 + SWAPGS
359 +@@ -818,7 +829,9 @@ native_irq_return_ldt:
360 + */
361 +
362 + pushq %rdi /* Stash user RDI */
363 +- SWAPGS
364 ++ SWAPGS /* to kernel GS */
365 ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
366 ++
367 + movq PER_CPU_VAR(espfix_waddr), %rdi
368 + movq %rax, (0*8)(%rdi) /* user RAX */
369 + movq (1*8)(%rsp), %rax /* user RIP */
370 +@@ -834,7 +847,6 @@ native_irq_return_ldt:
371 + /* Now RAX == RSP. */
372 +
373 + andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
374 +- popq %rdi /* Restore user RDI */
375 +
376 + /*
377 + * espfix_stack[31:16] == 0. The page tables are set up such that
378 +@@ -845,7 +857,11 @@ native_irq_return_ldt:
379 + * still points to an RO alias of the ESPFIX stack.
380 + */
381 + orq PER_CPU_VAR(espfix_stack), %rax
382 +- SWAPGS
383 ++
384 ++ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
385 ++ SWAPGS /* to user GS */
386 ++ popq %rdi /* Restore user RDI */
387 ++
388 + movq %rax, %rsp
389 + UNWIND_HINT_IRET_REGS offset=8
390 +
391 +@@ -945,6 +961,8 @@ ENTRY(switch_to_thread_stack)
392 + UNWIND_HINT_FUNC
393 +
394 + pushq %rdi
395 ++ /* Need to switch before accessing the thread stack. */
396 ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
397 + movq %rsp, %rdi
398 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
399 + UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
400 +@@ -1244,7 +1262,11 @@ ENTRY(paranoid_entry)
401 + js 1f /* negative -> in kernel */
402 + SWAPGS
403 + xorl %ebx, %ebx
404 +-1: ret
405 ++
406 ++1:
407 ++ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
408 ++
409 ++ ret
410 + END(paranoid_entry)
411 +
412 + /*
413 +@@ -1266,6 +1288,7 @@ ENTRY(paranoid_exit)
414 + testl %ebx, %ebx /* swapgs needed? */
415 + jnz .Lparanoid_exit_no_swapgs
416 + TRACE_IRQS_IRETQ
417 ++ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
418 + SWAPGS_UNSAFE_STACK
419 + jmp .Lparanoid_exit_restore
420 + .Lparanoid_exit_no_swapgs:
421 +@@ -1293,6 +1316,8 @@ ENTRY(error_entry)
422 + * from user mode due to an IRET fault.
423 + */
424 + SWAPGS
425 ++ /* We have user CR3. Change to kernel CR3. */
426 ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
427 +
428 + .Lerror_entry_from_usermode_after_swapgs:
429 + /* Put us onto the real thread stack. */
430 +@@ -1339,6 +1364,7 @@ ENTRY(error_entry)
431 + * .Lgs_change's error handler with kernel gsbase.
432 + */
433 + SWAPGS
434 ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
435 + jmp .Lerror_entry_done
436 +
437 + .Lbstep_iret:
438 +@@ -1348,10 +1374,11 @@ ENTRY(error_entry)
439 +
440 + .Lerror_bad_iret:
441 + /*
442 +- * We came from an IRET to user mode, so we have user gsbase.
443 +- * Switch to kernel gsbase:
444 ++ * We came from an IRET to user mode, so we have user
445 ++ * gsbase and CR3. Switch to kernel gsbase and CR3:
446 + */
447 + SWAPGS
448 ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
449 +
450 + /*
451 + * Pretend that the exception came from user mode: set up pt_regs
452 +@@ -1383,6 +1410,10 @@ END(error_exit)
453 + /*
454 + * Runs on exception stack. Xen PV does not go through this path at all,
455 + * so we can use real assembly here.
456 ++ *
457 ++ * Registers:
458 ++ * %r14: Used to save/restore the CR3 of the interrupted context
459 ++ * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
460 + */
461 + ENTRY(nmi)
462 + UNWIND_HINT_IRET_REGS
463 +@@ -1446,6 +1477,7 @@ ENTRY(nmi)
464 +
465 + swapgs
466 + cld
467 ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
468 + movq %rsp, %rdx
469 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
470 + UNWIND_HINT_IRET_REGS base=%rdx offset=8
471 +@@ -1698,6 +1730,8 @@ end_repeat_nmi:
472 + movq $-1, %rsi
473 + call do_nmi
474 +
475 ++ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
476 ++
477 + testl %ebx, %ebx /* swapgs needed? */
478 + jnz nmi_restore
479 + nmi_swapgs:
480 +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
481 +index 95ad40eb7eff..40f17009ec20 100644
482 +--- a/arch/x86/entry/entry_64_compat.S
483 ++++ b/arch/x86/entry/entry_64_compat.S
484 +@@ -49,6 +49,10 @@
485 + ENTRY(entry_SYSENTER_compat)
486 + /* Interrupts are off on entry. */
487 + SWAPGS
488 ++
489 ++ /* We are about to clobber %rsp anyway, clobbering here is OK */
490 ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
491 ++
492 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
493 +
494 + /*
495 +@@ -215,6 +219,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
496 + pushq $0 /* pt_regs->r14 = 0 */
497 + pushq $0 /* pt_regs->r15 = 0 */
498 +
499 ++ /*
500 ++ * We just saved %rdi so it is safe to clobber. It is not
501 ++ * preserved during the C calls inside TRACE_IRQS_OFF anyway.
502 ++ */
503 ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
504 ++
505 + /*
506 + * User mode is traced as though IRQs are on, and SYSENTER
507 + * turned them off.
508 +@@ -256,10 +266,22 @@ sysret32_from_system_call:
509 + * when the system call started, which is already known to user
510 + * code. We zero R8-R10 to avoid info leaks.
511 + */
512 ++ movq RSP-ORIG_RAX(%rsp), %rsp
513 ++
514 ++ /*
515 ++ * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
516 ++ * on the process stack which is not mapped to userspace and
517 ++ * not readable after we SWITCH_TO_USER_CR3. Delay the CR3
518 ++ * switch until after the last reference to the process
519 ++ * stack.
520 ++ *
521 ++ * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
522 ++ */
523 ++ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
524 ++
525 + xorq %r8, %r8
526 + xorq %r9, %r9
527 + xorq %r10, %r10
528 +- movq RSP-ORIG_RAX(%rsp), %rsp
529 + swapgs
530 + sysretl
531 + END(entry_SYSCALL_compat)
532 +diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
533 +index 1faf40f2dda9..577fa8adb785 100644
534 +--- a/arch/x86/entry/vsyscall/vsyscall_64.c
535 ++++ b/arch/x86/entry/vsyscall/vsyscall_64.c
536 +@@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
537 + * vsyscalls but leave the page not present. If so, we skip calling
538 + * this.
539 + */
540 +-static void __init set_vsyscall_pgtable_user_bits(void)
541 ++void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
542 + {
543 + pgd_t *pgd;
544 + p4d_t *p4d;
545 + pud_t *pud;
546 + pmd_t *pmd;
547 +
548 +- pgd = pgd_offset_k(VSYSCALL_ADDR);
549 ++ pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
550 + set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
551 + p4d = p4d_offset(pgd, VSYSCALL_ADDR);
552 + #if CONFIG_PGTABLE_LEVELS >= 5
553 +@@ -373,7 +373,7 @@ void __init map_vsyscall(void)
554 + vsyscall_mode == NATIVE
555 + ? PAGE_KERNEL_VSYSCALL
556 + : PAGE_KERNEL_VVAR);
557 +- set_vsyscall_pgtable_user_bits();
558 ++ set_vsyscall_pgtable_user_bits(swapper_pg_dir);
559 + }
560 +
561 + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
562 +diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
563 +index 3674a4b6f8bd..8f0aace08b87 100644
564 +--- a/arch/x86/events/intel/ds.c
565 ++++ b/arch/x86/events/intel/ds.c
566 +@@ -3,16 +3,18 @@
567 + #include <linux/types.h>
568 + #include <linux/slab.h>
569 +
570 ++#include <asm/cpu_entry_area.h>
571 + #include <asm/perf_event.h>
572 + #include <asm/insn.h>
573 +
574 + #include "../perf_event.h"
575 +
576 ++/* Waste a full page so it can be mapped into the cpu_entry_area */
577 ++DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
578 ++
579 + /* The size of a BTS record in bytes: */
580 + #define BTS_RECORD_SIZE 24
581 +
582 +-#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
583 +-#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
584 + #define PEBS_FIXUP_SIZE PAGE_SIZE
585 +
586 + /*
587 +@@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
588 +
589 + static DEFINE_PER_CPU(void *, insn_buffer);
590 +
591 +-static int alloc_pebs_buffer(int cpu)
592 ++static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
593 + {
594 +- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
595 ++ phys_addr_t pa;
596 ++ size_t msz = 0;
597 ++
598 ++ pa = virt_to_phys(addr);
599 ++ for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
600 ++ cea_set_pte(cea, pa, prot);
601 ++}
602 ++
603 ++static void ds_clear_cea(void *cea, size_t size)
604 ++{
605 ++ size_t msz = 0;
606 ++
607 ++ for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
608 ++ cea_set_pte(cea, 0, PAGE_NONE);
609 ++}
610 ++
611 ++static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
612 ++{
613 ++ unsigned int order = get_order(size);
614 + int node = cpu_to_node(cpu);
615 +- int max;
616 +- void *buffer, *ibuffer;
617 ++ struct page *page;
618 ++
619 ++ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
620 ++ return page ? page_address(page) : NULL;
621 ++}
622 ++
623 ++static void dsfree_pages(const void *buffer, size_t size)
624 ++{
625 ++ if (buffer)
626 ++ free_pages((unsigned long)buffer, get_order(size));
627 ++}
628 ++
629 ++static int alloc_pebs_buffer(int cpu)
630 ++{
631 ++ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
632 ++ struct debug_store *ds = hwev->ds;
633 ++ size_t bsiz = x86_pmu.pebs_buffer_size;
634 ++ int max, node = cpu_to_node(cpu);
635 ++ void *buffer, *ibuffer, *cea;
636 +
637 + if (!x86_pmu.pebs)
638 + return 0;
639 +
640 +- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
641 ++ buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
642 + if (unlikely(!buffer))
643 + return -ENOMEM;
644 +
645 +@@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
646 + if (x86_pmu.intel_cap.pebs_format < 2) {
647 + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
648 + if (!ibuffer) {
649 +- kfree(buffer);
650 ++ dsfree_pages(buffer, bsiz);
651 + return -ENOMEM;
652 + }
653 + per_cpu(insn_buffer, cpu) = ibuffer;
654 + }
655 +-
656 +- max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
657 +-
658 +- ds->pebs_buffer_base = (u64)(unsigned long)buffer;
659 ++ hwev->ds_pebs_vaddr = buffer;
660 ++ /* Update the cpu entry area mapping */
661 ++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
662 ++ ds->pebs_buffer_base = (unsigned long) cea;
663 ++ ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
664 + ds->pebs_index = ds->pebs_buffer_base;
665 +- ds->pebs_absolute_maximum = ds->pebs_buffer_base +
666 +- max * x86_pmu.pebs_record_size;
667 +-
668 ++ max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
669 ++ ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
670 + return 0;
671 + }
672 +
673 + static void release_pebs_buffer(int cpu)
674 + {
675 +- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
676 ++ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
677 ++ struct debug_store *ds = hwev->ds;
678 ++ void *cea;
679 +
680 + if (!ds || !x86_pmu.pebs)
681 + return;
682 +@@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
683 + kfree(per_cpu(insn_buffer, cpu));
684 + per_cpu(insn_buffer, cpu) = NULL;
685 +
686 +- kfree((void *)(unsigned long)ds->pebs_buffer_base);
687 ++ /* Clear the fixmap */
688 ++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
689 ++ ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
690 + ds->pebs_buffer_base = 0;
691 ++ dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
692 ++ hwev->ds_pebs_vaddr = NULL;
693 + }
694 +
695 + static int alloc_bts_buffer(int cpu)
696 + {
697 +- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
698 +- int node = cpu_to_node(cpu);
699 +- int max, thresh;
700 +- void *buffer;
701 ++ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
702 ++ struct debug_store *ds = hwev->ds;
703 ++ void *buffer, *cea;
704 ++ int max;
705 +
706 + if (!x86_pmu.bts)
707 + return 0;
708 +
709 +- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
710 ++ buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
711 + if (unlikely(!buffer)) {
712 + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
713 + return -ENOMEM;
714 + }
715 +-
716 +- max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
717 +- thresh = max / 16;
718 +-
719 +- ds->bts_buffer_base = (u64)(unsigned long)buffer;
720 ++ hwev->ds_bts_vaddr = buffer;
721 ++ /* Update the fixmap */
722 ++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
723 ++ ds->bts_buffer_base = (unsigned long) cea;
724 ++ ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
725 + ds->bts_index = ds->bts_buffer_base;
726 +- ds->bts_absolute_maximum = ds->bts_buffer_base +
727 +- max * BTS_RECORD_SIZE;
728 +- ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
729 +- thresh * BTS_RECORD_SIZE;
730 +-
731 ++ max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
732 ++ ds->bts_absolute_maximum = ds->bts_buffer_base + max;
733 ++ ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
734 + return 0;
735 + }
736 +
737 + static void release_bts_buffer(int cpu)
738 + {
739 +- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
740 ++ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
741 ++ struct debug_store *ds = hwev->ds;
742 ++ void *cea;
743 +
744 + if (!ds || !x86_pmu.bts)
745 + return;
746 +
747 +- kfree((void *)(unsigned long)ds->bts_buffer_base);
748 ++ /* Clear the fixmap */
749 ++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
750 ++ ds_clear_cea(cea, BTS_BUFFER_SIZE);
751 + ds->bts_buffer_base = 0;
752 ++ dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
753 ++ hwev->ds_bts_vaddr = NULL;
754 + }
755 +
756 + static int alloc_ds_buffer(int cpu)
757 + {
758 +- int node = cpu_to_node(cpu);
759 +- struct debug_store *ds;
760 +-
761 +- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
762 +- if (unlikely(!ds))
763 +- return -ENOMEM;
764 ++ struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
765 +
766 ++ memset(ds, 0, sizeof(*ds));
767 + per_cpu(cpu_hw_events, cpu).ds = ds;
768 +-
769 + return 0;
770 + }
771 +
772 + static void release_ds_buffer(int cpu)
773 + {
774 +- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
775 +-
776 +- if (!ds)
777 +- return;
778 +-
779 + per_cpu(cpu_hw_events, cpu).ds = NULL;
780 +- kfree(ds);
781 + }
782 +
783 + void release_ds_buffers(void)
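[Editor's note: the reworked alloc_bts_buffer()/alloc_pebs_buffer() above round the usable area down to a whole number of records before computing the absolute maximum and the interrupt threshold. With the sizes defined elsewhere in this patch (BTS_BUFFER_SIZE = PAGE_SIZE << 4 = 64 KiB, BTS_RECORD_SIZE = 24) the arithmetic works out as in this throwaway check:]

#include <stdio.h>

#define PAGE_SIZE        4096
#define BTS_BUFFER_SIZE  (PAGE_SIZE << 4)    /* 65536 bytes */
#define BTS_RECORD_SIZE  24

int main(void)
{
	/* Same expression as the patched alloc_bts_buffer(): */
	int max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);

	printf("usable bytes            : %d\n", max);                  /* 65520 */
	printf("records per buffer      : %d\n", max / BTS_RECORD_SIZE); /* 2730 */
	printf("threshold below the end : %d bytes\n", max / 16);        /* 4095 */
	return 0;
}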
784 +diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
785 +index f7aaadf9331f..8e4ea143ed96 100644
786 +--- a/arch/x86/events/perf_event.h
787 ++++ b/arch/x86/events/perf_event.h
788 +@@ -14,6 +14,8 @@
789 +
790 + #include <linux/perf_event.h>
791 +
792 ++#include <asm/intel_ds.h>
793 ++
794 + /* To enable MSR tracing please use the generic trace points. */
795 +
796 + /*
797 +@@ -77,8 +79,6 @@ struct amd_nb {
798 + struct event_constraint event_constraints[X86_PMC_IDX_MAX];
799 + };
800 +
801 +-/* The maximal number of PEBS events: */
802 +-#define MAX_PEBS_EVENTS 8
803 + #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
804 +
805 + /*
806 +@@ -95,23 +95,6 @@ struct amd_nb {
807 + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
808 + PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
809 +
810 +-/*
811 +- * A debug store configuration.
812 +- *
813 +- * We only support architectures that use 64bit fields.
814 +- */
815 +-struct debug_store {
816 +- u64 bts_buffer_base;
817 +- u64 bts_index;
818 +- u64 bts_absolute_maximum;
819 +- u64 bts_interrupt_threshold;
820 +- u64 pebs_buffer_base;
821 +- u64 pebs_index;
822 +- u64 pebs_absolute_maximum;
823 +- u64 pebs_interrupt_threshold;
824 +- u64 pebs_event_reset[MAX_PEBS_EVENTS];
825 +-};
826 +-
827 + #define PEBS_REGS \
828 + (PERF_REG_X86_AX | \
829 + PERF_REG_X86_BX | \
830 +@@ -216,6 +199,8 @@ struct cpu_hw_events {
831 + * Intel DebugStore bits
832 + */
833 + struct debug_store *ds;
834 ++ void *ds_pebs_vaddr;
835 ++ void *ds_bts_vaddr;
836 + u64 pebs_enabled;
837 + int n_pebs;
838 + int n_large_pebs;
839 +diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
840 +index 2fbc69a0916e..4a7884b8dca5 100644
841 +--- a/arch/x86/include/asm/cpu_entry_area.h
842 ++++ b/arch/x86/include/asm/cpu_entry_area.h
843 +@@ -5,6 +5,7 @@
844 +
845 + #include <linux/percpu-defs.h>
846 + #include <asm/processor.h>
847 ++#include <asm/intel_ds.h>
848 +
849 + /*
850 + * cpu_entry_area is a percpu region that contains things needed by the CPU
851 +@@ -40,6 +41,18 @@ struct cpu_entry_area {
852 + */
853 + char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
854 + #endif
855 ++#ifdef CONFIG_CPU_SUP_INTEL
856 ++ /*
857 ++ * Per CPU debug store for Intel performance monitoring. Wastes a
858 ++ * full page at the moment.
859 ++ */
860 ++ struct debug_store cpu_debug_store;
861 ++ /*
862 ++ * The actual PEBS/BTS buffers must be mapped to user space
863 ++ * Reserve enough fixmap PTEs.
864 ++ */
865 ++ struct debug_store_buffers cpu_debug_buffers;
866 ++#endif
867 + };
868 +
869 + #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
870 +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
871 +index 800104c8a3ed..07cdd1715705 100644
872 +--- a/arch/x86/include/asm/cpufeatures.h
873 ++++ b/arch/x86/include/asm/cpufeatures.h
874 +@@ -197,11 +197,12 @@
875 + #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
876 + #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
877 + #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
878 ++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
879 +
880 + #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
881 + #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
882 + #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
883 +-
884 ++#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
885 + #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
886 + #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
887 + #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
888 +@@ -340,5 +341,6 @@
889 + #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
890 + #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
891 + #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
892 ++#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
893 +
894 + #endif /* _ASM_X86_CPUFEATURES_H */
895 +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
896 +index bc359dd2f7f6..85e23bb7b34e 100644
897 +--- a/arch/x86/include/asm/desc.h
898 ++++ b/arch/x86/include/asm/desc.h
899 +@@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
900 +
901 + desc->type = (info->read_exec_only ^ 1) << 1;
902 + desc->type |= info->contents << 2;
903 ++ /* Set the ACCESS bit so it can be mapped RO */
904 ++ desc->type |= 1;
905 +
906 + desc->s = 1;
907 + desc->dpl = 0x3;
908 +diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
909 +index c10c9128f54e..e428e16dd822 100644
910 +--- a/arch/x86/include/asm/disabled-features.h
911 ++++ b/arch/x86/include/asm/disabled-features.h
912 +@@ -44,6 +44,12 @@
913 + # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
914 + #endif
915 +
916 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
917 ++# define DISABLE_PTI 0
918 ++#else
919 ++# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
920 ++#endif
921 ++
922 + /*
923 + * Make sure to add features to the correct mask
924 + */
925 +@@ -54,7 +60,7 @@
926 + #define DISABLED_MASK4 (DISABLE_PCID)
927 + #define DISABLED_MASK5 0
928 + #define DISABLED_MASK6 0
929 +-#define DISABLED_MASK7 0
930 ++#define DISABLED_MASK7 (DISABLE_PTI)
931 + #define DISABLED_MASK8 0
932 + #define DISABLED_MASK9 (DISABLE_MPX)
933 + #define DISABLED_MASK10 0
934 +diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
935 +new file mode 100644
936 +index 000000000000..62a9f4966b42
937 +--- /dev/null
938 ++++ b/arch/x86/include/asm/intel_ds.h
939 +@@ -0,0 +1,36 @@
940 ++#ifndef _ASM_INTEL_DS_H
941 ++#define _ASM_INTEL_DS_H
942 ++
943 ++#include <linux/percpu-defs.h>
944 ++
945 ++#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
946 ++#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
947 ++
948 ++/* The maximal number of PEBS events: */
949 ++#define MAX_PEBS_EVENTS 8
950 ++
951 ++/*
952 ++ * A debug store configuration.
953 ++ *
954 ++ * We only support architectures that use 64bit fields.
955 ++ */
956 ++struct debug_store {
957 ++ u64 bts_buffer_base;
958 ++ u64 bts_index;
959 ++ u64 bts_absolute_maximum;
960 ++ u64 bts_interrupt_threshold;
961 ++ u64 pebs_buffer_base;
962 ++ u64 pebs_index;
963 ++ u64 pebs_absolute_maximum;
964 ++ u64 pebs_interrupt_threshold;
965 ++ u64 pebs_event_reset[MAX_PEBS_EVENTS];
966 ++} __aligned(PAGE_SIZE);
967 ++
968 ++DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
969 ++
970 ++struct debug_store_buffers {
971 ++ char bts_buffer[BTS_BUFFER_SIZE];
972 ++ char pebs_buffer[PEBS_BUFFER_SIZE];
973 ++};
974 ++
975 ++#endif
976 +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
977 +index 5ede7cae1d67..c931b88982a0 100644
978 +--- a/arch/x86/include/asm/mmu_context.h
979 ++++ b/arch/x86/include/asm/mmu_context.h
980 +@@ -50,10 +50,33 @@ struct ldt_struct {
981 + * call gates. On native, we could merge the ldt_struct and LDT
982 + * allocations, but it's not worth trying to optimize.
983 + */
984 +- struct desc_struct *entries;
985 +- unsigned int nr_entries;
986 ++ struct desc_struct *entries;
987 ++ unsigned int nr_entries;
988 ++
989 ++ /*
990 ++ * If PTI is in use, then the entries array is not mapped while we're
991 ++ * in user mode. The whole array will be aliased at the address
992 ++ * given by ldt_slot_va(slot). We use two slots so that we can allocate
993 ++ * and map, and enable a new LDT without invalidating the mapping
994 ++ * of an older, still-in-use LDT.
995 ++ *
996 ++ * slot will be -1 if this LDT doesn't have an alias mapping.
997 ++ */
998 ++ int slot;
999 + };
1000 +
1001 ++/* This is a multiple of PAGE_SIZE. */
1002 ++#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
1003 ++
1004 ++static inline void *ldt_slot_va(int slot)
1005 ++{
1006 ++#ifdef CONFIG_X86_64
1007 ++ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
1008 ++#else
1009 ++ BUG();
1010 ++#endif
1011 ++}
1012 ++
1013 + /*
1014 + * Used for LDT copy/destruction.
1015 + */
1016 +@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
1017 + }
1018 + int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
1019 + void destroy_context_ldt(struct mm_struct *mm);
1020 ++void ldt_arch_exit_mmap(struct mm_struct *mm);
1021 + #else /* CONFIG_MODIFY_LDT_SYSCALL */
1022 + static inline void init_new_context_ldt(struct mm_struct *mm) { }
1023 + static inline int ldt_dup_context(struct mm_struct *oldmm,
1024 +@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
1025 + {
1026 + return 0;
1027 + }
1028 +-static inline void destroy_context_ldt(struct mm_struct *mm) {}
1029 ++static inline void destroy_context_ldt(struct mm_struct *mm) { }
1030 ++static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
1031 + #endif
1032 +
1033 + static inline void load_mm_ldt(struct mm_struct *mm)
1034 +@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
1035 + * that we can see.
1036 + */
1037 +
1038 +- if (unlikely(ldt))
1039 +- set_ldt(ldt->entries, ldt->nr_entries);
1040 +- else
1041 ++ if (unlikely(ldt)) {
1042 ++ if (static_cpu_has(X86_FEATURE_PTI)) {
1043 ++ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
1044 ++ /*
1045 ++ * Whoops -- either the new LDT isn't mapped
1046 ++ * (if slot == -1) or is mapped into a bogus
1047 ++ * slot (if slot > 1).
1048 ++ */
1049 ++ clear_LDT();
1050 ++ return;
1051 ++ }
1052 ++
1053 ++ /*
1054 ++ * If page table isolation is enabled, ldt->entries
1055 ++ * will not be mapped in the userspace pagetables.
1056 ++ * Tell the CPU to access the LDT through the alias
1057 ++ * at ldt_slot_va(ldt->slot).
1058 ++ */
1059 ++ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
1060 ++ } else {
1061 ++ set_ldt(ldt->entries, ldt->nr_entries);
1062 ++ }
1063 ++ } else {
1064 + clear_LDT();
1065 ++ }
1066 + #else
1067 + clear_LDT();
1068 + #endif
1069 +@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1070 + static inline void arch_exit_mmap(struct mm_struct *mm)
1071 + {
1072 + paravirt_arch_exit_mmap(mm);
1073 ++ ldt_arch_exit_mmap(mm);
1074 + }
1075 +
1076 + #ifdef CONFIG_X86_64
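[Editor's note: ldt_slot_va() above just indexes one of two fixed-size aliases starting at LDT_BASE_ADDR. A standalone sketch of that computation, assuming the usual x86 values LDT_ENTRIES = 8192 and LDT_ENTRY_SIZE = 8 (so LDT_SLOT_STRIDE = 64 KiB) and the 4-level LDT_BASE_ADDR defined later in this patch:]

#include <stdint.h>
#include <stdio.h>

#define LDT_ENTRIES      8192                  /* assumed, as in asm/segment.h */
#define LDT_ENTRY_SIZE   8
#define LDT_SLOT_STRIDE  (LDT_ENTRIES * LDT_ENTRY_SIZE)    /* 64 KiB */
#define LDT_BASE_ADDR    0xfffffe0000000000ULL /* 4-level value from this patch */

static uint64_t ldt_slot_va(int slot)
{
	return LDT_BASE_ADDR + (uint64_t)LDT_SLOT_STRIDE * slot;
}

int main(void)
{
	printf("slot 0: %#llx\n", (unsigned long long)ldt_slot_va(0)); /* 0xfffffe0000000000 */
	printf("slot 1: %#llx\n", (unsigned long long)ldt_slot_va(1)); /* 0xfffffe0000010000 */
	return 0;
}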
1077 +diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
1078 +index 4b5e1eafada7..aff42e1da6ee 100644
1079 +--- a/arch/x86/include/asm/pgalloc.h
1080 ++++ b/arch/x86/include/asm/pgalloc.h
1081 +@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
1082 + */
1083 + extern gfp_t __userpte_alloc_gfp;
1084 +
1085 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1086 ++/*
1087 ++ * Instead of one PGD, we acquire two PGDs. Being order-1, it is
1088 ++ * both 8k in size and 8k-aligned. That lets us just flip bit 12
1089 ++ * in a pointer to swap between the two 4k halves.
1090 ++ */
1091 ++#define PGD_ALLOCATION_ORDER 1
1092 ++#else
1093 ++#define PGD_ALLOCATION_ORDER 0
1094 ++#endif
1095 ++
1096 + /*
1097 + * Allocate and free page tables.
1098 + */
1099 +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
1100 +index f02de8bc1f72..211368922cad 100644
1101 +--- a/arch/x86/include/asm/pgtable.h
1102 ++++ b/arch/x86/include/asm/pgtable.h
1103 +@@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
1104 + int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
1105 +
1106 + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
1107 ++void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
1108 + void ptdump_walk_pgd_level_checkwx(void);
1109 +
1110 + #ifdef CONFIG_DEBUG_WX
1111 +@@ -846,7 +847,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
1112 +
1113 + static inline int p4d_bad(p4d_t p4d)
1114 + {
1115 +- return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
1116 ++ unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
1117 ++
1118 ++ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
1119 ++ ignore_flags |= _PAGE_NX;
1120 ++
1121 ++ return (p4d_flags(p4d) & ~ignore_flags) != 0;
1122 + }
1123 + #endif /* CONFIG_PGTABLE_LEVELS > 3 */
1124 +
1125 +@@ -880,7 +886,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
1126 +
1127 + static inline int pgd_bad(pgd_t pgd)
1128 + {
1129 +- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
1130 ++ unsigned long ignore_flags = _PAGE_USER;
1131 ++
1132 ++ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
1133 ++ ignore_flags |= _PAGE_NX;
1134 ++
1135 ++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
1136 + }
1137 +
1138 + static inline int pgd_none(pgd_t pgd)
1139 +@@ -909,7 +920,11 @@ static inline int pgd_none(pgd_t pgd)
1140 + * pgd_offset() returns a (pgd_t *)
1141 + * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
1142 + */
1143 +-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
1144 ++#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
1145 ++/*
1146 ++ * a shortcut to get a pgd_t in a given mm
1147 ++ */
1148 ++#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
1149 + /*
1150 + * a shortcut which implies the use of the kernel's pgd, instead
1151 + * of a process's
1152 +@@ -1111,7 +1126,14 @@ static inline int pud_write(pud_t pud)
1153 + */
1154 + static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
1155 + {
1156 +- memcpy(dst, src, count * sizeof(pgd_t));
1157 ++ memcpy(dst, src, count * sizeof(pgd_t));
1158 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1159 ++ if (!static_cpu_has(X86_FEATURE_PTI))
1160 ++ return;
1161 ++ /* Clone the user space pgd as well */
1162 ++ memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
1163 ++ count * sizeof(pgd_t));
1164 ++#endif
1165 + }
1166 +
1167 + #define PTE_SHIFT ilog2(PTRS_PER_PTE)
1168 +diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
1169 +index e9f05331e732..81462e9a34f6 100644
1170 +--- a/arch/x86/include/asm/pgtable_64.h
1171 ++++ b/arch/x86/include/asm/pgtable_64.h
1172 +@@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
1173 + #endif
1174 + }
1175 +
1176 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1177 ++/*
1178 ++ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
1179 ++ * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
1180 ++ * the user one is in the last 4k. To switch between them, you
1181 ++ * just need to flip the 12th bit in their addresses.
1182 ++ */
1183 ++#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
1184 ++
1185 ++/*
1186 ++ * This generates better code than the inline assembly in
1187 ++ * __set_bit().
1188 ++ */
1189 ++static inline void *ptr_set_bit(void *ptr, int bit)
1190 ++{
1191 ++ unsigned long __ptr = (unsigned long)ptr;
1192 ++
1193 ++ __ptr |= BIT(bit);
1194 ++ return (void *)__ptr;
1195 ++}
1196 ++static inline void *ptr_clear_bit(void *ptr, int bit)
1197 ++{
1198 ++ unsigned long __ptr = (unsigned long)ptr;
1199 ++
1200 ++ __ptr &= ~BIT(bit);
1201 ++ return (void *)__ptr;
1202 ++}
1203 ++
1204 ++static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
1205 ++{
1206 ++ return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
1207 ++}
1208 ++
1209 ++static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
1210 ++{
1211 ++ return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
1212 ++}
1213 ++
1214 ++static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
1215 ++{
1216 ++ return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
1217 ++}
1218 ++
1219 ++static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
1220 ++{
1221 ++ return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
1222 ++}
1223 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
1224 ++
1225 ++/*
1226 ++ * Page table pages are page-aligned. The lower half of the top
1227 ++ * level is used for userspace and the top half for the kernel.
1228 ++ *
1229 ++ * Returns true for parts of the PGD that map userspace and
1230 ++ * false for the parts that map the kernel.
1231 ++ */
1232 ++static inline bool pgdp_maps_userspace(void *__ptr)
1233 ++{
1234 ++ unsigned long ptr = (unsigned long)__ptr;
1235 ++
1236 ++ return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
1237 ++}
1238 ++
1239 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1240 ++pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
1241 ++
1242 ++/*
1243 ++ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
1244 ++ * Populates the user and returns the resulting PGD that must be set in
1245 ++ * the kernel copy of the page tables.
1246 ++ */
1247 ++static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
1248 ++{
1249 ++ if (!static_cpu_has(X86_FEATURE_PTI))
1250 ++ return pgd;
1251 ++ return __pti_set_user_pgd(pgdp, pgd);
1252 ++}
1253 ++#else
1254 ++static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
1255 ++{
1256 ++ return pgd;
1257 ++}
1258 ++#endif
1259 ++
1260 + static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
1261 + {
1262 ++#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
1263 ++ p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
1264 ++#else
1265 + *p4dp = p4d;
1266 ++#endif
1267 + }
1268 +
1269 + static inline void native_p4d_clear(p4d_t *p4d)
1270 +@@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
1271 +
1272 + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
1273 + {
1274 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1275 ++ *pgdp = pti_set_user_pgd(pgdp, pgd);
1276 ++#else
1277 + *pgdp = pgd;
1278 ++#endif
1279 + }
1280 +
1281 + static inline void native_pgd_clear(pgd_t *pgd)
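[Editor's note: because the order-1 PGD is 8 KiB and 8 KiB aligned, the kernel/user switch really is a single bit — bit 12 of the pointer picks the half — and the offset of a pgd pointer within its 4 KiB page tells you whether that entry maps userspace (lower half) or the kernel (upper half). A quick standalone rendering of the helpers above; the sample address is made up:]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT  12
#define PAGE_SIZE   (1ULL << PAGE_SHIFT)
#define PAGE_MASK   (~(PAGE_SIZE - 1))

static uint64_t kernel_to_user_pgdp(uint64_t pgdp) { return pgdp |  (1ULL << PAGE_SHIFT); }
static uint64_t user_to_kernel_pgdp(uint64_t pgdp) { return pgdp & ~(1ULL << PAGE_SHIFT); }

static bool pgdp_maps_userspace(uint64_t ptr)
{
	/* pgd pointers in the first half of a 4k page map userspace. */
	return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
}

int main(void)
{
	uint64_t kernel_pgd = 0xffff888012344000ULL;   /* made-up 8k-aligned PGD */

	printf("user half : %#llx\n", (unsigned long long)kernel_to_user_pgdp(kernel_pgd));
	printf("back again: %#llx\n", (unsigned long long)user_to_kernel_pgdp(kernel_pgd | 0x1000));
	/* Entry 0 (offset 0x000) maps userspace, entry 256 (offset 0x800) maps the kernel. */
	printf("entry 0   -> user?   %d\n", pgdp_maps_userspace(kernel_pgd + 8 * 0));
	printf("entry 256 -> kernel? %d\n", !pgdp_maps_userspace(kernel_pgd + 8 * 256));
	return 0;
}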
1282 +diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
1283 +index 3d27831bc58d..b97a539bcdee 100644
1284 +--- a/arch/x86/include/asm/pgtable_64_types.h
1285 ++++ b/arch/x86/include/asm/pgtable_64_types.h
1286 +@@ -79,13 +79,17 @@ typedef struct { pteval_t pte; } pte_t;
1287 + #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
1288 +
1289 + #ifdef CONFIG_X86_5LEVEL
1290 +-# define VMALLOC_SIZE_TB _AC(16384, UL)
1291 +-# define __VMALLOC_BASE _AC(0xff92000000000000, UL)
1292 ++# define VMALLOC_SIZE_TB _AC(12800, UL)
1293 ++# define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
1294 + # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
1295 ++# define LDT_PGD_ENTRY _AC(-112, UL)
1296 ++# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
1297 + #else
1298 + # define VMALLOC_SIZE_TB _AC(32, UL)
1299 + # define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
1300 + # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
1301 ++# define LDT_PGD_ENTRY _AC(-4, UL)
1302 ++# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
1303 + #endif
1304 +
1305 + #ifdef CONFIG_RANDOMIZE_MEMORY
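[Editor's note: the new LDT_BASE_ADDR values are easy to sanity-check against the Documentation/x86/x86_64/mm.txt hunk earlier in this patch — a negative PGD index shifted by PGDIR_SHIFT lands in the kernel half of the address space. A tiny arithmetic check, assuming PGDIR_SHIFT is 39 with 4-level paging and 48 with 5-level paging:]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* 4-level: LDT_PGD_ENTRY = -4, PGDIR_SHIFT = 39 */
	uint64_t ldt_base_4l = (uint64_t)(-4LL)   << 39;
	/* 5-level: LDT_PGD_ENTRY = -112, PGDIR_SHIFT = 48 */
	uint64_t ldt_base_5l = (uint64_t)(-112LL) << 48;

	printf("4-level LDT remap base: %#llx\n", (unsigned long long)ldt_base_4l);
	/* -> 0xfffffe0000000000, matching the mm.txt line added above */
	printf("5-level LDT remap base: %#llx\n", (unsigned long long)ldt_base_5l);
	/* -> 0xff90000000000000, matching the 5-level mm.txt line */
	return 0;
}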
1306 +diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
1307 +index 43212a43ee69..6a60fea90b9d 100644
1308 +--- a/arch/x86/include/asm/processor-flags.h
1309 ++++ b/arch/x86/include/asm/processor-flags.h
1310 +@@ -38,6 +38,11 @@
1311 + #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
1312 + #define CR3_PCID_MASK 0xFFFull
1313 + #define CR3_NOFLUSH BIT_ULL(63)
1314 ++
1315 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1316 ++# define X86_CR3_PTI_SWITCH_BIT 11
1317 ++#endif
1318 ++
1319 + #else
1320 + /*
1321 + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
1322 +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
1323 +index 9e482d8b0b97..9c18da64daa9 100644
1324 +--- a/arch/x86/include/asm/processor.h
1325 ++++ b/arch/x86/include/asm/processor.h
1326 +@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
1327 +
1328 + #else
1329 + /*
1330 +- * User space process size. 47bits minus one guard page. The guard
1331 +- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
1332 +- * the highest possible canonical userspace address, then that
1333 +- * syscall will enter the kernel with a non-canonical return
1334 +- * address, and SYSRET will explode dangerously. We avoid this
1335 +- * particular problem by preventing anything from being mapped
1336 +- * at the maximum canonical address.
1337 ++ * User space process size. This is the first address outside the user range.
1338 ++ * There are a few constraints that determine this:
1339 ++ *
1340 ++ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
1341 ++ * address, then that syscall will enter the kernel with a
1342 ++ * non-canonical return address, and SYSRET will explode dangerously.
1343 ++ * We avoid this particular problem by preventing anything executable
1344 ++ * from being mapped at the maximum canonical address.
1345 ++ *
1346 ++ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
1347 ++ * CPUs malfunction if they execute code from the highest canonical page.
1348 ++ * They'll speculate right off the end of the canonical space, and
1349 ++ * bad things happen. This is worked around in the same way as the
1350 ++ * Intel problem.
1351 ++ *
1352 ++ * With page table isolation enabled, we map the LDT in ... [stay tuned]
1353 + */
1354 + #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
1355 +
1356 +diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
1357 +new file mode 100644
1358 +index 000000000000..0b5ef05b2d2d
1359 +--- /dev/null
1360 ++++ b/arch/x86/include/asm/pti.h
1361 +@@ -0,0 +1,14 @@
1362 ++// SPDX-License-Identifier: GPL-2.0
1363 ++#ifndef _ASM_X86_PTI_H
1364 ++#define _ASM_X86_PTI_H
1365 ++#ifndef __ASSEMBLY__
1366 ++
1367 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1368 ++extern void pti_init(void);
1369 ++extern void pti_check_boottime_disable(void);
1370 ++#else
1371 ++static inline void pti_check_boottime_disable(void) { }
1372 ++#endif
1373 ++
1374 ++#endif /* __ASSEMBLY__ */
1375 ++#endif /* _ASM_X86_PTI_H */
1376 +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
1377 +index 171b429f43a2..f9b48ce152eb 100644
1378 +--- a/arch/x86/include/asm/tlbflush.h
1379 ++++ b/arch/x86/include/asm/tlbflush.h
1380 +@@ -10,38 +10,90 @@
1381 + #include <asm/special_insns.h>
1382 + #include <asm/smp.h>
1383 + #include <asm/invpcid.h>
1384 ++#include <asm/pti.h>
1385 ++#include <asm/processor-flags.h>
1386 +
1387 +-static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
1388 +-{
1389 +- /*
1390 +- * Bump the generation count. This also serves as a full barrier
1391 +- * that synchronizes with switch_mm(): callers are required to order
1392 +- * their read of mm_cpumask after their writes to the paging
1393 +- * structures.
1394 +- */
1395 +- return atomic64_inc_return(&mm->context.tlb_gen);
1396 +-}
1397 ++/*
1398 ++ * The x86 feature is called PCID (Process Context IDentifier). It is similar
1399 ++ * to what is traditionally called ASID on the RISC processors.
1400 ++ *
1401 ++ * We don't use the traditional ASID implementation, where each process/mm gets
1402 ++ * its own ASID and flush/restart when we run out of ASID space.
1403 ++ *
1404 ++ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
1405 ++ * that came by on this CPU, allowing cheaper switch_mm between processes on
1406 ++ * this CPU.
1407 ++ *
1408 ++ * We end up with different spaces for different things. To avoid confusion we
1409 ++ * use different names for each of them:
1410 ++ *
1411 ++ * ASID - [0, TLB_NR_DYN_ASIDS-1]
1412 ++ * the canonical identifier for an mm
1413 ++ *
1414 ++ * kPCID - [1, TLB_NR_DYN_ASIDS]
1415 ++ * the value we write into the PCID part of CR3; corresponds to the
1416 ++ * ASID+1, because PCID 0 is special.
1417 ++ *
1418 ++ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
1419 ++ * for KPTI each mm has two address spaces and thus needs two
1420 ++ * PCID values, but we can still do with a single ASID denomination
1421 ++ * for each mm. Corresponds to kPCID + 2048.
1422 ++ *
1423 ++ */
1424 +
1425 + /* There are 12 bits of space for ASIDS in CR3 */
1426 + #define CR3_HW_ASID_BITS 12
1427 ++
1428 + /*
1429 + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
1430 + * user/kernel switches
1431 + */
1432 +-#define PTI_CONSUMED_ASID_BITS 0
1433 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1434 ++# define PTI_CONSUMED_PCID_BITS 1
1435 ++#else
1436 ++# define PTI_CONSUMED_PCID_BITS 0
1437 ++#endif
1438 ++
1439 ++#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
1440 +
1441 +-#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
1442 + /*
1443 + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
1444 +- * for them being zero-based. Another -1 is because ASID 0 is reserved for
1445 ++ * for them being zero-based. Another -1 is because PCID 0 is reserved for
1446 + * use by non-PCID-aware users.
1447 + */
1448 +-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
1449 ++#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
1450 ++
1451 ++/*
1452 ++ * 6 because 6 should be plenty and struct tlb_state will fit in two cache
1453 ++ * lines.
1454 ++ */
1455 ++#define TLB_NR_DYN_ASIDS 6
1456 +
1457 ++/*
1458 ++ * Given @asid, compute kPCID
1459 ++ */
1460 + static inline u16 kern_pcid(u16 asid)
1461 + {
1462 + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
1463 ++
1464 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1465 ++ /*
1466 ++ * Make sure that the dynamic ASID space does not conflict with the
1467 ++ * bit we are using to switch between user and kernel ASIDs.
1468 ++ */
1469 ++ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
1470 ++
1471 + /*
1472 ++ * The ASID being passed in here should have respected the
1473 ++ * MAX_ASID_AVAILABLE and thus never have the switch bit set.
1474 ++ */
1475 ++ VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
1476 ++#endif
1477 ++ /*
1478 ++ * The dynamically-assigned ASIDs that get passed in are small
1479 ++ * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
1480 ++ * so do not bother to clear it.
1481 ++ *
1482 + * If PCID is on, ASID-aware code paths put the ASID+1 into the
1483 + * PCID bits. This serves two purposes. It prevents a nasty
1484 + * situation in which PCID-unaware code saves CR3, loads some other
1485 +@@ -53,6 +105,18 @@ static inline u16 kern_pcid(u16 asid)
1486 + return asid + 1;
1487 + }
1488 +
1489 ++/*
1490 ++ * Given @asid, compute uPCID
1491 ++ */
1492 ++static inline u16 user_pcid(u16 asid)
1493 ++{
1494 ++ u16 ret = kern_pcid(asid);
1495 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1496 ++ ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
1497 ++#endif
1498 ++ return ret;
1499 ++}
1500 ++
1501 + struct pgd_t;
1502 + static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
1503 + {
1504 +@@ -95,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
1505 + return !static_cpu_has(X86_FEATURE_PCID);
1506 + }
1507 +
1508 +-/*
1509 +- * 6 because 6 should be plenty and struct tlb_state will fit in
1510 +- * two cache lines.
1511 +- */
1512 +-#define TLB_NR_DYN_ASIDS 6
1513 +-
1514 + struct tlb_context {
1515 + u64 ctx_id;
1516 + u64 tlb_gen;
1517 +@@ -134,6 +192,24 @@ struct tlb_state {
1518 + */
1519 + bool is_lazy;
1520 +
1521 ++ /*
1522 ++ * If set we changed the page tables in such a way that we
1523 ++ * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
1524 ++ * This tells us to go invalidate all the non-loaded ctxs[]
1525 ++ * on the next context switch.
1526 ++ *
1527 ++ * The current ctx was kept up-to-date as it ran and does not
1528 ++ * need to be invalidated.
1529 ++ */
1530 ++ bool invalidate_other;
1531 ++
1532 ++ /*
1533 ++ * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
1534 ++ * the corresponding user PCID needs a flush next time we
1535 ++ * switch to it; see SWITCH_TO_USER_CR3.
1536 ++ */
1537 ++ unsigned short user_pcid_flush_mask;
1538 ++
1539 + /*
1540 + * Access to this CR4 shadow and to H/W CR4 is protected by
1541 + * disabling interrupts when modifying either one.
1542 +@@ -211,6 +287,14 @@ static inline unsigned long cr4_read_shadow(void)
1543 + return this_cpu_read(cpu_tlbstate.cr4);
1544 + }
1545 +
1546 ++/*
1547 ++ * Mark all other ASIDs as invalid, preserves the current.
1548 ++ */
1549 ++static inline void invalidate_other_asid(void)
1550 ++{
1551 ++ this_cpu_write(cpu_tlbstate.invalidate_other, true);
1552 ++}
1553 ++
1554 + /*
1555 + * Save some of cr4 feature set we're using (e.g. Pentium 4MB
1556 + * enable and PPro Global page enable), so that any CPU's that boot
1557 +@@ -230,19 +314,48 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
1558 +
1559 + extern void initialize_tlbstate_and_flush(void);
1560 +
1561 ++/*
1562 ++ * Given an ASID, flush the corresponding user ASID. We can delay this
1563 ++ * until the next time we switch to it.
1564 ++ *
1565 ++ * See SWITCH_TO_USER_CR3.
1566 ++ */
1567 ++static inline void invalidate_user_asid(u16 asid)
1568 ++{
1569 ++ /* There is no user ASID if address space separation is off */
1570 ++ if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
1571 ++ return;
1572 ++
1573 ++ /*
1574 ++ * We only have a single ASID if PCID is off and the CR3
1575 ++ * write will have flushed it.
1576 ++ */
1577 ++ if (!cpu_feature_enabled(X86_FEATURE_PCID))
1578 ++ return;
1579 ++
1580 ++ if (!static_cpu_has(X86_FEATURE_PTI))
1581 ++ return;
1582 ++
1583 ++ __set_bit(kern_pcid(asid),
1584 ++ (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
1585 ++}
1586 ++
1587 + /*
1588 + * flush the entire current user mapping
1589 + */
1590 + static inline void __native_flush_tlb(void)
1591 + {
1592 + /*
1593 +- * If current->mm == NULL then we borrow a mm which may change during a
1594 +- * task switch and therefore we must not be preempted while we write CR3
1595 +- * back:
1596 ++ * Preemption or interrupts must be disabled to protect the access
1597 ++ * to the per CPU variable and to prevent being preempted between
1598 ++ * read_cr3() and write_cr3().
1599 + */
1600 +- preempt_disable();
1601 ++ WARN_ON_ONCE(preemptible());
1602 ++
1603 ++ invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
1604 ++
1605 ++ /* If current->mm == NULL then the read_cr3() "borrows" an mm */
1606 + native_write_cr3(__native_read_cr3());
1607 +- preempt_enable();
1608 + }
1609 +
1610 + /*
1611 +@@ -256,6 +369,8 @@ static inline void __native_flush_tlb_global(void)
1612 + /*
1613 + * Using INVPCID is considerably faster than a pair of writes
1614 + * to CR4 sandwiched inside an IRQ flag save/restore.
1615 ++ *
1616 ++ * Note, this works with CR4.PCIDE=0 or 1.
1617 + */
1618 + invpcid_flush_all();
1619 + return;
1620 +@@ -282,7 +397,21 @@ static inline void __native_flush_tlb_global(void)
1621 + */
1622 + static inline void __native_flush_tlb_single(unsigned long addr)
1623 + {
1624 ++ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
1625 ++
1626 + asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
1627 ++
1628 ++ if (!static_cpu_has(X86_FEATURE_PTI))
1629 ++ return;
1630 ++
1631 ++ /*
1632 ++ * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
1633 ++ * Just use invalidate_user_asid() in case we are called early.
1634 ++ */
1635 ++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
1636 ++ invalidate_user_asid(loaded_mm_asid);
1637 ++ else
1638 ++ invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
1639 + }
1640 +
1641 + /*
1642 +@@ -298,14 +427,6 @@ static inline void __flush_tlb_all(void)
1643 + */
1644 + __flush_tlb();
1645 + }
1646 +-
1647 +- /*
1648 +- * Note: if we somehow had PCID but not PGE, then this wouldn't work --
1649 +- * we'd end up flushing kernel translations for the current ASID but
1650 +- * we might fail to flush kernel translations for other cached ASIDs.
1651 +- *
1652 +- * To avoid this issue, we force PCID off if PGE is off.
1653 +- */
1654 + }
1655 +
1656 + /*
1657 +@@ -315,6 +436,16 @@ static inline void __flush_tlb_one(unsigned long addr)
1658 + {
1659 + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
1660 + __flush_tlb_single(addr);
1661 ++
1662 ++ if (!static_cpu_has(X86_FEATURE_PTI))
1663 ++ return;
1664 ++
1665 ++ /*
1666 ++ * __flush_tlb_single() will have cleared the TLB entry for this ASID,
1667 ++	 * but since kernel space is replicated across all ASIDs, we must also
1668 ++ * invalidate all others.
1669 ++ */
1670 ++ invalidate_other_asid();
1671 + }
1672 +
1673 + #define TLB_FLUSH_ALL -1UL
1674 +@@ -375,6 +506,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
1675 + void native_flush_tlb_others(const struct cpumask *cpumask,
1676 + const struct flush_tlb_info *info);
1677 +
1678 ++static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
1679 ++{
1680 ++ /*
1681 ++ * Bump the generation count. This also serves as a full barrier
1682 ++ * that synchronizes with switch_mm(): callers are required to order
1683 ++ * their read of mm_cpumask after their writes to the paging
1684 ++ * structures.
1685 ++ */
1686 ++ return atomic64_inc_return(&mm->context.tlb_gen);
1687 ++}
1688 ++
1689 + static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
1690 + struct mm_struct *mm)
1691 + {
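
The user_pcid_flush_mask / invalidate_user_asid() additions above implement a deferred flush: when a user PCID becomes stale, only a bit is set, and the real flush happens the next time that ASID is switched to. Below is a minimal userspace sketch of that bookkeeping pattern; it is illustrative only, and deferred_flush_mask, mark_asid_stale() and switch_to_asid() are invented names, not kernel APIs.

#include <stdio.h>
#include <stdbool.h>

#define NR_ASIDS 6			/* mirrors TLB_NR_DYN_ASIDS for the sketch */

static unsigned short deferred_flush_mask;	/* one bit per ASID */

/* Record that an ASID's cached translations are stale; flush later. */
static void mark_asid_stale(unsigned int asid)
{
	deferred_flush_mask |= 1u << asid;
}

/* On "switch", consume the pending bit and flush only if it was set. */
static void switch_to_asid(unsigned int asid)
{
	bool need_flush = deferred_flush_mask & (1u << asid);

	deferred_flush_mask &= ~(1u << asid);
	printf("switch to ASID %u: %s\n", asid,
	       need_flush ? "flush" : "no flush needed");
}

int main(void)
{
	mark_asid_stale(2);
	switch_to_asid(1);	/* no flush needed */
	switch_to_asid(2);	/* flush */
	switch_to_asid(2);	/* no flush needed, bit already consumed */
	return 0;
}
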
1692 +diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
1693 +index d9a7c659009c..b986b2ca688a 100644
1694 +--- a/arch/x86/include/asm/vsyscall.h
1695 ++++ b/arch/x86/include/asm/vsyscall.h
1696 +@@ -7,6 +7,7 @@
1697 +
1698 + #ifdef CONFIG_X86_VSYSCALL_EMULATION
1699 + extern void map_vsyscall(void);
1700 ++extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
1701 +
1702 + /*
1703 + * Called on instruction fetch fault in vsyscall page.
1704 +diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
1705 +index 53b4ca55ebb6..97abdaab9535 100644
1706 +--- a/arch/x86/include/uapi/asm/processor-flags.h
1707 ++++ b/arch/x86/include/uapi/asm/processor-flags.h
1708 +@@ -78,7 +78,12 @@
1709 + #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
1710 + #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
1711 + #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
1712 +-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
1713 ++
1714 ++#define X86_CR3_PCID_BITS 12
1715 ++#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
1716 ++
1717 ++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
1718 ++#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
1719 +
1720 + /*
1721 + * Intel CPU features in CR4
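
The new X86_CR3_PCID_BITS/X86_CR3_PCID_MASK and X86_CR3_PCID_NOFLUSH definitions split a CR3 value into a page-table base, a 12-bit PCID in bits 0-11, and a "preserve old PCID, don't flush" hint in bit 63. The following standalone sketch only shows that bit layout; build_cr3_demo() is made up for the example and ignores the kernel's kern_pcid()/user_pcid() adjustments.

#include <stdio.h>
#include <stdint.h>

#define CR3_PCID_BITS	12
#define CR3_PCID_MASK	((1ULL << CR3_PCID_BITS) - 1)
#define CR3_NOFLUSH	(1ULL << 63)

/* Illustrative only: pack a page-table base, a PCID and the no-flush hint. */
static uint64_t build_cr3_demo(uint64_t pgd_phys, uint16_t pcid, int noflush)
{
	uint64_t cr3 = (pgd_phys & ~CR3_PCID_MASK) | (pcid & CR3_PCID_MASK);

	if (noflush)
		cr3 |= CR3_NOFLUSH;
	return cr3;
}

int main(void)
{
	uint64_t cr3 = build_cr3_demo(0x12345000ULL, 3, 1);

	printf("cr3 = %#llx (pcid=%llu, noflush=%d)\n",
	       (unsigned long long)cr3,
	       (unsigned long long)(cr3 & CR3_PCID_MASK),
	       !!(cr3 & CR3_NOFLUSH));
	return 0;
}
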
1722 +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
1723 +index 676b7cf4b62b..76417a9aab73 100644
1724 +--- a/arch/x86/kernel/asm-offsets.c
1725 ++++ b/arch/x86/kernel/asm-offsets.c
1726 +@@ -17,6 +17,7 @@
1727 + #include <asm/sigframe.h>
1728 + #include <asm/bootparam.h>
1729 + #include <asm/suspend.h>
1730 ++#include <asm/tlbflush.h>
1731 +
1732 + #ifdef CONFIG_XEN
1733 + #include <xen/interface/xen.h>
1734 +@@ -94,6 +95,9 @@ void common(void) {
1735 + BLANK();
1736 + DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
1737 +
1738 ++ /* TLB state for the entry code */
1739 ++ OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
1740 ++
1741 + /* Layout info for cpu_entry_area */
1742 + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
1743 + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
1744 +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1745 +index 8ddcfa4d4165..f2a94dfb434e 100644
1746 +--- a/arch/x86/kernel/cpu/common.c
1747 ++++ b/arch/x86/kernel/cpu/common.c
1748 +@@ -898,6 +898,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
1749 + }
1750 +
1751 + setup_force_cpu_cap(X86_FEATURE_ALWAYS);
1752 ++
1753 ++ /* Assume for now that ALL x86 CPUs are insecure */
1754 ++ setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
1755 ++
1756 + fpu__init_system(c);
1757 +
1758 + #ifdef CONFIG_X86_32
1759 +@@ -1335,7 +1339,10 @@ void syscall_init(void)
1760 + (entry_SYSCALL_64_trampoline - _entry_trampoline);
1761 +
1762 + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
1763 +- wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
1764 ++ if (static_cpu_has(X86_FEATURE_PTI))
1765 ++ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
1766 ++ else
1767 ++ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
1768 +
1769 + #ifdef CONFIG_IA32_EMULATION
1770 + wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
1771 +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
1772 +index 36b17e0febe8..5fa110699ed2 100644
1773 +--- a/arch/x86/kernel/dumpstack.c
1774 ++++ b/arch/x86/kernel/dumpstack.c
1775 +@@ -297,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
1776 + unsigned long sp;
1777 + #endif
1778 + printk(KERN_DEFAULT
1779 +- "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
1780 ++ "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
1781 + IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
1782 + IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
1783 + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
1784 +- IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "");
1785 ++ IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
1786 ++ IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
1787 ++ (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
1788 +
1789 + if (notify_die(DIE_OOPS, str, regs, err,
1790 + current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
1791 +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
1792 +index 7dca675fe78d..04a625f0fcda 100644
1793 +--- a/arch/x86/kernel/head_64.S
1794 ++++ b/arch/x86/kernel/head_64.S
1795 +@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
1796 + .balign PAGE_SIZE; \
1797 + GLOBAL(name)
1798 +
1799 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1800 ++/*
1801 ++ * Each PGD needs to be 8k long and 8k aligned. We do not
1802 ++ * ever go out to userspace with these, so we do not
1803 ++ * strictly *need* the second page, but this allows us to
1804 ++ * have a single set_pgd() implementation that does not
1805 ++ * need to worry about whether it has 4k or 8k to work
1806 ++ * with.
1807 ++ *
1808 ++ * This ensures PGDs are 8k long:
1809 ++ */
1810 ++#define PTI_USER_PGD_FILL 512
1811 ++/* This ensures they are 8k-aligned: */
1812 ++#define NEXT_PGD_PAGE(name) \
1813 ++ .balign 2 * PAGE_SIZE; \
1814 ++GLOBAL(name)
1815 ++#else
1816 ++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
1817 ++#define PTI_USER_PGD_FILL 0
1818 ++#endif
1819 ++
1820 + /* Automate the creation of 1 to 1 mapping pmd entries */
1821 + #define PMDS(START, PERM, COUNT) \
1822 + i = 0 ; \
1823 +@@ -350,13 +371,14 @@ GLOBAL(name)
1824 + .endr
1825 +
1826 + __INITDATA
1827 +-NEXT_PAGE(early_top_pgt)
1828 ++NEXT_PGD_PAGE(early_top_pgt)
1829 + .fill 511,8,0
1830 + #ifdef CONFIG_X86_5LEVEL
1831 + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
1832 + #else
1833 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
1834 + #endif
1835 ++ .fill PTI_USER_PGD_FILL,8,0
1836 +
1837 + NEXT_PAGE(early_dynamic_pgts)
1838 + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
1839 +@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
1840 + .data
1841 +
1842 + #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
1843 +-NEXT_PAGE(init_top_pgt)
1844 ++NEXT_PGD_PAGE(init_top_pgt)
1845 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
1846 + .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
1847 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
1848 + .org init_top_pgt + PGD_START_KERNEL*8, 0
1849 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
1850 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
1851 ++ .fill PTI_USER_PGD_FILL,8,0
1852 +
1853 + NEXT_PAGE(level3_ident_pgt)
1854 + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
1855 +@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
1856 + */
1857 + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
1858 + #else
1859 +-NEXT_PAGE(init_top_pgt)
1860 ++NEXT_PGD_PAGE(init_top_pgt)
1861 + .fill 512,8,0
1862 ++ .fill PTI_USER_PGD_FILL,8,0
1863 + #endif
1864 +
1865 + #ifdef CONFIG_X86_5LEVEL
1866 +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
1867 +index a6b5d62f45a7..26d713ecad34 100644
1868 +--- a/arch/x86/kernel/ldt.c
1869 ++++ b/arch/x86/kernel/ldt.c
1870 +@@ -24,6 +24,7 @@
1871 + #include <linux/uaccess.h>
1872 +
1873 + #include <asm/ldt.h>
1874 ++#include <asm/tlb.h>
1875 + #include <asm/desc.h>
1876 + #include <asm/mmu_context.h>
1877 + #include <asm/syscalls.h>
1878 +@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
1879 + static void flush_ldt(void *__mm)
1880 + {
1881 + struct mm_struct *mm = __mm;
1882 +- mm_context_t *pc;
1883 +
1884 + if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
1885 + return;
1886 +
1887 +- pc = &mm->context;
1888 +- set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
1889 ++ load_mm_ldt(mm);
1890 +
1891 + refresh_ldt_segments();
1892 + }
1893 +@@ -94,10 +93,126 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
1894 + return NULL;
1895 + }
1896 +
1897 ++ /* The new LDT isn't aliased for PTI yet. */
1898 ++ new_ldt->slot = -1;
1899 ++
1900 + new_ldt->nr_entries = num_entries;
1901 + return new_ldt;
1902 + }
1903 +
1904 ++/*
1905 ++ * If PTI is enabled, this maps the LDT into the kernelmode and
1906 ++ * usermode tables for the given mm.
1907 ++ *
1908 ++ * There is no corresponding unmap function. Even if the LDT is freed, we
1909 ++ * leave the PTEs around until the slot is reused or the mm is destroyed.
1910 ++ * This is harmless: the LDT is always in ordinary memory, and no one will
1911 ++ * access the freed slot.
1912 ++ *
1913 ++ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
1914 ++ * it useful, and the flush would slow down modify_ldt().
1915 ++ */
1916 ++static int
1917 ++map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
1918 ++{
1919 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1920 ++ bool is_vmalloc, had_top_level_entry;
1921 ++ unsigned long va;
1922 ++ spinlock_t *ptl;
1923 ++ pgd_t *pgd;
1924 ++ int i;
1925 ++
1926 ++ if (!static_cpu_has(X86_FEATURE_PTI))
1927 ++ return 0;
1928 ++
1929 ++ /*
1930 ++ * Any given ldt_struct should have map_ldt_struct() called at most
1931 ++ * once.
1932 ++ */
1933 ++ WARN_ON(ldt->slot != -1);
1934 ++
1935 ++ /*
1936 ++ * Did we already have the top level entry allocated? We can't
1937 ++	 * use pgd_none() for this because it doesn't do anything on
1938 ++ * 4-level page table kernels.
1939 ++ */
1940 ++ pgd = pgd_offset(mm, LDT_BASE_ADDR);
1941 ++ had_top_level_entry = (pgd->pgd != 0);
1942 ++
1943 ++ is_vmalloc = is_vmalloc_addr(ldt->entries);
1944 ++
1945 ++ for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
1946 ++ unsigned long offset = i << PAGE_SHIFT;
1947 ++ const void *src = (char *)ldt->entries + offset;
1948 ++ unsigned long pfn;
1949 ++ pte_t pte, *ptep;
1950 ++
1951 ++ va = (unsigned long)ldt_slot_va(slot) + offset;
1952 ++ pfn = is_vmalloc ? vmalloc_to_pfn(src) :
1953 ++ page_to_pfn(virt_to_page(src));
1954 ++ /*
1955 ++ * Treat the PTI LDT range as a *userspace* range.
1956 ++ * get_locked_pte() will allocate all needed pagetables
1957 ++ * and account for them in this mm.
1958 ++ */
1959 ++ ptep = get_locked_pte(mm, va, &ptl);
1960 ++ if (!ptep)
1961 ++ return -ENOMEM;
1962 ++ /*
1963 ++	 * Map it RO so the easy-to-find address is not a primary
1964 ++ * target via some kernel interface which misses a
1965 ++ * permission check.
1966 ++ */
1967 ++ pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
1968 ++ set_pte_at(mm, va, ptep, pte);
1969 ++ pte_unmap_unlock(ptep, ptl);
1970 ++ }
1971 ++
1972 ++ if (mm->context.ldt) {
1973 ++ /*
1974 ++ * We already had an LDT. The top-level entry should already
1975 ++ * have been allocated and synchronized with the usermode
1976 ++ * tables.
1977 ++ */
1978 ++ WARN_ON(!had_top_level_entry);
1979 ++ if (static_cpu_has(X86_FEATURE_PTI))
1980 ++ WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
1981 ++ } else {
1982 ++ /*
1983 ++ * This is the first time we're mapping an LDT for this process.
1984 ++ * Sync the pgd to the usermode tables.
1985 ++ */
1986 ++ WARN_ON(had_top_level_entry);
1987 ++ if (static_cpu_has(X86_FEATURE_PTI)) {
1988 ++ WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
1989 ++ set_pgd(kernel_to_user_pgdp(pgd), *pgd);
1990 ++ }
1991 ++ }
1992 ++
1993 ++ va = (unsigned long)ldt_slot_va(slot);
1994 ++ flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
1995 ++
1996 ++ ldt->slot = slot;
1997 ++#endif
1998 ++ return 0;
1999 ++}
2000 ++
2001 ++static void free_ldt_pgtables(struct mm_struct *mm)
2002 ++{
2003 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2004 ++ struct mmu_gather tlb;
2005 ++ unsigned long start = LDT_BASE_ADDR;
2006 ++ unsigned long end = start + (1UL << PGDIR_SHIFT);
2007 ++
2008 ++ if (!static_cpu_has(X86_FEATURE_PTI))
2009 ++ return;
2010 ++
2011 ++ tlb_gather_mmu(&tlb, mm, start, end);
2012 ++ free_pgd_range(&tlb, start, end, start, end);
2013 ++ tlb_finish_mmu(&tlb, start, end);
2014 ++#endif
2015 ++}
2016 ++
2017 + /* After calling this, the LDT is immutable. */
2018 + static void finalize_ldt_struct(struct ldt_struct *ldt)
2019 + {
2020 +@@ -156,6 +271,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
2021 + new_ldt->nr_entries * LDT_ENTRY_SIZE);
2022 + finalize_ldt_struct(new_ldt);
2023 +
2024 ++ retval = map_ldt_struct(mm, new_ldt, 0);
2025 ++ if (retval) {
2026 ++ free_ldt_pgtables(mm);
2027 ++ free_ldt_struct(new_ldt);
2028 ++ goto out_unlock;
2029 ++ }
2030 + mm->context.ldt = new_ldt;
2031 +
2032 + out_unlock:
2033 +@@ -174,6 +295,11 @@ void destroy_context_ldt(struct mm_struct *mm)
2034 + mm->context.ldt = NULL;
2035 + }
2036 +
2037 ++void ldt_arch_exit_mmap(struct mm_struct *mm)
2038 ++{
2039 ++ free_ldt_pgtables(mm);
2040 ++}
2041 ++
2042 + static int read_ldt(void __user *ptr, unsigned long bytecount)
2043 + {
2044 + struct mm_struct *mm = current->mm;
2045 +@@ -287,6 +413,25 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
2046 + new_ldt->entries[ldt_info.entry_number] = ldt;
2047 + finalize_ldt_struct(new_ldt);
2048 +
2049 ++ /*
2050 ++ * If we are using PTI, map the new LDT into the userspace pagetables.
2051 ++ * If there is already an LDT, use the other slot so that other CPUs
2052 ++ * will continue to use the old LDT until install_ldt() switches
2053 ++ * them over to the new LDT.
2054 ++ */
2055 ++ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
2056 ++ if (error) {
2057 ++ /*
2058 ++	 * This can only fail for the first LDT setup. If an LDT is
2059 ++ * already installed then the PTE page is already
2060 ++ * populated. Mop up a half populated page table.
2061 ++ */
2062 ++ if (!WARN_ON_ONCE(old_ldt))
2063 ++ free_ldt_pgtables(mm);
2064 ++ free_ldt_struct(new_ldt);
2065 ++ goto out_unlock;
2066 ++ }
2067 ++
2068 + install_ldt(mm, new_ldt);
2069 + free_ldt_struct(old_ldt);
2070 + error = 0;
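
In write_ldt() above, the new LDT is mapped into whichever of the two PTI slots is not currently in use (old_ldt ? !old_ldt->slot : 0), so CPUs still running on the old LDT keep a valid alias until install_ldt() moves them over. A toy sketch of that slot ping-pong; next_slot() and the struct ldt here are invented for illustration.

#include <stdio.h>

struct ldt { int slot; };

/* First LDT goes to slot 0; every later one takes the slot not in use. */
static int next_slot(const struct ldt *old)
{
	return old ? !old->slot : 0;
}

int main(void)
{
	struct ldt a = { .slot = -1 }, b = { .slot = -1 }, c = { .slot = -1 };

	a.slot = next_slot(NULL);	/* 0 */
	b.slot = next_slot(&a);		/* 1 */
	c.slot = next_slot(&b);		/* 0 again */
	printf("slots: %d %d %d\n", a.slot, b.slot, c.slot);
	return 0;
}
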
2071 +diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
2072 +index 00bc751c861c..edfede768688 100644
2073 +--- a/arch/x86/kernel/machine_kexec_32.c
2074 ++++ b/arch/x86/kernel/machine_kexec_32.c
2075 +@@ -48,8 +48,6 @@ static void load_segments(void)
2076 + "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
2077 + "\tmovl %%eax,%%ds\n"
2078 + "\tmovl %%eax,%%es\n"
2079 +- "\tmovl %%eax,%%fs\n"
2080 +- "\tmovl %%eax,%%gs\n"
2081 + "\tmovl %%eax,%%ss\n"
2082 + : : : "eax", "memory");
2083 + #undef STR
2084 +@@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image)
2085 + * The gdt & idt are now invalid.
2086 + * If you want to load them you must set up your own idt & gdt.
2087 + */
2088 +- set_gdt(phys_to_virt(0), 0);
2089 + idt_invalidate(phys_to_virt(0));
2090 ++ set_gdt(phys_to_virt(0), 0);
2091 +
2092 + /* now call it */
2093 + image->start = relocate_kernel_ptr((unsigned long)image->head,
2094 +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
2095 +index 12bf07d44dfe..2651ca2112c4 100644
2096 +--- a/arch/x86/kernel/smpboot.c
2097 ++++ b/arch/x86/kernel/smpboot.c
2098 +@@ -128,25 +128,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
2099 + spin_lock_irqsave(&rtc_lock, flags);
2100 + CMOS_WRITE(0xa, 0xf);
2101 + spin_unlock_irqrestore(&rtc_lock, flags);
2102 +- local_flush_tlb();
2103 +- pr_debug("1.\n");
2104 + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
2105 + start_eip >> 4;
2106 +- pr_debug("2.\n");
2107 + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
2108 + start_eip & 0xf;
2109 +- pr_debug("3.\n");
2110 + }
2111 +
2112 + static inline void smpboot_restore_warm_reset_vector(void)
2113 + {
2114 + unsigned long flags;
2115 +
2116 +- /*
2117 +- * Install writable page 0 entry to set BIOS data area.
2118 +- */
2119 +- local_flush_tlb();
2120 +-
2121 + /*
2122 + * Paranoid: Set warm reset code and vector here back
2123 + * to default values.
2124 +diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
2125 +index 9a9c9b076955..a5b802a12212 100644
2126 +--- a/arch/x86/kernel/tls.c
2127 ++++ b/arch/x86/kernel/tls.c
2128 +@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
2129 + cpu = get_cpu();
2130 +
2131 + while (n-- > 0) {
2132 +- if (LDT_empty(info) || LDT_zero(info)) {
2133 ++ if (LDT_empty(info) || LDT_zero(info))
2134 + memset(desc, 0, sizeof(*desc));
2135 +- } else {
2136 ++ else
2137 + fill_ldt(desc, info);
2138 +-
2139 +- /*
2140 +- * Always set the accessed bit so that the CPU
2141 +- * doesn't try to write to the (read-only) GDT.
2142 +- */
2143 +- desc->type |= 1;
2144 +- }
2145 + ++info;
2146 + ++desc;
2147 + }
2148 +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
2149 +index 7c16fe0b60c2..b33e860d32fe 100644
2150 +--- a/arch/x86/kernel/traps.c
2151 ++++ b/arch/x86/kernel/traps.c
2152 +@@ -361,7 +361,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
2153 + *
2154 + * No need for ist_enter here because we don't use RCU.
2155 + */
2156 +- if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
2157 ++ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
2158 + regs->cs == __KERNEL_CS &&
2159 + regs->ip == (unsigned long)native_irq_return_iret)
2160 + {
2161 +diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
2162 +index d2a8b5a24a44..1e413a9326aa 100644
2163 +--- a/arch/x86/kernel/vmlinux.lds.S
2164 ++++ b/arch/x86/kernel/vmlinux.lds.S
2165 +@@ -61,11 +61,17 @@ jiffies_64 = jiffies;
2166 + . = ALIGN(HPAGE_SIZE); \
2167 + __end_rodata_hpage_align = .;
2168 +
2169 ++#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
2170 ++#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
2171 ++
2172 + #else
2173 +
2174 + #define X64_ALIGN_RODATA_BEGIN
2175 + #define X64_ALIGN_RODATA_END
2176 +
2177 ++#define ALIGN_ENTRY_TEXT_BEGIN
2178 ++#define ALIGN_ENTRY_TEXT_END
2179 ++
2180 + #endif
2181 +
2182 + PHDRS {
2183 +@@ -102,8 +108,10 @@ SECTIONS
2184 + CPUIDLE_TEXT
2185 + LOCK_TEXT
2186 + KPROBES_TEXT
2187 ++ ALIGN_ENTRY_TEXT_BEGIN
2188 + ENTRY_TEXT
2189 + IRQENTRY_TEXT
2190 ++ ALIGN_ENTRY_TEXT_END
2191 + SOFTIRQENTRY_TEXT
2192 + *(.fixup)
2193 + *(.gnu.warning)
2194 +diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
2195 +index 2e0017af8f9b..52906808e277 100644
2196 +--- a/arch/x86/mm/Makefile
2197 ++++ b/arch/x86/mm/Makefile
2198 +@@ -43,9 +43,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
2199 + obj-$(CONFIG_ACPI_NUMA) += srat.o
2200 + obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
2201 +
2202 +-obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
2203 +-obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
2204 +-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
2205 ++obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
2206 ++obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
2207 ++obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
2208 ++obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
2209 +
2210 + obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
2211 + obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
2212 +diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
2213 +index fe814fd5e014..b9283cc27622 100644
2214 +--- a/arch/x86/mm/cpu_entry_area.c
2215 ++++ b/arch/x86/mm/cpu_entry_area.c
2216 +@@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
2217 + cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
2218 + }
2219 +
2220 ++static void percpu_setup_debug_store(int cpu)
2221 ++{
2222 ++#ifdef CONFIG_CPU_SUP_INTEL
2223 ++ int npages;
2224 ++ void *cea;
2225 ++
2226 ++ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
2227 ++ return;
2228 ++
2229 ++ cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
2230 ++ npages = sizeof(struct debug_store) / PAGE_SIZE;
2231 ++ BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
2232 ++ cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
2233 ++ PAGE_KERNEL);
2234 ++
2235 ++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
2236 ++ /*
2237 ++ * Force the population of PMDs for not yet allocated per cpu
2238 ++ * memory like debug store buffers.
2239 ++ */
2240 ++ npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
2241 ++ for (; npages; npages--, cea += PAGE_SIZE)
2242 ++ cea_set_pte(cea, 0, PAGE_NONE);
2243 ++#endif
2244 ++}
2245 ++
2246 + /* Setup the fixmap mappings only once per-processor */
2247 + static void __init setup_cpu_entry_area(int cpu)
2248 + {
2249 +@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu)
2250 + cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
2251 + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
2252 + #endif
2253 ++ percpu_setup_debug_store(cpu);
2254 + }
2255 +
2256 + static __init void setup_cpu_entry_area_ptes(void)
2257 +diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
2258 +index bfcffdf6c577..421f2664ffa0 100644
2259 +--- a/arch/x86/mm/debug_pagetables.c
2260 ++++ b/arch/x86/mm/debug_pagetables.c
2261 +@@ -5,7 +5,7 @@
2262 +
2263 + static int ptdump_show(struct seq_file *m, void *v)
2264 + {
2265 +- ptdump_walk_pgd_level(m, NULL);
2266 ++ ptdump_walk_pgd_level_debugfs(m, NULL, false);
2267 + return 0;
2268 + }
2269 +
2270 +@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
2271 + .release = single_release,
2272 + };
2273 +
2274 +-static struct dentry *pe;
2275 ++static int ptdump_show_curknl(struct seq_file *m, void *v)
2276 ++{
2277 ++ if (current->mm->pgd) {
2278 ++ down_read(&current->mm->mmap_sem);
2279 ++ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
2280 ++ up_read(&current->mm->mmap_sem);
2281 ++ }
2282 ++ return 0;
2283 ++}
2284 ++
2285 ++static int ptdump_open_curknl(struct inode *inode, struct file *filp)
2286 ++{
2287 ++ return single_open(filp, ptdump_show_curknl, NULL);
2288 ++}
2289 ++
2290 ++static const struct file_operations ptdump_curknl_fops = {
2291 ++ .owner = THIS_MODULE,
2292 ++ .open = ptdump_open_curknl,
2293 ++ .read = seq_read,
2294 ++ .llseek = seq_lseek,
2295 ++ .release = single_release,
2296 ++};
2297 ++
2298 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2299 ++static struct dentry *pe_curusr;
2300 ++
2301 ++static int ptdump_show_curusr(struct seq_file *m, void *v)
2302 ++{
2303 ++ if (current->mm->pgd) {
2304 ++ down_read(&current->mm->mmap_sem);
2305 ++ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
2306 ++ up_read(&current->mm->mmap_sem);
2307 ++ }
2308 ++ return 0;
2309 ++}
2310 ++
2311 ++static int ptdump_open_curusr(struct inode *inode, struct file *filp)
2312 ++{
2313 ++ return single_open(filp, ptdump_show_curusr, NULL);
2314 ++}
2315 ++
2316 ++static const struct file_operations ptdump_curusr_fops = {
2317 ++ .owner = THIS_MODULE,
2318 ++ .open = ptdump_open_curusr,
2319 ++ .read = seq_read,
2320 ++ .llseek = seq_lseek,
2321 ++ .release = single_release,
2322 ++};
2323 ++#endif
2324 ++
2325 ++static struct dentry *dir, *pe_knl, *pe_curknl;
2326 +
2327 + static int __init pt_dump_debug_init(void)
2328 + {
2329 +- pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
2330 +- &ptdump_fops);
2331 +- if (!pe)
2332 ++ dir = debugfs_create_dir("page_tables", NULL);
2333 ++ if (!dir)
2334 + return -ENOMEM;
2335 +
2336 ++ pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
2337 ++ &ptdump_fops);
2338 ++ if (!pe_knl)
2339 ++ goto err;
2340 ++
2341 ++ pe_curknl = debugfs_create_file("current_kernel", 0400,
2342 ++ dir, NULL, &ptdump_curknl_fops);
2343 ++ if (!pe_curknl)
2344 ++ goto err;
2345 ++
2346 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2347 ++ pe_curusr = debugfs_create_file("current_user", 0400,
2348 ++ dir, NULL, &ptdump_curusr_fops);
2349 ++ if (!pe_curusr)
2350 ++ goto err;
2351 ++#endif
2352 + return 0;
2353 ++err:
2354 ++ debugfs_remove_recursive(dir);
2355 ++ return -ENOMEM;
2356 + }
2357 +
2358 + static void __exit pt_dump_debug_exit(void)
2359 + {
2360 +- debugfs_remove_recursive(pe);
2361 ++ debugfs_remove_recursive(dir);
2362 + }
2363 +
2364 + module_init(pt_dump_debug_init);
2365 +diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
2366 +index 43dedbfb7257..f56902c1f04b 100644
2367 +--- a/arch/x86/mm/dump_pagetables.c
2368 ++++ b/arch/x86/mm/dump_pagetables.c
2369 +@@ -52,11 +52,17 @@ enum address_markers_idx {
2370 + USER_SPACE_NR = 0,
2371 + KERNEL_SPACE_NR,
2372 + LOW_KERNEL_NR,
2373 ++#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
2374 ++ LDT_NR,
2375 ++#endif
2376 + VMALLOC_START_NR,
2377 + VMEMMAP_START_NR,
2378 + #ifdef CONFIG_KASAN
2379 + KASAN_SHADOW_START_NR,
2380 + KASAN_SHADOW_END_NR,
2381 ++#endif
2382 ++#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
2383 ++ LDT_NR,
2384 + #endif
2385 + CPU_ENTRY_AREA_NR,
2386 + #ifdef CONFIG_X86_ESPFIX64
2387 +@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
2388 + #ifdef CONFIG_KASAN
2389 + [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
2390 + [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
2391 ++#endif
2392 ++#ifdef CONFIG_MODIFY_LDT_SYSCALL
2393 ++ [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
2394 + #endif
2395 + [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
2396 + #ifdef CONFIG_X86_ESPFIX64
2397 +@@ -467,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
2398 + }
2399 +
2400 + static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
2401 +- bool checkwx)
2402 ++ bool checkwx, bool dmesg)
2403 + {
2404 + #ifdef CONFIG_X86_64
2405 + pgd_t *start = (pgd_t *) &init_top_pgt;
2406 +@@ -480,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
2407 +
2408 + if (pgd) {
2409 + start = pgd;
2410 +- st.to_dmesg = true;
2411 ++ st.to_dmesg = dmesg;
2412 + }
2413 +
2414 + st.check_wx = checkwx;
2415 +@@ -518,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
2416 +
2417 + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
2418 + {
2419 +- ptdump_walk_pgd_level_core(m, pgd, false);
2420 ++ ptdump_walk_pgd_level_core(m, pgd, false, true);
2421 ++}
2422 ++
2423 ++void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
2424 ++{
2425 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2426 ++ if (user && static_cpu_has(X86_FEATURE_PTI))
2427 ++ pgd = kernel_to_user_pgdp(pgd);
2428 ++#endif
2429 ++ ptdump_walk_pgd_level_core(m, pgd, false, false);
2430 ++}
2431 ++EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
2432 ++
2433 ++static void ptdump_walk_user_pgd_level_checkwx(void)
2434 ++{
2435 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2436 ++ pgd_t *pgd = (pgd_t *) &init_top_pgt;
2437 ++
2438 ++ if (!static_cpu_has(X86_FEATURE_PTI))
2439 ++ return;
2440 ++
2441 ++ pr_info("x86/mm: Checking user space page tables\n");
2442 ++ pgd = kernel_to_user_pgdp(pgd);
2443 ++ ptdump_walk_pgd_level_core(NULL, pgd, true, false);
2444 ++#endif
2445 + }
2446 +-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
2447 +
2448 + void ptdump_walk_pgd_level_checkwx(void)
2449 + {
2450 +- ptdump_walk_pgd_level_core(NULL, NULL, true);
2451 ++ ptdump_walk_pgd_level_core(NULL, NULL, true, false);
2452 ++ ptdump_walk_user_pgd_level_checkwx();
2453 + }
2454 +
2455 + static int __init pt_dump_init(void)
2456 +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
2457 +index a22c2b95e513..80259ad8c386 100644
2458 +--- a/arch/x86/mm/init.c
2459 ++++ b/arch/x86/mm/init.c
2460 +@@ -20,6 +20,7 @@
2461 + #include <asm/kaslr.h>
2462 + #include <asm/hypervisor.h>
2463 + #include <asm/cpufeature.h>
2464 ++#include <asm/pti.h>
2465 +
2466 + /*
2467 + * We need to define the tracepoints somewhere, and tlb.c
2468 +@@ -161,6 +162,12 @@ struct map_range {
2469 +
2470 + static int page_size_mask;
2471 +
2472 ++static void enable_global_pages(void)
2473 ++{
2474 ++ if (!static_cpu_has(X86_FEATURE_PTI))
2475 ++ __supported_pte_mask |= _PAGE_GLOBAL;
2476 ++}
2477 ++
2478 + static void __init probe_page_size_mask(void)
2479 + {
2480 + /*
2481 +@@ -179,11 +186,11 @@ static void __init probe_page_size_mask(void)
2482 + cr4_set_bits_and_update_boot(X86_CR4_PSE);
2483 +
2484 + /* Enable PGE if available */
2485 ++ __supported_pte_mask &= ~_PAGE_GLOBAL;
2486 + if (boot_cpu_has(X86_FEATURE_PGE)) {
2487 + cr4_set_bits_and_update_boot(X86_CR4_PGE);
2488 +- __supported_pte_mask |= _PAGE_GLOBAL;
2489 +- } else
2490 +- __supported_pte_mask &= ~_PAGE_GLOBAL;
2491 ++ enable_global_pages();
2492 ++ }
2493 +
2494 + /* Enable 1 GB linear kernel mappings if available: */
2495 + if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
2496 +@@ -196,34 +203,44 @@ static void __init probe_page_size_mask(void)
2497 +
2498 + static void setup_pcid(void)
2499 + {
2500 +-#ifdef CONFIG_X86_64
2501 +- if (boot_cpu_has(X86_FEATURE_PCID)) {
2502 +- if (boot_cpu_has(X86_FEATURE_PGE)) {
2503 +- /*
2504 +- * This can't be cr4_set_bits_and_update_boot() --
2505 +- * the trampoline code can't handle CR4.PCIDE and
2506 +- * it wouldn't do any good anyway. Despite the name,
2507 +- * cr4_set_bits_and_update_boot() doesn't actually
2508 +- * cause the bits in question to remain set all the
2509 +- * way through the secondary boot asm.
2510 +- *
2511 +- * Instead, we brute-force it and set CR4.PCIDE
2512 +- * manually in start_secondary().
2513 +- */
2514 +- cr4_set_bits(X86_CR4_PCIDE);
2515 +- } else {
2516 +- /*
2517 +- * flush_tlb_all(), as currently implemented, won't
2518 +- * work if PCID is on but PGE is not. Since that
2519 +- * combination doesn't exist on real hardware, there's
2520 +- * no reason to try to fully support it, but it's
2521 +- * polite to avoid corrupting data if we're on
2522 +- * an improperly configured VM.
2523 +- */
2524 +- setup_clear_cpu_cap(X86_FEATURE_PCID);
2525 +- }
2526 ++ if (!IS_ENABLED(CONFIG_X86_64))
2527 ++ return;
2528 ++
2529 ++ if (!boot_cpu_has(X86_FEATURE_PCID))
2530 ++ return;
2531 ++
2532 ++ if (boot_cpu_has(X86_FEATURE_PGE)) {
2533 ++ /*
2534 ++ * This can't be cr4_set_bits_and_update_boot() -- the
2535 ++ * trampoline code can't handle CR4.PCIDE and it wouldn't
2536 ++ * do any good anyway. Despite the name,
2537 ++ * cr4_set_bits_and_update_boot() doesn't actually cause
2538 ++ * the bits in question to remain set all the way through
2539 ++ * the secondary boot asm.
2540 ++ *
2541 ++ * Instead, we brute-force it and set CR4.PCIDE manually in
2542 ++ * start_secondary().
2543 ++ */
2544 ++ cr4_set_bits(X86_CR4_PCIDE);
2545 ++
2546 ++ /*
2547 ++ * INVPCID's single-context modes (2/3) only work if we set
2548 ++	 * X86_CR4_PCIDE, *and* we have INVPCID support.  It's unusable
2549 ++ * on systems that have X86_CR4_PCIDE clear, or that have
2550 ++ * no INVPCID support at all.
2551 ++ */
2552 ++ if (boot_cpu_has(X86_FEATURE_INVPCID))
2553 ++ setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
2554 ++ } else {
2555 ++ /*
2556 ++ * flush_tlb_all(), as currently implemented, won't work if
2557 ++ * PCID is on but PGE is not. Since that combination
2558 ++ * doesn't exist on real hardware, there's no reason to try
2559 ++ * to fully support it, but it's polite to avoid corrupting
2560 ++ * data if we're on an improperly configured VM.
2561 ++ */
2562 ++ setup_clear_cpu_cap(X86_FEATURE_PCID);
2563 + }
2564 +-#endif
2565 + }
2566 +
2567 + #ifdef CONFIG_X86_32
2568 +@@ -624,6 +641,7 @@ void __init init_mem_mapping(void)
2569 + {
2570 + unsigned long end;
2571 +
2572 ++ pti_check_boottime_disable();
2573 + probe_page_size_mask();
2574 + setup_pcid();
2575 +
2576 +@@ -847,7 +865,7 @@ void __init zone_sizes_init(void)
2577 + free_area_init_nodes(max_zone_pfns);
2578 + }
2579 +
2580 +-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
2581 ++__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
2582 + .loaded_mm = &init_mm,
2583 + .next_asid = 1,
2584 + .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
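
probe_page_size_mask()/enable_global_pages() now grant _PAGE_GLOBAL only when PTI is off, because global kernel TLB entries would otherwise survive the user/kernel CR3 switch that PTI depends on. A minimal sketch of that gating, with flag names local to the example:

#include <stdio.h>
#include <stdbool.h>

#define DEMO_PAGE_GLOBAL	0x100	/* stand-in for _PAGE_GLOBAL */

/* Global pages are only safe when kernel mappings need not vanish on CR3 switch. */
static unsigned long pte_mask_for(bool cpu_has_pge, bool pti_enabled)
{
	unsigned long mask = 0;

	if (cpu_has_pge && !pti_enabled)
		mask |= DEMO_PAGE_GLOBAL;
	return mask;
}

int main(void)
{
	printf("PGE, no PTI : %#lx\n", pte_mask_for(true, false));
	printf("PGE, PTI    : %#lx\n", pte_mask_for(true, true));
	printf("no PGE      : %#lx\n", pte_mask_for(false, false));
	return 0;
}
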
2585 +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
2586 +index 17ebc5a978cc..9b7bcbd33cc2 100644
2587 +--- a/arch/x86/mm/pgtable.c
2588 ++++ b/arch/x86/mm/pgtable.c
2589 +@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
2590 + kmem_cache_free(pgd_cache, pgd);
2591 + }
2592 + #else
2593 ++
2594 + static inline pgd_t *_pgd_alloc(void)
2595 + {
2596 +- return (pgd_t *)__get_free_page(PGALLOC_GFP);
2597 ++ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
2598 + }
2599 +
2600 + static inline void _pgd_free(pgd_t *pgd)
2601 + {
2602 +- free_page((unsigned long)pgd);
2603 ++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
2604 + }
2605 + #endif /* CONFIG_X86_PAE */
2606 +
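
The _pgd_alloc()/_pgd_free() change reflects that, with PTI, the top-level page table becomes an order-1 (8 KiB, 8 KiB-aligned) allocation: the first 4 KiB page holds the kernel-mode PGD and the second page the user-mode copy. A hedged userspace sketch of that layout; kernel_to_user_pgdp_demo() is an invented helper that simply steps one page (512 entries) forward, which is exactly what the 8 KiB alignment makes possible.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define PAGE_SIZE	4096
#define PTRS_PER_PGD	512		/* 512 * 8 bytes = one 4 KiB page */

typedef uint64_t pgd_entry;

/* The user-mode copy lives one page (512 entries) above the kernel PGD. */
static pgd_entry *kernel_to_user_pgdp_demo(pgd_entry *kernel_pgd)
{
	return kernel_pgd + PTRS_PER_PGD;
}

int main(void)
{
	/* Order-1 allocation: 8 KiB, aligned to 8 KiB. */
	pgd_entry *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);

	if (!pgd)
		return 1;
	for (int i = 0; i < 2 * PTRS_PER_PGD; i++)
		pgd[i] = 0;

	printf("kernel pgd at %p, user pgd at %p (+%zu bytes)\n",
	       (void *)pgd, (void *)kernel_to_user_pgdp_demo(pgd),
	       (size_t)PAGE_SIZE);
	free(pgd);
	return 0;
}
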
2607 +diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
2608 +new file mode 100644
2609 +index 000000000000..bce8aea65606
2610 +--- /dev/null
2611 ++++ b/arch/x86/mm/pti.c
2612 +@@ -0,0 +1,387 @@
2613 ++/*
2614 ++ * Copyright(c) 2017 Intel Corporation. All rights reserved.
2615 ++ *
2616 ++ * This program is free software; you can redistribute it and/or modify
2617 ++ * it under the terms of version 2 of the GNU General Public License as
2618 ++ * published by the Free Software Foundation.
2619 ++ *
2620 ++ * This program is distributed in the hope that it will be useful, but
2621 ++ * WITHOUT ANY WARRANTY; without even the implied warranty of
2622 ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2623 ++ * General Public License for more details.
2624 ++ *
2625 ++ * This code is based in part on work published here:
2626 ++ *
2627 ++ * https://github.com/IAIK/KAISER
2628 ++ *
2629 ++ * The original work was written by, and signed off for the Linux
2630 ++ * kernel by:
2631 ++ *
2632 ++ * Signed-off-by: Richard Fellner <richard.fellner@××××××××××××××.at>
2633 ++ * Signed-off-by: Moritz Lipp <moritz.lipp@×××××××××××.at>
2634 ++ * Signed-off-by: Daniel Gruss <daniel.gruss@×××××××××××.at>
2635 ++ * Signed-off-by: Michael Schwarz <michael.schwarz@×××××××××××.at>
2636 ++ *
2637 ++ * Major changes to the original code by: Dave Hansen <dave.hansen@×××××.com>
2638 ++ * Mostly rewritten by Thomas Gleixner <tglx@××××××××××.de> and
2639 ++ * Andy Lutomirsky <luto@××××××××××.net>
2640 ++ */
2641 ++#include <linux/kernel.h>
2642 ++#include <linux/errno.h>
2643 ++#include <linux/string.h>
2644 ++#include <linux/types.h>
2645 ++#include <linux/bug.h>
2646 ++#include <linux/init.h>
2647 ++#include <linux/spinlock.h>
2648 ++#include <linux/mm.h>
2649 ++#include <linux/uaccess.h>
2650 ++
2651 ++#include <asm/cpufeature.h>
2652 ++#include <asm/hypervisor.h>
2653 ++#include <asm/vsyscall.h>
2654 ++#include <asm/cmdline.h>
2655 ++#include <asm/pti.h>
2656 ++#include <asm/pgtable.h>
2657 ++#include <asm/pgalloc.h>
2658 ++#include <asm/tlbflush.h>
2659 ++#include <asm/desc.h>
2660 ++
2661 ++#undef pr_fmt
2662 ++#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
2663 ++
2664 ++/* Backporting helper */
2665 ++#ifndef __GFP_NOTRACK
2666 ++#define __GFP_NOTRACK 0
2667 ++#endif
2668 ++
2669 ++static void __init pti_print_if_insecure(const char *reason)
2670 ++{
2671 ++ if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
2672 ++ pr_info("%s\n", reason);
2673 ++}
2674 ++
2675 ++static void __init pti_print_if_secure(const char *reason)
2676 ++{
2677 ++ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
2678 ++ pr_info("%s\n", reason);
2679 ++}
2680 ++
2681 ++void __init pti_check_boottime_disable(void)
2682 ++{
2683 ++ char arg[5];
2684 ++ int ret;
2685 ++
2686 ++ if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
2687 ++ pti_print_if_insecure("disabled on XEN PV.");
2688 ++ return;
2689 ++ }
2690 ++
2691 ++ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
2692 ++ if (ret > 0) {
2693 ++ if (ret == 3 && !strncmp(arg, "off", 3)) {
2694 ++ pti_print_if_insecure("disabled on command line.");
2695 ++ return;
2696 ++ }
2697 ++ if (ret == 2 && !strncmp(arg, "on", 2)) {
2698 ++ pti_print_if_secure("force enabled on command line.");
2699 ++ goto enable;
2700 ++ }
2701 ++ if (ret == 4 && !strncmp(arg, "auto", 4))
2702 ++ goto autosel;
2703 ++ }
2704 ++
2705 ++ if (cmdline_find_option_bool(boot_command_line, "nopti")) {
2706 ++ pti_print_if_insecure("disabled on command line.");
2707 ++ return;
2708 ++ }
2709 ++
2710 ++autosel:
2711 ++ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
2712 ++ return;
2713 ++enable:
2714 ++ setup_force_cpu_cap(X86_FEATURE_PTI);
2715 ++}
2716 ++
2717 ++pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
2718 ++{
2719 ++ /*
2720 ++ * Changes to the high (kernel) portion of the kernelmode page
2721 ++ * tables are not automatically propagated to the usermode tables.
2722 ++ *
2723 ++ * Users should keep in mind that, unlike the kernelmode tables,
2724 ++ * there is no vmalloc_fault equivalent for the usermode tables.
2725 ++ * Top-level entries added to init_mm's usermode pgd after boot
2726 ++ * will not be automatically propagated to other mms.
2727 ++ */
2728 ++ if (!pgdp_maps_userspace(pgdp))
2729 ++ return pgd;
2730 ++
2731 ++ /*
2732 ++ * The user page tables get the full PGD, accessible from
2733 ++ * userspace:
2734 ++ */
2735 ++ kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
2736 ++
2737 ++ /*
2738 ++ * If this is normal user memory, make it NX in the kernel
2739 ++ * pagetables so that, if we somehow screw up and return to
2740 ++ * usermode with the kernel CR3 loaded, we'll get a page fault
2741 ++ * instead of allowing user code to execute with the wrong CR3.
2742 ++ *
2743 ++ * As exceptions, we don't set NX if:
2744 ++ * - _PAGE_USER is not set. This could be an executable
2745 ++ * EFI runtime mapping or something similar, and the kernel
2746 ++ * may execute from it
2747 ++ * - we don't have NX support
2748 ++ * - we're clearing the PGD (i.e. the new pgd is not present).
2749 ++ */
2750 ++ if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
2751 ++ (__supported_pte_mask & _PAGE_NX))
2752 ++ pgd.pgd |= _PAGE_NX;
2753 ++
2754 ++ /* return the copy of the PGD we want the kernel to use: */
2755 ++ return pgd;
2756 ++}
2757 ++
2758 ++/*
2759 ++ * Walk the user copy of the page tables (optionally) trying to allocate
2760 ++ * page table pages on the way down.
2761 ++ *
2762 ++ * Returns a pointer to a P4D on success, or NULL on failure.
2763 ++ */
2764 ++static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
2765 ++{
2766 ++ pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
2767 ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
2768 ++
2769 ++ if (address < PAGE_OFFSET) {
2770 ++ WARN_ONCE(1, "attempt to walk user address\n");
2771 ++ return NULL;
2772 ++ }
2773 ++
2774 ++ if (pgd_none(*pgd)) {
2775 ++ unsigned long new_p4d_page = __get_free_page(gfp);
2776 ++ if (!new_p4d_page)
2777 ++ return NULL;
2778 ++
2779 ++ if (pgd_none(*pgd)) {
2780 ++ set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
2781 ++ new_p4d_page = 0;
2782 ++ }
2783 ++ if (new_p4d_page)
2784 ++ free_page(new_p4d_page);
2785 ++ }
2786 ++ BUILD_BUG_ON(pgd_large(*pgd) != 0);
2787 ++
2788 ++ return p4d_offset(pgd, address);
2789 ++}
2790 ++
2791 ++/*
2792 ++ * Walk the user copy of the page tables (optionally) trying to allocate
2793 ++ * page table pages on the way down.
2794 ++ *
2795 ++ * Returns a pointer to a PMD on success, or NULL on failure.
2796 ++ */
2797 ++static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
2798 ++{
2799 ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
2800 ++ p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
2801 ++ pud_t *pud;
2802 ++
2803 ++ BUILD_BUG_ON(p4d_large(*p4d) != 0);
2804 ++ if (p4d_none(*p4d)) {
2805 ++ unsigned long new_pud_page = __get_free_page(gfp);
2806 ++ if (!new_pud_page)
2807 ++ return NULL;
2808 ++
2809 ++ if (p4d_none(*p4d)) {
2810 ++ set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
2811 ++ new_pud_page = 0;
2812 ++ }
2813 ++ if (new_pud_page)
2814 ++ free_page(new_pud_page);
2815 ++ }
2816 ++
2817 ++ pud = pud_offset(p4d, address);
2818 ++ /* The user page tables do not use large mappings: */
2819 ++ if (pud_large(*pud)) {
2820 ++ WARN_ON(1);
2821 ++ return NULL;
2822 ++ }
2823 ++ if (pud_none(*pud)) {
2824 ++ unsigned long new_pmd_page = __get_free_page(gfp);
2825 ++ if (!new_pmd_page)
2826 ++ return NULL;
2827 ++
2828 ++ if (pud_none(*pud)) {
2829 ++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
2830 ++ new_pmd_page = 0;
2831 ++ }
2832 ++ if (new_pmd_page)
2833 ++ free_page(new_pmd_page);
2834 ++ }
2835 ++
2836 ++ return pmd_offset(pud, address);
2837 ++}
2838 ++
2839 ++#ifdef CONFIG_X86_VSYSCALL_EMULATION
2840 ++/*
2841 ++ * Walk the shadow copy of the page tables (optionally) trying to allocate
2842 ++ * page table pages on the way down. Does not support large pages.
2843 ++ *
2844 ++ * Note: this is only used when mapping *new* kernel data into the
2845 ++ * user/shadow page tables. It is never used for userspace data.
2846 ++ *
2847 ++ * Returns a pointer to a PTE on success, or NULL on failure.
2848 ++ */
2849 ++static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
2850 ++{
2851 ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
2852 ++ pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
2853 ++ pte_t *pte;
2854 ++
2855 ++ /* We can't do anything sensible if we hit a large mapping. */
2856 ++ if (pmd_large(*pmd)) {
2857 ++ WARN_ON(1);
2858 ++ return NULL;
2859 ++ }
2860 ++
2861 ++ if (pmd_none(*pmd)) {
2862 ++ unsigned long new_pte_page = __get_free_page(gfp);
2863 ++ if (!new_pte_page)
2864 ++ return NULL;
2865 ++
2866 ++ if (pmd_none(*pmd)) {
2867 ++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
2868 ++ new_pte_page = 0;
2869 ++ }
2870 ++ if (new_pte_page)
2871 ++ free_page(new_pte_page);
2872 ++ }
2873 ++
2874 ++ pte = pte_offset_kernel(pmd, address);
2875 ++ if (pte_flags(*pte) & _PAGE_USER) {
2876 ++ WARN_ONCE(1, "attempt to walk to user pte\n");
2877 ++ return NULL;
2878 ++ }
2879 ++ return pte;
2880 ++}
2881 ++
2882 ++static void __init pti_setup_vsyscall(void)
2883 ++{
2884 ++ pte_t *pte, *target_pte;
2885 ++ unsigned int level;
2886 ++
2887 ++ pte = lookup_address(VSYSCALL_ADDR, &level);
2888 ++ if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
2889 ++ return;
2890 ++
2891 ++ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
2892 ++ if (WARN_ON(!target_pte))
2893 ++ return;
2894 ++
2895 ++ *target_pte = *pte;
2896 ++ set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
2897 ++}
2898 ++#else
2899 ++static void __init pti_setup_vsyscall(void) { }
2900 ++#endif
2901 ++
2902 ++static void __init
2903 ++pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
2904 ++{
2905 ++ unsigned long addr;
2906 ++
2907 ++ /*
2908 ++ * Clone the populated PMDs which cover start to end. These PMD areas
2909 ++ * can have holes.
2910 ++ */
2911 ++ for (addr = start; addr < end; addr += PMD_SIZE) {
2912 ++ pmd_t *pmd, *target_pmd;
2913 ++ pgd_t *pgd;
2914 ++ p4d_t *p4d;
2915 ++ pud_t *pud;
2916 ++
2917 ++ pgd = pgd_offset_k(addr);
2918 ++ if (WARN_ON(pgd_none(*pgd)))
2919 ++ return;
2920 ++ p4d = p4d_offset(pgd, addr);
2921 ++ if (WARN_ON(p4d_none(*p4d)))
2922 ++ return;
2923 ++ pud = pud_offset(p4d, addr);
2924 ++ if (pud_none(*pud))
2925 ++ continue;
2926 ++ pmd = pmd_offset(pud, addr);
2927 ++ if (pmd_none(*pmd))
2928 ++ continue;
2929 ++
2930 ++ target_pmd = pti_user_pagetable_walk_pmd(addr);
2931 ++ if (WARN_ON(!target_pmd))
2932 ++ return;
2933 ++
2934 ++ /*
2935 ++ * Copy the PMD. That is, the kernelmode and usermode
2936 ++ * tables will share the last-level page tables of this
2937 ++ * address range
2938 ++ */
2939 ++ *target_pmd = pmd_clear_flags(*pmd, clear);
2940 ++ }
2941 ++}
2942 ++
2943 ++/*
2944 ++ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
2945 ++ * next-level entry on 5-level systems).
2946 ++ */
2947 ++static void __init pti_clone_p4d(unsigned long addr)
2948 ++{
2949 ++ p4d_t *kernel_p4d, *user_p4d;
2950 ++ pgd_t *kernel_pgd;
2951 ++
2952 ++ user_p4d = pti_user_pagetable_walk_p4d(addr);
2953 ++ kernel_pgd = pgd_offset_k(addr);
2954 ++ kernel_p4d = p4d_offset(kernel_pgd, addr);
2955 ++ *user_p4d = *kernel_p4d;
2956 ++}
2957 ++
2958 ++/*
2959 ++ * Clone the CPU_ENTRY_AREA into the user space visible page table.
2960 ++ */
2961 ++static void __init pti_clone_user_shared(void)
2962 ++{
2963 ++ pti_clone_p4d(CPU_ENTRY_AREA_BASE);
2964 ++}
2965 ++
2966 ++/*
2967 ++ * Clone the ESPFIX P4D into the user space visible page table
2968 ++ */
2969 ++static void __init pti_setup_espfix64(void)
2970 ++{
2971 ++#ifdef CONFIG_X86_ESPFIX64
2972 ++ pti_clone_p4d(ESPFIX_BASE_ADDR);
2973 ++#endif
2974 ++}
2975 ++
2976 ++/*
2977 ++ * Clone the populated PMDs of the entry and irqentry text and force it RO.
2978 ++ */
2979 ++static void __init pti_clone_entry_text(void)
2980 ++{
2981 ++ pti_clone_pmds((unsigned long) __entry_text_start,
2982 ++ (unsigned long) __irqentry_text_end, _PAGE_RW);
2983 ++}
2984 ++
2985 ++/*
2986 ++ * Initialize kernel page table isolation
2987 ++ */
2988 ++void __init pti_init(void)
2989 ++{
2990 ++ if (!static_cpu_has(X86_FEATURE_PTI))
2991 ++ return;
2992 ++
2993 ++ pr_info("enabled\n");
2994 ++
2995 ++ pti_clone_user_shared();
2996 ++ pti_clone_entry_text();
2997 ++ pti_setup_espfix64();
2998 ++ pti_setup_vsyscall();
2999 ++}
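
pti_check_boottime_disable() above accepts pti=on, pti=off and pti=auto, honours the legacy nopti switch, and otherwise follows the X86_BUG_CPU_INSECURE bit. A small standalone sketch of that decision tree; cpu_is_insecure and pti_decide() are stand-ins invented for the example, and the string matching is deliberately simplified compared with cmdline_find_option().

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* Stand-in for boot_cpu_has_bug(X86_BUG_CPU_INSECURE). */
static bool cpu_is_insecure = true;

/* Returns true if PTI should be enabled for the given command line. */
static bool pti_decide(const char *cmdline)
{
	/* pti= takes precedence over the legacy nopti switch. */
	if (strstr(cmdline, "pti=off"))
		return false;
	if (strstr(cmdline, "pti=on"))
		return true;
	if (!strstr(cmdline, "pti=auto") && strstr(cmdline, "nopti"))
		return false;
	/* auto (or nothing): follow the CPU bug bit. */
	return cpu_is_insecure;
}

int main(void)
{
	const char *cases[] = { "", "pti=off", "pti=on", "pti=auto", "nopti" };

	for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("cmdline \"%s\" -> PTI %s\n", cases[i],
		       pti_decide(cases[i]) ? "enabled" : "disabled");
	return 0;
}
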
3000 +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
3001 +index 0a1be3adc97e..a1561957dccb 100644
3002 +--- a/arch/x86/mm/tlb.c
3003 ++++ b/arch/x86/mm/tlb.c
3004 +@@ -28,6 +28,38 @@
3005 + * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
3006 + */
3007 +
3008 ++/*
3009 ++ * We get here when we do something requiring a TLB invalidation
3010 ++ * but could not go invalidate all of the contexts. We do the
3011 ++ * necessary invalidation by clearing out the 'ctx_id' which
3012 ++ * forces a TLB flush when the context is loaded.
3013 ++ */
3014 ++void clear_asid_other(void)
3015 ++{
3016 ++ u16 asid;
3017 ++
3018 ++ /*
3019 ++ * This is only expected to be set if we have disabled
3020 ++ * kernel _PAGE_GLOBAL pages.
3021 ++ */
3022 ++ if (!static_cpu_has(X86_FEATURE_PTI)) {
3023 ++ WARN_ON_ONCE(1);
3024 ++ return;
3025 ++ }
3026 ++
3027 ++ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
3028 ++ /* Do not need to flush the current asid */
3029 ++ if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
3030 ++ continue;
3031 ++ /*
3032 ++ * Make sure the next time we go to switch to
3033 ++ * this asid, we do a flush:
3034 ++ */
3035 ++ this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
3036 ++ }
3037 ++ this_cpu_write(cpu_tlbstate.invalidate_other, false);
3038 ++}
3039 ++
3040 + atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
3041 +
3042 +
3043 +@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
3044 + return;
3045 + }
3046 +
3047 ++ if (this_cpu_read(cpu_tlbstate.invalidate_other))
3048 ++ clear_asid_other();
3049 ++
3050 + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
3051 + if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
3052 + next->context.ctx_id)
3053 +@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
3054 + *need_flush = true;
3055 + }
3056 +
3057 ++static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
3058 ++{
3059 ++ unsigned long new_mm_cr3;
3060 ++
3061 ++ if (need_flush) {
3062 ++ invalidate_user_asid(new_asid);
3063 ++ new_mm_cr3 = build_cr3(pgdir, new_asid);
3064 ++ } else {
3065 ++ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
3066 ++ }
3067 ++
3068 ++ /*
3069 ++ * Caution: many callers of this function expect
3070 ++ * that load_cr3() is serializing and orders TLB
3071 ++ * fills with respect to the mm_cpumask writes.
3072 ++ */
3073 ++ write_cr3(new_mm_cr3);
3074 ++}
3075 ++
3076 + void leave_mm(int cpu)
3077 + {
3078 + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
3079 +@@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3080 + if (need_flush) {
3081 + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
3082 + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
3083 +- write_cr3(build_cr3(next->pgd, new_asid));
3084 ++ load_new_mm_cr3(next->pgd, new_asid, true);
3085 +
3086 + /*
3087 + * NB: This gets called via leave_mm() in the idle path
3088 +@@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3089 + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
3090 + } else {
3091 + /* The new ASID is already up to date. */
3092 +- write_cr3(build_cr3_noflush(next->pgd, new_asid));
3093 ++ load_new_mm_cr3(next->pgd, new_asid, false);
3094 +
3095 + /* See above wrt _rcuidle. */
3096 + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
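
clear_asid_other() above invalidates every non-current ASID lazily by zeroing the cached ctx_id, so the next switch to that slot misses the match in choose_new_asid() and reloads CR3 with a flush. A compact userspace sketch of the same idea; struct tlb_slot, mark_other_stale() and pick_slot() are invented names, not kernel interfaces.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define NR_SLOTS 6

struct tlb_slot {
	uint64_t ctx_id;	/* 0 means "never trust this slot" */
};

static struct tlb_slot slots[NR_SLOTS];
static unsigned int loaded_slot;

/* Invalidate every slot except the one currently loaded. */
static void mark_other_stale(void)
{
	for (unsigned int i = 0; i < NR_SLOTS; i++)
		if (i != loaded_slot)
			slots[i].ctx_id = 0;
}

/* Reuse a slot only if its cached ctx_id still matches; else flush. */
static unsigned int pick_slot(uint64_t ctx_id, bool *need_flush)
{
	for (unsigned int i = 0; i < NR_SLOTS; i++) {
		if (slots[i].ctx_id == ctx_id) {
			*need_flush = false;
			return i;
		}
	}
	/* No match: grab slot 0 for the sketch and require a flush. */
	slots[0].ctx_id = ctx_id;
	*need_flush = true;
	return 0;
}

int main(void)
{
	bool flush;

	loaded_slot = pick_slot(42, &flush);	/* first use: flush */
	printf("ctx 42 -> slot %u, flush=%d\n", loaded_slot, flush);

	slots[5].ctx_id = 99;			/* another context is cached */
	mark_other_stale();			/* e.g. after a kernel mapping change */

	loaded_slot = pick_slot(99, &flush);	/* its slot was zapped: flush again */
	printf("ctx 99 -> slot %u, flush=%d\n", loaded_slot, flush);
	return 0;
}
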
3097 +diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
3098 +index 20fb31579b69..39c4b35ac7a4 100644
3099 +--- a/arch/x86/platform/efi/efi_64.c
3100 ++++ b/arch/x86/platform/efi/efi_64.c
3101 +@@ -195,6 +195,9 @@ static pgd_t *efi_pgd;
3102 + * because we want to avoid inserting EFI region mappings (EFI_VA_END
3103 + * to EFI_VA_START) into the standard kernel page tables. Everything
3104 + * else can be shared, see efi_sync_low_kernel_mappings().
3105 ++ *
3106 ++ * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
3107 ++ * allocation.
3108 + */
3109 + int __init efi_alloc_page_tables(void)
3110 + {
3111 +@@ -207,7 +210,7 @@ int __init efi_alloc_page_tables(void)
3112 + return 0;
3113 +
3114 + gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
3115 +- efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
3116 ++ efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
3117 + if (!efi_pgd)
3118 + return -ENOMEM;
3119 +
3120 +diff --git a/block/blk-map.c b/block/blk-map.c
3121 +index d5251edcc0dd..368daa02714e 100644
3122 +--- a/block/blk-map.c
3123 ++++ b/block/blk-map.c
3124 +@@ -12,22 +12,29 @@
3125 + #include "blk.h"
3126 +
3127 + /*
3128 +- * Append a bio to a passthrough request. Only works can be merged into
3129 +- * the request based on the driver constraints.
3130 ++ * Append a bio to a passthrough request. Only works if the bio can be merged
3131 ++ * into the request based on the driver constraints.
3132 + */
3133 +-int blk_rq_append_bio(struct request *rq, struct bio *bio)
3134 ++int blk_rq_append_bio(struct request *rq, struct bio **bio)
3135 + {
3136 +- blk_queue_bounce(rq->q, &bio);
3137 ++ struct bio *orig_bio = *bio;
3138 ++
3139 ++ blk_queue_bounce(rq->q, bio);
3140 +
3141 + if (!rq->bio) {
3142 +- blk_rq_bio_prep(rq->q, rq, bio);
3143 ++ blk_rq_bio_prep(rq->q, rq, *bio);
3144 + } else {
3145 +- if (!ll_back_merge_fn(rq->q, rq, bio))
3146 ++ if (!ll_back_merge_fn(rq->q, rq, *bio)) {
3147 ++ if (orig_bio != *bio) {
3148 ++ bio_put(*bio);
3149 ++ *bio = orig_bio;
3150 ++ }
3151 + return -EINVAL;
3152 ++ }
3153 +
3154 +- rq->biotail->bi_next = bio;
3155 +- rq->biotail = bio;
3156 +- rq->__data_len += bio->bi_iter.bi_size;
3157 ++ rq->biotail->bi_next = *bio;
3158 ++ rq->biotail = *bio;
3159 ++ rq->__data_len += (*bio)->bi_iter.bi_size;
3160 + }
3161 +
3162 + return 0;
3163 +@@ -80,14 +87,12 @@ static int __blk_rq_map_user_iov(struct request *rq,
3164 + * We link the bounce buffer in and could have to traverse it
3165 + * later so we have to get a ref to prevent it from being freed
3166 + */
3167 +- ret = blk_rq_append_bio(rq, bio);
3168 +- bio_get(bio);
3169 ++ ret = blk_rq_append_bio(rq, &bio);
3170 + if (ret) {
3171 +- bio_endio(bio);
3172 + __blk_rq_unmap_user(orig_bio);
3173 +- bio_put(bio);
3174 + return ret;
3175 + }
3176 ++ bio_get(bio);
3177 +
3178 + return 0;
3179 + }
3180 +@@ -220,7 +225,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
3181 + int reading = rq_data_dir(rq) == READ;
3182 + unsigned long addr = (unsigned long) kbuf;
3183 + int do_copy = 0;
3184 +- struct bio *bio;
3185 ++ struct bio *bio, *orig_bio;
3186 + int ret;
3187 +
3188 + if (len > (queue_max_hw_sectors(q) << 9))
3189 +@@ -243,10 +248,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
3190 + if (do_copy)
3191 + rq->rq_flags |= RQF_COPY_USER;
3192 +
3193 +- ret = blk_rq_append_bio(rq, bio);
3194 ++ orig_bio = bio;
3195 ++ ret = blk_rq_append_bio(rq, &bio);
3196 + if (unlikely(ret)) {
3197 + /* request is too big */
3198 +- bio_put(bio);
3199 ++ bio_put(orig_bio);
3200 + return ret;
3201 + }
3202 +
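
blk_rq_append_bio() now takes struct bio ** because blk_queue_bounce() may substitute a bounce clone for the caller's bio, and the caller must see that replacement to clean up correctly on error. A tiny standalone C sketch of why the extra level of indirection matters; maybe_replace() and struct buf are invented for the example.

#include <stdio.h>
#include <stdlib.h>

struct buf {
	char tag[16];
};

/*
 * The callee may swap the caller's object for a substitute (here it always
 * does). Taking "struct buf **" lets the caller observe the replacement;
 * taking "struct buf *" would only update a local copy of the pointer.
 */
static void maybe_replace(struct buf **b)
{
	struct buf *clone = malloc(sizeof(*clone));

	if (!clone)
		return;
	snprintf(clone->tag, sizeof(clone->tag), "bounce");
	*b = clone;		/* caller's pointer now refers to the clone */
}

int main(void)
{
	struct buf orig = { .tag = "original" };
	struct buf *b = &orig;

	maybe_replace(&b);
	printf("caller now sees: %s\n", b->tag);	/* "bounce" */
	if (b != &orig)
		free(b);
	return 0;
}
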
3203 +diff --git a/block/bounce.c b/block/bounce.c
3204 +index fceb1a96480b..1d05c422c932 100644
3205 +--- a/block/bounce.c
3206 ++++ b/block/bounce.c
3207 +@@ -200,6 +200,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
3208 + unsigned i = 0;
3209 + bool bounce = false;
3210 + int sectors = 0;
3211 ++ bool passthrough = bio_is_passthrough(*bio_orig);
3212 +
3213 + bio_for_each_segment(from, *bio_orig, iter) {
3214 + if (i++ < BIO_MAX_PAGES)
3215 +@@ -210,13 +211,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
3216 + if (!bounce)
3217 + return;
3218 +
3219 +- if (sectors < bio_sectors(*bio_orig)) {
3220 ++ if (!passthrough && sectors < bio_sectors(*bio_orig)) {
3221 + bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
3222 + bio_chain(bio, *bio_orig);
3223 + generic_make_request(*bio_orig);
3224 + *bio_orig = bio;
3225 + }
3226 +- bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
3227 ++ bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
3228 ++ bounce_bio_set);
3229 +
3230 + bio_for_each_segment_all(to, bio, i) {
3231 + struct page *page = to->bv_page;
3232 +diff --git a/drivers/android/binder.c b/drivers/android/binder.c
3233 +index 88b4bbe58100..a340766b51fe 100644
3234 +--- a/drivers/android/binder.c
3235 ++++ b/drivers/android/binder.c
3236 +@@ -482,7 +482,8 @@ enum binder_deferred_state {
3237 + * @tsk task_struct for group_leader of process
3238 + * (invariant after initialized)
3239 + * @files files_struct for process
3240 +- * (invariant after initialized)
3241 ++ * (protected by @files_lock)
3242 ++ * @files_lock mutex to protect @files
3243 + * @deferred_work_node: element for binder_deferred_list
3244 + * (protected by binder_deferred_lock)
3245 + * @deferred_work: bitmap of deferred work to perform
3246 +@@ -530,6 +531,7 @@ struct binder_proc {
3247 + int pid;
3248 + struct task_struct *tsk;
3249 + struct files_struct *files;
3250 ++ struct mutex files_lock;
3251 + struct hlist_node deferred_work_node;
3252 + int deferred_work;
3253 + bool is_dead;
3254 +@@ -877,20 +879,26 @@ static void binder_inc_node_tmpref_ilocked(struct binder_node *node);
3255 +
3256 + static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
3257 + {
3258 +- struct files_struct *files = proc->files;
3259 + unsigned long rlim_cur;
3260 + unsigned long irqs;
3261 ++ int ret;
3262 +
3263 +- if (files == NULL)
3264 +- return -ESRCH;
3265 +-
3266 +- if (!lock_task_sighand(proc->tsk, &irqs))
3267 +- return -EMFILE;
3268 +-
3269 ++ mutex_lock(&proc->files_lock);
3270 ++ if (proc->files == NULL) {
3271 ++ ret = -ESRCH;
3272 ++ goto err;
3273 ++ }
3274 ++ if (!lock_task_sighand(proc->tsk, &irqs)) {
3275 ++ ret = -EMFILE;
3276 ++ goto err;
3277 ++ }
3278 + rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE);
3279 + unlock_task_sighand(proc->tsk, &irqs);
3280 +
3281 +- return __alloc_fd(files, 0, rlim_cur, flags);
3282 ++ ret = __alloc_fd(proc->files, 0, rlim_cur, flags);
3283 ++err:
3284 ++ mutex_unlock(&proc->files_lock);
3285 ++ return ret;
3286 + }
3287 +
3288 + /*
3289 +@@ -899,8 +907,10 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
3290 + static void task_fd_install(
3291 + struct binder_proc *proc, unsigned int fd, struct file *file)
3292 + {
3293 ++ mutex_lock(&proc->files_lock);
3294 + if (proc->files)
3295 + __fd_install(proc->files, fd, file);
3296 ++ mutex_unlock(&proc->files_lock);
3297 + }
3298 +
3299 + /*
3300 +@@ -910,9 +920,11 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
3301 + {
3302 + int retval;
3303 +
3304 +- if (proc->files == NULL)
3305 +- return -ESRCH;
3306 +-
3307 ++ mutex_lock(&proc->files_lock);
3308 ++ if (proc->files == NULL) {
3309 ++ retval = -ESRCH;
3310 ++ goto err;
3311 ++ }
3312 + retval = __close_fd(proc->files, fd);
3313 + /* can't restart close syscall because file table entry was cleared */
3314 + if (unlikely(retval == -ERESTARTSYS ||
3315 +@@ -920,7 +932,8 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
3316 + retval == -ERESTARTNOHAND ||
3317 + retval == -ERESTART_RESTARTBLOCK))
3318 + retval = -EINTR;
3319 +-
3320 ++err:
3321 ++ mutex_unlock(&proc->files_lock);
3322 + return retval;
3323 + }
3324 +
3325 +@@ -4627,7 +4640,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
3326 + ret = binder_alloc_mmap_handler(&proc->alloc, vma);
3327 + if (ret)
3328 + return ret;
3329 ++ mutex_lock(&proc->files_lock);
3330 + proc->files = get_files_struct(current);
3331 ++ mutex_unlock(&proc->files_lock);
3332 + return 0;
3333 +
3334 + err_bad_arg:
3335 +@@ -4651,6 +4666,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
3336 + spin_lock_init(&proc->outer_lock);
3337 + get_task_struct(current->group_leader);
3338 + proc->tsk = current->group_leader;
3339 ++ mutex_init(&proc->files_lock);
3340 + INIT_LIST_HEAD(&proc->todo);
3341 + proc->default_priority = task_nice(current);
3342 + binder_dev = container_of(filp->private_data, struct binder_device,
3343 +@@ -4903,9 +4919,11 @@ static void binder_deferred_func(struct work_struct *work)
3344 +
3345 + files = NULL;
3346 + if (defer & BINDER_DEFERRED_PUT_FILES) {
3347 ++ mutex_lock(&proc->files_lock);
3348 + files = proc->files;
3349 + if (files)
3350 + proc->files = NULL;
3351 ++ mutex_unlock(&proc->files_lock);
3352 + }
3353 +
3354 + if (defer & BINDER_DEFERRED_FLUSH)
3355 +diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
3356 +index eb3af2739537..07532d83be0b 100644
3357 +--- a/drivers/base/cacheinfo.c
3358 ++++ b/drivers/base/cacheinfo.c
3359 +@@ -186,6 +186,11 @@ static void cache_associativity(struct cacheinfo *this_leaf)
3360 + this_leaf->ways_of_associativity = (size / nr_sets) / line_size;
3361 + }
3362 +
3363 ++static bool cache_node_is_unified(struct cacheinfo *this_leaf)
3364 ++{
3365 ++ return of_property_read_bool(this_leaf->of_node, "cache-unified");
3366 ++}
3367 ++
3368 + static void cache_of_override_properties(unsigned int cpu)
3369 + {
3370 + int index;
3371 +@@ -194,6 +199,14 @@ static void cache_of_override_properties(unsigned int cpu)
3372 +
3373 + for (index = 0; index < cache_leaves(cpu); index++) {
3374 + this_leaf = this_cpu_ci->info_list + index;
3375 ++ /*
3376 ++ * init_cache_level must setup the cache level correctly
3377 ++ * overriding the architecturally specified levels, so
3378 ++ * if type is NONE at this stage, it should be unified
3379 ++ */
3380 ++ if (this_leaf->type == CACHE_TYPE_NOCACHE &&
3381 ++ cache_node_is_unified(this_leaf))
3382 ++ this_leaf->type = CACHE_TYPE_UNIFIED;
3383 + cache_size(this_leaf);
3384 + cache_get_line_size(this_leaf);
3385 + cache_nr_sets(this_leaf);
3386 +diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
3387 +index eb4528c87c0b..d6f3d9ee1350 100644
3388 +--- a/drivers/gpio/gpiolib-acpi.c
3389 ++++ b/drivers/gpio/gpiolib-acpi.c
3390 +@@ -1074,7 +1074,7 @@ void acpi_gpiochip_add(struct gpio_chip *chip)
3391 + }
3392 +
3393 + if (!chip->names)
3394 +- devprop_gpiochip_set_names(chip);
3395 ++ devprop_gpiochip_set_names(chip, dev_fwnode(chip->parent));
3396 +
3397 + acpi_gpiochip_request_regions(acpi_gpio);
3398 + acpi_gpiochip_scan_gpios(acpi_gpio);
3399 +diff --git a/drivers/gpio/gpiolib-devprop.c b/drivers/gpio/gpiolib-devprop.c
3400 +index 27f383bda7d9..f748aa3e77f7 100644
3401 +--- a/drivers/gpio/gpiolib-devprop.c
3402 ++++ b/drivers/gpio/gpiolib-devprop.c
3403 +@@ -19,30 +19,27 @@
3404 + /**
3405 + * devprop_gpiochip_set_names - Set GPIO line names using device properties
3406 + * @chip: GPIO chip whose lines should be named, if possible
3407 ++ * @fwnode: Property Node containing the gpio-line-names property
3408 + *
3409 + * Looks for device property "gpio-line-names" and if it exists assigns
3410 + * GPIO line names for the chip. The memory allocated for the assigned
3411 + * names belong to the underlying firmware node and should not be released
3412 + * by the caller.
3413 + */
3414 +-void devprop_gpiochip_set_names(struct gpio_chip *chip)
3415 ++void devprop_gpiochip_set_names(struct gpio_chip *chip,
3416 ++ const struct fwnode_handle *fwnode)
3417 + {
3418 + struct gpio_device *gdev = chip->gpiodev;
3419 + const char **names;
3420 + int ret, i;
3421 +
3422 +- if (!chip->parent) {
3423 +- dev_warn(&gdev->dev, "GPIO chip parent is NULL\n");
3424 +- return;
3425 +- }
3426 +-
3427 +- ret = device_property_read_string_array(chip->parent, "gpio-line-names",
3428 ++ ret = fwnode_property_read_string_array(fwnode, "gpio-line-names",
3429 + NULL, 0);
3430 + if (ret < 0)
3431 + return;
3432 +
3433 + if (ret != gdev->ngpio) {
3434 +- dev_warn(chip->parent,
3435 ++ dev_warn(&gdev->dev,
3436 + "names %d do not match number of GPIOs %d\n", ret,
3437 + gdev->ngpio);
3438 + return;
3439 +@@ -52,10 +49,10 @@ void devprop_gpiochip_set_names(struct gpio_chip *chip)
3440 + if (!names)
3441 + return;
3442 +
3443 +- ret = device_property_read_string_array(chip->parent, "gpio-line-names",
3444 ++ ret = fwnode_property_read_string_array(fwnode, "gpio-line-names",
3445 + names, gdev->ngpio);
3446 + if (ret < 0) {
3447 +- dev_warn(chip->parent, "failed to read GPIO line names\n");
3448 ++ dev_warn(&gdev->dev, "failed to read GPIO line names\n");
3449 + kfree(names);
3450 + return;
3451 + }
3452 +diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
3453 +index bfcd20699ec8..ba38f530e403 100644
3454 +--- a/drivers/gpio/gpiolib-of.c
3455 ++++ b/drivers/gpio/gpiolib-of.c
3456 +@@ -493,7 +493,8 @@ int of_gpiochip_add(struct gpio_chip *chip)
3457 +
3458 + /* If the chip defines names itself, these take precedence */
3459 + if (!chip->names)
3460 +- devprop_gpiochip_set_names(chip);
3461 ++ devprop_gpiochip_set_names(chip,
3462 ++ of_fwnode_handle(chip->of_node));
3463 +
3464 + of_node_get(chip->of_node);
3465 +
3466 +diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
3467 +index d003ccb12781..3d4d0634c9dd 100644
3468 +--- a/drivers/gpio/gpiolib.h
3469 ++++ b/drivers/gpio/gpiolib.h
3470 +@@ -224,7 +224,8 @@ static inline int gpio_chip_hwgpio(const struct gpio_desc *desc)
3471 + return desc - &desc->gdev->descs[0];
3472 + }
3473 +
3474 +-void devprop_gpiochip_set_names(struct gpio_chip *chip);
3475 ++void devprop_gpiochip_set_names(struct gpio_chip *chip,
3476 ++ const struct fwnode_handle *fwnode);
3477 +
3478 + /* With descriptor prefix */
3479 +
3480 +diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
3481 +index feafdb961c48..59b2f96d986a 100644
3482 +--- a/drivers/infiniband/core/security.c
3483 ++++ b/drivers/infiniband/core/security.c
3484 +@@ -386,6 +386,9 @@ int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev)
3485 + if (ret)
3486 + return ret;
3487 +
3488 ++ if (!qp->qp_sec)
3489 ++ return 0;
3490 ++
3491 + mutex_lock(&real_qp->qp_sec->mutex);
3492 + ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys,
3493 + qp->qp_sec);
3494 +diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
3495 +index d8f540054392..93c1a57dbff1 100644
3496 +--- a/drivers/infiniband/core/uverbs_cmd.c
3497 ++++ b/drivers/infiniband/core/uverbs_cmd.c
3498 +@@ -2085,8 +2085,8 @@ int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file,
3499 + return -EOPNOTSUPP;
3500 +
3501 + if (ucore->inlen > sizeof(cmd)) {
3502 +- if (ib_is_udata_cleared(ucore, sizeof(cmd),
3503 +- ucore->inlen - sizeof(cmd)))
3504 ++ if (!ib_is_udata_cleared(ucore, sizeof(cmd),
3505 ++ ucore->inlen - sizeof(cmd)))
3506 + return -EOPNOTSUPP;
3507 + }
3508 +
3509 +diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
3510 +index de57d6c11a25..9032f77cc38d 100644
3511 +--- a/drivers/infiniband/core/verbs.c
3512 ++++ b/drivers/infiniband/core/verbs.c
3513 +@@ -1400,7 +1400,8 @@ int ib_close_qp(struct ib_qp *qp)
3514 + spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
3515 +
3516 + atomic_dec(&real_qp->usecnt);
3517 +- ib_close_shared_qp_security(qp->qp_sec);
3518 ++ if (qp->qp_sec)
3519 ++ ib_close_shared_qp_security(qp->qp_sec);
3520 + kfree(qp);
3521 +
3522 + return 0;
3523 +diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
3524 +index eae8ea81c6e2..514c1000ded1 100644
3525 +--- a/drivers/infiniband/hw/cxgb4/cq.c
3526 ++++ b/drivers/infiniband/hw/cxgb4/cq.c
3527 +@@ -586,10 +586,10 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
3528 + ret = -EAGAIN;
3529 + goto skip_cqe;
3530 + }
3531 +- if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
3532 ++ if (unlikely(!CQE_STATUS(hw_cqe) &&
3533 ++ CQE_WRID_MSN(hw_cqe) != wq->rq.msn)) {
3534 + t4_set_wq_in_error(wq);
3535 +- hw_cqe->header |= htonl(CQE_STATUS_V(T4_ERR_MSN));
3536 +- goto proc_cqe;
3537 ++ hw_cqe->header |= cpu_to_be32(CQE_STATUS_V(T4_ERR_MSN));
3538 + }
3539 + goto proc_cqe;
3540 + }
3541 +diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
3542 +index 6ff44dc606eb..3409eee16092 100644
3543 +--- a/drivers/infiniband/hw/hfi1/hfi.h
3544 ++++ b/drivers/infiniband/hw/hfi1/hfi.h
3545 +@@ -1129,7 +1129,6 @@ struct hfi1_devdata {
3546 + u16 pcie_lnkctl;
3547 + u16 pcie_devctl2;
3548 + u32 pci_msix0;
3549 +- u32 pci_lnkctl3;
3550 + u32 pci_tph2;
3551 +
3552 + /*
3553 +diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
3554 +index 09e50fd2a08f..8c7e7a60b715 100644
3555 +--- a/drivers/infiniband/hw/hfi1/pcie.c
3556 ++++ b/drivers/infiniband/hw/hfi1/pcie.c
3557 +@@ -411,15 +411,12 @@ int restore_pci_variables(struct hfi1_devdata *dd)
3558 + if (ret)
3559 + goto error;
3560 +
3561 +- ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
3562 +- dd->pci_lnkctl3);
3563 +- if (ret)
3564 +- goto error;
3565 +-
3566 +- ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
3567 +- if (ret)
3568 +- goto error;
3569 +-
3570 ++ if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) {
3571 ++ ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2,
3572 ++ dd->pci_tph2);
3573 ++ if (ret)
3574 ++ goto error;
3575 ++ }
3576 + return 0;
3577 +
3578 + error:
3579 +@@ -469,15 +466,12 @@ int save_pci_variables(struct hfi1_devdata *dd)
3580 + if (ret)
3581 + goto error;
3582 +
3583 +- ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
3584 +- &dd->pci_lnkctl3);
3585 +- if (ret)
3586 +- goto error;
3587 +-
3588 +- ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
3589 +- if (ret)
3590 +- goto error;
3591 +-
3592 ++ if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) {
3593 ++ ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2,
3594 ++ &dd->pci_tph2);
3595 ++ if (ret)
3596 ++ goto error;
3597 ++ }
3598 + return 0;
3599 +
3600 + error:
3601 +diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
3602 +index 5aff1e33d984..30d479f87cb8 100644
3603 +--- a/drivers/infiniband/hw/mlx5/main.c
3604 ++++ b/drivers/infiniband/hw/mlx5/main.c
3605 +@@ -1415,6 +1415,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
3606 + }
3607 +
3608 + INIT_LIST_HEAD(&context->vma_private_list);
3609 ++ mutex_init(&context->vma_private_list_mutex);
3610 + INIT_LIST_HEAD(&context->db_page_list);
3611 + mutex_init(&context->db_page_mutex);
3612 +
3613 +@@ -1576,7 +1577,9 @@ static void mlx5_ib_vma_close(struct vm_area_struct *area)
3614 + * mlx5_ib_disassociate_ucontext().
3615 + */
3616 + mlx5_ib_vma_priv_data->vma = NULL;
3617 ++ mutex_lock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
3618 + list_del(&mlx5_ib_vma_priv_data->list);
3619 ++ mutex_unlock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
3620 + kfree(mlx5_ib_vma_priv_data);
3621 + }
3622 +
3623 +@@ -1596,10 +1599,13 @@ static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
3624 + return -ENOMEM;
3625 +
3626 + vma_prv->vma = vma;
3627 ++ vma_prv->vma_private_list_mutex = &ctx->vma_private_list_mutex;
3628 + vma->vm_private_data = vma_prv;
3629 + vma->vm_ops = &mlx5_ib_vm_ops;
3630 +
3631 ++ mutex_lock(&ctx->vma_private_list_mutex);
3632 + list_add(&vma_prv->list, vma_head);
3633 ++ mutex_unlock(&ctx->vma_private_list_mutex);
3634 +
3635 + return 0;
3636 + }
3637 +@@ -1642,6 +1648,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
3638 + * mlx5_ib_vma_close.
3639 + */
3640 + down_write(&owning_mm->mmap_sem);
3641 ++ mutex_lock(&context->vma_private_list_mutex);
3642 + list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
3643 + list) {
3644 + vma = vma_private->vma;
3645 +@@ -1656,6 +1663,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
3646 + list_del(&vma_private->list);
3647 + kfree(vma_private);
3648 + }
3649 ++ mutex_unlock(&context->vma_private_list_mutex);
3650 + up_write(&owning_mm->mmap_sem);
3651 + mmput(owning_mm);
3652 + put_task_struct(owning_process);
3653 +diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
3654 +index 189e80cd6b2f..754103372faa 100644
3655 +--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
3656 ++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
3657 +@@ -115,6 +115,8 @@ enum {
3658 + struct mlx5_ib_vma_private_data {
3659 + struct list_head list;
3660 + struct vm_area_struct *vma;
3661 ++ /* protect vma_private_list add/del */
3662 ++ struct mutex *vma_private_list_mutex;
3663 + };
3664 +
3665 + struct mlx5_ib_ucontext {
3666 +@@ -129,6 +131,8 @@ struct mlx5_ib_ucontext {
3667 + /* Transport Domain number */
3668 + u32 tdn;
3669 + struct list_head vma_private_list;
3670 ++ /* protect vma_private_list add/del */
3671 ++ struct mutex vma_private_list_mutex;
3672 +
3673 + unsigned long upd_xlt_page;
3674 + /* protect ODP/KSM */
3675 +diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
3676 +index d7b53d53c116..72d6ffbfd638 100644
3677 +--- a/drivers/net/dsa/bcm_sf2.c
3678 ++++ b/drivers/net/dsa/bcm_sf2.c
3679 +@@ -167,7 +167,7 @@ static void bcm_sf2_gphy_enable_set(struct dsa_switch *ds, bool enable)
3680 + reg = reg_readl(priv, REG_SPHY_CNTRL);
3681 + if (enable) {
3682 + reg |= PHY_RESET;
3683 +- reg &= ~(EXT_PWR_DOWN | IDDQ_BIAS | CK25_DIS);
3684 ++ reg &= ~(EXT_PWR_DOWN | IDDQ_BIAS | IDDQ_GLOBAL_PWR | CK25_DIS);
3685 + reg_writel(priv, reg, REG_SPHY_CNTRL);
3686 + udelay(21);
3687 + reg = reg_readl(priv, REG_SPHY_CNTRL);
3688 +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
3689 +index dc5de275352a..aa764c5e3c6b 100644
3690 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
3691 ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
3692 +@@ -1875,7 +1875,7 @@ static int bnxt_poll_work(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
3693 + * here forever if we consistently cannot allocate
3694 + * buffers.
3695 + */
3696 +- else if (rc == -ENOMEM)
3697 ++ else if (rc == -ENOMEM && budget)
3698 + rx_pkts++;
3699 + else if (rc == -EBUSY) /* partial completion */
3700 + break;
3701 +@@ -1961,7 +1961,7 @@ static int bnxt_poll_nitroa0(struct napi_struct *napi, int budget)
3702 + cpu_to_le32(RX_CMPL_ERRORS_CRC_ERROR);
3703 +
3704 + rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event);
3705 +- if (likely(rc == -EIO))
3706 ++ if (likely(rc == -EIO) && budget)
3707 + rx_pkts++;
3708 + else if (rc == -EBUSY) /* partial completion */
3709 + break;
3710 +diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
3711 +index 656e6af70f0a..aef3fcf2f5b9 100644
3712 +--- a/drivers/net/ethernet/broadcom/tg3.c
3713 ++++ b/drivers/net/ethernet/broadcom/tg3.c
3714 +@@ -14227,7 +14227,9 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu)
3715 + /* Reset PHY, otherwise the read DMA engine will be in a mode that
3716 + * breaks all requests to 256 bytes.
3717 + */
3718 +- if (tg3_asic_rev(tp) == ASIC_REV_57766)
3719 ++ if (tg3_asic_rev(tp) == ASIC_REV_57766 ||
3720 ++ tg3_asic_rev(tp) == ASIC_REV_5717 ||
3721 ++ tg3_asic_rev(tp) == ASIC_REV_5719)
3722 + reset_phy = true;
3723 +
3724 + err = tg3_restart_hw(tp, reset_phy);
3725 +diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
3726 +index 3dc2d771a222..faf7cdc97ebf 100644
3727 +--- a/drivers/net/ethernet/freescale/fec_main.c
3728 ++++ b/drivers/net/ethernet/freescale/fec_main.c
3729 +@@ -818,6 +818,12 @@ static void fec_enet_bd_init(struct net_device *dev)
3730 + for (i = 0; i < txq->bd.ring_size; i++) {
3731 + /* Initialize the BD for every fragment in the page. */
3732 + bdp->cbd_sc = cpu_to_fec16(0);
3733 ++ if (bdp->cbd_bufaddr &&
3734 ++ !IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr)))
3735 ++ dma_unmap_single(&fep->pdev->dev,
3736 ++ fec32_to_cpu(bdp->cbd_bufaddr),
3737 ++ fec16_to_cpu(bdp->cbd_datlen),
3738 ++ DMA_TO_DEVICE);
3739 + if (txq->tx_skbuff[i]) {
3740 + dev_kfree_skb_any(txq->tx_skbuff[i]);
3741 + txq->tx_skbuff[i] = NULL;
3742 +diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c
3743 +index c9798210fa0f..0495487f7b42 100644
3744 +--- a/drivers/net/ethernet/marvell/mvmdio.c
3745 ++++ b/drivers/net/ethernet/marvell/mvmdio.c
3746 +@@ -344,7 +344,8 @@ static int orion_mdio_probe(struct platform_device *pdev)
3747 + dev->regs + MVMDIO_ERR_INT_MASK);
3748 +
3749 + } else if (dev->err_interrupt == -EPROBE_DEFER) {
3750 +- return -EPROBE_DEFER;
3751 ++ ret = -EPROBE_DEFER;
3752 ++ goto out_mdio;
3753 + }
3754 +
3755 + if (pdev->dev.of_node)
3756 +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
3757 +index 1fffdebbc9e8..e9a1fbcc4adf 100644
3758 +--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
3759 ++++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
3760 +@@ -362,7 +362,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
3761 + case MLX5_CMD_OP_QUERY_VPORT_COUNTER:
3762 + case MLX5_CMD_OP_ALLOC_Q_COUNTER:
3763 + case MLX5_CMD_OP_QUERY_Q_COUNTER:
3764 +- case MLX5_CMD_OP_SET_RATE_LIMIT:
3765 ++ case MLX5_CMD_OP_SET_PP_RATE_LIMIT:
3766 + case MLX5_CMD_OP_QUERY_RATE_LIMIT:
3767 + case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
3768 + case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT:
3769 +@@ -505,7 +505,7 @@ const char *mlx5_command_str(int command)
3770 + MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER);
3771 + MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER);
3772 + MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER);
3773 +- MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT);
3774 ++ MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT);
3775 + MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT);
3776 + MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT);
3777 + MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT);
3778 +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
3779 +index 13b5ef9d8703..5fa071620104 100644
3780 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
3781 ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
3782 +@@ -590,6 +590,7 @@ struct mlx5e_channel {
3783 + struct mlx5_core_dev *mdev;
3784 + struct mlx5e_tstamp *tstamp;
3785 + int ix;
3786 ++ int cpu;
3787 + };
3788 +
3789 + struct mlx5e_channels {
3790 +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
3791 +index cc11bbbd0309..3cdb932cae76 100644
3792 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
3793 ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
3794 +@@ -71,11 +71,6 @@ struct mlx5e_channel_param {
3795 + struct mlx5e_cq_param icosq_cq;
3796 + };
3797 +
3798 +-static int mlx5e_get_node(struct mlx5e_priv *priv, int ix)
3799 +-{
3800 +- return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix);
3801 +-}
3802 +-
3803 + static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
3804 + {
3805 + return MLX5_CAP_GEN(mdev, striding_rq) &&
3806 +@@ -452,17 +447,16 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
3807 + int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
3808 + int mtt_sz = mlx5e_get_wqe_mtt_sz();
3809 + int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1;
3810 +- int node = mlx5e_get_node(c->priv, c->ix);
3811 + int i;
3812 +
3813 + rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
3814 +- GFP_KERNEL, node);
3815 ++ GFP_KERNEL, cpu_to_node(c->cpu));
3816 + if (!rq->mpwqe.info)
3817 + goto err_out;
3818 +
3819 + /* We allocate more than mtt_sz as we will align the pointer */
3820 +- rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz,
3821 +- GFP_KERNEL, node);
3822 ++ rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL,
3823 ++ cpu_to_node(c->cpu));
3824 + if (unlikely(!rq->mpwqe.mtt_no_align))
3825 + goto err_free_wqe_info;
3826 +
3827 +@@ -570,7 +564,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
3828 + int err;
3829 + int i;
3830 +
3831 +- rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3832 ++ rqp->wq.db_numa_node = cpu_to_node(c->cpu);
3833 +
3834 + err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
3835 + &rq->wq_ctrl);
3836 +@@ -636,8 +630,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
3837 + default: /* MLX5_WQ_TYPE_LINKED_LIST */
3838 + rq->wqe.frag_info =
3839 + kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
3840 +- GFP_KERNEL,
3841 +- mlx5e_get_node(c->priv, c->ix));
3842 ++ GFP_KERNEL, cpu_to_node(c->cpu));
3843 + if (!rq->wqe.frag_info) {
3844 + err = -ENOMEM;
3845 + goto err_rq_wq_destroy;
3846 +@@ -1007,13 +1000,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
3847 + sq->uar_map = mdev->mlx5e_res.bfreg.map;
3848 + sq->min_inline_mode = params->tx_min_inline_mode;
3849 +
3850 +- param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3851 ++ param->wq.db_numa_node = cpu_to_node(c->cpu);
3852 + err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
3853 + if (err)
3854 + return err;
3855 + sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
3856 +
3857 +- err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix));
3858 ++ err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu));
3859 + if (err)
3860 + goto err_sq_wq_destroy;
3861 +
3862 +@@ -1060,13 +1053,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
3863 + sq->channel = c;
3864 + sq->uar_map = mdev->mlx5e_res.bfreg.map;
3865 +
3866 +- param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3867 ++ param->wq.db_numa_node = cpu_to_node(c->cpu);
3868 + err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
3869 + if (err)
3870 + return err;
3871 + sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
3872 +
3873 +- err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix));
3874 ++ err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu));
3875 + if (err)
3876 + goto err_sq_wq_destroy;
3877 +
3878 +@@ -1132,13 +1125,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
3879 + if (MLX5_IPSEC_DEV(c->priv->mdev))
3880 + set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
3881 +
3882 +- param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3883 ++ param->wq.db_numa_node = cpu_to_node(c->cpu);
3884 + err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
3885 + if (err)
3886 + return err;
3887 + sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
3888 +
3889 +- err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix));
3890 ++ err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu));
3891 + if (err)
3892 + goto err_sq_wq_destroy;
3893 +
3894 +@@ -1510,8 +1503,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c,
3895 + struct mlx5_core_dev *mdev = c->priv->mdev;
3896 + int err;
3897 +
3898 +- param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix);
3899 +- param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3900 ++ param->wq.buf_numa_node = cpu_to_node(c->cpu);
3901 ++ param->wq.db_numa_node = cpu_to_node(c->cpu);
3902 + param->eq_ix = c->ix;
3903 +
3904 + err = mlx5e_alloc_cq_common(mdev, param, cq);
3905 +@@ -1610,6 +1603,11 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq)
3906 + mlx5e_free_cq(cq);
3907 + }
3908 +
3909 ++static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
3910 ++{
3911 ++ return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
3912 ++}
3913 ++
3914 + static int mlx5e_open_tx_cqs(struct mlx5e_channel *c,
3915 + struct mlx5e_params *params,
3916 + struct mlx5e_channel_param *cparam)
3917 +@@ -1758,12 +1756,13 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
3918 + {
3919 + struct mlx5e_cq_moder icocq_moder = {0, 0};
3920 + struct net_device *netdev = priv->netdev;
3921 ++ int cpu = mlx5e_get_cpu(priv, ix);
3922 + struct mlx5e_channel *c;
3923 + unsigned int irq;
3924 + int err;
3925 + int eqn;
3926 +
3927 +- c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix));
3928 ++ c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
3929 + if (!c)
3930 + return -ENOMEM;
3931 +
3932 +@@ -1771,6 +1770,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
3933 + c->mdev = priv->mdev;
3934 + c->tstamp = &priv->tstamp;
3935 + c->ix = ix;
3936 ++ c->cpu = cpu;
3937 + c->pdev = &priv->mdev->pdev->dev;
3938 + c->netdev = priv->netdev;
3939 + c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
3940 +@@ -1859,8 +1859,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c)
3941 + for (tc = 0; tc < c->num_tc; tc++)
3942 + mlx5e_activate_txqsq(&c->sq[tc]);
3943 + mlx5e_activate_rq(&c->rq);
3944 +- netif_set_xps_queue(c->netdev,
3945 +- mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix);
3946 ++ netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix);
3947 + }
3948 +
3949 + static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
3950 +@@ -3554,6 +3553,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
3951 + struct sk_buff *skb,
3952 + netdev_features_t features)
3953 + {
3954 ++ unsigned int offset = 0;
3955 + struct udphdr *udph;
3956 + u8 proto;
3957 + u16 port;
3958 +@@ -3563,7 +3563,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
3959 + proto = ip_hdr(skb)->protocol;
3960 + break;
3961 + case htons(ETH_P_IPV6):
3962 +- proto = ipv6_hdr(skb)->nexthdr;
3963 ++ proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL);
3964 + break;
3965 + default:
3966 + goto out;
3967 +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
3968 +index 3c11d6e2160a..14962969c5ba 100644
3969 +--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
3970 ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
3971 +@@ -66,6 +66,9 @@ static int mlx5_fpga_mem_read_i2c(struct mlx5_fpga_device *fdev, size_t size,
3972 + u8 actual_size;
3973 + int err;
3974 +
3975 ++ if (!size)
3976 ++ return -EINVAL;
3977 ++
3978 + if (!fdev->mdev)
3979 + return -ENOTCONN;
3980 +
3981 +@@ -95,6 +98,9 @@ static int mlx5_fpga_mem_write_i2c(struct mlx5_fpga_device *fdev, size_t size,
3982 + u8 actual_size;
3983 + int err;
3984 +
3985 ++ if (!size)
3986 ++ return -EINVAL;
3987 ++
3988 + if (!fdev->mdev)
3989 + return -ENOTCONN;
3990 +
3991 +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
3992 +index 06562c9a6b9c..8bfc37e4ec87 100644
3993 +--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
3994 ++++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
3995 +@@ -316,9 +316,6 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
3996 + {
3997 + struct mlx5_priv *priv = &dev->priv;
3998 + struct mlx5_eq_table *table = &priv->eq_table;
3999 +- struct irq_affinity irqdesc = {
4000 +- .pre_vectors = MLX5_EQ_VEC_COMP_BASE,
4001 +- };
4002 + int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq);
4003 + int nvec;
4004 +
4005 +@@ -332,10 +329,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
4006 + if (!priv->irq_info)
4007 + goto err_free_msix;
4008 +
4009 +- nvec = pci_alloc_irq_vectors_affinity(dev->pdev,
4010 ++ nvec = pci_alloc_irq_vectors(dev->pdev,
4011 + MLX5_EQ_VEC_COMP_BASE + 1, nvec,
4012 +- PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
4013 +- &irqdesc);
4014 ++ PCI_IRQ_MSIX);
4015 + if (nvec < 0)
4016 + return nvec;
4017 +
4018 +@@ -621,6 +617,63 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev)
4019 + return (u64)timer_l | (u64)timer_h1 << 32;
4020 + }
4021 +
4022 ++static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
4023 ++{
4024 ++ struct mlx5_priv *priv = &mdev->priv;
4025 ++ int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
4026 ++
4027 ++ if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) {
4028 ++ mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
4029 ++ return -ENOMEM;
4030 ++ }
4031 ++
4032 ++ cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node),
4033 ++ priv->irq_info[i].mask);
4034 ++
4035 ++ if (IS_ENABLED(CONFIG_SMP) &&
4036 ++ irq_set_affinity_hint(irq, priv->irq_info[i].mask))
4037 ++ mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq);
4038 ++
4039 ++ return 0;
4040 ++}
4041 ++
4042 ++static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i)
4043 ++{
4044 ++ struct mlx5_priv *priv = &mdev->priv;
4045 ++ int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
4046 ++
4047 ++ irq_set_affinity_hint(irq, NULL);
4048 ++ free_cpumask_var(priv->irq_info[i].mask);
4049 ++}
4050 ++
4051 ++static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev)
4052 ++{
4053 ++ int err;
4054 ++ int i;
4055 ++
4056 ++ for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) {
4057 ++ err = mlx5_irq_set_affinity_hint(mdev, i);
4058 ++ if (err)
4059 ++ goto err_out;
4060 ++ }
4061 ++
4062 ++ return 0;
4063 ++
4064 ++err_out:
4065 ++ for (i--; i >= 0; i--)
4066 ++ mlx5_irq_clear_affinity_hint(mdev, i);
4067 ++
4068 ++ return err;
4069 ++}
4070 ++
4071 ++static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev)
4072 ++{
4073 ++ int i;
4074 ++
4075 ++ for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++)
4076 ++ mlx5_irq_clear_affinity_hint(mdev, i);
4077 ++}
4078 ++
4079 + int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
4080 + unsigned int *irqn)
4081 + {
4082 +@@ -1093,6 +1146,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
4083 + goto err_stop_eqs;
4084 + }
4085 +
4086 ++ err = mlx5_irq_set_affinity_hints(dev);
4087 ++ if (err) {
4088 ++ dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
4089 ++ goto err_affinity_hints;
4090 ++ }
4091 ++
4092 + err = mlx5_init_fs(dev);
4093 + if (err) {
4094 + dev_err(&pdev->dev, "Failed to init flow steering\n");
4095 +@@ -1150,6 +1209,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
4096 + mlx5_cleanup_fs(dev);
4097 +
4098 + err_fs:
4099 ++ mlx5_irq_clear_affinity_hints(dev);
4100 ++
4101 ++err_affinity_hints:
4102 + free_comp_eqs(dev);
4103 +
4104 + err_stop_eqs:
4105 +@@ -1218,6 +1280,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
4106 +
4107 + mlx5_sriov_detach(dev);
4108 + mlx5_cleanup_fs(dev);
4109 ++ mlx5_irq_clear_affinity_hints(dev);
4110 + free_comp_eqs(dev);
4111 + mlx5_stop_eqs(dev);
4112 + mlx5_put_uars_page(dev, priv->uar);
4113 +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
4114 +index db9e665ab104..889130edb715 100644
4115 +--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
4116 ++++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
4117 +@@ -213,8 +213,8 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
4118 + err_cmd:
4119 + memset(din, 0, sizeof(din));
4120 + memset(dout, 0, sizeof(dout));
4121 +- MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
4122 +- MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
4123 ++ MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP);
4124 ++ MLX5_SET(destroy_qp_in, din, qpn, qp->qpn);
4125 + mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout));
4126 + return err;
4127 + }
4128 +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
4129 +index e651e4c02867..d3c33e9eea72 100644
4130 +--- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c
4131 ++++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
4132 +@@ -125,16 +125,16 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table,
4133 + return ret_entry;
4134 + }
4135 +
4136 +-static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev,
4137 ++static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev,
4138 + u32 rate, u16 index)
4139 + {
4140 +- u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)] = {0};
4141 +- u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {0};
4142 ++ u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {0};
4143 ++ u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0};
4144 +
4145 +- MLX5_SET(set_rate_limit_in, in, opcode,
4146 +- MLX5_CMD_OP_SET_RATE_LIMIT);
4147 +- MLX5_SET(set_rate_limit_in, in, rate_limit_index, index);
4148 +- MLX5_SET(set_rate_limit_in, in, rate_limit, rate);
4149 ++ MLX5_SET(set_pp_rate_limit_in, in, opcode,
4150 ++ MLX5_CMD_OP_SET_PP_RATE_LIMIT);
4151 ++ MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, index);
4152 ++ MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rate);
4153 + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
4154 + }
4155 +
4156 +@@ -173,7 +173,7 @@ int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u16 *index)
4157 + entry->refcount++;
4158 + } else {
4159 + /* new rate limit */
4160 +- err = mlx5_set_rate_limit_cmd(dev, rate, entry->index);
4161 ++ err = mlx5_set_pp_rate_limit_cmd(dev, rate, entry->index);
4162 + if (err) {
4163 + mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n",
4164 + rate, err);
4165 +@@ -209,7 +209,7 @@ void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate)
4166 + entry->refcount--;
4167 + if (!entry->refcount) {
4168 + /* need to remove rate */
4169 +- mlx5_set_rate_limit_cmd(dev, 0, entry->index);
4170 ++ mlx5_set_pp_rate_limit_cmd(dev, 0, entry->index);
4171 + entry->rate = 0;
4172 + }
4173 +
4174 +@@ -262,8 +262,8 @@ void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev)
4175 + /* Clear all configured rates */
4176 + for (i = 0; i < table->max_size; i++)
4177 + if (table->rl_entry[i].rate)
4178 +- mlx5_set_rate_limit_cmd(dev, 0,
4179 +- table->rl_entry[i].index);
4180 ++ mlx5_set_pp_rate_limit_cmd(dev, 0,
4181 ++ table->rl_entry[i].index);
4182 +
4183 + kfree(dev->priv.rl_table.rl_entry);
4184 + }
4185 +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
4186 +index 07a9ba6cfc70..2f74953e4561 100644
4187 +--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
4188 ++++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
4189 +@@ -71,9 +71,9 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port)
4190 + struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
4191 + struct mlx5e_vxlan *vxlan;
4192 +
4193 +- spin_lock(&vxlan_db->lock);
4194 ++ spin_lock_bh(&vxlan_db->lock);
4195 + vxlan = radix_tree_lookup(&vxlan_db->tree, port);
4196 +- spin_unlock(&vxlan_db->lock);
4197 ++ spin_unlock_bh(&vxlan_db->lock);
4198 +
4199 + return vxlan;
4200 + }
4201 +@@ -88,8 +88,12 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
4202 + struct mlx5e_vxlan *vxlan;
4203 + int err;
4204 +
4205 +- if (mlx5e_vxlan_lookup_port(priv, port))
4206 ++ mutex_lock(&priv->state_lock);
4207 ++ vxlan = mlx5e_vxlan_lookup_port(priv, port);
4208 ++ if (vxlan) {
4209 ++ atomic_inc(&vxlan->refcount);
4210 + goto free_work;
4211 ++ }
4212 +
4213 + if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port))
4214 + goto free_work;
4215 +@@ -99,10 +103,11 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
4216 + goto err_delete_port;
4217 +
4218 + vxlan->udp_port = port;
4219 ++ atomic_set(&vxlan->refcount, 1);
4220 +
4221 +- spin_lock_irq(&vxlan_db->lock);
4222 ++ spin_lock_bh(&vxlan_db->lock);
4223 + err = radix_tree_insert(&vxlan_db->tree, vxlan->udp_port, vxlan);
4224 +- spin_unlock_irq(&vxlan_db->lock);
4225 ++ spin_unlock_bh(&vxlan_db->lock);
4226 + if (err)
4227 + goto err_free;
4228 +
4229 +@@ -113,35 +118,39 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
4230 + err_delete_port:
4231 + mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
4232 + free_work:
4233 ++ mutex_unlock(&priv->state_lock);
4234 + kfree(vxlan_work);
4235 + }
4236 +
4237 +-static void __mlx5e_vxlan_core_del_port(struct mlx5e_priv *priv, u16 port)
4238 ++static void mlx5e_vxlan_del_port(struct work_struct *work)
4239 + {
4240 ++ struct mlx5e_vxlan_work *vxlan_work =
4241 ++ container_of(work, struct mlx5e_vxlan_work, work);
4242 ++ struct mlx5e_priv *priv = vxlan_work->priv;
4243 + struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
4244 ++ u16 port = vxlan_work->port;
4245 + struct mlx5e_vxlan *vxlan;
4246 ++ bool remove = false;
4247 +
4248 +- spin_lock_irq(&vxlan_db->lock);
4249 +- vxlan = radix_tree_delete(&vxlan_db->tree, port);
4250 +- spin_unlock_irq(&vxlan_db->lock);
4251 +-
4252 ++ mutex_lock(&priv->state_lock);
4253 ++ spin_lock_bh(&vxlan_db->lock);
4254 ++ vxlan = radix_tree_lookup(&vxlan_db->tree, port);
4255 + if (!vxlan)
4256 +- return;
4257 +-
4258 +- mlx5e_vxlan_core_del_port_cmd(priv->mdev, vxlan->udp_port);
4259 +-
4260 +- kfree(vxlan);
4261 +-}
4262 ++ goto out_unlock;
4263 +
4264 +-static void mlx5e_vxlan_del_port(struct work_struct *work)
4265 +-{
4266 +- struct mlx5e_vxlan_work *vxlan_work =
4267 +- container_of(work, struct mlx5e_vxlan_work, work);
4268 +- struct mlx5e_priv *priv = vxlan_work->priv;
4269 +- u16 port = vxlan_work->port;
4270 ++ if (atomic_dec_and_test(&vxlan->refcount)) {
4271 ++ radix_tree_delete(&vxlan_db->tree, port);
4272 ++ remove = true;
4273 ++ }
4274 +
4275 +- __mlx5e_vxlan_core_del_port(priv, port);
4276 ++out_unlock:
4277 ++ spin_unlock_bh(&vxlan_db->lock);
4278 +
4279 ++ if (remove) {
4280 ++ mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
4281 ++ kfree(vxlan);
4282 ++ }
4283 ++ mutex_unlock(&priv->state_lock);
4284 + kfree(vxlan_work);
4285 + }
4286 +
4287 +@@ -171,12 +180,11 @@ void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv)
4288 + struct mlx5e_vxlan *vxlan;
4289 + unsigned int port = 0;
4290 +
4291 +- spin_lock_irq(&vxlan_db->lock);
4292 ++ /* Lockless since we are the only radix-tree consumers, wq is disabled */
4293 + while (radix_tree_gang_lookup(&vxlan_db->tree, (void **)&vxlan, port, 1)) {
4294 + port = vxlan->udp_port;
4295 +- spin_unlock_irq(&vxlan_db->lock);
4296 +- __mlx5e_vxlan_core_del_port(priv, (u16)port);
4297 +- spin_lock_irq(&vxlan_db->lock);
4298 ++ radix_tree_delete(&vxlan_db->tree, port);
4299 ++ mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
4300 ++ kfree(vxlan);
4301 + }
4302 +- spin_unlock_irq(&vxlan_db->lock);
4303 + }
4304 +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
4305 +index 5def12c048e3..5ef6ae7d568a 100644
4306 +--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
4307 ++++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
4308 +@@ -36,6 +36,7 @@
4309 + #include "en.h"
4310 +
4311 + struct mlx5e_vxlan {
4312 ++ atomic_t refcount;
4313 + u16 udp_port;
4314 + };
4315 +
4316 +diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
4317 +index db38880f54b4..3ead7439821c 100644
4318 +--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
4319 ++++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
4320 +@@ -4164,6 +4164,7 @@ static int mlxsw_sp_port_stp_set(struct mlxsw_sp_port *mlxsw_sp_port,
4321 +
4322 + static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
4323 + {
4324 ++ u16 vid = 1;
4325 + int err;
4326 +
4327 + err = mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, true);
4328 +@@ -4176,8 +4177,19 @@ static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
4329 + true, false);
4330 + if (err)
4331 + goto err_port_vlan_set;
4332 ++
4333 ++ for (; vid <= VLAN_N_VID - 1; vid++) {
4334 ++ err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port,
4335 ++ vid, false);
4336 ++ if (err)
4337 ++ goto err_vid_learning_set;
4338 ++ }
4339 ++
4340 + return 0;
4341 +
4342 ++err_vid_learning_set:
4343 ++ for (vid--; vid >= 1; vid--)
4344 ++ mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true);
4345 + err_port_vlan_set:
4346 + mlxsw_sp_port_stp_set(mlxsw_sp_port, false);
4347 + err_port_stp_set:
4348 +@@ -4187,6 +4199,12 @@ static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
4349 +
4350 + static void mlxsw_sp_port_ovs_leave(struct mlxsw_sp_port *mlxsw_sp_port)
4351 + {
4352 ++ u16 vid;
4353 ++
4354 ++ for (vid = VLAN_N_VID - 1; vid >= 1; vid--)
4355 ++ mlxsw_sp_port_vid_learning_set(mlxsw_sp_port,
4356 ++ vid, true);
4357 ++
4358 + mlxsw_sp_port_vlan_set(mlxsw_sp_port, 2, VLAN_N_VID - 1,
4359 + false, false);
4360 + mlxsw_sp_port_stp_set(mlxsw_sp_port, false);
4361 +diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
4362 +index 32bf1fecf864..9b85cbd5a231 100644
4363 +--- a/drivers/net/ethernet/sfc/tx.c
4364 ++++ b/drivers/net/ethernet/sfc/tx.c
4365 +@@ -77,6 +77,7 @@ static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
4366 + }
4367 +
4368 + if (buffer->flags & EFX_TX_BUF_SKB) {
4369 ++ EFX_WARN_ON_PARANOID(!pkts_compl || !bytes_compl);
4370 + (*pkts_compl)++;
4371 + (*bytes_compl) += buffer->skb->len;
4372 + dev_consume_skb_any((struct sk_buff *)buffer->skb);
4373 +@@ -426,12 +427,14 @@ static int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
4374 + static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
4375 + {
4376 + struct efx_tx_buffer *buffer;
4377 ++ unsigned int bytes_compl = 0;
4378 ++ unsigned int pkts_compl = 0;
4379 +
4380 + /* Work backwards until we hit the original insert pointer value */
4381 + while (tx_queue->insert_count != tx_queue->write_count) {
4382 + --tx_queue->insert_count;
4383 + buffer = __efx_tx_queue_get_insert_buffer(tx_queue);
4384 +- efx_dequeue_buffer(tx_queue, buffer, NULL, NULL);
4385 ++ efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
4386 + }
4387 + }
4388 +
4389 +diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
4390 +index 4d02b27df044..a3f456b91c99 100644
4391 +--- a/drivers/net/phy/marvell.c
4392 ++++ b/drivers/net/phy/marvell.c
4393 +@@ -2069,7 +2069,7 @@ static struct phy_driver marvell_drivers[] = {
4394 + .flags = PHY_HAS_INTERRUPT,
4395 + .probe = marvell_probe,
4396 + .config_init = &m88e1145_config_init,
4397 +- .config_aneg = &marvell_config_aneg,
4398 ++ .config_aneg = &m88e1101_config_aneg,
4399 + .read_status = &genphy_read_status,
4400 + .ack_interrupt = &marvell_ack_interrupt,
4401 + .config_intr = &marvell_config_intr,
4402 +diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
4403 +index fdb43dd9b5cd..6c45ff650ec7 100644
4404 +--- a/drivers/net/phy/micrel.c
4405 ++++ b/drivers/net/phy/micrel.c
4406 +@@ -622,6 +622,7 @@ static int ksz9031_read_status(struct phy_device *phydev)
4407 + phydev->link = 0;
4408 + if (phydev->drv->config_intr && phy_interrupt_is_valid(phydev))
4409 + phydev->drv->config_intr(phydev);
4410 ++ return genphy_config_aneg(phydev);
4411 + }
4412 +
4413 + return 0;
4414 +diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
4415 +index bcb4755bcd95..4b377b978a0b 100644
4416 +--- a/drivers/net/phy/phylink.c
4417 ++++ b/drivers/net/phy/phylink.c
4418 +@@ -525,6 +525,7 @@ struct phylink *phylink_create(struct net_device *ndev, struct device_node *np,
4419 + pl->link_config.pause = MLO_PAUSE_AN;
4420 + pl->link_config.speed = SPEED_UNKNOWN;
4421 + pl->link_config.duplex = DUPLEX_UNKNOWN;
4422 ++ pl->link_config.an_enabled = true;
4423 + pl->ops = ops;
4424 + __set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state);
4425 +
4426 +@@ -948,6 +949,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl,
4427 + mutex_lock(&pl->state_mutex);
4428 + /* Configure the MAC to match the new settings */
4429 + linkmode_copy(pl->link_config.advertising, our_kset.link_modes.advertising);
4430 ++ pl->link_config.interface = config.interface;
4431 + pl->link_config.speed = our_kset.base.speed;
4432 + pl->link_config.duplex = our_kset.base.duplex;
4433 + pl->link_config.an_enabled = our_kset.base.autoneg != AUTONEG_DISABLE;
4434 +diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
4435 +index 81394a4b2803..2092febfcb42 100644
4436 +--- a/drivers/net/usb/qmi_wwan.c
4437 ++++ b/drivers/net/usb/qmi_wwan.c
4438 +@@ -1204,6 +1204,7 @@ static const struct usb_device_id products[] = {
4439 + {QMI_FIXED_INTF(0x1199, 0x9079, 10)}, /* Sierra Wireless EM74xx */
4440 + {QMI_FIXED_INTF(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */
4441 + {QMI_FIXED_INTF(0x1199, 0x907b, 10)}, /* Sierra Wireless EM74xx */
4442 ++ {QMI_FIXED_INTF(0x1199, 0x9091, 8)}, /* Sierra Wireless EM7565 */
4443 + {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)}, /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */
4444 + {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */
4445 + {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */
4446 +diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
4447 +index a2f4e52fadb5..9e9202b50e73 100644
4448 +--- a/drivers/net/vxlan.c
4449 ++++ b/drivers/net/vxlan.c
4450 +@@ -3105,6 +3105,11 @@ static void vxlan_config_apply(struct net_device *dev,
4451 +
4452 + max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
4453 + VXLAN_HEADROOM);
4454 ++ if (max_mtu < ETH_MIN_MTU)
4455 ++ max_mtu = ETH_MIN_MTU;
4456 ++
4457 ++ if (!changelink && !conf->mtu)
4458 ++ dev->mtu = max_mtu;
4459 + }
4460 +
4461 + if (dev->mtu > max_mtu)
4462 +diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c
4463 +index 4307bf0013e1..63e916d4d069 100644
4464 +--- a/drivers/phy/tegra/xusb.c
4465 ++++ b/drivers/phy/tegra/xusb.c
4466 +@@ -75,14 +75,14 @@ MODULE_DEVICE_TABLE(of, tegra_xusb_padctl_of_match);
4467 + static struct device_node *
4468 + tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
4469 + {
4470 +- /*
4471 +- * of_find_node_by_name() drops a reference, so make sure to grab one.
4472 +- */
4473 +- struct device_node *np = of_node_get(padctl->dev->of_node);
4474 ++ struct device_node *pads, *np;
4475 ++
4476 ++ pads = of_get_child_by_name(padctl->dev->of_node, "pads");
4477 ++ if (!pads)
4478 ++ return NULL;
4479 +
4480 +- np = of_find_node_by_name(np, "pads");
4481 +- if (np)
4482 +- np = of_find_node_by_name(np, name);
4483 ++ np = of_get_child_by_name(pads, name);
4484 ++ of_node_put(pads);
4485 +
4486 + return np;
4487 + }
4488 +@@ -90,16 +90,16 @@ tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
4489 + static struct device_node *
4490 + tegra_xusb_pad_find_phy_node(struct tegra_xusb_pad *pad, unsigned int index)
4491 + {
4492 +- /*
4493 +- * of_find_node_by_name() drops a reference, so make sure to grab one.
4494 +- */
4495 +- struct device_node *np = of_node_get(pad->dev.of_node);
4496 ++ struct device_node *np, *lanes;
4497 +
4498 +- np = of_find_node_by_name(np, "lanes");
4499 +- if (!np)
4500 ++ lanes = of_get_child_by_name(pad->dev.of_node, "lanes");
4501 ++ if (!lanes)
4502 + return NULL;
4503 +
4504 +- return of_find_node_by_name(np, pad->soc->lanes[index].name);
4505 ++ np = of_get_child_by_name(lanes, pad->soc->lanes[index].name);
4506 ++ of_node_put(lanes);
4507 ++
4508 ++ return np;
4509 + }
4510 +
4511 + static int
4512 +@@ -195,7 +195,7 @@ int tegra_xusb_pad_register(struct tegra_xusb_pad *pad,
4513 + unsigned int i;
4514 + int err;
4515 +
4516 +- children = of_find_node_by_name(pad->dev.of_node, "lanes");
4517 ++ children = of_get_child_by_name(pad->dev.of_node, "lanes");
4518 + if (!children)
4519 + return -ENODEV;
4520 +
4521 +@@ -444,21 +444,21 @@ static struct device_node *
4522 + tegra_xusb_find_port_node(struct tegra_xusb_padctl *padctl, const char *type,
4523 + unsigned int index)
4524 + {
4525 +- /*
4526 +- * of_find_node_by_name() drops a reference, so make sure to grab one.
4527 +- */
4528 +- struct device_node *np = of_node_get(padctl->dev->of_node);
4529 ++ struct device_node *ports, *np;
4530 ++ char *name;
4531 +
4532 +- np = of_find_node_by_name(np, "ports");
4533 +- if (np) {
4534 +- char *name;
4535 ++ ports = of_get_child_by_name(padctl->dev->of_node, "ports");
4536 ++ if (!ports)
4537 ++ return NULL;
4538 +
4539 +- name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
4540 +- if (!name)
4541 +- return ERR_PTR(-ENOMEM);
4542 +- np = of_find_node_by_name(np, name);
4543 +- kfree(name);
4544 ++ name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
4545 ++ if (!name) {
4546 ++ of_node_put(ports);
4547 ++ return ERR_PTR(-ENOMEM);
4548 + }
4549 ++ np = of_get_child_by_name(ports, name);
4550 ++ kfree(name);
4551 ++ of_node_put(ports);
4552 +
4553 + return np;
4554 + }
4555 +@@ -847,7 +847,7 @@ static void tegra_xusb_remove_ports(struct tegra_xusb_padctl *padctl)
4556 +
4557 + static int tegra_xusb_padctl_probe(struct platform_device *pdev)
4558 + {
4559 +- struct device_node *np = of_node_get(pdev->dev.of_node);
4560 ++ struct device_node *np = pdev->dev.of_node;
4561 + const struct tegra_xusb_padctl_soc *soc;
4562 + struct tegra_xusb_padctl *padctl;
4563 + const struct of_device_id *match;
4564 +@@ -855,7 +855,7 @@ static int tegra_xusb_padctl_probe(struct platform_device *pdev)
4565 + int err;
4566 +
4567 + /* for backwards compatibility with old device trees */
4568 +- np = of_find_node_by_name(np, "pads");
4569 ++ np = of_get_child_by_name(np, "pads");
4570 + if (!np) {
4571 + dev_warn(&pdev->dev, "deprecated DT, using legacy driver\n");
4572 + return tegra_xusb_padctl_legacy_probe(pdev);
4573 +diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
4574 +index 5340efc673a9..92dd4aef21a3 100644
4575 +--- a/drivers/s390/net/qeth_core.h
4576 ++++ b/drivers/s390/net/qeth_core.h
4577 +@@ -564,9 +564,9 @@ enum qeth_cq {
4578 + };
4579 +
4580 + struct qeth_ipato {
4581 +- int enabled;
4582 +- int invert4;
4583 +- int invert6;
4584 ++ bool enabled;
4585 ++ bool invert4;
4586 ++ bool invert6;
4587 + struct list_head entries;
4588 + };
4589 +
4590 +diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
4591 +index 330e5d3dadf3..7c7a244b6684 100644
4592 +--- a/drivers/s390/net/qeth_core_main.c
4593 ++++ b/drivers/s390/net/qeth_core_main.c
4594 +@@ -1479,9 +1479,9 @@ static int qeth_setup_card(struct qeth_card *card)
4595 + qeth_set_intial_options(card);
4596 + /* IP address takeover */
4597 + INIT_LIST_HEAD(&card->ipato.entries);
4598 +- card->ipato.enabled = 0;
4599 +- card->ipato.invert4 = 0;
4600 +- card->ipato.invert6 = 0;
4601 ++ card->ipato.enabled = false;
4602 ++ card->ipato.invert4 = false;
4603 ++ card->ipato.invert6 = false;
4604 + /* init QDIO stuff */
4605 + qeth_init_qdio_info(card);
4606 + INIT_DELAYED_WORK(&card->buffer_reclaim_work, qeth_buffer_reclaim_work);
4607 +@@ -5445,6 +5445,13 @@ int qeth_poll(struct napi_struct *napi, int budget)
4608 + }
4609 + EXPORT_SYMBOL_GPL(qeth_poll);
4610 +
4611 ++static int qeth_setassparms_inspect_rc(struct qeth_ipa_cmd *cmd)
4612 ++{
4613 ++ if (!cmd->hdr.return_code)
4614 ++ cmd->hdr.return_code = cmd->data.setassparms.hdr.return_code;
4615 ++ return cmd->hdr.return_code;
4616 ++}
4617 ++
4618 + int qeth_setassparms_cb(struct qeth_card *card,
4619 + struct qeth_reply *reply, unsigned long data)
4620 + {
4621 +@@ -6304,7 +6311,7 @@ static int qeth_ipa_checksum_run_cmd_cb(struct qeth_card *card,
4622 + (struct qeth_checksum_cmd *)reply->param;
4623 +
4624 + QETH_CARD_TEXT(card, 4, "chkdoccb");
4625 +- if (cmd->hdr.return_code)
4626 ++ if (qeth_setassparms_inspect_rc(cmd))
4627 + return 0;
4628 +
4629 + memset(chksum_cb, 0, sizeof(*chksum_cb));
4630 +diff --git a/drivers/s390/net/qeth_l3.h b/drivers/s390/net/qeth_l3.h
4631 +index 194ae9b577cc..e5833837b799 100644
4632 +--- a/drivers/s390/net/qeth_l3.h
4633 ++++ b/drivers/s390/net/qeth_l3.h
4634 +@@ -82,7 +82,7 @@ void qeth_l3_del_vipa(struct qeth_card *, enum qeth_prot_versions, const u8 *);
4635 + int qeth_l3_add_rxip(struct qeth_card *, enum qeth_prot_versions, const u8 *);
4636 + void qeth_l3_del_rxip(struct qeth_card *card, enum qeth_prot_versions,
4637 + const u8 *);
4638 +-int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *, struct qeth_ipaddr *);
4639 ++void qeth_l3_update_ipato(struct qeth_card *card);
4640 + struct qeth_ipaddr *qeth_l3_get_addr_buffer(enum qeth_prot_versions);
4641 + int qeth_l3_add_ip(struct qeth_card *, struct qeth_ipaddr *);
4642 + int qeth_l3_delete_ip(struct qeth_card *, struct qeth_ipaddr *);
4643 +diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
4644 +index 27185ab38136..36dee176f8e2 100644
4645 +--- a/drivers/s390/net/qeth_l3_main.c
4646 ++++ b/drivers/s390/net/qeth_l3_main.c
4647 +@@ -163,8 +163,8 @@ static void qeth_l3_convert_addr_to_bits(u8 *addr, u8 *bits, int len)
4648 + }
4649 + }
4650 +
4651 +-int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
4652 +- struct qeth_ipaddr *addr)
4653 ++static bool qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
4654 ++ struct qeth_ipaddr *addr)
4655 + {
4656 + struct qeth_ipato_entry *ipatoe;
4657 + u8 addr_bits[128] = {0, };
4658 +@@ -173,6 +173,8 @@ int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
4659 +
4660 + if (!card->ipato.enabled)
4661 + return 0;
4662 ++ if (addr->type != QETH_IP_TYPE_NORMAL)
4663 ++ return 0;
4664 +
4665 + qeth_l3_convert_addr_to_bits((u8 *) &addr->u, addr_bits,
4666 + (addr->proto == QETH_PROT_IPV4)? 4:16);
4667 +@@ -289,8 +291,7 @@ int qeth_l3_add_ip(struct qeth_card *card, struct qeth_ipaddr *tmp_addr)
4668 + memcpy(addr, tmp_addr, sizeof(struct qeth_ipaddr));
4669 + addr->ref_counter = 1;
4670 +
4671 +- if (addr->type == QETH_IP_TYPE_NORMAL &&
4672 +- qeth_l3_is_addr_covered_by_ipato(card, addr)) {
4673 ++ if (qeth_l3_is_addr_covered_by_ipato(card, addr)) {
4674 + QETH_CARD_TEXT(card, 2, "tkovaddr");
4675 + addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
4676 + }
4677 +@@ -604,6 +605,27 @@ int qeth_l3_setrouting_v6(struct qeth_card *card)
4678 + /*
4679 + * IP address takeover related functions
4680 + */
4681 ++
4682 ++/**
4683 ++ * qeth_l3_update_ipato() - Update 'takeover' property, for all NORMAL IPs.
4684 ++ *
4685 ++ * Caller must hold ip_lock.
4686 ++ */
4687 ++void qeth_l3_update_ipato(struct qeth_card *card)
4688 ++{
4689 ++ struct qeth_ipaddr *addr;
4690 ++ unsigned int i;
4691 ++
4692 ++ hash_for_each(card->ip_htable, i, addr, hnode) {
4693 ++ if (addr->type != QETH_IP_TYPE_NORMAL)
4694 ++ continue;
4695 ++ if (qeth_l3_is_addr_covered_by_ipato(card, addr))
4696 ++ addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
4697 ++ else
4698 ++ addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG;
4699 ++ }
4700 ++}
4701 ++
4702 + static void qeth_l3_clear_ipato_list(struct qeth_card *card)
4703 + {
4704 + struct qeth_ipato_entry *ipatoe, *tmp;
4705 +@@ -615,6 +637,7 @@ static void qeth_l3_clear_ipato_list(struct qeth_card *card)
4706 + kfree(ipatoe);
4707 + }
4708 +
4709 ++ qeth_l3_update_ipato(card);
4710 + spin_unlock_bh(&card->ip_lock);
4711 + }
4712 +
4713 +@@ -639,8 +662,10 @@ int qeth_l3_add_ipato_entry(struct qeth_card *card,
4714 + }
4715 + }
4716 +
4717 +- if (!rc)
4718 ++ if (!rc) {
4719 + list_add_tail(&new->entry, &card->ipato.entries);
4720 ++ qeth_l3_update_ipato(card);
4721 ++ }
4722 +
4723 + spin_unlock_bh(&card->ip_lock);
4724 +
4725 +@@ -663,6 +688,7 @@ void qeth_l3_del_ipato_entry(struct qeth_card *card,
4726 + (proto == QETH_PROT_IPV4)? 4:16) &&
4727 + (ipatoe->mask_bits == mask_bits)) {
4728 + list_del(&ipatoe->entry);
4729 ++ qeth_l3_update_ipato(card);
4730 + kfree(ipatoe);
4731 + }
4732 + }
4733 +diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c
4734 +index 7a829ad77783..1295dd8ec849 100644
4735 +--- a/drivers/s390/net/qeth_l3_sys.c
4736 ++++ b/drivers/s390/net/qeth_l3_sys.c
4737 +@@ -370,8 +370,8 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
4738 + struct device_attribute *attr, const char *buf, size_t count)
4739 + {
4740 + struct qeth_card *card = dev_get_drvdata(dev);
4741 +- struct qeth_ipaddr *addr;
4742 +- int i, rc = 0;
4743 ++ bool enable;
4744 ++ int rc = 0;
4745 +
4746 + if (!card)
4747 + return -EINVAL;
4748 +@@ -384,25 +384,18 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
4749 + }
4750 +
4751 + if (sysfs_streq(buf, "toggle")) {
4752 +- card->ipato.enabled = (card->ipato.enabled)? 0 : 1;
4753 +- } else if (sysfs_streq(buf, "1")) {
4754 +- card->ipato.enabled = 1;
4755 +- hash_for_each(card->ip_htable, i, addr, hnode) {
4756 +- if ((addr->type == QETH_IP_TYPE_NORMAL) &&
4757 +- qeth_l3_is_addr_covered_by_ipato(card, addr))
4758 +- addr->set_flags |=
4759 +- QETH_IPA_SETIP_TAKEOVER_FLAG;
4760 +- }
4761 +- } else if (sysfs_streq(buf, "0")) {
4762 +- card->ipato.enabled = 0;
4763 +- hash_for_each(card->ip_htable, i, addr, hnode) {
4764 +- if (addr->set_flags &
4765 +- QETH_IPA_SETIP_TAKEOVER_FLAG)
4766 +- addr->set_flags &=
4767 +- ~QETH_IPA_SETIP_TAKEOVER_FLAG;
4768 +- }
4769 +- } else
4770 ++ enable = !card->ipato.enabled;
4771 ++ } else if (kstrtobool(buf, &enable)) {
4772 + rc = -EINVAL;
4773 ++ goto out;
4774 ++ }
4775 ++
4776 ++ if (card->ipato.enabled != enable) {
4777 ++ card->ipato.enabled = enable;
4778 ++ spin_lock_bh(&card->ip_lock);
4779 ++ qeth_l3_update_ipato(card);
4780 ++ spin_unlock_bh(&card->ip_lock);
4781 ++ }
4782 + out:
4783 + mutex_unlock(&card->conf_mutex);
4784 + return rc ? rc : count;
4785 +@@ -428,20 +421,27 @@ static ssize_t qeth_l3_dev_ipato_invert4_store(struct device *dev,
4786 + const char *buf, size_t count)
4787 + {
4788 + struct qeth_card *card = dev_get_drvdata(dev);
4789 ++ bool invert;
4790 + int rc = 0;
4791 +
4792 + if (!card)
4793 + return -EINVAL;
4794 +
4795 + mutex_lock(&card->conf_mutex);
4796 +- if (sysfs_streq(buf, "toggle"))
4797 +- card->ipato.invert4 = (card->ipato.invert4)? 0 : 1;
4798 +- else if (sysfs_streq(buf, "1"))
4799 +- card->ipato.invert4 = 1;
4800 +- else if (sysfs_streq(buf, "0"))
4801 +- card->ipato.invert4 = 0;
4802 +- else
4803 ++ if (sysfs_streq(buf, "toggle")) {
4804 ++ invert = !card->ipato.invert4;
4805 ++ } else if (kstrtobool(buf, &invert)) {
4806 + rc = -EINVAL;
4807 ++ goto out;
4808 ++ }
4809 ++
4810 ++ if (card->ipato.invert4 != invert) {
4811 ++ card->ipato.invert4 = invert;
4812 ++ spin_lock_bh(&card->ip_lock);
4813 ++ qeth_l3_update_ipato(card);
4814 ++ spin_unlock_bh(&card->ip_lock);
4815 ++ }
4816 ++out:
4817 + mutex_unlock(&card->conf_mutex);
4818 + return rc ? rc : count;
4819 + }
4820 +@@ -607,20 +607,27 @@ static ssize_t qeth_l3_dev_ipato_invert6_store(struct device *dev,
4821 + struct device_attribute *attr, const char *buf, size_t count)
4822 + {
4823 + struct qeth_card *card = dev_get_drvdata(dev);
4824 ++ bool invert;
4825 + int rc = 0;
4826 +
4827 + if (!card)
4828 + return -EINVAL;
4829 +
4830 + mutex_lock(&card->conf_mutex);
4831 +- if (sysfs_streq(buf, "toggle"))
4832 +- card->ipato.invert6 = (card->ipato.invert6)? 0 : 1;
4833 +- else if (sysfs_streq(buf, "1"))
4834 +- card->ipato.invert6 = 1;
4835 +- else if (sysfs_streq(buf, "0"))
4836 +- card->ipato.invert6 = 0;
4837 +- else
4838 ++ if (sysfs_streq(buf, "toggle")) {
4839 ++ invert = !card->ipato.invert6;
4840 ++ } else if (kstrtobool(buf, &invert)) {
4841 + rc = -EINVAL;
4842 ++ goto out;
4843 ++ }
4844 ++
4845 ++ if (card->ipato.invert6 != invert) {
4846 ++ card->ipato.invert6 = invert;
4847 ++ spin_lock_bh(&card->ip_lock);
4848 ++ qeth_l3_update_ipato(card);
4849 ++ spin_unlock_bh(&card->ip_lock);
4850 ++ }
4851 ++out:
4852 + mutex_unlock(&card->conf_mutex);
4853 + return rc ? rc : count;
4854 + }
4855 +diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
4856 +index a4f28b7e4c65..e18877177f1b 100644
4857 +--- a/drivers/scsi/osd/osd_initiator.c
4858 ++++ b/drivers/scsi/osd/osd_initiator.c
4859 +@@ -1576,7 +1576,9 @@ static struct request *_make_request(struct request_queue *q, bool has_write,
4860 + return req;
4861 +
4862 + for_each_bio(bio) {
4863 +- ret = blk_rq_append_bio(req, bio);
4864 ++ struct bio *bounce_bio = bio;
4865 ++
4866 ++ ret = blk_rq_append_bio(req, &bounce_bio);
4867 + if (ret)
4868 + return ERR_PTR(ret);
4869 + }
4870 +diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
4871 +index 93e2c90fa77d..83dc3292e9ab 100644
4872 +--- a/drivers/staging/android/ion/ion.c
4873 ++++ b/drivers/staging/android/ion/ion.c
4874 +@@ -348,7 +348,7 @@ static int ion_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
4875 + mutex_lock(&buffer->lock);
4876 + list_for_each_entry(a, &buffer->attachments, list) {
4877 + dma_sync_sg_for_cpu(a->dev, a->table->sgl, a->table->nents,
4878 +- DMA_BIDIRECTIONAL);
4879 ++ direction);
4880 + }
4881 + mutex_unlock(&buffer->lock);
4882 +
4883 +@@ -370,7 +370,7 @@ static int ion_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
4884 + mutex_lock(&buffer->lock);
4885 + list_for_each_entry(a, &buffer->attachments, list) {
4886 + dma_sync_sg_for_device(a->dev, a->table->sgl, a->table->nents,
4887 +- DMA_BIDIRECTIONAL);
4888 ++ direction);
4889 + }
4890 + mutex_unlock(&buffer->lock);
4891 +
4892 +diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
4893 +index 7c69b4a9694d..0d99b242e82e 100644
4894 +--- a/drivers/target/target_core_pscsi.c
4895 ++++ b/drivers/target/target_core_pscsi.c
4896 +@@ -920,7 +920,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
4897 + " %d i: %d bio: %p, allocating another"
4898 + " bio\n", bio->bi_vcnt, i, bio);
4899 +
4900 +- rc = blk_rq_append_bio(req, bio);
4901 ++ rc = blk_rq_append_bio(req, &bio);
4902 + if (rc) {
4903 + pr_err("pSCSI: failed to append bio\n");
4904 + goto fail;
4905 +@@ -938,7 +938,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
4906 + }
4907 +
4908 + if (bio) {
4909 +- rc = blk_rq_append_bio(req, bio);
4910 ++ rc = blk_rq_append_bio(req, &bio);
4911 + if (rc) {
4912 + pr_err("pSCSI: failed to append bio\n");
4913 + goto fail;
4914 +diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
4915 +index bdf0e6e89991..faf50df81622 100644
4916 +--- a/drivers/tty/n_tty.c
4917 ++++ b/drivers/tty/n_tty.c
4918 +@@ -1764,7 +1764,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
4919 + {
4920 + struct n_tty_data *ldata = tty->disc_data;
4921 +
4922 +- if (!old || (old->c_lflag ^ tty->termios.c_lflag) & ICANON) {
4923 ++ if (!old || (old->c_lflag ^ tty->termios.c_lflag) & (ICANON | EXTPROC)) {
4924 + bitmap_zero(ldata->read_flags, N_TTY_BUF_SIZE);
4925 + ldata->line_start = ldata->read_tail;
4926 + if (!L_ICANON(tty) || !read_cnt(ldata)) {
4927 +@@ -2427,7 +2427,7 @@ static int n_tty_ioctl(struct tty_struct *tty, struct file *file,
4928 + return put_user(tty_chars_in_buffer(tty), (int __user *) arg);
4929 + case TIOCINQ:
4930 + down_write(&tty->termios_rwsem);
4931 +- if (L_ICANON(tty))
4932 ++ if (L_ICANON(tty) && !L_EXTPROC(tty))
4933 + retval = inq_canon(ldata);
4934 + else
4935 + retval = read_cnt(ldata);
4936 +diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
4937 +index f8eba1c5412f..677fa99b7747 100644
4938 +--- a/drivers/tty/tty_buffer.c
4939 ++++ b/drivers/tty/tty_buffer.c
4940 +@@ -446,7 +446,7 @@ EXPORT_SYMBOL_GPL(tty_prepare_flip_string);
4941 + * Callers other than flush_to_ldisc() need to exclude the kworker
4942 + * from concurrent use of the line discipline, see paste_selection().
4943 + *
4944 +- * Returns the number of bytes not processed
4945 ++ * Returns the number of bytes processed
4946 + */
4947 + int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
4948 + char *f, int count)
4949 +diff --git a/drivers/usb/chipidea/ci_hdrc_msm.c b/drivers/usb/chipidea/ci_hdrc_msm.c
4950 +index bb626120296f..53f3bf459dd1 100644
4951 +--- a/drivers/usb/chipidea/ci_hdrc_msm.c
4952 ++++ b/drivers/usb/chipidea/ci_hdrc_msm.c
4953 +@@ -251,7 +251,7 @@ static int ci_hdrc_msm_probe(struct platform_device *pdev)
4954 + if (ret)
4955 + goto err_mux;
4956 +
4957 +- ulpi_node = of_find_node_by_name(of_node_get(pdev->dev.of_node), "ulpi");
4958 ++ ulpi_node = of_get_child_by_name(pdev->dev.of_node, "ulpi");
4959 + if (ulpi_node) {
4960 + phy_node = of_get_next_available_child(ulpi_node, NULL);
4961 + ci->hsic = of_device_is_compatible(phy_node, "qcom,usb-hsic-phy");
4962 +diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
4963 +index 843ef46d2537..9e3355b97396 100644
4964 +--- a/drivers/usb/core/config.c
4965 ++++ b/drivers/usb/core/config.c
4966 +@@ -1007,7 +1007,7 @@ int usb_get_bos_descriptor(struct usb_device *dev)
4967 + case USB_SSP_CAP_TYPE:
4968 + ssp_cap = (struct usb_ssp_cap_descriptor *)buffer;
4969 + ssac = (le32_to_cpu(ssp_cap->bmAttributes) &
4970 +- USB_SSP_SUBLINK_SPEED_ATTRIBS) + 1;
4971 ++ USB_SSP_SUBLINK_SPEED_ATTRIBS);
4972 + if (length >= USB_DT_USB_SSP_CAP_SIZE(ssac))
4973 + dev->bos->ssp_cap = ssp_cap;
4974 + break;
4975 +diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
4976 +index 50010282c010..c05c4f877750 100644
4977 +--- a/drivers/usb/core/quirks.c
4978 ++++ b/drivers/usb/core/quirks.c
4979 +@@ -57,10 +57,11 @@ static const struct usb_device_id usb_quirk_list[] = {
4980 + /* Microsoft LifeCam-VX700 v2.0 */
4981 + { USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME },
4982 +
4983 +- /* Logitech HD Pro Webcams C920, C920-C and C930e */
4984 ++ /* Logitech HD Pro Webcams C920, C920-C, C925e and C930e */
4985 + { USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT },
4986 + { USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT },
4987 + { USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT },
4988 ++ { USB_DEVICE(0x046d, 0x085b), .driver_info = USB_QUIRK_DELAY_INIT },
4989 +
4990 + /* Logitech ConferenceCam CC3000e */
4991 + { USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT },
4992 +@@ -154,6 +155,9 @@ static const struct usb_device_id usb_quirk_list[] = {
4993 + /* Genesys Logic hub, internally used by KY-688 USB 3.1 Type-C Hub */
4994 + { USB_DEVICE(0x05e3, 0x0612), .driver_info = USB_QUIRK_NO_LPM },
4995 +
4996 ++ /* ELSA MicroLink 56K */
4997 ++ { USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_RESET_RESUME },
4998 ++
4999 + /* Genesys Logic hub, internally used by Moshi USB to Ethernet Adapter */
5000 + { USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM },
5001 +
5002 +diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
5003 +index 76f392954733..abb8f19ae40f 100644
5004 +--- a/drivers/usb/host/xhci-pci.c
5005 ++++ b/drivers/usb/host/xhci-pci.c
5006 +@@ -189,6 +189,9 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
5007 + xhci->quirks |= XHCI_TRUST_TX_LENGTH;
5008 + xhci->quirks |= XHCI_BROKEN_STREAMS;
5009 + }
5010 ++ if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
5011 ++ pdev->device == 0x0014)
5012 ++ xhci->quirks |= XHCI_TRUST_TX_LENGTH;
5013 + if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
5014 + pdev->device == 0x0015)
5015 + xhci->quirks |= XHCI_RESET_ON_RESUME;
5016 +diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
5017 +index 49d1b2d4606d..d038e543c246 100644
5018 +--- a/drivers/usb/serial/ftdi_sio.c
5019 ++++ b/drivers/usb/serial/ftdi_sio.c
5020 +@@ -1017,6 +1017,7 @@ static const struct usb_device_id id_table_combined[] = {
5021 + .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
5022 + { USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_BT_USB_PID) },
5023 + { USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_WL_USB_PID) },
5024 ++ { USB_DEVICE(AIRBUS_DS_VID, AIRBUS_DS_P8GR) },
5025 + { } /* Terminating entry */
5026 + };
5027 +
5028 +diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
5029 +index 4faa09fe308c..8b4ecd2bd297 100644
5030 +--- a/drivers/usb/serial/ftdi_sio_ids.h
5031 ++++ b/drivers/usb/serial/ftdi_sio_ids.h
5032 +@@ -914,6 +914,12 @@
5033 + #define ICPDAS_I7561U_PID 0x0104
5034 + #define ICPDAS_I7563U_PID 0x0105
5035 +
5036 ++/*
5037 ++ * Airbus Defence and Space
5038 ++ */
5039 ++#define AIRBUS_DS_VID 0x1e8e /* Vendor ID */
5040 ++#define AIRBUS_DS_P8GR 0x6001 /* Tetra P8GR */
5041 ++
5042 + /*
5043 + * RT Systems programming cables for various ham radios
5044 + */
5045 +diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
5046 +index 54e316b1892d..a9400458ccea 100644
5047 +--- a/drivers/usb/serial/option.c
5048 ++++ b/drivers/usb/serial/option.c
5049 +@@ -236,6 +236,8 @@ static void option_instat_callback(struct urb *urb);
5050 + /* These Quectel products use Qualcomm's vendor ID */
5051 + #define QUECTEL_PRODUCT_UC20 0x9003
5052 + #define QUECTEL_PRODUCT_UC15 0x9090
5053 ++/* These Yuga products use Qualcomm's vendor ID */
5054 ++#define YUGA_PRODUCT_CLM920_NC5 0x9625
5055 +
5056 + #define QUECTEL_VENDOR_ID 0x2c7c
5057 + /* These Quectel products use Quectel's vendor ID */
5058 +@@ -283,6 +285,7 @@ static void option_instat_callback(struct urb *urb);
5059 + #define TELIT_PRODUCT_LE922_USBCFG3 0x1043
5060 + #define TELIT_PRODUCT_LE922_USBCFG5 0x1045
5061 + #define TELIT_PRODUCT_ME910 0x1100
5062 ++#define TELIT_PRODUCT_ME910_DUAL_MODEM 0x1101
5063 + #define TELIT_PRODUCT_LE920 0x1200
5064 + #define TELIT_PRODUCT_LE910 0x1201
5065 + #define TELIT_PRODUCT_LE910_USBCFG4 0x1206
5066 +@@ -648,6 +651,11 @@ static const struct option_blacklist_info telit_me910_blacklist = {
5067 + .reserved = BIT(1) | BIT(3),
5068 + };
5069 +
5070 ++static const struct option_blacklist_info telit_me910_dual_modem_blacklist = {
5071 ++ .sendsetup = BIT(0),
5072 ++ .reserved = BIT(3),
5073 ++};
5074 ++
5075 + static const struct option_blacklist_info telit_le910_blacklist = {
5076 + .sendsetup = BIT(0),
5077 + .reserved = BIT(1) | BIT(2),
5078 +@@ -677,6 +685,10 @@ static const struct option_blacklist_info cinterion_rmnet2_blacklist = {
5079 + .reserved = BIT(4) | BIT(5),
5080 + };
5081 +
5082 ++static const struct option_blacklist_info yuga_clm920_nc5_blacklist = {
5083 ++ .reserved = BIT(1) | BIT(4),
5084 ++};
5085 ++
5086 + static const struct usb_device_id option_ids[] = {
5087 + { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) },
5088 + { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA) },
5089 +@@ -1181,6 +1193,9 @@ static const struct usb_device_id option_ids[] = {
5090 + { USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC15)},
5091 + { USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC20),
5092 + .driver_info = (kernel_ulong_t)&net_intf4_blacklist },
5093 ++ /* Yuga products use Qualcomm vendor ID */
5094 ++ { USB_DEVICE(QUALCOMM_VENDOR_ID, YUGA_PRODUCT_CLM920_NC5),
5095 ++ .driver_info = (kernel_ulong_t)&yuga_clm920_nc5_blacklist },
5096 + /* Quectel products using Quectel vendor ID */
5097 + { USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC21),
5098 + .driver_info = (kernel_ulong_t)&net_intf4_blacklist },
5099 +@@ -1247,6 +1262,8 @@ static const struct usb_device_id option_ids[] = {
5100 + .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 },
5101 + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910),
5102 + .driver_info = (kernel_ulong_t)&telit_me910_blacklist },
5103 ++ { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM),
5104 ++ .driver_info = (kernel_ulong_t)&telit_me910_dual_modem_blacklist },
5105 + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910),
5106 + .driver_info = (kernel_ulong_t)&telit_le910_blacklist },
5107 + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4),
5108 +diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c
5109 +index 9f9d3a904464..55a8fb25ce2b 100644
5110 +--- a/drivers/usb/serial/qcserial.c
5111 ++++ b/drivers/usb/serial/qcserial.c
5112 +@@ -166,6 +166,8 @@ static const struct usb_device_id id_table[] = {
5113 + {DEVICE_SWI(0x1199, 0x9079)}, /* Sierra Wireless EM74xx */
5114 + {DEVICE_SWI(0x1199, 0x907a)}, /* Sierra Wireless EM74xx QDL */
5115 + {DEVICE_SWI(0x1199, 0x907b)}, /* Sierra Wireless EM74xx */
5116 ++ {DEVICE_SWI(0x1199, 0x9090)}, /* Sierra Wireless EM7565 QDL */
5117 ++ {DEVICE_SWI(0x1199, 0x9091)}, /* Sierra Wireless EM7565 */
5118 + {DEVICE_SWI(0x413c, 0x81a2)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */
5119 + {DEVICE_SWI(0x413c, 0x81a3)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */
5120 + {DEVICE_SWI(0x413c, 0x81a4)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */
5121 +@@ -346,6 +348,7 @@ static int qcprobe(struct usb_serial *serial, const struct usb_device_id *id)
5122 + break;
5123 + case 2:
5124 + dev_dbg(dev, "NMEA GPS interface found\n");
5125 ++ sendsetup = true;
5126 + break;
5127 + case 3:
5128 + dev_dbg(dev, "Modem port found\n");
5129 +diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c
5130 +index c653ce533430..720408d39f11 100644
5131 +--- a/drivers/usb/usbip/stub_dev.c
5132 ++++ b/drivers/usb/usbip/stub_dev.c
5133 +@@ -163,8 +163,7 @@ static void stub_shutdown_connection(struct usbip_device *ud)
5134 + * step 1?
5135 + */
5136 + if (ud->tcp_socket) {
5137 +- dev_dbg(&sdev->udev->dev, "shutdown tcp_socket %p\n",
5138 +- ud->tcp_socket);
5139 ++ dev_dbg(&sdev->udev->dev, "shutdown sockfd %d\n", ud->sockfd);
5140 + kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR);
5141 + }
5142 +
5143 +diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c
5144 +index 7170404e8979..6968c906fa29 100644
5145 +--- a/drivers/usb/usbip/stub_main.c
5146 ++++ b/drivers/usb/usbip/stub_main.c
5147 +@@ -251,11 +251,12 @@ void stub_device_cleanup_urbs(struct stub_device *sdev)
5148 + struct stub_priv *priv;
5149 + struct urb *urb;
5150 +
5151 +- dev_dbg(&sdev->udev->dev, "free sdev %p\n", sdev);
5152 ++ dev_dbg(&sdev->udev->dev, "Stub device cleaning up urbs\n");
5153 +
5154 + while ((priv = stub_priv_pop(sdev))) {
5155 + urb = priv->urb;
5156 +- dev_dbg(&sdev->udev->dev, "free urb %p\n", urb);
5157 ++ dev_dbg(&sdev->udev->dev, "free urb seqnum %lu\n",
5158 ++ priv->seqnum);
5159 + usb_kill_urb(urb);
5160 +
5161 + kmem_cache_free(stub_priv_cache, priv);
5162 +diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c
5163 +index 283a9be77a22..5b807185f79e 100644
5164 +--- a/drivers/usb/usbip/stub_rx.c
5165 ++++ b/drivers/usb/usbip/stub_rx.c
5166 +@@ -225,9 +225,6 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
5167 + if (priv->seqnum != pdu->u.cmd_unlink.seqnum)
5168 + continue;
5169 +
5170 +- dev_info(&priv->urb->dev->dev, "unlink urb %p\n",
5171 +- priv->urb);
5172 +-
5173 + /*
5174 + * This matched urb is not completed yet (i.e., be in
5175 + * flight in usb hcd hardware/driver). Now we are
5176 +@@ -266,8 +263,8 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
5177 + ret = usb_unlink_urb(priv->urb);
5178 + if (ret != -EINPROGRESS)
5179 + dev_err(&priv->urb->dev->dev,
5180 +- "failed to unlink a urb %p, ret %d\n",
5181 +- priv->urb, ret);
5182 ++ "failed to unlink a urb # %lu, ret %d\n",
5183 ++ priv->seqnum, ret);
5184 +
5185 + return 0;
5186 + }
5187 +diff --git a/drivers/usb/usbip/stub_tx.c b/drivers/usb/usbip/stub_tx.c
5188 +index 87ff94be4235..96aa375b80d9 100644
5189 +--- a/drivers/usb/usbip/stub_tx.c
5190 ++++ b/drivers/usb/usbip/stub_tx.c
5191 +@@ -102,7 +102,7 @@ void stub_complete(struct urb *urb)
5192 + /* link a urb to the queue of tx. */
5193 + spin_lock_irqsave(&sdev->priv_lock, flags);
5194 + if (sdev->ud.tcp_socket == NULL) {
5195 +- usbip_dbg_stub_tx("ignore urb for closed connection %p", urb);
5196 ++ usbip_dbg_stub_tx("ignore urb for closed connection\n");
5197 + /* It will be freed in stub_device_cleanup_urbs(). */
5198 + } else if (priv->unlinking) {
5199 + stub_enqueue_ret_unlink(sdev, priv->seqnum, urb->status);
5200 +@@ -204,8 +204,8 @@ static int stub_send_ret_submit(struct stub_device *sdev)
5201 +
5202 + /* 1. setup usbip_header */
5203 + setup_ret_submit_pdu(&pdu_header, urb);
5204 +- usbip_dbg_stub_tx("setup txdata seqnum: %d urb: %p\n",
5205 +- pdu_header.base.seqnum, urb);
5206 ++ usbip_dbg_stub_tx("setup txdata seqnum: %d\n",
5207 ++ pdu_header.base.seqnum);
5208 + usbip_header_correct_endian(&pdu_header, 1);
5209 +
5210 + iov[iovnum].iov_base = &pdu_header;
5211 +diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c
5212 +index 2281f3562870..17b599b923f3 100644
5213 +--- a/drivers/usb/usbip/usbip_common.c
5214 ++++ b/drivers/usb/usbip/usbip_common.c
5215 +@@ -331,26 +331,20 @@ int usbip_recv(struct socket *sock, void *buf, int size)
5216 + struct msghdr msg = {.msg_flags = MSG_NOSIGNAL};
5217 + int total = 0;
5218 +
5219 ++ if (!sock || !buf || !size)
5220 ++ return -EINVAL;
5221 ++
5222 + iov_iter_kvec(&msg.msg_iter, READ|ITER_KVEC, &iov, 1, size);
5223 +
5224 + usbip_dbg_xmit("enter\n");
5225 +
5226 +- if (!sock || !buf || !size) {
5227 +- pr_err("invalid arg, sock %p buff %p size %d\n", sock, buf,
5228 +- size);
5229 +- return -EINVAL;
5230 +- }
5231 +-
5232 + do {
5233 +- int sz = msg_data_left(&msg);
5234 ++ msg_data_left(&msg);
5235 + sock->sk->sk_allocation = GFP_NOIO;
5236 +
5237 + result = sock_recvmsg(sock, &msg, MSG_WAITALL);
5238 +- if (result <= 0) {
5239 +- pr_debug("receive sock %p buf %p size %u ret %d total %d\n",
5240 +- sock, buf + total, sz, result, total);
5241 ++ if (result <= 0)
5242 + goto err;
5243 +- }
5244 +
5245 + total += result;
5246 + } while (msg_data_left(&msg));
5247 +diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c
5248 +index 1f0cf81cc145..692cfdef667e 100644
5249 +--- a/drivers/usb/usbip/vhci_hcd.c
5250 ++++ b/drivers/usb/usbip/vhci_hcd.c
5251 +@@ -670,9 +670,6 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
5252 + struct vhci_device *vdev;
5253 + unsigned long flags;
5254 +
5255 +- usbip_dbg_vhci_hc("enter, usb_hcd %p urb %p mem_flags %d\n",
5256 +- hcd, urb, mem_flags);
5257 +-
5258 + if (portnum > VHCI_HC_PORTS) {
5259 + pr_err("invalid port number %d\n", portnum);
5260 + return -ENODEV;
5261 +@@ -836,8 +833,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
5262 + struct vhci_device *vdev;
5263 + unsigned long flags;
5264 +
5265 +- pr_info("dequeue a urb %p\n", urb);
5266 +-
5267 + spin_lock_irqsave(&vhci->lock, flags);
5268 +
5269 + priv = urb->hcpriv;
5270 +@@ -865,7 +860,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
5271 + /* tcp connection is closed */
5272 + spin_lock(&vdev->priv_lock);
5273 +
5274 +- pr_info("device %p seems to be disconnected\n", vdev);
5275 + list_del(&priv->list);
5276 + kfree(priv);
5277 + urb->hcpriv = NULL;
5278 +@@ -877,8 +871,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
5279 + * vhci_rx will receive RET_UNLINK and give back the URB.
5280 + * Otherwise, we give back it here.
5281 + */
5282 +- pr_info("gives back urb %p\n", urb);
5283 +-
5284 + usb_hcd_unlink_urb_from_ep(hcd, urb);
5285 +
5286 + spin_unlock_irqrestore(&vhci->lock, flags);
5287 +@@ -906,8 +898,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
5288 +
5289 + unlink->unlink_seqnum = priv->seqnum;
5290 +
5291 +- pr_info("device %p seems to be still connected\n", vdev);
5292 +-
5293 + /* send cmd_unlink and try to cancel the pending URB in the
5294 + * peer */
5295 + list_add_tail(&unlink->list, &vdev->unlink_tx);
5296 +@@ -989,7 +979,7 @@ static void vhci_shutdown_connection(struct usbip_device *ud)
5297 +
5298 + /* need this? see stub_dev.c */
5299 + if (ud->tcp_socket) {
5300 +- pr_debug("shutdown tcp_socket %p\n", ud->tcp_socket);
5301 ++ pr_debug("shutdown tcp_socket %d\n", ud->sockfd);
5302 + kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR);
5303 + }
5304 +
5305 +diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c
5306 +index ef2f2d5ca6b2..1343037d00f9 100644
5307 +--- a/drivers/usb/usbip/vhci_rx.c
5308 ++++ b/drivers/usb/usbip/vhci_rx.c
5309 +@@ -37,24 +37,23 @@ struct urb *pickup_urb_and_free_priv(struct vhci_device *vdev, __u32 seqnum)
5310 + urb = priv->urb;
5311 + status = urb->status;
5312 +
5313 +- usbip_dbg_vhci_rx("find urb %p vurb %p seqnum %u\n",
5314 +- urb, priv, seqnum);
5315 ++ usbip_dbg_vhci_rx("find urb seqnum %u\n", seqnum);
5316 +
5317 + switch (status) {
5318 + case -ENOENT:
5319 + /* fall through */
5320 + case -ECONNRESET:
5321 +- dev_info(&urb->dev->dev,
5322 +- "urb %p was unlinked %ssynchronuously.\n", urb,
5323 +- status == -ENOENT ? "" : "a");
5324 ++ dev_dbg(&urb->dev->dev,
5325 ++ "urb seq# %u was unlinked %ssynchronuously\n",
5326 ++ seqnum, status == -ENOENT ? "" : "a");
5327 + break;
5328 + case -EINPROGRESS:
5329 + /* no info output */
5330 + break;
5331 + default:
5332 +- dev_info(&urb->dev->dev,
5333 +- "urb %p may be in a error, status %d\n", urb,
5334 +- status);
5335 ++ dev_dbg(&urb->dev->dev,
5336 ++ "urb seq# %u may be in a error, status %d\n",
5337 ++ seqnum, status);
5338 + }
5339 +
5340 + list_del(&priv->list);
5341 +@@ -81,8 +80,8 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
5342 + spin_unlock_irqrestore(&vdev->priv_lock, flags);
5343 +
5344 + if (!urb) {
5345 +- pr_err("cannot find a urb of seqnum %u\n", pdu->base.seqnum);
5346 +- pr_info("max seqnum %d\n",
5347 ++ pr_err("cannot find a urb of seqnum %u max seqnum %d\n",
5348 ++ pdu->base.seqnum,
5349 + atomic_read(&vhci_hcd->seqnum));
5350 + usbip_event_add(ud, VDEV_EVENT_ERROR_TCP);
5351 + return;
5352 +@@ -105,7 +104,7 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
5353 + if (usbip_dbg_flag_vhci_rx)
5354 + usbip_dump_urb(urb);
5355 +
5356 +- usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
5357 ++ usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum);
5358 +
5359 + spin_lock_irqsave(&vhci->lock, flags);
5360 + usb_hcd_unlink_urb_from_ep(vhci_hcd_to_hcd(vhci_hcd), urb);
5361 +@@ -172,7 +171,7 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev,
5362 + pr_info("the urb (seqnum %d) was already given back\n",
5363 + pdu->base.seqnum);
5364 + } else {
5365 +- usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
5366 ++ usbip_dbg_vhci_rx("now giveback urb %d\n", pdu->base.seqnum);
5367 +
5368 + /* If unlink is successful, status is -ECONNRESET */
5369 + urb->status = pdu->u.ret_unlink.status;
5370 +diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c
5371 +index 3e7878fe2fd4..a9a663a578b6 100644
5372 +--- a/drivers/usb/usbip/vhci_tx.c
5373 ++++ b/drivers/usb/usbip/vhci_tx.c
5374 +@@ -83,7 +83,8 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev)
5375 + memset(&msg, 0, sizeof(msg));
5376 + memset(&iov, 0, sizeof(iov));
5377 +
5378 +- usbip_dbg_vhci_tx("setup txdata urb %p\n", urb);
5379 ++ usbip_dbg_vhci_tx("setup txdata urb seqnum %lu\n",
5380 ++ priv->seqnum);
5381 +
5382 + /* 1. setup usbip_header */
5383 + setup_cmd_submit_pdu(&pdu_header, urb);
5384 +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
5385 +index fd47bd96b5d3..6362e3606aa5 100644
5386 +--- a/include/linux/blkdev.h
5387 ++++ b/include/linux/blkdev.h
5388 +@@ -241,14 +241,24 @@ struct request {
5389 + struct request *next_rq;
5390 + };
5391 +
5392 ++static inline bool blk_op_is_scsi(unsigned int op)
5393 ++{
5394 ++ return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT;
5395 ++}
5396 ++
5397 ++static inline bool blk_op_is_private(unsigned int op)
5398 ++{
5399 ++ return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
5400 ++}
5401 ++
5402 + static inline bool blk_rq_is_scsi(struct request *rq)
5403 + {
5404 +- return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT;
5405 ++ return blk_op_is_scsi(req_op(rq));
5406 + }
5407 +
5408 + static inline bool blk_rq_is_private(struct request *rq)
5409 + {
5410 +- return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT;
5411 ++ return blk_op_is_private(req_op(rq));
5412 + }
5413 +
5414 + static inline bool blk_rq_is_passthrough(struct request *rq)
5415 +@@ -256,6 +266,13 @@ static inline bool blk_rq_is_passthrough(struct request *rq)
5416 + return blk_rq_is_scsi(rq) || blk_rq_is_private(rq);
5417 + }
5418 +
5419 ++static inline bool bio_is_passthrough(struct bio *bio)
5420 ++{
5421 ++ unsigned op = bio_op(bio);
5422 ++
5423 ++ return blk_op_is_scsi(op) || blk_op_is_private(op);
5424 ++}
5425 ++
5426 + static inline unsigned short req_get_ioprio(struct request *req)
5427 + {
5428 + return req->ioprio;
5429 +@@ -952,7 +969,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
5430 + extern void blk_rq_unprep_clone(struct request *rq);
5431 + extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
5432 + struct request *rq);
5433 +-extern int blk_rq_append_bio(struct request *rq, struct bio *bio);
5434 ++extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
5435 + extern void blk_delay_queue(struct request_queue *, unsigned long);
5436 + extern void blk_queue_split(struct request_queue *, struct bio **);
5437 + extern void blk_recount_segments(struct request_queue *, struct bio *);
5438 +diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
5439 +index 2477a5cb5bd5..fb83dee528a1 100644
5440 +--- a/include/linux/cpuhotplug.h
5441 ++++ b/include/linux/cpuhotplug.h
5442 +@@ -86,7 +86,7 @@ enum cpuhp_state {
5443 + CPUHP_MM_ZSWP_POOL_PREPARE,
5444 + CPUHP_KVM_PPC_BOOK3S_PREPARE,
5445 + CPUHP_ZCOMP_PREPARE,
5446 +- CPUHP_TIMERS_DEAD,
5447 ++ CPUHP_TIMERS_PREPARE,
5448 + CPUHP_MIPS_SOC_PREPARE,
5449 + CPUHP_BP_PREPARE_DYN,
5450 + CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20,
5451 +diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
5452 +index ea04ca024f0d..067a6fa675ed 100644
5453 +--- a/include/linux/ipv6.h
5454 ++++ b/include/linux/ipv6.h
5455 +@@ -272,7 +272,8 @@ struct ipv6_pinfo {
5456 + * 100: prefer care-of address
5457 + */
5458 + dontfrag:1,
5459 +- autoflowlabel:1;
5460 ++ autoflowlabel:1,
5461 ++ autoflowlabel_set:1;
5462 + __u8 min_hopcount;
5463 + __u8 tclass;
5464 + __be32 rcv_flowinfo;
5465 +diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
5466 +index 401c8972cc3a..8b3d0103c03a 100644
5467 +--- a/include/linux/mlx5/driver.h
5468 ++++ b/include/linux/mlx5/driver.h
5469 +@@ -546,6 +546,7 @@ struct mlx5_core_sriov {
5470 + };
5471 +
5472 + struct mlx5_irq_info {
5473 ++ cpumask_var_t mask;
5474 + char name[MLX5_MAX_IRQ_NAME];
5475 + };
5476 +
5477 +diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
5478 +index 69772347f866..c8091f06eaa4 100644
5479 +--- a/include/linux/mlx5/mlx5_ifc.h
5480 ++++ b/include/linux/mlx5/mlx5_ifc.h
5481 +@@ -147,7 +147,7 @@ enum {
5482 + MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771,
5483 + MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772,
5484 + MLX5_CMD_OP_QUERY_Q_COUNTER = 0x773,
5485 +- MLX5_CMD_OP_SET_RATE_LIMIT = 0x780,
5486 ++ MLX5_CMD_OP_SET_PP_RATE_LIMIT = 0x780,
5487 + MLX5_CMD_OP_QUERY_RATE_LIMIT = 0x781,
5488 + MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT = 0x782,
5489 + MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT = 0x783,
5490 +@@ -7233,7 +7233,7 @@ struct mlx5_ifc_add_vxlan_udp_dport_in_bits {
5491 + u8 vxlan_udp_port[0x10];
5492 + };
5493 +
5494 +-struct mlx5_ifc_set_rate_limit_out_bits {
5495 ++struct mlx5_ifc_set_pp_rate_limit_out_bits {
5496 + u8 status[0x8];
5497 + u8 reserved_at_8[0x18];
5498 +
5499 +@@ -7242,7 +7242,7 @@ struct mlx5_ifc_set_rate_limit_out_bits {
5500 + u8 reserved_at_40[0x40];
5501 + };
5502 +
5503 +-struct mlx5_ifc_set_rate_limit_in_bits {
5504 ++struct mlx5_ifc_set_pp_rate_limit_in_bits {
5505 + u8 opcode[0x10];
5506 + u8 reserved_at_10[0x10];
5507 +
5508 +@@ -7255,6 +7255,8 @@ struct mlx5_ifc_set_rate_limit_in_bits {
5509 + u8 reserved_at_60[0x20];
5510 +
5511 + u8 rate_limit[0x20];
5512 ++
5513 ++ u8 reserved_at_a0[0x160];
5514 + };
5515 +
5516 + struct mlx5_ifc_access_register_out_bits {
5517 +diff --git a/include/linux/pti.h b/include/linux/pti.h
5518 +new file mode 100644
5519 +index 000000000000..0174883a935a
5520 +--- /dev/null
5521 ++++ b/include/linux/pti.h
5522 +@@ -0,0 +1,11 @@
5523 ++// SPDX-License-Identifier: GPL-2.0
5524 ++#ifndef _INCLUDE_PTI_H
5525 ++#define _INCLUDE_PTI_H
5526 ++
5527 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
5528 ++#include <asm/pti.h>
5529 ++#else
5530 ++static inline void pti_init(void) { }
5531 ++#endif
5532 ++
5533 ++#endif
5534 +diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
5535 +index 37b4bb2545b3..6866df4f31b5 100644
5536 +--- a/include/linux/ptr_ring.h
5537 ++++ b/include/linux/ptr_ring.h
5538 +@@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
5539 +
5540 + /* Note: callers invoking this in a loop must use a compiler barrier,
5541 + * for example cpu_relax(). Callers must hold producer_lock.
5542 ++ * Callers are responsible for making sure the pointer that is being queued
5543 ++ * points to valid data.
5544 + */
5545 + static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
5546 + {
5547 + if (unlikely(!r->size) || r->queue[r->producer])
5548 + return -ENOSPC;
5549 +
5550 ++ /* Make sure the pointer we are storing points to valid data. */
5551 ++ /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */
5552 ++ smp_wmb();
5553 ++
5554 + r->queue[r->producer++] = ptr;
5555 + if (unlikely(r->producer >= r->size))
5556 + r->producer = 0;
5557 +@@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
5558 + if (ptr)
5559 + __ptr_ring_discard_one(r);
5560 +
5561 ++ /* Make sure anyone accessing data through the pointer is up to date. */
5562 ++ /* Pairs with smp_wmb in __ptr_ring_produce. */
5563 ++ smp_read_barrier_depends();
5564 + return ptr;
5565 + }
5566 +
5567 +diff --git a/include/linux/tcp.h b/include/linux/tcp.h
5568 +index 4aa40ef02d32..e8418fc77a43 100644
5569 +--- a/include/linux/tcp.h
5570 ++++ b/include/linux/tcp.h
5571 +@@ -214,7 +214,8 @@ struct tcp_sock {
5572 + u8 chrono_type:2, /* current chronograph type */
5573 + rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
5574 + fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
5575 +- unused:4;
5576 ++ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */
5577 ++ unused:3;
5578 + u8 nonagle : 4,/* Disable Nagle algorithm? */
5579 + thin_lto : 1,/* Use linear timeouts for thin streams */
5580 + unused1 : 1,
5581 +diff --git a/include/linux/tick.h b/include/linux/tick.h
5582 +index cf413b344ddb..5cdac11dd317 100644
5583 +--- a/include/linux/tick.h
5584 ++++ b/include/linux/tick.h
5585 +@@ -119,6 +119,7 @@ extern void tick_nohz_idle_exit(void);
5586 + extern void tick_nohz_irq_exit(void);
5587 + extern ktime_t tick_nohz_get_sleep_length(void);
5588 + extern unsigned long tick_nohz_get_idle_calls(void);
5589 ++extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
5590 + extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
5591 + extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
5592 + #else /* !CONFIG_NO_HZ_COMMON */
5593 +diff --git a/include/linux/timer.h b/include/linux/timer.h
5594 +index ac66f29c6916..e0ea1fe87572 100644
5595 +--- a/include/linux/timer.h
5596 ++++ b/include/linux/timer.h
5597 +@@ -246,9 +246,11 @@ unsigned long round_jiffies_up(unsigned long j);
5598 + unsigned long round_jiffies_up_relative(unsigned long j);
5599 +
5600 + #ifdef CONFIG_HOTPLUG_CPU
5601 ++int timers_prepare_cpu(unsigned int cpu);
5602 + int timers_dead_cpu(unsigned int cpu);
5603 + #else
5604 +-#define timers_dead_cpu NULL
5605 ++#define timers_prepare_cpu NULL
5606 ++#define timers_dead_cpu NULL
5607 + #endif
5608 +
5609 + #endif
5610 +diff --git a/include/net/ip.h b/include/net/ip.h
5611 +index 9896f46cbbf1..af8addbaa3c1 100644
5612 +--- a/include/net/ip.h
5613 ++++ b/include/net/ip.h
5614 +@@ -34,6 +34,7 @@
5615 + #include <net/flow_dissector.h>
5616 +
5617 + #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */
5618 ++#define IPV4_MIN_MTU 68 /* RFC 791 */
5619 +
5620 + struct sock;
5621 +
5622 +diff --git a/include/net/tcp.h b/include/net/tcp.h
5623 +index 6ced69940f5c..0a13574134b8 100644
5624 +--- a/include/net/tcp.h
5625 ++++ b/include/net/tcp.h
5626 +@@ -1085,7 +1085,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
5627 + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
5628 + struct rate_sample *rs);
5629 + void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
5630 +- struct rate_sample *rs);
5631 ++ bool is_sack_reneg, struct rate_sample *rs);
5632 + void tcp_rate_check_app_limited(struct sock *sk);
5633 +
5634 + /* These functions determine how the current flow behaves in respect of SACK
5635 +diff --git a/init/main.c b/init/main.c
5636 +index 8a390f60ec81..b32ec72cdf3d 100644
5637 +--- a/init/main.c
5638 ++++ b/init/main.c
5639 +@@ -75,6 +75,7 @@
5640 + #include <linux/slab.h>
5641 + #include <linux/perf_event.h>
5642 + #include <linux/ptrace.h>
5643 ++#include <linux/pti.h>
5644 + #include <linux/blkdev.h>
5645 + #include <linux/elevator.h>
5646 + #include <linux/sched_clock.h>
5647 +@@ -506,6 +507,8 @@ static void __init mm_init(void)
5648 + ioremap_huge_init();
5649 + /* Should be run before the first non-init thread is created */
5650 + init_espfix_bsp();
5651 ++ /* Should be run after espfix64 is set up. */
5652 ++ pti_init();
5653 + }
5654 +
5655 + asmlinkage __visible void __init start_kernel(void)
5656 +diff --git a/kernel/cpu.c b/kernel/cpu.c
5657 +index 7891aecc6aec..f21bfa3172d8 100644
5658 +--- a/kernel/cpu.c
5659 ++++ b/kernel/cpu.c
5660 +@@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
5661 + * before blk_mq_queue_reinit_notify() from notify_dead(),
5662 + * otherwise a RCU stall occurs.
5663 + */
5664 +- [CPUHP_TIMERS_DEAD] = {
5665 ++ [CPUHP_TIMERS_PREPARE] = {
5666 + .name = "timers:dead",
5667 +- .startup.single = NULL,
5668 ++ .startup.single = timers_prepare_cpu,
5669 + .teardown.single = timers_dead_cpu,
5670 + },
5671 + /* Kicks the plugged cpu into life */
5672 +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
5673 +index 2f52ec0f1539..d6717a3331a1 100644
5674 +--- a/kernel/sched/cpufreq_schedutil.c
5675 ++++ b/kernel/sched/cpufreq_schedutil.c
5676 +@@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
5677 + #ifdef CONFIG_NO_HZ_COMMON
5678 + static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
5679 + {
5680 +- unsigned long idle_calls = tick_nohz_get_idle_calls();
5681 ++ unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
5682 + bool ret = idle_calls == sg_cpu->saved_idle_calls;
5683 +
5684 + sg_cpu->saved_idle_calls = idle_calls;
5685 +diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
5686 +index c7a899c5ce64..dfa4a117fee3 100644
5687 +--- a/kernel/time/tick-sched.c
5688 ++++ b/kernel/time/tick-sched.c
5689 +@@ -674,6 +674,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
5690 + ts->next_tick = 0;
5691 + }
5692 +
5693 ++static inline bool local_timer_softirq_pending(void)
5694 ++{
5695 ++ return local_softirq_pending() & TIMER_SOFTIRQ;
5696 ++}
5697 ++
5698 + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
5699 + ktime_t now, int cpu)
5700 + {
5701 +@@ -690,8 +695,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
5702 + } while (read_seqretry(&jiffies_lock, seq));
5703 + ts->last_jiffies = basejiff;
5704 +
5705 +- if (rcu_needs_cpu(basemono, &next_rcu) ||
5706 +- arch_needs_cpu() || irq_work_needs_cpu()) {
5707 ++ /*
5708 ++ * Keep the periodic tick, when RCU, architecture or irq_work
5709 ++ * requests it.
5710 ++ * Aside from that, check whether the local timer softirq is
5711 ++ * pending. If so, it's a bad idea to call get_next_timer_interrupt()
5712 ++ * because there is an already expired timer, so it will request
5713 ++ * immediate expiry, which rearms the hardware timer with a
5714 ++ * minimal delta which brings us back to this place
5715 ++ * immediately. Lather, rinse and repeat...
5716 ++ */
5717 ++ if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
5718 ++ irq_work_needs_cpu() || local_timer_softirq_pending()) {
5719 + next_tick = basemono + TICK_NSEC;
5720 + } else {
5721 + /*
5722 +@@ -1009,6 +1024,19 @@ ktime_t tick_nohz_get_sleep_length(void)
5723 + return ts->sleep_length;
5724 + }
5725 +
5726 ++/**
5727 ++ * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
5728 ++ * for a particular CPU.
5729 ++ *
5730 ++ * Called from the schedutil frequency scaling governor in scheduler context.
5731 ++ */
5732 ++unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
5733 ++{
5734 ++ struct tick_sched *ts = tick_get_tick_sched(cpu);
5735 ++
5736 ++ return ts->idle_calls;
5737 ++}
5738 ++
5739 + /**
5740 + * tick_nohz_get_idle_calls - return the current idle calls counter value
5741 + *
5742 +diff --git a/kernel/time/timer.c b/kernel/time/timer.c
5743 +index f2674a056c26..73e3cdbc61f1 100644
5744 +--- a/kernel/time/timer.c
5745 ++++ b/kernel/time/timer.c
5746 +@@ -814,11 +814,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
5747 + struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
5748 +
5749 + /*
5750 +- * If the timer is deferrable and nohz is active then we need to use
5751 +- * the deferrable base.
5752 ++ * If the timer is deferrable and NO_HZ_COMMON is set then we need
5753 ++ * to use the deferrable base.
5754 + */
5755 +- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
5756 +- (tflags & TIMER_DEFERRABLE))
5757 ++ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
5758 + base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
5759 + return base;
5760 + }
5761 +@@ -828,11 +827,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
5762 + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
5763 +
5764 + /*
5765 +- * If the timer is deferrable and nohz is active then we need to use
5766 +- * the deferrable base.
5767 ++ * If the timer is deferrable and NO_HZ_COMMON is set then we need
5768 ++ * to use the deferrable base.
5769 + */
5770 +- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
5771 +- (tflags & TIMER_DEFERRABLE))
5772 ++ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
5773 + base = this_cpu_ptr(&timer_bases[BASE_DEF]);
5774 + return base;
5775 + }
5776 +@@ -984,8 +982,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
5777 + if (!ret && pending_only)
5778 + goto out_unlock;
5779 +
5780 +- debug_activate(timer, expires);
5781 +-
5782 + new_base = get_target_base(base, timer->flags);
5783 +
5784 + if (base != new_base) {
5785 +@@ -1009,6 +1005,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
5786 + }
5787 + }
5788 +
5789 ++ debug_activate(timer, expires);
5790 ++
5791 + timer->expires = expires;
5792 + /*
5793 + * If 'idx' was calculated above and the base time did not advance
5794 +@@ -1644,7 +1642,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
5795 + base->must_forward_clk = false;
5796 +
5797 + __run_timers(base);
5798 +- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
5799 ++ if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
5800 + __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
5801 + }
5802 +
5803 +@@ -1803,6 +1801,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
5804 + }
5805 + }
5806 +
5807 ++int timers_prepare_cpu(unsigned int cpu)
5808 ++{
5809 ++ struct timer_base *base;
5810 ++ int b;
5811 ++
5812 ++ for (b = 0; b < NR_BASES; b++) {
5813 ++ base = per_cpu_ptr(&timer_bases[b], cpu);
5814 ++ base->clk = jiffies;
5815 ++ base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
5816 ++ base->is_idle = false;
5817 ++ base->must_forward_clk = true;
5818 ++ }
5819 ++ return 0;
5820 ++}
5821 ++
5822 + int timers_dead_cpu(unsigned int cpu)
5823 + {
5824 + struct timer_base *old_base;
5825 +diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
5826 +index 81279c6602ff..0476a9372014 100644
5827 +--- a/kernel/trace/ring_buffer.c
5828 ++++ b/kernel/trace/ring_buffer.c
5829 +@@ -281,6 +281,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
5830 + /* Missed count stored at end */
5831 + #define RB_MISSED_STORED (1 << 30)
5832 +
5833 ++#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
5834 ++
5835 + struct buffer_data_page {
5836 + u64 time_stamp; /* page time stamp */
5837 + local_t commit; /* write committed index */
5838 +@@ -332,7 +334,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
5839 + */
5840 + size_t ring_buffer_page_len(void *page)
5841 + {
5842 +- return local_read(&((struct buffer_data_page *)page)->commit)
5843 ++ struct buffer_data_page *bpage = page;
5844 ++
5845 ++ return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
5846 + + BUF_PAGE_HDR_SIZE;
5847 + }
5848 +
5849 +@@ -4439,8 +4443,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
5850 + {
5851 + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
5852 + struct buffer_data_page *bpage = data;
5853 ++ struct page *page = virt_to_page(bpage);
5854 + unsigned long flags;
5855 +
5856 ++ /* If the page is still in use someplace else, we can't reuse it */
5857 ++ if (page_ref_count(page) > 1)
5858 ++ goto out;
5859 ++
5860 + local_irq_save(flags);
5861 + arch_spin_lock(&cpu_buffer->lock);
5862 +
5863 +@@ -4452,6 +4461,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
5864 + arch_spin_unlock(&cpu_buffer->lock);
5865 + local_irq_restore(flags);
5866 +
5867 ++ out:
5868 + free_page((unsigned long)bpage);
5869 + }
5870 + EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
5871 +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
5872 +index 80de14973b42..76bcc80b893e 100644
5873 +--- a/kernel/trace/trace.c
5874 ++++ b/kernel/trace/trace.c
5875 +@@ -6769,7 +6769,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5876 + .spd_release = buffer_spd_release,
5877 + };
5878 + struct buffer_ref *ref;
5879 +- int entries, size, i;
5880 ++ int entries, i;
5881 + ssize_t ret = 0;
5882 +
5883 + #ifdef CONFIG_TRACER_MAX_TRACE
5884 +@@ -6823,14 +6823,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5885 + break;
5886 + }
5887 +
5888 +- /*
5889 +- * zero out any left over data, this is going to
5890 +- * user land.
5891 +- */
5892 +- size = ring_buffer_page_len(ref->page);
5893 +- if (size < PAGE_SIZE)
5894 +- memset(ref->page + size, 0, PAGE_SIZE - size);
5895 +-
5896 + page = virt_to_page(ref->page);
5897 +
5898 + spd.pages[i] = page;
5899 +@@ -7588,6 +7580,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
5900 + buf->data = alloc_percpu(struct trace_array_cpu);
5901 + if (!buf->data) {
5902 + ring_buffer_free(buf->buffer);
5903 ++ buf->buffer = NULL;
5904 + return -ENOMEM;
5905 + }
5906 +
5907 +@@ -7611,7 +7604,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
5908 + allocate_snapshot ? size : 1);
5909 + if (WARN_ON(ret)) {
5910 + ring_buffer_free(tr->trace_buffer.buffer);
5911 ++ tr->trace_buffer.buffer = NULL;
5912 + free_percpu(tr->trace_buffer.data);
5913 ++ tr->trace_buffer.data = NULL;
5914 + return -ENOMEM;
5915 + }
5916 + tr->allocated_snapshot = allocate_snapshot;
5917 +diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
5918 +index de2152730809..08190db0a2dc 100644
5919 +--- a/net/bridge/br_netlink.c
5920 ++++ b/net/bridge/br_netlink.c
5921 +@@ -1223,19 +1223,20 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev,
5922 + struct net_bridge *br = netdev_priv(dev);
5923 + int err;
5924 +
5925 ++ err = register_netdevice(dev);
5926 ++ if (err)
5927 ++ return err;
5928 ++
5929 + if (tb[IFLA_ADDRESS]) {
5930 + spin_lock_bh(&br->lock);
5931 + br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
5932 + spin_unlock_bh(&br->lock);
5933 + }
5934 +
5935 +- err = register_netdevice(dev);
5936 +- if (err)
5937 +- return err;
5938 +-
5939 + err = br_changelink(dev, tb, data, extack);
5940 + if (err)
5941 +- unregister_netdevice(dev);
5942 ++ br_dev_delete(dev, NULL);
5943 ++
5944 + return err;
5945 + }
5946 +
5947 +diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
5948 +index 6cfdc7c84c48..0dd6359e5924 100644
5949 +--- a/net/core/net_namespace.c
5950 ++++ b/net/core/net_namespace.c
5951 +@@ -266,7 +266,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
5952 + spin_lock_bh(&net->nsid_lock);
5953 + peer = idr_find(&net->netns_ids, id);
5954 + if (peer)
5955 +- get_net(peer);
5956 ++ peer = maybe_get_net(peer);
5957 + spin_unlock_bh(&net->nsid_lock);
5958 + rcu_read_unlock();
5959 +
5960 +diff --git a/net/core/skbuff.c b/net/core/skbuff.c
5961 +index e140ba49b30a..15fa5baa8fae 100644
5962 +--- a/net/core/skbuff.c
5963 ++++ b/net/core/skbuff.c
5964 +@@ -1181,12 +1181,12 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
5965 + int i, new_frags;
5966 + u32 d_off;
5967 +
5968 +- if (!num_frags)
5969 +- return 0;
5970 +-
5971 + if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
5972 + return -EINVAL;
5973 +
5974 ++ if (!num_frags)
5975 ++ goto release;
5976 ++
5977 + new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
5978 + for (i = 0; i < new_frags; i++) {
5979 + page = alloc_page(gfp_mask);
5980 +@@ -1242,6 +1242,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
5981 + __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
5982 + skb_shinfo(skb)->nr_frags = new_frags;
5983 +
5984 ++release:
5985 + skb_zcopy_clear(skb, false);
5986 + return 0;
5987 + }
5988 +@@ -3657,8 +3658,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
5989 +
5990 + skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
5991 + SKBTX_SHARED_FRAG;
5992 +- if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
5993 +- goto err;
5994 +
5995 + while (pos < offset + len) {
5996 + if (i >= nfrags) {
5997 +@@ -3684,6 +3683,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
5998 +
5999 + if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
6000 + goto err;
6001 ++ if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
6002 ++ goto err;
6003 +
6004 + *nskb_frag = *frag;
6005 + __skb_frag_ref(nskb_frag);
6006 +@@ -4296,7 +4297,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
6007 + struct sock *sk = skb->sk;
6008 +
6009 + if (!skb_may_tx_timestamp(sk, false))
6010 +- return;
6011 ++ goto err;
6012 +
6013 + /* Take a reference to prevent skb_orphan() from freeing the socket,
6014 + * but only if the socket refcount is not zero.
6015 +@@ -4305,7 +4306,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
6016 + *skb_hwtstamps(skb) = *hwtstamps;
6017 + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
6018 + sock_put(sk);
6019 ++ return;
6020 + }
6021 ++
6022 ++err:
6023 ++ kfree_skb(skb);
6024 + }
6025 + EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
6026 +
6027 +diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
6028 +index d7adc0616599..bffa88ecc534 100644
6029 +--- a/net/ipv4/devinet.c
6030 ++++ b/net/ipv4/devinet.c
6031 +@@ -1420,7 +1420,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
6032 +
6033 + static bool inetdev_valid_mtu(unsigned int mtu)
6034 + {
6035 +- return mtu >= 68;
6036 ++ return mtu >= IPV4_MIN_MTU;
6037 + }
6038 +
6039 + static void inetdev_send_gratuitous_arp(struct net_device *dev,
6040 +diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
6041 +index 37819ab4cc74..d72874150905 100644
6042 +--- a/net/ipv4/fib_frontend.c
6043 ++++ b/net/ipv4/fib_frontend.c
6044 +@@ -1274,14 +1274,19 @@ static int __net_init ip_fib_net_init(struct net *net)
6045 +
6046 + static void ip_fib_net_exit(struct net *net)
6047 + {
6048 +- unsigned int i;
6049 ++ int i;
6050 +
6051 + rtnl_lock();
6052 + #ifdef CONFIG_IP_MULTIPLE_TABLES
6053 + RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
6054 + RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
6055 + #endif
6056 +- for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
6057 ++ /* Destroy the tables in reverse order to guarantee that the
6058 ++ * local table, ID 255, is destroyed before the main table, ID
6059 ++ * 254. This is necessary as the local table may contain
6060 ++ * references to data contained in the main table.
6061 ++ */
6062 ++ for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
6063 + struct hlist_head *head = &net->ipv4.fib_table_hash[i];
6064 + struct hlist_node *tmp;
6065 + struct fib_table *tb;
6066 +diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
6067 +index 01ed22139ac2..aff3751df950 100644
6068 +--- a/net/ipv4/fib_semantics.c
6069 ++++ b/net/ipv4/fib_semantics.c
6070 +@@ -706,7 +706,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
6071 +
6072 + nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
6073 + int type = nla_type(nla);
6074 +- u32 val;
6075 ++ u32 fi_val, val;
6076 +
6077 + if (!type)
6078 + continue;
6079 +@@ -723,7 +723,11 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
6080 + val = nla_get_u32(nla);
6081 + }
6082 +
6083 +- if (fi->fib_metrics->metrics[type - 1] != val)
6084 ++ fi_val = fi->fib_metrics->metrics[type - 1];
6085 ++ if (type == RTAX_FEATURES)
6086 ++ fi_val &= ~DST_FEATURE_ECN_CA;
6087 ++
6088 ++ if (fi_val != val)
6089 + return false;
6090 + }
6091 +
6092 +diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
6093 +index ab183af0b5b6..c621266e0306 100644
6094 +--- a/net/ipv4/igmp.c
6095 ++++ b/net/ipv4/igmp.c
6096 +@@ -89,6 +89,7 @@
6097 + #include <linux/rtnetlink.h>
6098 + #include <linux/times.h>
6099 + #include <linux/pkt_sched.h>
6100 ++#include <linux/byteorder/generic.h>
6101 +
6102 + #include <net/net_namespace.h>
6103 + #include <net/arp.h>
6104 +@@ -321,6 +322,23 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
6105 + return scount;
6106 + }
6107 +
6108 ++/* source address selection per RFC 3376 section 4.2.13 */
6109 ++static __be32 igmpv3_get_srcaddr(struct net_device *dev,
6110 ++ const struct flowi4 *fl4)
6111 ++{
6112 ++ struct in_device *in_dev = __in_dev_get_rcu(dev);
6113 ++
6114 ++ if (!in_dev)
6115 ++ return htonl(INADDR_ANY);
6116 ++
6117 ++ for_ifa(in_dev) {
6118 ++ if (inet_ifa_match(fl4->saddr, ifa))
6119 ++ return fl4->saddr;
6120 ++ } endfor_ifa(in_dev);
6121 ++
6122 ++ return htonl(INADDR_ANY);
6123 ++}
6124 ++
6125 + static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
6126 + {
6127 + struct sk_buff *skb;
6128 +@@ -368,7 +386,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
6129 + pip->frag_off = htons(IP_DF);
6130 + pip->ttl = 1;
6131 + pip->daddr = fl4.daddr;
6132 +- pip->saddr = fl4.saddr;
6133 ++ pip->saddr = igmpv3_get_srcaddr(dev, &fl4);
6134 + pip->protocol = IPPROTO_IGMP;
6135 + pip->tot_len = 0; /* filled in later */
6136 + ip_select_ident(net, skb, NULL);
6137 +@@ -404,16 +422,17 @@ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
6138 + }
6139 +
6140 + static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
6141 +- int type, struct igmpv3_grec **ppgr)
6142 ++ int type, struct igmpv3_grec **ppgr, unsigned int mtu)
6143 + {
6144 + struct net_device *dev = pmc->interface->dev;
6145 + struct igmpv3_report *pih;
6146 + struct igmpv3_grec *pgr;
6147 +
6148 +- if (!skb)
6149 +- skb = igmpv3_newpack(dev, dev->mtu);
6150 +- if (!skb)
6151 +- return NULL;
6152 ++ if (!skb) {
6153 ++ skb = igmpv3_newpack(dev, mtu);
6154 ++ if (!skb)
6155 ++ return NULL;
6156 ++ }
6157 + pgr = skb_put(skb, sizeof(struct igmpv3_grec));
6158 + pgr->grec_type = type;
6159 + pgr->grec_auxwords = 0;
6160 +@@ -436,12 +455,17 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
6161 + struct igmpv3_grec *pgr = NULL;
6162 + struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
6163 + int scount, stotal, first, isquery, truncate;
6164 ++ unsigned int mtu;
6165 +
6166 + if (pmc->multiaddr == IGMP_ALL_HOSTS)
6167 + return skb;
6168 + if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
6169 + return skb;
6170 +
6171 ++ mtu = READ_ONCE(dev->mtu);
6172 ++ if (mtu < IPV4_MIN_MTU)
6173 ++ return skb;
6174 ++
6175 + isquery = type == IGMPV3_MODE_IS_INCLUDE ||
6176 + type == IGMPV3_MODE_IS_EXCLUDE;
6177 + truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
6178 +@@ -462,7 +486,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
6179 + AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
6180 + if (skb)
6181 + igmpv3_sendpack(skb);
6182 +- skb = igmpv3_newpack(dev, dev->mtu);
6183 ++ skb = igmpv3_newpack(dev, mtu);
6184 + }
6185 + }
6186 + first = 1;
6187 +@@ -498,12 +522,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
6188 + pgr->grec_nsrcs = htons(scount);
6189 + if (skb)
6190 + igmpv3_sendpack(skb);
6191 +- skb = igmpv3_newpack(dev, dev->mtu);
6192 ++ skb = igmpv3_newpack(dev, mtu);
6193 + first = 1;
6194 + scount = 0;
6195 + }
6196 + if (first) {
6197 +- skb = add_grhead(skb, pmc, type, &pgr);
6198 ++ skb = add_grhead(skb, pmc, type, &pgr, mtu);
6199 + first = 0;
6200 + }
6201 + if (!skb)
6202 +@@ -538,7 +562,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
6203 + igmpv3_sendpack(skb);
6204 + skb = NULL; /* add_grhead will get a new one */
6205 + }
6206 +- skb = add_grhead(skb, pmc, type, &pgr);
6207 ++ skb = add_grhead(skb, pmc, type, &pgr, mtu);
6208 + }
6209 + }
6210 + if (pgr)
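
As an aside, not from the patch: the igmp.c change reads dev->mtu once (READ_ONCE()), rejects anything below IPV4_MIN_MTU, and hands that single snapshot to igmpv3_newpack() and add_grhead(), so a concurrent MTU update cannot size different packets of one report inconsistently. READ_ONCE() is kernel-only; a rough userspace analogue of the snapshot-and-validate pattern, with hypothetical names (dev_mtu, build_report), might look like:

#include <stdatomic.h>
#include <stdio.h>

#define MIN_MTU 68  /* mirrors IPV4_MIN_MTU (RFC 791 minimum) */

/* Shared value another thread may update concurrently. */
static _Atomic unsigned int dev_mtu = 1500;

static void build_report(void)
{
	/* Snapshot once; every later decision uses the same value. */
	unsigned int mtu = atomic_load_explicit(&dev_mtu, memory_order_relaxed);

	if (mtu < MIN_MTU) {
		puts("mtu too small, skip");
		return;
	}
	/* ... pass `mtu` down instead of re-reading dev_mtu ... */
	printf("sizing every packet of this report with mtu=%u\n", mtu);
}

int main(void)
{
	build_report();
	return 0;
}
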
6211 +diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
6212 +index e9805ad664ac..4e90082b23a6 100644
6213 +--- a/net/ipv4/ip_tunnel.c
6214 ++++ b/net/ipv4/ip_tunnel.c
6215 +@@ -349,8 +349,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
6216 + dev->needed_headroom = t_hlen + hlen;
6217 + mtu -= (dev->hard_header_len + t_hlen);
6218 +
6219 +- if (mtu < 68)
6220 +- mtu = 68;
6221 ++ if (mtu < IPV4_MIN_MTU)
6222 ++ mtu = IPV4_MIN_MTU;
6223 +
6224 + return mtu;
6225 + }
6226 +diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
6227 +index 33b70bfd1122..125c1eab3eaa 100644
6228 +--- a/net/ipv4/raw.c
6229 ++++ b/net/ipv4/raw.c
6230 +@@ -513,11 +513,16 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
6231 + int err;
6232 + struct ip_options_data opt_copy;
6233 + struct raw_frag_vec rfv;
6234 ++ int hdrincl;
6235 +
6236 + err = -EMSGSIZE;
6237 + if (len > 0xFFFF)
6238 + goto out;
6239 +
6240 ++ /* hdrincl should be READ_ONCE(inet->hdrincl)
6241 ++ * but READ_ONCE() doesn't work with bit fields
6242 ++ */
6243 ++ hdrincl = inet->hdrincl;
6244 + /*
6245 + * Check the flags.
6246 + */
6247 +@@ -593,7 +598,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
6248 + /* Linux does not mangle headers on raw sockets,
6249 + * so that IP options + IP_HDRINCL is non-sense.
6250 + */
6251 +- if (inet->hdrincl)
6252 ++ if (hdrincl)
6253 + goto done;
6254 + if (ipc.opt->opt.srr) {
6255 + if (!daddr)
6256 +@@ -615,12 +620,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
6257 +
6258 + flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
6259 + RT_SCOPE_UNIVERSE,
6260 +- inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
6261 ++ hdrincl ? IPPROTO_RAW : sk->sk_protocol,
6262 + inet_sk_flowi_flags(sk) |
6263 +- (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
6264 ++ (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
6265 + daddr, saddr, 0, 0, sk->sk_uid);
6266 +
6267 +- if (!inet->hdrincl) {
6268 ++ if (!hdrincl) {
6269 + rfv.msg = msg;
6270 + rfv.hlen = 0;
6271 +
6272 +@@ -645,7 +650,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
6273 + goto do_confirm;
6274 + back_from_confirm:
6275 +
6276 +- if (inet->hdrincl)
6277 ++ if (hdrincl)
6278 + err = raw_send_hdrinc(sk, &fl4, msg, len,
6279 + &rt, msg->msg_flags, &ipc.sockc);
6280 +
6281 +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
6282 +index 5091402720ab..a0c72b09cefc 100644
6283 +--- a/net/ipv4/tcp.c
6284 ++++ b/net/ipv4/tcp.c
6285 +@@ -2356,6 +2356,7 @@ int tcp_disconnect(struct sock *sk, int flags)
6286 + tp->snd_cwnd_cnt = 0;
6287 + tp->window_clamp = 0;
6288 + tcp_set_ca_state(sk, TCP_CA_Open);
6289 ++ tp->is_sack_reneg = 0;
6290 + tcp_clear_retrans(tp);
6291 + inet_csk_delack_init(sk);
6292 + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
6293 +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
6294 +index 69ee877574d0..8322f26e770e 100644
6295 +--- a/net/ipv4/tcp_bbr.c
6296 ++++ b/net/ipv4/tcp_bbr.c
6297 +@@ -110,7 +110,8 @@ struct bbr {
6298 + u32 lt_last_lost; /* LT intvl start: tp->lost */
6299 + u32 pacing_gain:10, /* current gain for setting pacing rate */
6300 + cwnd_gain:10, /* current gain for setting cwnd */
6301 +- full_bw_cnt:3, /* number of rounds without large bw gains */
6302 ++ full_bw_reached:1, /* reached full bw in Startup? */
6303 ++ full_bw_cnt:2, /* number of rounds without large bw gains */
6304 + cycle_idx:3, /* current index in pacing_gain cycle array */
6305 + has_seen_rtt:1, /* have we seen an RTT sample yet? */
6306 + unused_b:5;
6307 +@@ -180,7 +181,7 @@ static bool bbr_full_bw_reached(const struct sock *sk)
6308 + {
6309 + const struct bbr *bbr = inet_csk_ca(sk);
6310 +
6311 +- return bbr->full_bw_cnt >= bbr_full_bw_cnt;
6312 ++ return bbr->full_bw_reached;
6313 + }
6314 +
6315 + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
6316 +@@ -717,6 +718,7 @@ static void bbr_check_full_bw_reached(struct sock *sk,
6317 + return;
6318 + }
6319 + ++bbr->full_bw_cnt;
6320 ++ bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
6321 + }
6322 +
6323 + /* If pipe is probably full, drain the queue and then enter steady-state. */
6324 +@@ -850,6 +852,7 @@ static void bbr_init(struct sock *sk)
6325 + bbr->restore_cwnd = 0;
6326 + bbr->round_start = 0;
6327 + bbr->idle_restart = 0;
6328 ++ bbr->full_bw_reached = 0;
6329 + bbr->full_bw = 0;
6330 + bbr->full_bw_cnt = 0;
6331 + bbr->cycle_mstamp = 0;
6332 +@@ -871,6 +874,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk)
6333 + */
6334 + static u32 bbr_undo_cwnd(struct sock *sk)
6335 + {
6336 ++ struct bbr *bbr = inet_csk_ca(sk);
6337 ++
6338 ++ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
6339 ++ bbr->full_bw_cnt = 0;
6340 ++ bbr_reset_lt_bw_sampling(sk);
6341 + return tcp_sk(sk)->snd_cwnd;
6342 + }
6343 +
6344 +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
6345 +index c5447b9f8517..ff48ac654e5a 100644
6346 +--- a/net/ipv4/tcp_input.c
6347 ++++ b/net/ipv4/tcp_input.c
6348 +@@ -521,9 +521,6 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
6349 + u32 new_sample = tp->rcv_rtt_est.rtt_us;
6350 + long m = sample;
6351 +
6352 +- if (m == 0)
6353 +- m = 1;
6354 +-
6355 + if (new_sample != 0) {
6356 + /* If we sample in larger samples in the non-timestamp
6357 + * case, we could grossly overestimate the RTT especially
6358 +@@ -560,6 +557,8 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
6359 + if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
6360 + return;
6361 + delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
6362 ++ if (!delta_us)
6363 ++ delta_us = 1;
6364 + tcp_rcv_rtt_update(tp, delta_us, 1);
6365 +
6366 + new_measure:
6367 +@@ -576,8 +575,11 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
6368 + (TCP_SKB_CB(skb)->end_seq -
6369 + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) {
6370 + u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
6371 +- u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
6372 ++ u32 delta_us;
6373 +
6374 ++ if (!delta)
6375 ++ delta = 1;
6376 ++ delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
6377 + tcp_rcv_rtt_update(tp, delta_us, 0);
6378 + }
6379 + }
6380 +@@ -1975,6 +1977,8 @@ void tcp_enter_loss(struct sock *sk)
6381 + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
6382 + tp->sacked_out = 0;
6383 + tp->fackets_out = 0;
6384 ++ /* Mark SACK reneging until we recover from this loss event. */
6385 ++ tp->is_sack_reneg = 1;
6386 + }
6387 + tcp_clear_all_retrans_hints(tp);
6388 +
6389 +@@ -2428,6 +2432,7 @@ static bool tcp_try_undo_recovery(struct sock *sk)
6390 + return true;
6391 + }
6392 + tcp_set_ca_state(sk, TCP_CA_Open);
6393 ++ tp->is_sack_reneg = 0;
6394 + return false;
6395 + }
6396 +
6397 +@@ -2459,8 +2464,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
6398 + NET_INC_STATS(sock_net(sk),
6399 + LINUX_MIB_TCPSPURIOUSRTOS);
6400 + inet_csk(sk)->icsk_retransmits = 0;
6401 +- if (frto_undo || tcp_is_sack(tp))
6402 ++ if (frto_undo || tcp_is_sack(tp)) {
6403 + tcp_set_ca_state(sk, TCP_CA_Open);
6404 ++ tp->is_sack_reneg = 0;
6405 ++ }
6406 + return true;
6407 + }
6408 + return false;
6409 +@@ -3551,6 +3558,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
6410 + struct tcp_sacktag_state sack_state;
6411 + struct rate_sample rs = { .prior_delivered = 0 };
6412 + u32 prior_snd_una = tp->snd_una;
6413 ++ bool is_sack_reneg = tp->is_sack_reneg;
6414 + u32 ack_seq = TCP_SKB_CB(skb)->seq;
6415 + u32 ack = TCP_SKB_CB(skb)->ack_seq;
6416 + bool is_dupack = false;
6417 +@@ -3666,7 +3674,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
6418 +
6419 + delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
6420 + lost = tp->lost - lost; /* freshly marked lost */
6421 +- tcp_rate_gen(sk, delivered, lost, sack_state.rate);
6422 ++ tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
6423 + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
6424 + tcp_xmit_recovery(sk, rexmit);
6425 + return 1;
6426 +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
6427 +index 5a5ed4f14678..cab4b935e474 100644
6428 +--- a/net/ipv4/tcp_ipv4.c
6429 ++++ b/net/ipv4/tcp_ipv4.c
6430 +@@ -844,7 +844,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
6431 + tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
6432 + req->ts_recent,
6433 + 0,
6434 +- tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
6435 ++ tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
6436 + AF_INET),
6437 + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
6438 + ip_hdr(skb)->tos);
6439 +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
6440 +index 3330a370d306..c61240e43923 100644
6441 +--- a/net/ipv4/tcp_rate.c
6442 ++++ b/net/ipv4/tcp_rate.c
6443 +@@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
6444 +
6445 + /* Update the connection delivery information and generate a rate sample. */
6446 + void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
6447 +- struct rate_sample *rs)
6448 ++ bool is_sack_reneg, struct rate_sample *rs)
6449 + {
6450 + struct tcp_sock *tp = tcp_sk(sk);
6451 + u32 snd_us, ack_us;
6452 +@@ -124,8 +124,12 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
6453 +
6454 + rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
6455 + rs->losses = lost; /* freshly marked lost */
6456 +- /* Return an invalid sample if no timing information is available. */
6457 +- if (!rs->prior_mstamp) {
6458 ++ /* Return an invalid sample if no timing information is available or
6459 ++ * in recovery from loss with SACK reneging. Rate samples taken during
6460 ++ * a SACK reneging event may overestimate bw by including packets that
6461 ++ * were SACKed before the reneg.
6462 ++ */
6463 ++ if (!rs->prior_mstamp || is_sack_reneg) {
6464 + rs->delivered = -1;
6465 + rs->interval_us = -1;
6466 + return;
6467 +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
6468 +index 655dd8d7f064..e9af1879cd53 100644
6469 +--- a/net/ipv4/tcp_timer.c
6470 ++++ b/net/ipv4/tcp_timer.c
6471 +@@ -264,6 +264,7 @@ void tcp_delack_timer_handler(struct sock *sk)
6472 + icsk->icsk_ack.pingpong = 0;
6473 + icsk->icsk_ack.ato = TCP_ATO_MIN;
6474 + }
6475 ++ tcp_mstamp_refresh(tcp_sk(sk));
6476 + tcp_send_ack(sk);
6477 + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
6478 + }
6479 +@@ -627,6 +628,7 @@ static void tcp_keepalive_timer (unsigned long data)
6480 + goto out;
6481 + }
6482 +
6483 ++ tcp_mstamp_refresh(tp);
6484 + if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
6485 + if (tp->linger2 >= 0) {
6486 + const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
6487 +diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
6488 +index 2ec39404c449..c5318f5f6a14 100644
6489 +--- a/net/ipv6/addrconf.c
6490 ++++ b/net/ipv6/addrconf.c
6491 +@@ -231,7 +231,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
6492 + .proxy_ndp = 0,
6493 + .accept_source_route = 0, /* we do not accept RH0 by default. */
6494 + .disable_ipv6 = 0,
6495 +- .accept_dad = 1,
6496 ++ .accept_dad = 0,
6497 + .suppress_frag_ndisc = 1,
6498 + .accept_ra_mtu = 1,
6499 + .stable_secret = {
6500 +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
6501 +index fe5262fd6aa5..bcbd5f3bf8bd 100644
6502 +--- a/net/ipv6/af_inet6.c
6503 ++++ b/net/ipv6/af_inet6.c
6504 +@@ -210,7 +210,6 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
6505 + np->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
6506 + np->mc_loop = 1;
6507 + np->pmtudisc = IPV6_PMTUDISC_WANT;
6508 +- np->autoflowlabel = ip6_default_np_autolabel(net);
6509 + np->repflow = net->ipv6.sysctl.flowlabel_reflect;
6510 + sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
6511 +
6512 +diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
6513 +index 5d6bee070871..7a2df6646486 100644
6514 +--- a/net/ipv6/ip6_gre.c
6515 ++++ b/net/ipv6/ip6_gre.c
6516 +@@ -1020,6 +1020,36 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
6517 + eth_random_addr(dev->perm_addr);
6518 + }
6519 +
6520 ++#define GRE6_FEATURES (NETIF_F_SG | \
6521 ++ NETIF_F_FRAGLIST | \
6522 ++ NETIF_F_HIGHDMA | \
6523 ++ NETIF_F_HW_CSUM)
6524 ++
6525 ++static void ip6gre_tnl_init_features(struct net_device *dev)
6526 ++{
6527 ++ struct ip6_tnl *nt = netdev_priv(dev);
6528 ++
6529 ++ dev->features |= GRE6_FEATURES;
6530 ++ dev->hw_features |= GRE6_FEATURES;
6531 ++
6532 ++ if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
6533 ++ /* TCP offload with GRE SEQ is not supported, nor
6534 ++ * can we support 2 levels of outer headers requiring
6535 ++ * an update.
6536 ++ */
6537 ++ if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
6538 ++ nt->encap.type == TUNNEL_ENCAP_NONE) {
6539 ++ dev->features |= NETIF_F_GSO_SOFTWARE;
6540 ++ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
6541 ++ }
6542 ++
6543 ++ /* Can use a lockless transmit, unless we generate
6544 ++ * output sequences
6545 ++ */
6546 ++ dev->features |= NETIF_F_LLTX;
6547 ++ }
6548 ++}
6549 ++
6550 + static int ip6gre_tunnel_init_common(struct net_device *dev)
6551 + {
6552 + struct ip6_tnl *tunnel;
6553 +@@ -1054,6 +1084,8 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
6554 + if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
6555 + dev->mtu -= 8;
6556 +
6557 ++ ip6gre_tnl_init_features(dev);
6558 ++
6559 + return 0;
6560 + }
6561 +
6562 +@@ -1302,11 +1334,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
6563 + .ndo_get_iflink = ip6_tnl_get_iflink,
6564 + };
6565 +
6566 +-#define GRE6_FEATURES (NETIF_F_SG | \
6567 +- NETIF_F_FRAGLIST | \
6568 +- NETIF_F_HIGHDMA | \
6569 +- NETIF_F_HW_CSUM)
6570 +-
6571 + static void ip6gre_tap_setup(struct net_device *dev)
6572 + {
6573 +
6574 +@@ -1386,26 +1413,6 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
6575 + nt->net = dev_net(dev);
6576 + ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
6577 +
6578 +- dev->features |= GRE6_FEATURES;
6579 +- dev->hw_features |= GRE6_FEATURES;
6580 +-
6581 +- if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
6582 +- /* TCP offload with GRE SEQ is not supported, nor
6583 +- * can we support 2 levels of outer headers requiring
6584 +- * an update.
6585 +- */
6586 +- if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
6587 +- (nt->encap.type == TUNNEL_ENCAP_NONE)) {
6588 +- dev->features |= NETIF_F_GSO_SOFTWARE;
6589 +- dev->hw_features |= NETIF_F_GSO_SOFTWARE;
6590 +- }
6591 +-
6592 +- /* Can use a lockless transmit, unless we generate
6593 +- * output sequences
6594 +- */
6595 +- dev->features |= NETIF_F_LLTX;
6596 +- }
6597 +-
6598 + err = register_netdevice(dev);
6599 + if (err)
6600 + goto out;
6601 +diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
6602 +index 5110a418cc4d..f7dd51c42314 100644
6603 +--- a/net/ipv6/ip6_output.c
6604 ++++ b/net/ipv6/ip6_output.c
6605 +@@ -166,6 +166,14 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
6606 + !(IP6CB(skb)->flags & IP6SKB_REROUTED));
6607 + }
6608 +
6609 ++static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
6610 ++{
6611 ++ if (!np->autoflowlabel_set)
6612 ++ return ip6_default_np_autolabel(net);
6613 ++ else
6614 ++ return np->autoflowlabel;
6615 ++}
6616 ++
6617 + /*
6618 + * xmit an sk_buff (used by TCP, SCTP and DCCP)
6619 + * Note : socket lock is not held for SYNACK packets, but might be modified
6620 +@@ -230,7 +238,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
6621 + hlimit = ip6_dst_hoplimit(dst);
6622 +
6623 + ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
6624 +- np->autoflowlabel, fl6));
6625 ++ ip6_autoflowlabel(net, np), fl6));
6626 +
6627 + hdr->payload_len = htons(seg_len);
6628 + hdr->nexthdr = proto;
6629 +@@ -1626,7 +1634,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
6630 +
6631 + ip6_flow_hdr(hdr, v6_cork->tclass,
6632 + ip6_make_flowlabel(net, skb, fl6->flowlabel,
6633 +- np->autoflowlabel, fl6));
6634 ++ ip6_autoflowlabel(net, np), fl6));
6635 + hdr->hop_limit = v6_cork->hop_limit;
6636 + hdr->nexthdr = proto;
6637 + hdr->saddr = fl6->saddr;
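
Illustration only, not taken from the patch: together with the af_inet6.c and ipv6_sockglue.c hunks, the change above makes the flow-label choice fall back to the per-namespace default unless the application explicitly set IPV6_AUTOFLOWLABEL (recorded in autoflowlabel_set). A small sketch of that explicit-override-else-inherited-default pattern, with hypothetical types (struct netns, struct socket):

#include <stdbool.h>
#include <stdio.h>

struct netns  { bool autoflowlabel_default; };
struct socket {
	bool autoflowlabel;      /* meaningful only when ..._set is true */
	bool autoflowlabel_set;  /* did the app call the setsockopt? */
};

static bool effective_autoflowlabel(const struct netns *net,
				    const struct socket *sk)
{
	/* Explicit per-socket choice wins; otherwise inherit the default. */
	return sk->autoflowlabel_set ? sk->autoflowlabel
				     : net->autoflowlabel_default;
}

int main(void)
{
	struct netns net = { .autoflowlabel_default = true };
	struct socket a = { 0 };                         /* never set: inherits */
	struct socket b = { .autoflowlabel = false,
			    .autoflowlabel_set = true }; /* explicitly opted out */

	printf("a: %d  b: %d\n",
	       effective_autoflowlabel(&net, &a),
	       effective_autoflowlabel(&net, &b));
	return 0;
}
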
6638 +diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
6639 +index a1c24443cd9e..ef958d50746b 100644
6640 +--- a/net/ipv6/ip6_tunnel.c
6641 ++++ b/net/ipv6/ip6_tunnel.c
6642 +@@ -912,7 +912,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
6643 + if (t->parms.collect_md) {
6644 + tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
6645 + if (!tun_dst)
6646 +- return 0;
6647 ++ goto drop;
6648 + }
6649 + ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
6650 + log_ecn_error);
6651 +diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
6652 +index a5e466d4e093..90dbfa78a390 100644
6653 +--- a/net/ipv6/ipv6_sockglue.c
6654 ++++ b/net/ipv6/ipv6_sockglue.c
6655 +@@ -878,6 +878,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
6656 + break;
6657 + case IPV6_AUTOFLOWLABEL:
6658 + np->autoflowlabel = valbool;
6659 ++ np->autoflowlabel_set = 1;
6660 + retv = 0;
6661 + break;
6662 + case IPV6_RECVFRAGSIZE:
6663 +diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
6664 +index 12b7c27ce5ce..9a38a2c641fa 100644
6665 +--- a/net/ipv6/mcast.c
6666 ++++ b/net/ipv6/mcast.c
6667 +@@ -1682,16 +1682,16 @@ static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel)
6668 + }
6669 +
6670 + static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6671 +- int type, struct mld2_grec **ppgr)
6672 ++ int type, struct mld2_grec **ppgr, unsigned int mtu)
6673 + {
6674 +- struct net_device *dev = pmc->idev->dev;
6675 + struct mld2_report *pmr;
6676 + struct mld2_grec *pgr;
6677 +
6678 +- if (!skb)
6679 +- skb = mld_newpack(pmc->idev, dev->mtu);
6680 +- if (!skb)
6681 +- return NULL;
6682 ++ if (!skb) {
6683 ++ skb = mld_newpack(pmc->idev, mtu);
6684 ++ if (!skb)
6685 ++ return NULL;
6686 ++ }
6687 + pgr = skb_put(skb, sizeof(struct mld2_grec));
6688 + pgr->grec_type = type;
6689 + pgr->grec_auxwords = 0;
6690 +@@ -1714,10 +1714,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6691 + struct mld2_grec *pgr = NULL;
6692 + struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list;
6693 + int scount, stotal, first, isquery, truncate;
6694 ++ unsigned int mtu;
6695 +
6696 + if (pmc->mca_flags & MAF_NOREPORT)
6697 + return skb;
6698 +
6699 ++ mtu = READ_ONCE(dev->mtu);
6700 ++ if (mtu < IPV6_MIN_MTU)
6701 ++ return skb;
6702 ++
6703 + isquery = type == MLD2_MODE_IS_INCLUDE ||
6704 + type == MLD2_MODE_IS_EXCLUDE;
6705 + truncate = type == MLD2_MODE_IS_EXCLUDE ||
6706 +@@ -1738,7 +1743,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6707 + AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
6708 + if (skb)
6709 + mld_sendpack(skb);
6710 +- skb = mld_newpack(idev, dev->mtu);
6711 ++ skb = mld_newpack(idev, mtu);
6712 + }
6713 + }
6714 + first = 1;
6715 +@@ -1774,12 +1779,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6716 + pgr->grec_nsrcs = htons(scount);
6717 + if (skb)
6718 + mld_sendpack(skb);
6719 +- skb = mld_newpack(idev, dev->mtu);
6720 ++ skb = mld_newpack(idev, mtu);
6721 + first = 1;
6722 + scount = 0;
6723 + }
6724 + if (first) {
6725 +- skb = add_grhead(skb, pmc, type, &pgr);
6726 ++ skb = add_grhead(skb, pmc, type, &pgr, mtu);
6727 + first = 0;
6728 + }
6729 + if (!skb)
6730 +@@ -1814,7 +1819,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6731 + mld_sendpack(skb);
6732 + skb = NULL; /* add_grhead will get a new one */
6733 + }
6734 +- skb = add_grhead(skb, pmc, type, &pgr);
6735 ++ skb = add_grhead(skb, pmc, type, &pgr, mtu);
6736 + }
6737 + }
6738 + if (pgr)
6739 +diff --git a/net/ipv6/route.c b/net/ipv6/route.c
6740 +index 598efa8cfe25..ca8d3266e92e 100644
6741 +--- a/net/ipv6/route.c
6742 ++++ b/net/ipv6/route.c
6743 +@@ -3700,19 +3700,13 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
6744 + if (!ipv6_addr_any(&fl6.saddr))
6745 + flags |= RT6_LOOKUP_F_HAS_SADDR;
6746 +
6747 +- if (!fibmatch)
6748 +- dst = ip6_route_input_lookup(net, dev, &fl6, flags);
6749 +- else
6750 +- dst = ip6_route_lookup(net, &fl6, 0);
6751 ++ dst = ip6_route_input_lookup(net, dev, &fl6, flags);
6752 +
6753 + rcu_read_unlock();
6754 + } else {
6755 + fl6.flowi6_oif = oif;
6756 +
6757 +- if (!fibmatch)
6758 +- dst = ip6_route_output(net, NULL, &fl6);
6759 +- else
6760 +- dst = ip6_route_lookup(net, &fl6, 0);
6761 ++ dst = ip6_route_output(net, NULL, &fl6);
6762 + }
6763 +
6764 +
6765 +@@ -3729,6 +3723,15 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
6766 + goto errout;
6767 + }
6768 +
6769 ++ if (fibmatch && rt->dst.from) {
6770 ++ struct rt6_info *ort = container_of(rt->dst.from,
6771 ++ struct rt6_info, dst);
6772 ++
6773 ++ dst_hold(&ort->dst);
6774 ++ ip6_rt_put(rt);
6775 ++ rt = ort;
6776 ++ }
6777 ++
6778 + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
6779 + if (!skb) {
6780 + ip6_rt_put(rt);
6781 +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
6782 +index 32ded300633d..237cc6187c5a 100644
6783 +--- a/net/ipv6/tcp_ipv6.c
6784 ++++ b/net/ipv6/tcp_ipv6.c
6785 +@@ -988,7 +988,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
6786 + req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
6787 + tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
6788 + req->ts_recent, sk->sk_bound_dev_if,
6789 +- tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
6790 ++ tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
6791 + 0, 0);
6792 + }
6793 +
6794 +diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
6795 +index 15c99dfa3d72..aac9d68b4636 100644
6796 +--- a/net/netlink/af_netlink.c
6797 ++++ b/net/netlink/af_netlink.c
6798 +@@ -254,6 +254,9 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
6799 + struct sock *sk = skb->sk;
6800 + int ret = -ENOMEM;
6801 +
6802 ++ if (!net_eq(dev_net(dev), sock_net(sk)))
6803 ++ return 0;
6804 ++
6805 + dev_hold(dev);
6806 +
6807 + if (is_vmalloc_addr(skb->head))
6808 +diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
6809 +index cfb652a4e007..dbe1079a1651 100644
6810 +--- a/net/openvswitch/flow.c
6811 ++++ b/net/openvswitch/flow.c
6812 +@@ -532,6 +532,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
6813 + return -EINVAL;
6814 +
6815 + skb_reset_network_header(skb);
6816 ++ key->eth.type = skb->protocol;
6817 + } else {
6818 + eth = eth_hdr(skb);
6819 + ether_addr_copy(key->eth.src, eth->h_source);
6820 +@@ -545,15 +546,23 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
6821 + if (unlikely(parse_vlan(skb, key)))
6822 + return -ENOMEM;
6823 +
6824 +- skb->protocol = parse_ethertype(skb);
6825 +- if (unlikely(skb->protocol == htons(0)))
6826 ++ key->eth.type = parse_ethertype(skb);
6827 ++ if (unlikely(key->eth.type == htons(0)))
6828 + return -ENOMEM;
6829 +
6830 ++ /* Multiple tagged packets need to retain TPID to satisfy
6831 ++ * skb_vlan_pop(), which will later shift the ethertype into
6832 ++ * skb->protocol.
6833 ++ */
6834 ++ if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT))
6835 ++ skb->protocol = key->eth.cvlan.tpid;
6836 ++ else
6837 ++ skb->protocol = key->eth.type;
6838 ++
6839 + skb_reset_network_header(skb);
6840 + __skb_push(skb, skb->data - skb_mac_header(skb));
6841 + }
6842 + skb_reset_mac_len(skb);
6843 +- key->eth.type = skb->protocol;
6844 +
6845 + /* Network layer. */
6846 + if (key->eth.type == htons(ETH_P_IP)) {
6847 +diff --git a/net/rds/send.c b/net/rds/send.c
6848 +index b52cdc8ae428..f72466c63f0c 100644
6849 +--- a/net/rds/send.c
6850 ++++ b/net/rds/send.c
6851 +@@ -1009,6 +1009,9 @@ static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
6852 + continue;
6853 +
6854 + if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
6855 ++ if (cmsg->cmsg_len <
6856 ++ CMSG_LEN(sizeof(struct rds_rdma_args)))
6857 ++ return -EINVAL;
6858 + args = CMSG_DATA(cmsg);
6859 + *rdma_bytes += args->remote_vec.bytes;
6860 + }
6861 +diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
6862 +index 44de4ee51ce9..a08a32fa0949 100644
6863 +--- a/net/sched/sch_ingress.c
6864 ++++ b/net/sched/sch_ingress.c
6865 +@@ -59,11 +59,12 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt)
6866 + struct net_device *dev = qdisc_dev(sch);
6867 + int err;
6868 +
6869 ++ net_inc_ingress_queue();
6870 ++
6871 + err = tcf_block_get(&q->block, &dev->ingress_cl_list);
6872 + if (err)
6873 + return err;
6874 +
6875 +- net_inc_ingress_queue();
6876 + sch->flags |= TCQ_F_CPUSTATS;
6877 +
6878 + return 0;
6879 +@@ -153,6 +154,9 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
6880 + struct net_device *dev = qdisc_dev(sch);
6881 + int err;
6882 +
6883 ++ net_inc_ingress_queue();
6884 ++ net_inc_egress_queue();
6885 ++
6886 + err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list);
6887 + if (err)
6888 + return err;
6889 +@@ -161,9 +165,6 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
6890 + if (err)
6891 + return err;
6892 +
6893 +- net_inc_ingress_queue();
6894 +- net_inc_egress_queue();
6895 +-
6896 + sch->flags |= TCQ_F_CPUSTATS;
6897 +
6898 + return 0;
6899 +diff --git a/net/sctp/socket.c b/net/sctp/socket.c
6900 +index d6163f7aefb1..df806b8819aa 100644
6901 +--- a/net/sctp/socket.c
6902 ++++ b/net/sctp/socket.c
6903 +@@ -3874,13 +3874,17 @@ static int sctp_setsockopt_reset_streams(struct sock *sk,
6904 + struct sctp_association *asoc;
6905 + int retval = -EINVAL;
6906 +
6907 +- if (optlen < sizeof(struct sctp_reset_streams))
6908 ++ if (optlen < sizeof(*params))
6909 + return -EINVAL;
6910 +
6911 + params = memdup_user(optval, optlen);
6912 + if (IS_ERR(params))
6913 + return PTR_ERR(params);
6914 +
6915 ++ if (params->srs_number_streams * sizeof(__u16) >
6916 ++ optlen - sizeof(*params))
6917 ++ goto out;
6918 ++
6919 + asoc = sctp_id2assoc(sk, params->srs_assoc_id);
6920 + if (!asoc)
6921 + goto out;
6922 +@@ -4413,7 +4417,7 @@ static int sctp_init_sock(struct sock *sk)
6923 + SCTP_DBG_OBJCNT_INC(sock);
6924 +
6925 + local_bh_disable();
6926 +- percpu_counter_inc(&sctp_sockets_allocated);
6927 ++ sk_sockets_allocated_inc(sk);
6928 + sock_prot_inuse_add(net, sk->sk_prot, 1);
6929 +
6930 + /* Nothing can fail after this block, otherwise
6931 +@@ -4457,7 +4461,7 @@ static void sctp_destroy_sock(struct sock *sk)
6932 + }
6933 + sctp_endpoint_free(sp->ep);
6934 + local_bh_disable();
6935 +- percpu_counter_dec(&sctp_sockets_allocated);
6936 ++ sk_sockets_allocated_dec(sk);
6937 + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
6938 + local_bh_enable();
6939 + }
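
For illustration, outside the patch: the sctp_setsockopt_reset_streams() hunk rejects a request whose srs_number_streams worth of __u16 ids would not fit in the option buffer after the fixed header. A sketch of the same validation for a generic length-prefixed request, with hypothetical names (struct reset_req, validate_req); the division form is used so the check cannot overflow even with a wider count field:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical request: fixed header followed by `count` 16-bit stream ids. */
struct reset_req {
	uint32_t assoc_id;
	uint16_t count;
	uint16_t ids[];
};

static int validate_req(const struct reset_req *req, size_t optlen)
{
	if (optlen < sizeof(*req))
		return -1;               /* header truncated */

	/* Division form cannot overflow, however wide the count field is. */
	if (req->count > (optlen - sizeof(*req)) / sizeof(req->ids[0]))
		return -1;               /* claims more ids than were sent */

	return 0;
}

int main(void)
{
	size_t optlen = sizeof(struct reset_req) + 2 * sizeof(uint16_t);
	struct reset_req *req = calloc(1, optlen);

	req->count = 2;
	printf("ok=%d\n", validate_req(req, optlen) == 0);   /* ok=1 */
	req->count = 100;
	printf("ok=%d\n", validate_req(req, optlen) == 0);   /* ok=0 */
	free(req);
	return 0;
}
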
6940 +diff --git a/net/tipc/socket.c b/net/tipc/socket.c
6941 +index d50edd6e0019..98a44ecb11e7 100644
6942 +--- a/net/tipc/socket.c
6943 ++++ b/net/tipc/socket.c
6944 +@@ -709,11 +709,11 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
6945 +
6946 + switch (sk->sk_state) {
6947 + case TIPC_ESTABLISHED:
6948 ++ case TIPC_CONNECTING:
6949 + if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
6950 + mask |= POLLOUT;
6951 + /* fall thru' */
6952 + case TIPC_LISTEN:
6953 +- case TIPC_CONNECTING:
6954 + if (!skb_queue_empty(&sk->sk_receive_queue))
6955 + mask |= (POLLIN | POLLRDNORM);
6956 + break;
6957 +diff --git a/security/Kconfig b/security/Kconfig
6958 +index e8e449444e65..6614b9312b45 100644
6959 +--- a/security/Kconfig
6960 ++++ b/security/Kconfig
6961 +@@ -54,6 +54,17 @@ config SECURITY_NETWORK
6962 + implement socket and networking access controls.
6963 + If you are unsure how to answer this question, answer N.
6964 +
6965 ++config PAGE_TABLE_ISOLATION
6966 ++ bool "Remove the kernel mapping in user mode"
6967 ++ depends on X86_64 && !UML
6968 ++ default y
6969 ++ help
6970 ++ This feature reduces the number of hardware side channels by
6971 ++ ensuring that the majority of kernel addresses are not mapped
6972 ++ into userspace.
6973 ++
6974 ++ See Documentation/x86/pagetable-isolation.txt for more details.
6975 ++
6976 + config SECURITY_INFINIBAND
6977 + bool "Infiniband Security Hooks"
6978 + depends on SECURITY && INFINIBAND
6979 +diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c
6980 +index 038a180d3f81..cbe818eda336 100644
6981 +--- a/sound/hda/hdac_i915.c
6982 ++++ b/sound/hda/hdac_i915.c
6983 +@@ -325,7 +325,7 @@ static int hdac_component_master_match(struct device *dev, void *data)
6984 + */
6985 + int snd_hdac_i915_register_notifier(const struct i915_audio_component_audio_ops *aops)
6986 + {
6987 +- if (WARN_ON(!hdac_acomp))
6988 ++ if (!hdac_acomp)
6989 + return -ENODEV;
6990 +
6991 + hdac_acomp->audio_ops = aops;
6992 +diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
6993 +index a81aacf684b2..37e1cf8218ff 100644
6994 +--- a/sound/pci/hda/patch_conexant.c
6995 ++++ b/sound/pci/hda/patch_conexant.c
6996 +@@ -271,6 +271,8 @@ enum {
6997 + CXT_FIXUP_HP_SPECTRE,
6998 + CXT_FIXUP_HP_GATE_MIC,
6999 + CXT_FIXUP_MUTE_LED_GPIO,
7000 ++ CXT_FIXUP_HEADSET_MIC,
7001 ++ CXT_FIXUP_HP_MIC_NO_PRESENCE,
7002 + };
7003 +
7004 + /* for hda_fixup_thinkpad_acpi() */
7005 +@@ -350,6 +352,18 @@ static void cxt_fixup_headphone_mic(struct hda_codec *codec,
7006 + }
7007 + }
7008 +
7009 ++static void cxt_fixup_headset_mic(struct hda_codec *codec,
7010 ++ const struct hda_fixup *fix, int action)
7011 ++{
7012 ++ struct conexant_spec *spec = codec->spec;
7013 ++
7014 ++ switch (action) {
7015 ++ case HDA_FIXUP_ACT_PRE_PROBE:
7016 ++ spec->parse_flags |= HDA_PINCFG_HEADSET_MIC;
7017 ++ break;
7018 ++ }
7019 ++}
7020 ++
7021 + /* OPLC XO 1.5 fixup */
7022 +
7023 + /* OLPC XO-1.5 supports DC input mode (e.g. for use with analog sensors)
7024 +@@ -880,6 +894,19 @@ static const struct hda_fixup cxt_fixups[] = {
7025 + .type = HDA_FIXUP_FUNC,
7026 + .v.func = cxt_fixup_mute_led_gpio,
7027 + },
7028 ++ [CXT_FIXUP_HEADSET_MIC] = {
7029 ++ .type = HDA_FIXUP_FUNC,
7030 ++ .v.func = cxt_fixup_headset_mic,
7031 ++ },
7032 ++ [CXT_FIXUP_HP_MIC_NO_PRESENCE] = {
7033 ++ .type = HDA_FIXUP_PINS,
7034 ++ .v.pins = (const struct hda_pintbl[]) {
7035 ++ { 0x1a, 0x02a1113c },
7036 ++ { }
7037 ++ },
7038 ++ .chained = true,
7039 ++ .chain_id = CXT_FIXUP_HEADSET_MIC,
7040 ++ },
7041 + };
7042 +
7043 + static const struct snd_pci_quirk cxt5045_fixups[] = {
7044 +@@ -934,6 +961,8 @@ static const struct snd_pci_quirk cxt5066_fixups[] = {
7045 + SND_PCI_QUIRK(0x103c, 0x8115, "HP Z1 Gen3", CXT_FIXUP_HP_GATE_MIC),
7046 + SND_PCI_QUIRK(0x103c, 0x814f, "HP ZBook 15u G3", CXT_FIXUP_MUTE_LED_GPIO),
7047 + SND_PCI_QUIRK(0x103c, 0x822e, "HP ProBook 440 G4", CXT_FIXUP_MUTE_LED_GPIO),
7048 ++ SND_PCI_QUIRK(0x103c, 0x8299, "HP 800 G3 SFF", CXT_FIXUP_HP_MIC_NO_PRESENCE),
7049 ++ SND_PCI_QUIRK(0x103c, 0x829a, "HP 800 G3 DM", CXT_FIXUP_HP_MIC_NO_PRESENCE),
7050 + SND_PCI_QUIRK(0x1043, 0x138d, "Asus", CXT_FIXUP_HEADPHONE_MIC_PIN),
7051 + SND_PCI_QUIRK(0x152d, 0x0833, "OLPC XO-1.5", CXT_FIXUP_OLPC_XO),
7052 + SND_PCI_QUIRK(0x17aa, 0x20f2, "Lenovo T400", CXT_PINCFG_LENOVO_TP410),
7053 +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
7054 +index 9ac4b9076ee2..acdb196ddb44 100644
7055 +--- a/sound/pci/hda/patch_realtek.c
7056 ++++ b/sound/pci/hda/patch_realtek.c
7057 +@@ -324,8 +324,12 @@ static void alc_fill_eapd_coef(struct hda_codec *codec)
7058 + case 0x10ec0292:
7059 + alc_update_coef_idx(codec, 0x4, 1<<15, 0);
7060 + break;
7061 +- case 0x10ec0215:
7062 + case 0x10ec0225:
7063 ++ case 0x10ec0295:
7064 ++ case 0x10ec0299:
7065 ++ alc_update_coef_idx(codec, 0x67, 0xf000, 0x3000);
7066 ++ /* fallthrough */
7067 ++ case 0x10ec0215:
7068 + case 0x10ec0233:
7069 + case 0x10ec0236:
7070 + case 0x10ec0255:
7071 +@@ -336,10 +340,8 @@ static void alc_fill_eapd_coef(struct hda_codec *codec)
7072 + case 0x10ec0286:
7073 + case 0x10ec0288:
7074 + case 0x10ec0285:
7075 +- case 0x10ec0295:
7076 + case 0x10ec0298:
7077 + case 0x10ec0289:
7078 +- case 0x10ec0299:
7079 + alc_update_coef_idx(codec, 0x10, 1<<9, 0);
7080 + break;
7081 + case 0x10ec0275:
7082 +@@ -6305,6 +6307,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
7083 + SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
7084 + SND_PCI_QUIRK(0x17aa, 0x30e2, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
7085 + SND_PCI_QUIRK(0x17aa, 0x310c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
7086 ++ SND_PCI_QUIRK(0x17aa, 0x313c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
7087 + SND_PCI_QUIRK(0x17aa, 0x3112, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
7088 + SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI),
7089 + SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC),
7090 +@@ -6557,6 +6560,11 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
7091 + SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
7092 + {0x1b, 0x01011020},
7093 + {0x21, 0x02211010}),
7094 ++ SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
7095 ++ {0x12, 0x90a60130},
7096 ++ {0x14, 0x90170110},
7097 ++ {0x1b, 0x01011020},
7098 ++ {0x21, 0x0221101f}),
7099 + SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
7100 + {0x12, 0x90a60160},
7101 + {0x14, 0x90170120},
7102 +diff --git a/sound/soc/codecs/da7218.c b/sound/soc/codecs/da7218.c
7103 +index b2d42ec1dcd9..56564ce90cb6 100644
7104 +--- a/sound/soc/codecs/da7218.c
7105 ++++ b/sound/soc/codecs/da7218.c
7106 +@@ -2520,7 +2520,7 @@ static struct da7218_pdata *da7218_of_to_pdata(struct snd_soc_codec *codec)
7107 + }
7108 +
7109 + if (da7218->dev_id == DA7218_DEV_ID) {
7110 +- hpldet_np = of_find_node_by_name(np, "da7218_hpldet");
7111 ++ hpldet_np = of_get_child_by_name(np, "da7218_hpldet");
7112 + if (!hpldet_np)
7113 + return pdata;
7114 +
7115 +diff --git a/sound/soc/codecs/msm8916-wcd-analog.c b/sound/soc/codecs/msm8916-wcd-analog.c
7116 +index 18933bf6473f..8c7063e1aa46 100644
7117 +--- a/sound/soc/codecs/msm8916-wcd-analog.c
7118 ++++ b/sound/soc/codecs/msm8916-wcd-analog.c
7119 +@@ -267,7 +267,7 @@
7120 + #define MSM8916_WCD_ANALOG_RATES (SNDRV_PCM_RATE_8000 | SNDRV_PCM_RATE_16000 |\
7121 + SNDRV_PCM_RATE_32000 | SNDRV_PCM_RATE_48000)
7122 + #define MSM8916_WCD_ANALOG_FORMATS (SNDRV_PCM_FMTBIT_S16_LE |\
7123 +- SNDRV_PCM_FMTBIT_S24_LE)
7124 ++ SNDRV_PCM_FMTBIT_S32_LE)
7125 +
7126 + static int btn_mask = SND_JACK_BTN_0 | SND_JACK_BTN_1 |
7127 + SND_JACK_BTN_2 | SND_JACK_BTN_3 | SND_JACK_BTN_4;
7128 +diff --git a/sound/soc/codecs/msm8916-wcd-digital.c b/sound/soc/codecs/msm8916-wcd-digital.c
7129 +index 66df8f810f0d..694db27b11fa 100644
7130 +--- a/sound/soc/codecs/msm8916-wcd-digital.c
7131 ++++ b/sound/soc/codecs/msm8916-wcd-digital.c
7132 +@@ -194,7 +194,7 @@
7133 + SNDRV_PCM_RATE_32000 | \
7134 + SNDRV_PCM_RATE_48000)
7135 + #define MSM8916_WCD_DIGITAL_FORMATS (SNDRV_PCM_FMTBIT_S16_LE |\
7136 +- SNDRV_PCM_FMTBIT_S24_LE)
7137 ++ SNDRV_PCM_FMTBIT_S32_LE)
7138 +
7139 + struct msm8916_wcd_digital_priv {
7140 + struct clk *ahbclk, *mclk;
7141 +@@ -645,7 +645,7 @@ static int msm8916_wcd_digital_hw_params(struct snd_pcm_substream *substream,
7142 + RX_I2S_CTL_RX_I2S_MODE_MASK,
7143 + RX_I2S_CTL_RX_I2S_MODE_16);
7144 + break;
7145 +- case SNDRV_PCM_FORMAT_S24_LE:
7146 ++ case SNDRV_PCM_FORMAT_S32_LE:
7147 + snd_soc_update_bits(dai->codec, LPASS_CDC_CLK_TX_I2S_CTL,
7148 + TX_I2S_CTL_TX_I2S_MODE_MASK,
7149 + TX_I2S_CTL_TX_I2S_MODE_32);
7150 +diff --git a/sound/soc/codecs/tlv320aic31xx.h b/sound/soc/codecs/tlv320aic31xx.h
7151 +index 730fb2058869..1ff3edb7bbb6 100644
7152 +--- a/sound/soc/codecs/tlv320aic31xx.h
7153 ++++ b/sound/soc/codecs/tlv320aic31xx.h
7154 +@@ -116,7 +116,7 @@ struct aic31xx_pdata {
7155 + /* INT2 interrupt control */
7156 + #define AIC31XX_INT2CTRL AIC31XX_REG(0, 49)
7157 + /* GPIO1 control */
7158 +-#define AIC31XX_GPIO1 AIC31XX_REG(0, 50)
7159 ++#define AIC31XX_GPIO1 AIC31XX_REG(0, 51)
7160 +
7161 + #define AIC31XX_DACPRB AIC31XX_REG(0, 60)
7162 + /* ADC Instruction Set Register */
7163 +diff --git a/sound/soc/codecs/twl4030.c b/sound/soc/codecs/twl4030.c
7164 +index c482b2e7a7d2..cfe72b9d4356 100644
7165 +--- a/sound/soc/codecs/twl4030.c
7166 ++++ b/sound/soc/codecs/twl4030.c
7167 +@@ -232,7 +232,7 @@ static struct twl4030_codec_data *twl4030_get_pdata(struct snd_soc_codec *codec)
7168 + struct twl4030_codec_data *pdata = dev_get_platdata(codec->dev);
7169 + struct device_node *twl4030_codec_node = NULL;
7170 +
7171 +- twl4030_codec_node = of_find_node_by_name(codec->dev->parent->of_node,
7172 ++ twl4030_codec_node = of_get_child_by_name(codec->dev->parent->of_node,
7173 + "codec");
7174 +
7175 + if (!pdata && twl4030_codec_node) {
7176 +@@ -241,9 +241,11 @@ static struct twl4030_codec_data *twl4030_get_pdata(struct snd_soc_codec *codec)
7177 + GFP_KERNEL);
7178 + if (!pdata) {
7179 + dev_err(codec->dev, "Can not allocate memory\n");
7180 ++ of_node_put(twl4030_codec_node);
7181 + return NULL;
7182 + }
7183 + twl4030_setup_pdata_of(pdata, twl4030_codec_node);
7184 ++ of_node_put(twl4030_codec_node);
7185 + }
7186 +
7187 + return pdata;
7188 +diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c
7189 +index 65c059b5ffd7..66e32f5d2917 100644
7190 +--- a/sound/soc/codecs/wm_adsp.c
7191 ++++ b/sound/soc/codecs/wm_adsp.c
7192 +@@ -1733,7 +1733,7 @@ static int wm_adsp_load(struct wm_adsp *dsp)
7193 + le64_to_cpu(footer->timestamp));
7194 +
7195 + while (pos < firmware->size &&
7196 +- pos - firmware->size > sizeof(*region)) {
7197 ++ sizeof(*region) < firmware->size - pos) {
7198 + region = (void *)&(firmware->data[pos]);
7199 + region_name = "Unknown";
7200 + reg = 0;
7201 +@@ -1782,8 +1782,8 @@ static int wm_adsp_load(struct wm_adsp *dsp)
7202 + regions, le32_to_cpu(region->len), offset,
7203 + region_name);
7204 +
7205 +- if ((pos + le32_to_cpu(region->len) + sizeof(*region)) >
7206 +- firmware->size) {
7207 ++ if (le32_to_cpu(region->len) >
7208 ++ firmware->size - pos - sizeof(*region)) {
7209 + adsp_err(dsp,
7210 + "%s.%d: %s region len %d bytes exceeds file length %zu\n",
7211 + file, regions, region_name,
7212 +@@ -2253,7 +2253,7 @@ static int wm_adsp_load_coeff(struct wm_adsp *dsp)
7213 +
7214 + blocks = 0;
7215 + while (pos < firmware->size &&
7216 +- pos - firmware->size > sizeof(*blk)) {
7217 ++ sizeof(*blk) < firmware->size - pos) {
7218 + blk = (void *)(&firmware->data[pos]);
7219 +
7220 + type = le16_to_cpu(blk->type);
7221 +@@ -2327,8 +2327,8 @@ static int wm_adsp_load_coeff(struct wm_adsp *dsp)
7222 + }
7223 +
7224 + if (reg) {
7225 +- if ((pos + le32_to_cpu(blk->len) + sizeof(*blk)) >
7226 +- firmware->size) {
7227 ++ if (le32_to_cpu(blk->len) >
7228 ++ firmware->size - pos - sizeof(*blk)) {
7229 + adsp_err(dsp,
7230 + "%s.%d: %s region len %d bytes exceeds file length %zu\n",
7231 + file, blocks, region_name,
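
As a side note, not part of the patch: both wm_adsp.c loops are rewritten so their bounds checks cannot wrap; the remaining-bytes test becomes "claimed length > bytes left after the header" instead of computing pos + len (which can overflow), and the loop guard compares the header size against firmware->size - pos. A sketch of parsing a length-prefixed record stream in the same overflow-safe style, with hypothetical names (struct rec_hdr, parse):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical on-disk record: 4-byte payload length, then the payload. */
struct rec_hdr { uint32_t len; };

static void parse(const uint8_t *data, size_t size)
{
	size_t pos = 0;
	struct rec_hdr hdr;

	/* Guard keeps `size - pos` meaningful before it is used below. */
	while (pos < size && sizeof(hdr) <= size - pos) {
		memcpy(&hdr, data + pos, sizeof(hdr));

		/* Overflow-safe: compare the claimed length against what is
		 * left, never compute pos + hdr.len (which could wrap). */
		if (hdr.len > size - pos - sizeof(hdr)) {
			printf("record at %zu claims %u bytes, only %zu left\n",
			       pos, hdr.len, size - pos - sizeof(hdr));
			return;
		}
		printf("record at %zu, %u payload bytes\n", pos, hdr.len);
		pos += sizeof(hdr) + hdr.len;
	}
}

int main(void)
{
	uint8_t buf[16] = { 0 };
	uint32_t len = 4;                 /* first record: 4 payload bytes */

	memcpy(buf, &len, sizeof(len));
	len = UINT32_MAX;                 /* second record: bogus huge length */
	memcpy(buf + 8, &len, sizeof(len));
	parse(buf, sizeof(buf));
	return 0;
}
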
7232 +diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c
7233 +index 64598d1183f8..3ffbb498cc70 100644
7234 +--- a/sound/soc/fsl/fsl_ssi.c
7235 ++++ b/sound/soc/fsl/fsl_ssi.c
7236 +@@ -1452,12 +1452,6 @@ static int fsl_ssi_probe(struct platform_device *pdev)
7237 + sizeof(fsl_ssi_ac97_dai));
7238 +
7239 + fsl_ac97_data = ssi_private;
7240 +-
7241 +- ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev);
7242 +- if (ret) {
7243 +- dev_err(&pdev->dev, "could not set AC'97 ops\n");
7244 +- return ret;
7245 +- }
7246 + } else {
7247 + /* Initialize this copy of the CPU DAI driver structure */
7248 + memcpy(&ssi_private->cpu_dai_drv, &fsl_ssi_dai_template,
7249 +@@ -1568,6 +1562,14 @@ static int fsl_ssi_probe(struct platform_device *pdev)
7250 + return ret;
7251 + }
7252 +
7253 ++ if (fsl_ssi_is_ac97(ssi_private)) {
7254 ++ ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev);
7255 ++ if (ret) {
7256 ++ dev_err(&pdev->dev, "could not set AC'97 ops\n");
7257 ++ goto error_ac97_ops;
7258 ++ }
7259 ++ }
7260 ++
7261 + ret = devm_snd_soc_register_component(&pdev->dev, &fsl_ssi_component,
7262 + &ssi_private->cpu_dai_drv, 1);
7263 + if (ret) {
7264 +@@ -1651,6 +1653,10 @@ static int fsl_ssi_probe(struct platform_device *pdev)
7265 + fsl_ssi_debugfs_remove(&ssi_private->dbg_stats);
7266 +
7267 + error_asoc_register:
7268 ++ if (fsl_ssi_is_ac97(ssi_private))
7269 ++ snd_soc_set_ac97_ops(NULL);
7270 ++
7271 ++error_ac97_ops:
7272 + if (ssi_private->soc->imx)
7273 + fsl_ssi_imx_clean(pdev, ssi_private);
7274 +
7275 +diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
7276 +index 0304ffb714f2..1aef72df20a1 100644
7277 +--- a/tools/testing/selftests/x86/ldt_gdt.c
7278 ++++ b/tools/testing/selftests/x86/ldt_gdt.c
7279 +@@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t index, int ldt,
7280 + * NB: Different Linux versions do different things with the
7281 + * accessed bit in set_thread_area().
7282 + */
7283 +- if (ar != expected_ar &&
7284 +- (ldt || ar != (expected_ar | AR_ACCESSED))) {
7285 ++ if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) {
7286 + printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
7287 + (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
7288 + nerrs++;
7289 +diff --git a/tools/usb/usbip/src/utils.c b/tools/usb/usbip/src/utils.c
7290 +index 2b3d6d235015..3d7b42e77299 100644
7291 +--- a/tools/usb/usbip/src/utils.c
7292 ++++ b/tools/usb/usbip/src/utils.c
7293 +@@ -30,6 +30,7 @@ int modify_match_busid(char *busid, int add)
7294 + char command[SYSFS_BUS_ID_SIZE + 4];
7295 + char match_busid_attr_path[SYSFS_PATH_MAX];
7296 + int rc;
7297 ++ int cmd_size;
7298 +
7299 + snprintf(match_busid_attr_path, sizeof(match_busid_attr_path),
7300 + "%s/%s/%s/%s/%s/%s", SYSFS_MNT_PATH, SYSFS_BUS_NAME,
7301 +@@ -37,12 +38,14 @@ int modify_match_busid(char *busid, int add)
7302 + attr_name);
7303 +
7304 + if (add)
7305 +- snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s", busid);
7306 ++ cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s",
7307 ++ busid);
7308 + else
7309 +- snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s", busid);
7310 ++ cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s",
7311 ++ busid);
7312 +
7313 + rc = write_sysfs_attribute(match_busid_attr_path, command,
7314 +- sizeof(command));
7315 ++ cmd_size);
7316 + if (rc < 0) {
7317 + dbg("failed to write match_busid: %s", strerror(errno));
7318 + return -1;