Gentoo Archives: gentoo-commits

From: Alice Ferrazzi <alicef@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.9 commit in: /
Date: Fri, 05 Jan 2018 15:04:10
Message-Id: 1515164634.a08c6f0923abc66cb0192f849780a30c3016e946.alicef@gentoo
1 commit: a08c6f0923abc66cb0192f849780a30c3016e946
2 Author: Alice Ferrazzi <alicef <AT> gentoo <DOT> org>
3 AuthorDate: Fri Jan 5 15:03:54 2018 +0000
4 Commit: Alice Ferrazzi <alicef <AT> gentoo <DOT> org>
5 CommitDate: Fri Jan 5 15:03:54 2018 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=a08c6f09
7
8 linux kernel 4.9.75
9
10 0000_README | 4 +
11 1074_linux-4.9.75.patch | 2577 +++++++++++++++++++++++++++++++++++++++++++++++
12 2 files changed, 2581 insertions(+)
13
14 diff --git a/0000_README b/0000_README
15 index 350d2c5..eed3372 100644
16 --- a/0000_README
17 +++ b/0000_README
18 @@ -339,6 +339,10 @@ Patch: 1073_linux-4.9.74.patch
19 From: http://www.kernel.org
20 Desc: Linux 4.9.74
21
22 +Patch: 1074_linux-4.9.75.patch
23 +From: http://www.kernel.org
24 +Desc: Linux 4.9.75
25 +
26 Patch: 1500_XATTR_USER_PREFIX.patch
27 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
28 Desc: Support for namespace user.pax.* on tmpfs.
29
30 diff --git a/1074_linux-4.9.75.patch b/1074_linux-4.9.75.patch
31 new file mode 100644
32 index 0000000..6299f19
33 --- /dev/null
34 +++ b/1074_linux-4.9.75.patch
35 @@ -0,0 +1,2577 @@
36 +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
37 +index 152ec4e87b57..5d2676d043de 100644
38 +--- a/Documentation/kernel-parameters.txt
39 ++++ b/Documentation/kernel-parameters.txt
40 +@@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
41 +
42 + nojitter [IA-64] Disables jitter checking for ITC timers.
43 +
44 ++ nopti [X86-64] Disable KAISER isolation of kernel from user.
45 ++
46 + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
47 +
48 + no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
49 +@@ -3325,6 +3327,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
50 + pt. [PARIDE]
51 + See Documentation/blockdev/paride.txt.
52 +
53 ++ pti= [X86_64]
54 ++ Control KAISER user/kernel address space isolation:
55 ++ on - enable
56 ++ off - disable
57 ++ auto - default setting
58 ++
59 + pty.legacy_count=
60 + [KNL] Number of legacy pty's. Overwrites compiled-in
61 + default number.
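For orientation, the two options documented above gate the whole feature: "nopti" or "pti=off" disables the KAISER isolation, "pti=on" forces it, and "pti=auto" keeps the default. A minimal standalone C model of that decision is sketched below; it is illustrative only and not part of the patch (the kernel's real logic lives in kaiser_check_boottime_disable() in arch/x86/mm/kaiser.c and parses the command line more carefully, e.g. respecting word boundaries).

#include <stdio.h>
#include <string.h>

/* Illustrative model only: "nopti" or "pti=off" disables isolation,
 * "pti=on" forces it on, "pti=auto" (or no option) keeps the default.
 * Unlike the kernel parser, strstr() ignores word boundaries. */
static int pti_enabled(const char *cmdline, int default_on)
{
    const char *p;

    if (strstr(cmdline, "nopti"))
        return 0;
    p = strstr(cmdline, "pti=");
    if (p) {
        if (!strncmp(p + 4, "off", 3))
            return 0;
        if (!strncmp(p + 4, "on", 2))
            return 1;
        /* "auto" or anything else: fall through to the default */
    }
    return default_on;
}

int main(void)
{
    printf("%d\n", pti_enabled("root=/dev/sda1 pti=off", 1)); /* 0 */
    printf("%d\n", pti_enabled("quiet nopti", 1));            /* 0 */
    printf("%d\n", pti_enabled("pti=auto", 1));               /* 1 */
    return 0;
}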
62 +diff --git a/Makefile b/Makefile
63 +index 075e429732e7..acbc1b032db2 100644
64 +--- a/Makefile
65 ++++ b/Makefile
66 +@@ -1,6 +1,6 @@
67 + VERSION = 4
68 + PATCHLEVEL = 9
69 +-SUBLEVEL = 74
70 ++SUBLEVEL = 75
71 + EXTRAVERSION =
72 + NAME = Roaring Lionus
73 +
74 +diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
75 +index 766a5211f827..2728e1b7e4a6 100644
76 +--- a/arch/x86/boot/compressed/misc.h
77 ++++ b/arch/x86/boot/compressed/misc.h
78 +@@ -9,6 +9,7 @@
79 + */
80 + #undef CONFIG_PARAVIRT
81 + #undef CONFIG_PARAVIRT_SPINLOCKS
82 ++#undef CONFIG_PAGE_TABLE_ISOLATION
83 + #undef CONFIG_KASAN
84 +
85 + #include <linux/linkage.h>
86 +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
87 +index e7b0e7ff4c58..af4e58132d91 100644
88 +--- a/arch/x86/entry/entry_64.S
89 ++++ b/arch/x86/entry/entry_64.S
90 +@@ -36,6 +36,7 @@
91 + #include <asm/smap.h>
92 + #include <asm/pgtable_types.h>
93 + #include <asm/export.h>
94 ++#include <asm/kaiser.h>
95 + #include <linux/err.h>
96 +
97 + /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
98 +@@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64)
99 + * it is too small to ever cause noticeable irq latency.
100 + */
101 + SWAPGS_UNSAFE_STACK
102 ++ SWITCH_KERNEL_CR3_NO_STACK
103 + /*
104 + * A hypervisor implementation might want to use a label
105 + * after the swapgs, so that it can do the swapgs
106 +@@ -228,6 +230,14 @@ entry_SYSCALL_64_fastpath:
107 + movq RIP(%rsp), %rcx
108 + movq EFLAGS(%rsp), %r11
109 + RESTORE_C_REGS_EXCEPT_RCX_R11
110 ++ /*
111 ++ * This opens a window where we have a user CR3, but are
112 ++ * running in the kernel. This makes using the CS
113 ++ * register useless for telling whether or not we need to
114 ++ * switch CR3 in NMIs. Normal interrupts are OK because
115 ++ * they are off here.
116 ++ */
117 ++ SWITCH_USER_CR3
118 + movq RSP(%rsp), %rsp
119 + USERGS_SYSRET64
120 +
121 +@@ -323,10 +333,26 @@ return_from_SYSCALL_64:
122 + syscall_return_via_sysret:
123 + /* rcx and r11 are already restored (see code above) */
124 + RESTORE_C_REGS_EXCEPT_RCX_R11
125 ++ /*
126 ++ * This opens a window where we have a user CR3, but are
127 ++ * running in the kernel. This makes using the CS
128 ++ * register useless for telling whether or not we need to
129 ++ * switch CR3 in NMIs. Normal interrupts are OK because
130 ++ * they are off here.
131 ++ */
132 ++ SWITCH_USER_CR3
133 + movq RSP(%rsp), %rsp
134 + USERGS_SYSRET64
135 +
136 + opportunistic_sysret_failed:
137 ++ /*
138 ++ * This opens a window where we have a user CR3, but are
139 ++ * running in the kernel. This makes using the CS
140 ++ * register useless for telling whether or not we need to
141 ++ * switch CR3 in NMIs. Normal interrupts are OK because
142 ++ * they are off here.
143 ++ */
144 ++ SWITCH_USER_CR3
145 + SWAPGS
146 + jmp restore_c_regs_and_iret
147 + END(entry_SYSCALL_64)
148 +@@ -424,6 +450,7 @@ ENTRY(ret_from_fork)
149 + movq %rsp, %rdi
150 + call syscall_return_slowpath /* returns with IRQs disabled */
151 + TRACE_IRQS_ON /* user mode is traced as IRQS on */
152 ++ SWITCH_USER_CR3
153 + SWAPGS
154 + jmp restore_regs_and_iret
155 +
156 +@@ -478,6 +505,7 @@ END(irq_entries_start)
157 + * tracking that we're in kernel mode.
158 + */
159 + SWAPGS
160 ++ SWITCH_KERNEL_CR3
161 +
162 + /*
163 + * We need to tell lockdep that IRQs are off. We can't do this until
164 +@@ -535,6 +563,7 @@ GLOBAL(retint_user)
165 + mov %rsp,%rdi
166 + call prepare_exit_to_usermode
167 + TRACE_IRQS_IRETQ
168 ++ SWITCH_USER_CR3
169 + SWAPGS
170 + jmp restore_regs_and_iret
171 +
172 +@@ -612,6 +641,7 @@ native_irq_return_ldt:
173 +
174 + pushq %rdi /* Stash user RDI */
175 + SWAPGS
176 ++ SWITCH_KERNEL_CR3
177 + movq PER_CPU_VAR(espfix_waddr), %rdi
178 + movq %rax, (0*8)(%rdi) /* user RAX */
179 + movq (1*8)(%rsp), %rax /* user RIP */
180 +@@ -638,6 +668,7 @@ native_irq_return_ldt:
181 + * still points to an RO alias of the ESPFIX stack.
182 + */
183 + orq PER_CPU_VAR(espfix_stack), %rax
184 ++ SWITCH_USER_CR3
185 + SWAPGS
186 + movq %rax, %rsp
187 +
188 +@@ -1022,7 +1053,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
189 + /*
190 + * Save all registers in pt_regs, and switch gs if needed.
191 + * Use slow, but surefire "are we in kernel?" check.
192 +- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
193 ++ *
194 ++ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
195 ++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
196 ++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
197 ++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
198 + */
199 + ENTRY(paranoid_entry)
200 + cld
201 +@@ -1035,7 +1070,26 @@ ENTRY(paranoid_entry)
202 + js 1f /* negative -> in kernel */
203 + SWAPGS
204 + xorl %ebx, %ebx
205 +-1: ret
206 ++1:
207 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
208 ++ /*
209 ++ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
210 ++ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
211 ++ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
212 ++ * unconditionally, but we need to find out whether the reverse
213 ++ * should be done on return (conveyed to paranoid_exit in %ebx).
214 ++ */
215 ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
216 ++ testl $KAISER_SHADOW_PGD_OFFSET, %eax
217 ++ jz 2f
218 ++ orl $2, %ebx
219 ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
220 ++ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
221 ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
222 ++ movq %rax, %cr3
223 ++2:
224 ++#endif
225 ++ ret
226 + END(paranoid_entry)
227 +
228 + /*
229 +@@ -1048,19 +1102,26 @@ END(paranoid_entry)
230 + * be complicated. Fortunately, there's no good reason
231 + * to try to handle preemption here.
232 + *
233 +- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
234 ++ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
235 ++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
236 ++ * ebx=2: needs both swapgs and SWITCH_USER_CR3
237 ++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
238 + */
239 + ENTRY(paranoid_exit)
240 + DISABLE_INTERRUPTS(CLBR_NONE)
241 + TRACE_IRQS_OFF_DEBUG
242 +- testl %ebx, %ebx /* swapgs needed? */
243 ++ TRACE_IRQS_IRETQ_DEBUG
244 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
245 ++ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
246 ++ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
247 ++ jz paranoid_exit_no_switch
248 ++ SWITCH_USER_CR3
249 ++paranoid_exit_no_switch:
250 ++#endif
251 ++ testl $1, %ebx /* swapgs needed? */
252 + jnz paranoid_exit_no_swapgs
253 +- TRACE_IRQS_IRETQ
254 + SWAPGS_UNSAFE_STACK
255 +- jmp paranoid_exit_restore
256 + paranoid_exit_no_swapgs:
257 +- TRACE_IRQS_IRETQ_DEBUG
258 +-paranoid_exit_restore:
259 + RESTORE_EXTRA_REGS
260 + RESTORE_C_REGS
261 + REMOVE_PT_GPREGS_FROM_STACK 8
262 +@@ -1075,6 +1136,13 @@ ENTRY(error_entry)
263 + cld
264 + SAVE_C_REGS 8
265 + SAVE_EXTRA_REGS 8
266 ++ /*
267 ++ * error_entry() always returns with a kernel gsbase and
268 ++ * CR3. We must also have a kernel CR3/gsbase before
269 ++ * calling TRACE_IRQS_*. Just unconditionally switch to
270 ++ * the kernel CR3 here.
271 ++ */
272 ++ SWITCH_KERNEL_CR3
273 + xorl %ebx, %ebx
274 + testb $3, CS+8(%rsp)
275 + jz .Lerror_kernelspace
276 +@@ -1235,6 +1303,10 @@ ENTRY(nmi)
277 + */
278 +
279 + SWAPGS_UNSAFE_STACK
280 ++ /*
281 ++ * percpu variables are mapped with user CR3, so no need
282 ++ * to switch CR3 here.
283 ++ */
284 + cld
285 + movq %rsp, %rdx
286 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
287 +@@ -1268,12 +1340,34 @@ ENTRY(nmi)
288 +
289 + movq %rsp, %rdi
290 + movq $-1, %rsi
291 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
292 ++ /* Unconditionally use kernel CR3 for do_nmi() */
293 ++ /* %rax is saved above, so OK to clobber here */
294 ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
295 ++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
296 ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
297 ++ pushq %rax
298 ++ /* mask off "user" bit of pgd address and 12 PCID bits: */
299 ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
300 ++ movq %rax, %cr3
301 ++2:
302 ++#endif
303 + call do_nmi
304 +
305 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
306 ++ /*
307 ++ * Unconditionally restore CR3. I know we return to
308 ++ * kernel code that needs user CR3, but do we ever return
309 ++ * to "user mode" where we need the kernel CR3?
310 ++ */
311 ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
312 ++#endif
313 ++
314 + /*
315 + * Return back to user mode. We must *not* do the normal exit
316 +- * work, because we don't want to enable interrupts. Fortunately,
317 +- * do_nmi doesn't modify pt_regs.
318 ++ * work, because we don't want to enable interrupts. Do not
319 ++ * switch to user CR3: we might be going back to kernel code
320 ++ * that had a user CR3 set.
321 + */
322 + SWAPGS
323 + jmp restore_c_regs_and_iret
324 +@@ -1470,22 +1564,55 @@ end_repeat_nmi:
325 + ALLOC_PT_GPREGS_ON_STACK
326 +
327 + /*
328 +- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
329 +- * as we should not be calling schedule in NMI context.
330 +- * Even with normal interrupts enabled. An NMI should not be
331 +- * setting NEED_RESCHED or anything that normal interrupts and
332 +- * exceptions might do.
333 ++ * Use the same approach as paranoid_entry to handle SWAPGS, but
334 ++ * without CR3 handling since we do that differently in NMIs. No
335 ++ * need to use paranoid_exit as we should not be calling schedule
336 ++ * in NMI context. Even with normal interrupts enabled. An NMI
337 ++ * should not be setting NEED_RESCHED or anything that normal
338 ++ * interrupts and exceptions might do.
339 + */
340 +- call paranoid_entry
341 +-
342 +- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
343 ++ cld
344 ++ SAVE_C_REGS
345 ++ SAVE_EXTRA_REGS
346 ++ movl $1, %ebx
347 ++ movl $MSR_GS_BASE, %ecx
348 ++ rdmsr
349 ++ testl %edx, %edx
350 ++ js 1f /* negative -> in kernel */
351 ++ SWAPGS
352 ++ xorl %ebx, %ebx
353 ++1:
354 + movq %rsp, %rdi
355 + movq $-1, %rsi
356 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
357 ++ /* Unconditionally use kernel CR3 for do_nmi() */
358 ++ /* %rax is saved above, so OK to clobber here */
359 ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
360 ++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
361 ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
362 ++ pushq %rax
363 ++ /* mask off "user" bit of pgd address and 12 PCID bits: */
364 ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
365 ++ movq %rax, %cr3
366 ++2:
367 ++#endif
368 ++
369 ++ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
370 + call do_nmi
371 +
372 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
373 ++ /*
374 ++ * Unconditionally restore CR3. We might be returning to
375 ++ * kernel code that needs user CR3, like just before
376 ++ * a sysret.
377 ++ */
378 ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
379 ++#endif
380 ++
381 + testl %ebx, %ebx /* swapgs needed? */
382 + jnz nmi_restore
383 + nmi_swapgs:
384 ++ /* We fixed up CR3 above, so no need to switch it here */
385 + SWAPGS_UNSAFE_STACK
386 + nmi_restore:
387 + RESTORE_EXTRA_REGS
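The %ebx protocol described in the paranoid_entry/paranoid_exit comments above packs two independent decisions into one register: bit 0 set means swapgs is not needed on exit, and bit 1 set means SWITCH_USER_CR3 is needed. The standalone sketch below (illustrative only, not part of the patch) models how paranoid_exit consumes the four possible values.

#include <stdio.h>

/* ebx bit 0 set -> swapgs is NOT needed on exit
 * ebx bit 1 set -> SWITCH_USER_CR3 IS needed on exit
 * so: 0 = swapgs only, 1 = neither, 2 = both, 3 = CR3 switch only. */
static void paranoid_exit_model(unsigned ebx)
{
    printf("ebx=%u:", ebx);
    if (ebx & 0x2)
        printf(" SWITCH_USER_CR3");
    if (!(ebx & 0x1))
        printf(" SWAPGS");
    printf("\n");
}

int main(void)
{
    for (unsigned ebx = 0; ebx < 4; ebx++)
        paranoid_exit_model(ebx);
    return 0;
}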
388 +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
389 +index e1721dafbcb1..d76a97653980 100644
390 +--- a/arch/x86/entry/entry_64_compat.S
391 ++++ b/arch/x86/entry/entry_64_compat.S
392 +@@ -13,6 +13,8 @@
393 + #include <asm/irqflags.h>
394 + #include <asm/asm.h>
395 + #include <asm/smap.h>
396 ++#include <asm/pgtable_types.h>
397 ++#include <asm/kaiser.h>
398 + #include <linux/linkage.h>
399 + #include <linux/err.h>
400 +
401 +@@ -48,6 +50,7 @@
402 + ENTRY(entry_SYSENTER_compat)
403 + /* Interrupts are off on entry. */
404 + SWAPGS_UNSAFE_STACK
405 ++ SWITCH_KERNEL_CR3_NO_STACK
406 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
407 +
408 + /*
409 +@@ -184,6 +187,7 @@ ENDPROC(entry_SYSENTER_compat)
410 + ENTRY(entry_SYSCALL_compat)
411 + /* Interrupts are off on entry. */
412 + SWAPGS_UNSAFE_STACK
413 ++ SWITCH_KERNEL_CR3_NO_STACK
414 +
415 + /* Stash user ESP and switch to the kernel stack. */
416 + movl %esp, %r8d
417 +@@ -259,6 +263,7 @@ sysret32_from_system_call:
418 + xorq %r8, %r8
419 + xorq %r9, %r9
420 + xorq %r10, %r10
421 ++ SWITCH_USER_CR3
422 + movq RSP-ORIG_RAX(%rsp), %rsp
423 + swapgs
424 + sysretl
425 +@@ -297,7 +302,7 @@ ENTRY(entry_INT80_compat)
426 + PARAVIRT_ADJUST_EXCEPTION_FRAME
427 + ASM_CLAC /* Do this early to minimize exposure */
428 + SWAPGS
429 +-
430 ++ SWITCH_KERNEL_CR3_NO_STACK
431 + /*
432 + * User tracing code (ptrace or signal handlers) might assume that
433 + * the saved RAX contains a 32-bit number when we're invoking a 32-bit
434 +@@ -338,6 +343,7 @@ ENTRY(entry_INT80_compat)
435 +
436 + /* Go back to user mode. */
437 + TRACE_IRQS_ON
438 ++ SWITCH_USER_CR3
439 + SWAPGS
440 + jmp restore_regs_and_iret
441 + END(entry_INT80_compat)
442 +diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
443 +index 9dfeeeca0ea8..8e7a3f1df3a5 100644
444 +--- a/arch/x86/events/intel/ds.c
445 ++++ b/arch/x86/events/intel/ds.c
446 +@@ -2,11 +2,15 @@
447 + #include <linux/types.h>
448 + #include <linux/slab.h>
449 +
450 ++#include <asm/kaiser.h>
451 + #include <asm/perf_event.h>
452 + #include <asm/insn.h>
453 +
454 + #include "../perf_event.h"
455 +
456 ++static
457 ++DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
458 ++
459 + /* The size of a BTS record in bytes: */
460 + #define BTS_RECORD_SIZE 24
461 +
462 +@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
463 +
464 + static DEFINE_PER_CPU(void *, insn_buffer);
465 +
466 ++static void *dsalloc(size_t size, gfp_t flags, int node)
467 ++{
468 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
469 ++ unsigned int order = get_order(size);
470 ++ struct page *page;
471 ++ unsigned long addr;
472 ++
473 ++ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
474 ++ if (!page)
475 ++ return NULL;
476 ++ addr = (unsigned long)page_address(page);
477 ++ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
478 ++ __free_pages(page, order);
479 ++ addr = 0;
480 ++ }
481 ++ return (void *)addr;
482 ++#else
483 ++ return kmalloc_node(size, flags | __GFP_ZERO, node);
484 ++#endif
485 ++}
486 ++
487 ++static void dsfree(const void *buffer, size_t size)
488 ++{
489 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
490 ++ if (!buffer)
491 ++ return;
492 ++ kaiser_remove_mapping((unsigned long)buffer, size);
493 ++ free_pages((unsigned long)buffer, get_order(size));
494 ++#else
495 ++ kfree(buffer);
496 ++#endif
497 ++}
498 ++
499 + static int alloc_pebs_buffer(int cpu)
500 + {
501 + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
502 +@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
503 + if (!x86_pmu.pebs)
504 + return 0;
505 +
506 +- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
507 ++ buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
508 + if (unlikely(!buffer))
509 + return -ENOMEM;
510 +
511 +@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
512 + if (x86_pmu.intel_cap.pebs_format < 2) {
513 + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
514 + if (!ibuffer) {
515 +- kfree(buffer);
516 ++ dsfree(buffer, x86_pmu.pebs_buffer_size);
517 + return -ENOMEM;
518 + }
519 + per_cpu(insn_buffer, cpu) = ibuffer;
520 +@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
521 + kfree(per_cpu(insn_buffer, cpu));
522 + per_cpu(insn_buffer, cpu) = NULL;
523 +
524 +- kfree((void *)(unsigned long)ds->pebs_buffer_base);
525 ++ dsfree((void *)(unsigned long)ds->pebs_buffer_base,
526 ++ x86_pmu.pebs_buffer_size);
527 + ds->pebs_buffer_base = 0;
528 + }
529 +
530 +@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
531 + if (!x86_pmu.bts)
532 + return 0;
533 +
534 +- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
535 ++ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
536 + if (unlikely(!buffer)) {
537 + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
538 + return -ENOMEM;
539 +@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
540 + if (!ds || !x86_pmu.bts)
541 + return;
542 +
543 +- kfree((void *)(unsigned long)ds->bts_buffer_base);
544 ++ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
545 + ds->bts_buffer_base = 0;
546 + }
547 +
548 + static int alloc_ds_buffer(int cpu)
549 + {
550 +- int node = cpu_to_node(cpu);
551 +- struct debug_store *ds;
552 +-
553 +- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
554 +- if (unlikely(!ds))
555 +- return -ENOMEM;
556 ++ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
557 +
558 ++ memset(ds, 0, sizeof(*ds));
559 + per_cpu(cpu_hw_events, cpu).ds = ds;
560 +
561 + return 0;
562 +@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
563 + return;
564 +
565 + per_cpu(cpu_hw_events, cpu).ds = NULL;
566 +- kfree(ds);
567 + }
568 +
569 + void release_ds_buffers(void)
570 +diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
571 +index e01f7f7ccb0c..84ae170bc3d0 100644
572 +--- a/arch/x86/include/asm/cmdline.h
573 ++++ b/arch/x86/include/asm/cmdline.h
574 +@@ -2,5 +2,7 @@
575 + #define _ASM_X86_CMDLINE_H
576 +
577 + int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
578 ++int cmdline_find_option(const char *cmdline_ptr, const char *option,
579 ++ char *buffer, int bufsize);
580 +
581 + #endif /* _ASM_X86_CMDLINE_H */
582 +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
583 +index ed10b5bf9b93..454a37adb823 100644
584 +--- a/arch/x86/include/asm/cpufeatures.h
585 ++++ b/arch/x86/include/asm/cpufeatures.h
586 +@@ -189,6 +189,7 @@
587 +
588 + #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
589 + #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
590 ++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
591 +
592 + #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
593 + #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
594 +@@ -197,6 +198,9 @@
595 + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
596 + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
597 +
598 ++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
599 ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
600 ++
601 + /* Virtualization flags: Linux defined, word 8 */
602 + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
603 + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
604 +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
605 +index 12080d87da3b..2ed5a2b3f8f7 100644
606 +--- a/arch/x86/include/asm/desc.h
607 ++++ b/arch/x86/include/asm/desc.h
608 +@@ -43,7 +43,7 @@ struct gdt_page {
609 + struct desc_struct gdt[GDT_ENTRIES];
610 + } __attribute__((aligned(PAGE_SIZE)));
611 +
612 +-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
613 ++DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
614 +
615 + static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
616 + {
617 +diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
618 +index b90e1053049b..0817d63bce41 100644
619 +--- a/arch/x86/include/asm/hw_irq.h
620 ++++ b/arch/x86/include/asm/hw_irq.h
621 +@@ -178,7 +178,7 @@ extern char irq_entries_start[];
622 + #define VECTOR_RETRIGGERED ((void *)~0UL)
623 +
624 + typedef struct irq_desc* vector_irq_t[NR_VECTORS];
625 +-DECLARE_PER_CPU(vector_irq_t, vector_irq);
626 ++DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
627 +
628 + #endif /* !ASSEMBLY_ */
629 +
630 +diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
631 +new file mode 100644
632 +index 000000000000..802bbbdfe143
633 +--- /dev/null
634 ++++ b/arch/x86/include/asm/kaiser.h
635 +@@ -0,0 +1,141 @@
636 ++#ifndef _ASM_X86_KAISER_H
637 ++#define _ASM_X86_KAISER_H
638 ++
639 ++#include <uapi/asm/processor-flags.h> /* For PCID constants */
640 ++
641 ++/*
642 ++ * This file includes the definitions for the KAISER feature.
643 ++ * KAISER is a countermeasure against x86_64 side-channel attacks on
644 ++ * kernel virtual memory. It gives every process a shadow pgd: the shadow
645 ++ * pgd maps only a minimal set of kernel code and data, but the whole of
646 ++ * user memory. On a switch into the kernel, or when an interrupt is
647 ++ * handled, the pgd is switched to the normal one; on return to user
648 ++ * mode, the shadow pgd is used instead, so kernel addresses are absent
649 ++ * from the user-visible translations and user space cannot attack kernel memory.
650 ++ *
651 ++ * A minimalistic kernel mapping holds the parts needed to be mapped in user
652 ++ * mode, such as the entry/exit functions of the user space, or the stacks.
653 ++ */
654 ++
655 ++#define KAISER_SHADOW_PGD_OFFSET 0x1000
656 ++
657 ++#ifdef __ASSEMBLY__
658 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
659 ++
660 ++.macro _SWITCH_TO_KERNEL_CR3 reg
661 ++movq %cr3, \reg
662 ++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
663 ++/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
664 ++ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
665 ++movq \reg, %cr3
666 ++.endm
667 ++
668 ++.macro _SWITCH_TO_USER_CR3 reg regb
669 ++/*
670 ++ * regb must be the low byte portion of reg: because we have arranged
671 ++ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
672 ++ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
673 ++ * not enabled): so that the one register can update both memory and cr3.
674 ++ */
675 ++movq %cr3, \reg
676 ++orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
677 ++js 9f
678 ++/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
679 ++movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
680 ++9:
681 ++movq \reg, %cr3
682 ++.endm
683 ++
684 ++.macro SWITCH_KERNEL_CR3
685 ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
686 ++_SWITCH_TO_KERNEL_CR3 %rax
687 ++popq %rax
688 ++8:
689 ++.endm
690 ++
691 ++.macro SWITCH_USER_CR3
692 ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
693 ++_SWITCH_TO_USER_CR3 %rax %al
694 ++popq %rax
695 ++8:
696 ++.endm
697 ++
698 ++.macro SWITCH_KERNEL_CR3_NO_STACK
699 ++ALTERNATIVE "jmp 8f", \
700 ++ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
701 ++ X86_FEATURE_KAISER
702 ++_SWITCH_TO_KERNEL_CR3 %rax
703 ++movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
704 ++8:
705 ++.endm
706 ++
707 ++#else /* CONFIG_PAGE_TABLE_ISOLATION */
708 ++
709 ++.macro SWITCH_KERNEL_CR3
710 ++.endm
711 ++.macro SWITCH_USER_CR3
712 ++.endm
713 ++.macro SWITCH_KERNEL_CR3_NO_STACK
714 ++.endm
715 ++
716 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
717 ++
718 ++#else /* __ASSEMBLY__ */
719 ++
720 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
721 ++/*
722 ++ * Upon kernel/user mode switch, it may happen that the address
723 ++ * space has to be switched before the registers have been
724 ++ * stored. To change the address space, another register is
725 ++ * needed. A register therefore has to be stored/restored.
726 ++ */
727 ++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
728 ++
729 ++DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
730 ++
731 ++extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
732 ++
733 ++extern int kaiser_enabled;
734 ++extern void __init kaiser_check_boottime_disable(void);
735 ++#else
736 ++#define kaiser_enabled 0
737 ++static inline void __init kaiser_check_boottime_disable(void) {}
738 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
739 ++
740 ++/*
741 ++ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
742 ++ * so as to build with tests on kaiser_enabled instead of #ifdefs.
743 ++ */
744 ++
745 ++/**
746 ++ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
747 ++ * @addr: the start address of the range
748 ++ * @size: the size of the range
749 ++ * @flags: The mapping flags of the pages
750 ++ *
752 ++ * The mapping is done at a global scope, so no further
753 ++ * synchronization is required. The pages have to be unmapped
754 ++ * manually again when they are no longer needed.
754 ++ */
755 ++extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
756 ++
757 ++/**
758 ++ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
759 ++ * @addr: the start address of the range
760 ++ * @size: the size of the range
761 ++ */
762 ++extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
763 ++
764 ++/**
765 ++ * kaiser_init - Initialize the shadow mapping
766 ++ *
768 ++ * Most parts of the shadow mapping can be mapped at boot
768 ++ * time. Only per-process things like the thread stacks
769 ++ * or a new LDT have to be mapped at runtime. These boot-
770 ++ * time mappings are permanent and never unmapped.
771 ++ */
772 ++extern void kaiser_init(void);
773 ++
774 ++#endif /* __ASSEMBLY */
775 ++
776 ++#endif /* _ASM_X86_KAISER_H */
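As a usage sketch of the kaiser_add_mapping()/kaiser_remove_mapping() interfaces declared above, the pair of helpers below mirrors the dsalloc()/dsfree() functions this patch adds to arch/x86/events/intel/ds.c. It is illustrative only and not part of the patch; the helper names are invented and the include list is an assumption.

#include <linux/mm.h>
#include <linux/gfp.h>
#include <asm/kaiser.h>

/* Allocate a buffer that must stay visible in the shadow (user) page
 * tables, e.g. because the CPU writes to it while the user CR3 is loaded. */
static void *alloc_user_mapped(size_t size, gfp_t flags, int node)
{
    unsigned int order = get_order(size);
    struct page *page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
    unsigned long addr;

    if (!page)
        return NULL;
    addr = (unsigned long)page_address(page);
    /* Also map the buffer into the shadow (user) page tables. */
    if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
        __free_pages(page, order);
        return NULL;
    }
    return (void *)addr;
}

static void free_user_mapped(const void *buf, size_t size)
{
    if (!buf)
        return;
    kaiser_remove_mapping((unsigned long)buf, size);
    free_pages((unsigned long)buf, get_order(size));
}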
777 +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
778 +index 437feb436efa..2536f90cd30c 100644
779 +--- a/arch/x86/include/asm/pgtable.h
780 ++++ b/arch/x86/include/asm/pgtable.h
781 +@@ -18,6 +18,12 @@
782 + #ifndef __ASSEMBLY__
783 + #include <asm/x86_init.h>
784 +
785 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
786 ++extern int kaiser_enabled;
787 ++#else
788 ++#define kaiser_enabled 0
789 ++#endif
790 ++
791 + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
792 + void ptdump_walk_pgd_level_checkwx(void);
793 +
794 +@@ -690,7 +696,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
795 +
796 + static inline int pgd_bad(pgd_t pgd)
797 + {
798 +- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
799 ++ pgdval_t ignore_flags = _PAGE_USER;
800 ++ /*
801 ++ * We set NX on KAISER pgds that map userspace memory so
802 ++ * that userspace can not meaningfully use the kernel
803 ++ * page table by accident; it will fault on the first
804 ++ * instruction it tries to run. See native_set_pgd().
805 ++ */
806 ++ if (kaiser_enabled)
807 ++ ignore_flags |= _PAGE_NX;
808 ++
809 ++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
810 + }
811 +
812 + static inline int pgd_none(pgd_t pgd)
813 +@@ -903,7 +919,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
814 + */
815 + static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
816 + {
817 +- memcpy(dst, src, count * sizeof(pgd_t));
818 ++ memcpy(dst, src, count * sizeof(pgd_t));
819 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
820 ++ if (kaiser_enabled) {
821 ++ /* Clone the shadow pgd part as well */
822 ++ memcpy(native_get_shadow_pgd(dst),
823 ++ native_get_shadow_pgd(src),
824 ++ count * sizeof(pgd_t));
825 ++ }
826 ++#endif
827 + }
828 +
829 + #define PTE_SHIFT ilog2(PTRS_PER_PTE)
830 +diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
831 +index 1cc82ece9ac1..ce97c8c6a310 100644
832 +--- a/arch/x86/include/asm/pgtable_64.h
833 ++++ b/arch/x86/include/asm/pgtable_64.h
834 +@@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
835 + native_set_pud(pud, native_make_pud(0));
836 + }
837 +
838 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
839 ++extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
840 ++
841 ++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
842 ++{
843 ++#ifdef CONFIG_DEBUG_VM
844 ++ /* linux/mmdebug.h may not have been included at this point */
845 ++ BUG_ON(!kaiser_enabled);
846 ++#endif
847 ++ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
848 ++}
849 ++#else
850 ++static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
851 ++{
852 ++ return pgd;
853 ++}
854 ++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
855 ++{
856 ++ BUILD_BUG_ON(1);
857 ++ return NULL;
858 ++}
859 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
860 ++
861 + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
862 + {
863 +- *pgdp = pgd;
864 ++ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
865 + }
866 +
867 + static inline void native_pgd_clear(pgd_t *pgd)
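native_get_shadow_pgd() above is pure address arithmetic: because each PGD allocation is 8 KiB long and 8 KiB aligned (see the NEXT_PGD_PAGE/KAISER_USER_PGD_FILL changes to head_64.S later in this patch), OR-ing PAGE_SIZE into a kernel pgd pointer yields the matching slot in the shadow (user) half. A standalone model of that trick, illustrative only and with a made-up address:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

/* With an 8 KiB-aligned, 8 KiB-long PGD, the kernel half occupies the
 * first 4 KiB page and the shadow (user) half the second one, so the
 * shadow slot for an entry is the same offset with bit 12 set. */
static uint64_t shadow_pgd_entry(uint64_t kernel_pgd_entry)
{
    return kernel_pgd_entry | PAGE_SIZE;
}

int main(void)
{
    uint64_t pgd_base = 0xffff880012344000ULL;  /* hypothetical, 8 KiB aligned */
    uint64_t entry    = pgd_base + 8 * 5;       /* 6th pgd_t in the page */

    printf("kernel entry: %#llx\n", (unsigned long long)entry);
    printf("shadow entry: %#llx\n", (unsigned long long)shadow_pgd_entry(entry));
    return 0;
}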
868 +diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
869 +index 8b4de22d6429..f1c8ac468292 100644
870 +--- a/arch/x86/include/asm/pgtable_types.h
871 ++++ b/arch/x86/include/asm/pgtable_types.h
872 +@@ -119,7 +119,7 @@
873 + #define _PAGE_DEVMAP (_AT(pteval_t, 0))
874 + #endif
875 +
876 +-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
877 ++#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
878 +
879 + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
880 + _PAGE_ACCESSED | _PAGE_DIRTY)
881 +@@ -137,6 +137,33 @@
882 + _PAGE_SOFT_DIRTY)
883 + #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
884 +
885 ++/* The ASID is the lower 12 bits of CR3 */
886 ++#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
887 ++
888 ++/* Mask for all the PCID-related bits in CR3: */
889 ++#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
890 ++#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
891 ++
892 ++#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
893 ++/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
894 ++#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
895 ++
896 ++#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
897 ++#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
898 ++#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
899 ++#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
900 ++#else
901 ++#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
902 ++/*
903 ++ * PCIDs are unsupported on 32-bit and none of these bits can be
904 ++ * set in CR3:
905 ++ */
906 ++#define X86_CR3_PCID_KERN_FLUSH (0)
907 ++#define X86_CR3_PCID_USER_FLUSH (0)
908 ++#define X86_CR3_PCID_KERN_NOFLUSH (0)
909 ++#define X86_CR3_PCID_USER_NOFLUSH (0)
910 ++#endif
911 ++
912 + /*
913 + * The cache modes defined here are used to translate between pure SW usage
914 + * and the HW defined cache mode bits and/or PAT entries.
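Together with KAISER_SHADOW_PGD_OFFSET (0x1000, from asm/kaiser.h earlier in this patch), the constants above describe the CR3 layout KAISER relies on: the low 12 bits carry the PCID/ASID, bit 12 selects the shadow PGD page, and bit 63 is the NOFLUSH hint. The standalone sketch below (illustrative only, hypothetical base address) shows the masking that _SWITCH_TO_KERNEL_CR3 performs to turn a user CR3 value back into the kernel one.

#include <stdio.h>
#include <stdint.h>

#define X86_CR3_PCID_ASID_MASK   ((1ULL << 12) - 1)  /* low 12 bits: PCID/ASID */
#define KAISER_SHADOW_PGD_OFFSET 0x1000ULL           /* bit 12: shadow pgd page */
#define X86_CR3_PCID_NOFLUSH     (1ULL << 63)        /* bit 63: keep old TLB entries */
#define X86_CR3_PCID_ASID_USER   0x80ULL

int main(void)
{
    /* Hypothetical user-mode CR3: shadow pgd page, user ASID, NOFLUSH set. */
    uint64_t user_cr3 = 0x12344000ULL | KAISER_SHADOW_PGD_OFFSET |
                        X86_CR3_PCID_ASID_USER | X86_CR3_PCID_NOFLUSH;
    /* What _SWITCH_TO_KERNEL_CR3 computes: clear the ASID bits and the
     * shadow offset; the mask leaves NOFLUSH alone and the macro re-sets
     * it via "bts $63" when PCID is in use. */
    uint64_t kernel_cr3 = user_cr3 &
                          ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);

    printf("user   CR3: %#llx\n", (unsigned long long)user_cr3);
    printf("kernel CR3: %#llx\n", (unsigned long long)kernel_cr3);
    return 0;
}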
915 +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
916 +index 83db0eae9979..8cb52ee3ade6 100644
917 +--- a/arch/x86/include/asm/processor.h
918 ++++ b/arch/x86/include/asm/processor.h
919 +@@ -308,7 +308,7 @@ struct tss_struct {
920 +
921 + } ____cacheline_aligned;
922 +
923 +-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
924 ++DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
925 +
926 + #ifdef CONFIG_X86_32
927 + DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
928 +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
929 +index 7d2ea6b1f7d9..94146f665a3c 100644
930 +--- a/arch/x86/include/asm/tlbflush.h
931 ++++ b/arch/x86/include/asm/tlbflush.h
932 +@@ -132,6 +132,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
933 + cr4_set_bits(mask);
934 + }
935 +
936 ++/*
937 ++ * Declare a couple of kaiser interfaces here for convenience,
938 ++ * to avoid the need for asm/kaiser.h in unexpected places.
939 ++ */
940 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
941 ++extern int kaiser_enabled;
942 ++extern void kaiser_setup_pcid(void);
943 ++extern void kaiser_flush_tlb_on_return_to_user(void);
944 ++#else
945 ++#define kaiser_enabled 0
946 ++static inline void kaiser_setup_pcid(void)
947 ++{
948 ++}
949 ++static inline void kaiser_flush_tlb_on_return_to_user(void)
950 ++{
951 ++}
952 ++#endif
953 ++
954 + static inline void __native_flush_tlb(void)
955 + {
956 + /*
957 +@@ -140,6 +158,8 @@ static inline void __native_flush_tlb(void)
958 + * back:
959 + */
960 + preempt_disable();
961 ++ if (kaiser_enabled)
962 ++ kaiser_flush_tlb_on_return_to_user();
963 + native_write_cr3(native_read_cr3());
964 + preempt_enable();
965 + }
966 +@@ -149,20 +169,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
967 + unsigned long cr4;
968 +
969 + cr4 = this_cpu_read(cpu_tlbstate.cr4);
970 +- /* clear PGE */
971 +- native_write_cr4(cr4 & ~X86_CR4_PGE);
972 +- /* write old PGE again and flush TLBs */
973 +- native_write_cr4(cr4);
974 ++ if (cr4 & X86_CR4_PGE) {
975 ++ /* clear PGE and flush TLB of all entries */
976 ++ native_write_cr4(cr4 & ~X86_CR4_PGE);
977 ++ /* restore PGE as it was before */
978 ++ native_write_cr4(cr4);
979 ++ } else {
980 ++ /* do it with cr3, letting kaiser flush user PCID */
981 ++ __native_flush_tlb();
982 ++ }
983 + }
984 +
985 + static inline void __native_flush_tlb_global(void)
986 + {
987 + unsigned long flags;
988 +
989 +- if (static_cpu_has(X86_FEATURE_INVPCID)) {
990 ++ if (this_cpu_has(X86_FEATURE_INVPCID)) {
991 + /*
992 + * Using INVPCID is considerably faster than a pair of writes
993 + * to CR4 sandwiched inside an IRQ flag save/restore.
994 ++ *
995 ++ * Note, this works with CR4.PCIDE=0 or 1.
996 + */
997 + invpcid_flush_all();
998 + return;
999 +@@ -174,24 +201,45 @@ static inline void __native_flush_tlb_global(void)
1000 + * be called from deep inside debugging code.)
1001 + */
1002 + raw_local_irq_save(flags);
1003 +-
1004 + __native_flush_tlb_global_irq_disabled();
1005 +-
1006 + raw_local_irq_restore(flags);
1007 + }
1008 +
1009 + static inline void __native_flush_tlb_single(unsigned long addr)
1010 + {
1011 +- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
1012 ++ /*
1013 ++ * SIMICS #GP's if you run INVPCID with type 2/3
1014 ++ * and X86_CR4_PCIDE clear. Shame!
1015 ++ *
1016 ++ * The ASIDs used below are hard-coded. But, we must not
1017 ++ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
1018 ++ * invlpg in the case we are called early.
1019 ++ */
1020 ++
1021 ++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
1022 ++ if (kaiser_enabled)
1023 ++ kaiser_flush_tlb_on_return_to_user();
1024 ++ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
1025 ++ return;
1026 ++ }
1027 ++ /* Flush the address out of both PCIDs. */
1028 ++ /*
1029 ++ * An optimization here might be to determine addresses
1030 ++ * that are only kernel-mapped and only flush the kernel
1031 ++ * ASID. But, userspace flushes are probably much more
1032 ++ * important performance-wise.
1033 ++ *
1034 ++ * Make sure to do only a single invpcid when KAISER is
1035 ++ * disabled and we have only a single ASID.
1036 ++ */
1037 ++ if (kaiser_enabled)
1038 ++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
1039 ++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
1040 + }
1041 +
1042 + static inline void __flush_tlb_all(void)
1043 + {
1044 +- if (boot_cpu_has(X86_FEATURE_PGE))
1045 +- __flush_tlb_global();
1046 +- else
1047 +- __flush_tlb();
1048 +-
1049 ++ __flush_tlb_global();
1050 + /*
1051 + * Note: if we somehow had PCID but not PGE, then this wouldn't work --
1052 + * we'd end up flushing kernel translations for the current ASID but
1053 +diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
1054 +index 567de50a4c2a..6768d1321016 100644
1055 +--- a/arch/x86/include/uapi/asm/processor-flags.h
1056 ++++ b/arch/x86/include/uapi/asm/processor-flags.h
1057 +@@ -77,7 +77,8 @@
1058 + #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
1059 + #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
1060 + #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
1061 +-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
1062 ++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
1063 ++#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
1064 +
1065 + /*
1066 + * Intel CPU features in CR4
1067 +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1068 +index 91588be529b9..918e44772b04 100644
1069 +--- a/arch/x86/kernel/cpu/common.c
1070 ++++ b/arch/x86/kernel/cpu/common.c
1071 +@@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = {
1072 +
1073 + static const struct cpu_dev *this_cpu = &default_cpu;
1074 +
1075 +-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
1076 ++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
1077 + #ifdef CONFIG_X86_64
1078 + /*
1079 + * We need valid kernel segments for data and code in long mode too
1080 +@@ -327,8 +327,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
1081 + static void setup_pcid(struct cpuinfo_x86 *c)
1082 + {
1083 + if (cpu_has(c, X86_FEATURE_PCID)) {
1084 +- if (cpu_has(c, X86_FEATURE_PGE)) {
1085 ++ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
1086 + cr4_set_bits(X86_CR4_PCIDE);
1087 ++ /*
1088 ++ * INVPCID has two "groups" of types:
1089 ++ * 1/2: Invalidate an individual address
1090 ++ * 3/4: Invalidate all contexts
1091 ++ *
1092 ++ * 1/2 take a PCID, but 3/4 do not. So, 3/4
1093 ++ * ignore the PCID argument in the descriptor.
1094 ++ * But, we have to be careful not to call 1/2
1095 ++ * with an actual non-zero PCID in them before
1096 ++ * we do the above cr4_set_bits().
1097 ++ */
1098 ++ if (cpu_has(c, X86_FEATURE_INVPCID))
1099 ++ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
1100 + } else {
1101 + /*
1102 + * flush_tlb_all(), as currently implemented, won't
1103 +@@ -341,6 +354,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
1104 + clear_cpu_cap(c, X86_FEATURE_PCID);
1105 + }
1106 + }
1107 ++ kaiser_setup_pcid();
1108 + }
1109 +
1110 + /*
1111 +@@ -1365,7 +1379,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
1112 + [DEBUG_STACK - 1] = DEBUG_STKSZ
1113 + };
1114 +
1115 +-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
1116 ++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
1117 + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
1118 +
1119 + /* May not be marked __init: used by software suspend */
1120 +@@ -1523,6 +1537,14 @@ void cpu_init(void)
1121 + * try to read it.
1122 + */
1123 + cr4_init_shadow();
1124 ++ if (!kaiser_enabled) {
1125 ++ /*
1126 ++ * secondary_startup_64() deferred setting PGE in cr4:
1127 ++ * probe_page_size_mask() sets it on the boot cpu,
1128 ++ * but it needs to be set on each secondary cpu.
1129 ++ */
1130 ++ cr4_set_bits(X86_CR4_PGE);
1131 ++ }
1132 +
1133 + /*
1134 + * Load microcode on this cpu if a valid microcode is available.
1135 +diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
1136 +index 04f89caef9c4..e33b38541be3 100644
1137 +--- a/arch/x86/kernel/espfix_64.c
1138 ++++ b/arch/x86/kernel/espfix_64.c
1139 +@@ -41,6 +41,7 @@
1140 + #include <asm/pgalloc.h>
1141 + #include <asm/setup.h>
1142 + #include <asm/espfix.h>
1143 ++#include <asm/kaiser.h>
1144 +
1145 + /*
1146 + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
1147 +@@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
1148 + /* Install the espfix pud into the kernel page directory */
1149 + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
1150 + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
1151 ++ /*
1152 ++ * Just copy the top-level PGD that is mapping the espfix
1153 ++ * area to ensure it is mapped into the shadow user page
1154 ++ * tables.
1155 ++ */
1156 ++ if (kaiser_enabled) {
1157 ++ set_pgd(native_get_shadow_pgd(pgd_p),
1158 ++ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
1159 ++ }
1160 +
1161 + /* Randomize the locations */
1162 + init_espfix_random();
1163 +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
1164 +index b4421cc191b0..67cd7c1b99da 100644
1165 +--- a/arch/x86/kernel/head_64.S
1166 ++++ b/arch/x86/kernel/head_64.S
1167 +@@ -190,8 +190,8 @@ ENTRY(secondary_startup_64)
1168 + movq $(init_level4_pgt - __START_KERNEL_map), %rax
1169 + 1:
1170 +
1171 +- /* Enable PAE mode and PGE */
1172 +- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
1173 ++ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
1174 ++ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
1175 + movq %rcx, %cr4
1176 +
1177 + /* Setup early boot stage 4 level pagetables. */
1178 +@@ -405,6 +405,27 @@ GLOBAL(early_recursion_flag)
1179 + .balign PAGE_SIZE; \
1180 + GLOBAL(name)
1181 +
1182 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1183 ++/*
1184 ++ * Each PGD needs to be 8k long and 8k aligned. We do not
1185 ++ * ever go out to userspace with these, so we do not
1186 ++ * strictly *need* the second page, but this allows us to
1187 ++ * have a single set_pgd() implementation that does not
1188 ++ * need to worry about whether it has 4k or 8k to work
1189 ++ * with.
1190 ++ *
1191 ++ * This ensures PGDs are 8k long:
1192 ++ */
1193 ++#define KAISER_USER_PGD_FILL 512
1194 ++/* This ensures they are 8k-aligned: */
1195 ++#define NEXT_PGD_PAGE(name) \
1196 ++ .balign 2 * PAGE_SIZE; \
1197 ++GLOBAL(name)
1198 ++#else
1199 ++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
1200 ++#define KAISER_USER_PGD_FILL 0
1201 ++#endif
1202 ++
1203 + /* Automate the creation of 1 to 1 mapping pmd entries */
1204 + #define PMDS(START, PERM, COUNT) \
1205 + i = 0 ; \
1206 +@@ -414,9 +435,10 @@ GLOBAL(name)
1207 + .endr
1208 +
1209 + __INITDATA
1210 +-NEXT_PAGE(early_level4_pgt)
1211 ++NEXT_PGD_PAGE(early_level4_pgt)
1212 + .fill 511,8,0
1213 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
1214 ++ .fill KAISER_USER_PGD_FILL,8,0
1215 +
1216 + NEXT_PAGE(early_dynamic_pgts)
1217 + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
1218 +@@ -424,16 +446,18 @@ NEXT_PAGE(early_dynamic_pgts)
1219 + .data
1220 +
1221 + #ifndef CONFIG_XEN
1222 +-NEXT_PAGE(init_level4_pgt)
1223 ++NEXT_PGD_PAGE(init_level4_pgt)
1224 + .fill 512,8,0
1225 ++ .fill KAISER_USER_PGD_FILL,8,0
1226 + #else
1227 +-NEXT_PAGE(init_level4_pgt)
1228 ++NEXT_PGD_PAGE(init_level4_pgt)
1229 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1230 + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
1231 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1232 + .org init_level4_pgt + L4_START_KERNEL*8, 0
1233 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
1234 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
1235 ++ .fill KAISER_USER_PGD_FILL,8,0
1236 +
1237 + NEXT_PAGE(level3_ident_pgt)
1238 + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1239 +@@ -444,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt)
1240 + */
1241 + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
1242 + #endif
1243 ++ .fill KAISER_USER_PGD_FILL,8,0
1244 +
1245 + NEXT_PAGE(level3_kernel_pgt)
1246 + .fill L3_START_KERNEL,8,0
1247 +diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
1248 +index 1423ab1b0312..f480b38a03c3 100644
1249 +--- a/arch/x86/kernel/irqinit.c
1250 ++++ b/arch/x86/kernel/irqinit.c
1251 +@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
1252 + .flags = IRQF_NO_THREAD,
1253 + };
1254 +
1255 +-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
1256 ++DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
1257 + [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
1258 + };
1259 +
1260 +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
1261 +index 5f70014ca602..8bc68cfc0d33 100644
1262 +--- a/arch/x86/kernel/ldt.c
1263 ++++ b/arch/x86/kernel/ldt.c
1264 +@@ -16,6 +16,7 @@
1265 + #include <linux/slab.h>
1266 + #include <linux/vmalloc.h>
1267 + #include <linux/uaccess.h>
1268 ++#include <linux/kaiser.h>
1269 +
1270 + #include <asm/ldt.h>
1271 + #include <asm/desc.h>
1272 +@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
1273 + set_ldt(pc->ldt->entries, pc->ldt->size);
1274 + }
1275 +
1276 ++static void __free_ldt_struct(struct ldt_struct *ldt)
1277 ++{
1278 ++ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
1279 ++ vfree(ldt->entries);
1280 ++ else
1281 ++ free_page((unsigned long)ldt->entries);
1282 ++ kfree(ldt);
1283 ++}
1284 ++
1285 + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
1286 + static struct ldt_struct *alloc_ldt_struct(int size)
1287 + {
1288 + struct ldt_struct *new_ldt;
1289 + int alloc_size;
1290 ++ int ret;
1291 +
1292 + if (size > LDT_ENTRIES)
1293 + return NULL;
1294 +@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
1295 + return NULL;
1296 + }
1297 +
1298 ++ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
1299 ++ __PAGE_KERNEL);
1300 + new_ldt->size = size;
1301 ++ if (ret) {
1302 ++ __free_ldt_struct(new_ldt);
1303 ++ return NULL;
1304 ++ }
1305 + return new_ldt;
1306 + }
1307 +
1308 +@@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
1309 + if (likely(!ldt))
1310 + return;
1311 +
1312 ++ kaiser_remove_mapping((unsigned long)ldt->entries,
1313 ++ ldt->size * LDT_ENTRY_SIZE);
1314 + paravirt_free_ldt(ldt->entries, ldt->size);
1315 +- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
1316 +- vfree(ldt->entries);
1317 +- else
1318 +- free_page((unsigned long)ldt->entries);
1319 +- kfree(ldt);
1320 ++ __free_ldt_struct(ldt);
1321 + }
1322 +
1323 + /*
1324 +diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
1325 +index bb3840cedb4f..ee43b36075c7 100644
1326 +--- a/arch/x86/kernel/paravirt_patch_64.c
1327 ++++ b/arch/x86/kernel/paravirt_patch_64.c
1328 +@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
1329 + DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
1330 + DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
1331 + DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
1332 +-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
1333 + DEF_NATIVE(pv_cpu_ops, clts, "clts");
1334 + DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
1335 +
1336 +@@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
1337 + PATCH_SITE(pv_mmu_ops, read_cr3);
1338 + PATCH_SITE(pv_mmu_ops, write_cr3);
1339 + PATCH_SITE(pv_cpu_ops, clts);
1340 +- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
1341 + PATCH_SITE(pv_cpu_ops, wbinvd);
1342 + #if defined(CONFIG_PARAVIRT_SPINLOCKS)
1343 + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
1344 +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
1345 +index 8e10e72bf6ee..a55b32007785 100644
1346 +--- a/arch/x86/kernel/process.c
1347 ++++ b/arch/x86/kernel/process.c
1348 +@@ -41,7 +41,7 @@
1349 + * section. Since TSS's are completely CPU-local, we want them
1350 + * on exact cacheline boundaries, to eliminate cacheline ping-pong.
1351 + */
1352 +-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
1353 ++__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
1354 + .x86_tss = {
1355 + .sp0 = TOP_OF_INIT_STACK,
1356 + #ifdef CONFIG_X86_32
1357 +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
1358 +index feaab07fa124..6b55012d02a3 100644
1359 +--- a/arch/x86/kernel/setup.c
1360 ++++ b/arch/x86/kernel/setup.c
1361 +@@ -114,6 +114,7 @@
1362 + #include <asm/microcode.h>
1363 + #include <asm/mmu_context.h>
1364 + #include <asm/kaslr.h>
1365 ++#include <asm/kaiser.h>
1366 +
1367 + /*
1368 + * max_low_pfn_mapped: highest direct mapped pfn under 4GB
1369 +@@ -1019,6 +1020,12 @@ void __init setup_arch(char **cmdline_p)
1370 + */
1371 + init_hypervisor_platform();
1372 +
1373 ++ /*
1374 ++ * This needs to happen right after XENPV is set on xen and
1375 ++ * kaiser_enabled is checked below in cleanup_highmap().
1376 ++ */
1377 ++ kaiser_check_boottime_disable();
1378 ++
1379 + x86_init.resources.probe_roms();
1380 +
1381 + /* after parse_early_param, so could debug it */
1382 +diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
1383 +index 1c113db9ed57..2bb5ee464df3 100644
1384 +--- a/arch/x86/kernel/tracepoint.c
1385 ++++ b/arch/x86/kernel/tracepoint.c
1386 +@@ -9,10 +9,12 @@
1387 + #include <linux/atomic.h>
1388 +
1389 + atomic_t trace_idt_ctr = ATOMIC_INIT(0);
1390 ++__aligned(PAGE_SIZE)
1391 + struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
1392 + (unsigned long) trace_idt_table };
1393 +
1394 + /* No need to be aligned, but done to keep all IDTs defined the same way. */
1395 ++__aligned(PAGE_SIZE)
1396 + gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
1397 +
1398 + static int trace_irq_vector_refcount;
1399 +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
1400 +index 7e28e6c877d9..73304b1a03cc 100644
1401 +--- a/arch/x86/kvm/x86.c
1402 ++++ b/arch/x86/kvm/x86.c
1403 +@@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1404 + return 1;
1405 +
1406 + /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
1407 +- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1408 ++ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
1409 ++ !is_long_mode(vcpu))
1410 + return 1;
1411 + }
1412 +
1413 +diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
1414 +index 5cc78bf57232..3261abb21ef4 100644
1415 +--- a/arch/x86/lib/cmdline.c
1416 ++++ b/arch/x86/lib/cmdline.c
1417 +@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
1418 + return 0; /* Buffer overrun */
1419 + }
1420 +
1421 ++/*
1422 ++ * Find a non-boolean option (i.e. option=argument). In accordance with
1423 ++ * standard Linux practice, if this option is repeated, this returns the
1424 ++ * last instance on the command line.
1425 ++ *
1426 ++ * @cmdline: the cmdline string
1427 ++ * @max_cmdline_size: the maximum size of cmdline
1428 ++ * @option: option string to look for
1429 ++ * @buffer: memory buffer to return the option argument
1430 ++ * @bufsize: size of the supplied memory buffer
1431 ++ *
1432 ++ * Returns the length of the argument (regardless of whether it was
1433 ++ * truncated to fit in the buffer), or -1 if the option was not found.
1434 ++ */
1435 ++static int
1436 ++__cmdline_find_option(const char *cmdline, int max_cmdline_size,
1437 ++ const char *option, char *buffer, int bufsize)
1438 ++{
1439 ++ char c;
1440 ++ int pos = 0, len = -1;
1441 ++ const char *opptr = NULL;
1442 ++ char *bufptr = buffer;
1443 ++ enum {
1444 ++ st_wordstart = 0, /* Start of word/after whitespace */
1445 ++ st_wordcmp, /* Comparing this word */
1446 ++ st_wordskip, /* Miscompare, skip */
1447 ++ st_bufcpy, /* Copying this to buffer */
1448 ++ } state = st_wordstart;
1449 ++
1450 ++ if (!cmdline)
1451 ++ return -1; /* No command line */
1452 ++
1453 ++ /*
1454 ++ * This 'pos' check ensures we do not overrun
1455 ++ * a non-NULL-terminated 'cmdline'
1456 ++ */
1457 ++ while (pos++ < max_cmdline_size) {
1458 ++ c = *(char *)cmdline++;
1459 ++ if (!c)
1460 ++ break;
1461 ++
1462 ++ switch (state) {
1463 ++ case st_wordstart:
1464 ++ if (myisspace(c))
1465 ++ break;
1466 ++
1467 ++ state = st_wordcmp;
1468 ++ opptr = option;
1469 ++ /* fall through */
1470 ++
1471 ++ case st_wordcmp:
1472 ++ if ((c == '=') && !*opptr) {
1473 ++ /*
1474 ++ * We matched all the way to the end of the
1475 ++ * option we were looking for, prepare to
1476 ++ * copy the argument.
1477 ++ */
1478 ++ len = 0;
1479 ++ bufptr = buffer;
1480 ++ state = st_bufcpy;
1481 ++ break;
1482 ++ } else if (c == *opptr++) {
1483 ++ /*
1484 ++ * We are currently matching, so continue
1485 ++ * to the next character on the cmdline.
1486 ++ */
1487 ++ break;
1488 ++ }
1489 ++ state = st_wordskip;
1490 ++ /* fall through */
1491 ++
1492 ++ case st_wordskip:
1493 ++ if (myisspace(c))
1494 ++ state = st_wordstart;
1495 ++ break;
1496 ++
1497 ++ case st_bufcpy:
1498 ++ if (myisspace(c)) {
1499 ++ state = st_wordstart;
1500 ++ } else {
1501 ++ /*
1502 ++ * Increment len, but don't overrun the
1503 ++ * supplied buffer and leave room for the
1504 ++ * NULL terminator.
1505 ++ */
1506 ++ if (++len < bufsize)
1507 ++ *bufptr++ = c;
1508 ++ }
1509 ++ break;
1510 ++ }
1511 ++ }
1512 ++
1513 ++ if (bufsize)
1514 ++ *bufptr = '\0';
1515 ++
1516 ++ return len;
1517 ++}
1518 ++
1519 + int cmdline_find_option_bool(const char *cmdline, const char *option)
1520 + {
1521 + return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
1522 + }
1523 ++
1524 ++int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
1525 ++ int bufsize)
1526 ++{
1527 ++ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
1528 ++ buffer, bufsize);
1529 ++}
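A hedged usage sketch of the new helper (not part of the patch): the caller supplies a buffer, gets back the argument length or -1 when the option is absent, and the buffer comes back NUL-terminated. The function below is invented for illustration; it assumes boot_command_line as declared in <linux/init.h>.

#include <linux/init.h>     /* boot_command_line (assumed declaration site) */
#include <linux/string.h>
#include <asm/cmdline.h>

/* Invented example: report how "pti=" was set on the boot command line.
 * Returns 1 for "on", -1 for "off", 0 otherwise. */
static int __init example_parse_pti(void)
{
    char arg[8];
    int len = cmdline_find_option(boot_command_line, "pti",
                                  arg, sizeof(arg));

    if (len < 0)
        return 0;                 /* no "pti=" given */
    if (!strncmp(arg, "off", 3))
        return -1;
    if (!strncmp(arg, "on", 2))
        return 1;
    return 0;                     /* "auto" or anything else */
}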
1530 +diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
1531 +index 96d2b847e09e..c548b46100cb 100644
1532 +--- a/arch/x86/mm/Makefile
1533 ++++ b/arch/x86/mm/Makefile
1534 +@@ -37,5 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
1535 +
1536 + obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
1537 + obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
1538 +-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
1539 +-
1540 ++obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
1541 ++obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o
1542 +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
1543 +index 0381638168d1..1e779bca4f3e 100644
1544 +--- a/arch/x86/mm/init.c
1545 ++++ b/arch/x86/mm/init.c
1546 +@@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void)
1547 + cr4_set_bits_and_update_boot(X86_CR4_PSE);
1548 +
1549 + /* Enable PGE if available */
1550 +- if (boot_cpu_has(X86_FEATURE_PGE)) {
1551 ++ if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) {
1552 + cr4_set_bits_and_update_boot(X86_CR4_PGE);
1553 + __supported_pte_mask |= _PAGE_GLOBAL;
1554 + } else
1555 +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
1556 +index 3e27ded6ac65..7df8e3a79dc0 100644
1557 +--- a/arch/x86/mm/init_64.c
1558 ++++ b/arch/x86/mm/init_64.c
1559 +@@ -324,6 +324,16 @@ void __init cleanup_highmap(void)
1560 + continue;
1561 + if (vaddr < (unsigned long) _text || vaddr > end)
1562 + set_pmd(pmd, __pmd(0));
1563 ++ else if (kaiser_enabled) {
1564 ++ /*
1565 ++ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
1566 ++ * clear that now. This is not important, so long as
1567 ++ * CR4.PGE remains clear, but it removes an anomaly.
1568 ++ * Physical mapping setup below avoids _PAGE_GLOBAL
1569 ++ * by use of massage_pgprot() inside pfn_pte() etc.
1570 ++ */
1571 ++ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
1572 ++ }
1573 + }
1574 + }
1575 +
1576 +diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
1577 +new file mode 100644
1578 +index 000000000000..d8376b4ad9f0
1579 +--- /dev/null
1580 ++++ b/arch/x86/mm/kaiser.c
1581 +@@ -0,0 +1,455 @@
1582 ++#include <linux/bug.h>
1583 ++#include <linux/kernel.h>
1584 ++#include <linux/errno.h>
1585 ++#include <linux/string.h>
1586 ++#include <linux/types.h>
1587 ++#include <linux/bug.h>
1588 ++#include <linux/init.h>
1589 ++#include <linux/interrupt.h>
1590 ++#include <linux/spinlock.h>
1591 ++#include <linux/mm.h>
1592 ++#include <linux/uaccess.h>
1593 ++
1594 ++#undef pr_fmt
1595 ++#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
1596 ++
1597 ++#include <asm/kaiser.h>
1598 ++#include <asm/tlbflush.h> /* to verify its kaiser declarations */
1599 ++#include <asm/pgtable.h>
1600 ++#include <asm/pgalloc.h>
1601 ++#include <asm/desc.h>
1602 ++#include <asm/cmdline.h>
1603 ++
1604 ++int kaiser_enabled __read_mostly = 1;
1605 ++EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
1606 ++
1607 ++__visible
1608 ++DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
1609 ++
1610 ++/*
1611 ++ * These can have bit 63 set, so we can not just use a plain "or"
1612 ++ * instruction to get their value or'd into CR3. It would take
1613 ++ * another register. So, we use a memory reference to these instead.
1614 ++ *
1615 ++ * This is also handy because systems that do not support PCIDs
1616 ++ * just end up or'ing a 0 into their CR3, which does no harm.
1617 ++ */
1618 ++DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
1619 ++
1620 ++/*
1621 ++ * At runtime, the only things we map are some things for CPU
1622 ++ * hotplug, and stacks for new processes. No two CPUs will ever
1623 ++ * be populating the same addresses, so we only need to ensure
1624 ++ * that we protect between two CPUs trying to allocate and
1625 ++ * populate the same page table page.
1626 ++ *
1627 ++ * Only take this lock when doing a set_p[4um]d(), but it is not
1628 ++ * needed for doing a set_pte(). We assume that only the *owner*
1629 ++ * of a given allocation will be doing this for _their_
1630 ++ * allocation.
1631 ++ *
1632 ++ * This ensures that once a system has been running for a while
1633 ++ * and there have been stacks all over and these page tables
1634 ++ * are fully populated, there will be no further acquisitions of
1635 ++ * this lock.
1636 ++ */
1637 ++static DEFINE_SPINLOCK(shadow_table_allocation_lock);
1638 ++
1639 ++/*
1640 ++ * Returns -1 on error.
1641 ++ */
1642 ++static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
1643 ++{
1644 ++ pgd_t *pgd;
1645 ++ pud_t *pud;
1646 ++ pmd_t *pmd;
1647 ++ pte_t *pte;
1648 ++
1649 ++ pgd = pgd_offset_k(vaddr);
1650 ++ /*
1651 ++ * We made all the kernel PGDs present in kaiser_init().
1652 ++ * We expect them to stay that way.
1653 ++ */
1654 ++ BUG_ON(pgd_none(*pgd));
1655 ++ /*
1656 ++ * PGDs are either 512GB or 128TB on all x86_64
1657 ++ * configurations. We don't handle these.
1658 ++ */
1659 ++ BUG_ON(pgd_large(*pgd));
1660 ++
1661 ++ pud = pud_offset(pgd, vaddr);
1662 ++ if (pud_none(*pud)) {
1663 ++ WARN_ON_ONCE(1);
1664 ++ return -1;
1665 ++ }
1666 ++
1667 ++ if (pud_large(*pud))
1668 ++ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
1669 ++
1670 ++ pmd = pmd_offset(pud, vaddr);
1671 ++ if (pmd_none(*pmd)) {
1672 ++ WARN_ON_ONCE(1);
1673 ++ return -1;
1674 ++ }
1675 ++
1676 ++ if (pmd_large(*pmd))
1677 ++ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
1678 ++
1679 ++ pte = pte_offset_kernel(pmd, vaddr);
1680 ++ if (pte_none(*pte)) {
1681 ++ WARN_ON_ONCE(1);
1682 ++ return -1;
1683 ++ }
1684 ++
1685 ++ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
1686 ++}
1687 ++
1688 ++/*
1689 ++ * This is a relatively normal page table walk, except that it
1690 ++ * also tries to allocate page table pages along the way.
1691 ++ *
1692 ++ * Returns a pointer to a PTE on success, or NULL on failure.
1693 ++ */
1694 ++static pte_t *kaiser_pagetable_walk(unsigned long address)
1695 ++{
1696 ++ pmd_t *pmd;
1697 ++ pud_t *pud;
1698 ++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
1699 ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
1700 ++
1701 ++ if (pgd_none(*pgd)) {
1702 ++ WARN_ONCE(1, "All shadow pgds should have been populated");
1703 ++ return NULL;
1704 ++ }
1705 ++ BUILD_BUG_ON(pgd_large(*pgd) != 0);
1706 ++
1707 ++ pud = pud_offset(pgd, address);
1708 ++ /* The shadow page tables do not use large mappings: */
1709 ++ if (pud_large(*pud)) {
1710 ++ WARN_ON(1);
1711 ++ return NULL;
1712 ++ }
1713 ++ if (pud_none(*pud)) {
1714 ++ unsigned long new_pmd_page = __get_free_page(gfp);
1715 ++ if (!new_pmd_page)
1716 ++ return NULL;
1717 ++ spin_lock(&shadow_table_allocation_lock);
1718 ++ if (pud_none(*pud)) {
1719 ++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
1720 ++ __inc_zone_page_state(virt_to_page((void *)
1721 ++ new_pmd_page), NR_KAISERTABLE);
1722 ++ } else
1723 ++ free_page(new_pmd_page);
1724 ++ spin_unlock(&shadow_table_allocation_lock);
1725 ++ }
1726 ++
1727 ++ pmd = pmd_offset(pud, address);
1728 ++ /* The shadow page tables do not use large mappings: */
1729 ++ if (pmd_large(*pmd)) {
1730 ++ WARN_ON(1);
1731 ++ return NULL;
1732 ++ }
1733 ++ if (pmd_none(*pmd)) {
1734 ++ unsigned long new_pte_page = __get_free_page(gfp);
1735 ++ if (!new_pte_page)
1736 ++ return NULL;
1737 ++ spin_lock(&shadow_table_allocation_lock);
1738 ++ if (pmd_none(*pmd)) {
1739 ++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
1740 ++ __inc_zone_page_state(virt_to_page((void *)
1741 ++ new_pte_page), NR_KAISERTABLE);
1742 ++ } else
1743 ++ free_page(new_pte_page);
1744 ++ spin_unlock(&shadow_table_allocation_lock);
1745 ++ }
1746 ++
1747 ++ return pte_offset_kernel(pmd, address);
1748 ++}
1749 ++
1750 ++static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
1751 ++ unsigned long flags)
1752 ++{
1753 ++ int ret = 0;
1754 ++ pte_t *pte;
1755 ++ unsigned long start_addr = (unsigned long)__start_addr;
1756 ++ unsigned long address = start_addr & PAGE_MASK;
1757 ++ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
1758 ++ unsigned long target_address;
1759 ++
1760 ++ /*
1761 ++ * It is convenient for callers to pass in __PAGE_KERNEL etc,
1762 ++ * and there is no actual harm from setting _PAGE_GLOBAL, so
1763 ++ * long as CR4.PGE is not set. But it is nonetheless troubling
1764 ++ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
1765 ++ * requires that not to be #defined to 0): so mask it off here.
1766 ++ */
1767 ++ flags &= ~_PAGE_GLOBAL;
1768 ++
1769 ++ for (; address < end_addr; address += PAGE_SIZE) {
1770 ++ target_address = get_pa_from_mapping(address);
1771 ++ if (target_address == -1) {
1772 ++ ret = -EIO;
1773 ++ break;
1774 ++ }
1775 ++ pte = kaiser_pagetable_walk(address);
1776 ++ if (!pte) {
1777 ++ ret = -ENOMEM;
1778 ++ break;
1779 ++ }
1780 ++ if (pte_none(*pte)) {
1781 ++ set_pte(pte, __pte(flags | target_address));
1782 ++ } else {
1783 ++ pte_t tmp;
1784 ++ set_pte(&tmp, __pte(flags | target_address));
1785 ++ WARN_ON_ONCE(!pte_same(*pte, tmp));
1786 ++ }
1787 ++ }
1788 ++ return ret;
1789 ++}
1790 ++
1791 ++static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
1792 ++{
1793 ++ unsigned long size = end - start;
1794 ++
1795 ++ return kaiser_add_user_map(start, size, flags);
1796 ++}
1797 ++
1798 ++/*
1799 ++ * Ensure that the top level of the (shadow) page tables are
1800 ++ * entirely populated. This ensures that all processes that get
1801 ++ * forked have the same entries. This way, we do not have to
1802 ++ * ever go set up new entries in older processes.
1803 ++ *
1804 ++ * Note: we never free these, so there are no updates to them
1805 ++ * after this.
1806 ++ */
1807 ++static void __init kaiser_init_all_pgds(void)
1808 ++{
1809 ++ pgd_t *pgd;
1810 ++ int i = 0;
1811 ++
1812 ++ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
1813 ++ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
1814 ++ pgd_t new_pgd;
1815 ++ pud_t *pud = pud_alloc_one(&init_mm,
1816 ++ PAGE_OFFSET + i * PGDIR_SIZE);
1817 ++ if (!pud) {
1818 ++ WARN_ON(1);
1819 ++ break;
1820 ++ }
1821 ++ inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
1822 ++ new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
1823 ++ /*
1824 ++ * Make sure not to stomp on some other pgd entry.
1825 ++ */
1826 ++ if (!pgd_none(pgd[i])) {
1827 ++ WARN_ON(1);
1828 ++ continue;
1829 ++ }
1830 ++ set_pgd(pgd + i, new_pgd);
1831 ++ }
1832 ++}
1833 ++
1834 ++#define kaiser_add_user_map_early(start, size, flags) do { \
1835 ++ int __ret = kaiser_add_user_map(start, size, flags); \
1836 ++ WARN_ON(__ret); \
1837 ++} while (0)
1838 ++
1839 ++#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
1840 ++ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
1841 ++ WARN_ON(__ret); \
1842 ++} while (0)
1843 ++
1844 ++void __init kaiser_check_boottime_disable(void)
1845 ++{
1846 ++ bool enable = true;
1847 ++ char arg[5];
1848 ++ int ret;
1849 ++
1850 ++ if (boot_cpu_has(X86_FEATURE_XENPV))
1851 ++ goto silent_disable;
1852 ++
1853 ++ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
1854 ++ if (ret > 0) {
1855 ++ if (!strncmp(arg, "on", 2))
1856 ++ goto enable;
1857 ++
1858 ++ if (!strncmp(arg, "off", 3))
1859 ++ goto disable;
1860 ++
1861 ++ if (!strncmp(arg, "auto", 4))
1862 ++ goto skip;
1863 ++ }
1864 ++
1865 ++ if (cmdline_find_option_bool(boot_command_line, "nopti"))
1866 ++ goto disable;
1867 ++
1868 ++skip:
1869 ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1870 ++ goto disable;
1871 ++
1872 ++enable:
1873 ++ if (enable)
1874 ++ setup_force_cpu_cap(X86_FEATURE_KAISER);
1875 ++
1876 ++ return;
1877 ++
1878 ++disable:
1879 ++ pr_info("disabled\n");
1880 ++
1881 ++silent_disable:
1882 ++ kaiser_enabled = 0;
1883 ++ setup_clear_cpu_cap(X86_FEATURE_KAISER);
1884 ++}
1885 ++
1886 ++/*
1887 ++ * If anything in here fails, we will likely die on one of the
1888 ++ * first kernel->user transitions and init will die. But, we
1889 ++ * will have most of the kernel up by then and should be able to
1890 ++ * get a clean warning out of it. If we BUG_ON() here, we run
1891 ++ * the risk of crashing before we have good console output.
1892 ++ */
1893 ++void __init kaiser_init(void)
1894 ++{
1895 ++ int cpu;
1896 ++
1897 ++ if (!kaiser_enabled)
1898 ++ return;
1899 ++
1900 ++ kaiser_init_all_pgds();
1901 ++
1902 ++ for_each_possible_cpu(cpu) {
1903 ++ void *percpu_vaddr = __per_cpu_user_mapped_start +
1904 ++ per_cpu_offset(cpu);
1905 ++ unsigned long percpu_sz = __per_cpu_user_mapped_end -
1906 ++ __per_cpu_user_mapped_start;
1907 ++ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
1908 ++ __PAGE_KERNEL);
1909 ++ }
1910 ++
1911 ++ /*
1912 ++ * Map the entry/exit text section, which is needed at
1913 ++ * switches from user to and from kernel.
1914 ++ */
1915 ++ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
1916 ++ __PAGE_KERNEL_RX);
1917 ++
1918 ++#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
1919 ++ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
1920 ++ __irqentry_text_end,
1921 ++ __PAGE_KERNEL_RX);
1922 ++#endif
1923 ++ kaiser_add_user_map_early((void *)idt_descr.address,
1924 ++ sizeof(gate_desc) * NR_VECTORS,
1925 ++ __PAGE_KERNEL_RO);
1926 ++#ifdef CONFIG_TRACING
1927 ++ kaiser_add_user_map_early(&trace_idt_descr,
1928 ++ sizeof(trace_idt_descr),
1929 ++ __PAGE_KERNEL);
1930 ++ kaiser_add_user_map_early(&trace_idt_table,
1931 ++ sizeof(gate_desc) * NR_VECTORS,
1932 ++ __PAGE_KERNEL);
1933 ++#endif
1934 ++ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
1935 ++ __PAGE_KERNEL);
1936 ++ kaiser_add_user_map_early(&debug_idt_table,
1937 ++ sizeof(gate_desc) * NR_VECTORS,
1938 ++ __PAGE_KERNEL);
1939 ++
1940 ++ pr_info("enabled\n");
1941 ++}
1942 ++
1943 ++/* Add a mapping to the shadow mapping, and synchronize the mappings */
1944 ++int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
1945 ++{
1946 ++ if (!kaiser_enabled)
1947 ++ return 0;
1948 ++ return kaiser_add_user_map((const void *)addr, size, flags);
1949 ++}
1950 ++
1951 ++void kaiser_remove_mapping(unsigned long start, unsigned long size)
1952 ++{
1953 ++ extern void unmap_pud_range_nofree(pgd_t *pgd,
1954 ++ unsigned long start, unsigned long end);
1955 ++ unsigned long end = start + size;
1956 ++ unsigned long addr, next;
1957 ++ pgd_t *pgd;
1958 ++
1959 ++ if (!kaiser_enabled)
1960 ++ return;
1961 ++ pgd = native_get_shadow_pgd(pgd_offset_k(start));
1962 ++ for (addr = start; addr < end; pgd++, addr = next) {
1963 ++ next = pgd_addr_end(addr, end);
1964 ++ unmap_pud_range_nofree(pgd, addr, next);
1965 ++ }
1966 ++}
1967 ++
1968 ++/*
1969 ++ * Page table pages are page-aligned. The lower half of the top
1970 ++ * level is used for userspace and the top half for the kernel.
1971 ++ * This returns true for user pages that need to get copied into
1972 ++ * both the user and kernel copies of the page tables, and false
1973 ++ * for kernel pages that should only be in the kernel copy.
1974 ++ */
1975 ++static inline bool is_userspace_pgd(pgd_t *pgdp)
1976 ++{
1977 ++ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
1978 ++}
1979 ++
1980 ++pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
1981 ++{
1982 ++ if (!kaiser_enabled)
1983 ++ return pgd;
1984 ++ /*
1985 ++ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
1986 ++ * skip cases like kexec and EFI which make temporary low mappings.
1987 ++ */
1988 ++ if (pgd.pgd & _PAGE_USER) {
1989 ++ if (is_userspace_pgd(pgdp)) {
1990 ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
1991 ++ /*
1992 ++ * Even if the entry is *mapping* userspace, ensure
1993 ++ * that userspace can not use it. This way, if we
1994 ++ * get out to userspace running on the kernel CR3,
1995 ++ * userspace will crash instead of running.
1996 ++ */
1997 ++ if (__supported_pte_mask & _PAGE_NX)
1998 ++ pgd.pgd |= _PAGE_NX;
1999 ++ }
2000 ++ } else if (!pgd.pgd) {
2001 ++ /*
2002 ++ * pgd_clear() cannot check _PAGE_USER, and is even used to
2003 ++ * clear corrupted pgd entries: so just rely on cases like
2004 ++ * kexec and EFI never to be using pgd_clear().
2005 ++ */
2006 ++ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
2007 ++ is_userspace_pgd(pgdp))
2008 ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
2009 ++ }
2010 ++ return pgd;
2011 ++}
2012 ++
2013 ++void kaiser_setup_pcid(void)
2014 ++{
2015 ++ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
2016 ++
2017 ++ if (this_cpu_has(X86_FEATURE_PCID))
2018 ++ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
2019 ++ /*
2020 ++ * These variables are used by the entry/exit
2021 ++ * code to change PCID and pgd and TLB flushing.
2022 ++ */
2023 ++ this_cpu_write(x86_cr3_pcid_user, user_cr3);
2024 ++}
2025 ++
2026 ++/*
2027 ++ * Make a note that this cpu will need to flush USER tlb on return to user.
2028 ++ * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
2029 ++ */
2030 ++void kaiser_flush_tlb_on_return_to_user(void)
2031 ++{
2032 ++ if (this_cpu_has(X86_FEATURE_PCID))
2033 ++ this_cpu_write(x86_cr3_pcid_user,
2034 ++ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
2035 ++}
2036 ++EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
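
kaiser_check_boottime_disable() above fixes the precedence of the boot options: Xen PV guests are silently disabled, an explicit pti=on / pti=off / pti=auto argument is honoured first (so nopti is only consulted when no pti= argument was given at all), and the auto/default path switches isolation off on AMD CPUs. A user-space restatement of that decision, where pti_should_enable() is a hypothetical helper written only for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    enum pti_choice { PTI_NONE, PTI_FORCE_ON, PTI_FORCE_OFF, PTI_AUTO };

    /* Same precedence as kaiser_check_boottime_disable(), as plain logic. */
    static bool pti_should_enable(enum pti_choice arg, bool nopti,
                                  bool xen_pv, bool cpu_is_amd)
    {
        if (xen_pv)
            return false;            /* silently disabled under Xen PV */
        if (arg == PTI_FORCE_ON)
            return true;             /* pti=on wins over everything else */
        if (arg == PTI_FORCE_OFF)
            return false;            /* pti=off */
        if (arg == PTI_NONE && nopti)
            return false;            /* nopti only matters without pti= */
        /* pti=auto or no option at all: on unless the CPU is AMD */
        return !cpu_is_amd;
    }

    int main(void)
    {
        printf("%d\n", pti_should_enable(PTI_NONE, false, false, false));    /* 1 */
        printf("%d\n", pti_should_enable(PTI_AUTO, true,  false, false));    /* 1: pti=auto beats nopti */
        printf("%d\n", pti_should_enable(PTI_NONE, false, false, true));     /* 0: AMD defaults to off */
        printf("%d\n", pti_should_enable(PTI_FORCE_ON, true, false, true));  /* 1: forced on */
        return 0;
    }

(The AMD default here reflects this 4.9.75 backport; it is a property of the patch above, not of every kernel version.)
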
2037 +diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
2038 +index aed206475aa7..319183d93602 100644
2039 +--- a/arch/x86/mm/kaslr.c
2040 ++++ b/arch/x86/mm/kaslr.c
2041 +@@ -189,6 +189,6 @@ void __meminit init_trampoline(void)
2042 + *pud_tramp = *pud;
2043 + }
2044 +
2045 +- set_pgd(&trampoline_pgd_entry,
2046 +- __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
2047 ++ /* Avoid set_pgd(), in case it's complicated by CONFIG_PAGE_TABLE_ISOLATION */
2048 ++ trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp));
2049 + }
2050 +diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
2051 +index e3353c97d086..73dcb0e18c1b 100644
2052 +--- a/arch/x86/mm/pageattr.c
2053 ++++ b/arch/x86/mm/pageattr.c
2054 +@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
2055 + #define CPA_FLUSHTLB 1
2056 + #define CPA_ARRAY 2
2057 + #define CPA_PAGES_ARRAY 4
2058 ++#define CPA_FREE_PAGETABLES 8
2059 +
2060 + #ifdef CONFIG_PROC_FS
2061 + static unsigned long direct_pages_count[PG_LEVEL_NUM];
2062 +@@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
2063 + return 0;
2064 + }
2065 +
2066 +-static bool try_to_free_pte_page(pte_t *pte)
2067 ++static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
2068 + {
2069 + int i;
2070 +
2071 ++ if (!(cpa->flags & CPA_FREE_PAGETABLES))
2072 ++ return false;
2073 ++
2074 + for (i = 0; i < PTRS_PER_PTE; i++)
2075 + if (!pte_none(pte[i]))
2076 + return false;
2077 +@@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte)
2078 + return true;
2079 + }
2080 +
2081 +-static bool try_to_free_pmd_page(pmd_t *pmd)
2082 ++static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
2083 + {
2084 + int i;
2085 +
2086 ++ if (!(cpa->flags & CPA_FREE_PAGETABLES))
2087 ++ return false;
2088 ++
2089 + for (i = 0; i < PTRS_PER_PMD; i++)
2090 + if (!pmd_none(pmd[i]))
2091 + return false;
2092 +@@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd)
2093 + return true;
2094 + }
2095 +
2096 +-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
2097 ++static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
2098 ++ unsigned long start,
2099 ++ unsigned long end)
2100 + {
2101 + pte_t *pte = pte_offset_kernel(pmd, start);
2102 +
2103 +@@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
2104 + pte++;
2105 + }
2106 +
2107 +- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
2108 ++ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
2109 + pmd_clear(pmd);
2110 + return true;
2111 + }
2112 + return false;
2113 + }
2114 +
2115 +-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
2116 ++static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
2117 + unsigned long start, unsigned long end)
2118 + {
2119 +- if (unmap_pte_range(pmd, start, end))
2120 +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
2121 ++ if (unmap_pte_range(cpa, pmd, start, end))
2122 ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
2123 + pud_clear(pud);
2124 + }
2125 +
2126 +-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2127 ++static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
2128 ++ unsigned long start, unsigned long end)
2129 + {
2130 + pmd_t *pmd = pmd_offset(pud, start);
2131 +
2132 +@@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2133 + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
2134 + unsigned long pre_end = min_t(unsigned long, end, next_page);
2135 +
2136 +- __unmap_pmd_range(pud, pmd, start, pre_end);
2137 ++ __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
2138 +
2139 + start = pre_end;
2140 + pmd++;
2141 +@@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2142 + if (pmd_large(*pmd))
2143 + pmd_clear(pmd);
2144 + else
2145 +- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
2146 ++ __unmap_pmd_range(cpa, pud, pmd,
2147 ++ start, start + PMD_SIZE);
2148 +
2149 + start += PMD_SIZE;
2150 + pmd++;
2151 +@@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2152 + * 4K leftovers?
2153 + */
2154 + if (start < end)
2155 +- return __unmap_pmd_range(pud, pmd, start, end);
2156 ++ return __unmap_pmd_range(cpa, pud, pmd, start, end);
2157 +
2158 + /*
2159 + * Try again to free the PMD page if haven't succeeded above.
2160 + */
2161 + if (!pud_none(*pud))
2162 +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
2163 ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
2164 + pud_clear(pud);
2165 + }
2166 +
2167 +-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2168 ++static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
2169 ++ unsigned long start,
2170 ++ unsigned long end)
2171 + {
2172 + pud_t *pud = pud_offset(pgd, start);
2173 +
2174 +@@ -834,7 +847,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2175 + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
2176 + unsigned long pre_end = min_t(unsigned long, end, next_page);
2177 +
2178 +- unmap_pmd_range(pud, start, pre_end);
2179 ++ unmap_pmd_range(cpa, pud, start, pre_end);
2180 +
2181 + start = pre_end;
2182 + pud++;
2183 +@@ -848,7 +861,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2184 + if (pud_large(*pud))
2185 + pud_clear(pud);
2186 + else
2187 +- unmap_pmd_range(pud, start, start + PUD_SIZE);
2188 ++ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
2189 +
2190 + start += PUD_SIZE;
2191 + pud++;
2192 +@@ -858,7 +871,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2193 + * 2M leftovers?
2194 + */
2195 + if (start < end)
2196 +- unmap_pmd_range(pud, start, end);
2197 ++ unmap_pmd_range(cpa, pud, start, end);
2198 +
2199 + /*
2200 + * No need to try to free the PUD page because we'll free it in
2201 +@@ -866,6 +879,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2202 + */
2203 + }
2204 +
2205 ++static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2206 ++{
2207 ++ struct cpa_data cpa = {
2208 ++ .flags = CPA_FREE_PAGETABLES,
2209 ++ };
2210 ++
2211 ++ __unmap_pud_range(&cpa, pgd, start, end);
2212 ++}
2213 ++
2214 ++void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
2215 ++{
2216 ++ struct cpa_data cpa = {
2217 ++ .flags = 0,
2218 ++ };
2219 ++
2220 ++ __unmap_pud_range(&cpa, pgd, start, end);
2221 ++}
2222 ++
2223 + static int alloc_pte_page(pmd_t *pmd)
2224 + {
2225 + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
2226 +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
2227 +index 3feec5af4e67..5aaec8effc5f 100644
2228 +--- a/arch/x86/mm/pgtable.c
2229 ++++ b/arch/x86/mm/pgtable.c
2230 +@@ -344,14 +344,22 @@ static inline void _pgd_free(pgd_t *pgd)
2231 + kmem_cache_free(pgd_cache, pgd);
2232 + }
2233 + #else
2234 ++
2235 ++/*
2236 ++ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
2237 ++ * both 8k in size and 8k-aligned. That lets us just flip bit 12
2238 ++ * in a pointer to swap between the two 4k halves.
2239 ++ */
2240 ++#define PGD_ALLOCATION_ORDER kaiser_enabled
2241 ++
2242 + static inline pgd_t *_pgd_alloc(void)
2243 + {
2244 +- return (pgd_t *)__get_free_page(PGALLOC_GFP);
2245 ++ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
2246 + }
2247 +
2248 + static inline void _pgd_free(pgd_t *pgd)
2249 + {
2250 +- free_page((unsigned long)pgd);
2251 ++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
2252 + }
2253 + #endif /* CONFIG_X86_PAE */
2254 +
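
The PGD_ALLOCATION_ORDER comment above is the central trick of the pgd layout: with kaiser_enabled, _pgd_alloc() returns an order-1, 8 KiB-aligned block, so the kernel pgd page and its shadow (user) twin differ only in bit 12 of their address, which is the flip that native_get_shadow_pgd() performs elsewhere in this patch. A minimal user-space illustration of the address arithmetic (illustrative only, not kernel code):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        /* order-1, 8 KiB-aligned block standing in for the kernel pgd page */
        uint64_t *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);
        if (!pgd)
            return 1;

        /* the shadow half sits one page above: setting bit 12 is enough */
        uint64_t *shadow = (uint64_t *)((uintptr_t)pgd | PAGE_SIZE);

        printf("kernel pgd page: %p\n", (void *)pgd);
        printf("shadow pgd page: %p (= pgd + PAGE_SIZE)\n", (void *)shadow);

        free(pgd);
        return 0;
    }

Because the allocation is 8 KiB-aligned, bit 12 of the kernel half is always zero, so OR-ing in PAGE_SIZE and adding PAGE_SIZE are the same operation; when kaiser_enabled is 0 the order drops back to 0 and only the usual single 4 KiB page is allocated.
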
2255 +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
2256 +index 53b72fb4e781..41205de487e7 100644
2257 +--- a/arch/x86/mm/tlb.c
2258 ++++ b/arch/x86/mm/tlb.c
2259 +@@ -6,13 +6,14 @@
2260 + #include <linux/interrupt.h>
2261 + #include <linux/export.h>
2262 + #include <linux/cpu.h>
2263 ++#include <linux/debugfs.h>
2264 +
2265 + #include <asm/tlbflush.h>
2266 + #include <asm/mmu_context.h>
2267 + #include <asm/cache.h>
2268 + #include <asm/apic.h>
2269 + #include <asm/uv/uv.h>
2270 +-#include <linux/debugfs.h>
2271 ++#include <asm/kaiser.h>
2272 +
2273 + /*
2274 + * TLB flushing, formerly SMP-only
2275 +@@ -34,6 +35,36 @@ struct flush_tlb_info {
2276 + unsigned long flush_end;
2277 + };
2278 +
2279 ++static void load_new_mm_cr3(pgd_t *pgdir)
2280 ++{
2281 ++ unsigned long new_mm_cr3 = __pa(pgdir);
2282 ++
2283 ++ if (kaiser_enabled) {
2284 ++ /*
2285 ++ * We reuse the same PCID for different tasks, so we must
2286 ++ * flush all the entries for the PCID out when we change tasks.
2287 ++ * Flush KERN below, flush USER when returning to userspace in
2288 ++ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
2289 ++ *
2290 ++ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
2291 ++ * do it here, but can only be used if X86_FEATURE_INVPCID is
2292 ++ * available - and many machines support pcid without invpcid.
2293 ++ *
2294 ++ * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
2295 ++ * would be needed in the write_cr3() below - if PCIDs enabled.
2296 ++ */
2297 ++ BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
2298 ++ kaiser_flush_tlb_on_return_to_user();
2299 ++ }
2300 ++
2301 ++ /*
2302 ++ * Caution: many callers of this function expect
2303 ++ * that load_cr3() is serializing and orders TLB
2304 ++ * fills with respect to the mm_cpumask writes.
2305 ++ */
2306 ++ write_cr3(new_mm_cr3);
2307 ++}
2308 ++
2309 + /*
2310 + * We cannot call mmdrop() because we are in interrupt context,
2311 + * instead update mm->cpu_vm_mask.
2312 +@@ -45,7 +76,7 @@ void leave_mm(int cpu)
2313 + BUG();
2314 + if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
2315 + cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
2316 +- load_cr3(swapper_pg_dir);
2317 ++ load_new_mm_cr3(swapper_pg_dir);
2318 + /*
2319 + * This gets called in the idle path where RCU
2320 + * functions differently. Tracing normally
2321 +@@ -120,7 +151,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
2322 + * ordering guarantee we need.
2323 + *
2324 + */
2325 +- load_cr3(next->pgd);
2326 ++ load_new_mm_cr3(next->pgd);
2327 +
2328 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
2329 +
2330 +@@ -167,7 +198,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
2331 + * As above, load_cr3() is serializing and orders TLB
2332 + * fills with respect to the mm_cpumask write.
2333 + */
2334 +- load_cr3(next->pgd);
2335 ++ load_new_mm_cr3(next->pgd);
2336 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
2337 + load_mm_cr4(next);
2338 + load_mm_ldt(next);
2339 +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
2340 +index dc81e5287ebf..2e6000a4eb2c 100644
2341 +--- a/include/asm-generic/vmlinux.lds.h
2342 ++++ b/include/asm-generic/vmlinux.lds.h
2343 +@@ -778,7 +778,14 @@
2344 + */
2345 + #define PERCPU_INPUT(cacheline) \
2346 + VMLINUX_SYMBOL(__per_cpu_start) = .; \
2347 ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
2348 + *(.data..percpu..first) \
2349 ++ . = ALIGN(cacheline); \
2350 ++ *(.data..percpu..user_mapped) \
2351 ++ *(.data..percpu..user_mapped..shared_aligned) \
2352 ++ . = ALIGN(PAGE_SIZE); \
2353 ++ *(.data..percpu..user_mapped..page_aligned) \
2354 ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
2355 + . = ALIGN(PAGE_SIZE); \
2356 + *(.data..percpu..page_aligned) \
2357 + . = ALIGN(cacheline); \
2358 +diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
2359 +new file mode 100644
2360 +index 000000000000..58c55b1589d0
2361 +--- /dev/null
2362 ++++ b/include/linux/kaiser.h
2363 +@@ -0,0 +1,52 @@
2364 ++#ifndef _LINUX_KAISER_H
2365 ++#define _LINUX_KAISER_H
2366 ++
2367 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2368 ++#include <asm/kaiser.h>
2369 ++
2370 ++static inline int kaiser_map_thread_stack(void *stack)
2371 ++{
2372 ++ /*
2373 ++ * Map that page of kernel stack on which we enter from user context.
2374 ++ */
2375 ++ return kaiser_add_mapping((unsigned long)stack +
2376 ++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
2377 ++}
2378 ++
2379 ++static inline void kaiser_unmap_thread_stack(void *stack)
2380 ++{
2381 ++ /*
2382 ++ * Note: may be called even when kaiser_map_thread_stack() failed.
2383 ++ */
2384 ++ kaiser_remove_mapping((unsigned long)stack +
2385 ++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
2386 ++}
2387 ++#else
2388 ++
2389 ++/*
2390 ++ * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
2391 ++ * includes architectures that support KAISER, but have it disabled.
2392 ++ */
2393 ++
2394 ++static inline void kaiser_init(void)
2395 ++{
2396 ++}
2397 ++static inline int kaiser_add_mapping(unsigned long addr,
2398 ++ unsigned long size, unsigned long flags)
2399 ++{
2400 ++ return 0;
2401 ++}
2402 ++static inline void kaiser_remove_mapping(unsigned long start,
2403 ++ unsigned long size)
2404 ++{
2405 ++}
2406 ++static inline int kaiser_map_thread_stack(void *stack)
2407 ++{
2408 ++ return 0;
2409 ++}
2410 ++static inline void kaiser_unmap_thread_stack(void *stack)
2411 ++{
2412 ++}
2413 ++
2414 ++#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
2415 ++#endif /* _LINUX_KAISER_H */
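
kaiser_map_thread_stack() above maps only the topmost page of a task's kernel stack into the shadow tables, the page on which the entry code lands when coming in from user context; kernel/fork.c later in this patch calls it from dup_task_struct() and undoes it in free_thread_stack(). A small user-space sketch of the arithmetic (made-up stack address; THREAD_SIZE assumed to be 16 KiB, the usual x86_64 value without KASAN):

    #include <stdio.h>

    #define PAGE_SIZE   4096ULL
    #define THREAD_SIZE (4 * PAGE_SIZE)     /* assumed 16 KiB kernel stack */

    int main(void)
    {
        unsigned long long stack  = 0xffff880012340000ULL;  /* hypothetical base */
        unsigned long long mapped = stack + THREAD_SIZE - PAGE_SIZE;

        printf("stack base : %#llx\n", stack);
        printf("mapped page: %#llx (one PAGE_SIZE slice)\n", mapped);
        return 0;
    }
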
2416 +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
2417 +index fff21a82780c..490f5a83f947 100644
2418 +--- a/include/linux/mmzone.h
2419 ++++ b/include/linux/mmzone.h
2420 +@@ -124,8 +124,9 @@ enum zone_stat_item {
2421 + NR_SLAB_UNRECLAIMABLE,
2422 + NR_PAGETABLE, /* used for pagetables */
2423 + NR_KERNEL_STACK_KB, /* measured in KiB */
2424 +- /* Second 128 byte cacheline */
2425 ++ NR_KAISERTABLE,
2426 + NR_BOUNCE,
2427 ++ /* Second 128 byte cacheline */
2428 + #if IS_ENABLED(CONFIG_ZSMALLOC)
2429 + NR_ZSPAGES, /* allocated in zsmalloc */
2430 + #endif
2431 +diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
2432 +index 8f16299ca068..8902f23bb770 100644
2433 +--- a/include/linux/percpu-defs.h
2434 ++++ b/include/linux/percpu-defs.h
2435 +@@ -35,6 +35,12 @@
2436 +
2437 + #endif
2438 +
2439 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2440 ++#define USER_MAPPED_SECTION "..user_mapped"
2441 ++#else
2442 ++#define USER_MAPPED_SECTION ""
2443 ++#endif
2444 ++
2445 + /*
2446 + * Base implementations of per-CPU variable declarations and definitions, where
2447 + * the section in which the variable is to be placed is provided by the
2448 +@@ -115,6 +121,12 @@
2449 + #define DEFINE_PER_CPU(type, name) \
2450 + DEFINE_PER_CPU_SECTION(type, name, "")
2451 +
2452 ++#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
2453 ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
2454 ++
2455 ++#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
2456 ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
2457 ++
2458 + /*
2459 + * Declaration/definition used for per-CPU variables that must come first in
2460 + * the set of variables.
2461 +@@ -144,6 +156,14 @@
2462 + DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
2463 + ____cacheline_aligned_in_smp
2464 +
2465 ++#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
2466 ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
2467 ++ ____cacheline_aligned_in_smp
2468 ++
2469 ++#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
2470 ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
2471 ++ ____cacheline_aligned_in_smp
2472 ++
2473 + #define DECLARE_PER_CPU_ALIGNED(type, name) \
2474 + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
2475 + ____cacheline_aligned
2476 +@@ -162,11 +182,21 @@
2477 + #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
2478 + DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
2479 + __aligned(PAGE_SIZE)
2480 ++/*
2481 ++ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
2482 ++ */
2483 ++#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
2484 ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
2485 ++ __aligned(PAGE_SIZE)
2486 ++
2487 ++#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
2488 ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
2489 ++ __aligned(PAGE_SIZE)
2490 +
2491 + /*
2492 + * Declaration/definition used for per-CPU variables that must be read mostly.
2493 + */
2494 +-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
     ++#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
2495 ++#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
2496 + DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
2497 +
2498 + #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
2499 +diff --git a/init/main.c b/init/main.c
2500 +index 25bac88bc66e..99f026565608 100644
2501 +--- a/init/main.c
2502 ++++ b/init/main.c
2503 +@@ -80,6 +80,7 @@
2504 + #include <linux/integrity.h>
2505 + #include <linux/proc_ns.h>
2506 + #include <linux/io.h>
2507 ++#include <linux/kaiser.h>
2508 +
2509 + #include <asm/io.h>
2510 + #include <asm/bugs.h>
2511 +@@ -473,6 +474,7 @@ static void __init mm_init(void)
2512 + pgtable_init();
2513 + vmalloc_init();
2514 + ioremap_huge_init();
2515 ++ kaiser_init();
2516 + }
2517 +
2518 + asmlinkage __visible void __init start_kernel(void)
2519 +diff --git a/kernel/fork.c b/kernel/fork.c
2520 +index 9321b1ad3335..70e10cb49be0 100644
2521 +--- a/kernel/fork.c
2522 ++++ b/kernel/fork.c
2523 +@@ -58,6 +58,7 @@
2524 + #include <linux/tsacct_kern.h>
2525 + #include <linux/cn_proc.h>
2526 + #include <linux/freezer.h>
2527 ++#include <linux/kaiser.h>
2528 + #include <linux/delayacct.h>
2529 + #include <linux/taskstats_kern.h>
2530 + #include <linux/random.h>
2531 +@@ -213,6 +214,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
2532 +
2533 + static inline void free_thread_stack(struct task_struct *tsk)
2534 + {
2535 ++ kaiser_unmap_thread_stack(tsk->stack);
2536 + #ifdef CONFIG_VMAP_STACK
2537 + if (task_stack_vm_area(tsk)) {
2538 + unsigned long flags;
2539 +@@ -495,6 +497,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
2540 + * functions again.
2541 + */
2542 + tsk->stack = stack;
2543 ++
2544 ++ err = kaiser_map_thread_stack(tsk->stack);
2545 ++ if (err)
2546 ++ goto free_stack;
2547 + #ifdef CONFIG_VMAP_STACK
2548 + tsk->stack_vm_area = stack_vm_area;
2549 + #endif
2550 +diff --git a/mm/vmstat.c b/mm/vmstat.c
2551 +index 604f26a4f696..6a088df04b29 100644
2552 +--- a/mm/vmstat.c
2553 ++++ b/mm/vmstat.c
2554 +@@ -932,6 +932,7 @@ const char * const vmstat_text[] = {
2555 + "nr_slab_unreclaimable",
2556 + "nr_page_table_pages",
2557 + "nr_kernel_stack",
2558 ++ "nr_overhead",
2559 + "nr_bounce",
2560 + #if IS_ENABLED(CONFIG_ZSMALLOC)
2561 + "nr_zspages",
2562 +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
2563 +index 97f9cac98348..e86a34fd5484 100644
2564 +--- a/net/ipv4/tcp_bbr.c
2565 ++++ b/net/ipv4/tcp_bbr.c
2566 +@@ -843,6 +843,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk)
2567 + */
2568 + static u32 bbr_undo_cwnd(struct sock *sk)
2569 + {
2570 ++ struct bbr *bbr = inet_csk_ca(sk);
2571 ++
2572 ++ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
2573 ++ bbr->full_bw_cnt = 0;
2574 ++ bbr_reset_lt_bw_sampling(sk);
2575 + return tcp_sk(sk)->snd_cwnd;
2576 + }
2577 +
2578 +diff --git a/security/Kconfig b/security/Kconfig
2579 +index 118f4549404e..32f36b40e9f0 100644
2580 +--- a/security/Kconfig
2581 ++++ b/security/Kconfig
2582 +@@ -31,6 +31,16 @@ config SECURITY
2583 +
2584 + If you are unsure how to answer this question, answer N.
2585 +
2586 ++config PAGE_TABLE_ISOLATION
2587 ++ bool "Remove the kernel mapping in user mode"
2588 ++ default y
2589 ++ depends on X86_64 && SMP
2590 ++ help
2591 ++ This enforces a strict kernel and user space isolation, in order
2592 ++ to close hardware side channels on kernel address information.
2593 ++
2594 ++ If you are unsure how to answer this question, answer Y.
2595 ++
2596 + config SECURITYFS
2597 + bool "Enable the securityfs filesystem"
2598 + help
2599 +diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
2600 +index a39629206864..f79669a38c0c 100644
2601 +--- a/tools/arch/x86/include/asm/cpufeatures.h
2602 ++++ b/tools/arch/x86/include/asm/cpufeatures.h
2603 +@@ -197,6 +197,9 @@
2604 + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
2605 + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
2606 +
2607 ++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
2608 ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
2609 ++
2610 + /* Virtualization flags: Linux defined, word 8 */
2611 + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
2612 + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */