Gentoo Archives: gentoo-commits

From: Alice Ferrazzi <alicef@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.4 commit in: /
Date: Fri, 05 Jan 2018 15:05:55
Message-Id: 1515164738.c7d7705101af05e259b3b84ffc59a60ff2b96142.alicef@gentoo
1 commit: c7d7705101af05e259b3b84ffc59a60ff2b96142
2 Author: Alice Ferrazzi <alicef <AT> gentoo <DOT> org>
3 AuthorDate: Fri Jan 5 15:05:38 2018 +0000
4 Commit: Alice Ferrazzi <alicef <AT> gentoo <DOT> org>
5 CommitDate: Fri Jan 5 15:05:38 2018 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=c7d77051
7
8 Linux kernel 4.4.110
9
10 0000_README | 4 +
11 1109_linux-4.4.110.patch | 2814 ++++++++++++++++++++++++++++++++++++++++++++++
12 2 files changed, 2818 insertions(+)
13
14 diff --git a/0000_README b/0000_README
15 index 3be106c..46149de 100644
16 --- a/0000_README
17 +++ b/0000_README
18 @@ -479,6 +479,10 @@ Patch: 1108_linux-4.4.109.patch
19 From: http://www.kernel.org
20 Desc: Linux 4.4.109
21
22 +Patch: 1109_linux-4.4.110.patch
23 +From: http://www.kernel.org
24 +Desc: Linux 4.4.110
25 +
26 Patch: 1500_XATTR_USER_PREFIX.patch
27 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
28 Desc: Support for namespace user.pax.* on tmpfs.
29
30 diff --git a/1109_linux-4.4.110.patch b/1109_linux-4.4.110.patch
31 new file mode 100644
32 index 0000000..1c226ed
33 --- /dev/null
34 +++ b/1109_linux-4.4.110.patch
35 @@ -0,0 +1,2814 @@
36 +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
37 +index b4a83a490212..5977c4d71356 100644
38 +--- a/Documentation/kernel-parameters.txt
39 ++++ b/Documentation/kernel-parameters.txt
40 +@@ -2523,6 +2523,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
41 +
42 + nojitter [IA-64] Disables jitter checking for ITC timers.
43 +
44 ++ nopti [X86-64] Disable KAISER isolation of kernel from user.
45 ++
46 + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
47 +
48 + no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
49 +@@ -3054,6 +3056,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
50 + pt. [PARIDE]
51 + See Documentation/blockdev/paride.txt.
52 +
53 ++ pti= [X86_64]
54 ++ Control KAISER user/kernel address space isolation:
55 ++ on - enable
56 ++ off - disable
57 ++ auto - default setting
58 ++
59 + pty.legacy_count=
60 + [KNL] Number of legacy pty's. Overwrites compiled-in
61 + default number.
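
The nopti and pti= options documented in the hunk above are parsed from the kernel command line very early during boot. As a rough illustration only (the real check is kaiser_check_boottime_disable(), which later hunks in this patch declare in asm/kaiser.h and call from setup_arch(); its body lives in arch/x86/mm/kaiser.c, outside the portion of the patch shown on this page), the parsing could be done with the cmdline_find_option() helpers whose prototypes this patch adds to arch/x86/include/asm/cmdline.h:

/*
 * Illustrative sketch only -- not code from this patch.  The option names
 * and the X86_FEATURE_KAISER clearing follow what the patch describes;
 * details of the real helper differ.
 */
static void __init example_check_pti_cmdline(void)
{
	char arg[5];
	int ret;

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0 && !strncmp(arg, "off", 3))
		goto disable;

	return;			/* "on", "auto" or absent: leave KAISER enabled */

disable:
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}
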
62 +diff --git a/Makefile b/Makefile
63 +index 5d67056e24dd..b028c106535b 100644
64 +--- a/Makefile
65 ++++ b/Makefile
66 +@@ -1,6 +1,6 @@
67 + VERSION = 4
68 + PATCHLEVEL = 4
69 +-SUBLEVEL = 109
70 ++SUBLEVEL = 110
71 + EXTRAVERSION =
72 + NAME = Blurry Fish Butt
73 +
74 +diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
75 +index 3783dc3e10b3..4abb284a5b9c 100644
76 +--- a/arch/x86/boot/compressed/misc.h
77 ++++ b/arch/x86/boot/compressed/misc.h
78 +@@ -9,6 +9,7 @@
79 + */
80 + #undef CONFIG_PARAVIRT
81 + #undef CONFIG_PARAVIRT_SPINLOCKS
82 ++#undef CONFIG_PAGE_TABLE_ISOLATION
83 + #undef CONFIG_KASAN
84 +
85 + #include <linux/linkage.h>
86 +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
87 +index cc0f2f5da19b..952b23b5d4e9 100644
88 +--- a/arch/x86/entry/entry_64.S
89 ++++ b/arch/x86/entry/entry_64.S
90 +@@ -35,6 +35,7 @@
91 + #include <asm/asm.h>
92 + #include <asm/smap.h>
93 + #include <asm/pgtable_types.h>
94 ++#include <asm/kaiser.h>
95 + #include <linux/err.h>
96 +
97 + /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
98 +@@ -135,6 +136,7 @@ ENTRY(entry_SYSCALL_64)
99 + * it is too small to ever cause noticeable irq latency.
100 + */
101 + SWAPGS_UNSAFE_STACK
102 ++ SWITCH_KERNEL_CR3_NO_STACK
103 + /*
104 + * A hypervisor implementation might want to use a label
105 + * after the swapgs, so that it can do the swapgs
106 +@@ -207,9 +209,17 @@ entry_SYSCALL_64_fastpath:
107 + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
108 + jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
109 +
110 +- RESTORE_C_REGS_EXCEPT_RCX_R11
111 + movq RIP(%rsp), %rcx
112 + movq EFLAGS(%rsp), %r11
113 ++ RESTORE_C_REGS_EXCEPT_RCX_R11
114 ++ /*
115 ++ * This opens a window where we have a user CR3, but are
116 ++ * running in the kernel. This makes using the CS
117 ++ * register useless for telling whether or not we need to
118 ++ * switch CR3 in NMIs. Normal interrupts are OK because
119 ++ * they are off here.
120 ++ */
121 ++ SWITCH_USER_CR3
122 + movq RSP(%rsp), %rsp
123 + /*
124 + * 64-bit SYSRET restores rip from rcx,
125 +@@ -347,10 +357,26 @@ GLOBAL(int_ret_from_sys_call)
126 + syscall_return_via_sysret:
127 + /* rcx and r11 are already restored (see code above) */
128 + RESTORE_C_REGS_EXCEPT_RCX_R11
129 ++ /*
130 ++ * This opens a window where we have a user CR3, but are
131 ++ * running in the kernel. This makes using the CS
132 ++ * register useless for telling whether or not we need to
133 ++ * switch CR3 in NMIs. Normal interrupts are OK because
134 ++ * they are off here.
135 ++ */
136 ++ SWITCH_USER_CR3
137 + movq RSP(%rsp), %rsp
138 + USERGS_SYSRET64
139 +
140 + opportunistic_sysret_failed:
141 ++ /*
142 ++ * This opens a window where we have a user CR3, but are
143 ++ * running in the kernel. This makes using the CS
144 ++ * register useless for telling whether or not we need to
145 ++ * switch CR3 in NMIs. Normal interrupts are OK because
146 ++ * they are off here.
147 ++ */
148 ++ SWITCH_USER_CR3
149 + SWAPGS
150 + jmp restore_c_regs_and_iret
151 + END(entry_SYSCALL_64)
152 +@@ -509,6 +535,7 @@ END(irq_entries_start)
153 + * tracking that we're in kernel mode.
154 + */
155 + SWAPGS
156 ++ SWITCH_KERNEL_CR3
157 +
158 + /*
159 + * We need to tell lockdep that IRQs are off. We can't do this until
160 +@@ -568,6 +595,7 @@ GLOBAL(retint_user)
161 + mov %rsp,%rdi
162 + call prepare_exit_to_usermode
163 + TRACE_IRQS_IRETQ
164 ++ SWITCH_USER_CR3
165 + SWAPGS
166 + jmp restore_regs_and_iret
167 +
168 +@@ -625,6 +653,7 @@ native_irq_return_ldt:
169 + pushq %rax
170 + pushq %rdi
171 + SWAPGS
172 ++ SWITCH_KERNEL_CR3
173 + movq PER_CPU_VAR(espfix_waddr), %rdi
174 + movq %rax, (0*8)(%rdi) /* RAX */
175 + movq (2*8)(%rsp), %rax /* RIP */
176 +@@ -640,6 +669,7 @@ native_irq_return_ldt:
177 + andl $0xffff0000, %eax
178 + popq %rdi
179 + orq PER_CPU_VAR(espfix_stack), %rax
180 ++ SWITCH_USER_CR3
181 + SWAPGS
182 + movq %rax, %rsp
183 + popq %rax
184 +@@ -995,7 +1025,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
185 + /*
186 + * Save all registers in pt_regs, and switch gs if needed.
187 + * Use slow, but surefire "are we in kernel?" check.
188 +- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
189 ++ *
190 ++ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
191 ++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
192 ++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
193 ++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
194 + */
195 + ENTRY(paranoid_entry)
196 + cld
197 +@@ -1008,7 +1042,26 @@ ENTRY(paranoid_entry)
198 + js 1f /* negative -> in kernel */
199 + SWAPGS
200 + xorl %ebx, %ebx
201 +-1: ret
202 ++1:
203 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
204 ++ /*
205 ++ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
206 ++ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
207 ++ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
208 ++ * unconditionally, but we need to find out whether the reverse
209 ++ * should be done on return (conveyed to paranoid_exit in %ebx).
210 ++ */
211 ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
212 ++ testl $KAISER_SHADOW_PGD_OFFSET, %eax
213 ++ jz 2f
214 ++ orl $2, %ebx
215 ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
216 ++ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
217 ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
218 ++ movq %rax, %cr3
219 ++2:
220 ++#endif
221 ++ ret
222 + END(paranoid_entry)
223 +
224 + /*
225 +@@ -1021,19 +1074,26 @@ END(paranoid_entry)
226 + * be complicated. Fortunately, there's no good reason
227 + * to try to handle preemption here.
228 + *
229 +- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
230 ++ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
231 ++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
232 ++ * ebx=2: needs both swapgs and SWITCH_USER_CR3
233 ++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
234 + */
235 + ENTRY(paranoid_exit)
236 + DISABLE_INTERRUPTS(CLBR_NONE)
237 + TRACE_IRQS_OFF_DEBUG
238 +- testl %ebx, %ebx /* swapgs needed? */
239 ++ TRACE_IRQS_IRETQ_DEBUG
240 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
241 ++ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
242 ++ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
243 ++ jz paranoid_exit_no_switch
244 ++ SWITCH_USER_CR3
245 ++paranoid_exit_no_switch:
246 ++#endif
247 ++ testl $1, %ebx /* swapgs needed? */
248 + jnz paranoid_exit_no_swapgs
249 +- TRACE_IRQS_IRETQ
250 + SWAPGS_UNSAFE_STACK
251 +- jmp paranoid_exit_restore
252 + paranoid_exit_no_swapgs:
253 +- TRACE_IRQS_IRETQ_DEBUG
254 +-paranoid_exit_restore:
255 + RESTORE_EXTRA_REGS
256 + RESTORE_C_REGS
257 + REMOVE_PT_GPREGS_FROM_STACK 8
258 +@@ -1048,6 +1108,13 @@ ENTRY(error_entry)
259 + cld
260 + SAVE_C_REGS 8
261 + SAVE_EXTRA_REGS 8
262 ++ /*
263 ++ * error_entry() always returns with a kernel gsbase and
264 ++ * CR3. We must also have a kernel CR3/gsbase before
265 ++ * calling TRACE_IRQS_*. Just unconditionally switch to
266 ++ * the kernel CR3 here.
267 ++ */
268 ++ SWITCH_KERNEL_CR3
269 + xorl %ebx, %ebx
270 + testb $3, CS+8(%rsp)
271 + jz .Lerror_kernelspace
272 +@@ -1210,6 +1277,10 @@ ENTRY(nmi)
273 + */
274 +
275 + SWAPGS_UNSAFE_STACK
276 ++ /*
277 ++ * percpu variables are mapped with user CR3, so no need
278 ++ * to switch CR3 here.
279 ++ */
280 + cld
281 + movq %rsp, %rdx
282 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
283 +@@ -1243,12 +1314,34 @@ ENTRY(nmi)
284 +
285 + movq %rsp, %rdi
286 + movq $-1, %rsi
287 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
288 ++ /* Unconditionally use kernel CR3 for do_nmi() */
289 ++ /* %rax is saved above, so OK to clobber here */
290 ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
291 ++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
292 ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
293 ++ pushq %rax
294 ++ /* mask off "user" bit of pgd address and 12 PCID bits: */
295 ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
296 ++ movq %rax, %cr3
297 ++2:
298 ++#endif
299 + call do_nmi
300 +
301 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
302 ++ /*
303 ++ * Unconditionally restore CR3. I know we return to
304 ++ * kernel code that needs user CR3, but do we ever return
305 ++ * to "user mode" where we need the kernel CR3?
306 ++ */
307 ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
308 ++#endif
309 ++
310 + /*
311 + * Return back to user mode. We must *not* do the normal exit
312 +- * work, because we don't want to enable interrupts. Fortunately,
313 +- * do_nmi doesn't modify pt_regs.
314 ++ * work, because we don't want to enable interrupts. Do not
315 ++ * switch to user CR3: we might be going back to kernel code
316 ++ * that had a user CR3 set.
317 + */
318 + SWAPGS
319 + jmp restore_c_regs_and_iret
320 +@@ -1445,22 +1538,55 @@ end_repeat_nmi:
321 + ALLOC_PT_GPREGS_ON_STACK
322 +
323 + /*
324 +- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
325 +- * as we should not be calling schedule in NMI context.
326 +- * Even with normal interrupts enabled. An NMI should not be
327 +- * setting NEED_RESCHED or anything that normal interrupts and
328 +- * exceptions might do.
329 ++ * Use the same approach as paranoid_entry to handle SWAPGS, but
330 ++ * without CR3 handling since we do that differently in NMIs. No
331 ++ * need to use paranoid_exit as we should not be calling schedule
332 ++ * in NMI context. Even with normal interrupts enabled. An NMI
333 ++ * should not be setting NEED_RESCHED or anything that normal
334 ++ * interrupts and exceptions might do.
335 + */
336 +- call paranoid_entry
337 +-
338 +- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
339 ++ cld
340 ++ SAVE_C_REGS
341 ++ SAVE_EXTRA_REGS
342 ++ movl $1, %ebx
343 ++ movl $MSR_GS_BASE, %ecx
344 ++ rdmsr
345 ++ testl %edx, %edx
346 ++ js 1f /* negative -> in kernel */
347 ++ SWAPGS
348 ++ xorl %ebx, %ebx
349 ++1:
350 + movq %rsp, %rdi
351 + movq $-1, %rsi
352 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
353 ++ /* Unconditionally use kernel CR3 for do_nmi() */
354 ++ /* %rax is saved above, so OK to clobber here */
355 ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
356 ++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
357 ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
358 ++ pushq %rax
359 ++ /* mask off "user" bit of pgd address and 12 PCID bits: */
360 ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
361 ++ movq %rax, %cr3
362 ++2:
363 ++#endif
364 ++
365 ++ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
366 + call do_nmi
367 +
368 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
369 ++ /*
370 ++ * Unconditionally restore CR3. We might be returning to
371 ++ * kernel code that needs user CR3, like just before
372 ++ * a sysret.
373 ++ */
374 ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
375 ++#endif
376 ++
377 + testl %ebx, %ebx /* swapgs needed? */
378 + jnz nmi_restore
379 + nmi_swapgs:
380 ++ /* We fixed up CR3 above, so no need to switch it here */
381 + SWAPGS_UNSAFE_STACK
382 + nmi_restore:
383 + RESTORE_EXTRA_REGS
384 +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
385 +index 15cfebaa7688..d03bf0e28b8b 100644
386 +--- a/arch/x86/entry/entry_64_compat.S
387 ++++ b/arch/x86/entry/entry_64_compat.S
388 +@@ -13,6 +13,8 @@
389 + #include <asm/irqflags.h>
390 + #include <asm/asm.h>
391 + #include <asm/smap.h>
392 ++#include <asm/pgtable_types.h>
393 ++#include <asm/kaiser.h>
394 + #include <linux/linkage.h>
395 + #include <linux/err.h>
396 +
397 +@@ -50,6 +52,7 @@ ENDPROC(native_usergs_sysret32)
398 + ENTRY(entry_SYSENTER_compat)
399 + /* Interrupts are off on entry. */
400 + SWAPGS_UNSAFE_STACK
401 ++ SWITCH_KERNEL_CR3_NO_STACK
402 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
403 +
404 + /*
405 +@@ -161,6 +164,7 @@ ENDPROC(entry_SYSENTER_compat)
406 + ENTRY(entry_SYSCALL_compat)
407 + /* Interrupts are off on entry. */
408 + SWAPGS_UNSAFE_STACK
409 ++ SWITCH_KERNEL_CR3_NO_STACK
410 +
411 + /* Stash user ESP and switch to the kernel stack. */
412 + movl %esp, %r8d
413 +@@ -208,6 +212,7 @@ ENTRY(entry_SYSCALL_compat)
414 + /* Opportunistic SYSRET */
415 + sysret32_from_system_call:
416 + TRACE_IRQS_ON /* User mode traces as IRQs on. */
417 ++ SWITCH_USER_CR3
418 + movq RBX(%rsp), %rbx /* pt_regs->rbx */
419 + movq RBP(%rsp), %rbp /* pt_regs->rbp */
420 + movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
421 +@@ -269,6 +274,7 @@ ENTRY(entry_INT80_compat)
422 + PARAVIRT_ADJUST_EXCEPTION_FRAME
423 + ASM_CLAC /* Do this early to minimize exposure */
424 + SWAPGS
425 ++ SWITCH_KERNEL_CR3_NO_STACK
426 +
427 + /*
428 + * User tracing code (ptrace or signal handlers) might assume that
429 +@@ -311,6 +317,7 @@ ENTRY(entry_INT80_compat)
430 +
431 + /* Go back to user mode. */
432 + TRACE_IRQS_ON
433 ++ SWITCH_USER_CR3
434 + SWAPGS
435 + jmp restore_regs_and_iret
436 + END(entry_INT80_compat)
437 +diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
438 +index ca94fa649251..5dd363d54348 100644
439 +--- a/arch/x86/entry/vdso/vclock_gettime.c
440 ++++ b/arch/x86/entry/vdso/vclock_gettime.c
441 +@@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void)
442 + }
443 + #endif
444 +
445 ++#ifdef CONFIG_PARAVIRT_CLOCK
446 ++extern u8 pvclock_page
447 ++ __attribute__((visibility("hidden")));
448 ++#endif
449 ++
450 + #ifndef BUILD_VDSO32
451 +
452 + #include <linux/kernel.h>
453 +@@ -62,63 +67,65 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
454 +
455 + #ifdef CONFIG_PARAVIRT_CLOCK
456 +
457 +-static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
458 ++static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
459 + {
460 +- const struct pvclock_vsyscall_time_info *pvti_base;
461 +- int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
462 +- int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
463 +-
464 +- BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
465 +-
466 +- pvti_base = (struct pvclock_vsyscall_time_info *)
467 +- __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
468 +-
469 +- return &pvti_base[offset];
470 ++ return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
471 + }
472 +
473 + static notrace cycle_t vread_pvclock(int *mode)
474 + {
475 +- const struct pvclock_vsyscall_time_info *pvti;
476 ++ const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
477 + cycle_t ret;
478 +- u64 last;
479 +- u32 version;
480 +- u8 flags;
481 +- unsigned cpu, cpu1;
482 +-
483 ++ u64 tsc, pvti_tsc;
484 ++ u64 last, delta, pvti_system_time;
485 ++ u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
486 +
487 + /*
488 +- * Note: hypervisor must guarantee that:
489 +- * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
490 +- * 2. that per-CPU pvclock time info is updated if the
491 +- * underlying CPU changes.
492 +- * 3. that version is increased whenever underlying CPU
493 +- * changes.
494 ++ * Note: The kernel and hypervisor must guarantee that cpu ID
495 ++ * number maps 1:1 to per-CPU pvclock time info.
496 ++ *
497 ++ * Because the hypervisor is entirely unaware of guest userspace
498 ++ * preemption, it cannot guarantee that per-CPU pvclock time
499 ++ * info is updated if the underlying CPU changes or that that
500 ++ * version is increased whenever underlying CPU changes.
501 + *
502 ++ * On KVM, we are guaranteed that pvti updates for any vCPU are
503 ++ * atomic as seen by *all* vCPUs. This is an even stronger
504 ++ * guarantee than we get with a normal seqlock.
505 ++ *
506 ++ * On Xen, we don't appear to have that guarantee, but Xen still
507 ++ * supplies a valid seqlock using the version field.
508 ++
509 ++ * We only do pvclock vdso timing at all if
510 ++ * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
511 ++ * mean that all vCPUs have matching pvti and that the TSC is
512 ++ * synced, so we can just look at vCPU 0's pvti.
513 + */
514 +- do {
515 +- cpu = __getcpu() & VGETCPU_CPU_MASK;
516 +- /* TODO: We can put vcpu id into higher bits of pvti.version.
517 +- * This will save a couple of cycles by getting rid of
518 +- * __getcpu() calls (Gleb).
519 +- */
520 +-
521 +- pvti = get_pvti(cpu);
522 +-
523 +- version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
524 +-
525 +- /*
526 +- * Test we're still on the cpu as well as the version.
527 +- * We could have been migrated just after the first
528 +- * vgetcpu but before fetching the version, so we
529 +- * wouldn't notice a version change.
530 +- */
531 +- cpu1 = __getcpu() & VGETCPU_CPU_MASK;
532 +- } while (unlikely(cpu != cpu1 ||
533 +- (pvti->pvti.version & 1) ||
534 +- pvti->pvti.version != version));
535 +-
536 +- if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
537 ++
538 ++ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
539 + *mode = VCLOCK_NONE;
540 ++ return 0;
541 ++ }
542 ++
543 ++ do {
544 ++ version = pvti->version;
545 ++
546 ++ /* This is also a read barrier, so we'll read version first. */
547 ++ tsc = rdtsc_ordered();
548 ++
549 ++ pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
550 ++ pvti_tsc_shift = pvti->tsc_shift;
551 ++ pvti_system_time = pvti->system_time;
552 ++ pvti_tsc = pvti->tsc_timestamp;
553 ++
554 ++ /* Make sure that the version double-check is last. */
555 ++ smp_rmb();
556 ++ } while (unlikely((version & 1) || version != pvti->version));
557 ++
558 ++ delta = tsc - pvti_tsc;
559 ++ ret = pvti_system_time +
560 ++ pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
561 ++ pvti_tsc_shift);
562 +
563 + /* refer to tsc.c read_tsc() comment for rationale */
564 + last = gtod->cycle_last;
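
The rewritten vread_pvclock() above snapshots the vCPU-0 pvti fields under the version seqlock and then converts the TSC delta to nanoseconds via pvclock_scale_delta(). A simplified sketch of that conversion (not the kernel's actual helper, which does the multiply at full width so the intermediate product cannot overflow):

/*
 * Illustrative sketch only: ns = system_time + scale(tsc - tsc_timestamp),
 * where the scale factor is 2^tsc_shift * tsc_to_system_mul / 2^32.
 */
static u64 example_pvclock_to_ns(u64 tsc, u64 pvti_tsc, u64 pvti_system_time,
				 u32 pvti_tsc_to_system_mul, s8 pvti_tsc_shift)
{
	u64 delta = tsc - pvti_tsc;	/* cycles since the hypervisor's snapshot */

	if (pvti_tsc_shift < 0)
		delta >>= -pvti_tsc_shift;
	else
		delta <<= pvti_tsc_shift;

	/* tsc_to_system_mul is a 32.32 fixed-point cycles-to-nanoseconds factor */
	return pvti_system_time + ((delta * pvti_tsc_to_system_mul) >> 32);
}
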
565 +diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
566 +index de2c921025f5..4158acc17df0 100644
567 +--- a/arch/x86/entry/vdso/vdso-layout.lds.S
568 ++++ b/arch/x86/entry/vdso/vdso-layout.lds.S
569 +@@ -25,7 +25,7 @@ SECTIONS
570 + * segment.
571 + */
572 +
573 +- vvar_start = . - 2 * PAGE_SIZE;
574 ++ vvar_start = . - 3 * PAGE_SIZE;
575 + vvar_page = vvar_start;
576 +
577 + /* Place all vvars at the offsets in asm/vvar.h. */
578 +@@ -36,6 +36,7 @@ SECTIONS
579 + #undef EMIT_VVAR
580 +
581 + hpet_page = vvar_start + PAGE_SIZE;
582 ++ pvclock_page = vvar_start + 2 * PAGE_SIZE;
583 +
584 + . = SIZEOF_HEADERS;
585 +
586 +diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
587 +index 785d9922b106..491020b2826d 100644
588 +--- a/arch/x86/entry/vdso/vdso2c.c
589 ++++ b/arch/x86/entry/vdso/vdso2c.c
590 +@@ -73,6 +73,7 @@ enum {
591 + sym_vvar_start,
592 + sym_vvar_page,
593 + sym_hpet_page,
594 ++ sym_pvclock_page,
595 + sym_VDSO_FAKE_SECTION_TABLE_START,
596 + sym_VDSO_FAKE_SECTION_TABLE_END,
597 + };
598 +@@ -80,6 +81,7 @@ enum {
599 + const int special_pages[] = {
600 + sym_vvar_page,
601 + sym_hpet_page,
602 ++ sym_pvclock_page,
603 + };
604 +
605 + struct vdso_sym {
606 +@@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = {
607 + [sym_vvar_start] = {"vvar_start", true},
608 + [sym_vvar_page] = {"vvar_page", true},
609 + [sym_hpet_page] = {"hpet_page", true},
610 ++ [sym_pvclock_page] = {"pvclock_page", true},
611 + [sym_VDSO_FAKE_SECTION_TABLE_START] = {
612 + "VDSO_FAKE_SECTION_TABLE_START", false
613 + },
614 +diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
615 +index 64df47148160..aa828191c654 100644
616 +--- a/arch/x86/entry/vdso/vma.c
617 ++++ b/arch/x86/entry/vdso/vma.c
618 +@@ -100,6 +100,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
619 + .name = "[vvar]",
620 + .pages = no_pages,
621 + };
622 ++ struct pvclock_vsyscall_time_info *pvti;
623 +
624 + if (calculate_addr) {
625 + addr = vdso_addr(current->mm->start_stack,
626 +@@ -169,6 +170,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
627 + }
628 + #endif
629 +
630 ++ pvti = pvclock_pvti_cpu0_va();
631 ++ if (pvti && image->sym_pvclock_page) {
632 ++ ret = remap_pfn_range(vma,
633 ++ text_start + image->sym_pvclock_page,
634 ++ __pa(pvti) >> PAGE_SHIFT,
635 ++ PAGE_SIZE,
636 ++ PAGE_READONLY);
637 ++
638 ++ if (ret)
639 ++ goto up_fail;
640 ++ }
641 ++
642 + up_fail:
643 + if (ret)
644 + current->mm->context.vdso = NULL;
645 +diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
646 +index e01f7f7ccb0c..84ae170bc3d0 100644
647 +--- a/arch/x86/include/asm/cmdline.h
648 ++++ b/arch/x86/include/asm/cmdline.h
649 +@@ -2,5 +2,7 @@
650 + #define _ASM_X86_CMDLINE_H
651 +
652 + int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
653 ++int cmdline_find_option(const char *cmdline_ptr, const char *option,
654 ++ char *buffer, int bufsize);
655 +
656 + #endif /* _ASM_X86_CMDLINE_H */
657 +diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
658 +index f7ba9fbf12ee..f6605712ca90 100644
659 +--- a/arch/x86/include/asm/cpufeature.h
660 ++++ b/arch/x86/include/asm/cpufeature.h
661 +@@ -187,6 +187,7 @@
662 + #define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */
663 + #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
664 + #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
665 ++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
666 + #define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */
667 + #define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */
668 + #define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
669 +@@ -199,6 +200,9 @@
670 + #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
671 + #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
672 +
673 ++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
674 ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
675 ++
676 + /* Virtualization flags: Linux defined, word 8 */
677 + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
678 + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
679 +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
680 +index 4e10d73cf018..880db91d9457 100644
681 +--- a/arch/x86/include/asm/desc.h
682 ++++ b/arch/x86/include/asm/desc.h
683 +@@ -43,7 +43,7 @@ struct gdt_page {
684 + struct desc_struct gdt[GDT_ENTRIES];
685 + } __attribute__((aligned(PAGE_SIZE)));
686 +
687 +-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
688 ++DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
689 +
690 + static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
691 + {
692 +diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
693 +index 59caa55fb9b5..ee52ff858699 100644
694 +--- a/arch/x86/include/asm/hw_irq.h
695 ++++ b/arch/x86/include/asm/hw_irq.h
696 +@@ -187,7 +187,7 @@ extern char irq_entries_start[];
697 + #define VECTOR_RETRIGGERED ((void *)~0UL)
698 +
699 + typedef struct irq_desc* vector_irq_t[NR_VECTORS];
700 +-DECLARE_PER_CPU(vector_irq_t, vector_irq);
701 ++DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
702 +
703 + #endif /* !ASSEMBLY_ */
704 +
705 +diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
706 +new file mode 100644
707 +index 000000000000..802bbbdfe143
708 +--- /dev/null
709 ++++ b/arch/x86/include/asm/kaiser.h
710 +@@ -0,0 +1,141 @@
711 ++#ifndef _ASM_X86_KAISER_H
712 ++#define _ASM_X86_KAISER_H
713 ++
714 ++#include <uapi/asm/processor-flags.h> /* For PCID constants */
715 ++
716 ++/*
717 ++ * This file includes the definitions for the KAISER feature.
718 ++ * KAISER is a counter measure against x86_64 side channel attacks on
719 ++ * the kernel virtual memory. It has a shadow pgd for every process: the
720 ++ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
721 ++ * user memory. Within a kernel context switch, or when an interrupt is handled,
722 ++ * the pgd is switched to the normal one. When the system switches to user mode,
723 ++ * the shadow pgd is enabled. By this, the virtual memory caches are freed,
724 ++ * and the user may not attack the whole kernel memory.
725 ++ *
726 ++ * A minimalistic kernel mapping holds the parts needed to be mapped in user
727 ++ * mode, such as the entry/exit functions of the user space, or the stacks.
728 ++ */
729 ++
730 ++#define KAISER_SHADOW_PGD_OFFSET 0x1000
731 ++
732 ++#ifdef __ASSEMBLY__
733 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
734 ++
735 ++.macro _SWITCH_TO_KERNEL_CR3 reg
736 ++movq %cr3, \reg
737 ++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
738 ++/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
739 ++ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
740 ++movq \reg, %cr3
741 ++.endm
742 ++
743 ++.macro _SWITCH_TO_USER_CR3 reg regb
744 ++/*
745 ++ * regb must be the low byte portion of reg: because we have arranged
746 ++ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
747 ++ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
748 ++ * not enabled): so that the one register can update both memory and cr3.
749 ++ */
750 ++movq %cr3, \reg
751 ++orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
752 ++js 9f
753 ++/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
754 ++movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
755 ++9:
756 ++movq \reg, %cr3
757 ++.endm
758 ++
759 ++.macro SWITCH_KERNEL_CR3
760 ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
761 ++_SWITCH_TO_KERNEL_CR3 %rax
762 ++popq %rax
763 ++8:
764 ++.endm
765 ++
766 ++.macro SWITCH_USER_CR3
767 ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
768 ++_SWITCH_TO_USER_CR3 %rax %al
769 ++popq %rax
770 ++8:
771 ++.endm
772 ++
773 ++.macro SWITCH_KERNEL_CR3_NO_STACK
774 ++ALTERNATIVE "jmp 8f", \
775 ++ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
776 ++ X86_FEATURE_KAISER
777 ++_SWITCH_TO_KERNEL_CR3 %rax
778 ++movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
779 ++8:
780 ++.endm
781 ++
782 ++#else /* CONFIG_PAGE_TABLE_ISOLATION */
783 ++
784 ++.macro SWITCH_KERNEL_CR3
785 ++.endm
786 ++.macro SWITCH_USER_CR3
787 ++.endm
788 ++.macro SWITCH_KERNEL_CR3_NO_STACK
789 ++.endm
790 ++
791 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
792 ++
793 ++#else /* __ASSEMBLY__ */
794 ++
795 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
796 ++/*
797 ++ * Upon kernel/user mode switch, it may happen that the address
798 ++ * space has to be switched before the registers have been
799 ++ * stored. To change the address space, another register is
800 ++ * needed. A register therefore has to be stored/restored.
801 ++*/
802 ++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
803 ++
804 ++DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
805 ++
806 ++extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
807 ++
808 ++extern int kaiser_enabled;
809 ++extern void __init kaiser_check_boottime_disable(void);
810 ++#else
811 ++#define kaiser_enabled 0
812 ++static inline void __init kaiser_check_boottime_disable(void) {}
813 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
814 ++
815 ++/*
816 ++ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
817 ++ * so as to build with tests on kaiser_enabled instead of #ifdefs.
818 ++ */
819 ++
820 ++/**
821 ++ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
822 ++ * @addr: the start address of the range
823 ++ * @size: the size of the range
824 ++ * @flags: The mapping flags of the pages
825 ++ *
826 ++ * The mapping is done on a global scope, so no bigger
827 ++ * synchronization has to be done. the pages have to be
828 ++ * manually unmapped again when they are not needed any longer.
829 ++ */
830 ++extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
831 ++
832 ++/**
833 ++ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
834 ++ * @addr: the start address of the range
835 ++ * @size: the size of the range
836 ++ */
837 ++extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
838 ++
839 ++/**
840 ++ * kaiser_init - Initialize the shadow mapping
841 ++ *
842 ++ * Most parts of the shadow mapping can be mapped upon boot
843 ++ * time. Only per-process things like the thread stacks
844 ++ * or a new LDT have to be mapped at runtime. These boot-
845 ++ * time mappings are permanent and never unmapped.
846 ++ */
847 ++extern void kaiser_init(void);
848 ++
849 ++#endif /* __ASSEMBLY */
850 ++
851 ++#endif /* _ASM_X86_KAISER_H */
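
Written out in C, the CR3 values that the _SWITCH_TO_KERNEL_CR3 and _SWITCH_TO_USER_CR3 macros above construct look roughly like the sketch below. This is illustrative only, not code from the patch; the constants correspond to KAISER_SHADOW_PGD_OFFSET above and the X86_CR3_PCID_* definitions added to pgtable_types.h further down, and the per-CPU x86_cr3_pcid_user value is assumed to already hold the shadow-pgd offset plus the user PCID/NOFLUSH bits, as the macro comment above explains.

/* Illustrative sketch only -- a C rendering of the assembler macros above. */
#define EX_SHADOW_PGD_OFFSET	0x1000UL		/* user pgd sits one 4k page above the kernel pgd */
#define EX_PCID_ASID_MASK	((1UL << 12) - 1)	/* CR3 bits 0-11: the PCID/ASID */
#define EX_PCID_NOFLUSH		(1UL << 63)		/* keep TLB entries for this PCID on the CR3 write */

/* _SWITCH_TO_KERNEL_CR3: drop the user ASID and the shadow-pgd bit ... */
static unsigned long example_kernel_cr3(unsigned long cr3, int have_pcid)
{
	cr3 &= ~(EX_PCID_ASID_MASK | EX_SHADOW_PGD_OFFSET);
	/* ... and, when PCID is in use, avoid flushing the kernel's TLB entries */
	return have_pcid ? (cr3 | EX_PCID_NOFLUSH) : cr3;
}

/*
 * _SWITCH_TO_USER_CR3: starting from the kernel CR3 (ASID 0, shadow bit
 * clear), OR in x86_cr3_pcid_user, which selects the shadow pgd and the
 * user PCID -- normally with NOFLUSH set, unless a deferred flush is
 * pending (the "js 9f" path in the macro above).
 */
static unsigned long example_user_cr3(unsigned long kernel_cr3,
				      unsigned long x86_cr3_pcid_user)
{
	return kernel_cr3 | x86_cr3_pcid_user;
}
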
852 +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
853 +index 6ec0c8b2e9df..84c62d950023 100644
854 +--- a/arch/x86/include/asm/pgtable.h
855 ++++ b/arch/x86/include/asm/pgtable.h
856 +@@ -18,6 +18,12 @@
857 + #ifndef __ASSEMBLY__
858 + #include <asm/x86_init.h>
859 +
860 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
861 ++extern int kaiser_enabled;
862 ++#else
863 ++#define kaiser_enabled 0
864 ++#endif
865 ++
866 + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
867 + void ptdump_walk_pgd_level_checkwx(void);
868 +
869 +@@ -653,7 +659,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
870 +
871 + static inline int pgd_bad(pgd_t pgd)
872 + {
873 +- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
874 ++ pgdval_t ignore_flags = _PAGE_USER;
875 ++ /*
876 ++ * We set NX on KAISER pgds that map userspace memory so
877 ++ * that userspace can not meaningfully use the kernel
878 ++ * page table by accident; it will fault on the first
879 ++ * instruction it tries to run. See native_set_pgd().
880 ++ */
881 ++ if (kaiser_enabled)
882 ++ ignore_flags |= _PAGE_NX;
883 ++
884 ++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
885 + }
886 +
887 + static inline int pgd_none(pgd_t pgd)
888 +@@ -855,7 +871,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
889 + */
890 + static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
891 + {
892 +- memcpy(dst, src, count * sizeof(pgd_t));
893 ++ memcpy(dst, src, count * sizeof(pgd_t));
894 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
895 ++ if (kaiser_enabled) {
896 ++ /* Clone the shadow pgd part as well */
897 ++ memcpy(native_get_shadow_pgd(dst),
898 ++ native_get_shadow_pgd(src),
899 ++ count * sizeof(pgd_t));
900 ++ }
901 ++#endif
902 + }
903 +
904 + #define PTE_SHIFT ilog2(PTRS_PER_PTE)
905 +diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
906 +index 2ee781114d34..c810226e741a 100644
907 +--- a/arch/x86/include/asm/pgtable_64.h
908 ++++ b/arch/x86/include/asm/pgtable_64.h
909 +@@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
910 + native_set_pud(pud, native_make_pud(0));
911 + }
912 +
913 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
914 ++extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
915 ++
916 ++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
917 ++{
918 ++#ifdef CONFIG_DEBUG_VM
919 ++ /* linux/mmdebug.h may not have been included at this point */
920 ++ BUG_ON(!kaiser_enabled);
921 ++#endif
922 ++ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
923 ++}
924 ++#else
925 ++static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
926 ++{
927 ++ return pgd;
928 ++}
929 ++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
930 ++{
931 ++ BUILD_BUG_ON(1);
932 ++ return NULL;
933 ++}
934 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
935 ++
936 + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
937 + {
938 +- *pgdp = pgd;
939 ++ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
940 + }
941 +
942 + static inline void native_pgd_clear(pgd_t *pgd)
943 +diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
944 +index 79c91853e50e..8dba273da25a 100644
945 +--- a/arch/x86/include/asm/pgtable_types.h
946 ++++ b/arch/x86/include/asm/pgtable_types.h
947 +@@ -89,7 +89,7 @@
948 + #define _PAGE_NX (_AT(pteval_t, 0))
949 + #endif
950 +
951 +-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
952 ++#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
953 +
954 + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
955 + _PAGE_ACCESSED | _PAGE_DIRTY)
956 +@@ -102,6 +102,33 @@
957 + _PAGE_SOFT_DIRTY)
958 + #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
959 +
960 ++/* The ASID is the lower 12 bits of CR3 */
961 ++#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
962 ++
963 ++/* Mask for all the PCID-related bits in CR3: */
964 ++#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
965 ++#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
966 ++
967 ++#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
968 ++/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
969 ++#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
970 ++
971 ++#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
972 ++#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
973 ++#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
974 ++#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
975 ++#else
976 ++#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
977 ++/*
978 ++ * PCIDs are unsupported on 32-bit and none of these bits can be
979 ++ * set in CR3:
980 ++ */
981 ++#define X86_CR3_PCID_KERN_FLUSH (0)
982 ++#define X86_CR3_PCID_USER_FLUSH (0)
983 ++#define X86_CR3_PCID_KERN_NOFLUSH (0)
984 ++#define X86_CR3_PCID_USER_NOFLUSH (0)
985 ++#endif
986 ++
987 + /*
988 + * The cache modes defined here are used to translate between pure SW usage
989 + * and the HW defined cache mode bits and/or PAT entries.
990 +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
991 +index 2d5a50cb61a2..f3bdaed0188f 100644
992 +--- a/arch/x86/include/asm/processor.h
993 ++++ b/arch/x86/include/asm/processor.h
994 +@@ -305,7 +305,7 @@ struct tss_struct {
995 +
996 + } ____cacheline_aligned;
997 +
998 +-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
999 ++DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
1000 +
1001 + #ifdef CONFIG_X86_32
1002 + DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
1003 +diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
1004 +index baad72e4c100..6045cef376c2 100644
1005 +--- a/arch/x86/include/asm/pvclock.h
1006 ++++ b/arch/x86/include/asm/pvclock.h
1007 +@@ -4,6 +4,15 @@
1008 + #include <linux/clocksource.h>
1009 + #include <asm/pvclock-abi.h>
1010 +
1011 ++#ifdef CONFIG_PARAVIRT_CLOCK
1012 ++extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
1013 ++#else
1014 ++static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
1015 ++{
1016 ++ return NULL;
1017 ++}
1018 ++#endif
1019 ++
1020 + /* some helper functions for xen and kvm pv clock sources */
1021 + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
1022 + u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
1023 +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
1024 +index 9fc5968da820..a691b66cc40a 100644
1025 +--- a/arch/x86/include/asm/tlbflush.h
1026 ++++ b/arch/x86/include/asm/tlbflush.h
1027 +@@ -131,6 +131,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
1028 + cr4_set_bits(mask);
1029 + }
1030 +
1031 ++/*
1032 ++ * Declare a couple of kaiser interfaces here for convenience,
1033 ++ * to avoid the need for asm/kaiser.h in unexpected places.
1034 ++ */
1035 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1036 ++extern int kaiser_enabled;
1037 ++extern void kaiser_setup_pcid(void);
1038 ++extern void kaiser_flush_tlb_on_return_to_user(void);
1039 ++#else
1040 ++#define kaiser_enabled 0
1041 ++static inline void kaiser_setup_pcid(void)
1042 ++{
1043 ++}
1044 ++static inline void kaiser_flush_tlb_on_return_to_user(void)
1045 ++{
1046 ++}
1047 ++#endif
1048 ++
1049 + static inline void __native_flush_tlb(void)
1050 + {
1051 + /*
1052 +@@ -139,6 +157,8 @@ static inline void __native_flush_tlb(void)
1053 + * back:
1054 + */
1055 + preempt_disable();
1056 ++ if (kaiser_enabled)
1057 ++ kaiser_flush_tlb_on_return_to_user();
1058 + native_write_cr3(native_read_cr3());
1059 + preempt_enable();
1060 + }
1061 +@@ -148,20 +168,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
1062 + unsigned long cr4;
1063 +
1064 + cr4 = this_cpu_read(cpu_tlbstate.cr4);
1065 +- /* clear PGE */
1066 +- native_write_cr4(cr4 & ~X86_CR4_PGE);
1067 +- /* write old PGE again and flush TLBs */
1068 +- native_write_cr4(cr4);
1069 ++ if (cr4 & X86_CR4_PGE) {
1070 ++ /* clear PGE and flush TLB of all entries */
1071 ++ native_write_cr4(cr4 & ~X86_CR4_PGE);
1072 ++ /* restore PGE as it was before */
1073 ++ native_write_cr4(cr4);
1074 ++ } else {
1075 ++ /* do it with cr3, letting kaiser flush user PCID */
1076 ++ __native_flush_tlb();
1077 ++ }
1078 + }
1079 +
1080 + static inline void __native_flush_tlb_global(void)
1081 + {
1082 + unsigned long flags;
1083 +
1084 +- if (static_cpu_has(X86_FEATURE_INVPCID)) {
1085 ++ if (this_cpu_has(X86_FEATURE_INVPCID)) {
1086 + /*
1087 + * Using INVPCID is considerably faster than a pair of writes
1088 + * to CR4 sandwiched inside an IRQ flag save/restore.
1089 ++ *
1090 ++ * Note, this works with CR4.PCIDE=0 or 1.
1091 + */
1092 + invpcid_flush_all();
1093 + return;
1094 +@@ -173,24 +200,45 @@ static inline void __native_flush_tlb_global(void)
1095 + * be called from deep inside debugging code.)
1096 + */
1097 + raw_local_irq_save(flags);
1098 +-
1099 + __native_flush_tlb_global_irq_disabled();
1100 +-
1101 + raw_local_irq_restore(flags);
1102 + }
1103 +
1104 + static inline void __native_flush_tlb_single(unsigned long addr)
1105 + {
1106 +- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
1107 ++ /*
1108 ++ * SIMICS #GP's if you run INVPCID with type 2/3
1109 ++ * and X86_CR4_PCIDE clear. Shame!
1110 ++ *
1111 ++ * The ASIDs used below are hard-coded. But, we must not
1112 ++ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
1113 ++ * invlpg in the case we are called early.
1114 ++ */
1115 ++
1116 ++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
1117 ++ if (kaiser_enabled)
1118 ++ kaiser_flush_tlb_on_return_to_user();
1119 ++ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
1120 ++ return;
1121 ++ }
1122 ++ /* Flush the address out of both PCIDs. */
1123 ++ /*
1124 ++ * An optimization here might be to determine addresses
1125 ++ * that are only kernel-mapped and only flush the kernel
1126 ++ * ASID. But, userspace flushes are probably much more
1127 ++ * important performance-wise.
1128 ++ *
1129 ++ * Make sure to do only a single invpcid when KAISER is
1130 ++ * disabled and we have only a single ASID.
1131 ++ */
1132 ++ if (kaiser_enabled)
1133 ++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
1134 ++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
1135 + }
1136 +
1137 + static inline void __flush_tlb_all(void)
1138 + {
1139 +- if (cpu_has_pge)
1140 +- __flush_tlb_global();
1141 +- else
1142 +- __flush_tlb();
1143 +-
1144 ++ __flush_tlb_global();
1145 + /*
1146 + * Note: if we somehow had PCID but not PGE, then this wouldn't work --
1147 + * we'd end up flushing kernel translations for the current ASID but
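
kaiser_flush_tlb_on_return_to_user(), declared in the hunk above, is implemented in arch/x86/mm/kaiser.c in the part of the patch not shown on this page. Conceptually it only arms the deferred flush that the _SWITCH_TO_USER_CR3 macro in asm/kaiser.h consumes on the next return to user space -- roughly:

/*
 * Illustrative sketch only, not the patch's actual implementation.
 * Dropping the NOFLUSH bit from the per-CPU x86_cr3_pcid_user value makes
 * the next SWITCH_USER_CR3 load a flushing CR3 for the user PCID; the
 * macro then re-arms NOFLUSH for subsequent returns.
 */
static void example_flush_user_pcid_on_return(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			       X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
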
1148 +diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
1149 +index 756de9190aec..deabaf9759b6 100644
1150 +--- a/arch/x86/include/asm/vdso.h
1151 ++++ b/arch/x86/include/asm/vdso.h
1152 +@@ -22,6 +22,7 @@ struct vdso_image {
1153 +
1154 + long sym_vvar_page;
1155 + long sym_hpet_page;
1156 ++ long sym_pvclock_page;
1157 + long sym_VDSO32_NOTE_MASK;
1158 + long sym___kernel_sigreturn;
1159 + long sym___kernel_rt_sigreturn;
1160 +diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
1161 +index 79887abcb5e1..1361779f44fe 100644
1162 +--- a/arch/x86/include/uapi/asm/processor-flags.h
1163 ++++ b/arch/x86/include/uapi/asm/processor-flags.h
1164 +@@ -77,7 +77,8 @@
1165 + #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
1166 + #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
1167 + #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
1168 +-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
1169 ++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
1170 ++#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
1171 +
1172 + /*
1173 + * Intel CPU features in CR4
1174 +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1175 +index aa1e7246b06b..cc154ac64f00 100644
1176 +--- a/arch/x86/kernel/cpu/common.c
1177 ++++ b/arch/x86/kernel/cpu/common.c
1178 +@@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = {
1179 +
1180 + static const struct cpu_dev *this_cpu = &default_cpu;
1181 +
1182 +-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
1183 ++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
1184 + #ifdef CONFIG_X86_64
1185 + /*
1186 + * We need valid kernel segments for data and code in long mode too
1187 +@@ -324,8 +324,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
1188 + static void setup_pcid(struct cpuinfo_x86 *c)
1189 + {
1190 + if (cpu_has(c, X86_FEATURE_PCID)) {
1191 +- if (cpu_has(c, X86_FEATURE_PGE)) {
1192 ++ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
1193 + cr4_set_bits(X86_CR4_PCIDE);
1194 ++ /*
1195 ++ * INVPCID has two "groups" of types:
1196 ++ * 1/2: Invalidate an individual address
1197 ++ * 3/4: Invalidate all contexts
1198 ++ *
1199 ++ * 1/2 take a PCID, but 3/4 do not. So, 3/4
1200 ++ * ignore the PCID argument in the descriptor.
1201 ++ * But, we have to be careful not to call 1/2
1202 ++ * with an actual non-zero PCID in them before
1203 ++ * we do the above cr4_set_bits().
1204 ++ */
1205 ++ if (cpu_has(c, X86_FEATURE_INVPCID))
1206 ++ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
1207 + } else {
1208 + /*
1209 + * flush_tlb_all(), as currently implemented, won't
1210 +@@ -338,6 +351,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
1211 + clear_cpu_cap(c, X86_FEATURE_PCID);
1212 + }
1213 + }
1214 ++ kaiser_setup_pcid();
1215 + }
1216 +
1217 + /*
1218 +@@ -1229,7 +1243,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
1219 + [DEBUG_STACK - 1] = DEBUG_STKSZ
1220 + };
1221 +
1222 +-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
1223 ++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
1224 + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
1225 +
1226 + /* May not be marked __init: used by software suspend */
1227 +@@ -1392,6 +1406,14 @@ void cpu_init(void)
1228 + * try to read it.
1229 + */
1230 + cr4_init_shadow();
1231 ++ if (!kaiser_enabled) {
1232 ++ /*
1233 ++ * secondary_startup_64() deferred setting PGE in cr4:
1234 ++ * probe_page_size_mask() sets it on the boot cpu,
1235 ++ * but it needs to be set on each secondary cpu.
1236 ++ */
1237 ++ cr4_set_bits(X86_CR4_PGE);
1238 ++ }
1239 +
1240 + /*
1241 + * Load microcode on this cpu if a valid microcode is available.
1242 +diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
1243 +index 1e7de3cefc9c..f01b3a12dce0 100644
1244 +--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
1245 ++++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
1246 +@@ -2,11 +2,15 @@
1247 + #include <linux/types.h>
1248 + #include <linux/slab.h>
1249 +
1250 ++#include <asm/kaiser.h>
1251 + #include <asm/perf_event.h>
1252 + #include <asm/insn.h>
1253 +
1254 + #include "perf_event.h"
1255 +
1256 ++static
1257 ++DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
1258 ++
1259 + /* The size of a BTS record in bytes: */
1260 + #define BTS_RECORD_SIZE 24
1261 +
1262 +@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
1263 +
1264 + static DEFINE_PER_CPU(void *, insn_buffer);
1265 +
1266 ++static void *dsalloc(size_t size, gfp_t flags, int node)
1267 ++{
1268 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1269 ++ unsigned int order = get_order(size);
1270 ++ struct page *page;
1271 ++ unsigned long addr;
1272 ++
1273 ++ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
1274 ++ if (!page)
1275 ++ return NULL;
1276 ++ addr = (unsigned long)page_address(page);
1277 ++ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
1278 ++ __free_pages(page, order);
1279 ++ addr = 0;
1280 ++ }
1281 ++ return (void *)addr;
1282 ++#else
1283 ++ return kmalloc_node(size, flags | __GFP_ZERO, node);
1284 ++#endif
1285 ++}
1286 ++
1287 ++static void dsfree(const void *buffer, size_t size)
1288 ++{
1289 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1290 ++ if (!buffer)
1291 ++ return;
1292 ++ kaiser_remove_mapping((unsigned long)buffer, size);
1293 ++ free_pages((unsigned long)buffer, get_order(size));
1294 ++#else
1295 ++ kfree(buffer);
1296 ++#endif
1297 ++}
1298 ++
1299 + static int alloc_pebs_buffer(int cpu)
1300 + {
1301 + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
1302 +@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
1303 + if (!x86_pmu.pebs)
1304 + return 0;
1305 +
1306 +- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
1307 ++ buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
1308 + if (unlikely(!buffer))
1309 + return -ENOMEM;
1310 +
1311 +@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
1312 + if (x86_pmu.intel_cap.pebs_format < 2) {
1313 + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
1314 + if (!ibuffer) {
1315 +- kfree(buffer);
1316 ++ dsfree(buffer, x86_pmu.pebs_buffer_size);
1317 + return -ENOMEM;
1318 + }
1319 + per_cpu(insn_buffer, cpu) = ibuffer;
1320 +@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
1321 + kfree(per_cpu(insn_buffer, cpu));
1322 + per_cpu(insn_buffer, cpu) = NULL;
1323 +
1324 +- kfree((void *)(unsigned long)ds->pebs_buffer_base);
1325 ++ dsfree((void *)(unsigned long)ds->pebs_buffer_base,
1326 ++ x86_pmu.pebs_buffer_size);
1327 + ds->pebs_buffer_base = 0;
1328 + }
1329 +
1330 +@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
1331 + if (!x86_pmu.bts)
1332 + return 0;
1333 +
1334 +- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
1335 ++ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
1336 + if (unlikely(!buffer)) {
1337 + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
1338 + return -ENOMEM;
1339 +@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
1340 + if (!ds || !x86_pmu.bts)
1341 + return;
1342 +
1343 +- kfree((void *)(unsigned long)ds->bts_buffer_base);
1344 ++ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
1345 + ds->bts_buffer_base = 0;
1346 + }
1347 +
1348 + static int alloc_ds_buffer(int cpu)
1349 + {
1350 +- int node = cpu_to_node(cpu);
1351 +- struct debug_store *ds;
1352 +-
1353 +- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
1354 +- if (unlikely(!ds))
1355 +- return -ENOMEM;
1356 ++ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
1357 +
1358 ++ memset(ds, 0, sizeof(*ds));
1359 + per_cpu(cpu_hw_events, cpu).ds = ds;
1360 +
1361 + return 0;
1362 +@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
1363 + return;
1364 +
1365 + per_cpu(cpu_hw_events, cpu).ds = NULL;
1366 +- kfree(ds);
1367 + }
1368 +
1369 + void release_ds_buffers(void)
1370 +diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
1371 +index 4d38416e2a7f..b02cb2ec6726 100644
1372 +--- a/arch/x86/kernel/espfix_64.c
1373 ++++ b/arch/x86/kernel/espfix_64.c
1374 +@@ -41,6 +41,7 @@
1375 + #include <asm/pgalloc.h>
1376 + #include <asm/setup.h>
1377 + #include <asm/espfix.h>
1378 ++#include <asm/kaiser.h>
1379 +
1380 + /*
1381 + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
1382 +@@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
1383 + /* Install the espfix pud into the kernel page directory */
1384 + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
1385 + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
1386 ++ /*
1387 ++ * Just copy the top-level PGD that is mapping the espfix
1388 ++ * area to ensure it is mapped into the shadow user page
1389 ++ * tables.
1390 ++ */
1391 ++ if (kaiser_enabled) {
1392 ++ set_pgd(native_get_shadow_pgd(pgd_p),
1393 ++ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
1394 ++ }
1395 +
1396 + /* Randomize the locations */
1397 + init_espfix_random();
1398 +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
1399 +index ffdc0e860390..4034e905741a 100644
1400 +--- a/arch/x86/kernel/head_64.S
1401 ++++ b/arch/x86/kernel/head_64.S
1402 +@@ -183,8 +183,8 @@ ENTRY(secondary_startup_64)
1403 + movq $(init_level4_pgt - __START_KERNEL_map), %rax
1404 + 1:
1405 +
1406 +- /* Enable PAE mode and PGE */
1407 +- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
1408 ++ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
1409 ++ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
1410 + movq %rcx, %cr4
1411 +
1412 + /* Setup early boot stage 4 level pagetables. */
1413 +@@ -441,6 +441,27 @@ early_idt_ripmsg:
1414 + .balign PAGE_SIZE; \
1415 + GLOBAL(name)
1416 +
1417 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1418 ++/*
1419 ++ * Each PGD needs to be 8k long and 8k aligned. We do not
1420 ++ * ever go out to userspace with these, so we do not
1421 ++ * strictly *need* the second page, but this allows us to
1422 ++ * have a single set_pgd() implementation that does not
1423 ++ * need to worry about whether it has 4k or 8k to work
1424 ++ * with.
1425 ++ *
1426 ++ * This ensures PGDs are 8k long:
1427 ++ */
1428 ++#define KAISER_USER_PGD_FILL 512
1429 ++/* This ensures they are 8k-aligned: */
1430 ++#define NEXT_PGD_PAGE(name) \
1431 ++ .balign 2 * PAGE_SIZE; \
1432 ++GLOBAL(name)
1433 ++#else
1434 ++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
1435 ++#define KAISER_USER_PGD_FILL 0
1436 ++#endif
1437 ++
1438 + /* Automate the creation of 1 to 1 mapping pmd entries */
1439 + #define PMDS(START, PERM, COUNT) \
1440 + i = 0 ; \
1441 +@@ -450,9 +471,10 @@ GLOBAL(name)
1442 + .endr
1443 +
1444 + __INITDATA
1445 +-NEXT_PAGE(early_level4_pgt)
1446 ++NEXT_PGD_PAGE(early_level4_pgt)
1447 + .fill 511,8,0
1448 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
1449 ++ .fill KAISER_USER_PGD_FILL,8,0
1450 +
1451 + NEXT_PAGE(early_dynamic_pgts)
1452 + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
1453 +@@ -460,16 +482,18 @@ NEXT_PAGE(early_dynamic_pgts)
1454 + .data
1455 +
1456 + #ifndef CONFIG_XEN
1457 +-NEXT_PAGE(init_level4_pgt)
1458 ++NEXT_PGD_PAGE(init_level4_pgt)
1459 + .fill 512,8,0
1460 ++ .fill KAISER_USER_PGD_FILL,8,0
1461 + #else
1462 +-NEXT_PAGE(init_level4_pgt)
1463 ++NEXT_PGD_PAGE(init_level4_pgt)
1464 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1465 + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
1466 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1467 + .org init_level4_pgt + L4_START_KERNEL*8, 0
1468 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
1469 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
1470 ++ .fill KAISER_USER_PGD_FILL,8,0
1471 +
1472 + NEXT_PAGE(level3_ident_pgt)
1473 + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1474 +@@ -480,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
1475 + */
1476 + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
1477 + #endif
1478 ++ .fill KAISER_USER_PGD_FILL,8,0
1479 +
1480 + NEXT_PAGE(level3_kernel_pgt)
1481 + .fill L3_START_KERNEL,8,0
1482 +diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
1483 +index 1423ab1b0312..f480b38a03c3 100644
1484 +--- a/arch/x86/kernel/irqinit.c
1485 ++++ b/arch/x86/kernel/irqinit.c
1486 +@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
1487 + .flags = IRQF_NO_THREAD,
1488 + };
1489 +
1490 +-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
1491 ++DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
1492 + [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
1493 + };
1494 +
1495 +diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
1496 +index 2bd81e302427..ec1b06dc82d2 100644
1497 +--- a/arch/x86/kernel/kvmclock.c
1498 ++++ b/arch/x86/kernel/kvmclock.c
1499 +@@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock);
1500 + static struct pvclock_vsyscall_time_info *hv_clock;
1501 + static struct pvclock_wall_clock wall_clock;
1502 +
1503 ++struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
1504 ++{
1505 ++ return hv_clock;
1506 ++}
1507 ++
1508 + /*
1509 + * The wallclock is the time of day when we booted. Since then, some time may
1510 + * have elapsed since the hypervisor wrote the data. So we try to account for
1511 +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
1512 +index d6279593bcdd..bc429365b72a 100644
1513 +--- a/arch/x86/kernel/ldt.c
1514 ++++ b/arch/x86/kernel/ldt.c
1515 +@@ -16,6 +16,7 @@
1516 + #include <linux/slab.h>
1517 + #include <linux/vmalloc.h>
1518 + #include <linux/uaccess.h>
1519 ++#include <linux/kaiser.h>
1520 +
1521 + #include <asm/ldt.h>
1522 + #include <asm/desc.h>
1523 +@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
1524 + set_ldt(pc->ldt->entries, pc->ldt->size);
1525 + }
1526 +
1527 ++static void __free_ldt_struct(struct ldt_struct *ldt)
1528 ++{
1529 ++ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
1530 ++ vfree(ldt->entries);
1531 ++ else
1532 ++ free_page((unsigned long)ldt->entries);
1533 ++ kfree(ldt);
1534 ++}
1535 ++
1536 + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
1537 + static struct ldt_struct *alloc_ldt_struct(int size)
1538 + {
1539 + struct ldt_struct *new_ldt;
1540 + int alloc_size;
1541 ++ int ret;
1542 +
1543 + if (size > LDT_ENTRIES)
1544 + return NULL;
1545 +@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
1546 + return NULL;
1547 + }
1548 +
1549 ++ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
1550 ++ __PAGE_KERNEL);
1551 + new_ldt->size = size;
1552 ++ if (ret) {
1553 ++ __free_ldt_struct(new_ldt);
1554 ++ return NULL;
1555 ++ }
1556 + return new_ldt;
1557 + }
1558 +
1559 +@@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
1560 + if (likely(!ldt))
1561 + return;
1562 +
1563 ++ kaiser_remove_mapping((unsigned long)ldt->entries,
1564 ++ ldt->size * LDT_ENTRY_SIZE);
1565 + paravirt_free_ldt(ldt->entries, ldt->size);
1566 +- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
1567 +- vfree(ldt->entries);
1568 +- else
1569 +- free_page((unsigned long)ldt->entries);
1570 +- kfree(ldt);
1571 ++ __free_ldt_struct(ldt);
1572 + }
1573 +
1574 + /*
1575 +diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
1576 +index 8aa05583bc42..0677bf8d3a42 100644
1577 +--- a/arch/x86/kernel/paravirt_patch_64.c
1578 ++++ b/arch/x86/kernel/paravirt_patch_64.c
1579 +@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
1580 + DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
1581 + DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
1582 + DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
1583 +-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
1584 + DEF_NATIVE(pv_cpu_ops, clts, "clts");
1585 + DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
1586 +
1587 +@@ -62,7 +61,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
1588 + PATCH_SITE(pv_mmu_ops, read_cr3);
1589 + PATCH_SITE(pv_mmu_ops, write_cr3);
1590 + PATCH_SITE(pv_cpu_ops, clts);
1591 +- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
1592 + PATCH_SITE(pv_cpu_ops, wbinvd);
1593 + #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
1594 + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
1595 +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
1596 +index 9f7c21c22477..7c5c5dc90ffa 100644
1597 +--- a/arch/x86/kernel/process.c
1598 ++++ b/arch/x86/kernel/process.c
1599 +@@ -39,7 +39,7 @@
1600 + * section. Since TSS's are completely CPU-local, we want them
1601 + * on exact cacheline boundaries, to eliminate cacheline ping-pong.
1602 + */
1603 +-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
1604 ++__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
1605 + .x86_tss = {
1606 + .sp0 = TOP_OF_INIT_STACK,
1607 + #ifdef CONFIG_X86_32
1608 +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
1609 +index e67b834279b2..bbaae4cf9e8e 100644
1610 +--- a/arch/x86/kernel/setup.c
1611 ++++ b/arch/x86/kernel/setup.c
1612 +@@ -112,6 +112,7 @@
1613 + #include <asm/alternative.h>
1614 + #include <asm/prom.h>
1615 + #include <asm/microcode.h>
1616 ++#include <asm/kaiser.h>
1617 +
1618 + /*
1619 + * max_low_pfn_mapped: highest direct mapped pfn under 4GB
1620 +@@ -1016,6 +1017,12 @@ void __init setup_arch(char **cmdline_p)
1621 + */
1622 + init_hypervisor_platform();
1623 +
1624 ++ /*
1625 ++ * This needs to happen right after XENPV is set on xen and
1626 ++ * kaiser_enabled is checked below in cleanup_highmap().
1627 ++ */
1628 ++ kaiser_check_boottime_disable();
1629 ++
1630 + x86_init.resources.probe_roms();
1631 +
1632 + /* after parse_early_param, so could debug it */
1633 +diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
1634 +index 1c113db9ed57..2bb5ee464df3 100644
1635 +--- a/arch/x86/kernel/tracepoint.c
1636 ++++ b/arch/x86/kernel/tracepoint.c
1637 +@@ -9,10 +9,12 @@
1638 + #include <linux/atomic.h>
1639 +
1640 + atomic_t trace_idt_ctr = ATOMIC_INIT(0);
1641 ++__aligned(PAGE_SIZE)
1642 + struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
1643 + (unsigned long) trace_idt_table };
1644 +
1645 + /* No need to be aligned, but done to keep all IDTs defined the same way. */
1646 ++__aligned(PAGE_SIZE)
1647 + gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
1648 +
1649 + static int trace_irq_vector_refcount;
1650 +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
1651 +index 796f1ec67469..ccf17dbfea09 100644
1652 +--- a/arch/x86/kvm/x86.c
1653 ++++ b/arch/x86/kvm/x86.c
1654 +@@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1655 + return 1;
1656 +
1657 + /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
1658 +- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1659 ++ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
1660 ++ !is_long_mode(vcpu))
1661 + return 1;
1662 + }
1663 +
1664 +diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
1665 +index 422db000d727..a744506856b1 100644
1666 +--- a/arch/x86/lib/cmdline.c
1667 ++++ b/arch/x86/lib/cmdline.c
1668 +@@ -82,3 +82,108 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
1669 +
1670 + return 0; /* Buffer overrun */
1671 + }
1672 ++
1673 ++/*
1674 ++ * Find a non-boolean option (i.e. option=argument). In accordance with
1675 ++ * standard Linux practice, if this option is repeated, this returns the
1676 ++ * last instance on the command line.
1677 ++ *
1678 ++ * @cmdline: the cmdline string
1679 ++ * @max_cmdline_size: the maximum size of cmdline
1680 ++ * @option: option string to look for
1681 ++ * @buffer: memory buffer to return the option argument
1682 ++ * @bufsize: size of the supplied memory buffer
1683 ++ *
1684 ++ * Returns the length of the argument (regardless of if it was
1685 ++ * truncated to fit in the buffer), or -1 on not found.
1686 ++ */
1687 ++static int
1688 ++__cmdline_find_option(const char *cmdline, int max_cmdline_size,
1689 ++ const char *option, char *buffer, int bufsize)
1690 ++{
1691 ++ char c;
1692 ++ int pos = 0, len = -1;
1693 ++ const char *opptr = NULL;
1694 ++ char *bufptr = buffer;
1695 ++ enum {
1696 ++ st_wordstart = 0, /* Start of word/after whitespace */
1697 ++ st_wordcmp, /* Comparing this word */
1698 ++ st_wordskip, /* Miscompare, skip */
1699 ++ st_bufcpy, /* Copying this to buffer */
1700 ++ } state = st_wordstart;
1701 ++
1702 ++ if (!cmdline)
1703 ++ return -1; /* No command line */
1704 ++
1705 ++ /*
1706 ++ * This 'pos' check ensures we do not overrun
1707 ++ * a non-NULL-terminated 'cmdline'
1708 ++ */
1709 ++ while (pos++ < max_cmdline_size) {
1710 ++ c = *(char *)cmdline++;
1711 ++ if (!c)
1712 ++ break;
1713 ++
1714 ++ switch (state) {
1715 ++ case st_wordstart:
1716 ++ if (myisspace(c))
1717 ++ break;
1718 ++
1719 ++ state = st_wordcmp;
1720 ++ opptr = option;
1721 ++ /* fall through */
1722 ++
1723 ++ case st_wordcmp:
1724 ++ if ((c == '=') && !*opptr) {
1725 ++ /*
1726 ++ * We matched all the way to the end of the
1727 ++ * option we were looking for, prepare to
1728 ++ * copy the argument.
1729 ++ */
1730 ++ len = 0;
1731 ++ bufptr = buffer;
1732 ++ state = st_bufcpy;
1733 ++ break;
1734 ++ } else if (c == *opptr++) {
1735 ++ /*
1736 ++ * We are currently matching, so continue
1737 ++ * to the next character on the cmdline.
1738 ++ */
1739 ++ break;
1740 ++ }
1741 ++ state = st_wordskip;
1742 ++ /* fall through */
1743 ++
1744 ++ case st_wordskip:
1745 ++ if (myisspace(c))
1746 ++ state = st_wordstart;
1747 ++ break;
1748 ++
1749 ++ case st_bufcpy:
1750 ++ if (myisspace(c)) {
1751 ++ state = st_wordstart;
1752 ++ } else {
1753 ++ /*
1754 ++ * Increment len, but don't overrun the
1755 ++ * supplied buffer and leave room for the
1756 ++ * NULL terminator.
1757 ++ */
1758 ++ if (++len < bufsize)
1759 ++ *bufptr++ = c;
1760 ++ }
1761 ++ break;
1762 ++ }
1763 ++ }
1764 ++
1765 ++ if (bufsize)
1766 ++ *bufptr = '\0';
1767 ++
1768 ++ return len;
1769 ++}
1770 ++
1771 ++int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
1772 ++ int bufsize)
1773 ++{
1774 ++ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
1775 ++ buffer, bufsize);
1776 ++}
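
The helper added above scans the boot command line as a small state machine: it matches "option=" words (the last occurrence wins), copies the argument into the caller's buffer, NUL-terminates it, and returns the argument length, or -1 when the option is absent. Below is a minimal, hedged sketch of how a caller consumes the exported cmdline_find_option(); the option name "foo" and the wrapper function are made up for illustration (headers assumed to be the usual cmdline/setup includes), while the real caller in this patch is kaiser_check_boottime_disable() further down, which looks up "pti":

    /* Illustrative only: "foo" and parse_foo() are hypothetical. */
    static void __init parse_foo(void)
    {
            char arg[5];
            int len;

            /* len is the argument length, or -1 if "foo=" is absent. */
            len = cmdline_find_option(boot_command_line, "foo", arg, sizeof(arg));
            if (len == 2 && !strncmp(arg, "on", 2))
                    pr_info("foo=on\n");
            else if (len == 3 && !strncmp(arg, "off", 3))
                    pr_info("foo=off\n");
    }

Note that the length is returned even when the argument was truncated to fit the buffer, so callers should size the buffer for the longest expected value plus the terminator.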
1777 +diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
1778 +index 1ae7c141f778..61e6cead9c4a 100644
1779 +--- a/arch/x86/mm/Makefile
1780 ++++ b/arch/x86/mm/Makefile
1781 +@@ -32,3 +32,4 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
1782 + obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
1783 +
1784 + obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
1785 ++obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o
1786 +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
1787 +index ed4b372860e4..2bd45ae91eb3 100644
1788 +--- a/arch/x86/mm/init.c
1789 ++++ b/arch/x86/mm/init.c
1790 +@@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void)
1791 + cr4_set_bits_and_update_boot(X86_CR4_PSE);
1792 +
1793 + /* Enable PGE if available */
1794 +- if (cpu_has_pge) {
1795 ++ if (cpu_has_pge && !kaiser_enabled) {
1796 + cr4_set_bits_and_update_boot(X86_CR4_PGE);
1797 + __supported_pte_mask |= _PAGE_GLOBAL;
1798 + } else
1799 +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
1800 +index ec081fe0ce2c..d76ec9348cff 100644
1801 +--- a/arch/x86/mm/init_64.c
1802 ++++ b/arch/x86/mm/init_64.c
1803 +@@ -395,6 +395,16 @@ void __init cleanup_highmap(void)
1804 + continue;
1805 + if (vaddr < (unsigned long) _text || vaddr > end)
1806 + set_pmd(pmd, __pmd(0));
1807 ++ else if (kaiser_enabled) {
1808 ++ /*
1809 ++ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
1810 ++ * clear that now. This is not important, so long as
1811 ++ * CR4.PGE remains clear, but it removes an anomaly.
1812 ++ * Physical mapping setup below avoids _PAGE_GLOBAL
1813 ++ * by use of massage_pgprot() inside pfn_pte() etc.
1814 ++ */
1815 ++ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
1816 ++ }
1817 + }
1818 + }
1819 +
1820 +diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
1821 +new file mode 100644
1822 +index 000000000000..b0b3a69f1c7f
1823 +--- /dev/null
1824 ++++ b/arch/x86/mm/kaiser.c
1825 +@@ -0,0 +1,456 @@
1826 ++#include <linux/bug.h>
1827 ++#include <linux/kernel.h>
1828 ++#include <linux/errno.h>
1829 ++#include <linux/string.h>
1830 ++#include <linux/types.h>
1831 ++#include <linux/bug.h>
1832 ++#include <linux/init.h>
1833 ++#include <linux/interrupt.h>
1834 ++#include <linux/spinlock.h>
1835 ++#include <linux/mm.h>
1836 ++#include <linux/uaccess.h>
1837 ++#include <linux/ftrace.h>
1838 ++
1839 ++#undef pr_fmt
1840 ++#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
1841 ++
1842 ++#include <asm/kaiser.h>
1843 ++#include <asm/tlbflush.h> /* to verify its kaiser declarations */
1844 ++#include <asm/pgtable.h>
1845 ++#include <asm/pgalloc.h>
1846 ++#include <asm/desc.h>
1847 ++#include <asm/cmdline.h>
1848 ++
1849 ++int kaiser_enabled __read_mostly = 1;
1850 ++EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
1851 ++
1852 ++__visible
1853 ++DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
1854 ++
1855 ++/*
1856 ++ * These can have bit 63 set, so we can not just use a plain "or"
1857 ++ * instruction to get their value or'd into CR3. It would take
1858 ++ * another register. So, we use a memory reference to these instead.
1859 ++ *
1860 ++ * This is also handy because systems that do not support PCIDs
1861 ++ * just end up or'ing a 0 into their CR3, which does no harm.
1862 ++ */
1863 ++DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
1864 ++
1865 ++/*
1866 ++ * At runtime, the only things we map are some things for CPU
1867 ++ * hotplug, and stacks for new processes. No two CPUs will ever
1868 ++ * be populating the same addresses, so we only need to ensure
1869 ++ * that we protect between two CPUs trying to allocate and
1870 ++ * populate the same page table page.
1871 ++ *
1872 ++ * Only take this lock when doing a set_p[4um]d(), but it is not
1873 ++ * needed for doing a set_pte(). We assume that only the *owner*
1874 ++ * of a given allocation will be doing this for _their_
1875 ++ * allocation.
1876 ++ *
1877 ++ * This ensures that once a system has been running for a while
1878 ++ * and there have been stacks all over and these page tables
1879 ++ * are fully populated, there will be no further acquisitions of
1880 ++ * this lock.
1881 ++ */
1882 ++static DEFINE_SPINLOCK(shadow_table_allocation_lock);
1883 ++
1884 ++/*
1885 ++ * Returns -1 on error.
1886 ++ */
1887 ++static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
1888 ++{
1889 ++ pgd_t *pgd;
1890 ++ pud_t *pud;
1891 ++ pmd_t *pmd;
1892 ++ pte_t *pte;
1893 ++
1894 ++ pgd = pgd_offset_k(vaddr);
1895 ++ /*
1896 ++ * We made all the kernel PGDs present in kaiser_init().
1897 ++ * We expect them to stay that way.
1898 ++ */
1899 ++ BUG_ON(pgd_none(*pgd));
1900 ++ /*
1901 ++ * PGDs are either 512GB or 128TB on all x86_64
1902 ++ * configurations. We don't handle these.
1903 ++ */
1904 ++ BUG_ON(pgd_large(*pgd));
1905 ++
1906 ++ pud = pud_offset(pgd, vaddr);
1907 ++ if (pud_none(*pud)) {
1908 ++ WARN_ON_ONCE(1);
1909 ++ return -1;
1910 ++ }
1911 ++
1912 ++ if (pud_large(*pud))
1913 ++ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
1914 ++
1915 ++ pmd = pmd_offset(pud, vaddr);
1916 ++ if (pmd_none(*pmd)) {
1917 ++ WARN_ON_ONCE(1);
1918 ++ return -1;
1919 ++ }
1920 ++
1921 ++ if (pmd_large(*pmd))
1922 ++ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
1923 ++
1924 ++ pte = pte_offset_kernel(pmd, vaddr);
1925 ++ if (pte_none(*pte)) {
1926 ++ WARN_ON_ONCE(1);
1927 ++ return -1;
1928 ++ }
1929 ++
1930 ++ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
1931 ++}
1932 ++
1933 ++/*
1934 ++ * This is a relatively normal page table walk, except that it
1935 ++ * also tries to allocate page tables pages along the way.
1936 ++ *
1937 ++ * Returns a pointer to a PTE on success, or NULL on failure.
1938 ++ */
1939 ++static pte_t *kaiser_pagetable_walk(unsigned long address)
1940 ++{
1941 ++ pmd_t *pmd;
1942 ++ pud_t *pud;
1943 ++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
1944 ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
1945 ++
1946 ++ if (pgd_none(*pgd)) {
1947 ++ WARN_ONCE(1, "All shadow pgds should have been populated");
1948 ++ return NULL;
1949 ++ }
1950 ++ BUILD_BUG_ON(pgd_large(*pgd) != 0);
1951 ++
1952 ++ pud = pud_offset(pgd, address);
1953 ++ /* The shadow page tables do not use large mappings: */
1954 ++ if (pud_large(*pud)) {
1955 ++ WARN_ON(1);
1956 ++ return NULL;
1957 ++ }
1958 ++ if (pud_none(*pud)) {
1959 ++ unsigned long new_pmd_page = __get_free_page(gfp);
1960 ++ if (!new_pmd_page)
1961 ++ return NULL;
1962 ++ spin_lock(&shadow_table_allocation_lock);
1963 ++ if (pud_none(*pud)) {
1964 ++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
1965 ++ __inc_zone_page_state(virt_to_page((void *)
1966 ++ new_pmd_page), NR_KAISERTABLE);
1967 ++ } else
1968 ++ free_page(new_pmd_page);
1969 ++ spin_unlock(&shadow_table_allocation_lock);
1970 ++ }
1971 ++
1972 ++ pmd = pmd_offset(pud, address);
1973 ++ /* The shadow page tables do not use large mappings: */
1974 ++ if (pmd_large(*pmd)) {
1975 ++ WARN_ON(1);
1976 ++ return NULL;
1977 ++ }
1978 ++ if (pmd_none(*pmd)) {
1979 ++ unsigned long new_pte_page = __get_free_page(gfp);
1980 ++ if (!new_pte_page)
1981 ++ return NULL;
1982 ++ spin_lock(&shadow_table_allocation_lock);
1983 ++ if (pmd_none(*pmd)) {
1984 ++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
1985 ++ __inc_zone_page_state(virt_to_page((void *)
1986 ++ new_pte_page), NR_KAISERTABLE);
1987 ++ } else
1988 ++ free_page(new_pte_page);
1989 ++ spin_unlock(&shadow_table_allocation_lock);
1990 ++ }
1991 ++
1992 ++ return pte_offset_kernel(pmd, address);
1993 ++}
1994 ++
1995 ++static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
1996 ++ unsigned long flags)
1997 ++{
1998 ++ int ret = 0;
1999 ++ pte_t *pte;
2000 ++ unsigned long start_addr = (unsigned long )__start_addr;
2001 ++ unsigned long address = start_addr & PAGE_MASK;
2002 ++ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
2003 ++ unsigned long target_address;
2004 ++
2005 ++ /*
2006 ++ * It is convenient for callers to pass in __PAGE_KERNEL etc,
2007 ++ * and there is no actual harm from setting _PAGE_GLOBAL, so
2008 ++ * long as CR4.PGE is not set. But it is nonetheless troubling
2009 ++ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
2010 ++ * requires that not to be #defined to 0): so mask it off here.
2011 ++ */
2012 ++ flags &= ~_PAGE_GLOBAL;
2013 ++
2014 ++ for (; address < end_addr; address += PAGE_SIZE) {
2015 ++ target_address = get_pa_from_mapping(address);
2016 ++ if (target_address == -1) {
2017 ++ ret = -EIO;
2018 ++ break;
2019 ++ }
2020 ++ pte = kaiser_pagetable_walk(address);
2021 ++ if (!pte) {
2022 ++ ret = -ENOMEM;
2023 ++ break;
2024 ++ }
2025 ++ if (pte_none(*pte)) {
2026 ++ set_pte(pte, __pte(flags | target_address));
2027 ++ } else {
2028 ++ pte_t tmp;
2029 ++ set_pte(&tmp, __pte(flags | target_address));
2030 ++ WARN_ON_ONCE(!pte_same(*pte, tmp));
2031 ++ }
2032 ++ }
2033 ++ return ret;
2034 ++}
2035 ++
2036 ++static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
2037 ++{
2038 ++ unsigned long size = end - start;
2039 ++
2040 ++ return kaiser_add_user_map(start, size, flags);
2041 ++}
2042 ++
2043 ++/*
2044 ++ * Ensure that the top level of the (shadow) page tables are
2045 ++ * entirely populated. This ensures that all processes that get
2046 ++ * forked have the same entries. This way, we do not have to
2047 ++ * ever go set up new entries in older processes.
2048 ++ *
2049 ++ * Note: we never free these, so there are no updates to them
2050 ++ * after this.
2051 ++ */
2052 ++static void __init kaiser_init_all_pgds(void)
2053 ++{
2054 ++ pgd_t *pgd;
2055 ++ int i = 0;
2056 ++
2057 ++ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
2058 ++ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
2059 ++ pgd_t new_pgd;
2060 ++ pud_t *pud = pud_alloc_one(&init_mm,
2061 ++ PAGE_OFFSET + i * PGDIR_SIZE);
2062 ++ if (!pud) {
2063 ++ WARN_ON(1);
2064 ++ break;
2065 ++ }
2066 ++ inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
2067 ++ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
2068 ++ /*
2069 ++ * Make sure not to stomp on some other pgd entry.
2070 ++ */
2071 ++ if (!pgd_none(pgd[i])) {
2072 ++ WARN_ON(1);
2073 ++ continue;
2074 ++ }
2075 ++ set_pgd(pgd + i, new_pgd);
2076 ++ }
2077 ++}
2078 ++
2079 ++#define kaiser_add_user_map_early(start, size, flags) do { \
2080 ++ int __ret = kaiser_add_user_map(start, size, flags); \
2081 ++ WARN_ON(__ret); \
2082 ++} while (0)
2083 ++
2084 ++#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
2085 ++ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
2086 ++ WARN_ON(__ret); \
2087 ++} while (0)
2088 ++
2089 ++void __init kaiser_check_boottime_disable(void)
2090 ++{
2091 ++ bool enable = true;
2092 ++ char arg[5];
2093 ++ int ret;
2094 ++
2095 ++ if (boot_cpu_has(X86_FEATURE_XENPV))
2096 ++ goto silent_disable;
2097 ++
2098 ++ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
2099 ++ if (ret > 0) {
2100 ++ if (!strncmp(arg, "on", 2))
2101 ++ goto enable;
2102 ++
2103 ++ if (!strncmp(arg, "off", 3))
2104 ++ goto disable;
2105 ++
2106 ++ if (!strncmp(arg, "auto", 4))
2107 ++ goto skip;
2108 ++ }
2109 ++
2110 ++ if (cmdline_find_option_bool(boot_command_line, "nopti"))
2111 ++ goto disable;
2112 ++
2113 ++skip:
2114 ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
2115 ++ goto disable;
2116 ++
2117 ++enable:
2118 ++ if (enable)
2119 ++ setup_force_cpu_cap(X86_FEATURE_KAISER);
2120 ++
2121 ++ return;
2122 ++
2123 ++disable:
2124 ++ pr_info("disabled\n");
2125 ++
2126 ++silent_disable:
2127 ++ kaiser_enabled = 0;
2128 ++ setup_clear_cpu_cap(X86_FEATURE_KAISER);
2129 ++}
2130 ++
2131 ++/*
2132 ++ * If anything in here fails, we will likely die on one of the
2133 ++ * first kernel->user transitions and init will die. But, we
2134 ++ * will have most of the kernel up by then and should be able to
2135 ++ * get a clean warning out of it. If we BUG_ON() here, we run
2136 ++ * the risk of being before we have good console output.
2137 ++ */
2138 ++void __init kaiser_init(void)
2139 ++{
2140 ++ int cpu;
2141 ++
2142 ++ if (!kaiser_enabled)
2143 ++ return;
2144 ++
2145 ++ kaiser_init_all_pgds();
2146 ++
2147 ++ for_each_possible_cpu(cpu) {
2148 ++ void *percpu_vaddr = __per_cpu_user_mapped_start +
2149 ++ per_cpu_offset(cpu);
2150 ++ unsigned long percpu_sz = __per_cpu_user_mapped_end -
2151 ++ __per_cpu_user_mapped_start;
2152 ++ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
2153 ++ __PAGE_KERNEL);
2154 ++ }
2155 ++
2156 ++ /*
2157 ++ * Map the entry/exit text section, which is needed at
2158 ++ * switches from user to and from kernel.
2159 ++ */
2160 ++ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
2161 ++ __PAGE_KERNEL_RX);
2162 ++
2163 ++#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
2164 ++ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
2165 ++ __irqentry_text_end,
2166 ++ __PAGE_KERNEL_RX);
2167 ++#endif
2168 ++ kaiser_add_user_map_early((void *)idt_descr.address,
2169 ++ sizeof(gate_desc) * NR_VECTORS,
2170 ++ __PAGE_KERNEL_RO);
2171 ++#ifdef CONFIG_TRACING
2172 ++ kaiser_add_user_map_early(&trace_idt_descr,
2173 ++ sizeof(trace_idt_descr),
2174 ++ __PAGE_KERNEL);
2175 ++ kaiser_add_user_map_early(&trace_idt_table,
2176 ++ sizeof(gate_desc) * NR_VECTORS,
2177 ++ __PAGE_KERNEL);
2178 ++#endif
2179 ++ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
2180 ++ __PAGE_KERNEL);
2181 ++ kaiser_add_user_map_early(&debug_idt_table,
2182 ++ sizeof(gate_desc) * NR_VECTORS,
2183 ++ __PAGE_KERNEL);
2184 ++
2185 ++ pr_info("enabled\n");
2186 ++}
2187 ++
2188 ++/* Add a mapping to the shadow mapping, and synchronize the mappings */
2189 ++int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
2190 ++{
2191 ++ if (!kaiser_enabled)
2192 ++ return 0;
2193 ++ return kaiser_add_user_map((const void *)addr, size, flags);
2194 ++}
2195 ++
2196 ++void kaiser_remove_mapping(unsigned long start, unsigned long size)
2197 ++{
2198 ++ extern void unmap_pud_range_nofree(pgd_t *pgd,
2199 ++ unsigned long start, unsigned long end);
2200 ++ unsigned long end = start + size;
2201 ++ unsigned long addr, next;
2202 ++ pgd_t *pgd;
2203 ++
2204 ++ if (!kaiser_enabled)
2205 ++ return;
2206 ++ pgd = native_get_shadow_pgd(pgd_offset_k(start));
2207 ++ for (addr = start; addr < end; pgd++, addr = next) {
2208 ++ next = pgd_addr_end(addr, end);
2209 ++ unmap_pud_range_nofree(pgd, addr, next);
2210 ++ }
2211 ++}
2212 ++
2213 ++/*
2214 ++ * Page table pages are page-aligned. The lower half of the top
2215 ++ * level is used for userspace and the top half for the kernel.
2216 ++ * This returns true for user pages that need to get copied into
2217 ++ * both the user and kernel copies of the page tables, and false
2218 ++ * for kernel pages that should only be in the kernel copy.
2219 ++ */
2220 ++static inline bool is_userspace_pgd(pgd_t *pgdp)
2221 ++{
2222 ++ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
2223 ++}
2224 ++
2225 ++pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
2226 ++{
2227 ++ if (!kaiser_enabled)
2228 ++ return pgd;
2229 ++ /*
2230 ++ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
2231 ++ * skip cases like kexec and EFI which make temporary low mappings.
2232 ++ */
2233 ++ if (pgd.pgd & _PAGE_USER) {
2234 ++ if (is_userspace_pgd(pgdp)) {
2235 ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
2236 ++ /*
2237 ++ * Even if the entry is *mapping* userspace, ensure
2238 ++ * that userspace can not use it. This way, if we
2239 ++ * get out to userspace running on the kernel CR3,
2240 ++ * userspace will crash instead of running.
2241 ++ */
2242 ++ if (__supported_pte_mask & _PAGE_NX)
2243 ++ pgd.pgd |= _PAGE_NX;
2244 ++ }
2245 ++ } else if (!pgd.pgd) {
2246 ++ /*
2247 ++ * pgd_clear() cannot check _PAGE_USER, and is even used to
2248 ++ * clear corrupted pgd entries: so just rely on cases like
2249 ++ * kexec and EFI never to be using pgd_clear().
2250 ++ */
2251 ++ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
2252 ++ is_userspace_pgd(pgdp))
2253 ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
2254 ++ }
2255 ++ return pgd;
2256 ++}
2257 ++
2258 ++void kaiser_setup_pcid(void)
2259 ++{
2260 ++ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
2261 ++
2262 ++ if (this_cpu_has(X86_FEATURE_PCID))
2263 ++ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
2264 ++ /*
2265 ++ * These variables are used by the entry/exit
2266 ++ * code to change PCID and pgd and TLB flushing.
2267 ++ */
2268 ++ this_cpu_write(x86_cr3_pcid_user, user_cr3);
2269 ++}
2270 ++
2271 ++/*
2272 ++ * Make a note that this cpu will need to flush USER tlb on return to user.
2273 ++ * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
2274 ++ */
2275 ++void kaiser_flush_tlb_on_return_to_user(void)
2276 ++{
2277 ++ if (this_cpu_has(X86_FEATURE_PCID))
2278 ++ this_cpu_write(x86_cr3_pcid_user,
2279 ++ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
2280 ++}
2281 ++EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
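
Taken together, kaiser.c exposes a small mapping API: anything the CPU must still reach while running on the user copy of the page tables is mirrored with kaiser_add_mapping() right after it is allocated and torn down with kaiser_remove_mapping() just before it is freed (both degrade to no-ops when kaiser_enabled is 0). A hedged sketch of that calling pattern, with made-up names; the real users in this patch are the LDT (arch/x86/kernel/ldt.c above) and the thread stacks (kernel/fork.c below):

    /* Illustrative only: my_buf and its helpers are hypothetical. */
    static void *my_alloc_user_visible(void)
    {
            void *my_buf = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);

            if (!my_buf)
                    return NULL;
            /* Mirror the page into the shadow (user) page tables. */
            if (kaiser_add_mapping((unsigned long)my_buf, PAGE_SIZE,
                                   __PAGE_KERNEL)) {
                    free_page((unsigned long)my_buf);
                    return NULL;
            }
            return my_buf;
    }

    static void my_free_user_visible(void *my_buf)
    {
            /* Unmap from the shadow tables before the page can be reused. */
            kaiser_remove_mapping((unsigned long)my_buf, PAGE_SIZE);
            free_page((unsigned long)my_buf);
    }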
2282 +diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
2283 +index 4e5ac46adc9d..81ec7c02f968 100644
2284 +--- a/arch/x86/mm/kasan_init_64.c
2285 ++++ b/arch/x86/mm/kasan_init_64.c
2286 +@@ -121,11 +121,16 @@ void __init kasan_init(void)
2287 + kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
2288 + (void *)KASAN_SHADOW_END);
2289 +
2290 +- memset(kasan_zero_page, 0, PAGE_SIZE);
2291 +-
2292 + load_cr3(init_level4_pgt);
2293 + __flush_tlb_all();
2294 +- init_task.kasan_depth = 0;
2295 +
2296 ++ /*
2297 ++ * kasan_zero_page has been used as early shadow memory, thus it may
2298 ++ * contain some garbage. Now we can clear it, since after the TLB flush
2299 ++ * no one should write to it.
2300 ++ */
2301 ++ memset(kasan_zero_page, 0, PAGE_SIZE);
2302 ++
2303 ++ init_task.kasan_depth = 0;
2304 + pr_info("KernelAddressSanitizer initialized\n");
2305 + }
2306 +diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
2307 +index b599a780a5a9..79377e2a7bcd 100644
2308 +--- a/arch/x86/mm/pageattr.c
2309 ++++ b/arch/x86/mm/pageattr.c
2310 +@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
2311 + #define CPA_FLUSHTLB 1
2312 + #define CPA_ARRAY 2
2313 + #define CPA_PAGES_ARRAY 4
2314 ++#define CPA_FREE_PAGETABLES 8
2315 +
2316 + #ifdef CONFIG_PROC_FS
2317 + static unsigned long direct_pages_count[PG_LEVEL_NUM];
2318 +@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
2319 + return 0;
2320 + }
2321 +
2322 +-static bool try_to_free_pte_page(pte_t *pte)
2323 ++static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
2324 + {
2325 + int i;
2326 +
2327 ++ if (!(cpa->flags & CPA_FREE_PAGETABLES))
2328 ++ return false;
2329 ++
2330 + for (i = 0; i < PTRS_PER_PTE; i++)
2331 + if (!pte_none(pte[i]))
2332 + return false;
2333 +@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte)
2334 + return true;
2335 + }
2336 +
2337 +-static bool try_to_free_pmd_page(pmd_t *pmd)
2338 ++static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
2339 + {
2340 + int i;
2341 +
2342 ++ if (!(cpa->flags & CPA_FREE_PAGETABLES))
2343 ++ return false;
2344 ++
2345 + for (i = 0; i < PTRS_PER_PMD; i++)
2346 + if (!pmd_none(pmd[i]))
2347 + return false;
2348 +@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud)
2349 + return true;
2350 + }
2351 +
2352 +-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
2353 ++static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
2354 ++ unsigned long start,
2355 ++ unsigned long end)
2356 + {
2357 + pte_t *pte = pte_offset_kernel(pmd, start);
2358 +
2359 +@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
2360 + pte++;
2361 + }
2362 +
2363 +- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
2364 ++ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
2365 + pmd_clear(pmd);
2366 + return true;
2367 + }
2368 + return false;
2369 + }
2370 +
2371 +-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
2372 ++static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
2373 + unsigned long start, unsigned long end)
2374 + {
2375 +- if (unmap_pte_range(pmd, start, end))
2376 +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
2377 ++ if (unmap_pte_range(cpa, pmd, start, end))
2378 ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
2379 + pud_clear(pud);
2380 + }
2381 +
2382 +-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2383 ++static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
2384 ++ unsigned long start, unsigned long end)
2385 + {
2386 + pmd_t *pmd = pmd_offset(pud, start);
2387 +
2388 +@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2389 + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
2390 + unsigned long pre_end = min_t(unsigned long, end, next_page);
2391 +
2392 +- __unmap_pmd_range(pud, pmd, start, pre_end);
2393 ++ __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
2394 +
2395 + start = pre_end;
2396 + pmd++;
2397 +@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2398 + if (pmd_large(*pmd))
2399 + pmd_clear(pmd);
2400 + else
2401 +- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
2402 ++ __unmap_pmd_range(cpa, pud, pmd,
2403 ++ start, start + PMD_SIZE);
2404 +
2405 + start += PMD_SIZE;
2406 + pmd++;
2407 +@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2408 + * 4K leftovers?
2409 + */
2410 + if (start < end)
2411 +- return __unmap_pmd_range(pud, pmd, start, end);
2412 ++ return __unmap_pmd_range(cpa, pud, pmd, start, end);
2413 +
2414 + /*
2415 + * Try again to free the PMD page if haven't succeeded above.
2416 + */
2417 + if (!pud_none(*pud))
2418 +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
2419 ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
2420 + pud_clear(pud);
2421 + }
2422 +
2423 +-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2424 ++static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
2425 ++ unsigned long start,
2426 ++ unsigned long end)
2427 + {
2428 + pud_t *pud = pud_offset(pgd, start);
2429 +
2430 +@@ -840,7 +853,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2431 + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
2432 + unsigned long pre_end = min_t(unsigned long, end, next_page);
2433 +
2434 +- unmap_pmd_range(pud, start, pre_end);
2435 ++ unmap_pmd_range(cpa, pud, start, pre_end);
2436 +
2437 + start = pre_end;
2438 + pud++;
2439 +@@ -854,7 +867,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2440 + if (pud_large(*pud))
2441 + pud_clear(pud);
2442 + else
2443 +- unmap_pmd_range(pud, start, start + PUD_SIZE);
2444 ++ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
2445 +
2446 + start += PUD_SIZE;
2447 + pud++;
2448 +@@ -864,7 +877,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2449 + * 2M leftovers?
2450 + */
2451 + if (start < end)
2452 +- unmap_pmd_range(pud, start, end);
2453 ++ unmap_pmd_range(cpa, pud, start, end);
2454 +
2455 + /*
2456 + * No need to try to free the PUD page because we'll free it in
2457 +@@ -872,6 +885,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2458 + */
2459 + }
2460 +
2461 ++static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2462 ++{
2463 ++ struct cpa_data cpa = {
2464 ++ .flags = CPA_FREE_PAGETABLES,
2465 ++ };
2466 ++
2467 ++ __unmap_pud_range(&cpa, pgd, start, end);
2468 ++}
2469 ++
2470 ++void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
2471 ++{
2472 ++ struct cpa_data cpa = {
2473 ++ .flags = 0,
2474 ++ };
2475 ++
2476 ++ __unmap_pud_range(&cpa, pgd, start, end);
2477 ++}
2478 ++
2479 + static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
2480 + {
2481 + pgd_t *pgd_entry = root + pgd_index(addr);
2482 +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
2483 +index fb0a9dd1d6e4..dbc27a2b4ad5 100644
2484 +--- a/arch/x86/mm/pgtable.c
2485 ++++ b/arch/x86/mm/pgtable.c
2486 +@@ -6,7 +6,7 @@
2487 + #include <asm/fixmap.h>
2488 + #include <asm/mtrr.h>
2489 +
2490 +-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
2491 ++#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
2492 +
2493 + #ifdef CONFIG_HIGHPTE
2494 + #define PGALLOC_USER_GFP __GFP_HIGHMEM
2495 +@@ -340,14 +340,24 @@ static inline void _pgd_free(pgd_t *pgd)
2496 + kmem_cache_free(pgd_cache, pgd);
2497 + }
2498 + #else
2499 ++
2500 ++/*
2501 ++ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
2502 ++ * both 8k in size and 8k-aligned. That lets us just flip bit 12
2503 ++ * in a pointer to swap between the two 4k halves.
2504 ++ */
2505 ++#define PGD_ALLOCATION_ORDER kaiser_enabled
2506 ++
2507 + static inline pgd_t *_pgd_alloc(void)
2508 + {
2509 +- return (pgd_t *)__get_free_page(PGALLOC_GFP);
2510 ++ /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
2511 ++ return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
2512 ++ PGD_ALLOCATION_ORDER);
2513 + }
2514 +
2515 + static inline void _pgd_free(pgd_t *pgd)
2516 + {
2517 +- free_page((unsigned long)pgd);
2518 ++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
2519 + }
2520 + #endif /* CONFIG_X86_PAE */
2521 +
2522 +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
2523 +index 7a4cdb632508..7cad01af6dcd 100644
2524 +--- a/arch/x86/mm/tlb.c
2525 ++++ b/arch/x86/mm/tlb.c
2526 +@@ -6,13 +6,14 @@
2527 + #include <linux/interrupt.h>
2528 + #include <linux/module.h>
2529 + #include <linux/cpu.h>
2530 ++#include <linux/debugfs.h>
2531 +
2532 + #include <asm/tlbflush.h>
2533 + #include <asm/mmu_context.h>
2534 + #include <asm/cache.h>
2535 + #include <asm/apic.h>
2536 + #include <asm/uv/uv.h>
2537 +-#include <linux/debugfs.h>
2538 ++#include <asm/kaiser.h>
2539 +
2540 + /*
2541 + * TLB flushing, formerly SMP-only
2542 +@@ -34,6 +35,36 @@ struct flush_tlb_info {
2543 + unsigned long flush_end;
2544 + };
2545 +
2546 ++static void load_new_mm_cr3(pgd_t *pgdir)
2547 ++{
2548 ++ unsigned long new_mm_cr3 = __pa(pgdir);
2549 ++
2550 ++ if (kaiser_enabled) {
2551 ++ /*
2552 ++ * We reuse the same PCID for different tasks, so we must
2553 ++ * flush all the entries for the PCID out when we change tasks.
2554 ++ * Flush KERN below, flush USER when returning to userspace in
2555 ++ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
2556 ++ *
2557 ++ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
2558 ++ * do it here, but can only be used if X86_FEATURE_INVPCID is
2559 ++ * available - and many machines support pcid without invpcid.
2560 ++ *
2561 ++ * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
2562 ++ * would be needed in the write_cr3() below - if PCIDs enabled.
2563 ++ */
2564 ++ BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
2565 ++ kaiser_flush_tlb_on_return_to_user();
2566 ++ }
2567 ++
2568 ++ /*
2569 ++ * Caution: many callers of this function expect
2570 ++ * that load_cr3() is serializing and orders TLB
2571 ++ * fills with respect to the mm_cpumask writes.
2572 ++ */
2573 ++ write_cr3(new_mm_cr3);
2574 ++}
2575 ++
2576 + /*
2577 + * We cannot call mmdrop() because we are in interrupt context,
2578 + * instead update mm->cpu_vm_mask.
2579 +@@ -45,7 +76,7 @@ void leave_mm(int cpu)
2580 + BUG();
2581 + if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
2582 + cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
2583 +- load_cr3(swapper_pg_dir);
2584 ++ load_new_mm_cr3(swapper_pg_dir);
2585 + /*
2586 + * This gets called in the idle path where RCU
2587 + * functions differently. Tracing normally
2588 +@@ -105,7 +136,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
2589 + * ordering guarantee we need.
2590 + *
2591 + */
2592 +- load_cr3(next->pgd);
2593 ++ load_new_mm_cr3(next->pgd);
2594 +
2595 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
2596 +
2597 +@@ -152,7 +183,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
2598 + * As above, load_cr3() is serializing and orders TLB
2599 + * fills with respect to the mm_cpumask write.
2600 + */
2601 +- load_cr3(next->pgd);
2602 ++ load_new_mm_cr3(next->pgd);
2603 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
2604 + load_mm_cr4(next);
2605 + load_mm_ldt(next);
2606 +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
2607 +index ef2e8c97e183..a461b6604fd9 100644
2608 +--- a/include/asm-generic/vmlinux.lds.h
2609 ++++ b/include/asm-generic/vmlinux.lds.h
2610 +@@ -725,7 +725,14 @@
2611 + */
2612 + #define PERCPU_INPUT(cacheline) \
2613 + VMLINUX_SYMBOL(__per_cpu_start) = .; \
2614 ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
2615 + *(.data..percpu..first) \
2616 ++ . = ALIGN(cacheline); \
2617 ++ *(.data..percpu..user_mapped) \
2618 ++ *(.data..percpu..user_mapped..shared_aligned) \
2619 ++ . = ALIGN(PAGE_SIZE); \
2620 ++ *(.data..percpu..user_mapped..page_aligned) \
2621 ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
2622 + . = ALIGN(PAGE_SIZE); \
2623 + *(.data..percpu..page_aligned) \
2624 + . = ALIGN(cacheline); \
2625 +diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
2626 +new file mode 100644
2627 +index 000000000000..58c55b1589d0
2628 +--- /dev/null
2629 ++++ b/include/linux/kaiser.h
2630 +@@ -0,0 +1,52 @@
2631 ++#ifndef _LINUX_KAISER_H
2632 ++#define _LINUX_KAISER_H
2633 ++
2634 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2635 ++#include <asm/kaiser.h>
2636 ++
2637 ++static inline int kaiser_map_thread_stack(void *stack)
2638 ++{
2639 ++ /*
2640 ++ * Map that page of kernel stack on which we enter from user context.
2641 ++ */
2642 ++ return kaiser_add_mapping((unsigned long)stack +
2643 ++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
2644 ++}
2645 ++
2646 ++static inline void kaiser_unmap_thread_stack(void *stack)
2647 ++{
2648 ++ /*
2649 ++ * Note: may be called even when kaiser_map_thread_stack() failed.
2650 ++ */
2651 ++ kaiser_remove_mapping((unsigned long)stack +
2652 ++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
2653 ++}
2654 ++#else
2655 ++
2656 ++/*
2657 ++ * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
2658 ++ * includes architectures that support KAISER, but have it disabled.
2659 ++ */
2660 ++
2661 ++static inline void kaiser_init(void)
2662 ++{
2663 ++}
2664 ++static inline int kaiser_add_mapping(unsigned long addr,
2665 ++ unsigned long size, unsigned long flags)
2666 ++{
2667 ++ return 0;
2668 ++}
2669 ++static inline void kaiser_remove_mapping(unsigned long start,
2670 ++ unsigned long size)
2671 ++{
2672 ++}
2673 ++static inline int kaiser_map_thread_stack(void *stack)
2674 ++{
2675 ++ return 0;
2676 ++}
2677 ++static inline void kaiser_unmap_thread_stack(void *stack)
2678 ++{
2679 ++}
2680 ++
2681 ++#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
2682 ++#endif /* _LINUX_KAISER_H */
2683 +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
2684 +index ff88d6189411..b93b578cfa42 100644
2685 +--- a/include/linux/mmzone.h
2686 ++++ b/include/linux/mmzone.h
2687 +@@ -131,8 +131,9 @@ enum zone_stat_item {
2688 + NR_SLAB_RECLAIMABLE,
2689 + NR_SLAB_UNRECLAIMABLE,
2690 + NR_PAGETABLE, /* used for pagetables */
2691 +- NR_KERNEL_STACK,
2692 + /* Second 128 byte cacheline */
2693 ++ NR_KERNEL_STACK,
2694 ++ NR_KAISERTABLE,
2695 + NR_UNSTABLE_NFS, /* NFS unstable pages */
2696 + NR_BOUNCE,
2697 + NR_VMSCAN_WRITE,
2698 +diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
2699 +index 8f16299ca068..8902f23bb770 100644
2700 +--- a/include/linux/percpu-defs.h
2701 ++++ b/include/linux/percpu-defs.h
2702 +@@ -35,6 +35,12 @@
2703 +
2704 + #endif
2705 +
2706 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2707 ++#define USER_MAPPED_SECTION "..user_mapped"
2708 ++#else
2709 ++#define USER_MAPPED_SECTION ""
2710 ++#endif
2711 ++
2712 + /*
2713 + * Base implementations of per-CPU variable declarations and definitions, where
2714 + * the section in which the variable is to be placed is provided by the
2715 +@@ -115,6 +121,12 @@
2716 + #define DEFINE_PER_CPU(type, name) \
2717 + DEFINE_PER_CPU_SECTION(type, name, "")
2718 +
2719 ++#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
2720 ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
2721 ++
2722 ++#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
2723 ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
2724 ++
2725 + /*
2726 + * Declaration/definition used for per-CPU variables that must come first in
2727 + * the set of variables.
2728 +@@ -144,6 +156,14 @@
2729 + DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
2730 + ____cacheline_aligned_in_smp
2731 +
2732 ++#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
2733 ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
2734 ++ ____cacheline_aligned_in_smp
2735 ++
2736 ++#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
2737 ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
2738 ++ ____cacheline_aligned_in_smp
2739 ++
2740 + #define DECLARE_PER_CPU_ALIGNED(type, name) \
2741 + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
2742 + ____cacheline_aligned
2743 +@@ -162,11 +182,21 @@
2744 + #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
2745 + DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
2746 + __aligned(PAGE_SIZE)
2747 ++/*
2748 ++ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
2749 ++ */
2750 ++#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
2751 ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
2752 ++ __aligned(PAGE_SIZE)
2753 ++
2754 ++#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
2755 ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
2756 ++ __aligned(PAGE_SIZE)
2757 +
2758 + /*
2759 + * Declaration/definition used for per-CPU variables that must be read mostly.
2760 + */
2761 +-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
2762 ++#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
2763 + DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
2764 +
2765 + #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
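
The new *_USER_MAPPED per-CPU variants above place their variables in the .data..percpu..user_mapped* input sections that the vmlinux.lds.h hunk collects between __per_cpu_user_mapped_start and __per_cpu_user_mapped_end, which kaiser_init() then maps into the shadow page tables for every possible CPU. A short, hedged example of defining such a variable; the variable name is made up, while the real users in this patch are vector_irq, cpu_tss and unsafe_stack_register_backup:

    /* In a header shared by users of the variable: */
    DECLARE_PER_CPU_USER_MAPPED(unsigned long, my_entry_scratch);

    /* In exactly one .c file; this lands in .data..percpu..user_mapped,
     * so it stays addressable while running on the user CR3: */
    DEFINE_PER_CPU_USER_MAPPED(unsigned long, my_entry_scratch);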
2766 +diff --git a/init/main.c b/init/main.c
2767 +index 9e64d7097f1a..49926d95442f 100644
2768 +--- a/init/main.c
2769 ++++ b/init/main.c
2770 +@@ -81,6 +81,7 @@
2771 + #include <linux/integrity.h>
2772 + #include <linux/proc_ns.h>
2773 + #include <linux/io.h>
2774 ++#include <linux/kaiser.h>
2775 +
2776 + #include <asm/io.h>
2777 + #include <asm/bugs.h>
2778 +@@ -492,6 +493,7 @@ static void __init mm_init(void)
2779 + pgtable_init();
2780 + vmalloc_init();
2781 + ioremap_huge_init();
2782 ++ kaiser_init();
2783 + }
2784 +
2785 + asmlinkage __visible void __init start_kernel(void)
2786 +diff --git a/kernel/fork.c b/kernel/fork.c
2787 +index 68cfda1c1800..ac00f14208b7 100644
2788 +--- a/kernel/fork.c
2789 ++++ b/kernel/fork.c
2790 +@@ -58,6 +58,7 @@
2791 + #include <linux/tsacct_kern.h>
2792 + #include <linux/cn_proc.h>
2793 + #include <linux/freezer.h>
2794 ++#include <linux/kaiser.h>
2795 + #include <linux/delayacct.h>
2796 + #include <linux/taskstats_kern.h>
2797 + #include <linux/random.h>
2798 +@@ -169,6 +170,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
2799 +
2800 + static inline void free_thread_info(struct thread_info *ti)
2801 + {
2802 ++ kaiser_unmap_thread_stack(ti);
2803 + free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
2804 + }
2805 + # else
2806 +@@ -352,6 +354,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
2807 + goto free_ti;
2808 +
2809 + tsk->stack = ti;
2810 ++
2811 ++ err = kaiser_map_thread_stack(tsk->stack);
2812 ++ if (err)
2813 ++ goto free_ti;
2814 + #ifdef CONFIG_SECCOMP
2815 + /*
2816 + * We must handle setting up seccomp filters once we're under
2817 +diff --git a/mm/vmstat.c b/mm/vmstat.c
2818 +index c344e3609c53..324b7e90b4c5 100644
2819 +--- a/mm/vmstat.c
2820 ++++ b/mm/vmstat.c
2821 +@@ -736,6 +736,7 @@ const char * const vmstat_text[] = {
2822 + "nr_slab_unreclaimable",
2823 + "nr_page_table_pages",
2824 + "nr_kernel_stack",
2825 ++ "nr_overhead",
2826 + "nr_unstable",
2827 + "nr_bounce",
2828 + "nr_vmscan_write",
2829 +diff --git a/security/Kconfig b/security/Kconfig
2830 +index e45237897b43..a3ebb6ee5bd5 100644
2831 +--- a/security/Kconfig
2832 ++++ b/security/Kconfig
2833 +@@ -31,6 +31,16 @@ config SECURITY
2834 +
2835 + If you are unsure how to answer this question, answer N.
2836 +
2837 ++config PAGE_TABLE_ISOLATION
2838 ++ bool "Remove the kernel mapping in user mode"
2839 ++ default y
2840 ++ depends on X86_64 && SMP
2841 ++ help
2842 ++ This enforces a strict kernel and user space isolation, in order
2843 ++ to close hardware side channels on kernel address information.
2844 ++
2845 ++ If you are unsure how to answer this question, answer Y.
2846 ++
2847 + config SECURITYFS
2848 + bool "Enable the securityfs filesystem"
2849 + help