1 |
commit: c7d7705101af05e259b3b84ffc59a60ff2b96142 |
2 |
Author: Alice Ferrazzi <alicef <AT> gentoo <DOT> org> |
3 |
AuthorDate: Fri Jan 5 15:05:38 2018 +0000 |
4 |
Commit: Alice Ferrazzi <alicef <AT> gentoo <DOT> org> |
5 |
CommitDate: Fri Jan 5 15:05:38 2018 +0000 |
6 |
URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=c7d77051 |
7 |
|
8 |
linux kernel 4.4.110 |
9 |
|
10 |
0000_README | 4 + |
11 |
1109_linux-4.4.110.patch | 2814 ++++++++++++++++++++++++++++++++++++++++++++++ |
12 |
2 files changed, 2818 insertions(+) |
13 |
|
14 |
diff --git a/0000_README b/0000_README |
15 |
index 3be106c..46149de 100644 |
16 |
--- a/0000_README |
17 |
+++ b/0000_README |
18 |
@@ -479,6 +479,10 @@ Patch: 1108_linux-4.4.109.patch |
19 |
From: http://www.kernel.org |
20 |
Desc: Linux 4.4.109 |
21 |
|
22 |
+Patch: 1109_linux-4.4.110.patch |
23 |
+From: http://www.kernel.org |
24 |
+Desc: Linux 4.4.110 |
25 |
+ |
26 |
Patch: 1500_XATTR_USER_PREFIX.patch |
27 |
From: https://bugs.gentoo.org/show_bug.cgi?id=470644 |
28 |
Desc: Support for namespace user.pax.* on tmpfs. |
29 |
|
30 |
diff --git a/1109_linux-4.4.110.patch b/1109_linux-4.4.110.patch |
31 |
new file mode 100644 |
32 |
index 0000000..1c226ed |
33 |
--- /dev/null |
34 |
+++ b/1109_linux-4.4.110.patch |
35 |
@@ -0,0 +1,2814 @@ |
36 |
+diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt |
37 |
+index b4a83a490212..5977c4d71356 100644 |
38 |
+--- a/Documentation/kernel-parameters.txt |
39 |
++++ b/Documentation/kernel-parameters.txt |
40 |
+@@ -2523,6 +2523,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. |
41 |
+ |
42 |
+ nojitter [IA-64] Disables jitter checking for ITC timers. |
43 |
+ |
44 |
++ nopti [X86-64] Disable KAISER isolation of kernel from user. |
45 |
++ |
46 |
+ no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver |
47 |
+ |
48 |
+ no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page |
49 |
+@@ -3054,6 +3056,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. |
50 |
+ pt. [PARIDE] |
51 |
+ See Documentation/blockdev/paride.txt. |
52 |
+ |
53 |
++ pti= [X86_64] |
54 |
++ Control KAISER user/kernel address space isolation: |
55 |
++ on - enable |
56 |
++ off - disable |
57 |
++ auto - default setting |
58 |
++ |
59 |
+ pty.legacy_count= |
60 |
+ [KNL] Number of legacy pty's. Overwrites compiled-in |
61 |
+ default number. |
62 |
+diff --git a/Makefile b/Makefile |
63 |
+index 5d67056e24dd..b028c106535b 100644 |
64 |
+--- a/Makefile |
65 |
++++ b/Makefile |
66 |
+@@ -1,6 +1,6 @@ |
67 |
+ VERSION = 4 |
68 |
+ PATCHLEVEL = 4 |
69 |
+-SUBLEVEL = 109 |
70 |
++SUBLEVEL = 110 |
71 |
+ EXTRAVERSION = |
72 |
+ NAME = Blurry Fish Butt |
73 |
+ |
74 |
+diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h |
75 |
+index 3783dc3e10b3..4abb284a5b9c 100644 |
76 |
+--- a/arch/x86/boot/compressed/misc.h |
77 |
++++ b/arch/x86/boot/compressed/misc.h |
78 |
+@@ -9,6 +9,7 @@ |
79 |
+ */ |
80 |
+ #undef CONFIG_PARAVIRT |
81 |
+ #undef CONFIG_PARAVIRT_SPINLOCKS |
82 |
++#undef CONFIG_PAGE_TABLE_ISOLATION |
83 |
+ #undef CONFIG_KASAN |
84 |
+ |
85 |
+ #include <linux/linkage.h> |
86 |
+diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S |
87 |
+index cc0f2f5da19b..952b23b5d4e9 100644 |
88 |
+--- a/arch/x86/entry/entry_64.S |
89 |
++++ b/arch/x86/entry/entry_64.S |
90 |
+@@ -35,6 +35,7 @@ |
91 |
+ #include <asm/asm.h> |
92 |
+ #include <asm/smap.h> |
93 |
+ #include <asm/pgtable_types.h> |
94 |
++#include <asm/kaiser.h> |
95 |
+ #include <linux/err.h> |
96 |
+ |
97 |
+ /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ |
98 |
+@@ -135,6 +136,7 @@ ENTRY(entry_SYSCALL_64) |
99 |
+ * it is too small to ever cause noticeable irq latency. |
100 |
+ */ |
101 |
+ SWAPGS_UNSAFE_STACK |
102 |
++ SWITCH_KERNEL_CR3_NO_STACK |
103 |
+ /* |
104 |
+ * A hypervisor implementation might want to use a label |
105 |
+ * after the swapgs, so that it can do the swapgs |
106 |
+@@ -207,9 +209,17 @@ entry_SYSCALL_64_fastpath: |
107 |
+ testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
108 |
+ jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ |
109 |
+ |
110 |
+- RESTORE_C_REGS_EXCEPT_RCX_R11 |
111 |
+ movq RIP(%rsp), %rcx |
112 |
+ movq EFLAGS(%rsp), %r11 |
113 |
++ RESTORE_C_REGS_EXCEPT_RCX_R11 |
114 |
++ /* |
115 |
++ * This opens a window where we have a user CR3, but are |
116 |
++ * running in the kernel. This makes using the CS |
117 |
++ * register useless for telling whether or not we need to |
118 |
++ * switch CR3 in NMIs. Normal interrupts are OK because |
119 |
++ * they are off here. |
120 |
++ */ |
121 |
++ SWITCH_USER_CR3 |
122 |
+ movq RSP(%rsp), %rsp |
123 |
+ /* |
124 |
+ * 64-bit SYSRET restores rip from rcx, |
125 |
+@@ -347,10 +357,26 @@ GLOBAL(int_ret_from_sys_call) |
126 |
+ syscall_return_via_sysret: |
127 |
+ /* rcx and r11 are already restored (see code above) */ |
128 |
+ RESTORE_C_REGS_EXCEPT_RCX_R11 |
129 |
++ /* |
130 |
++ * This opens a window where we have a user CR3, but are |
131 |
++ * running in the kernel. This makes using the CS |
132 |
++ * register useless for telling whether or not we need to |
133 |
++ * switch CR3 in NMIs. Normal interrupts are OK because |
134 |
++ * they are off here. |
135 |
++ */ |
136 |
++ SWITCH_USER_CR3 |
137 |
+ movq RSP(%rsp), %rsp |
138 |
+ USERGS_SYSRET64 |
139 |
+ |
140 |
+ opportunistic_sysret_failed: |
141 |
++ /* |
142 |
++ * This opens a window where we have a user CR3, but are |
143 |
++ * running in the kernel. This makes using the CS |
144 |
++ * register useless for telling whether or not we need to |
145 |
++ * switch CR3 in NMIs. Normal interrupts are OK because |
146 |
++ * they are off here. |
147 |
++ */ |
148 |
++ SWITCH_USER_CR3 |
149 |
+ SWAPGS |
150 |
+ jmp restore_c_regs_and_iret |
151 |
+ END(entry_SYSCALL_64) |
152 |
+@@ -509,6 +535,7 @@ END(irq_entries_start) |
153 |
+ * tracking that we're in kernel mode. |
154 |
+ */ |
155 |
+ SWAPGS |
156 |
++ SWITCH_KERNEL_CR3 |
157 |
+ |
158 |
+ /* |
159 |
+ * We need to tell lockdep that IRQs are off. We can't do this until |
160 |
+@@ -568,6 +595,7 @@ GLOBAL(retint_user) |
161 |
+ mov %rsp,%rdi |
162 |
+ call prepare_exit_to_usermode |
163 |
+ TRACE_IRQS_IRETQ |
164 |
++ SWITCH_USER_CR3 |
165 |
+ SWAPGS |
166 |
+ jmp restore_regs_and_iret |
167 |
+ |
168 |
+@@ -625,6 +653,7 @@ native_irq_return_ldt: |
169 |
+ pushq %rax |
170 |
+ pushq %rdi |
171 |
+ SWAPGS |
172 |
++ SWITCH_KERNEL_CR3 |
173 |
+ movq PER_CPU_VAR(espfix_waddr), %rdi |
174 |
+ movq %rax, (0*8)(%rdi) /* RAX */ |
175 |
+ movq (2*8)(%rsp), %rax /* RIP */ |
176 |
+@@ -640,6 +669,7 @@ native_irq_return_ldt: |
177 |
+ andl $0xffff0000, %eax |
178 |
+ popq %rdi |
179 |
+ orq PER_CPU_VAR(espfix_stack), %rax |
180 |
++ SWITCH_USER_CR3 |
181 |
+ SWAPGS |
182 |
+ movq %rax, %rsp |
183 |
+ popq %rax |
184 |
+@@ -995,7 +1025,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec |
185 |
+ /* |
186 |
+ * Save all registers in pt_regs, and switch gs if needed. |
187 |
+ * Use slow, but surefire "are we in kernel?" check. |
188 |
+- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise |
189 |
++ * |
190 |
++ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit |
191 |
++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit |
192 |
++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit |
193 |
++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit |
194 |
+ */ |
195 |
+ ENTRY(paranoid_entry) |
196 |
+ cld |
197 |
+@@ -1008,7 +1042,26 @@ ENTRY(paranoid_entry) |
198 |
+ js 1f /* negative -> in kernel */ |
199 |
+ SWAPGS |
200 |
+ xorl %ebx, %ebx |
201 |
+-1: ret |
202 |
++1: |
203 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
204 |
++ /* |
205 |
++ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 |
206 |
++ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. |
207 |
++ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done |
208 |
++ * unconditionally, but we need to find out whether the reverse |
209 |
++ * should be done on return (conveyed to paranoid_exit in %ebx). |
210 |
++ */ |
211 |
++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER |
212 |
++ testl $KAISER_SHADOW_PGD_OFFSET, %eax |
213 |
++ jz 2f |
214 |
++ orl $2, %ebx |
215 |
++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax |
216 |
++ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ |
217 |
++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID |
218 |
++ movq %rax, %cr3 |
219 |
++2: |
220 |
++#endif |
221 |
++ ret |
222 |
+ END(paranoid_entry) |
223 |
+ |
224 |
+ /* |
225 |
+@@ -1021,19 +1074,26 @@ END(paranoid_entry) |
226 |
+ * be complicated. Fortunately, we there's no good reason |
227 |
+ * to try to handle preemption here. |
228 |
+ * |
229 |
+- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) |
230 |
++ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3 |
231 |
++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 |
232 |
++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 |
233 |
++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs |
234 |
+ */ |
235 |
+ ENTRY(paranoid_exit) |
236 |
+ DISABLE_INTERRUPTS(CLBR_NONE) |
237 |
+ TRACE_IRQS_OFF_DEBUG |
238 |
+- testl %ebx, %ebx /* swapgs needed? */ |
239 |
++ TRACE_IRQS_IRETQ_DEBUG |
240 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
241 |
++ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */ |
242 |
++ testl $2, %ebx /* SWITCH_USER_CR3 needed? */ |
243 |
++ jz paranoid_exit_no_switch |
244 |
++ SWITCH_USER_CR3 |
245 |
++paranoid_exit_no_switch: |
246 |
++#endif |
247 |
++ testl $1, %ebx /* swapgs needed? */ |
248 |
+ jnz paranoid_exit_no_swapgs |
249 |
+- TRACE_IRQS_IRETQ |
250 |
+ SWAPGS_UNSAFE_STACK |
251 |
+- jmp paranoid_exit_restore |
252 |
+ paranoid_exit_no_swapgs: |
253 |
+- TRACE_IRQS_IRETQ_DEBUG |
254 |
+-paranoid_exit_restore: |
255 |
+ RESTORE_EXTRA_REGS |
256 |
+ RESTORE_C_REGS |
257 |
+ REMOVE_PT_GPREGS_FROM_STACK 8 |
258 |
+@@ -1048,6 +1108,13 @@ ENTRY(error_entry) |
259 |
+ cld |
260 |
+ SAVE_C_REGS 8 |
261 |
+ SAVE_EXTRA_REGS 8 |
262 |
++ /* |
263 |
++ * error_entry() always returns with a kernel gsbase and |
264 |
++ * CR3. We must also have a kernel CR3/gsbase before |
265 |
++ * calling TRACE_IRQS_*. Just unconditionally switch to |
266 |
++ * the kernel CR3 here. |
267 |
++ */ |
268 |
++ SWITCH_KERNEL_CR3 |
269 |
+ xorl %ebx, %ebx |
270 |
+ testb $3, CS+8(%rsp) |
271 |
+ jz .Lerror_kernelspace |
272 |
+@@ -1210,6 +1277,10 @@ ENTRY(nmi) |
273 |
+ */ |
274 |
+ |
275 |
+ SWAPGS_UNSAFE_STACK |
276 |
++ /* |
277 |
++ * percpu variables are mapped with user CR3, so no need |
278 |
++ * to switch CR3 here. |
279 |
++ */ |
280 |
+ cld |
281 |
+ movq %rsp, %rdx |
282 |
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
283 |
+@@ -1243,12 +1314,34 @@ ENTRY(nmi) |
284 |
+ |
285 |
+ movq %rsp, %rdi |
286 |
+ movq $-1, %rsi |
287 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
288 |
++ /* Unconditionally use kernel CR3 for do_nmi() */ |
289 |
++ /* %rax is saved above, so OK to clobber here */ |
290 |
++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER |
291 |
++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ |
292 |
++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID |
293 |
++ pushq %rax |
294 |
++ /* mask off "user" bit of pgd address and 12 PCID bits: */ |
295 |
++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax |
296 |
++ movq %rax, %cr3 |
297 |
++2: |
298 |
++#endif |
299 |
+ call do_nmi |
300 |
+ |
301 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
302 |
++ /* |
303 |
++ * Unconditionally restore CR3. I know we return to |
304 |
++ * kernel code that needs user CR3, but do we ever return |
305 |
++ * to "user mode" where we need the kernel CR3? |
306 |
++ */ |
307 |
++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER |
308 |
++#endif |
309 |
++ |
310 |
+ /* |
311 |
+ * Return back to user mode. We must *not* do the normal exit |
312 |
+- * work, because we don't want to enable interrupts. Fortunately, |
313 |
+- * do_nmi doesn't modify pt_regs. |
314 |
++ * work, because we don't want to enable interrupts. Do not |
315 |
++ * switch to user CR3: we might be going back to kernel code |
316 |
++ * that had a user CR3 set. |
317 |
+ */ |
318 |
+ SWAPGS |
319 |
+ jmp restore_c_regs_and_iret |
320 |
+@@ -1445,22 +1538,55 @@ end_repeat_nmi: |
321 |
+ ALLOC_PT_GPREGS_ON_STACK |
322 |
+ |
323 |
+ /* |
324 |
+- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit |
325 |
+- * as we should not be calling schedule in NMI context. |
326 |
+- * Even with normal interrupts enabled. An NMI should not be |
327 |
+- * setting NEED_RESCHED or anything that normal interrupts and |
328 |
+- * exceptions might do. |
329 |
++ * Use the same approach as paranoid_entry to handle SWAPGS, but |
330 |
++ * without CR3 handling since we do that differently in NMIs. No |
331 |
++ * need to use paranoid_exit as we should not be calling schedule |
332 |
++ * in NMI context. Even with normal interrupts enabled. An NMI |
333 |
++ * should not be setting NEED_RESCHED or anything that normal |
334 |
++ * interrupts and exceptions might do. |
335 |
+ */ |
336 |
+- call paranoid_entry |
337 |
+- |
338 |
+- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ |
339 |
++ cld |
340 |
++ SAVE_C_REGS |
341 |
++ SAVE_EXTRA_REGS |
342 |
++ movl $1, %ebx |
343 |
++ movl $MSR_GS_BASE, %ecx |
344 |
++ rdmsr |
345 |
++ testl %edx, %edx |
346 |
++ js 1f /* negative -> in kernel */ |
347 |
++ SWAPGS |
348 |
++ xorl %ebx, %ebx |
349 |
++1: |
350 |
+ movq %rsp, %rdi |
351 |
+ movq $-1, %rsi |
352 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
353 |
++ /* Unconditionally use kernel CR3 for do_nmi() */ |
354 |
++ /* %rax is saved above, so OK to clobber here */ |
355 |
++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER |
356 |
++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ |
357 |
++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID |
358 |
++ pushq %rax |
359 |
++ /* mask off "user" bit of pgd address and 12 PCID bits: */ |
360 |
++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax |
361 |
++ movq %rax, %cr3 |
362 |
++2: |
363 |
++#endif |
364 |
++ |
365 |
++ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ |
366 |
+ call do_nmi |
367 |
+ |
368 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
369 |
++ /* |
370 |
++ * Unconditionally restore CR3. We might be returning to |
371 |
+ * kernel code that needs user CR3, like just before |
372 |
++ * a sysret. |
373 |
++ */ |
374 |
++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER |
375 |
++#endif |
376 |
++ |
377 |
+ testl %ebx, %ebx /* swapgs needed? */ |
378 |
+ jnz nmi_restore |
379 |
+ nmi_swapgs: |
380 |
++ /* We fixed up CR3 above, so no need to switch it here */ |
381 |
+ SWAPGS_UNSAFE_STACK |
382 |
+ nmi_restore: |
383 |
+ RESTORE_EXTRA_REGS |
384 |
+diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S |
385 |
+index 15cfebaa7688..d03bf0e28b8b 100644 |
386 |
+--- a/arch/x86/entry/entry_64_compat.S |
387 |
++++ b/arch/x86/entry/entry_64_compat.S |
388 |
+@@ -13,6 +13,8 @@ |
389 |
+ #include <asm/irqflags.h> |
390 |
+ #include <asm/asm.h> |
391 |
+ #include <asm/smap.h> |
392 |
++#include <asm/pgtable_types.h> |
393 |
++#include <asm/kaiser.h> |
394 |
+ #include <linux/linkage.h> |
395 |
+ #include <linux/err.h> |
396 |
+ |
397 |
+@@ -50,6 +52,7 @@ ENDPROC(native_usergs_sysret32) |
398 |
+ ENTRY(entry_SYSENTER_compat) |
399 |
+ /* Interrupts are off on entry. */ |
400 |
+ SWAPGS_UNSAFE_STACK |
401 |
++ SWITCH_KERNEL_CR3_NO_STACK |
402 |
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
403 |
+ |
404 |
+ /* |
405 |
+@@ -161,6 +164,7 @@ ENDPROC(entry_SYSENTER_compat) |
406 |
+ ENTRY(entry_SYSCALL_compat) |
407 |
+ /* Interrupts are off on entry. */ |
408 |
+ SWAPGS_UNSAFE_STACK |
409 |
++ SWITCH_KERNEL_CR3_NO_STACK |
410 |
+ |
411 |
+ /* Stash user ESP and switch to the kernel stack. */ |
412 |
+ movl %esp, %r8d |
413 |
+@@ -208,6 +212,7 @@ ENTRY(entry_SYSCALL_compat) |
414 |
+ /* Opportunistic SYSRET */ |
415 |
+ sysret32_from_system_call: |
416 |
+ TRACE_IRQS_ON /* User mode traces as IRQs on. */ |
417 |
++ SWITCH_USER_CR3 |
418 |
+ movq RBX(%rsp), %rbx /* pt_regs->rbx */ |
419 |
+ movq RBP(%rsp), %rbp /* pt_regs->rbp */ |
420 |
+ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ |
421 |
+@@ -269,6 +274,7 @@ ENTRY(entry_INT80_compat) |
422 |
+ PARAVIRT_ADJUST_EXCEPTION_FRAME |
423 |
+ ASM_CLAC /* Do this early to minimize exposure */ |
424 |
+ SWAPGS |
425 |
++ SWITCH_KERNEL_CR3_NO_STACK |
426 |
+ |
427 |
+ /* |
428 |
+ * User tracing code (ptrace or signal handlers) might assume that |
429 |
+@@ -311,6 +317,7 @@ ENTRY(entry_INT80_compat) |
430 |
+ |
431 |
+ /* Go back to user mode. */ |
432 |
+ TRACE_IRQS_ON |
433 |
++ SWITCH_USER_CR3 |
434 |
+ SWAPGS |
435 |
+ jmp restore_regs_and_iret |
436 |
+ END(entry_INT80_compat) |
437 |
+diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c |
438 |
+index ca94fa649251..5dd363d54348 100644 |
439 |
+--- a/arch/x86/entry/vdso/vclock_gettime.c |
440 |
++++ b/arch/x86/entry/vdso/vclock_gettime.c |
441 |
+@@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void) |
442 |
+ } |
443 |
+ #endif |
444 |
+ |
445 |
++#ifdef CONFIG_PARAVIRT_CLOCK |
446 |
++extern u8 pvclock_page |
447 |
++ __attribute__((visibility("hidden"))); |
448 |
++#endif |
449 |
++ |
450 |
+ #ifndef BUILD_VDSO32 |
451 |
+ |
452 |
+ #include <linux/kernel.h> |
453 |
+@@ -62,63 +67,65 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) |
454 |
+ |
455 |
+ #ifdef CONFIG_PARAVIRT_CLOCK |
456 |
+ |
457 |
+-static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) |
458 |
++static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) |
459 |
+ { |
460 |
+- const struct pvclock_vsyscall_time_info *pvti_base; |
461 |
+- int idx = cpu / (PAGE_SIZE/PVTI_SIZE); |
462 |
+- int offset = cpu % (PAGE_SIZE/PVTI_SIZE); |
463 |
+- |
464 |
+- BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); |
465 |
+- |
466 |
+- pvti_base = (struct pvclock_vsyscall_time_info *) |
467 |
+- __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); |
468 |
+- |
469 |
+- return &pvti_base[offset]; |
470 |
++ return (const struct pvclock_vsyscall_time_info *)&pvclock_page; |
471 |
+ } |
472 |
+ |
473 |
+ static notrace cycle_t vread_pvclock(int *mode) |
474 |
+ { |
475 |
+- const struct pvclock_vsyscall_time_info *pvti; |
476 |
++ const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; |
477 |
+ cycle_t ret; |
478 |
+- u64 last; |
479 |
+- u32 version; |
480 |
+- u8 flags; |
481 |
+- unsigned cpu, cpu1; |
482 |
+- |
483 |
++ u64 tsc, pvti_tsc; |
484 |
++ u64 last, delta, pvti_system_time; |
485 |
++ u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift; |
486 |
+ |
487 |
+ /* |
488 |
+- * Note: hypervisor must guarantee that: |
489 |
+- * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. |
490 |
+- * 2. that per-CPU pvclock time info is updated if the |
491 |
+- * underlying CPU changes. |
492 |
+- * 3. that version is increased whenever underlying CPU |
493 |
+- * changes. |
494 |
++ * Note: The kernel and hypervisor must guarantee that cpu ID |
495 |
++ * number maps 1:1 to per-CPU pvclock time info. |
496 |
++ * |
497 |
++ * Because the hypervisor is entirely unaware of guest userspace |
498 |
++ * preemption, it cannot guarantee that per-CPU pvclock time |
499 |
++ * info is updated if the underlying CPU changes or that the |
500 |
++ * version is increased whenever underlying CPU changes. |
501 |
+ * |
502 |
++ * On KVM, we are guaranteed that pvti updates for any vCPU are |
503 |
++ * atomic as seen by *all* vCPUs. This is an even stronger |
504 |
++ * guarantee than we get with a normal seqlock. |
505 |
++ * |
506 |
++ * On Xen, we don't appear to have that guarantee, but Xen still |
507 |
++ * supplies a valid seqlock using the version field. |
508 |
++ * |
509 |
++ * We only do pvclock vdso timing at all if |
510 |
++ * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to |
511 |
++ * mean that all vCPUs have matching pvti and that the TSC is |
512 |
++ * synced, so we can just look at vCPU 0's pvti. |
513 |
+ */ |
514 |
+- do { |
515 |
+- cpu = __getcpu() & VGETCPU_CPU_MASK; |
516 |
+- /* TODO: We can put vcpu id into higher bits of pvti.version. |
517 |
+- * This will save a couple of cycles by getting rid of |
518 |
+- * __getcpu() calls (Gleb). |
519 |
+- */ |
520 |
+- |
521 |
+- pvti = get_pvti(cpu); |
522 |
+- |
523 |
+- version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); |
524 |
+- |
525 |
+- /* |
526 |
+- * Test we're still on the cpu as well as the version. |
527 |
+- * We could have been migrated just after the first |
528 |
+- * vgetcpu but before fetching the version, so we |
529 |
+- * wouldn't notice a version change. |
530 |
+- */ |
531 |
+- cpu1 = __getcpu() & VGETCPU_CPU_MASK; |
532 |
+- } while (unlikely(cpu != cpu1 || |
533 |
+- (pvti->pvti.version & 1) || |
534 |
+- pvti->pvti.version != version)); |
535 |
+- |
536 |
+- if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) |
537 |
++ |
538 |
++ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { |
539 |
+ *mode = VCLOCK_NONE; |
540 |
++ return 0; |
541 |
++ } |
542 |
++ |
543 |
++ do { |
544 |
++ version = pvti->version; |
545 |
++ |
546 |
++ /* This is also a read barrier, so we'll read version first. */ |
547 |
++ tsc = rdtsc_ordered(); |
548 |
++ |
549 |
++ pvti_tsc_to_system_mul = pvti->tsc_to_system_mul; |
550 |
++ pvti_tsc_shift = pvti->tsc_shift; |
551 |
++ pvti_system_time = pvti->system_time; |
552 |
++ pvti_tsc = pvti->tsc_timestamp; |
553 |
++ |
554 |
++ /* Make sure that the version double-check is last. */ |
555 |
++ smp_rmb(); |
556 |
++ } while (unlikely((version & 1) || version != pvti->version)); |
557 |
++ |
558 |
++ delta = tsc - pvti_tsc; |
559 |
++ ret = pvti_system_time + |
560 |
++ pvclock_scale_delta(delta, pvti_tsc_to_system_mul, |
561 |
++ pvti_tsc_shift); |
562 |
+ |
563 |
+ /* refer to tsc.c read_tsc() comment for rationale */ |
564 |
+ last = gtod->cycle_last; |
565 |
+diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S |
566 |
+index de2c921025f5..4158acc17df0 100644 |
567 |
+--- a/arch/x86/entry/vdso/vdso-layout.lds.S |
568 |
++++ b/arch/x86/entry/vdso/vdso-layout.lds.S |
569 |
+@@ -25,7 +25,7 @@ SECTIONS |
570 |
+ * segment. |
571 |
+ */ |
572 |
+ |
573 |
+- vvar_start = . - 2 * PAGE_SIZE; |
574 |
++ vvar_start = . - 3 * PAGE_SIZE; |
575 |
+ vvar_page = vvar_start; |
576 |
+ |
577 |
+ /* Place all vvars at the offsets in asm/vvar.h. */ |
578 |
+@@ -36,6 +36,7 @@ SECTIONS |
579 |
+ #undef EMIT_VVAR |
580 |
+ |
581 |
+ hpet_page = vvar_start + PAGE_SIZE; |
582 |
++ pvclock_page = vvar_start + 2 * PAGE_SIZE; |
583 |
+ |
584 |
+ . = SIZEOF_HEADERS; |
585 |
+ |
586 |
+diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c |
587 |
+index 785d9922b106..491020b2826d 100644 |
588 |
+--- a/arch/x86/entry/vdso/vdso2c.c |
589 |
++++ b/arch/x86/entry/vdso/vdso2c.c |
590 |
+@@ -73,6 +73,7 @@ enum { |
591 |
+ sym_vvar_start, |
592 |
+ sym_vvar_page, |
593 |
+ sym_hpet_page, |
594 |
++ sym_pvclock_page, |
595 |
+ sym_VDSO_FAKE_SECTION_TABLE_START, |
596 |
+ sym_VDSO_FAKE_SECTION_TABLE_END, |
597 |
+ }; |
598 |
+@@ -80,6 +81,7 @@ enum { |
599 |
+ const int special_pages[] = { |
600 |
+ sym_vvar_page, |
601 |
+ sym_hpet_page, |
602 |
++ sym_pvclock_page, |
603 |
+ }; |
604 |
+ |
605 |
+ struct vdso_sym { |
606 |
+@@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = { |
607 |
+ [sym_vvar_start] = {"vvar_start", true}, |
608 |
+ [sym_vvar_page] = {"vvar_page", true}, |
609 |
+ [sym_hpet_page] = {"hpet_page", true}, |
610 |
++ [sym_pvclock_page] = {"pvclock_page", true}, |
611 |
+ [sym_VDSO_FAKE_SECTION_TABLE_START] = { |
612 |
+ "VDSO_FAKE_SECTION_TABLE_START", false |
613 |
+ }, |
614 |
+diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c |
615 |
+index 64df47148160..aa828191c654 100644 |
616 |
+--- a/arch/x86/entry/vdso/vma.c |
617 |
++++ b/arch/x86/entry/vdso/vma.c |
618 |
+@@ -100,6 +100,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) |
619 |
+ .name = "[vvar]", |
620 |
+ .pages = no_pages, |
621 |
+ }; |
622 |
++ struct pvclock_vsyscall_time_info *pvti; |
623 |
+ |
624 |
+ if (calculate_addr) { |
625 |
+ addr = vdso_addr(current->mm->start_stack, |
626 |
+@@ -169,6 +170,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) |
627 |
+ } |
628 |
+ #endif |
629 |
+ |
630 |
++ pvti = pvclock_pvti_cpu0_va(); |
631 |
++ if (pvti && image->sym_pvclock_page) { |
632 |
++ ret = remap_pfn_range(vma, |
633 |
++ text_start + image->sym_pvclock_page, |
634 |
++ __pa(pvti) >> PAGE_SHIFT, |
635 |
++ PAGE_SIZE, |
636 |
++ PAGE_READONLY); |
637 |
++ |
638 |
++ if (ret) |
639 |
++ goto up_fail; |
640 |
++ } |
641 |
++ |
642 |
+ up_fail: |
643 |
+ if (ret) |
644 |
+ current->mm->context.vdso = NULL; |
645 |
+diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h |
646 |
+index e01f7f7ccb0c..84ae170bc3d0 100644 |
647 |
+--- a/arch/x86/include/asm/cmdline.h |
648 |
++++ b/arch/x86/include/asm/cmdline.h |
649 |
+@@ -2,5 +2,7 @@ |
650 |
+ #define _ASM_X86_CMDLINE_H |
651 |
+ |
652 |
+ int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); |
653 |
++int cmdline_find_option(const char *cmdline_ptr, const char *option, |
654 |
++ char *buffer, int bufsize); |
655 |
+ |
656 |
+ #endif /* _ASM_X86_CMDLINE_H */ |
657 |
+diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h |
658 |
+index f7ba9fbf12ee..f6605712ca90 100644 |
659 |
+--- a/arch/x86/include/asm/cpufeature.h |
660 |
++++ b/arch/x86/include/asm/cpufeature.h |
661 |
+@@ -187,6 +187,7 @@ |
662 |
+ #define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */ |
663 |
+ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ |
664 |
+ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ |
665 |
++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */ |
666 |
+ #define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */ |
667 |
+ #define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */ |
668 |
+ #define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */ |
669 |
+@@ -199,6 +200,9 @@ |
670 |
+ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ |
671 |
+ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ |
672 |
+ |
673 |
++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ |
674 |
++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ |
675 |
++ |
676 |
+ /* Virtualization flags: Linux defined, word 8 */ |
677 |
+ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ |
678 |
+ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ |
679 |
+diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h |
680 |
+index 4e10d73cf018..880db91d9457 100644 |
681 |
+--- a/arch/x86/include/asm/desc.h |
682 |
++++ b/arch/x86/include/asm/desc.h |
683 |
+@@ -43,7 +43,7 @@ struct gdt_page { |
684 |
+ struct desc_struct gdt[GDT_ENTRIES]; |
685 |
+ } __attribute__((aligned(PAGE_SIZE))); |
686 |
+ |
687 |
+-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); |
688 |
++DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page); |
689 |
+ |
690 |
+ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) |
691 |
+ { |
692 |
+diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h |
693 |
+index 59caa55fb9b5..ee52ff858699 100644 |
694 |
+--- a/arch/x86/include/asm/hw_irq.h |
695 |
++++ b/arch/x86/include/asm/hw_irq.h |
696 |
+@@ -187,7 +187,7 @@ extern char irq_entries_start[]; |
697 |
+ #define VECTOR_RETRIGGERED ((void *)~0UL) |
698 |
+ |
699 |
+ typedef struct irq_desc* vector_irq_t[NR_VECTORS]; |
700 |
+-DECLARE_PER_CPU(vector_irq_t, vector_irq); |
701 |
++DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq); |
702 |
+ |
703 |
+ #endif /* !ASSEMBLY_ */ |
704 |
+ |
705 |
+diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h |
706 |
+new file mode 100644 |
707 |
+index 000000000000..802bbbdfe143 |
708 |
+--- /dev/null |
709 |
++++ b/arch/x86/include/asm/kaiser.h |
710 |
+@@ -0,0 +1,141 @@ |
711 |
++#ifndef _ASM_X86_KAISER_H |
712 |
++#define _ASM_X86_KAISER_H |
713 |
++ |
714 |
++#include <uapi/asm/processor-flags.h> /* For PCID constants */ |
715 |
++ |
716 |
++/* |
717 |
++ * This file includes the definitions for the KAISER feature. |
718 |
++ * KAISER is a countermeasure against x86_64 side channel attacks on |
719 |
++ * the kernel virtual memory. It has a shadow pgd for every process: the |
720 |
++ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole |
721 |
++ * user memory. Within a kernel context switch, or when an interrupt is handled, |
722 |
++ * the pgd is switched to the normal one. When the system switches to user mode, |
723 |
++ * the shadow pgd is enabled. By this, the virtual memory caches are freed, |
724 |
++ * and the user may not attack the whole kernel memory. |
725 |
++ * |
726 |
++ * A minimalistic kernel mapping holds the parts needed to be mapped in user |
727 |
++ * mode, such as the entry/exit functions of the user space, or the stacks. |
728 |
++ */ |
729 |
++ |
730 |
++#define KAISER_SHADOW_PGD_OFFSET 0x1000 |
731 |
++ |
732 |
++#ifdef __ASSEMBLY__ |
733 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
734 |
++ |
735 |
++.macro _SWITCH_TO_KERNEL_CR3 reg |
736 |
++movq %cr3, \reg |
737 |
++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg |
738 |
++/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ |
739 |
++ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID |
740 |
++movq \reg, %cr3 |
741 |
++.endm |
742 |
++ |
743 |
++.macro _SWITCH_TO_USER_CR3 reg regb |
744 |
++/* |
745 |
++ * regb must be the low byte portion of reg: because we have arranged |
746 |
++ * for the low byte of the user PCID to serve as the high byte of NOFLUSH |
747 |
++ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are |
748 |
++ * not enabled): so that the one register can update both memory and cr3. |
749 |
++ */ |
750 |
++movq %cr3, \reg |
751 |
++orq PER_CPU_VAR(x86_cr3_pcid_user), \reg |
752 |
++js 9f |
753 |
++/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */ |
754 |
++movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) |
755 |
++9: |
756 |
++movq \reg, %cr3 |
757 |
++.endm |
758 |
++ |
759 |
++.macro SWITCH_KERNEL_CR3 |
760 |
++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER |
761 |
++_SWITCH_TO_KERNEL_CR3 %rax |
762 |
++popq %rax |
763 |
++8: |
764 |
++.endm |
765 |
++ |
766 |
++.macro SWITCH_USER_CR3 |
767 |
++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER |
768 |
++_SWITCH_TO_USER_CR3 %rax %al |
769 |
++popq %rax |
770 |
++8: |
771 |
++.endm |
772 |
++ |
773 |
++.macro SWITCH_KERNEL_CR3_NO_STACK |
774 |
++ALTERNATIVE "jmp 8f", \ |
775 |
++ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \ |
776 |
++ X86_FEATURE_KAISER |
777 |
++_SWITCH_TO_KERNEL_CR3 %rax |
778 |
++movq PER_CPU_VAR(unsafe_stack_register_backup), %rax |
779 |
++8: |
780 |
++.endm |
781 |
++ |
782 |
++#else /* CONFIG_PAGE_TABLE_ISOLATION */ |
783 |
++ |
784 |
++.macro SWITCH_KERNEL_CR3 |
785 |
++.endm |
786 |
++.macro SWITCH_USER_CR3 |
787 |
++.endm |
788 |
++.macro SWITCH_KERNEL_CR3_NO_STACK |
789 |
++.endm |
790 |
++ |
791 |
++#endif /* CONFIG_PAGE_TABLE_ISOLATION */ |
792 |
++ |
793 |
++#else /* __ASSEMBLY__ */ |
794 |
++ |
795 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
796 |
++/* |
797 |
++ * Upon kernel/user mode switch, it may happen that the address |
798 |
++ * space has to be switched before the registers have been |
799 |
++ * stored. To change the address space, another register is |
800 |
++ * needed. A register therefore has to be stored/restored. |
801 |
++*/ |
802 |
++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
803 |
++ |
804 |
++DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); |
805 |
++ |
806 |
++extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; |
807 |
++ |
808 |
++extern int kaiser_enabled; |
809 |
++extern void __init kaiser_check_boottime_disable(void); |
810 |
++#else |
811 |
++#define kaiser_enabled 0 |
812 |
++static inline void __init kaiser_check_boottime_disable(void) {} |
813 |
++#endif /* CONFIG_PAGE_TABLE_ISOLATION */ |
814 |
++ |
815 |
++/* |
816 |
++ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set, |
817 |
++ * so as to build with tests on kaiser_enabled instead of #ifdefs. |
818 |
++ */ |
819 |
++ |
820 |
++/** |
821 |
++ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping |
822 |
++ * @addr: the start address of the range |
823 |
++ * @size: the size of the range |
824 |
++ * @flags: The mapping flags of the pages |
825 |
++ * |
826 |
++ * The mapping is done on a global scope, so no bigger |
827 |
++ * synchronization has to be done. The pages have to be |
828 |
++ * manually unmapped again when they are not needed any longer. |
829 |
++ */ |
830 |
++extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); |
831 |
++ |
832 |
++/** |
833 |
++ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping |
834 |
++ * @start: the start address of the range |
835 |
++ * @size: the size of the range |
836 |
++ */ |
837 |
++extern void kaiser_remove_mapping(unsigned long start, unsigned long size); |
838 |
++ |
839 |
++/** |
840 |
++ * kaiser_init - Initialize the shadow mapping |
841 |
++ * |
842 |
++ * Most parts of the shadow mapping can be mapped upon boot |
843 |
++ * time. Only per-process things like the thread stacks |
844 |
++ * or a new LDT have to be mapped at runtime. These boot- |
845 |
++ * time mappings are permanent and never unmapped. |
846 |
++ */ |
847 |
++extern void kaiser_init(void); |
848 |
++ |
849 |
++#endif /* __ASSEMBLY__ */ |
850 |
++ |
851 |
++#endif /* _ASM_X86_KAISER_H */ |
852 |
+diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h |
853 |
+index 6ec0c8b2e9df..84c62d950023 100644 |
854 |
+--- a/arch/x86/include/asm/pgtable.h |
855 |
++++ b/arch/x86/include/asm/pgtable.h |
856 |
+@@ -18,6 +18,12 @@ |
857 |
+ #ifndef __ASSEMBLY__ |
858 |
+ #include <asm/x86_init.h> |
859 |
+ |
860 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
861 |
++extern int kaiser_enabled; |
862 |
++#else |
863 |
++#define kaiser_enabled 0 |
864 |
++#endif |
865 |
++ |
866 |
+ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); |
867 |
+ void ptdump_walk_pgd_level_checkwx(void); |
868 |
+ |
869 |
+@@ -653,7 +659,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) |
870 |
+ |
871 |
+ static inline int pgd_bad(pgd_t pgd) |
872 |
+ { |
873 |
+- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; |
874 |
++ pgdval_t ignore_flags = _PAGE_USER; |
875 |
++ /* |
876 |
++ * We set NX on KAISER pgds that map userspace memory so |
877 |
++ * that userspace can not meaningfully use the kernel |
878 |
++ * page table by accident; it will fault on the first |
879 |
++ * instruction it tries to run. See native_set_pgd(). |
880 |
++ */ |
881 |
++ if (kaiser_enabled) |
882 |
++ ignore_flags |= _PAGE_NX; |
883 |
++ |
884 |
++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; |
885 |
+ } |
886 |
+ |
887 |
+ static inline int pgd_none(pgd_t pgd) |
888 |
+@@ -855,7 +871,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, |
889 |
+ */ |
890 |
+ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) |
891 |
+ { |
892 |
+- memcpy(dst, src, count * sizeof(pgd_t)); |
893 |
++ memcpy(dst, src, count * sizeof(pgd_t)); |
894 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
895 |
++ if (kaiser_enabled) { |
896 |
++ /* Clone the shadow pgd part as well */ |
897 |
++ memcpy(native_get_shadow_pgd(dst), |
898 |
++ native_get_shadow_pgd(src), |
899 |
++ count * sizeof(pgd_t)); |
900 |
++ } |
901 |
++#endif |
902 |
+ } |
903 |
+ |
904 |
+ #define PTE_SHIFT ilog2(PTRS_PER_PTE) |
905 |
+diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h |
906 |
+index 2ee781114d34..c810226e741a 100644 |
907 |
+--- a/arch/x86/include/asm/pgtable_64.h |
908 |
++++ b/arch/x86/include/asm/pgtable_64.h |
909 |
+@@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud) |
910 |
+ native_set_pud(pud, native_make_pud(0)); |
911 |
+ } |
912 |
+ |
913 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
914 |
++extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); |
915 |
++ |
916 |
++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) |
917 |
++{ |
918 |
++#ifdef CONFIG_DEBUG_VM |
919 |
++ /* linux/mmdebug.h may not have been included at this point */ |
920 |
++ BUG_ON(!kaiser_enabled); |
921 |
++#endif |
922 |
++ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); |
923 |
++} |
924 |
++#else |
925 |
++static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) |
926 |
++{ |
927 |
++ return pgd; |
928 |
++} |
929 |
++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) |
930 |
++{ |
931 |
++ BUILD_BUG_ON(1); |
932 |
++ return NULL; |
933 |
++} |
934 |
++#endif /* CONFIG_PAGE_TABLE_ISOLATION */ |
935 |
++ |
936 |
+ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) |
937 |
+ { |
938 |
+- *pgdp = pgd; |
939 |
++ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); |
940 |
+ } |
941 |
+ |
942 |
+ static inline void native_pgd_clear(pgd_t *pgd) |
943 |
+diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h |
944 |
+index 79c91853e50e..8dba273da25a 100644 |
945 |
+--- a/arch/x86/include/asm/pgtable_types.h |
946 |
++++ b/arch/x86/include/asm/pgtable_types.h |
947 |
+@@ -89,7 +89,7 @@ |
948 |
+ #define _PAGE_NX (_AT(pteval_t, 0)) |
949 |
+ #endif |
950 |
+ |
951 |
+-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
952 |
++#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
953 |
+ |
954 |
+ #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ |
955 |
+ _PAGE_ACCESSED | _PAGE_DIRTY) |
956 |
+@@ -102,6 +102,33 @@ |
957 |
+ _PAGE_SOFT_DIRTY) |
958 |
+ #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) |
959 |
+ |
960 |
++/* The ASID is the lower 12 bits of CR3 */ |
961 |
++#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL)) |
962 |
++ |
963 |
++/* Mask for all the PCID-related bits in CR3: */ |
964 |
++#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) |
965 |
++#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) |
966 |
++ |
967 |
++#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64) |
968 |
++/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ |
969 |
++#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) |
970 |
++ |
971 |
++#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) |
972 |
++#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) |
973 |
++#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) |
974 |
++#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) |
975 |
++#else |
976 |
++#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) |
977 |
++/* |
978 |
++ * PCIDs are unsupported on 32-bit and none of these bits can be |
979 |
++ * set in CR3: |
980 |
++ */ |
981 |
++#define X86_CR3_PCID_KERN_FLUSH (0) |
982 |
++#define X86_CR3_PCID_USER_FLUSH (0) |
983 |
++#define X86_CR3_PCID_KERN_NOFLUSH (0) |
984 |
++#define X86_CR3_PCID_USER_NOFLUSH (0) |
985 |
++#endif |
986 |
++ |
987 |
+ /* |
988 |
+ * The cache modes defined here are used to translate between pure SW usage |
989 |
+ * and the HW defined cache mode bits and/or PAT entries. |
990 |
+diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h |
991 |
+index 2d5a50cb61a2..f3bdaed0188f 100644 |
992 |
+--- a/arch/x86/include/asm/processor.h |
993 |
++++ b/arch/x86/include/asm/processor.h |
994 |
+@@ -305,7 +305,7 @@ struct tss_struct { |
995 |
+ |
996 |
+ } ____cacheline_aligned; |
997 |
+ |
998 |
+-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); |
999 |
++DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss); |
1000 |
+ |
1001 |
+ #ifdef CONFIG_X86_32 |
1002 |
+ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); |
1003 |
+diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h |
1004 |
+index baad72e4c100..6045cef376c2 100644 |
1005 |
+--- a/arch/x86/include/asm/pvclock.h |
1006 |
++++ b/arch/x86/include/asm/pvclock.h |
1007 |
+@@ -4,6 +4,15 @@ |
1008 |
+ #include <linux/clocksource.h> |
1009 |
+ #include <asm/pvclock-abi.h> |
1010 |
+ |
1011 |
++#ifdef CONFIG_PARAVIRT_CLOCK |
1012 |
++extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); |
1013 |
++#else |
1014 |
++static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) |
1015 |
++{ |
1016 |
++ return NULL; |
1017 |
++} |
1018 |
++#endif |
1019 |
++ |
1020 |
+ /* some helper functions for xen and kvm pv clock sources */ |
1021 |
+ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); |
1022 |
+ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); |
1023 |
+diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h |
1024 |
+index 9fc5968da820..a691b66cc40a 100644 |
1025 |
+--- a/arch/x86/include/asm/tlbflush.h |
1026 |
++++ b/arch/x86/include/asm/tlbflush.h |
1027 |
+@@ -131,6 +131,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) |
1028 |
+ cr4_set_bits(mask); |
1029 |
+ } |
1030 |
+ |
1031 |
++/* |
1032 |
++ * Declare a couple of kaiser interfaces here for convenience, |
1033 |
++ * to avoid the need for asm/kaiser.h in unexpected places. |
1034 |
++ */ |
1035 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
1036 |
++extern int kaiser_enabled; |
1037 |
++extern void kaiser_setup_pcid(void); |
1038 |
++extern void kaiser_flush_tlb_on_return_to_user(void); |
1039 |
++#else |
1040 |
++#define kaiser_enabled 0 |
1041 |
++static inline void kaiser_setup_pcid(void) |
1042 |
++{ |
1043 |
++} |
1044 |
++static inline void kaiser_flush_tlb_on_return_to_user(void) |
1045 |
++{ |
1046 |
++} |
1047 |
++#endif |
1048 |
++ |
1049 |
+ static inline void __native_flush_tlb(void) |
1050 |
+ { |
1051 |
+ /* |
1052 |
+@@ -139,6 +157,8 @@ static inline void __native_flush_tlb(void) |
1053 |
+ * back: |
1054 |
+ */ |
1055 |
+ preempt_disable(); |
1056 |
++ if (kaiser_enabled) |
1057 |
++ kaiser_flush_tlb_on_return_to_user(); |
1058 |
+ native_write_cr3(native_read_cr3()); |
1059 |
+ preempt_enable(); |
1060 |
+ } |
1061 |
+@@ -148,20 +168,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void) |
1062 |
+ unsigned long cr4; |
1063 |
+ |
1064 |
+ cr4 = this_cpu_read(cpu_tlbstate.cr4); |
1065 |
+- /* clear PGE */ |
1066 |
+- native_write_cr4(cr4 & ~X86_CR4_PGE); |
1067 |
+- /* write old PGE again and flush TLBs */ |
1068 |
+- native_write_cr4(cr4); |
1069 |
++ if (cr4 & X86_CR4_PGE) { |
1070 |
++ /* clear PGE and flush TLB of all entries */ |
1071 |
++ native_write_cr4(cr4 & ~X86_CR4_PGE); |
1072 |
++ /* restore PGE as it was before */ |
1073 |
++ native_write_cr4(cr4); |
1074 |
++ } else { |
1075 |
++ /* do it with cr3, letting kaiser flush user PCID */ |
1076 |
++ __native_flush_tlb(); |
1077 |
++ } |
1078 |
+ } |
1079 |
+ |
1080 |
+ static inline void __native_flush_tlb_global(void) |
1081 |
+ { |
1082 |
+ unsigned long flags; |
1083 |
+ |
1084 |
+- if (static_cpu_has(X86_FEATURE_INVPCID)) { |
1085 |
++ if (this_cpu_has(X86_FEATURE_INVPCID)) { |
1086 |
+ /* |
1087 |
+ * Using INVPCID is considerably faster than a pair of writes |
1088 |
+ * to CR4 sandwiched inside an IRQ flag save/restore. |
1089 |
++ * |
1090 |
++ * Note, this works with CR4.PCIDE=0 or 1. |
1091 |
+ */ |
1092 |
+ invpcid_flush_all(); |
1093 |
+ return; |
1094 |
+@@ -173,24 +200,45 @@ static inline void __native_flush_tlb_global(void) |
1095 |
+ * be called from deep inside debugging code.) |
1096 |
+ */ |
1097 |
+ raw_local_irq_save(flags); |
1098 |
+- |
1099 |
+ __native_flush_tlb_global_irq_disabled(); |
1100 |
+- |
1101 |
+ raw_local_irq_restore(flags); |
1102 |
+ } |
1103 |
+ |
1104 |
+ static inline void __native_flush_tlb_single(unsigned long addr) |
1105 |
+ { |
1106 |
+- asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); |
1107 |
++ /* |
1108 |
++ * SIMICS #GP's if you run INVPCID with type 2/3 |
1109 |
++ * and X86_CR4_PCIDE clear. Shame! |
1110 |
++ * |
1111 |
++ * The ASIDs used below are hard-coded. But, we must not |
1112 |
++ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call |
1113 |
++ * invlpg in the case we are called early. |
1114 |
++ */ |
1115 |
++ |
1116 |
++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { |
1117 |
++ if (kaiser_enabled) |
1118 |
++ kaiser_flush_tlb_on_return_to_user(); |
1119 |
++ asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); |
1120 |
++ return; |
1121 |
++ } |
1122 |
++ /* Flush the address out of both PCIDs. */ |
1123 |
++ /* |
1124 |
++ * An optimization here might be to determine addresses |
1125 |
++ * that are only kernel-mapped and only flush the kernel |
1126 |
++ * ASID. But, userspace flushes are probably much more |
1127 |
++ * important performance-wise. |
1128 |
++ * |
1129 |
++ * Make sure to do only a single invpcid when KAISER is |
1130 |
++ * disabled and we have only a single ASID. |
1131 |
++ */ |
1132 |
++ if (kaiser_enabled) |
1133 |
++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); |
1134 |
++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); |
1135 |
+ } |
1136 |
+ |
1137 |
+ static inline void __flush_tlb_all(void) |
1138 |
+ { |
1139 |
+- if (cpu_has_pge) |
1140 |
+- __flush_tlb_global(); |
1141 |
+- else |
1142 |
+- __flush_tlb(); |
1143 |
+- |
1144 |
++ __flush_tlb_global(); |
1145 |
+ /* |
1146 |
+ * Note: if we somehow had PCID but not PGE, then this wouldn't work -- |
1147 |
+ * we'd end up flushing kernel translations for the current ASID but |
1148 |
+diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h |
1149 |
+index 756de9190aec..deabaf9759b6 100644 |
1150 |
+--- a/arch/x86/include/asm/vdso.h |
1151 |
++++ b/arch/x86/include/asm/vdso.h |
1152 |
+@@ -22,6 +22,7 @@ struct vdso_image { |
1153 |
+ |
1154 |
+ long sym_vvar_page; |
1155 |
+ long sym_hpet_page; |
1156 |
++ long sym_pvclock_page; |
1157 |
+ long sym_VDSO32_NOTE_MASK; |
1158 |
+ long sym___kernel_sigreturn; |
1159 |
+ long sym___kernel_rt_sigreturn; |
1160 |
+diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h |
1161 |
+index 79887abcb5e1..1361779f44fe 100644 |
1162 |
+--- a/arch/x86/include/uapi/asm/processor-flags.h |
1163 |
++++ b/arch/x86/include/uapi/asm/processor-flags.h |
1164 |
+@@ -77,7 +77,8 @@ |
1165 |
+ #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) |
1166 |
+ #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ |
1167 |
+ #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) |
1168 |
+-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ |
1169 |
++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ |
1170 |
++#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) |
1171 |
+ |
1172 |
+ /* |
1173 |
+ * Intel CPU features in CR4 |
1174 |
+diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c |
1175 |
+index aa1e7246b06b..cc154ac64f00 100644 |
1176 |
+--- a/arch/x86/kernel/cpu/common.c |
1177 |
++++ b/arch/x86/kernel/cpu/common.c |
1178 |
+@@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = { |
1179 |
+ |
1180 |
+ static const struct cpu_dev *this_cpu = &default_cpu; |
1181 |
+ |
1182 |
+-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { |
1183 |
++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { |
1184 |
+ #ifdef CONFIG_X86_64 |
1185 |
+ /* |
1186 |
+ * We need valid kernel segments for data and code in long mode too |
1187 |
+@@ -324,8 +324,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) |
1188 |
+ static void setup_pcid(struct cpuinfo_x86 *c) |
1189 |
+ { |
1190 |
+ if (cpu_has(c, X86_FEATURE_PCID)) { |
1191 |
+- if (cpu_has(c, X86_FEATURE_PGE)) { |
1192 |
++ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) { |
1193 |
+ cr4_set_bits(X86_CR4_PCIDE); |
1194 |
++ /* |
1195 |
++ * INVPCID has two "groups" of types: |
1196 |
++ * 1/2: Invalidate an individual address |
1197 |
++ * 3/4: Invalidate all contexts |
1198 |
++ * |
1199 |
++ * 1/2 take a PCID, but 3/4 do not. So, 3/4 |
1200 |
++ * ignore the PCID argument in the descriptor. |
1201 |
++ * But, we have to be careful not to call 1/2 |
1202 |
++ * with an actual non-zero PCID in them before |
1203 |
++ * we do the above cr4_set_bits(). |
1204 |
++ */ |
1205 |
++ if (cpu_has(c, X86_FEATURE_INVPCID)) |
1206 |
++ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE); |
1207 |
+ } else { |
1208 |
+ /* |
1209 |
+ * flush_tlb_all(), as currently implemented, won't |
1210 |
+@@ -338,6 +351,7 @@ static void setup_pcid(struct cpuinfo_x86 *c) |
1211 |
+ clear_cpu_cap(c, X86_FEATURE_PCID); |
1212 |
+ } |
1213 |
+ } |
1214 |
++ kaiser_setup_pcid(); |
1215 |
+ } |
1216 |
+ |
1217 |
+ /* |
1218 |
+@@ -1229,7 +1243,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { |
1219 |
+ [DEBUG_STACK - 1] = DEBUG_STKSZ |
1220 |
+ }; |
1221 |
+ |
1222 |
+-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks |
1223 |
++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks |
1224 |
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); |
1225 |
+ |
1226 |
+ /* May not be marked __init: used by software suspend */ |
1227 |
+@@ -1392,6 +1406,14 @@ void cpu_init(void) |
1228 |
+ * try to read it. |
1229 |
+ */ |
1230 |
+ cr4_init_shadow(); |
1231 |
++ if (!kaiser_enabled) { |
1232 |
++ /* |
1233 |
++ * secondary_startup_64() deferred setting PGE in cr4: |
1234 |
++ * probe_page_size_mask() sets it on the boot cpu, |
1235 |
++ * but it needs to be set on each secondary cpu. |
1236 |
++ */ |
1237 |
++ cr4_set_bits(X86_CR4_PGE); |
1238 |
++ } |
1239 |
+ |
1240 |
+ /* |
1241 |
+ * Load microcode on this cpu if a valid microcode is available. |
1242 |
+diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c |
1243 |
+index 1e7de3cefc9c..f01b3a12dce0 100644 |
1244 |
+--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c |
1245 |
++++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c |
1246 |
+@@ -2,11 +2,15 @@ |
1247 |
+ #include <linux/types.h> |
1248 |
+ #include <linux/slab.h> |
1249 |
+ |
1250 |
++#include <asm/kaiser.h> |
1251 |
+ #include <asm/perf_event.h> |
1252 |
+ #include <asm/insn.h> |
1253 |
+ |
1254 |
+ #include "perf_event.h" |
1255 |
+ |
1256 |
++static |
1257 |
++DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store); |
1258 |
++ |
1259 |
+ /* The size of a BTS record in bytes: */ |
1260 |
+ #define BTS_RECORD_SIZE 24 |
1261 |
+ |
1262 |
+@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu) |
1263 |
+ |
1264 |
+ static DEFINE_PER_CPU(void *, insn_buffer); |
1265 |
+ |
1266 |
++static void *dsalloc(size_t size, gfp_t flags, int node) |
1267 |
++{ |
1268 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
1269 |
++ unsigned int order = get_order(size); |
1270 |
++ struct page *page; |
1271 |
++ unsigned long addr; |
1272 |
++ |
1273 |
++ page = __alloc_pages_node(node, flags | __GFP_ZERO, order); |
1274 |
++ if (!page) |
1275 |
++ return NULL; |
1276 |
++ addr = (unsigned long)page_address(page); |
1277 |
++ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) { |
1278 |
++ __free_pages(page, order); |
1279 |
++ addr = 0; |
1280 |
++ } |
1281 |
++ return (void *)addr; |
1282 |
++#else |
1283 |
++ return kmalloc_node(size, flags | __GFP_ZERO, node); |
1284 |
++#endif |
1285 |
++} |
1286 |
++ |
1287 |
++static void dsfree(const void *buffer, size_t size) |
1288 |
++{ |
1289 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
1290 |
++ if (!buffer) |
1291 |
++ return; |
1292 |
++ kaiser_remove_mapping((unsigned long)buffer, size); |
1293 |
++ free_pages((unsigned long)buffer, get_order(size)); |
1294 |
++#else |
1295 |
++ kfree(buffer); |
1296 |
++#endif |
1297 |
++} |
1298 |
++ |
1299 |
+ static int alloc_pebs_buffer(int cpu) |
1300 |
+ { |
1301 |
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; |
1302 |
+@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu) |
1303 |
+ if (!x86_pmu.pebs) |
1304 |
+ return 0; |
1305 |
+ |
1306 |
+- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); |
1307 |
++ buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); |
1308 |
+ if (unlikely(!buffer)) |
1309 |
+ return -ENOMEM; |
1310 |
+ |
1311 |
+@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu) |
1312 |
+ if (x86_pmu.intel_cap.pebs_format < 2) { |
1313 |
+ ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); |
1314 |
+ if (!ibuffer) { |
1315 |
+- kfree(buffer); |
1316 |
++ dsfree(buffer, x86_pmu.pebs_buffer_size); |
1317 |
+ return -ENOMEM; |
1318 |
+ } |
1319 |
+ per_cpu(insn_buffer, cpu) = ibuffer; |
1320 |
+@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu) |
1321 |
+ kfree(per_cpu(insn_buffer, cpu)); |
1322 |
+ per_cpu(insn_buffer, cpu) = NULL; |
1323 |
+ |
1324 |
+- kfree((void *)(unsigned long)ds->pebs_buffer_base); |
1325 |
++ dsfree((void *)(unsigned long)ds->pebs_buffer_base, |
1326 |
++ x86_pmu.pebs_buffer_size); |
1327 |
+ ds->pebs_buffer_base = 0; |
1328 |
+ } |
1329 |
+ |
1330 |
+@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu) |
1331 |
+ if (!x86_pmu.bts) |
1332 |
+ return 0; |
1333 |
+ |
1334 |
+- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); |
1335 |
++ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); |
1336 |
+ if (unlikely(!buffer)) { |
1337 |
+ WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); |
1338 |
+ return -ENOMEM; |
1339 |
+@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu) |
1340 |
+ if (!ds || !x86_pmu.bts) |
1341 |
+ return; |
1342 |
+ |
1343 |
+- kfree((void *)(unsigned long)ds->bts_buffer_base); |
1344 |
++ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE); |
1345 |
+ ds->bts_buffer_base = 0; |
1346 |
+ } |
1347 |
+ |
1348 |
+ static int alloc_ds_buffer(int cpu) |
1349 |
+ { |
1350 |
+- int node = cpu_to_node(cpu); |
1351 |
+- struct debug_store *ds; |
1352 |
+- |
1353 |
+- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); |
1354 |
+- if (unlikely(!ds)) |
1355 |
+- return -ENOMEM; |
1356 |
++ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu); |
1357 |
+ |
1358 |
++ memset(ds, 0, sizeof(*ds)); |
1359 |
+ per_cpu(cpu_hw_events, cpu).ds = ds; |
1360 |
+ |
1361 |
+ return 0; |
1362 |
+@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu) |
1363 |
+ return; |
1364 |
+ |
1365 |
+ per_cpu(cpu_hw_events, cpu).ds = NULL; |
1366 |
+- kfree(ds); |
1367 |
+ } |
1368 |
+ |
1369 |
+ void release_ds_buffers(void) |
1370 |
+diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c |
1371 |
+index 4d38416e2a7f..b02cb2ec6726 100644 |
1372 |
+--- a/arch/x86/kernel/espfix_64.c |
1373 |
++++ b/arch/x86/kernel/espfix_64.c |
1374 |
+@@ -41,6 +41,7 @@ |
1375 |
+ #include <asm/pgalloc.h> |
1376 |
+ #include <asm/setup.h> |
1377 |
+ #include <asm/espfix.h> |
1378 |
++#include <asm/kaiser.h> |
1379 |
+ |
1380 |
+ /* |
1381 |
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round |
1382 |
+@@ -126,6 +127,15 @@ void __init init_espfix_bsp(void) |
1383 |
+ /* Install the espfix pud into the kernel page directory */ |
1384 |
+ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; |
1385 |
+ pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); |
1386 |
++ /* |
1387 |
++ * Just copy the top-level PGD that is mapping the espfix |
1388 |
++ * area to ensure it is mapped into the shadow user page |
1389 |
++ * tables. |
1390 |
++ */ |
1391 |
++ if (kaiser_enabled) { |
1392 |
++ set_pgd(native_get_shadow_pgd(pgd_p), |
1393 |
++ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); |
1394 |
++ } |
1395 |
+ |
1396 |
+ /* Randomize the locations */ |
1397 |
+ init_espfix_random(); |
1398 |
+diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S |
1399 |
+index ffdc0e860390..4034e905741a 100644 |
1400 |
+--- a/arch/x86/kernel/head_64.S |
1401 |
++++ b/arch/x86/kernel/head_64.S |
1402 |
+@@ -183,8 +183,8 @@ ENTRY(secondary_startup_64) |
1403 |
+ movq $(init_level4_pgt - __START_KERNEL_map), %rax |
1404 |
+ 1: |
1405 |
+ |
1406 |
+- /* Enable PAE mode and PGE */ |
1407 |
+- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx |
1408 |
++ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */ |
1409 |
++ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx |
1410 |
+ movq %rcx, %cr4 |
1411 |
+ |
1412 |
+ /* Setup early boot stage 4 level pagetables. */ |
1413 |
+@@ -441,6 +441,27 @@ early_idt_ripmsg: |
1414 |
+ .balign PAGE_SIZE; \ |
1415 |
+ GLOBAL(name) |
1416 |
+ |
1417 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
1418 |
++/* |
1419 |
++ * Each PGD needs to be 8k long and 8k aligned. We do not |
1420 |
++ * ever go out to userspace with these, so we do not |
1421 |
++ * strictly *need* the second page, but this allows us to |
1422 |
++ * have a single set_pgd() implementation that does not |
1423 |
++ * need to worry about whether it has 4k or 8k to work |
1424 |
++ * with. |
1425 |
++ * |
1426 |
++ * This ensures PGDs are 8k long: |
1427 |
++ */ |
1428 |
++#define KAISER_USER_PGD_FILL 512 |
1429 |
++/* This ensures they are 8k-aligned: */ |
1430 |
++#define NEXT_PGD_PAGE(name) \ |
1431 |
++ .balign 2 * PAGE_SIZE; \ |
1432 |
++GLOBAL(name) |
1433 |
++#else |
1434 |
++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) |
1435 |
++#define KAISER_USER_PGD_FILL 0 |
1436 |
++#endif |
1437 |
++ |
1438 |
+ /* Automate the creation of 1 to 1 mapping pmd entries */ |
1439 |
+ #define PMDS(START, PERM, COUNT) \ |
1440 |
+ i = 0 ; \ |
1441 |
+@@ -450,9 +471,10 @@ GLOBAL(name) |
1442 |
+ .endr |
1443 |
+ |
1444 |
+ __INITDATA |
1445 |
+-NEXT_PAGE(early_level4_pgt) |
1446 |
++NEXT_PGD_PAGE(early_level4_pgt) |
1447 |
+ .fill 511,8,0 |
1448 |
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
1449 |
++ .fill KAISER_USER_PGD_FILL,8,0 |
1450 |
+ |
1451 |
+ NEXT_PAGE(early_dynamic_pgts) |
1452 |
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 |
1453 |
+@@ -460,16 +482,18 @@ NEXT_PAGE(early_dynamic_pgts) |
1454 |
+ .data |
1455 |
+ |
1456 |
+ #ifndef CONFIG_XEN |
1457 |
+-NEXT_PAGE(init_level4_pgt) |
1458 |
++NEXT_PGD_PAGE(init_level4_pgt) |
1459 |
+ .fill 512,8,0 |
1460 |
++ .fill KAISER_USER_PGD_FILL,8,0 |
1461 |
+ #else |
1462 |
+-NEXT_PAGE(init_level4_pgt) |
1463 |
++NEXT_PGD_PAGE(init_level4_pgt) |
1464 |
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
1465 |
+ .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 |
1466 |
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
1467 |
+ .org init_level4_pgt + L4_START_KERNEL*8, 0 |
1468 |
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ |
1469 |
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
1470 |
++ .fill KAISER_USER_PGD_FILL,8,0 |
1471 |
+ |
1472 |
+ NEXT_PAGE(level3_ident_pgt) |
1473 |
+ .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
1474 |
+@@ -480,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt) |
1475 |
+ */ |
1476 |
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) |
1477 |
+ #endif |
1478 |
++ .fill KAISER_USER_PGD_FILL,8,0 |
1479 |
+ |
1480 |
+ NEXT_PAGE(level3_kernel_pgt) |
1481 |
+ .fill L3_START_KERNEL,8,0 |
1482 |
+diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c |
1483 |
+index 1423ab1b0312..f480b38a03c3 100644 |
1484 |
+--- a/arch/x86/kernel/irqinit.c |
1485 |
++++ b/arch/x86/kernel/irqinit.c |
1486 |
+@@ -51,7 +51,7 @@ static struct irqaction irq2 = { |
1487 |
+ .flags = IRQF_NO_THREAD, |
1488 |
+ }; |
1489 |
+ |
1490 |
+-DEFINE_PER_CPU(vector_irq_t, vector_irq) = { |
1491 |
++DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { |
1492 |
+ [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, |
1493 |
+ }; |
1494 |
+ |
1495 |
+diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c |
1496 |
+index 2bd81e302427..ec1b06dc82d2 100644 |
1497 |
+--- a/arch/x86/kernel/kvmclock.c |
1498 |
++++ b/arch/x86/kernel/kvmclock.c |
1499 |
+@@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock); |
1500 |
+ static struct pvclock_vsyscall_time_info *hv_clock; |
1501 |
+ static struct pvclock_wall_clock wall_clock; |
1502 |
+ |
1503 |
++struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) |
1504 |
++{ |
1505 |
++ return hv_clock; |
1506 |
++} |
1507 |
++ |
1508 |
+ /* |
1509 |
+ * The wallclock is the time of day when we booted. Since then, some time may |
1510 |
+ * have elapsed since the hypervisor wrote the data. So we try to account for |
1511 |
+diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c |
1512 |
+index d6279593bcdd..bc429365b72a 100644 |
1513 |
+--- a/arch/x86/kernel/ldt.c |
1514 |
++++ b/arch/x86/kernel/ldt.c |
1515 |
+@@ -16,6 +16,7 @@ |
1516 |
+ #include <linux/slab.h> |
1517 |
+ #include <linux/vmalloc.h> |
1518 |
+ #include <linux/uaccess.h> |
1519 |
++#include <linux/kaiser.h> |
1520 |
+ |
1521 |
+ #include <asm/ldt.h> |
1522 |
+ #include <asm/desc.h> |
1523 |
+@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) |
1524 |
+ set_ldt(pc->ldt->entries, pc->ldt->size); |
1525 |
+ } |
1526 |
+ |
1527 |
++static void __free_ldt_struct(struct ldt_struct *ldt) |
1528 |
++{ |
1529 |
++ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) |
1530 |
++ vfree(ldt->entries); |
1531 |
++ else |
1532 |
++ free_page((unsigned long)ldt->entries); |
1533 |
++ kfree(ldt); |
1534 |
++} |
1535 |
++ |
1536 |
+ /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ |
1537 |
+ static struct ldt_struct *alloc_ldt_struct(int size) |
1538 |
+ { |
1539 |
+ struct ldt_struct *new_ldt; |
1540 |
+ int alloc_size; |
1541 |
++ int ret; |
1542 |
+ |
1543 |
+ if (size > LDT_ENTRIES) |
1544 |
+ return NULL; |
1545 |
+@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size) |
1546 |
+ return NULL; |
1547 |
+ } |
1548 |
+ |
1549 |
++ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, |
1550 |
++ __PAGE_KERNEL); |
1551 |
+ new_ldt->size = size; |
1552 |
++ if (ret) { |
1553 |
++ __free_ldt_struct(new_ldt); |
1554 |
++ return NULL; |
1555 |
++ } |
1556 |
+ return new_ldt; |
1557 |
+ } |
1558 |
+ |
1559 |
+@@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) |
1560 |
+ if (likely(!ldt)) |
1561 |
+ return; |
1562 |
+ |
1563 |
++ kaiser_remove_mapping((unsigned long)ldt->entries, |
1564 |
++ ldt->size * LDT_ENTRY_SIZE); |
1565 |
+ paravirt_free_ldt(ldt->entries, ldt->size); |
1566 |
+- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) |
1567 |
+- vfree(ldt->entries); |
1568 |
+- else |
1569 |
+- free_page((unsigned long)ldt->entries); |
1570 |
+- kfree(ldt); |
1571 |
++ __free_ldt_struct(ldt); |
1572 |
+ } |
1573 |
+ |
1574 |
+ /* |
1575 |
+diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c |
1576 |
+index 8aa05583bc42..0677bf8d3a42 100644 |
1577 |
+--- a/arch/x86/kernel/paravirt_patch_64.c |
1578 |
++++ b/arch/x86/kernel/paravirt_patch_64.c |
1579 |
+@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); |
1580 |
+ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); |
1581 |
+ DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); |
1582 |
+ DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); |
1583 |
+-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); |
1584 |
+ DEF_NATIVE(pv_cpu_ops, clts, "clts"); |
1585 |
+ DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); |
1586 |
+ |
1587 |
+@@ -62,7 +61,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, |
1588 |
+ PATCH_SITE(pv_mmu_ops, read_cr3); |
1589 |
+ PATCH_SITE(pv_mmu_ops, write_cr3); |
1590 |
+ PATCH_SITE(pv_cpu_ops, clts); |
1591 |
+- PATCH_SITE(pv_mmu_ops, flush_tlb_single); |
1592 |
+ PATCH_SITE(pv_cpu_ops, wbinvd); |
1593 |
+ #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) |
1594 |
+ case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): |
1595 |
+diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c |
1596 |
+index 9f7c21c22477..7c5c5dc90ffa 100644 |
1597 |
+--- a/arch/x86/kernel/process.c |
1598 |
++++ b/arch/x86/kernel/process.c |
1599 |
+@@ -39,7 +39,7 @@ |
1600 |
+ * section. Since TSS's are completely CPU-local, we want them |
1601 |
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong. |
1602 |
+ */ |
1603 |
+-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { |
1604 |
++__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = { |
1605 |
+ .x86_tss = { |
1606 |
+ .sp0 = TOP_OF_INIT_STACK, |
1607 |
+ #ifdef CONFIG_X86_32 |
1608 |
+diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c |
1609 |
+index e67b834279b2..bbaae4cf9e8e 100644 |
1610 |
+--- a/arch/x86/kernel/setup.c |
1611 |
++++ b/arch/x86/kernel/setup.c |
1612 |
+@@ -112,6 +112,7 @@ |
1613 |
+ #include <asm/alternative.h> |
1614 |
+ #include <asm/prom.h> |
1615 |
+ #include <asm/microcode.h> |
1616 |
++#include <asm/kaiser.h> |
1617 |
+ |
1618 |
+ /* |
1619 |
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB |
1620 |
+@@ -1016,6 +1017,12 @@ void __init setup_arch(char **cmdline_p) |
1621 |
+ */ |
1622 |
+ init_hypervisor_platform(); |
1623 |
+ |
1624 |
++ /* |
1625 |
++ * This needs to happen right after XENPV is set on xen and |
1626 |
++ * kaiser_enabled is checked below in cleanup_highmap(). |
1627 |
++ */ |
1628 |
++ kaiser_check_boottime_disable(); |
1629 |
++ |
1630 |
+ x86_init.resources.probe_roms(); |
1631 |
+ |
1632 |
+ /* after parse_early_param, so could debug it */ |
1633 |
+diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c |
1634 |
+index 1c113db9ed57..2bb5ee464df3 100644 |
1635 |
+--- a/arch/x86/kernel/tracepoint.c |
1636 |
++++ b/arch/x86/kernel/tracepoint.c |
1637 |
+@@ -9,10 +9,12 @@ |
1638 |
+ #include <linux/atomic.h> |
1639 |
+ |
1640 |
+ atomic_t trace_idt_ctr = ATOMIC_INIT(0); |
1641 |
++__aligned(PAGE_SIZE) |
1642 |
+ struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, |
1643 |
+ (unsigned long) trace_idt_table }; |
1644 |
+ |
1645 |
+ /* No need to be aligned, but done to keep all IDTs defined the same way. */ |
1646 |
++__aligned(PAGE_SIZE) |
1647 |
+ gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; |
1648 |
+ |
1649 |
+ static int trace_irq_vector_refcount; |
1650 |
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c |
1651 |
+index 796f1ec67469..ccf17dbfea09 100644 |
1652 |
+--- a/arch/x86/kvm/x86.c |
1653 |
++++ b/arch/x86/kvm/x86.c |
1654 |
+@@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
1655 |
+ return 1; |
1656 |
+ |
1657 |
+ /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ |
1658 |
+- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) |
1659 |
++ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) || |
1660 |
++ !is_long_mode(vcpu)) |
1661 |
+ return 1; |
1662 |
+ } |
1663 |
+ |
1664 |
+diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c |
1665 |
+index 422db000d727..a744506856b1 100644 |
1666 |
+--- a/arch/x86/lib/cmdline.c |
1667 |
++++ b/arch/x86/lib/cmdline.c |
1668 |
+@@ -82,3 +82,108 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) |
1669 |
+ |
1670 |
+ return 0; /* Buffer overrun */ |
1671 |
+ } |
1672 |
++ |
1673 |
++/* |
1674 |
++ * Find a non-boolean option (i.e. option=argument). In accordance with |
1675 |
++ * standard Linux practice, if this option is repeated, this returns the |
1676 |
++ * last instance on the command line. |
1677 |
++ * |
1678 |
++ * @cmdline: the cmdline string |
1679 |
++ * @max_cmdline_size: the maximum size of cmdline |
1680 |
++ * @option: option string to look for |
1681 |
++ * @buffer: memory buffer to return the option argument |
1682 |
++ * @bufsize: size of the supplied memory buffer |
1683 |
++ * |
1684 |
++ * Returns the length of the argument (regardless of if it was |
1685 |
++ * truncated to fit in the buffer), or -1 on not found. |
1686 |
++ */ |
1687 |
++static int |
1688 |
++__cmdline_find_option(const char *cmdline, int max_cmdline_size, |
1689 |
++ const char *option, char *buffer, int bufsize) |
1690 |
++{ |
1691 |
++ char c; |
1692 |
++ int pos = 0, len = -1; |
1693 |
++ const char *opptr = NULL; |
1694 |
++ char *bufptr = buffer; |
1695 |
++ enum { |
1696 |
++ st_wordstart = 0, /* Start of word/after whitespace */ |
1697 |
++ st_wordcmp, /* Comparing this word */ |
1698 |
++ st_wordskip, /* Miscompare, skip */ |
1699 |
++ st_bufcpy, /* Copying this to buffer */ |
1700 |
++ } state = st_wordstart; |
1701 |
++ |
1702 |
++ if (!cmdline) |
1703 |
++ return -1; /* No command line */ |
1704 |
++ |
1705 |
++ /* |
1706 |
++ * This 'pos' check ensures we do not overrun |
1707 |
++ * a non-NULL-terminated 'cmdline' |
1708 |
++ */ |
1709 |
++ while (pos++ < max_cmdline_size) { |
1710 |
++ c = *(char *)cmdline++; |
1711 |
++ if (!c) |
1712 |
++ break; |
1713 |
++ |
1714 |
++ switch (state) { |
1715 |
++ case st_wordstart: |
1716 |
++ if (myisspace(c)) |
1717 |
++ break; |
1718 |
++ |
1719 |
++ state = st_wordcmp; |
1720 |
++ opptr = option; |
1721 |
++ /* fall through */ |
1722 |
++ |
1723 |
++ case st_wordcmp: |
1724 |
++ if ((c == '=') && !*opptr) { |
1725 |
++ /* |
1726 |
++ * We matched all the way to the end of the |
1727 |
++ * option we were looking for, prepare to |
1728 |
++ * copy the argument. |
1729 |
++ */ |
1730 |
++ len = 0; |
1731 |
++ bufptr = buffer; |
1732 |
++ state = st_bufcpy; |
1733 |
++ break; |
1734 |
++ } else if (c == *opptr++) { |
1735 |
++ /* |
1736 |
++ * We are currently matching, so continue |
1737 |
++ * to the next character on the cmdline. |
1738 |
++ */ |
1739 |
++ break; |
1740 |
++ } |
1741 |
++ state = st_wordskip; |
1742 |
++ /* fall through */ |
1743 |
++ |
1744 |
++ case st_wordskip: |
1745 |
++ if (myisspace(c)) |
1746 |
++ state = st_wordstart; |
1747 |
++ break; |
1748 |
++ |
1749 |
++ case st_bufcpy: |
1750 |
++ if (myisspace(c)) { |
1751 |
++ state = st_wordstart; |
1752 |
++ } else { |
1753 |
++ /* |
1754 |
++ * Increment len, but don't overrun the |
1755 |
++ * supplied buffer and leave room for the |
1756 |
++ * NULL terminator. |
1757 |
++ */ |
1758 |
++ if (++len < bufsize) |
1759 |
++ *bufptr++ = c; |
1760 |
++ } |
1761 |
++ break; |
1762 |
++ } |
1763 |
++ } |
1764 |
++ |
1765 |
++ if (bufsize) |
1766 |
++ *bufptr = '\0'; |
1767 |
++ |
1768 |
++ return len; |
1769 |
++} |
1770 |
++ |
1771 |
++int cmdline_find_option(const char *cmdline, const char *option, char *buffer, |
1772 |
++ int bufsize) |
1773 |
++{ |
1774 |
++ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, |
1775 |
++ buffer, bufsize); |
1776 |
++} |
1777 |
+diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile |
1778 |
+index 1ae7c141f778..61e6cead9c4a 100644 |
1779 |
+--- a/arch/x86/mm/Makefile |
1780 |
++++ b/arch/x86/mm/Makefile |
1781 |
+@@ -32,3 +32,4 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o |
1782 |
+ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o |
1783 |
+ |
1784 |
+ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o |
1785 |
++obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o |
1786 |
+diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c |
1787 |
+index ed4b372860e4..2bd45ae91eb3 100644 |
1788 |
+--- a/arch/x86/mm/init.c |
1789 |
++++ b/arch/x86/mm/init.c |
1790 |
+@@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void) |
1791 |
+ cr4_set_bits_and_update_boot(X86_CR4_PSE); |
1792 |
+ |
1793 |
+ /* Enable PGE if available */ |
1794 |
+- if (cpu_has_pge) { |
1795 |
++ if (cpu_has_pge && !kaiser_enabled) { |
1796 |
+ cr4_set_bits_and_update_boot(X86_CR4_PGE); |
1797 |
+ __supported_pte_mask |= _PAGE_GLOBAL; |
1798 |
+ } else |
1799 |
+diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c |
1800 |
+index ec081fe0ce2c..d76ec9348cff 100644 |
1801 |
+--- a/arch/x86/mm/init_64.c |
1802 |
++++ b/arch/x86/mm/init_64.c |
1803 |
+@@ -395,6 +395,16 @@ void __init cleanup_highmap(void) |
1804 |
+ continue; |
1805 |
+ if (vaddr < (unsigned long) _text || vaddr > end) |
1806 |
+ set_pmd(pmd, __pmd(0)); |
1807 |
++ else if (kaiser_enabled) { |
1808 |
++ /* |
1809 |
++ * level2_kernel_pgt is initialized with _PAGE_GLOBAL: |
1810 |
++ * clear that now. This is not important, so long as |
1811 |
++ * CR4.PGE remains clear, but it removes an anomaly. |
1812 |
++ * Physical mapping setup below avoids _PAGE_GLOBAL |
1813 |
++ * by use of massage_pgprot() inside pfn_pte() etc. |
1814 |
++ */ |
1815 |
++ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL)); |
1816 |
++ } |
1817 |
+ } |
1818 |
+ } |
1819 |
+ |
1820 |
+diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c |
1821 |
+new file mode 100644 |
1822 |
+index 000000000000..b0b3a69f1c7f |
1823 |
+--- /dev/null |
1824 |
++++ b/arch/x86/mm/kaiser.c |
1825 |
+@@ -0,0 +1,456 @@ |
1826 |
++#include <linux/bug.h> |
1827 |
++#include <linux/kernel.h> |
1828 |
++#include <linux/errno.h> |
1829 |
++#include <linux/string.h> |
1830 |
++#include <linux/types.h> |
1831 |
++#include <linux/bug.h> |
1832 |
++#include <linux/init.h> |
1833 |
++#include <linux/interrupt.h> |
1834 |
++#include <linux/spinlock.h> |
1835 |
++#include <linux/mm.h> |
1836 |
++#include <linux/uaccess.h> |
1837 |
++#include <linux/ftrace.h> |
1838 |
++ |
1839 |
++#undef pr_fmt |
1840 |
++#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt |
1841 |
++ |
1842 |
++#include <asm/kaiser.h> |
1843 |
++#include <asm/tlbflush.h> /* to verify its kaiser declarations */ |
1844 |
++#include <asm/pgtable.h> |
1845 |
++#include <asm/pgalloc.h> |
1846 |
++#include <asm/desc.h> |
1847 |
++#include <asm/cmdline.h> |
1848 |
++ |
1849 |
++int kaiser_enabled __read_mostly = 1; |
1850 |
++EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ |
1851 |
++ |
1852 |
++__visible |
1853 |
++DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
1854 |
++ |
1855 |
++/* |
1856 |
++ * These can have bit 63 set, so we can not just use a plain "or" |
1857 |
++ * instruction to get their value or'd into CR3. It would take |
1858 |
++ * another register. So, we use a memory reference to these instead. |
1859 |
++ * |
1860 |
++ * This is also handy because systems that do not support PCIDs |
1861 |
++ * just end up or'ing a 0 into their CR3, which does no harm. |
1862 |
++ */ |
1863 |
++DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); |
1864 |
++ |
1865 |
++/* |
1866 |
++ * At runtime, the only things we map are some things for CPU |
1867 |
++ * hotplug, and stacks for new processes. No two CPUs will ever |
1868 |
++ * be populating the same addresses, so we only need to ensure |
1869 |
++ * that we protect between two CPUs trying to allocate and |
1870 |
++ * populate the same page table page. |
1871 |
++ * |
1872 |
++ * Only take this lock when doing a set_p[4um]d(), but it is not |
1873 |
++ * needed for doing a set_pte(). We assume that only the *owner* |
1874 |
++ * of a given allocation will be doing this for _their_ |
1875 |
++ * allocation. |
1876 |
++ * |
1877 |
++ * This ensures that once a system has been running for a while |
1878 |
++ * and there have been stacks all over and these page tables |
1879 |
++ * are fully populated, there will be no further acquisitions of |
1880 |
++ * this lock. |
1881 |
++ */ |
1882 |
++static DEFINE_SPINLOCK(shadow_table_allocation_lock); |
1883 |
++ |
1884 |
++/* |
1885 |
++ * Returns -1 on error. |
1886 |
++ */ |
1887 |
++static inline unsigned long get_pa_from_mapping(unsigned long vaddr) |
1888 |
++{ |
1889 |
++ pgd_t *pgd; |
1890 |
++ pud_t *pud; |
1891 |
++ pmd_t *pmd; |
1892 |
++ pte_t *pte; |
1893 |
++ |
1894 |
++ pgd = pgd_offset_k(vaddr); |
1895 |
++ /* |
1896 |
++ * We made all the kernel PGDs present in kaiser_init(). |
1897 |
++ * We expect them to stay that way. |
1898 |
++ */ |
1899 |
++ BUG_ON(pgd_none(*pgd)); |
1900 |
++ /* |
1901 |
++ * PGDs are either 512GB or 128TB on all x86_64 |
1902 |
++ * configurations. We don't handle these. |
1903 |
++ */ |
1904 |
++ BUG_ON(pgd_large(*pgd)); |
1905 |
++ |
1906 |
++ pud = pud_offset(pgd, vaddr); |
1907 |
++ if (pud_none(*pud)) { |
1908 |
++ WARN_ON_ONCE(1); |
1909 |
++ return -1; |
1910 |
++ } |
1911 |
++ |
1912 |
++ if (pud_large(*pud)) |
1913 |
++ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); |
1914 |
++ |
1915 |
++ pmd = pmd_offset(pud, vaddr); |
1916 |
++ if (pmd_none(*pmd)) { |
1917 |
++ WARN_ON_ONCE(1); |
1918 |
++ return -1; |
1919 |
++ } |
1920 |
++ |
1921 |
++ if (pmd_large(*pmd)) |
1922 |
++ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); |
1923 |
++ |
1924 |
++ pte = pte_offset_kernel(pmd, vaddr); |
1925 |
++ if (pte_none(*pte)) { |
1926 |
++ WARN_ON_ONCE(1); |
1927 |
++ return -1; |
1928 |
++ } |
1929 |
++ |
1930 |
++ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); |
1931 |
++} |
1932 |
++ |
1933 |
++/* |
1934 |
++ * This is a relatively normal page table walk, except that it |
1935 |
++ * also tries to allocate page tables pages along the way. |
1936 |
++ * |
1937 |
++ * Returns a pointer to a PTE on success, or NULL on failure. |
1938 |
++ */ |
1939 |
++static pte_t *kaiser_pagetable_walk(unsigned long address) |
1940 |
++{ |
1941 |
++ pmd_t *pmd; |
1942 |
++ pud_t *pud; |
1943 |
++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); |
1944 |
++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); |
1945 |
++ |
1946 |
++ if (pgd_none(*pgd)) { |
1947 |
++ WARN_ONCE(1, "All shadow pgds should have been populated"); |
1948 |
++ return NULL; |
1949 |
++ } |
1950 |
++ BUILD_BUG_ON(pgd_large(*pgd) != 0); |
1951 |
++ |
1952 |
++ pud = pud_offset(pgd, address); |
1953 |
++ /* The shadow page tables do not use large mappings: */ |
1954 |
++ if (pud_large(*pud)) { |
1955 |
++ WARN_ON(1); |
1956 |
++ return NULL; |
1957 |
++ } |
1958 |
++ if (pud_none(*pud)) { |
1959 |
++ unsigned long new_pmd_page = __get_free_page(gfp); |
1960 |
++ if (!new_pmd_page) |
1961 |
++ return NULL; |
1962 |
++ spin_lock(&shadow_table_allocation_lock); |
1963 |
++ if (pud_none(*pud)) { |
1964 |
++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); |
1965 |
++ __inc_zone_page_state(virt_to_page((void *) |
1966 |
++ new_pmd_page), NR_KAISERTABLE); |
1967 |
++ } else |
1968 |
++ free_page(new_pmd_page); |
1969 |
++ spin_unlock(&shadow_table_allocation_lock); |
1970 |
++ } |
1971 |
++ |
1972 |
++ pmd = pmd_offset(pud, address); |
1973 |
++ /* The shadow page tables do not use large mappings: */ |
1974 |
++ if (pmd_large(*pmd)) { |
1975 |
++ WARN_ON(1); |
1976 |
++ return NULL; |
1977 |
++ } |
1978 |
++ if (pmd_none(*pmd)) { |
1979 |
++ unsigned long new_pte_page = __get_free_page(gfp); |
1980 |
++ if (!new_pte_page) |
1981 |
++ return NULL; |
1982 |
++ spin_lock(&shadow_table_allocation_lock); |
1983 |
++ if (pmd_none(*pmd)) { |
1984 |
++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); |
1985 |
++ __inc_zone_page_state(virt_to_page((void *) |
1986 |
++ new_pte_page), NR_KAISERTABLE); |
1987 |
++ } else |
1988 |
++ free_page(new_pte_page); |
1989 |
++ spin_unlock(&shadow_table_allocation_lock); |
1990 |
++ } |
1991 |
++ |
1992 |
++ return pte_offset_kernel(pmd, address); |
1993 |
++} |
1994 |
++ |
1995 |
++static int kaiser_add_user_map(const void *__start_addr, unsigned long size, |
1996 |
++ unsigned long flags) |
1997 |
++{ |
1998 |
++ int ret = 0; |
1999 |
++ pte_t *pte; |
2000 |
++ unsigned long start_addr = (unsigned long )__start_addr; |
2001 |
++ unsigned long address = start_addr & PAGE_MASK; |
2002 |
++ unsigned long end_addr = PAGE_ALIGN(start_addr + size); |
2003 |
++ unsigned long target_address; |
2004 |
++ |
2005 |
++ /* |
2006 |
++ * It is convenient for callers to pass in __PAGE_KERNEL etc, |
2007 |
++ * and there is no actual harm from setting _PAGE_GLOBAL, so |
2008 |
++ * long as CR4.PGE is not set. But it is nonetheless troubling |
2009 |
++ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser" |
2010 |
++ * requires that not to be #defined to 0): so mask it off here. |
2011 |
++ */ |
2012 |
++ flags &= ~_PAGE_GLOBAL; |
2013 |
++ |
2014 |
++ for (; address < end_addr; address += PAGE_SIZE) { |
2015 |
++ target_address = get_pa_from_mapping(address); |
2016 |
++ if (target_address == -1) { |
2017 |
++ ret = -EIO; |
2018 |
++ break; |
2019 |
++ } |
2020 |
++ pte = kaiser_pagetable_walk(address); |
2021 |
++ if (!pte) { |
2022 |
++ ret = -ENOMEM; |
2023 |
++ break; |
2024 |
++ } |
2025 |
++ if (pte_none(*pte)) { |
2026 |
++ set_pte(pte, __pte(flags | target_address)); |
2027 |
++ } else { |
2028 |
++ pte_t tmp; |
2029 |
++ set_pte(&tmp, __pte(flags | target_address)); |
2030 |
++ WARN_ON_ONCE(!pte_same(*pte, tmp)); |
2031 |
++ } |
2032 |
++ } |
2033 |
++ return ret; |
2034 |
++} |
2035 |
++ |
2036 |
++static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) |
2037 |
++{ |
2038 |
++ unsigned long size = end - start; |
2039 |
++ |
2040 |
++ return kaiser_add_user_map(start, size, flags); |
2041 |
++} |
2042 |
++ |
2043 |
++/* |
2044 |
++ * Ensure that the top level of the (shadow) page tables are |
2045 |
++ * entirely populated. This ensures that all processes that get |
2046 |
++ * forked have the same entries. This way, we do not have to |
2047 |
++ * ever go set up new entries in older processes. |
2048 |
++ * |
2049 |
++ * Note: we never free these, so there are no updates to them |
2050 |
++ * after this. |
2051 |
++ */ |
2052 |
++static void __init kaiser_init_all_pgds(void) |
2053 |
++{ |
2054 |
++ pgd_t *pgd; |
2055 |
++ int i = 0; |
2056 |
++ |
2057 |
++ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); |
2058 |
++ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { |
2059 |
++ pgd_t new_pgd; |
2060 |
++ pud_t *pud = pud_alloc_one(&init_mm, |
2061 |
++ PAGE_OFFSET + i * PGDIR_SIZE); |
2062 |
++ if (!pud) { |
2063 |
++ WARN_ON(1); |
2064 |
++ break; |
2065 |
++ } |
2066 |
++ inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); |
2067 |
++ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); |
2068 |
++ /* |
2069 |
++ * Make sure not to stomp on some other pgd entry. |
2070 |
++ */ |
2071 |
++ if (!pgd_none(pgd[i])) { |
2072 |
++ WARN_ON(1); |
2073 |
++ continue; |
2074 |
++ } |
2075 |
++ set_pgd(pgd + i, new_pgd); |
2076 |
++ } |
2077 |
++} |
2078 |
++ |
2079 |
++#define kaiser_add_user_map_early(start, size, flags) do { \ |
2080 |
++ int __ret = kaiser_add_user_map(start, size, flags); \ |
2081 |
++ WARN_ON(__ret); \ |
2082 |
++} while (0) |
2083 |
++ |
2084 |
++#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ |
2085 |
++ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ |
2086 |
++ WARN_ON(__ret); \ |
2087 |
++} while (0) |
2088 |
++ |
2089 |
++void __init kaiser_check_boottime_disable(void) |
2090 |
++{ |
2091 |
++ bool enable = true; |
2092 |
++ char arg[5]; |
2093 |
++ int ret; |
2094 |
++ |
2095 |
++ if (boot_cpu_has(X86_FEATURE_XENPV)) |
2096 |
++ goto silent_disable; |
2097 |
++ |
2098 |
++ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); |
2099 |
++ if (ret > 0) { |
2100 |
++ if (!strncmp(arg, "on", 2)) |
2101 |
++ goto enable; |
2102 |
++ |
2103 |
++ if (!strncmp(arg, "off", 3)) |
2104 |
++ goto disable; |
2105 |
++ |
2106 |
++ if (!strncmp(arg, "auto", 4)) |
2107 |
++ goto skip; |
2108 |
++ } |
2109 |
++ |
2110 |
++ if (cmdline_find_option_bool(boot_command_line, "nopti")) |
2111 |
++ goto disable; |
2112 |
++ |
2113 |
++skip: |
2114 |
++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) |
2115 |
++ goto disable; |
2116 |
++ |
2117 |
++enable: |
2118 |
++ if (enable) |
2119 |
++ setup_force_cpu_cap(X86_FEATURE_KAISER); |
2120 |
++ |
2121 |
++ return; |
2122 |
++ |
2123 |
++disable: |
2124 |
++ pr_info("disabled\n"); |
2125 |
++ |
2126 |
++silent_disable: |
2127 |
++ kaiser_enabled = 0; |
2128 |
++ setup_clear_cpu_cap(X86_FEATURE_KAISER); |
2129 |
++} |
2130 |
++ |
2131 |
++/* |
2132 |
++ * If anything in here fails, we will likely die on one of the |
2133 |
++ * first kernel->user transitions and init will die. But, we |
2134 |
++ * will have most of the kernel up by then and should be able to |
2135 |
++ * get a clean warning out of it. If we BUG_ON() here, we run |
2136 |
++ * the risk of being before we have good console output. |
2137 |
++ */ |
2138 |
++void __init kaiser_init(void) |
2139 |
++{ |
2140 |
++ int cpu; |
2141 |
++ |
2142 |
++ if (!kaiser_enabled) |
2143 |
++ return; |
2144 |
++ |
2145 |
++ kaiser_init_all_pgds(); |
2146 |
++ |
2147 |
++ for_each_possible_cpu(cpu) { |
2148 |
++ void *percpu_vaddr = __per_cpu_user_mapped_start + |
2149 |
++ per_cpu_offset(cpu); |
2150 |
++ unsigned long percpu_sz = __per_cpu_user_mapped_end - |
2151 |
++ __per_cpu_user_mapped_start; |
2152 |
++ kaiser_add_user_map_early(percpu_vaddr, percpu_sz, |
2153 |
++ __PAGE_KERNEL); |
2154 |
++ } |
2155 |
++ |
2156 |
++ /* |
2157 |
++ * Map the entry/exit text section, which is needed at |
2158 |
++ * switches from user to and from kernel. |
2159 |
++ */ |
2160 |
++ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, |
2161 |
++ __PAGE_KERNEL_RX); |
2162 |
++ |
2163 |
++#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) |
2164 |
++ kaiser_add_user_map_ptrs_early(__irqentry_text_start, |
2165 |
++ __irqentry_text_end, |
2166 |
++ __PAGE_KERNEL_RX); |
2167 |
++#endif |
2168 |
++ kaiser_add_user_map_early((void *)idt_descr.address, |
2169 |
++ sizeof(gate_desc) * NR_VECTORS, |
2170 |
++ __PAGE_KERNEL_RO); |
2171 |
++#ifdef CONFIG_TRACING |
2172 |
++ kaiser_add_user_map_early(&trace_idt_descr, |
2173 |
++ sizeof(trace_idt_descr), |
2174 |
++ __PAGE_KERNEL); |
2175 |
++ kaiser_add_user_map_early(&trace_idt_table, |
2176 |
++ sizeof(gate_desc) * NR_VECTORS, |
2177 |
++ __PAGE_KERNEL); |
2178 |
++#endif |
2179 |
++ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), |
2180 |
++ __PAGE_KERNEL); |
2181 |
++ kaiser_add_user_map_early(&debug_idt_table, |
2182 |
++ sizeof(gate_desc) * NR_VECTORS, |
2183 |
++ __PAGE_KERNEL); |
2184 |
++ |
2185 |
++ pr_info("enabled\n"); |
2186 |
++} |
2187 |
++ |
2188 |
++/* Add a mapping to the shadow mapping, and synchronize the mappings */ |
2189 |
++int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) |
2190 |
++{ |
2191 |
++ if (!kaiser_enabled) |
2192 |
++ return 0; |
2193 |
++ return kaiser_add_user_map((const void *)addr, size, flags); |
2194 |
++} |
2195 |
++ |
2196 |
++void kaiser_remove_mapping(unsigned long start, unsigned long size) |
2197 |
++{ |
2198 |
++ extern void unmap_pud_range_nofree(pgd_t *pgd, |
2199 |
++ unsigned long start, unsigned long end); |
2200 |
++ unsigned long end = start + size; |
2201 |
++ unsigned long addr, next; |
2202 |
++ pgd_t *pgd; |
2203 |
++ |
2204 |
++ if (!kaiser_enabled) |
2205 |
++ return; |
2206 |
++ pgd = native_get_shadow_pgd(pgd_offset_k(start)); |
2207 |
++ for (addr = start; addr < end; pgd++, addr = next) { |
2208 |
++ next = pgd_addr_end(addr, end); |
2209 |
++ unmap_pud_range_nofree(pgd, addr, next); |
2210 |
++ } |
2211 |
++} |
2212 |
++ |
2213 |
++/* |
2214 |
++ * Page table pages are page-aligned. The lower half of the top |
2215 |
++ * level is used for userspace and the top half for the kernel. |
2216 |
++ * This returns true for user pages that need to get copied into |
2217 |
++ * both the user and kernel copies of the page tables, and false |
2218 |
++ * for kernel pages that should only be in the kernel copy. |
2219 |
++ */ |
2220 |
++static inline bool is_userspace_pgd(pgd_t *pgdp) |
2221 |
++{ |
2222 |
++ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); |
2223 |
++} |
2224 |
++ |
2225 |
++pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) |
2226 |
++{ |
2227 |
++ if (!kaiser_enabled) |
2228 |
++ return pgd; |
2229 |
++ /* |
2230 |
++ * Do we need to also populate the shadow pgd? Check _PAGE_USER to |
2231 |
++ * skip cases like kexec and EFI which make temporary low mappings. |
2232 |
++ */ |
2233 |
++ if (pgd.pgd & _PAGE_USER) { |
2234 |
++ if (is_userspace_pgd(pgdp)) { |
2235 |
++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; |
2236 |
++ /* |
2237 |
++ * Even if the entry is *mapping* userspace, ensure |
2238 |
++ * that userspace can not use it. This way, if we |
2239 |
++ * get out to userspace running on the kernel CR3, |
2240 |
++ * userspace will crash instead of running. |
2241 |
++ */ |
2242 |
++ if (__supported_pte_mask & _PAGE_NX) |
2243 |
++ pgd.pgd |= _PAGE_NX; |
2244 |
++ } |
2245 |
++ } else if (!pgd.pgd) { |
2246 |
++ /* |
2247 |
++ * pgd_clear() cannot check _PAGE_USER, and is even used to |
2248 |
++ * clear corrupted pgd entries: so just rely on cases like |
2249 |
++ * kexec and EFI never to be using pgd_clear(). |
2250 |
++ */ |
2251 |
++ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && |
2252 |
++ is_userspace_pgd(pgdp)) |
2253 |
++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; |
2254 |
++ } |
2255 |
++ return pgd; |
2256 |
++} |
2257 |
++ |
2258 |
++void kaiser_setup_pcid(void) |
2259 |
++{ |
2260 |
++ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; |
2261 |
++ |
2262 |
++ if (this_cpu_has(X86_FEATURE_PCID)) |
2263 |
++ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; |
2264 |
++ /* |
2265 |
++ * These variables are used by the entry/exit |
2266 |
++ * code to change PCID and pgd and TLB flushing. |
2267 |
++ */ |
2268 |
++ this_cpu_write(x86_cr3_pcid_user, user_cr3); |
2269 |
++} |
2270 |
++ |
2271 |
++/* |
2272 |
++ * Make a note that this cpu will need to flush USER tlb on return to user. |
2273 |
++ * If cpu does not have PCID, then the NOFLUSH bit will never have been set. |
2274 |
++ */ |
2275 |
++void kaiser_flush_tlb_on_return_to_user(void) |
2276 |
++{ |
2277 |
++ if (this_cpu_has(X86_FEATURE_PCID)) |
2278 |
++ this_cpu_write(x86_cr3_pcid_user, |
2279 |
++ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); |
2280 |
++} |
2281 |
++EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); |
2282 |
+diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c |
2283 |
+index 4e5ac46adc9d..81ec7c02f968 100644 |
2284 |
+--- a/arch/x86/mm/kasan_init_64.c |
2285 |
++++ b/arch/x86/mm/kasan_init_64.c |
2286 |
+@@ -121,11 +121,16 @@ void __init kasan_init(void) |
2287 |
+ kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), |
2288 |
+ (void *)KASAN_SHADOW_END); |
2289 |
+ |
2290 |
+- memset(kasan_zero_page, 0, PAGE_SIZE); |
2291 |
+- |
2292 |
+ load_cr3(init_level4_pgt); |
2293 |
+ __flush_tlb_all(); |
2294 |
+- init_task.kasan_depth = 0; |
2295 |
+ |
2296 |
++ /* |
2297 |
++ * kasan_zero_page has been used as early shadow memory, thus it may |
2298 |
++ * contain some garbage. Now we can clear it, since after the TLB flush |
2299 |
++ * no one should write to it. |
2300 |
++ */ |
2301 |
++ memset(kasan_zero_page, 0, PAGE_SIZE); |
2302 |
++ |
2303 |
++ init_task.kasan_depth = 0; |
2304 |
+ pr_info("KernelAddressSanitizer initialized\n"); |
2305 |
+ } |
2306 |
+diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c |
2307 |
+index b599a780a5a9..79377e2a7bcd 100644 |
2308 |
+--- a/arch/x86/mm/pageattr.c |
2309 |
++++ b/arch/x86/mm/pageattr.c |
2310 |
+@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); |
2311 |
+ #define CPA_FLUSHTLB 1 |
2312 |
+ #define CPA_ARRAY 2 |
2313 |
+ #define CPA_PAGES_ARRAY 4 |
2314 |
++#define CPA_FREE_PAGETABLES 8 |
2315 |
+ |
2316 |
+ #ifdef CONFIG_PROC_FS |
2317 |
+ static unsigned long direct_pages_count[PG_LEVEL_NUM]; |
2318 |
+@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, |
2319 |
+ return 0; |
2320 |
+ } |
2321 |
+ |
2322 |
+-static bool try_to_free_pte_page(pte_t *pte) |
2323 |
++static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) |
2324 |
+ { |
2325 |
+ int i; |
2326 |
+ |
2327 |
++ if (!(cpa->flags & CPA_FREE_PAGETABLES)) |
2328 |
++ return false; |
2329 |
++ |
2330 |
+ for (i = 0; i < PTRS_PER_PTE; i++) |
2331 |
+ if (!pte_none(pte[i])) |
2332 |
+ return false; |
2333 |
+@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte) |
2334 |
+ return true; |
2335 |
+ } |
2336 |
+ |
2337 |
+-static bool try_to_free_pmd_page(pmd_t *pmd) |
2338 |
++static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) |
2339 |
+ { |
2340 |
+ int i; |
2341 |
+ |
2342 |
++ if (!(cpa->flags & CPA_FREE_PAGETABLES)) |
2343 |
++ return false; |
2344 |
++ |
2345 |
+ for (i = 0; i < PTRS_PER_PMD; i++) |
2346 |
+ if (!pmd_none(pmd[i])) |
2347 |
+ return false; |
2348 |
+@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud) |
2349 |
+ return true; |
2350 |
+ } |
2351 |
+ |
2352 |
+-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) |
2353 |
++static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, |
2354 |
++ unsigned long start, |
2355 |
++ unsigned long end) |
2356 |
+ { |
2357 |
+ pte_t *pte = pte_offset_kernel(pmd, start); |
2358 |
+ |
2359 |
+@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) |
2360 |
+ pte++; |
2361 |
+ } |
2362 |
+ |
2363 |
+- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { |
2364 |
++ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { |
2365 |
+ pmd_clear(pmd); |
2366 |
+ return true; |
2367 |
+ } |
2368 |
+ return false; |
2369 |
+ } |
2370 |
+ |
2371 |
+-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, |
2372 |
++static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, |
2373 |
+ unsigned long start, unsigned long end) |
2374 |
+ { |
2375 |
+- if (unmap_pte_range(pmd, start, end)) |
2376 |
+- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) |
2377 |
++ if (unmap_pte_range(cpa, pmd, start, end)) |
2378 |
++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) |
2379 |
+ pud_clear(pud); |
2380 |
+ } |
2381 |
+ |
2382 |
+-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) |
2383 |
++static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, |
2384 |
++ unsigned long start, unsigned long end) |
2385 |
+ { |
2386 |
+ pmd_t *pmd = pmd_offset(pud, start); |
2387 |
+ |
2388 |
+@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) |
2389 |
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; |
2390 |
+ unsigned long pre_end = min_t(unsigned long, end, next_page); |
2391 |
+ |
2392 |
+- __unmap_pmd_range(pud, pmd, start, pre_end); |
2393 |
++ __unmap_pmd_range(cpa, pud, pmd, start, pre_end); |
2394 |
+ |
2395 |
+ start = pre_end; |
2396 |
+ pmd++; |
2397 |
+@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) |
2398 |
+ if (pmd_large(*pmd)) |
2399 |
+ pmd_clear(pmd); |
2400 |
+ else |
2401 |
+- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); |
2402 |
++ __unmap_pmd_range(cpa, pud, pmd, |
2403 |
++ start, start + PMD_SIZE); |
2404 |
+ |
2405 |
+ start += PMD_SIZE; |
2406 |
+ pmd++; |
2407 |
+@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) |
2408 |
+ * 4K leftovers? |
2409 |
+ */ |
2410 |
+ if (start < end) |
2411 |
+- return __unmap_pmd_range(pud, pmd, start, end); |
2412 |
++ return __unmap_pmd_range(cpa, pud, pmd, start, end); |
2413 |
+ |
2414 |
+ /* |
2415 |
+ * Try again to free the PMD page if haven't succeeded above. |
2416 |
+ */ |
2417 |
+ if (!pud_none(*pud)) |
2418 |
+- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) |
2419 |
++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) |
2420 |
+ pud_clear(pud); |
2421 |
+ } |
2422 |
+ |
2423 |
+-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
2424 |
++static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, |
2425 |
++ unsigned long start, |
2426 |
++ unsigned long end) |
2427 |
+ { |
2428 |
+ pud_t *pud = pud_offset(pgd, start); |
2429 |
+ |
2430 |
+@@ -840,7 +853,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
2431 |
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; |
2432 |
+ unsigned long pre_end = min_t(unsigned long, end, next_page); |
2433 |
+ |
2434 |
+- unmap_pmd_range(pud, start, pre_end); |
2435 |
++ unmap_pmd_range(cpa, pud, start, pre_end); |
2436 |
+ |
2437 |
+ start = pre_end; |
2438 |
+ pud++; |
2439 |
+@@ -854,7 +867,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
2440 |
+ if (pud_large(*pud)) |
2441 |
+ pud_clear(pud); |
2442 |
+ else |
2443 |
+- unmap_pmd_range(pud, start, start + PUD_SIZE); |
2444 |
++ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); |
2445 |
+ |
2446 |
+ start += PUD_SIZE; |
2447 |
+ pud++; |
2448 |
+@@ -864,7 +877,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
2449 |
+ * 2M leftovers? |
2450 |
+ */ |
2451 |
+ if (start < end) |
2452 |
+- unmap_pmd_range(pud, start, end); |
2453 |
++ unmap_pmd_range(cpa, pud, start, end); |
2454 |
+ |
2455 |
+ /* |
2456 |
+ * No need to try to free the PUD page because we'll free it in |
2457 |
+@@ -872,6 +885,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
2458 |
+ */ |
2459 |
+ } |
2460 |
+ |
2461 |
++static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
2462 |
++{ |
2463 |
++ struct cpa_data cpa = { |
2464 |
++ .flags = CPA_FREE_PAGETABLES, |
2465 |
++ }; |
2466 |
++ |
2467 |
++ __unmap_pud_range(&cpa, pgd, start, end); |
2468 |
++} |
2469 |
++ |
2470 |
++void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) |
2471 |
++{ |
2472 |
++ struct cpa_data cpa = { |
2473 |
++ .flags = 0, |
2474 |
++ }; |
2475 |
++ |
2476 |
++ __unmap_pud_range(&cpa, pgd, start, end); |
2477 |
++} |
2478 |
++ |
2479 |
+ static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) |
2480 |
+ { |
2481 |
+ pgd_t *pgd_entry = root + pgd_index(addr); |
2482 |
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c |
2483 |
+index fb0a9dd1d6e4..dbc27a2b4ad5 100644 |
2484 |
+--- a/arch/x86/mm/pgtable.c |
2485 |
++++ b/arch/x86/mm/pgtable.c |
2486 |
+@@ -6,7 +6,7 @@ |
2487 |
+ #include <asm/fixmap.h> |
2488 |
+ #include <asm/mtrr.h> |
2489 |
+ |
2490 |
+-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO |
2491 |
++#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) |
2492 |
+ |
2493 |
+ #ifdef CONFIG_HIGHPTE |
2494 |
+ #define PGALLOC_USER_GFP __GFP_HIGHMEM |
2495 |
+@@ -340,14 +340,24 @@ static inline void _pgd_free(pgd_t *pgd) |
2496 |
+ kmem_cache_free(pgd_cache, pgd); |
2497 |
+ } |
2498 |
+ #else |
2499 |
++ |
2500 |
++/* |
2501 |
++ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is |
2502 |
++ * both 8k in size and 8k-aligned. That lets us just flip bit 12 |
2503 |
++ * in a pointer to swap between the two 4k halves. |
2504 |
++ */ |
2505 |
++#define PGD_ALLOCATION_ORDER kaiser_enabled |
2506 |
++ |
2507 |
+ static inline pgd_t *_pgd_alloc(void) |
2508 |
+ { |
2509 |
+- return (pgd_t *)__get_free_page(PGALLOC_GFP); |
2510 |
++ /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */ |
2511 |
++ return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT, |
2512 |
++ PGD_ALLOCATION_ORDER); |
2513 |
+ } |
2514 |
+ |
2515 |
+ static inline void _pgd_free(pgd_t *pgd) |
2516 |
+ { |
2517 |
+- free_page((unsigned long)pgd); |
2518 |
++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); |
2519 |
+ } |
2520 |
+ #endif /* CONFIG_X86_PAE */ |
2521 |
+ |
2522 |
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c |
2523 |
+index 7a4cdb632508..7cad01af6dcd 100644 |
2524 |
+--- a/arch/x86/mm/tlb.c |
2525 |
++++ b/arch/x86/mm/tlb.c |
2526 |
+@@ -6,13 +6,14 @@ |
2527 |
+ #include <linux/interrupt.h> |
2528 |
+ #include <linux/module.h> |
2529 |
+ #include <linux/cpu.h> |
2530 |
++#include <linux/debugfs.h> |
2531 |
+ |
2532 |
+ #include <asm/tlbflush.h> |
2533 |
+ #include <asm/mmu_context.h> |
2534 |
+ #include <asm/cache.h> |
2535 |
+ #include <asm/apic.h> |
2536 |
+ #include <asm/uv/uv.h> |
2537 |
+-#include <linux/debugfs.h> |
2538 |
++#include <asm/kaiser.h> |
2539 |
+ |
2540 |
+ /* |
2541 |
+ * TLB flushing, formerly SMP-only |
2542 |
+@@ -34,6 +35,36 @@ struct flush_tlb_info { |
2543 |
+ unsigned long flush_end; |
2544 |
+ }; |
2545 |
+ |
2546 |
++static void load_new_mm_cr3(pgd_t *pgdir) |
2547 |
++{ |
2548 |
++ unsigned long new_mm_cr3 = __pa(pgdir); |
2549 |
++ |
2550 |
++ if (kaiser_enabled) { |
2551 |
++ /* |
2552 |
++ * We reuse the same PCID for different tasks, so we must |
2553 |
++ * flush all the entries for the PCID out when we change tasks. |
2554 |
++ * Flush KERN below, flush USER when returning to userspace in |
2555 |
++ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro. |
2556 |
++ * |
2557 |
++ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could |
2558 |
++ * do it here, but can only be used if X86_FEATURE_INVPCID is |
2559 |
++ * available - and many machines support pcid without invpcid. |
2560 |
++ * |
2561 |
++ * If X86_CR3_PCID_KERN_FLUSH actually added something, then it |
2562 |
++ * would be needed in the write_cr3() below - if PCIDs enabled. |
2563 |
++ */ |
2564 |
++ BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH); |
2565 |
++ kaiser_flush_tlb_on_return_to_user(); |
2566 |
++ } |
2567 |
++ |
2568 |
++ /* |
2569 |
++ * Caution: many callers of this function expect |
2570 |
++ * that load_cr3() is serializing and orders TLB |
2571 |
++ * fills with respect to the mm_cpumask writes. |
2572 |
++ */ |
2573 |
++ write_cr3(new_mm_cr3); |
2574 |
++} |
2575 |
++ |
2576 |
+ /* |
2577 |
+ * We cannot call mmdrop() because we are in interrupt context, |
2578 |
+ * instead update mm->cpu_vm_mask. |
2579 |
+@@ -45,7 +76,7 @@ void leave_mm(int cpu) |
2580 |
+ BUG(); |
2581 |
+ if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { |
2582 |
+ cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); |
2583 |
+- load_cr3(swapper_pg_dir); |
2584 |
++ load_new_mm_cr3(swapper_pg_dir); |
2585 |
+ /* |
2586 |
+ * This gets called in the idle path where RCU |
2587 |
+ * functions differently. Tracing normally |
2588 |
+@@ -105,7 +136,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, |
2589 |
+ * ordering guarantee we need. |
2590 |
+ * |
2591 |
+ */ |
2592 |
+- load_cr3(next->pgd); |
2593 |
++ load_new_mm_cr3(next->pgd); |
2594 |
+ |
2595 |
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
2596 |
+ |
2597 |
+@@ -152,7 +183,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, |
2598 |
+ * As above, load_cr3() is serializing and orders TLB |
2599 |
+ * fills with respect to the mm_cpumask write. |
2600 |
+ */ |
2601 |
+- load_cr3(next->pgd); |
2602 |
++ load_new_mm_cr3(next->pgd); |
2603 |
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
2604 |
+ load_mm_cr4(next); |
2605 |
+ load_mm_ldt(next); |
2606 |
+diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h |
2607 |
+index ef2e8c97e183..a461b6604fd9 100644 |
2608 |
+--- a/include/asm-generic/vmlinux.lds.h |
2609 |
++++ b/include/asm-generic/vmlinux.lds.h |
2610 |
+@@ -725,7 +725,14 @@ |
2611 |
+ */ |
2612 |
+ #define PERCPU_INPUT(cacheline) \ |
2613 |
+ VMLINUX_SYMBOL(__per_cpu_start) = .; \ |
2614 |
++ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ |
2615 |
+ *(.data..percpu..first) \ |
2616 |
++ . = ALIGN(cacheline); \ |
2617 |
++ *(.data..percpu..user_mapped) \ |
2618 |
++ *(.data..percpu..user_mapped..shared_aligned) \ |
2619 |
++ . = ALIGN(PAGE_SIZE); \ |
2620 |
++ *(.data..percpu..user_mapped..page_aligned) \ |
2621 |
++ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ |
2622 |
+ . = ALIGN(PAGE_SIZE); \ |
2623 |
+ *(.data..percpu..page_aligned) \ |
2624 |
+ . = ALIGN(cacheline); \ |
2625 |
+diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h |
2626 |
+new file mode 100644 |
2627 |
+index 000000000000..58c55b1589d0 |
2628 |
+--- /dev/null |
2629 |
++++ b/include/linux/kaiser.h |
2630 |
+@@ -0,0 +1,52 @@ |
2631 |
++#ifndef _LINUX_KAISER_H |
2632 |
++#define _LINUX_KAISER_H |
2633 |
++ |
2634 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
2635 |
++#include <asm/kaiser.h> |
2636 |
++ |
2637 |
++static inline int kaiser_map_thread_stack(void *stack) |
2638 |
++{ |
2639 |
++ /* |
2640 |
++ * Map that page of kernel stack on which we enter from user context. |
2641 |
++ */ |
2642 |
++ return kaiser_add_mapping((unsigned long)stack + |
2643 |
++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL); |
2644 |
++} |
2645 |
++ |
2646 |
++static inline void kaiser_unmap_thread_stack(void *stack) |
2647 |
++{ |
2648 |
++ /* |
2649 |
++ * Note: may be called even when kaiser_map_thread_stack() failed. |
2650 |
++ */ |
2651 |
++ kaiser_remove_mapping((unsigned long)stack + |
2652 |
++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE); |
2653 |
++} |
2654 |
++#else |
2655 |
++ |
2656 |
++/* |
2657 |
++ * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which |
2658 |
++ * includes architectures that support KAISER, but have it disabled. |
2659 |
++ */ |
2660 |
++ |
2661 |
++static inline void kaiser_init(void) |
2662 |
++{ |
2663 |
++} |
2664 |
++static inline int kaiser_add_mapping(unsigned long addr, |
2665 |
++ unsigned long size, unsigned long flags) |
2666 |
++{ |
2667 |
++ return 0; |
2668 |
++} |
2669 |
++static inline void kaiser_remove_mapping(unsigned long start, |
2670 |
++ unsigned long size) |
2671 |
++{ |
2672 |
++} |
2673 |
++static inline int kaiser_map_thread_stack(void *stack) |
2674 |
++{ |
2675 |
++ return 0; |
2676 |
++} |
2677 |
++static inline void kaiser_unmap_thread_stack(void *stack) |
2678 |
++{ |
2679 |
++} |
2680 |
++ |
2681 |
++#endif /* !CONFIG_PAGE_TABLE_ISOLATION */ |
2682 |
++#endif /* _LINUX_KAISER_H */ |
2683 |
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h |
2684 |
+index ff88d6189411..b93b578cfa42 100644 |
2685 |
+--- a/include/linux/mmzone.h |
2686 |
++++ b/include/linux/mmzone.h |
2687 |
+@@ -131,8 +131,9 @@ enum zone_stat_item { |
2688 |
+ NR_SLAB_RECLAIMABLE, |
2689 |
+ NR_SLAB_UNRECLAIMABLE, |
2690 |
+ NR_PAGETABLE, /* used for pagetables */ |
2691 |
+- NR_KERNEL_STACK, |
2692 |
+ /* Second 128 byte cacheline */ |
2693 |
++ NR_KERNEL_STACK, |
2694 |
++ NR_KAISERTABLE, |
2695 |
+ NR_UNSTABLE_NFS, /* NFS unstable pages */ |
2696 |
+ NR_BOUNCE, |
2697 |
+ NR_VMSCAN_WRITE, |
2698 |
+diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h |
2699 |
+index 8f16299ca068..8902f23bb770 100644 |
2700 |
+--- a/include/linux/percpu-defs.h |
2701 |
++++ b/include/linux/percpu-defs.h |
2702 |
+@@ -35,6 +35,12 @@ |
2703 |
+ |
2704 |
+ #endif |
2705 |
+ |
2706 |
++#ifdef CONFIG_PAGE_TABLE_ISOLATION |
2707 |
++#define USER_MAPPED_SECTION "..user_mapped" |
2708 |
++#else |
2709 |
++#define USER_MAPPED_SECTION "" |
2710 |
++#endif |
2711 |
++ |
2712 |
+ /* |
2713 |
+ * Base implementations of per-CPU variable declarations and definitions, where |
2714 |
+ * the section in which the variable is to be placed is provided by the |
2715 |
+@@ -115,6 +121,12 @@ |
2716 |
+ #define DEFINE_PER_CPU(type, name) \ |
2717 |
+ DEFINE_PER_CPU_SECTION(type, name, "") |
2718 |
+ |
2719 |
++#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ |
2720 |
++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) |
2721 |
++ |
2722 |
++#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ |
2723 |
++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) |
2724 |
++ |
2725 |
+ /* |
2726 |
+ * Declaration/definition used for per-CPU variables that must come first in |
2727 |
+ * the set of variables. |
2728 |
+@@ -144,6 +156,14 @@ |
2729 |
+ DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ |
2730 |
+ ____cacheline_aligned_in_smp |
2731 |
+ |
2732 |
++#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ |
2733 |
++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ |
2734 |
++ ____cacheline_aligned_in_smp |
2735 |
++ |
2736 |
++#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ |
2737 |
++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ |
2738 |
++ ____cacheline_aligned_in_smp |
2739 |
++ |
2740 |
+ #define DECLARE_PER_CPU_ALIGNED(type, name) \ |
2741 |
+ DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ |
2742 |
+ ____cacheline_aligned |
2743 |
+@@ -162,11 +182,21 @@ |
2744 |
+ #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ |
2745 |
+ DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \ |
2746 |
+ __aligned(PAGE_SIZE) |
2747 |
++/* |
2748 |
++ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode. |
2749 |
++ */ |
2750 |
++#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ |
2751 |
++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ |
2752 |
++ __aligned(PAGE_SIZE) |
2753 |
++ |
2754 |
++#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ |
2755 |
++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ |
2756 |
++ __aligned(PAGE_SIZE) |
2757 |
+ |
2758 |
+ /* |
2759 |
+ * Declaration/definition used for per-CPU variables that must be read mostly. |
2760 |
+ */ |
2761 |
+-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ |
2762 |
++#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ |
2763 |
+ DECLARE_PER_CPU_SECTION(type, name, "..read_mostly") |
2764 |
+ |
2765 |
+ #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ |
2766 |
+diff --git a/init/main.c b/init/main.c |
2767 |
+index 9e64d7097f1a..49926d95442f 100644 |
2768 |
+--- a/init/main.c |
2769 |
++++ b/init/main.c |
2770 |
+@@ -81,6 +81,7 @@ |
2771 |
+ #include <linux/integrity.h> |
2772 |
+ #include <linux/proc_ns.h> |
2773 |
+ #include <linux/io.h> |
2774 |
++#include <linux/kaiser.h> |
2775 |
+ |
2776 |
+ #include <asm/io.h> |
2777 |
+ #include <asm/bugs.h> |
2778 |
+@@ -492,6 +493,7 @@ static void __init mm_init(void) |
2779 |
+ pgtable_init(); |
2780 |
+ vmalloc_init(); |
2781 |
+ ioremap_huge_init(); |
2782 |
++ kaiser_init(); |
2783 |
+ } |
2784 |
+ |
2785 |
+ asmlinkage __visible void __init start_kernel(void) |
2786 |
+diff --git a/kernel/fork.c b/kernel/fork.c |
2787 |
+index 68cfda1c1800..ac00f14208b7 100644 |
2788 |
+--- a/kernel/fork.c |
2789 |
++++ b/kernel/fork.c |
2790 |
+@@ -58,6 +58,7 @@ |
2791 |
+ #include <linux/tsacct_kern.h> |
2792 |
+ #include <linux/cn_proc.h> |
2793 |
+ #include <linux/freezer.h> |
2794 |
++#include <linux/kaiser.h> |
2795 |
+ #include <linux/delayacct.h> |
2796 |
+ #include <linux/taskstats_kern.h> |
2797 |
+ #include <linux/random.h> |
2798 |
+@@ -169,6 +170,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
2799 |
+ |
2800 |
+ static inline void free_thread_info(struct thread_info *ti) |
2801 |
+ { |
2802 |
++ kaiser_unmap_thread_stack(ti); |
2803 |
+ free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
2804 |
+ } |
2805 |
+ # else |
2806 |
+@@ -352,6 +354,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) |
2807 |
+ goto free_ti; |
2808 |
+ |
2809 |
+ tsk->stack = ti; |
2810 |
++ |
2811 |
++ err = kaiser_map_thread_stack(tsk->stack); |
2812 |
++ if (err) |
2813 |
++ goto free_ti; |
2814 |
+ #ifdef CONFIG_SECCOMP |
2815 |
+ /* |
2816 |
+ * We must handle setting up seccomp filters once we're under |
2817 |
+diff --git a/mm/vmstat.c b/mm/vmstat.c |
2818 |
+index c344e3609c53..324b7e90b4c5 100644 |
2819 |
+--- a/mm/vmstat.c |
2820 |
++++ b/mm/vmstat.c |
2821 |
+@@ -736,6 +736,7 @@ const char * const vmstat_text[] = { |
2822 |
+ "nr_slab_unreclaimable", |
2823 |
+ "nr_page_table_pages", |
2824 |
+ "nr_kernel_stack", |
2825 |
++ "nr_overhead", |
2826 |
+ "nr_unstable", |
2827 |
+ "nr_bounce", |
2828 |
+ "nr_vmscan_write", |
2829 |
+diff --git a/security/Kconfig b/security/Kconfig |
2830 |
+index e45237897b43..a3ebb6ee5bd5 100644 |
2831 |
+--- a/security/Kconfig |
2832 |
++++ b/security/Kconfig |
2833 |
+@@ -31,6 +31,16 @@ config SECURITY |
2834 |
+ |
2835 |
+ If you are unsure how to answer this question, answer N. |
2836 |
+ |
2837 |
++config PAGE_TABLE_ISOLATION |
2838 |
++ bool "Remove the kernel mapping in user mode" |
2839 |
++ default y |
2840 |
++ depends on X86_64 && SMP |
2841 |
++ help |
2842 |
++ This enforces a strict kernel and user space isolation, in order |
2843 |
++ to close hardware side channels on kernel address information. |
2844 |
++ |
2845 |
++ If you are unsure how to answer this question, answer Y. |
2846 |
++ |
2847 |
+ config SECURITYFS |
2848 |
+ bool "Enable the securityfs filesystem" |
2849 |
+ help |