
From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.14 commit in: /
Date: Thu, 08 Feb 2018 00:41:18
Message-Id: 1518050456.8e2a2ed3d7a6fc577c4e7954c2e9968ed574aa46.mpagano@gentoo
1 commit: 8e2a2ed3d7a6fc577c4e7954c2e9968ed574aa46
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Thu Feb 8 00:40:27 2018 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Thu Feb 8 00:40:56 2018 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=8e2a2ed3
7
8 Linux patch 4.14.18
9
10 1017_linux-4.14.18.patch | 3790 ++++++++++++++++++++++++++++++++++++++++++++++
11 1 file changed, 3790 insertions(+)
12
13 diff --git a/1017_linux-4.14.18.patch b/1017_linux-4.14.18.patch
14 new file mode 100644
15 index 0000000..07fbf45
16 --- /dev/null
17 +++ b/1017_linux-4.14.18.patch
18 @@ -0,0 +1,3790 @@
19 +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
20 +index 8122b5f98ea1..c76afdcafbef 100644
21 +--- a/Documentation/admin-guide/kernel-parameters.txt
22 ++++ b/Documentation/admin-guide/kernel-parameters.txt
23 +@@ -2718,8 +2718,6 @@
24 + norandmaps Don't use address space randomization. Equivalent to
25 + echo 0 > /proc/sys/kernel/randomize_va_space
26 +
27 +- noreplace-paravirt [X86,IA-64,PV_OPS] Don't patch paravirt_ops
28 +-
29 + noreplace-smp [X86-32,SMP] Don't replace SMP instructions
30 + with UP alternatives
31 +
32 +diff --git a/Documentation/speculation.txt b/Documentation/speculation.txt
33 +new file mode 100644
34 +index 000000000000..e9e6cbae2841
35 +--- /dev/null
36 ++++ b/Documentation/speculation.txt
37 +@@ -0,0 +1,90 @@
38 ++This document explains potential effects of speculation, and how undesirable
39 ++effects can be mitigated portably using common APIs.
40 ++
41 ++===========
42 ++Speculation
43 ++===========
44 ++
45 ++To improve performance and minimize average latencies, many contemporary CPUs
46 ++employ speculative execution techniques such as branch prediction, performing
47 ++work which may be discarded at a later stage.
48 ++
49 ++Typically speculative execution cannot be observed from architectural state,
50 ++such as the contents of registers. However, in some cases it is possible to
51 ++observe its impact on microarchitectural state, such as the presence or
52 ++absence of data in caches. Such state may form side-channels which can be
53 ++observed to extract secret information.
54 ++
55 ++For example, in the presence of branch prediction, it is possible for bounds
56 ++checks to be ignored by code which is speculatively executed. Consider the
57 ++following code:
58 ++
59 ++ int load_array(int *array, unsigned int index)
60 ++ {
61 ++ if (index >= MAX_ARRAY_ELEMS)
62 ++ return 0;
63 ++ else
64 ++ return array[index];
65 ++ }
66 ++
67 ++Which, on arm64, may be compiled to an assembly sequence such as:
68 ++
69 ++ CMP <index>, #MAX_ARRAY_ELEMS
70 ++ B.LT less
71 ++ MOV <returnval>, #0
72 ++ RET
73 ++ less:
74 ++ LDR <returnval>, [<array>, <index>]
75 ++ RET
76 ++
77 ++It is possible that a CPU mis-predicts the conditional branch, and
78 ++speculatively loads array[index], even if index >= MAX_ARRAY_ELEMS. This
79 ++value will subsequently be discarded, but the speculated load may affect
80 ++microarchitectural state which can be subsequently measured.
81 ++
82 ++More complex sequences involving multiple dependent memory accesses may
83 ++result in sensitive information being leaked. Consider the following
84 ++code, building on the prior example:
85 ++
86 ++ int load_dependent_arrays(int *arr1, int *arr2, int index)
87 ++ {
88 ++ int val1, val2;
89 ++
90 ++ val1 = load_array(arr1, index);
91 ++ val2 = load_array(arr2, val1);
92 ++
93 ++ return val2;
94 ++ }
95 ++
96 ++Under speculation, the first call to load_array() may return the value
97 ++of an out-of-bounds address, while the second call will influence
98 ++microarchitectural state dependent on this value. This may provide an
99 ++arbitrary read primitive.
100 ++
101 ++====================================
102 ++Mitigating speculation side-channels
103 ++====================================
104 ++
105 ++The kernel provides a generic API to ensure that bounds checks are
106 ++respected even under speculation. Architectures which are affected by
107 ++speculation-based side-channels are expected to implement these
108 ++primitives.
109 ++
110 ++The array_index_nospec() helper in <linux/nospec.h> can be used to
111 ++prevent information from being leaked via side-channels.
112 ++
113 ++A call to array_index_nospec(index, size) returns a sanitized index
114 ++value that is bounded to [0, size) even under cpu speculation
115 ++conditions.
116 ++
117 ++This can be used to protect the earlier load_array() example:
118 ++
119 ++ int load_array(int *array, unsigned int index)
120 ++ {
121 ++ if (index >= MAX_ARRAY_ELEMS)
122 ++ return 0;
123 ++ else {
124 ++ index = array_index_nospec(index, MAX_ARRAY_ELEMS);
125 ++ return array[index];
126 ++ }
127 ++ }
128 +diff --git a/Makefile b/Makefile
129 +index 7ed993896dd5..a69e5da9ed86 100644
130 +--- a/Makefile
131 ++++ b/Makefile
132 +@@ -1,7 +1,7 @@
133 + # SPDX-License-Identifier: GPL-2.0
134 + VERSION = 4
135 + PATCHLEVEL = 14
136 +-SUBLEVEL = 17
137 ++SUBLEVEL = 18
138 + EXTRAVERSION =
139 + NAME = Petit Gorille
140 +
141 +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
142 +index cb782ac1c35d..fe418226df7f 100644
143 +--- a/arch/powerpc/Kconfig
144 ++++ b/arch/powerpc/Kconfig
145 +@@ -164,6 +164,7 @@ config PPC
146 + select GENERIC_CLOCKEVENTS_BROADCAST if SMP
147 + select GENERIC_CMOS_UPDATE
148 + select GENERIC_CPU_AUTOPROBE
149 ++ select GENERIC_CPU_VULNERABILITIES if PPC_BOOK3S_64
150 + select GENERIC_IRQ_SHOW
151 + select GENERIC_IRQ_SHOW_LEVEL
152 + select GENERIC_SMP_IDLE_THREAD
153 +diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
154 +index 935059cb9e40..9527a4c6cbc2 100644
155 +--- a/arch/powerpc/kernel/setup_64.c
156 ++++ b/arch/powerpc/kernel/setup_64.c
157 +@@ -38,6 +38,7 @@
158 + #include <linux/memory.h>
159 + #include <linux/nmi.h>
160 +
161 ++#include <asm/debugfs.h>
162 + #include <asm/io.h>
163 + #include <asm/kdump.h>
164 + #include <asm/prom.h>
165 +@@ -884,4 +885,41 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable)
166 + if (!no_rfi_flush)
167 + rfi_flush_enable(enable);
168 + }
169 ++
170 ++#ifdef CONFIG_DEBUG_FS
171 ++static int rfi_flush_set(void *data, u64 val)
172 ++{
173 ++ if (val == 1)
174 ++ rfi_flush_enable(true);
175 ++ else if (val == 0)
176 ++ rfi_flush_enable(false);
177 ++ else
178 ++ return -EINVAL;
179 ++
180 ++ return 0;
181 ++}
182 ++
183 ++static int rfi_flush_get(void *data, u64 *val)
184 ++{
185 ++ *val = rfi_flush ? 1 : 0;
186 ++ return 0;
187 ++}
188 ++
189 ++DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n");
190 ++
191 ++static __init int rfi_flush_debugfs_init(void)
192 ++{
193 ++ debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush);
194 ++ return 0;
195 ++}
196 ++device_initcall(rfi_flush_debugfs_init);
197 ++#endif
198 ++
199 ++ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
200 ++{
201 ++ if (rfi_flush)
202 ++ return sprintf(buf, "Mitigation: RFI Flush\n");
203 ++
204 ++ return sprintf(buf, "Vulnerable\n");
205 ++}
206 + #endif /* CONFIG_PPC_BOOK3S_64 */
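
The powerpc hunk above adds two user-visible interfaces: a Meltdown entry under the generic CPU vulnerabilities sysfs directory and a debugfs toggle for the RFI flush. As a rough illustration, not part of the patch, the short program below reads both from user space; the paths are assumptions derived from the code, namely /sys/devices/system/cpu/vulnerabilities/meltdown for the GENERIC_CPU_VULNERABILITIES attribute and /sys/kernel/debug/powerpc/rfi_flush for the debugfs file when debugfs is mounted in the usual place.

    #include <stdio.h>

    static void show(const char *path)
    {
            char line[128];
            FILE *f = fopen(path, "r");

            if (!f) {
                    printf("%-55s (not available on this kernel)\n", path);
                    return;
            }
            if (fgets(line, sizeof(line), f))
                    printf("%-55s %s", path, line); /* e.g. "Mitigation: RFI Flush" */
            fclose(f);
    }

    int main(void)
    {
            show("/sys/devices/system/cpu/vulnerabilities/meltdown");
            show("/sys/kernel/debug/powerpc/rfi_flush");     /* prints "1" or "0" */
            return 0;
    }
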
207 +diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
208 +index 03505ffbe1b6..60e21ccfb6d6 100644
209 +--- a/arch/x86/entry/common.c
210 ++++ b/arch/x86/entry/common.c
211 +@@ -21,6 +21,7 @@
212 + #include <linux/export.h>
213 + #include <linux/context_tracking.h>
214 + #include <linux/user-return-notifier.h>
215 ++#include <linux/nospec.h>
216 + #include <linux/uprobes.h>
217 + #include <linux/livepatch.h>
218 + #include <linux/syscalls.h>
219 +@@ -208,7 +209,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
220 + * special case only applies after poking regs and before the
221 + * very next return to user mode.
222 + */
223 +- current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
224 ++ ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
225 + #endif
226 +
227 + user_enter_irqoff();
228 +@@ -284,7 +285,8 @@ __visible void do_syscall_64(struct pt_regs *regs)
229 + * regs->orig_ax, which changes the behavior of some syscalls.
230 + */
231 + if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
232 +- regs->ax = sys_call_table[nr & __SYSCALL_MASK](
233 ++ nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls);
234 ++ regs->ax = sys_call_table[nr](
235 + regs->di, regs->si, regs->dx,
236 + regs->r10, regs->r8, regs->r9);
237 + }
238 +@@ -306,7 +308,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
239 + unsigned int nr = (unsigned int)regs->orig_ax;
240 +
241 + #ifdef CONFIG_IA32_EMULATION
242 +- current->thread.status |= TS_COMPAT;
243 ++ ti->status |= TS_COMPAT;
244 + #endif
245 +
246 + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
247 +@@ -320,6 +322,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
248 + }
249 +
250 + if (likely(nr < IA32_NR_syscalls)) {
251 ++ nr = array_index_nospec(nr, IA32_NR_syscalls);
252 + /*
253 + * It's possible that a 32-bit syscall implementation
254 + * takes a 64-bit parameter but nonetheless assumes that
255 +diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
256 +index 60c4c342316c..2a35b1e0fb90 100644
257 +--- a/arch/x86/entry/entry_32.S
258 ++++ b/arch/x86/entry/entry_32.S
259 +@@ -252,7 +252,8 @@ ENTRY(__switch_to_asm)
260 + * exist, overwrite the RSB with entries which capture
261 + * speculative execution to prevent attack.
262 + */
263 +- FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
264 ++ /* Clobbers %ebx */
265 ++ FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
266 + #endif
267 +
268 + /* restore callee-saved registers */
269 +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
270 +index be6b66464f6a..16e2d72e79a0 100644
271 +--- a/arch/x86/entry/entry_64.S
272 ++++ b/arch/x86/entry/entry_64.S
273 +@@ -232,91 +232,20 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
274 + pushq %r9 /* pt_regs->r9 */
275 + pushq %r10 /* pt_regs->r10 */
276 + pushq %r11 /* pt_regs->r11 */
277 +- sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
278 +- UNWIND_HINT_REGS extra=0
279 +-
280 +- TRACE_IRQS_OFF
281 +-
282 +- /*
283 +- * If we need to do entry work or if we guess we'll need to do
284 +- * exit work, go straight to the slow path.
285 +- */
286 +- movq PER_CPU_VAR(current_task), %r11
287 +- testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
288 +- jnz entry_SYSCALL64_slow_path
289 +-
290 +-entry_SYSCALL_64_fastpath:
291 +- /*
292 +- * Easy case: enable interrupts and issue the syscall. If the syscall
293 +- * needs pt_regs, we'll call a stub that disables interrupts again
294 +- * and jumps to the slow path.
295 +- */
296 +- TRACE_IRQS_ON
297 +- ENABLE_INTERRUPTS(CLBR_NONE)
298 +-#if __SYSCALL_MASK == ~0
299 +- cmpq $__NR_syscall_max, %rax
300 +-#else
301 +- andl $__SYSCALL_MASK, %eax
302 +- cmpl $__NR_syscall_max, %eax
303 +-#endif
304 +- ja 1f /* return -ENOSYS (already in pt_regs->ax) */
305 +- movq %r10, %rcx
306 +-
307 +- /*
308 +- * This call instruction is handled specially in stub_ptregs_64.
309 +- * It might end up jumping to the slow path. If it jumps, RAX
310 +- * and all argument registers are clobbered.
311 +- */
312 +-#ifdef CONFIG_RETPOLINE
313 +- movq sys_call_table(, %rax, 8), %rax
314 +- call __x86_indirect_thunk_rax
315 +-#else
316 +- call *sys_call_table(, %rax, 8)
317 +-#endif
318 +-.Lentry_SYSCALL_64_after_fastpath_call:
319 +-
320 +- movq %rax, RAX(%rsp)
321 +-1:
322 ++ pushq %rbx /* pt_regs->rbx */
323 ++ pushq %rbp /* pt_regs->rbp */
324 ++ pushq %r12 /* pt_regs->r12 */
325 ++ pushq %r13 /* pt_regs->r13 */
326 ++ pushq %r14 /* pt_regs->r14 */
327 ++ pushq %r15 /* pt_regs->r15 */
328 ++ UNWIND_HINT_REGS
329 +
330 +- /*
331 +- * If we get here, then we know that pt_regs is clean for SYSRET64.
332 +- * If we see that no exit work is required (which we are required
333 +- * to check with IRQs off), then we can go straight to SYSRET64.
334 +- */
335 +- DISABLE_INTERRUPTS(CLBR_ANY)
336 + TRACE_IRQS_OFF
337 +- movq PER_CPU_VAR(current_task), %r11
338 +- testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
339 +- jnz 1f
340 +-
341 +- LOCKDEP_SYS_EXIT
342 +- TRACE_IRQS_ON /* user mode is traced as IRQs on */
343 +- movq RIP(%rsp), %rcx
344 +- movq EFLAGS(%rsp), %r11
345 +- addq $6*8, %rsp /* skip extra regs -- they were preserved */
346 +- UNWIND_HINT_EMPTY
347 +- jmp .Lpop_c_regs_except_rcx_r11_and_sysret
348 +
349 +-1:
350 +- /*
351 +- * The fast path looked good when we started, but something changed
352 +- * along the way and we need to switch to the slow path. Calling
353 +- * raise(3) will trigger this, for example. IRQs are off.
354 +- */
355 +- TRACE_IRQS_ON
356 +- ENABLE_INTERRUPTS(CLBR_ANY)
357 +- SAVE_EXTRA_REGS
358 +- movq %rsp, %rdi
359 +- call syscall_return_slowpath /* returns with IRQs disabled */
360 +- jmp return_from_SYSCALL_64
361 +-
362 +-entry_SYSCALL64_slow_path:
363 + /* IRQs are off. */
364 +- SAVE_EXTRA_REGS
365 + movq %rsp, %rdi
366 + call do_syscall_64 /* returns with IRQs disabled */
367 +
368 +-return_from_SYSCALL_64:
369 + TRACE_IRQS_IRETQ /* we're about to change IF */
370 +
371 + /*
372 +@@ -389,7 +318,6 @@ syscall_return_via_sysret:
373 + /* rcx and r11 are already restored (see code above) */
374 + UNWIND_HINT_EMPTY
375 + POP_EXTRA_REGS
376 +-.Lpop_c_regs_except_rcx_r11_and_sysret:
377 + popq %rsi /* skip r11 */
378 + popq %r10
379 + popq %r9
380 +@@ -420,47 +348,6 @@ syscall_return_via_sysret:
381 + USERGS_SYSRET64
382 + END(entry_SYSCALL_64)
383 +
384 +-ENTRY(stub_ptregs_64)
385 +- /*
386 +- * Syscalls marked as needing ptregs land here.
387 +- * If we are on the fast path, we need to save the extra regs,
388 +- * which we achieve by trying again on the slow path. If we are on
389 +- * the slow path, the extra regs are already saved.
390 +- *
391 +- * RAX stores a pointer to the C function implementing the syscall.
392 +- * IRQs are on.
393 +- */
394 +- cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
395 +- jne 1f
396 +-
397 +- /*
398 +- * Called from fast path -- disable IRQs again, pop return address
399 +- * and jump to slow path
400 +- */
401 +- DISABLE_INTERRUPTS(CLBR_ANY)
402 +- TRACE_IRQS_OFF
403 +- popq %rax
404 +- UNWIND_HINT_REGS extra=0
405 +- jmp entry_SYSCALL64_slow_path
406 +-
407 +-1:
408 +- JMP_NOSPEC %rax /* Called from C */
409 +-END(stub_ptregs_64)
410 +-
411 +-.macro ptregs_stub func
412 +-ENTRY(ptregs_\func)
413 +- UNWIND_HINT_FUNC
414 +- leaq \func(%rip), %rax
415 +- jmp stub_ptregs_64
416 +-END(ptregs_\func)
417 +-.endm
418 +-
419 +-/* Instantiate ptregs_stub for each ptregs-using syscall */
420 +-#define __SYSCALL_64_QUAL_(sym)
421 +-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
422 +-#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
423 +-#include <asm/syscalls_64.h>
424 +-
425 + /*
426 + * %rdi: prev task
427 + * %rsi: next task
428 +@@ -495,7 +382,8 @@ ENTRY(__switch_to_asm)
429 + * exist, overwrite the RSB with entries which capture
430 + * speculative execution to prevent attack.
431 + */
432 +- FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
433 ++ /* Clobbers %rbx */
434 ++ FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
435 + #endif
436 +
437 + /* restore callee-saved registers */
438 +diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
439 +index 9c09775e589d..c176d2fab1da 100644
440 +--- a/arch/x86/entry/syscall_64.c
441 ++++ b/arch/x86/entry/syscall_64.c
442 +@@ -7,14 +7,11 @@
443 + #include <asm/asm-offsets.h>
444 + #include <asm/syscall.h>
445 +
446 +-#define __SYSCALL_64_QUAL_(sym) sym
447 +-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
448 +-
449 +-#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
450 ++#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
451 + #include <asm/syscalls_64.h>
452 + #undef __SYSCALL_64
453 +
454 +-#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
455 ++#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
456 +
457 + extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
458 +
459 +diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
460 +index 0927cdc4f946..4d111616524b 100644
461 +--- a/arch/x86/include/asm/asm-prototypes.h
462 ++++ b/arch/x86/include/asm/asm-prototypes.h
463 +@@ -38,5 +38,7 @@ INDIRECT_THUNK(dx)
464 + INDIRECT_THUNK(si)
465 + INDIRECT_THUNK(di)
466 + INDIRECT_THUNK(bp)
467 +-INDIRECT_THUNK(sp)
468 ++asmlinkage void __fill_rsb(void);
469 ++asmlinkage void __clear_rsb(void);
470 ++
471 + #endif /* CONFIG_RETPOLINE */
472 +diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
473 +index 01727dbc294a..1e7c955b6303 100644
474 +--- a/arch/x86/include/asm/barrier.h
475 ++++ b/arch/x86/include/asm/barrier.h
476 +@@ -24,6 +24,34 @@
477 + #define wmb() asm volatile("sfence" ::: "memory")
478 + #endif
479 +
480 ++/**
481 ++ * array_index_mask_nospec() - generate a mask that is ~0UL when the
482 ++ * bounds check succeeds and 0 otherwise
483 ++ * @index: array element index
484 ++ * @size: number of elements in array
485 ++ *
486 ++ * Returns:
487 ++ * 0 - (index < size)
488 ++ */
489 ++static inline unsigned long array_index_mask_nospec(unsigned long index,
490 ++ unsigned long size)
491 ++{
492 ++ unsigned long mask;
493 ++
494 ++ asm ("cmp %1,%2; sbb %0,%0;"
495 ++ :"=r" (mask)
496 ++ :"r"(size),"r" (index)
497 ++ :"cc");
498 ++ return mask;
499 ++}
500 ++
501 ++/* Override the default implementation from linux/nospec.h. */
502 ++#define array_index_mask_nospec array_index_mask_nospec
503 ++
504 ++/* Prevent speculative execution past this barrier. */
505 ++#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \
506 ++ "lfence", X86_FEATURE_LFENCE_RDTSC)
507 ++
508 + #ifdef CONFIG_X86_PPRO_FENCE
509 + #define dma_rmb() rmb()
510 + #else
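
An aside on the inline assembly in the barrier.h hunk above (editorial note, not part of the patch): with AT&T operand order, "cmp %1,%2" computes index - size and sets the carry flag exactly when index < size (unsigned), and "sbb %0,%0" then yields 0 - CF, i.e. an all-ones mask when the bounds check passes and zero otherwise. A rough C analogue is sketched below under that interpretation; the kernel keeps the real sequence in asm so the compiler cannot turn the comparison back into a conditional branch that the CPU could mispredict.

    /*
     * Rough C analogue of the cmp/sbb pair, for reference only. A compiler
     * will usually lower this to setb/neg or sbb, but unlike the inline asm
     * there is no guarantee it stays branch-free.
     */
    static inline unsigned long mask_if_below(unsigned long index,
                                              unsigned long size)
    {
            return 0UL - (unsigned long)(index < size);     /* ~0UL or 0UL */
    }
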
511 +diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
512 +index ea9a7dde62e5..70eddb3922ff 100644
513 +--- a/arch/x86/include/asm/cpufeature.h
514 ++++ b/arch/x86/include/asm/cpufeature.h
515 +@@ -29,6 +29,7 @@ enum cpuid_leafs
516 + CPUID_8000_000A_EDX,
517 + CPUID_7_ECX,
518 + CPUID_8000_0007_EBX,
519 ++ CPUID_7_EDX,
520 + };
521 +
522 + #ifdef CONFIG_X86_FEATURE_NAMES
523 +@@ -79,8 +80,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
524 + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \
525 + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
526 + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
527 ++ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
528 + REQUIRED_MASK_CHECK || \
529 +- BUILD_BUG_ON_ZERO(NCAPINTS != 18))
530 ++ BUILD_BUG_ON_ZERO(NCAPINTS != 19))
531 +
532 + #define DISABLED_MASK_BIT_SET(feature_bit) \
533 + ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
534 +@@ -101,8 +103,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
535 + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \
536 + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
537 + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
538 ++ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
539 + DISABLED_MASK_CHECK || \
540 +- BUILD_BUG_ON_ZERO(NCAPINTS != 18))
541 ++ BUILD_BUG_ON_ZERO(NCAPINTS != 19))
542 +
543 + #define cpu_has(c, bit) \
544 + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
545 +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
546 +index 25b9375c1484..73b5fff159a4 100644
547 +--- a/arch/x86/include/asm/cpufeatures.h
548 ++++ b/arch/x86/include/asm/cpufeatures.h
549 +@@ -13,7 +13,7 @@
550 + /*
551 + * Defines x86 CPU feature bits
552 + */
553 +-#define NCAPINTS 18 /* N 32-bit words worth of info */
554 ++#define NCAPINTS 19 /* N 32-bit words worth of info */
555 + #define NBUGINTS 1 /* N 32-bit bug flags */
556 +
557 + /*
558 +@@ -203,14 +203,14 @@
559 + #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
560 + #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
561 + #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
562 +-#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */
563 +-#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */
564 ++#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
565 ++#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
566 + #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
567 +-#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
568 +-#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */
569 +
570 + #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
571 +-#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
572 ++#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
573 ++
574 ++#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
575 +
576 + /* Virtualization flags: Linux defined, word 8 */
577 + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
578 +@@ -271,6 +271,9 @@
579 + #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
580 + #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
581 + #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
582 ++#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
583 ++#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
584 ++#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
585 +
586 + /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
587 + #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
588 +@@ -319,6 +322,13 @@
589 + #define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */
590 + #define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */
591 +
592 ++/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
593 ++#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
594 ++#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
595 ++#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
596 ++#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
597 ++#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
598 ++
599 + /*
600 + * BUG word(s)
601 + */
602 +diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
603 +index e428e16dd822..c6a3af198294 100644
604 +--- a/arch/x86/include/asm/disabled-features.h
605 ++++ b/arch/x86/include/asm/disabled-features.h
606 +@@ -71,6 +71,7 @@
607 + #define DISABLED_MASK15 0
608 + #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57)
609 + #define DISABLED_MASK17 0
610 +-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
611 ++#define DISABLED_MASK18 0
612 ++#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
613 +
614 + #endif /* _ASM_X86_DISABLED_FEATURES_H */
615 +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
616 +index 64c4a30e0d39..e203169931c7 100644
617 +--- a/arch/x86/include/asm/fixmap.h
618 ++++ b/arch/x86/include/asm/fixmap.h
619 +@@ -137,8 +137,10 @@ enum fixed_addresses {
620 +
621 + extern void reserve_top_address(unsigned long reserve);
622 +
623 +-#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
624 +-#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
625 ++#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
626 ++#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
627 ++#define FIXADDR_TOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
628 ++#define FIXADDR_TOT_START (FIXADDR_TOP - FIXADDR_TOT_SIZE)
629 +
630 + extern int fixmaps_set;
631 +
632 +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
633 +index fa11fb1fa570..eb83ff1bae8f 100644
634 +--- a/arch/x86/include/asm/msr-index.h
635 ++++ b/arch/x86/include/asm/msr-index.h
636 +@@ -39,6 +39,13 @@
637 +
638 + /* Intel MSRs. Some also available on other CPUs */
639 +
640 ++#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
641 ++#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
642 ++#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */
643 ++
644 ++#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
645 ++#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
646 ++
647 + #define MSR_PPIN_CTL 0x0000004e
648 + #define MSR_PPIN 0x0000004f
649 +
650 +@@ -57,6 +64,11 @@
651 + #define SNB_C3_AUTO_UNDEMOTE (1UL << 28)
652 +
653 + #define MSR_MTRRcap 0x000000fe
654 ++
655 ++#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
656 ++#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
657 ++#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
658 ++
659 + #define MSR_IA32_BBL_CR_CTL 0x00000119
660 + #define MSR_IA32_BBL_CR_CTL3 0x0000011e
661 +
662 +diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
663 +index 07962f5f6fba..30df295f6d94 100644
664 +--- a/arch/x86/include/asm/msr.h
665 ++++ b/arch/x86/include/asm/msr.h
666 +@@ -214,8 +214,7 @@ static __always_inline unsigned long long rdtsc_ordered(void)
667 + * that some other imaginary CPU is updating continuously with a
668 + * time stamp.
669 + */
670 +- alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
671 +- "lfence", X86_FEATURE_LFENCE_RDTSC);
672 ++ barrier_nospec();
673 + return rdtsc();
674 + }
675 +
676 +diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
677 +index 4ad41087ce0e..4d57894635f2 100644
678 +--- a/arch/x86/include/asm/nospec-branch.h
679 ++++ b/arch/x86/include/asm/nospec-branch.h
680 +@@ -1,56 +1,12 @@
681 + /* SPDX-License-Identifier: GPL-2.0 */
682 +
683 +-#ifndef __NOSPEC_BRANCH_H__
684 +-#define __NOSPEC_BRANCH_H__
685 ++#ifndef _ASM_X86_NOSPEC_BRANCH_H_
686 ++#define _ASM_X86_NOSPEC_BRANCH_H_
687 +
688 + #include <asm/alternative.h>
689 + #include <asm/alternative-asm.h>
690 + #include <asm/cpufeatures.h>
691 +
692 +-/*
693 +- * Fill the CPU return stack buffer.
694 +- *
695 +- * Each entry in the RSB, if used for a speculative 'ret', contains an
696 +- * infinite 'pause; lfence; jmp' loop to capture speculative execution.
697 +- *
698 +- * This is required in various cases for retpoline and IBRS-based
699 +- * mitigations for the Spectre variant 2 vulnerability. Sometimes to
700 +- * eliminate potentially bogus entries from the RSB, and sometimes
701 +- * purely to ensure that it doesn't get empty, which on some CPUs would
702 +- * allow predictions from other (unwanted!) sources to be used.
703 +- *
704 +- * We define a CPP macro such that it can be used from both .S files and
705 +- * inline assembly. It's possible to do a .macro and then include that
706 +- * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
707 +- */
708 +-
709 +-#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
710 +-#define RSB_FILL_LOOPS 16 /* To avoid underflow */
711 +-
712 +-/*
713 +- * Google experimented with loop-unrolling and this turned out to be
714 +- * the optimal version — two calls, each with their own speculation
715 +- * trap should their return address end up getting used, in a loop.
716 +- */
717 +-#define __FILL_RETURN_BUFFER(reg, nr, sp) \
718 +- mov $(nr/2), reg; \
719 +-771: \
720 +- call 772f; \
721 +-773: /* speculation trap */ \
722 +- pause; \
723 +- lfence; \
724 +- jmp 773b; \
725 +-772: \
726 +- call 774f; \
727 +-775: /* speculation trap */ \
728 +- pause; \
729 +- lfence; \
730 +- jmp 775b; \
731 +-774: \
732 +- dec reg; \
733 +- jnz 771b; \
734 +- add $(BITS_PER_LONG/8) * nr, sp;
735 +-
736 + #ifdef __ASSEMBLY__
737 +
738 + /*
739 +@@ -121,17 +77,10 @@
740 + #endif
741 + .endm
742 +
743 +- /*
744 +- * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
745 +- * monstrosity above, manually.
746 +- */
747 +-.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
748 ++/* This clobbers the BX register */
749 ++.macro FILL_RETURN_BUFFER nr:req ftr:req
750 + #ifdef CONFIG_RETPOLINE
751 +- ANNOTATE_NOSPEC_ALTERNATIVE
752 +- ALTERNATIVE "jmp .Lskip_rsb_\@", \
753 +- __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \
754 +- \ftr
755 +-.Lskip_rsb_\@:
756 ++ ALTERNATIVE "", "call __clear_rsb", \ftr
757 + #endif
758 + .endm
759 +
760 +@@ -201,22 +150,25 @@ extern char __indirect_thunk_end[];
761 + * On VMEXIT we must ensure that no RSB predictions learned in the guest
762 + * can be followed in the host, by overwriting the RSB completely. Both
763 + * retpoline and IBRS mitigations for Spectre v2 need this; only on future
764 +- * CPUs with IBRS_ATT *might* it be avoided.
765 ++ * CPUs with IBRS_ALL *might* it be avoided.
766 + */
767 + static inline void vmexit_fill_RSB(void)
768 + {
769 + #ifdef CONFIG_RETPOLINE
770 +- unsigned long loops;
771 +-
772 +- asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
773 +- ALTERNATIVE("jmp 910f",
774 +- __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
775 +- X86_FEATURE_RETPOLINE)
776 +- "910:"
777 +- : "=r" (loops), ASM_CALL_CONSTRAINT
778 +- : : "memory" );
779 ++ alternative_input("",
780 ++ "call __fill_rsb",
781 ++ X86_FEATURE_RETPOLINE,
782 ++ ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory"));
783 + #endif
784 + }
785 +
786 ++static inline void indirect_branch_prediction_barrier(void)
787 ++{
788 ++ alternative_input("",
789 ++ "call __ibp_barrier",
790 ++ X86_FEATURE_USE_IBPB,
791 ++ ASM_NO_INPUT_CLOBBER("eax", "ecx", "edx", "memory"));
792 ++}
793 ++
794 + #endif /* __ASSEMBLY__ */
795 +-#endif /* __NOSPEC_BRANCH_H__ */
796 ++#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
797 +diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
798 +index ce245b0cdfca..0777e18a1d23 100644
799 +--- a/arch/x86/include/asm/pgtable_32_types.h
800 ++++ b/arch/x86/include/asm/pgtable_32_types.h
801 +@@ -44,8 +44,9 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
802 + */
803 + #define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40)
804 +
805 +-#define CPU_ENTRY_AREA_BASE \
806 +- ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
807 ++#define CPU_ENTRY_AREA_BASE \
808 ++ ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \
809 ++ & PMD_MASK)
810 +
811 + #define PKMAP_BASE \
812 + ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
813 +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
814 +index 9c18da64daa9..c57c6e77c29f 100644
815 +--- a/arch/x86/include/asm/processor.h
816 ++++ b/arch/x86/include/asm/processor.h
817 +@@ -459,8 +459,6 @@ struct thread_struct {
818 + unsigned short gsindex;
819 + #endif
820 +
821 +- u32 status; /* thread synchronous flags */
822 +-
823 + #ifdef CONFIG_X86_64
824 + unsigned long fsbase;
825 + unsigned long gsbase;
826 +@@ -970,4 +968,7 @@ bool xen_set_default_idle(void);
827 +
828 + void stop_this_cpu(void *dummy);
829 + void df_debug(struct pt_regs *regs, long error_code);
830 ++
831 ++void __ibp_barrier(void);
832 ++
833 + #endif /* _ASM_X86_PROCESSOR_H */
834 +diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
835 +index d91ba04dd007..fb3a6de7440b 100644
836 +--- a/arch/x86/include/asm/required-features.h
837 ++++ b/arch/x86/include/asm/required-features.h
838 +@@ -106,6 +106,7 @@
839 + #define REQUIRED_MASK15 0
840 + #define REQUIRED_MASK16 (NEED_LA57)
841 + #define REQUIRED_MASK17 0
842 +-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
843 ++#define REQUIRED_MASK18 0
844 ++#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
845 +
846 + #endif /* _ASM_X86_REQUIRED_FEATURES_H */
847 +diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
848 +index e3c95e8e61c5..03eedc21246d 100644
849 +--- a/arch/x86/include/asm/syscall.h
850 ++++ b/arch/x86/include/asm/syscall.h
851 +@@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
852 + * TS_COMPAT is set for 32-bit syscall entries and then
853 + * remains set until we return to user mode.
854 + */
855 +- if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
856 ++ if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
857 + /*
858 + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
859 + * and will match correctly in comparisons.
860 +@@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
861 + unsigned long *args)
862 + {
863 + # ifdef CONFIG_IA32_EMULATION
864 +- if (task->thread.status & TS_COMPAT)
865 ++ if (task->thread_info.status & TS_COMPAT)
866 + switch (i) {
867 + case 0:
868 + if (!n--) break;
869 +@@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
870 + const unsigned long *args)
871 + {
872 + # ifdef CONFIG_IA32_EMULATION
873 +- if (task->thread.status & TS_COMPAT)
874 ++ if (task->thread_info.status & TS_COMPAT)
875 + switch (i) {
876 + case 0:
877 + if (!n--) break;
878 +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
879 +index 00223333821a..eda3b6823ca4 100644
880 +--- a/arch/x86/include/asm/thread_info.h
881 ++++ b/arch/x86/include/asm/thread_info.h
882 +@@ -55,6 +55,7 @@ struct task_struct;
883 +
884 + struct thread_info {
885 + unsigned long flags; /* low level flags */
886 ++ u32 status; /* thread synchronous flags */
887 + };
888 +
889 + #define INIT_THREAD_INFO(tsk) \
890 +@@ -221,7 +222,7 @@ static inline int arch_within_stack_frames(const void * const stack,
891 + #define in_ia32_syscall() true
892 + #else
893 + #define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
894 +- current->thread.status & TS_COMPAT)
895 ++ current_thread_info()->status & TS_COMPAT)
896 + #endif
897 +
898 + /*
899 +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
900 +index 3effd3c994af..4405c4b308e8 100644
901 +--- a/arch/x86/include/asm/tlbflush.h
902 ++++ b/arch/x86/include/asm/tlbflush.h
903 +@@ -174,6 +174,8 @@ struct tlb_state {
904 + struct mm_struct *loaded_mm;
905 + u16 loaded_mm_asid;
906 + u16 next_asid;
907 ++ /* last user mm's ctx id */
908 ++ u64 last_ctx_id;
909 +
910 + /*
911 + * We can be in one of several states:
912 +diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
913 +index 574dff4d2913..aae77eb8491c 100644
914 +--- a/arch/x86/include/asm/uaccess.h
915 ++++ b/arch/x86/include/asm/uaccess.h
916 +@@ -124,6 +124,11 @@ extern int __get_user_bad(void);
917 +
918 + #define __uaccess_begin() stac()
919 + #define __uaccess_end() clac()
920 ++#define __uaccess_begin_nospec() \
921 ++({ \
922 ++ stac(); \
923 ++ barrier_nospec(); \
924 ++})
925 +
926 + /*
927 + * This is a type: either unsigned long, if the argument fits into
928 +@@ -445,7 +450,7 @@ do { \
929 + ({ \
930 + int __gu_err; \
931 + __inttype(*(ptr)) __gu_val; \
932 +- __uaccess_begin(); \
933 ++ __uaccess_begin_nospec(); \
934 + __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
935 + __uaccess_end(); \
936 + (x) = (__force __typeof__(*(ptr)))__gu_val; \
937 +@@ -487,6 +492,10 @@ struct __large_struct { unsigned long buf[100]; };
938 + __uaccess_begin(); \
939 + barrier();
940 +
941 ++#define uaccess_try_nospec do { \
942 ++ current->thread.uaccess_err = 0; \
943 ++ __uaccess_begin_nospec(); \
944 ++
945 + #define uaccess_catch(err) \
946 + __uaccess_end(); \
947 + (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \
948 +@@ -548,7 +557,7 @@ struct __large_struct { unsigned long buf[100]; };
949 + * get_user_ex(...);
950 + * } get_user_catch(err)
951 + */
952 +-#define get_user_try uaccess_try
953 ++#define get_user_try uaccess_try_nospec
954 + #define get_user_catch(err) uaccess_catch(err)
955 +
956 + #define get_user_ex(x, ptr) do { \
957 +@@ -582,7 +591,7 @@ extern void __cmpxchg_wrong_size(void)
958 + __typeof__(ptr) __uval = (uval); \
959 + __typeof__(*(ptr)) __old = (old); \
960 + __typeof__(*(ptr)) __new = (new); \
961 +- __uaccess_begin(); \
962 ++ __uaccess_begin_nospec(); \
963 + switch (size) { \
964 + case 1: \
965 + { \
966 +diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
967 +index 72950401b223..ba2dc1930630 100644
968 +--- a/arch/x86/include/asm/uaccess_32.h
969 ++++ b/arch/x86/include/asm/uaccess_32.h
970 +@@ -29,21 +29,21 @@ raw_copy_from_user(void *to, const void __user *from, unsigned long n)
971 + switch (n) {
972 + case 1:
973 + ret = 0;
974 +- __uaccess_begin();
975 ++ __uaccess_begin_nospec();
976 + __get_user_asm_nozero(*(u8 *)to, from, ret,
977 + "b", "b", "=q", 1);
978 + __uaccess_end();
979 + return ret;
980 + case 2:
981 + ret = 0;
982 +- __uaccess_begin();
983 ++ __uaccess_begin_nospec();
984 + __get_user_asm_nozero(*(u16 *)to, from, ret,
985 + "w", "w", "=r", 2);
986 + __uaccess_end();
987 + return ret;
988 + case 4:
989 + ret = 0;
990 +- __uaccess_begin();
991 ++ __uaccess_begin_nospec();
992 + __get_user_asm_nozero(*(u32 *)to, from, ret,
993 + "l", "k", "=r", 4);
994 + __uaccess_end();
995 +diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
996 +index f07ef3c575db..62546b3a398e 100644
997 +--- a/arch/x86/include/asm/uaccess_64.h
998 ++++ b/arch/x86/include/asm/uaccess_64.h
999 +@@ -55,31 +55,31 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
1000 + return copy_user_generic(dst, (__force void *)src, size);
1001 + switch (size) {
1002 + case 1:
1003 +- __uaccess_begin();
1004 ++ __uaccess_begin_nospec();
1005 + __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src,
1006 + ret, "b", "b", "=q", 1);
1007 + __uaccess_end();
1008 + return ret;
1009 + case 2:
1010 +- __uaccess_begin();
1011 ++ __uaccess_begin_nospec();
1012 + __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src,
1013 + ret, "w", "w", "=r", 2);
1014 + __uaccess_end();
1015 + return ret;
1016 + case 4:
1017 +- __uaccess_begin();
1018 ++ __uaccess_begin_nospec();
1019 + __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src,
1020 + ret, "l", "k", "=r", 4);
1021 + __uaccess_end();
1022 + return ret;
1023 + case 8:
1024 +- __uaccess_begin();
1025 ++ __uaccess_begin_nospec();
1026 + __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
1027 + ret, "q", "", "=r", 8);
1028 + __uaccess_end();
1029 + return ret;
1030 + case 10:
1031 +- __uaccess_begin();
1032 ++ __uaccess_begin_nospec();
1033 + __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
1034 + ret, "q", "", "=r", 10);
1035 + if (likely(!ret))
1036 +@@ -89,7 +89,7 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
1037 + __uaccess_end();
1038 + return ret;
1039 + case 16:
1040 +- __uaccess_begin();
1041 ++ __uaccess_begin_nospec();
1042 + __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
1043 + ret, "q", "", "=r", 16);
1044 + if (likely(!ret))
1045 +diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
1046 +index e0b97e4d1db5..21be0193d9dc 100644
1047 +--- a/arch/x86/kernel/alternative.c
1048 ++++ b/arch/x86/kernel/alternative.c
1049 +@@ -46,17 +46,6 @@ static int __init setup_noreplace_smp(char *str)
1050 + }
1051 + __setup("noreplace-smp", setup_noreplace_smp);
1052 +
1053 +-#ifdef CONFIG_PARAVIRT
1054 +-static int __initdata_or_module noreplace_paravirt = 0;
1055 +-
1056 +-static int __init setup_noreplace_paravirt(char *str)
1057 +-{
1058 +- noreplace_paravirt = 1;
1059 +- return 1;
1060 +-}
1061 +-__setup("noreplace-paravirt", setup_noreplace_paravirt);
1062 +-#endif
1063 +-
1064 + #define DPRINTK(fmt, args...) \
1065 + do { \
1066 + if (debug_alternative) \
1067 +@@ -298,7 +287,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
1068 + tgt_rip = next_rip + o_dspl;
1069 + n_dspl = tgt_rip - orig_insn;
1070 +
1071 +- DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
1072 ++ DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
1073 +
1074 + if (tgt_rip - orig_insn >= 0) {
1075 + if (n_dspl - 2 <= 127)
1076 +@@ -355,7 +344,7 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins
1077 + add_nops(instr + (a->instrlen - a->padlen), a->padlen);
1078 + local_irq_restore(flags);
1079 +
1080 +- DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
1081 ++ DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
1082 + instr, a->instrlen - a->padlen, a->padlen);
1083 + }
1084 +
1085 +@@ -376,7 +365,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
1086 + u8 *instr, *replacement;
1087 + u8 insnbuf[MAX_PATCH_LEN];
1088 +
1089 +- DPRINTK("alt table %p -> %p", start, end);
1090 ++ DPRINTK("alt table %px, -> %px", start, end);
1091 + /*
1092 + * The scan order should be from start to end. A later scanned
1093 + * alternative code can overwrite previously scanned alternative code.
1094 +@@ -400,14 +389,14 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
1095 + continue;
1096 + }
1097 +
1098 +- DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
1099 ++ DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",
1100 + a->cpuid >> 5,
1101 + a->cpuid & 0x1f,
1102 + instr, a->instrlen,
1103 + replacement, a->replacementlen, a->padlen);
1104 +
1105 +- DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
1106 +- DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
1107 ++ DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
1108 ++ DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
1109 +
1110 + memcpy(insnbuf, replacement, a->replacementlen);
1111 + insnbuf_sz = a->replacementlen;
1112 +@@ -433,7 +422,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
1113 + a->instrlen - a->replacementlen);
1114 + insnbuf_sz += a->instrlen - a->replacementlen;
1115 + }
1116 +- DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
1117 ++ DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);
1118 +
1119 + text_poke_early(instr, insnbuf, insnbuf_sz);
1120 + }
1121 +@@ -599,9 +588,6 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
1122 + struct paravirt_patch_site *p;
1123 + char insnbuf[MAX_PATCH_LEN];
1124 +
1125 +- if (noreplace_paravirt)
1126 +- return;
1127 +-
1128 + for (p = start; p < end; p++) {
1129 + unsigned int used;
1130 +
1131 +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
1132 +index 390b3dc3d438..71949bf2de5a 100644
1133 +--- a/arch/x86/kernel/cpu/bugs.c
1134 ++++ b/arch/x86/kernel/cpu/bugs.c
1135 +@@ -11,6 +11,7 @@
1136 + #include <linux/init.h>
1137 + #include <linux/utsname.h>
1138 + #include <linux/cpu.h>
1139 ++#include <linux/module.h>
1140 +
1141 + #include <asm/nospec-branch.h>
1142 + #include <asm/cmdline.h>
1143 +@@ -90,20 +91,41 @@ static const char *spectre_v2_strings[] = {
1144 + };
1145 +
1146 + #undef pr_fmt
1147 +-#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt
1148 ++#define pr_fmt(fmt) "Spectre V2 : " fmt
1149 +
1150 + static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
1151 +
1152 ++#ifdef RETPOLINE
1153 ++static bool spectre_v2_bad_module;
1154 ++
1155 ++bool retpoline_module_ok(bool has_retpoline)
1156 ++{
1157 ++ if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
1158 ++ return true;
1159 ++
1160 ++ pr_err("System may be vulnerable to spectre v2\n");
1161 ++ spectre_v2_bad_module = true;
1162 ++ return false;
1163 ++}
1164 ++
1165 ++static inline const char *spectre_v2_module_string(void)
1166 ++{
1167 ++ return spectre_v2_bad_module ? " - vulnerable module loaded" : "";
1168 ++}
1169 ++#else
1170 ++static inline const char *spectre_v2_module_string(void) { return ""; }
1171 ++#endif
1172 ++
1173 + static void __init spec2_print_if_insecure(const char *reason)
1174 + {
1175 + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1176 +- pr_info("%s\n", reason);
1177 ++ pr_info("%s selected on command line.\n", reason);
1178 + }
1179 +
1180 + static void __init spec2_print_if_secure(const char *reason)
1181 + {
1182 + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1183 +- pr_info("%s\n", reason);
1184 ++ pr_info("%s selected on command line.\n", reason);
1185 + }
1186 +
1187 + static inline bool retp_compiler(void)
1188 +@@ -118,42 +140,68 @@ static inline bool match_option(const char *arg, int arglen, const char *opt)
1189 + return len == arglen && !strncmp(arg, opt, len);
1190 + }
1191 +
1192 ++static const struct {
1193 ++ const char *option;
1194 ++ enum spectre_v2_mitigation_cmd cmd;
1195 ++ bool secure;
1196 ++} mitigation_options[] = {
1197 ++ { "off", SPECTRE_V2_CMD_NONE, false },
1198 ++ { "on", SPECTRE_V2_CMD_FORCE, true },
1199 ++ { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
1200 ++ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
1201 ++ { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
1202 ++ { "auto", SPECTRE_V2_CMD_AUTO, false },
1203 ++};
1204 ++
1205 + static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
1206 + {
1207 + char arg[20];
1208 +- int ret;
1209 +-
1210 +- ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
1211 +- sizeof(arg));
1212 +- if (ret > 0) {
1213 +- if (match_option(arg, ret, "off")) {
1214 +- goto disable;
1215 +- } else if (match_option(arg, ret, "on")) {
1216 +- spec2_print_if_secure("force enabled on command line.");
1217 +- return SPECTRE_V2_CMD_FORCE;
1218 +- } else if (match_option(arg, ret, "retpoline")) {
1219 +- spec2_print_if_insecure("retpoline selected on command line.");
1220 +- return SPECTRE_V2_CMD_RETPOLINE;
1221 +- } else if (match_option(arg, ret, "retpoline,amd")) {
1222 +- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
1223 +- pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
1224 +- return SPECTRE_V2_CMD_AUTO;
1225 +- }
1226 +- spec2_print_if_insecure("AMD retpoline selected on command line.");
1227 +- return SPECTRE_V2_CMD_RETPOLINE_AMD;
1228 +- } else if (match_option(arg, ret, "retpoline,generic")) {
1229 +- spec2_print_if_insecure("generic retpoline selected on command line.");
1230 +- return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
1231 +- } else if (match_option(arg, ret, "auto")) {
1232 ++ int ret, i;
1233 ++ enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
1234 ++
1235 ++ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
1236 ++ return SPECTRE_V2_CMD_NONE;
1237 ++ else {
1238 ++ ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
1239 ++ sizeof(arg));
1240 ++ if (ret < 0)
1241 ++ return SPECTRE_V2_CMD_AUTO;
1242 ++
1243 ++ for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
1244 ++ if (!match_option(arg, ret, mitigation_options[i].option))
1245 ++ continue;
1246 ++ cmd = mitigation_options[i].cmd;
1247 ++ break;
1248 ++ }
1249 ++
1250 ++ if (i >= ARRAY_SIZE(mitigation_options)) {
1251 ++ pr_err("unknown option (%s). Switching to AUTO select\n",
1252 ++ mitigation_options[i].option);
1253 + return SPECTRE_V2_CMD_AUTO;
1254 + }
1255 + }
1256 +
1257 +- if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
1258 ++ if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
1259 ++ cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
1260 ++ cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
1261 ++ !IS_ENABLED(CONFIG_RETPOLINE)) {
1262 ++ pr_err("%s selected but not compiled in. Switching to AUTO select\n",
1263 ++ mitigation_options[i].option);
1264 + return SPECTRE_V2_CMD_AUTO;
1265 +-disable:
1266 +- spec2_print_if_insecure("disabled on command line.");
1267 +- return SPECTRE_V2_CMD_NONE;
1268 ++ }
1269 ++
1270 ++ if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
1271 ++ boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
1272 ++ pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
1273 ++ return SPECTRE_V2_CMD_AUTO;
1274 ++ }
1275 ++
1276 ++ if (mitigation_options[i].secure)
1277 ++ spec2_print_if_secure(mitigation_options[i].option);
1278 ++ else
1279 ++ spec2_print_if_insecure(mitigation_options[i].option);
1280 ++
1281 ++ return cmd;
1282 + }
1283 +
1284 + /* Check for Skylake-like CPUs (for RSB handling) */
1285 +@@ -191,10 +239,10 @@ static void __init spectre_v2_select_mitigation(void)
1286 + return;
1287 +
1288 + case SPECTRE_V2_CMD_FORCE:
1289 +- /* FALLTRHU */
1290 + case SPECTRE_V2_CMD_AUTO:
1291 +- goto retpoline_auto;
1292 +-
1293 ++ if (IS_ENABLED(CONFIG_RETPOLINE))
1294 ++ goto retpoline_auto;
1295 ++ break;
1296 + case SPECTRE_V2_CMD_RETPOLINE_AMD:
1297 + if (IS_ENABLED(CONFIG_RETPOLINE))
1298 + goto retpoline_amd;
1299 +@@ -249,6 +297,12 @@ static void __init spectre_v2_select_mitigation(void)
1300 + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
1301 + pr_info("Filling RSB on context switch\n");
1302 + }
1303 ++
1304 ++ /* Initialize Indirect Branch Prediction Barrier if supported */
1305 ++ if (boot_cpu_has(X86_FEATURE_IBPB)) {
1306 ++ setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
1307 ++ pr_info("Enabling Indirect Branch Prediction Barrier\n");
1308 ++ }
1309 + }
1310 +
1311 + #undef pr_fmt
1312 +@@ -269,7 +323,7 @@ ssize_t cpu_show_spectre_v1(struct device *dev,
1313 + {
1314 + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
1315 + return sprintf(buf, "Not affected\n");
1316 +- return sprintf(buf, "Vulnerable\n");
1317 ++ return sprintf(buf, "Mitigation: __user pointer sanitization\n");
1318 + }
1319 +
1320 + ssize_t cpu_show_spectre_v2(struct device *dev,
1321 +@@ -278,6 +332,14 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
1322 + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1323 + return sprintf(buf, "Not affected\n");
1324 +
1325 +- return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]);
1326 ++ return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
1327 ++ boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
1328 ++ spectre_v2_module_string());
1329 + }
1330 + #endif
1331 ++
1332 ++void __ibp_barrier(void)
1333 ++{
1334 ++ __wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, 0);
1335 ++}
1336 ++EXPORT_SYMBOL_GPL(__ibp_barrier);
1337 +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1338 +index 372ba3fb400f..92b66e21bae5 100644
1339 +--- a/arch/x86/kernel/cpu/common.c
1340 ++++ b/arch/x86/kernel/cpu/common.c
1341 +@@ -47,6 +47,8 @@
1342 + #include <asm/pat.h>
1343 + #include <asm/microcode.h>
1344 + #include <asm/microcode_intel.h>
1345 ++#include <asm/intel-family.h>
1346 ++#include <asm/cpu_device_id.h>
1347 +
1348 + #ifdef CONFIG_X86_LOCAL_APIC
1349 + #include <asm/uv/uv.h>
1350 +@@ -724,6 +726,26 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
1351 + }
1352 + }
1353 +
1354 ++static void init_speculation_control(struct cpuinfo_x86 *c)
1355 ++{
1356 ++ /*
1357 ++ * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
1358 ++ * and they also have a different bit for STIBP support. Also,
1359 ++ * a hypervisor might have set the individual AMD bits even on
1360 ++ * Intel CPUs, for finer-grained selection of what's available.
1361 ++ *
1362 ++ * We use the AMD bits in 0x8000_0008 EBX as the generic hardware
1363 ++ * features, which are visible in /proc/cpuinfo and used by the
1364 ++ * kernel. So set those accordingly from the Intel bits.
1365 ++ */
1366 ++ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
1367 ++ set_cpu_cap(c, X86_FEATURE_IBRS);
1368 ++ set_cpu_cap(c, X86_FEATURE_IBPB);
1369 ++ }
1370 ++ if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
1371 ++ set_cpu_cap(c, X86_FEATURE_STIBP);
1372 ++}
1373 ++
1374 + void get_cpu_cap(struct cpuinfo_x86 *c)
1375 + {
1376 + u32 eax, ebx, ecx, edx;
1377 +@@ -745,6 +767,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
1378 + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
1379 + c->x86_capability[CPUID_7_0_EBX] = ebx;
1380 + c->x86_capability[CPUID_7_ECX] = ecx;
1381 ++ c->x86_capability[CPUID_7_EDX] = edx;
1382 + }
1383 +
1384 + /* Extended state features: level 0x0000000d */
1385 +@@ -817,6 +840,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
1386 + c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
1387 +
1388 + init_scattered_cpuid_features(c);
1389 ++ init_speculation_control(c);
1390 +
1391 + /*
1392 + * Clear/Set all flags overridden by options, after probe.
1393 +@@ -852,6 +876,41 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
1394 + #endif
1395 + }
1396 +
1397 ++static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
1398 ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
1399 ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
1400 ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
1401 ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
1402 ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
1403 ++ { X86_VENDOR_CENTAUR, 5 },
1404 ++ { X86_VENDOR_INTEL, 5 },
1405 ++ { X86_VENDOR_NSC, 5 },
1406 ++ { X86_VENDOR_ANY, 4 },
1407 ++ {}
1408 ++};
1409 ++
1410 ++static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
1411 ++ { X86_VENDOR_AMD },
1412 ++ {}
1413 ++};
1414 ++
1415 ++static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c)
1416 ++{
1417 ++ u64 ia32_cap = 0;
1418 ++
1419 ++ if (x86_match_cpu(cpu_no_meltdown))
1420 ++ return false;
1421 ++
1422 ++ if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
1423 ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
1424 ++
1425 ++ /* Rogue Data Cache Load? No! */
1426 ++ if (ia32_cap & ARCH_CAP_RDCL_NO)
1427 ++ return false;
1428 ++
1429 ++ return true;
1430 ++}
1431 ++
1432 + /*
1433 + * Do minimum CPU detection early.
1434 + * Fields really needed: vendor, cpuid_level, family, model, mask,
1435 +@@ -899,11 +958,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
1436 +
1437 + setup_force_cpu_cap(X86_FEATURE_ALWAYS);
1438 +
1439 +- if (c->x86_vendor != X86_VENDOR_AMD)
1440 +- setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
1441 +-
1442 +- setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
1443 +- setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
1444 ++ if (!x86_match_cpu(cpu_no_speculation)) {
1445 ++ if (cpu_vulnerable_to_meltdown(c))
1446 ++ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
1447 ++ setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
1448 ++ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
1449 ++ }
1450 +
1451 + fpu__init_system(c);
1452 +
1453 +diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
1454 +index b720dacac051..4cf4f8cbc69d 100644
1455 +--- a/arch/x86/kernel/cpu/intel.c
1456 ++++ b/arch/x86/kernel/cpu/intel.c
1457 +@@ -102,6 +102,59 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
1458 + ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
1459 + }
1460 +
1461 ++/*
1462 ++ * Early microcode releases for the Spectre v2 mitigation were broken.
1463 ++ * Information taken from;
1464 ++ * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf
1465 ++ * - https://kb.vmware.com/s/article/52345
1466 ++ * - Microcode revisions observed in the wild
1467 ++ * - Release note from 20180108 microcode release
1468 ++ */
1469 ++struct sku_microcode {
1470 ++ u8 model;
1471 ++ u8 stepping;
1472 ++ u32 microcode;
1473 ++};
1474 ++static const struct sku_microcode spectre_bad_microcodes[] = {
1475 ++ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 },
1476 ++ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 },
1477 ++ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 },
1478 ++ { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 },
1479 ++ { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 },
1480 ++ { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
1481 ++ { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
1482 ++ { INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 },
1483 ++ { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 },
1484 ++ { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 },
1485 ++ { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b },
1486 ++ { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 },
1487 ++ { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 },
1488 ++ { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
1489 ++ { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 },
1490 ++ { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 },
1491 ++ { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 },
1492 ++ { INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
1493 ++ { INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
1494 ++ { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
1495 ++ /* Updated in the 20180108 release; blacklist until we know otherwise */
1496 ++ { INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 },
1497 ++ /* Observed in the wild */
1498 ++ { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
1499 ++ { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
1500 ++};
1501 ++
1502 ++static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
1503 ++{
1504 ++ int i;
1505 ++
1506 ++ for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
1507 ++ if (c->x86_model == spectre_bad_microcodes[i].model &&
1508 ++ c->x86_mask == spectre_bad_microcodes[i].stepping)
1509 ++ return (c->microcode <= spectre_bad_microcodes[i].microcode);
1510 ++ }
1511 ++ return false;
1512 ++}
1513 ++
1514 + static void early_init_intel(struct cpuinfo_x86 *c)
1515 + {
1516 + u64 misc_enable;
1517 +@@ -122,6 +175,19 @@ static void early_init_intel(struct cpuinfo_x86 *c)
1518 + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
1519 + c->microcode = intel_get_microcode_revision();
1520 +
1521 ++ /* Now if any of them are set, check the blacklist and clear the lot */
1522 ++ if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
1523 ++ cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
1524 ++ cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
1525 ++ cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
1526 ++ pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
1527 ++ setup_clear_cpu_cap(X86_FEATURE_IBRS);
1528 ++ setup_clear_cpu_cap(X86_FEATURE_IBPB);
1529 ++ setup_clear_cpu_cap(X86_FEATURE_STIBP);
1530 ++ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
1531 ++ setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
1532 ++ }
1533 ++
1534 + /*
1535 + * Atom erratum AAE44/AAF40/AAG38/AAH41:
1536 + *
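The blacklist above keys on the model/stepping/microcode triple, and all three
values are also exported in /proc/cpuinfo ("model", "stepping", "microcode").
A rough userspace approximation of bad_spectre_microcode(), offered only as a
sketch -- a single table row is copied here (Skylake-X, stepping 3); the rest
would have to be filled in from the list above:

    /* Sketch: compare this CPU's model/stepping/microcode from /proc/cpuinfo
     * against one entry of the blacklist above. Family 6 is assumed. */
    #include <stdio.h>
    #include <string.h>

    struct sku { unsigned int model, stepping, microcode; };

    /* Example row only: Skylake-X stepping 3, bad up to revision 0x0100013e. */
    static const struct sku bad[] = { { 0x55, 0x03, 0x0100013e } };

    int main(void)
    {
            unsigned int model = 0, stepping = 0, microcode = 0;
            char line[256];
            FILE *f = fopen("/proc/cpuinfo", "r");

            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f)) {
                    if (!strncmp(line, "model name", 10))
                            continue;
                    sscanf(line, "model : %u", &model);
                    sscanf(line, "stepping : %u", &stepping);
                    sscanf(line, "microcode : %x", &microcode);
                    if (microcode)
                            break;  /* first CPU is enough for this sketch */
            }
            fclose(f);

            for (unsigned int i = 0; i < sizeof(bad) / sizeof(bad[0]); i++)
                    if (model == bad[i].model && stepping == bad[i].stepping &&
                        microcode <= bad[i].microcode)
                            printf("microcode 0x%x is on the blacklist\n",
                                   microcode);
            return 0;
    }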
1537 +diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
1538 +index d0e69769abfd..df11f5d604be 100644
1539 +--- a/arch/x86/kernel/cpu/scattered.c
1540 ++++ b/arch/x86/kernel/cpu/scattered.c
1541 +@@ -21,8 +21,6 @@ struct cpuid_bit {
1542 + static const struct cpuid_bit cpuid_bits[] = {
1543 + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
1544 + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
1545 +- { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 },
1546 +- { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 },
1547 + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
1548 + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
1549 + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
1550 +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
1551 +index c75466232016..9eb448c7859d 100644
1552 +--- a/arch/x86/kernel/process_64.c
1553 ++++ b/arch/x86/kernel/process_64.c
1554 +@@ -557,7 +557,7 @@ static void __set_personality_x32(void)
1555 + * Pretend to come from a x32 execve.
1556 + */
1557 + task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
1558 +- current->thread.status &= ~TS_COMPAT;
1559 ++ current_thread_info()->status &= ~TS_COMPAT;
1560 + #endif
1561 + }
1562 +
1563 +@@ -571,7 +571,7 @@ static void __set_personality_ia32(void)
1564 + current->personality |= force_personality32;
1565 + /* Prepare the first "return" to user space */
1566 + task_pt_regs(current)->orig_ax = __NR_ia32_execve;
1567 +- current->thread.status |= TS_COMPAT;
1568 ++ current_thread_info()->status |= TS_COMPAT;
1569 + #endif
1570 + }
1571 +
1572 +diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
1573 +index f37d18124648..ed5c4cdf0a34 100644
1574 +--- a/arch/x86/kernel/ptrace.c
1575 ++++ b/arch/x86/kernel/ptrace.c
1576 +@@ -935,7 +935,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
1577 + */
1578 + regs->orig_ax = value;
1579 + if (syscall_get_nr(child, regs) >= 0)
1580 +- child->thread.status |= TS_I386_REGS_POKED;
1581 ++ child->thread_info.status |= TS_I386_REGS_POKED;
1582 + break;
1583 +
1584 + case offsetof(struct user32, regs.eflags):
1585 +diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
1586 +index b9e00e8f1c9b..4cdc0b27ec82 100644
1587 +--- a/arch/x86/kernel/signal.c
1588 ++++ b/arch/x86/kernel/signal.c
1589 +@@ -787,7 +787,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
1590 + * than the tracee.
1591 + */
1592 + #ifdef CONFIG_IA32_EMULATION
1593 +- if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
1594 ++ if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
1595 + return __NR_ia32_restart_syscall;
1596 + #endif
1597 + #ifdef CONFIG_X86_X32_ABI
1598 +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
1599 +index 0099e10eb045..13f5d4217e4f 100644
1600 +--- a/arch/x86/kvm/cpuid.c
1601 ++++ b/arch/x86/kvm/cpuid.c
1602 +@@ -67,9 +67,7 @@ u64 kvm_supported_xcr0(void)
1603 +
1604 + #define F(x) bit(X86_FEATURE_##x)
1605 +
1606 +-/* These are scattered features in cpufeatures.h. */
1607 +-#define KVM_CPUID_BIT_AVX512_4VNNIW 2
1608 +-#define KVM_CPUID_BIT_AVX512_4FMAPS 3
1609 ++/* For scattered features from cpufeatures.h; we currently expose none */
1610 + #define KF(x) bit(KVM_CPUID_BIT_##x)
1611 +
1612 + int kvm_update_cpuid(struct kvm_vcpu *vcpu)
1613 +@@ -367,6 +365,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1614 + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
1615 + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
1616 +
1617 ++ /* cpuid 0x80000008.ebx */
1618 ++ const u32 kvm_cpuid_8000_0008_ebx_x86_features =
1619 ++ F(IBPB) | F(IBRS);
1620 ++
1621 + /* cpuid 0xC0000001.edx */
1622 + const u32 kvm_cpuid_C000_0001_edx_x86_features =
1623 + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
1624 +@@ -392,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1625 +
1626 + /* cpuid 7.0.edx*/
1627 + const u32 kvm_cpuid_7_0_edx_x86_features =
1628 +- KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
1629 ++ F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
1630 ++ F(ARCH_CAPABILITIES);
1631 +
1632 + /* all calls to cpuid_count() should be made on the same cpu */
1633 + get_cpu();
1634 +@@ -477,7 +480,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1635 + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
1636 + entry->ecx &= ~F(PKU);
1637 + entry->edx &= kvm_cpuid_7_0_edx_x86_features;
1638 +- entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
1639 ++ cpuid_mask(&entry->edx, CPUID_7_EDX);
1640 + } else {
1641 + entry->ebx = 0;
1642 + entry->ecx = 0;
1643 +@@ -627,7 +630,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1644 + if (!g_phys_as)
1645 + g_phys_as = phys_as;
1646 + entry->eax = g_phys_as | (virt_as << 8);
1647 +- entry->ebx = entry->edx = 0;
1648 ++ entry->edx = 0;
1649 ++ /* IBRS and IBPB aren't necessarily present in hardware cpuid */
1650 ++ if (boot_cpu_has(X86_FEATURE_IBPB))
1651 ++ entry->ebx |= F(IBPB);
1652 ++ if (boot_cpu_has(X86_FEATURE_IBRS))
1653 ++ entry->ebx |= F(IBRS);
1654 ++ entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
1655 ++ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
1656 + break;
1657 + }
1658 + case 0x80000019:
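With SPEC_CTRL and ARCH_CAPABILITIES added to kvm_cpuid_7_0_edx_x86_features,
a guest discovers the new controls the same way bare metal does, via
CPUID.(EAX=7,ECX=0):EDX. A small host-side probe, assuming a reasonably recent
GCC/clang <cpuid.h> and the architectural bit positions (26 = SPEC_CTRL,
27 = STIBP, 29 = ARCH_CAPABILITIES):

    /* Sketch: check the leaf 7 EDX bits that this patch lets KVM expose.
     * Bit numbers follow the architectural definition; illustrative only. */
    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
                    return 1;
            printf("SPEC_CTRL (IBRS/IBPB): %s\n", (edx & (1u << 26)) ? "yes" : "no");
            printf("STIBP:                 %s\n", (edx & (1u << 27)) ? "yes" : "no");
            printf("ARCH_CAPABILITIES:     %s\n", (edx & (1u << 29)) ? "yes" : "no");
            return 0;
    }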
1659 +diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
1660 +index c2cea6651279..9a327d5b6d1f 100644
1661 +--- a/arch/x86/kvm/cpuid.h
1662 ++++ b/arch/x86/kvm/cpuid.h
1663 +@@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
1664 + [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
1665 + [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
1666 + [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
1667 ++ [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
1668 + };
1669 +
1670 + static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
1671 +diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
1672 +index eca6a89f2326..fab073b19528 100644
1673 +--- a/arch/x86/kvm/emulate.c
1674 ++++ b/arch/x86/kvm/emulate.c
1675 +@@ -25,6 +25,7 @@
1676 + #include <asm/kvm_emulate.h>
1677 + #include <linux/stringify.h>
1678 + #include <asm/debugreg.h>
1679 ++#include <asm/nospec-branch.h>
1680 +
1681 + #include "x86.h"
1682 + #include "tss.h"
1683 +@@ -1021,8 +1022,8 @@ static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
1684 + void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
1685 +
1686 + flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
1687 +- asm("push %[flags]; popf; call *%[fastop]"
1688 +- : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags));
1689 ++ asm("push %[flags]; popf; " CALL_NOSPEC
1690 ++ : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags));
1691 + return rc;
1692 + }
1693 +
1694 +@@ -5350,9 +5351,9 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
1695 + if (!(ctxt->d & ByteOp))
1696 + fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
1697 +
1698 +- asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
1699 ++ asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
1700 + : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
1701 +- [fastop]"+S"(fop), ASM_CALL_CONSTRAINT
1702 ++ [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
1703 + : "c"(ctxt->src2.val));
1704 +
1705 + ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
1706 +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
1707 +index 6a8284f72328..e0bc3ad0f6cd 100644
1708 +--- a/arch/x86/kvm/svm.c
1709 ++++ b/arch/x86/kvm/svm.c
1710 +@@ -184,6 +184,8 @@ struct vcpu_svm {
1711 + u64 gs_base;
1712 + } host;
1713 +
1714 ++ u64 spec_ctrl;
1715 ++
1716 + u32 *msrpm;
1717 +
1718 + ulong nmi_iret_rip;
1719 +@@ -249,6 +251,8 @@ static const struct svm_direct_access_msrs {
1720 + { .index = MSR_CSTAR, .always = true },
1721 + { .index = MSR_SYSCALL_MASK, .always = true },
1722 + #endif
1723 ++ { .index = MSR_IA32_SPEC_CTRL, .always = false },
1724 ++ { .index = MSR_IA32_PRED_CMD, .always = false },
1725 + { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
1726 + { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
1727 + { .index = MSR_IA32_LASTINTFROMIP, .always = false },
1728 +@@ -529,6 +533,7 @@ struct svm_cpu_data {
1729 + struct kvm_ldttss_desc *tss_desc;
1730 +
1731 + struct page *save_area;
1732 ++ struct vmcb *current_vmcb;
1733 + };
1734 +
1735 + static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
1736 +@@ -880,6 +885,25 @@ static bool valid_msr_intercept(u32 index)
1737 + return false;
1738 + }
1739 +
1740 ++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
1741 ++{
1742 ++ u8 bit_write;
1743 ++ unsigned long tmp;
1744 ++ u32 offset;
1745 ++ u32 *msrpm;
1746 ++
1747 ++ msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
1748 ++ to_svm(vcpu)->msrpm;
1749 ++
1750 ++ offset = svm_msrpm_offset(msr);
1751 ++ bit_write = 2 * (msr & 0x0f) + 1;
1752 ++ tmp = msrpm[offset];
1753 ++
1754 ++ BUG_ON(offset == MSR_INVALID);
1755 ++
1756 ++ return !!test_bit(bit_write, &tmp);
1757 ++}
1758 ++
1759 + static void set_msr_interception(u32 *msrpm, unsigned msr,
1760 + int read, int write)
1761 + {
1762 +@@ -1585,6 +1609,8 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1763 + u32 dummy;
1764 + u32 eax = 1;
1765 +
1766 ++ svm->spec_ctrl = 0;
1767 ++
1768 + if (!init_event) {
1769 + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1770 + MSR_IA32_APICBASE_ENABLE;
1771 +@@ -1706,11 +1732,17 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1772 + __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1773 + kvm_vcpu_uninit(vcpu);
1774 + kmem_cache_free(kvm_vcpu_cache, svm);
1775 ++ /*
1776 ++ * The vmcb page can be recycled, causing a false negative in
1777 ++ * svm_vcpu_load(). So do a full IBPB now.
1778 ++ */
1779 ++ indirect_branch_prediction_barrier();
1780 + }
1781 +
1782 + static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1783 + {
1784 + struct vcpu_svm *svm = to_svm(vcpu);
1785 ++ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1786 + int i;
1787 +
1788 + if (unlikely(cpu != vcpu->cpu)) {
1789 +@@ -1739,6 +1771,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1790 + if (static_cpu_has(X86_FEATURE_RDTSCP))
1791 + wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
1792 +
1793 ++ if (sd->current_vmcb != svm->vmcb) {
1794 ++ sd->current_vmcb = svm->vmcb;
1795 ++ indirect_branch_prediction_barrier();
1796 ++ }
1797 + avic_vcpu_load(vcpu, cpu);
1798 + }
1799 +
1800 +@@ -3579,6 +3615,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1801 + case MSR_VM_CR:
1802 + msr_info->data = svm->nested.vm_cr_msr;
1803 + break;
1804 ++ case MSR_IA32_SPEC_CTRL:
1805 ++ if (!msr_info->host_initiated &&
1806 ++ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
1807 ++ return 1;
1808 ++
1809 ++ msr_info->data = svm->spec_ctrl;
1810 ++ break;
1811 + case MSR_IA32_UCODE_REV:
1812 + msr_info->data = 0x01000065;
1813 + break;
1814 +@@ -3670,6 +3713,49 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1815 + case MSR_IA32_TSC:
1816 + kvm_write_tsc(vcpu, msr);
1817 + break;
1818 ++ case MSR_IA32_SPEC_CTRL:
1819 ++ if (!msr->host_initiated &&
1820 ++ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
1821 ++ return 1;
1822 ++
1823 ++ /* The STIBP bit doesn't fault even if it's not advertised */
1824 ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
1825 ++ return 1;
1826 ++
1827 ++ svm->spec_ctrl = data;
1828 ++
1829 ++ if (!data)
1830 ++ break;
1831 ++
1832 ++ /*
1833 ++ * For non-nested:
1834 ++ * When it's written (to non-zero) for the first time, pass
1835 ++ * it through.
1836 ++ *
1837 ++ * For nested:
1838 ++ * The handling of the MSR bitmap for L2 guests is done in
1839 ++ * nested_svm_vmrun_msrpm.
1840 ++ * We update the L1 MSR bit as well since it will end up
1841 ++ * touching the MSR anyway now.
1842 ++ */
1843 ++ set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1844 ++ break;
1845 ++ case MSR_IA32_PRED_CMD:
1846 ++ if (!msr->host_initiated &&
1847 ++ !guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
1848 ++ return 1;
1849 ++
1850 ++ if (data & ~PRED_CMD_IBPB)
1851 ++ return 1;
1852 ++
1853 ++ if (!data)
1854 ++ break;
1855 ++
1856 ++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
1857 ++ if (is_guest_mode(vcpu))
1858 ++ break;
1859 ++ set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
1860 ++ break;
1861 + case MSR_STAR:
1862 + svm->vmcb->save.star = data;
1863 + break;
1864 +@@ -4922,6 +5008,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
1865 +
1866 + local_irq_enable();
1867 +
1868 ++ /*
1869 ++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
1870 ++ * it's non-zero. Since vmentry is serialising on affected CPUs, there
1871 ++ * is no need to worry about the conditional branch over the wrmsr
1872 ++ * being speculatively taken.
1873 ++ */
1874 ++ if (svm->spec_ctrl)
1875 ++ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
1876 ++
1877 + asm volatile (
1878 + "push %%" _ASM_BP "; \n\t"
1879 + "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
1880 +@@ -5014,6 +5109,27 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
1881 + #endif
1882 + );
1883 +
1884 ++ /*
1885 ++ * We do not use IBRS in the kernel. If this vCPU has used the
1886 ++ * SPEC_CTRL MSR it may have left it on; save the value and
1887 ++ * turn it off. This is much more efficient than blindly adding
1888 ++ * it to the atomic save/restore list. Especially as the former
1889 ++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
1890 ++ *
1891 ++ * For non-nested case:
1892 ++ * If the L01 MSR bitmap does not intercept the MSR, then we need to
1893 ++ * save it.
1894 ++ *
1895 ++ * For nested case:
1896 ++ * If the L02 MSR bitmap does not intercept the MSR, then we need to
1897 ++ * save it.
1898 ++ */
1899 ++ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
1900 ++ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
1901 ++
1902 ++ if (svm->spec_ctrl)
1903 ++ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
1904 ++
1905 + /* Eliminate branch target predictions from guest mode */
1906 + vmexit_fill_RSB();
1907 +
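The SVM run loop above (and the VMX one further down) follows the same lazy
IA32_SPEC_CTRL discipline: the host always runs with the MSR at 0, the guest
value is written only when it is non-zero, and it is read back after the exit
only when the guest was given direct, non-intercepted access. A stand-alone
sketch of that control flow, with the MSR stubbed out so it can be compiled
and stepped through in userspace:

    /* Illustrative model of the lazy SPEC_CTRL handling above; the "MSR" is
     * simulated so the control flow can be run outside the kernel. */
    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    static uint64_t hw_spec_ctrl;                   /* stands in for the real MSR */

    static void wrmsr_spec_ctrl(uint64_t v) { hw_spec_ctrl = v; }
    static uint64_t rdmsr_spec_ctrl(void)   { return hw_spec_ctrl; }

    struct vcpu {
            uint64_t spec_ctrl;     /* last value the guest wrote */
            bool passthrough;       /* MSR no longer write-intercepted */
    };

    static void run_guest(struct vcpu *v)
    {
            if (v->spec_ctrl)                       /* restore guest value if needed */
                    wrmsr_spec_ctrl(v->spec_ctrl);

            /* ... VMRUN/VMRESUME would happen here; once passthrough is
             * enabled the guest may write the MSR directly ... */

            if (v->passthrough)                     /* guest could have changed it */
                    v->spec_ctrl = rdmsr_spec_ctrl();
            if (v->spec_ctrl)                       /* host always runs with 0 */
                    wrmsr_spec_ctrl(0);
    }

    int main(void)
    {
            struct vcpu v = { .spec_ctrl = 1 /* IBRS */, .passthrough = true };

            run_guest(&v);
            printf("host SPEC_CTRL after exit: %llu\n",
                   (unsigned long long)hw_spec_ctrl);
            return 0;
    }

The real code additionally flips the MSR permission bitmap the first time the
guest writes a non-zero value, so later guest accesses no longer cause exits.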
1908 +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
1909 +index a45063a9219c..0ae4b1a86168 100644
1910 +--- a/arch/x86/kvm/vmx.c
1911 ++++ b/arch/x86/kvm/vmx.c
1912 +@@ -34,6 +34,7 @@
1913 + #include <linux/tboot.h>
1914 + #include <linux/hrtimer.h>
1915 + #include <linux/frame.h>
1916 ++#include <linux/nospec.h>
1917 + #include "kvm_cache_regs.h"
1918 + #include "x86.h"
1919 +
1920 +@@ -108,6 +109,14 @@ static u64 __read_mostly host_xss;
1921 + static bool __read_mostly enable_pml = 1;
1922 + module_param_named(pml, enable_pml, bool, S_IRUGO);
1923 +
1924 ++#define MSR_TYPE_R 1
1925 ++#define MSR_TYPE_W 2
1926 ++#define MSR_TYPE_RW 3
1927 ++
1928 ++#define MSR_BITMAP_MODE_X2APIC 1
1929 ++#define MSR_BITMAP_MODE_X2APIC_APICV 2
1930 ++#define MSR_BITMAP_MODE_LM 4
1931 ++
1932 + #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
1933 +
1934 + /* Guest_tsc -> host_tsc conversion requires 64-bit division. */
1935 +@@ -182,7 +191,6 @@ module_param(ple_window_max, int, S_IRUGO);
1936 + extern const ulong vmx_return;
1937 +
1938 + #define NR_AUTOLOAD_MSRS 8
1939 +-#define VMCS02_POOL_SIZE 1
1940 +
1941 + struct vmcs {
1942 + u32 revision_id;
1943 +@@ -207,6 +215,7 @@ struct loaded_vmcs {
1944 + int soft_vnmi_blocked;
1945 + ktime_t entry_time;
1946 + s64 vnmi_blocked_time;
1947 ++ unsigned long *msr_bitmap;
1948 + struct list_head loaded_vmcss_on_cpu_link;
1949 + };
1950 +
1951 +@@ -223,7 +232,7 @@ struct shared_msr_entry {
1952 + * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
1953 + * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
1954 + * More than one of these structures may exist, if L1 runs multiple L2 guests.
1955 +- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
1956 ++ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
1957 + * underlying hardware which will be used to run L2.
1958 + * This structure is packed to ensure that its layout is identical across
1959 + * machines (necessary for live migration).
1960 +@@ -406,13 +415,6 @@ struct __packed vmcs12 {
1961 + */
1962 + #define VMCS12_SIZE 0x1000
1963 +
1964 +-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
1965 +-struct vmcs02_list {
1966 +- struct list_head list;
1967 +- gpa_t vmptr;
1968 +- struct loaded_vmcs vmcs02;
1969 +-};
1970 +-
1971 + /*
1972 + * The nested_vmx structure is part of vcpu_vmx, and holds information we need
1973 + * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
1974 +@@ -437,15 +439,15 @@ struct nested_vmx {
1975 + */
1976 + bool sync_shadow_vmcs;
1977 +
1978 +- /* vmcs02_list cache of VMCSs recently used to run L2 guests */
1979 +- struct list_head vmcs02_pool;
1980 +- int vmcs02_num;
1981 + bool change_vmcs01_virtual_x2apic_mode;
1982 + /* L2 must run next, and mustn't decide to exit to L1. */
1983 + bool nested_run_pending;
1984 ++
1985 ++ struct loaded_vmcs vmcs02;
1986 ++
1987 + /*
1988 +- * Guest pages referred to in vmcs02 with host-physical pointers, so
1989 +- * we must keep them pinned while L2 runs.
1990 ++ * Guest pages referred to in the vmcs02 with host-physical
1991 ++ * pointers, so we must keep them pinned while L2 runs.
1992 + */
1993 + struct page *apic_access_page;
1994 + struct page *virtual_apic_page;
1995 +@@ -454,8 +456,6 @@ struct nested_vmx {
1996 + bool pi_pending;
1997 + u16 posted_intr_nv;
1998 +
1999 +- unsigned long *msr_bitmap;
2000 +-
2001 + struct hrtimer preemption_timer;
2002 + bool preemption_timer_expired;
2003 +
2004 +@@ -570,6 +570,7 @@ struct vcpu_vmx {
2005 + struct kvm_vcpu vcpu;
2006 + unsigned long host_rsp;
2007 + u8 fail;
2008 ++ u8 msr_bitmap_mode;
2009 + u32 exit_intr_info;
2010 + u32 idt_vectoring_info;
2011 + ulong rflags;
2012 +@@ -581,6 +582,10 @@ struct vcpu_vmx {
2013 + u64 msr_host_kernel_gs_base;
2014 + u64 msr_guest_kernel_gs_base;
2015 + #endif
2016 ++
2017 ++ u64 arch_capabilities;
2018 ++ u64 spec_ctrl;
2019 ++
2020 + u32 vm_entry_controls_shadow;
2021 + u32 vm_exit_controls_shadow;
2022 + u32 secondary_exec_control;
2023 +@@ -887,21 +892,18 @@ static const unsigned short vmcs_field_to_offset_table[] = {
2024 +
2025 + static inline short vmcs_field_to_offset(unsigned long field)
2026 + {
2027 +- BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
2028 ++ const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
2029 ++ unsigned short offset;
2030 +
2031 +- if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
2032 ++ BUILD_BUG_ON(size > SHRT_MAX);
2033 ++ if (field >= size)
2034 + return -ENOENT;
2035 +
2036 +- /*
2037 +- * FIXME: Mitigation for CVE-2017-5753. To be replaced with a
2038 +- * generic mechanism.
2039 +- */
2040 +- asm("lfence");
2041 +-
2042 +- if (vmcs_field_to_offset_table[field] == 0)
2043 ++ field = array_index_nospec(field, size);
2044 ++ offset = vmcs_field_to_offset_table[field];
2045 ++ if (offset == 0)
2046 + return -ENOENT;
2047 +-
2048 +- return vmcs_field_to_offset_table[field];
2049 ++ return offset;
2050 + }
2051 +
2052 + static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
2053 +@@ -927,6 +929,9 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
2054 + static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
2055 + static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
2056 + u16 error_code);
2057 ++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
2058 ++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2059 ++ u32 msr, int type);
2060 +
2061 + static DEFINE_PER_CPU(struct vmcs *, vmxarea);
2062 + static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
2063 +@@ -946,12 +951,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
2064 + enum {
2065 + VMX_IO_BITMAP_A,
2066 + VMX_IO_BITMAP_B,
2067 +- VMX_MSR_BITMAP_LEGACY,
2068 +- VMX_MSR_BITMAP_LONGMODE,
2069 +- VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
2070 +- VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
2071 +- VMX_MSR_BITMAP_LEGACY_X2APIC,
2072 +- VMX_MSR_BITMAP_LONGMODE_X2APIC,
2073 + VMX_VMREAD_BITMAP,
2074 + VMX_VMWRITE_BITMAP,
2075 + VMX_BITMAP_NR
2076 +@@ -961,12 +960,6 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
2077 +
2078 + #define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A])
2079 + #define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B])
2080 +-#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
2081 +-#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
2082 +-#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
2083 +-#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
2084 +-#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
2085 +-#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
2086 + #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
2087 + #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
2088 +
2089 +@@ -1913,6 +1906,52 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
2090 + vmcs_write32(EXCEPTION_BITMAP, eb);
2091 + }
2092 +
2093 ++/*
2094 ++ * Check if MSR is intercepted for currently loaded MSR bitmap.
2095 ++ */
2096 ++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
2097 ++{
2098 ++ unsigned long *msr_bitmap;
2099 ++ int f = sizeof(unsigned long);
2100 ++
2101 ++ if (!cpu_has_vmx_msr_bitmap())
2102 ++ return true;
2103 ++
2104 ++ msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
2105 ++
2106 ++ if (msr <= 0x1fff) {
2107 ++ return !!test_bit(msr, msr_bitmap + 0x800 / f);
2108 ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2109 ++ msr &= 0x1fff;
2110 ++ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2111 ++ }
2112 ++
2113 ++ return true;
2114 ++}
2115 ++
2116 ++/*
2117 ++ * Check if MSR is intercepted for L01 MSR bitmap.
2118 ++ */
2119 ++static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
2120 ++{
2121 ++ unsigned long *msr_bitmap;
2122 ++ int f = sizeof(unsigned long);
2123 ++
2124 ++ if (!cpu_has_vmx_msr_bitmap())
2125 ++ return true;
2126 ++
2127 ++ msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
2128 ++
2129 ++ if (msr <= 0x1fff) {
2130 ++ return !!test_bit(msr, msr_bitmap + 0x800 / f);
2131 ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2132 ++ msr &= 0x1fff;
2133 ++ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2134 ++ }
2135 ++
2136 ++ return true;
2137 ++}
2138 ++
2139 + static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2140 + unsigned long entry, unsigned long exit)
2141 + {
2142 +@@ -2291,6 +2330,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2143 + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
2144 + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
2145 + vmcs_load(vmx->loaded_vmcs->vmcs);
2146 ++ indirect_branch_prediction_barrier();
2147 + }
2148 +
2149 + if (!already_loaded) {
2150 +@@ -2567,36 +2607,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2151 + vmx->guest_msrs[from] = tmp;
2152 + }
2153 +
2154 +-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
2155 +-{
2156 +- unsigned long *msr_bitmap;
2157 +-
2158 +- if (is_guest_mode(vcpu))
2159 +- msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
2160 +- else if (cpu_has_secondary_exec_ctrls() &&
2161 +- (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
2162 +- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
2163 +- if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
2164 +- if (is_long_mode(vcpu))
2165 +- msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
2166 +- else
2167 +- msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
2168 +- } else {
2169 +- if (is_long_mode(vcpu))
2170 +- msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
2171 +- else
2172 +- msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
2173 +- }
2174 +- } else {
2175 +- if (is_long_mode(vcpu))
2176 +- msr_bitmap = vmx_msr_bitmap_longmode;
2177 +- else
2178 +- msr_bitmap = vmx_msr_bitmap_legacy;
2179 +- }
2180 +-
2181 +- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
2182 +-}
2183 +-
2184 + /*
2185 + * Set up the vmcs to automatically save and restore system
2186 + * msrs. Don't touch the 64-bit msrs if the guest is in legacy
2187 +@@ -2637,7 +2647,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
2188 + vmx->save_nmsrs = save_nmsrs;
2189 +
2190 + if (cpu_has_vmx_msr_bitmap())
2191 +- vmx_set_msr_bitmap(&vmx->vcpu);
2192 ++ vmx_update_msr_bitmap(&vmx->vcpu);
2193 + }
2194 +
2195 + /*
2196 +@@ -3273,6 +3283,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2197 + case MSR_IA32_TSC:
2198 + msr_info->data = guest_read_tsc(vcpu);
2199 + break;
2200 ++ case MSR_IA32_SPEC_CTRL:
2201 ++ if (!msr_info->host_initiated &&
2202 ++ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
2203 ++ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2204 ++ return 1;
2205 ++
2206 ++ msr_info->data = to_vmx(vcpu)->spec_ctrl;
2207 ++ break;
2208 ++ case MSR_IA32_ARCH_CAPABILITIES:
2209 ++ if (!msr_info->host_initiated &&
2210 ++ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
2211 ++ return 1;
2212 ++ msr_info->data = to_vmx(vcpu)->arch_capabilities;
2213 ++ break;
2214 + case MSR_IA32_SYSENTER_CS:
2215 + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2216 + break;
2217 +@@ -3380,6 +3404,70 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2218 + case MSR_IA32_TSC:
2219 + kvm_write_tsc(vcpu, msr_info);
2220 + break;
2221 ++ case MSR_IA32_SPEC_CTRL:
2222 ++ if (!msr_info->host_initiated &&
2223 ++ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
2224 ++ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2225 ++ return 1;
2226 ++
2227 ++ /* The STIBP bit doesn't fault even if it's not advertised */
2228 ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
2229 ++ return 1;
2230 ++
2231 ++ vmx->spec_ctrl = data;
2232 ++
2233 ++ if (!data)
2234 ++ break;
2235 ++
2236 ++ /*
2237 ++ * For non-nested:
2238 ++ * When it's written (to non-zero) for the first time, pass
2239 ++ * it through.
2240 ++ *
2241 ++ * For nested:
2242 ++ * The handling of the MSR bitmap for L2 guests is done in
2243 ++ * nested_vmx_merge_msr_bitmap. We should not touch the
2244 ++ * vmcs02.msr_bitmap here since it gets completely overwritten
2245 ++ * in the merging. We update the vmcs01 here for L1 as well
2246 ++ * since it will end up touching the MSR anyway now.
2247 ++ */
2248 ++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
2249 ++ MSR_IA32_SPEC_CTRL,
2250 ++ MSR_TYPE_RW);
2251 ++ break;
2252 ++ case MSR_IA32_PRED_CMD:
2253 ++ if (!msr_info->host_initiated &&
2254 ++ !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
2255 ++ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2256 ++ return 1;
2257 ++
2258 ++ if (data & ~PRED_CMD_IBPB)
2259 ++ return 1;
2260 ++
2261 ++ if (!data)
2262 ++ break;
2263 ++
2264 ++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2265 ++
2266 ++ /*
2267 ++ * For non-nested:
2268 ++ * When it's written (to non-zero) for the first time, pass
2269 ++ * it through.
2270 ++ *
2271 ++ * For nested:
2272 ++ * The handling of the MSR bitmap for L2 guests is done in
2273 ++ * nested_vmx_merge_msr_bitmap. We should not touch the
2274 ++ * vmcs02.msr_bitmap here since it gets completely overwritten
2275 ++ * in the merging.
2276 ++ */
2277 ++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
2278 ++ MSR_TYPE_W);
2279 ++ break;
2280 ++ case MSR_IA32_ARCH_CAPABILITIES:
2281 ++ if (!msr_info->host_initiated)
2282 ++ return 1;
2283 ++ vmx->arch_capabilities = data;
2284 ++ break;
2285 + case MSR_IA32_CR_PAT:
2286 + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2287 + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2288 +@@ -3822,11 +3910,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
2289 + return vmcs;
2290 + }
2291 +
2292 +-static struct vmcs *alloc_vmcs(void)
2293 +-{
2294 +- return alloc_vmcs_cpu(raw_smp_processor_id());
2295 +-}
2296 +-
2297 + static void free_vmcs(struct vmcs *vmcs)
2298 + {
2299 + free_pages((unsigned long)vmcs, vmcs_config.order);
2300 +@@ -3842,9 +3925,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2301 + loaded_vmcs_clear(loaded_vmcs);
2302 + free_vmcs(loaded_vmcs->vmcs);
2303 + loaded_vmcs->vmcs = NULL;
2304 ++ if (loaded_vmcs->msr_bitmap)
2305 ++ free_page((unsigned long)loaded_vmcs->msr_bitmap);
2306 + WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2307 + }
2308 +
2309 ++static struct vmcs *alloc_vmcs(void)
2310 ++{
2311 ++ return alloc_vmcs_cpu(raw_smp_processor_id());
2312 ++}
2313 ++
2314 ++static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2315 ++{
2316 ++ loaded_vmcs->vmcs = alloc_vmcs();
2317 ++ if (!loaded_vmcs->vmcs)
2318 ++ return -ENOMEM;
2319 ++
2320 ++ loaded_vmcs->shadow_vmcs = NULL;
2321 ++ loaded_vmcs_init(loaded_vmcs);
2322 ++
2323 ++ if (cpu_has_vmx_msr_bitmap()) {
2324 ++ loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
2325 ++ if (!loaded_vmcs->msr_bitmap)
2326 ++ goto out_vmcs;
2327 ++ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2328 ++ }
2329 ++ return 0;
2330 ++
2331 ++out_vmcs:
2332 ++ free_loaded_vmcs(loaded_vmcs);
2333 ++ return -ENOMEM;
2334 ++}
2335 ++
2336 + static void free_kvm_area(void)
2337 + {
2338 + int cpu;
2339 +@@ -4917,10 +5029,8 @@ static void free_vpid(int vpid)
2340 + spin_unlock(&vmx_vpid_lock);
2341 + }
2342 +
2343 +-#define MSR_TYPE_R 1
2344 +-#define MSR_TYPE_W 2
2345 +-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2346 +- u32 msr, int type)
2347 ++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2348 ++ u32 msr, int type)
2349 + {
2350 + int f = sizeof(unsigned long);
2351 +
2352 +@@ -4954,6 +5064,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2353 + }
2354 + }
2355 +
2356 ++static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
2357 ++ u32 msr, int type)
2358 ++{
2359 ++ int f = sizeof(unsigned long);
2360 ++
2361 ++ if (!cpu_has_vmx_msr_bitmap())
2362 ++ return;
2363 ++
2364 ++ /*
2365 ++ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
2366 ++ * have the write-low and read-high bitmap offsets the wrong way round.
2367 ++ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2368 ++ */
2369 ++ if (msr <= 0x1fff) {
2370 ++ if (type & MSR_TYPE_R)
2371 ++ /* read-low */
2372 ++ __set_bit(msr, msr_bitmap + 0x000 / f);
2373 ++
2374 ++ if (type & MSR_TYPE_W)
2375 ++ /* write-low */
2376 ++ __set_bit(msr, msr_bitmap + 0x800 / f);
2377 ++
2378 ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2379 ++ msr &= 0x1fff;
2380 ++ if (type & MSR_TYPE_R)
2381 ++ /* read-high */
2382 ++ __set_bit(msr, msr_bitmap + 0x400 / f);
2383 ++
2384 ++ if (type & MSR_TYPE_W)
2385 ++ /* write-high */
2386 ++ __set_bit(msr, msr_bitmap + 0xc00 / f);
2387 ++
2388 ++ }
2389 ++}
2390 ++
2391 ++static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
2392 ++ u32 msr, int type, bool value)
2393 ++{
2394 ++ if (value)
2395 ++ vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
2396 ++ else
2397 ++ vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
2398 ++}
2399 ++
2400 + /*
2401 + * If a msr is allowed by L0, we should check whether it is allowed by L1.
2402 + * The corresponding bit will be cleared unless both of L0 and L1 allow it.
2403 +@@ -5000,30 +5154,70 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
2404 + }
2405 + }
2406 +
2407 +-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2408 ++static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
2409 + {
2410 +- if (!longmode_only)
2411 +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
2412 +- msr, MSR_TYPE_R | MSR_TYPE_W);
2413 +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
2414 +- msr, MSR_TYPE_R | MSR_TYPE_W);
2415 ++ u8 mode = 0;
2416 ++
2417 ++ if (cpu_has_secondary_exec_ctrls() &&
2418 ++ (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
2419 ++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
2420 ++ mode |= MSR_BITMAP_MODE_X2APIC;
2421 ++ if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
2422 ++ mode |= MSR_BITMAP_MODE_X2APIC_APICV;
2423 ++ }
2424 ++
2425 ++ if (is_long_mode(vcpu))
2426 ++ mode |= MSR_BITMAP_MODE_LM;
2427 ++
2428 ++ return mode;
2429 + }
2430 +
2431 +-static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
2432 ++#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
2433 ++
2434 ++static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
2435 ++ u8 mode)
2436 + {
2437 +- if (apicv_active) {
2438 +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
2439 +- msr, type);
2440 +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
2441 +- msr, type);
2442 +- } else {
2443 +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
2444 +- msr, type);
2445 +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
2446 +- msr, type);
2447 ++ int msr;
2448 ++
2449 ++ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
2450 ++ unsigned word = msr / BITS_PER_LONG;
2451 ++ msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
2452 ++ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
2453 ++ }
2454 ++
2455 ++ if (mode & MSR_BITMAP_MODE_X2APIC) {
2456 ++ /*
2457 ++ * TPR reads and writes can be virtualized even if virtual interrupt
2458 ++ * delivery is not in use.
2459 ++ */
2460 ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
2461 ++ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
2462 ++ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
2463 ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
2464 ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
2465 ++ }
2466 + }
2467 + }
2468 +
2469 ++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
2470 ++{
2471 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
2472 ++ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
2473 ++ u8 mode = vmx_msr_bitmap_mode(vcpu);
2474 ++ u8 changed = mode ^ vmx->msr_bitmap_mode;
2475 ++
2476 ++ if (!changed)
2477 ++ return;
2478 ++
2479 ++ vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
2480 ++ !(mode & MSR_BITMAP_MODE_LM));
2481 ++
2482 ++ if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
2483 ++ vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
2484 ++
2485 ++ vmx->msr_bitmap_mode = mode;
2486 ++}
2487 ++
2488 + static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
2489 + {
2490 + return enable_apicv;
2491 +@@ -5269,7 +5463,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
2492 + }
2493 +
2494 + if (cpu_has_vmx_msr_bitmap())
2495 +- vmx_set_msr_bitmap(vcpu);
2496 ++ vmx_update_msr_bitmap(vcpu);
2497 + }
2498 +
2499 + static u32 vmx_exec_control(struct vcpu_vmx *vmx)
2500 +@@ -5456,7 +5650,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2501 + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
2502 + }
2503 + if (cpu_has_vmx_msr_bitmap())
2504 +- vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
2505 ++ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
2506 +
2507 + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2508 +
2509 +@@ -5534,6 +5728,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2510 + ++vmx->nmsrs;
2511 + }
2512 +
2513 ++ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
2514 ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
2515 +
2516 + vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
2517 +
2518 +@@ -5564,6 +5760,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
2519 + u64 cr0;
2520 +
2521 + vmx->rmode.vm86_active = 0;
2522 ++ vmx->spec_ctrl = 0;
2523 +
2524 + vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2525 + kvm_set_cr8(vcpu, 0);
2526 +@@ -6739,7 +6936,7 @@ void vmx_enable_tdp(void)
2527 +
2528 + static __init int hardware_setup(void)
2529 + {
2530 +- int r = -ENOMEM, i, msr;
2531 ++ int r = -ENOMEM, i;
2532 +
2533 + rdmsrl_safe(MSR_EFER, &host_efer);
2534 +
2535 +@@ -6760,9 +6957,6 @@ static __init int hardware_setup(void)
2536 +
2537 + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
2538 +
2539 +- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
2540 +- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
2541 +-
2542 + if (setup_vmcs_config(&vmcs_config) < 0) {
2543 + r = -EIO;
2544 + goto out;
2545 +@@ -6825,42 +7019,8 @@ static __init int hardware_setup(void)
2546 + kvm_tsc_scaling_ratio_frac_bits = 48;
2547 + }
2548 +
2549 +- vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
2550 +- vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
2551 +- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
2552 +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
2553 +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
2554 +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
2555 +-
2556 +- memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
2557 +- vmx_msr_bitmap_legacy, PAGE_SIZE);
2558 +- memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
2559 +- vmx_msr_bitmap_longmode, PAGE_SIZE);
2560 +- memcpy(vmx_msr_bitmap_legacy_x2apic,
2561 +- vmx_msr_bitmap_legacy, PAGE_SIZE);
2562 +- memcpy(vmx_msr_bitmap_longmode_x2apic,
2563 +- vmx_msr_bitmap_longmode, PAGE_SIZE);
2564 +-
2565 + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
2566 +
2567 +- for (msr = 0x800; msr <= 0x8ff; msr++) {
2568 +- if (msr == 0x839 /* TMCCT */)
2569 +- continue;
2570 +- vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
2571 +- }
2572 +-
2573 +- /*
2574 +- * TPR reads and writes can be virtualized even if virtual interrupt
2575 +- * delivery is not in use.
2576 +- */
2577 +- vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
2578 +- vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
2579 +-
2580 +- /* EOI */
2581 +- vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
2582 +- /* SELF-IPI */
2583 +- vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
2584 +-
2585 + if (enable_ept)
2586 + vmx_enable_tdp();
2587 + else
2588 +@@ -6963,94 +7123,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
2589 + return handle_nop(vcpu);
2590 + }
2591 +
2592 +-/*
2593 +- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
2594 +- * We could reuse a single VMCS for all the L2 guests, but we also want the
2595 +- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
2596 +- * allows keeping them loaded on the processor, and in the future will allow
2597 +- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
2598 +- * every entry if they never change.
2599 +- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
2600 +- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
2601 +- *
2602 +- * The following functions allocate and free a vmcs02 in this pool.
2603 +- */
2604 +-
2605 +-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
2606 +-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
2607 +-{
2608 +- struct vmcs02_list *item;
2609 +- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
2610 +- if (item->vmptr == vmx->nested.current_vmptr) {
2611 +- list_move(&item->list, &vmx->nested.vmcs02_pool);
2612 +- return &item->vmcs02;
2613 +- }
2614 +-
2615 +- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
2616 +- /* Recycle the least recently used VMCS. */
2617 +- item = list_last_entry(&vmx->nested.vmcs02_pool,
2618 +- struct vmcs02_list, list);
2619 +- item->vmptr = vmx->nested.current_vmptr;
2620 +- list_move(&item->list, &vmx->nested.vmcs02_pool);
2621 +- return &item->vmcs02;
2622 +- }
2623 +-
2624 +- /* Create a new VMCS */
2625 +- item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
2626 +- if (!item)
2627 +- return NULL;
2628 +- item->vmcs02.vmcs = alloc_vmcs();
2629 +- item->vmcs02.shadow_vmcs = NULL;
2630 +- if (!item->vmcs02.vmcs) {
2631 +- kfree(item);
2632 +- return NULL;
2633 +- }
2634 +- loaded_vmcs_init(&item->vmcs02);
2635 +- item->vmptr = vmx->nested.current_vmptr;
2636 +- list_add(&(item->list), &(vmx->nested.vmcs02_pool));
2637 +- vmx->nested.vmcs02_num++;
2638 +- return &item->vmcs02;
2639 +-}
2640 +-
2641 +-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
2642 +-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
2643 +-{
2644 +- struct vmcs02_list *item;
2645 +- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
2646 +- if (item->vmptr == vmptr) {
2647 +- free_loaded_vmcs(&item->vmcs02);
2648 +- list_del(&item->list);
2649 +- kfree(item);
2650 +- vmx->nested.vmcs02_num--;
2651 +- return;
2652 +- }
2653 +-}
2654 +-
2655 +-/*
2656 +- * Free all VMCSs saved for this vcpu, except the one pointed by
2657 +- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
2658 +- * must be &vmx->vmcs01.
2659 +- */
2660 +-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
2661 +-{
2662 +- struct vmcs02_list *item, *n;
2663 +-
2664 +- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
2665 +- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
2666 +- /*
2667 +- * Something will leak if the above WARN triggers. Better than
2668 +- * a use-after-free.
2669 +- */
2670 +- if (vmx->loaded_vmcs == &item->vmcs02)
2671 +- continue;
2672 +-
2673 +- free_loaded_vmcs(&item->vmcs02);
2674 +- list_del(&item->list);
2675 +- kfree(item);
2676 +- vmx->nested.vmcs02_num--;
2677 +- }
2678 +-}
2679 +-
2680 + /*
2681 + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
2682 + * set the success or error code of an emulated VMX instruction, as specified
2683 +@@ -7231,13 +7303,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
2684 + {
2685 + struct vcpu_vmx *vmx = to_vmx(vcpu);
2686 + struct vmcs *shadow_vmcs;
2687 ++ int r;
2688 +
2689 +- if (cpu_has_vmx_msr_bitmap()) {
2690 +- vmx->nested.msr_bitmap =
2691 +- (unsigned long *)__get_free_page(GFP_KERNEL);
2692 +- if (!vmx->nested.msr_bitmap)
2693 +- goto out_msr_bitmap;
2694 +- }
2695 ++ r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
2696 ++ if (r < 0)
2697 ++ goto out_vmcs02;
2698 +
2699 + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
2700 + if (!vmx->nested.cached_vmcs12)
2701 +@@ -7254,9 +7324,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
2702 + vmx->vmcs01.shadow_vmcs = shadow_vmcs;
2703 + }
2704 +
2705 +- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
2706 +- vmx->nested.vmcs02_num = 0;
2707 +-
2708 + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
2709 + HRTIMER_MODE_REL_PINNED);
2710 + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
2711 +@@ -7268,9 +7335,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
2712 + kfree(vmx->nested.cached_vmcs12);
2713 +
2714 + out_cached_vmcs12:
2715 +- free_page((unsigned long)vmx->nested.msr_bitmap);
2716 ++ free_loaded_vmcs(&vmx->nested.vmcs02);
2717 +
2718 +-out_msr_bitmap:
2719 ++out_vmcs02:
2720 + return -ENOMEM;
2721 + }
2722 +
2723 +@@ -7412,10 +7479,6 @@ static void free_nested(struct vcpu_vmx *vmx)
2724 + free_vpid(vmx->nested.vpid02);
2725 + vmx->nested.posted_intr_nv = -1;
2726 + vmx->nested.current_vmptr = -1ull;
2727 +- if (vmx->nested.msr_bitmap) {
2728 +- free_page((unsigned long)vmx->nested.msr_bitmap);
2729 +- vmx->nested.msr_bitmap = NULL;
2730 +- }
2731 + if (enable_shadow_vmcs) {
2732 + vmx_disable_shadow_vmcs(vmx);
2733 + vmcs_clear(vmx->vmcs01.shadow_vmcs);
2734 +@@ -7423,7 +7486,7 @@ static void free_nested(struct vcpu_vmx *vmx)
2735 + vmx->vmcs01.shadow_vmcs = NULL;
2736 + }
2737 + kfree(vmx->nested.cached_vmcs12);
2738 +- /* Unpin physical memory we referred to in current vmcs02 */
2739 ++ /* Unpin physical memory we referred to in the vmcs02 */
2740 + if (vmx->nested.apic_access_page) {
2741 + kvm_release_page_dirty(vmx->nested.apic_access_page);
2742 + vmx->nested.apic_access_page = NULL;
2743 +@@ -7439,7 +7502,7 @@ static void free_nested(struct vcpu_vmx *vmx)
2744 + vmx->nested.pi_desc = NULL;
2745 + }
2746 +
2747 +- nested_free_all_saved_vmcss(vmx);
2748 ++ free_loaded_vmcs(&vmx->nested.vmcs02);
2749 + }
2750 +
2751 + /* Emulate the VMXOFF instruction */
2752 +@@ -7482,8 +7545,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
2753 + vmptr + offsetof(struct vmcs12, launch_state),
2754 + &zero, sizeof(zero));
2755 +
2756 +- nested_free_vmcs02(vmx, vmptr);
2757 +-
2758 + nested_vmx_succeed(vcpu);
2759 + return kvm_skip_emulated_instruction(vcpu);
2760 + }
2761 +@@ -8395,10 +8456,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
2762 +
2763 + /*
2764 + * The host physical addresses of some pages of guest memory
2765 +- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
2766 +- * may write to these pages via their host physical address while
2767 +- * L2 is running, bypassing any address-translation-based dirty
2768 +- * tracking (e.g. EPT write protection).
2769 ++ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
2770 ++ * Page). The CPU may write to these pages via their host
2771 ++ * physical address while L2 is running, bypassing any
2772 ++ * address-translation-based dirty tracking (e.g. EPT write
2773 ++ * protection).
2774 + *
2775 + * Mark them dirty on every exit from L2 to prevent them from
2776 + * getting out of sync with dirty tracking.
2777 +@@ -8932,7 +8994,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
2778 + }
2779 + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
2780 +
2781 +- vmx_set_msr_bitmap(vcpu);
2782 ++ vmx_update_msr_bitmap(vcpu);
2783 + }
2784 +
2785 + static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
2786 +@@ -9118,14 +9180,14 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
2787 + #endif
2788 + "pushf\n\t"
2789 + __ASM_SIZE(push) " $%c[cs]\n\t"
2790 +- "call *%[entry]\n\t"
2791 ++ CALL_NOSPEC
2792 + :
2793 + #ifdef CONFIG_X86_64
2794 + [sp]"=&r"(tmp),
2795 + #endif
2796 + ASM_CALL_CONSTRAINT
2797 + :
2798 +- [entry]"r"(entry),
2799 ++ THUNK_TARGET(entry),
2800 + [ss]"i"(__KERNEL_DS),
2801 + [cs]"i"(__KERNEL_CS)
2802 + );
2803 +@@ -9362,6 +9424,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
2804 +
2805 + vmx_arm_hv_timer(vcpu);
2806 +
2807 ++ /*
2808 ++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
2809 ++ * it's non-zero. Since vmentry is serialising on affected CPUs, there
2810 ++ * is no need to worry about the conditional branch over the wrmsr
2811 ++ * being speculatively taken.
2812 ++ */
2813 ++ if (vmx->spec_ctrl)
2814 ++ wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
2815 ++
2816 + vmx->__launched = vmx->loaded_vmcs->launched;
2817 + asm(
2818 + /* Store host registers */
2819 +@@ -9480,6 +9551,27 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
2820 + #endif
2821 + );
2822 +
2823 ++ /*
2824 ++ * We do not use IBRS in the kernel. If this vCPU has used the
2825 ++ * SPEC_CTRL MSR it may have left it on; save the value and
2826 ++ * turn it off. This is much more efficient than blindly adding
2827 ++ * it to the atomic save/restore list. Especially as the former
2828 ++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
2829 ++ *
2830 ++ * For non-nested case:
2831 ++ * If the L01 MSR bitmap does not intercept the MSR, then we need to
2832 ++ * save it.
2833 ++ *
2834 ++ * For nested case:
2835 ++ * If the L02 MSR bitmap does not intercept the MSR, then we need to
2836 ++ * save it.
2837 ++ */
2838 ++ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
2839 ++ rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
2840 ++
2841 ++ if (vmx->spec_ctrl)
2842 ++ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
2843 ++
2844 + /* Eliminate branch target predictions from guest mode */
2845 + vmexit_fill_RSB();
2846 +
2847 +@@ -9594,6 +9686,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2848 + {
2849 + int err;
2850 + struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
2851 ++ unsigned long *msr_bitmap;
2852 + int cpu;
2853 +
2854 + if (!vmx)
2855 +@@ -9626,13 +9719,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2856 + if (!vmx->guest_msrs)
2857 + goto free_pml;
2858 +
2859 +- vmx->loaded_vmcs = &vmx->vmcs01;
2860 +- vmx->loaded_vmcs->vmcs = alloc_vmcs();
2861 +- vmx->loaded_vmcs->shadow_vmcs = NULL;
2862 +- if (!vmx->loaded_vmcs->vmcs)
2863 ++ err = alloc_loaded_vmcs(&vmx->vmcs01);
2864 ++ if (err < 0)
2865 + goto free_msrs;
2866 +- loaded_vmcs_init(vmx->loaded_vmcs);
2867 +
2868 ++ msr_bitmap = vmx->vmcs01.msr_bitmap;
2869 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
2870 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
2871 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
2872 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
2873 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
2874 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
2875 ++ vmx->msr_bitmap_mode = 0;
2876 ++
2877 ++ vmx->loaded_vmcs = &vmx->vmcs01;
2878 + cpu = get_cpu();
2879 + vmx_vcpu_load(&vmx->vcpu, cpu);
2880 + vmx->vcpu.cpu = cpu;
2881 +@@ -10101,10 +10201,25 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
2882 + int msr;
2883 + struct page *page;
2884 + unsigned long *msr_bitmap_l1;
2885 +- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
2886 ++ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
2887 ++ /*
2888 ++ * pred_cmd & spec_ctrl are trying to verify two things:
2889 ++ *
2890 ++ * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
2891 ++ * ensures that we do not accidentally generate an L02 MSR bitmap
2892 ++ * from the L12 MSR bitmap that is too permissive.
2893 ++ * 2. That L1 or L2s have actually used the MSR. This avoids
2894 ++	 * unnecessary merging of the bitmap if the MSR is unused. This
2895 ++ * works properly because we only update the L01 MSR bitmap lazily.
2896 ++ * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
2897 ++ * updated to reflect this when L1 (or its L2s) actually write to
2898 ++ * the MSR.
2899 ++ */
2900 ++ bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
2901 ++ bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
2902 +
2903 +- /* This shortcut is ok because we support only x2APIC MSRs so far. */
2904 +- if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
2905 ++ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
2906 ++ !pred_cmd && !spec_ctrl)
2907 + return false;
2908 +
2909 + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
2910 +@@ -10137,6 +10252,19 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
2911 + MSR_TYPE_W);
2912 + }
2913 + }
2914 ++
2915 ++ if (spec_ctrl)
2916 ++ nested_vmx_disable_intercept_for_msr(
2917 ++ msr_bitmap_l1, msr_bitmap_l0,
2918 ++ MSR_IA32_SPEC_CTRL,
2919 ++ MSR_TYPE_R | MSR_TYPE_W);
2920 ++
2921 ++ if (pred_cmd)
2922 ++ nested_vmx_disable_intercept_for_msr(
2923 ++ msr_bitmap_l1, msr_bitmap_l0,
2924 ++ MSR_IA32_PRED_CMD,
2925 ++ MSR_TYPE_W);
2926 ++
2927 + kunmap(page);
2928 + kvm_release_page_clean(page);
2929 +
2930 +@@ -10678,6 +10806,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2931 + if (kvm_has_tsc_control)
2932 + decache_tsc_multiplier(vmx);
2933 +
2934 ++ if (cpu_has_vmx_msr_bitmap())
2935 ++ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2936 ++
2937 + if (enable_vpid) {
2938 + /*
2939 + * There is no direct mapping between vpid02 and vpid12, the
2940 +@@ -10894,20 +11025,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
2941 + {
2942 + struct vcpu_vmx *vmx = to_vmx(vcpu);
2943 + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2944 +- struct loaded_vmcs *vmcs02;
2945 + u32 msr_entry_idx;
2946 + u32 exit_qual;
2947 +
2948 +- vmcs02 = nested_get_current_vmcs02(vmx);
2949 +- if (!vmcs02)
2950 +- return -ENOMEM;
2951 +-
2952 + enter_guest_mode(vcpu);
2953 +
2954 + if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
2955 + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
2956 +
2957 +- vmx_switch_vmcs(vcpu, vmcs02);
2958 ++ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
2959 + vmx_segment_cache_clear(vmx);
2960 +
2961 + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
2962 +@@ -11476,7 +11602,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
2963 + vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
2964 +
2965 + if (cpu_has_vmx_msr_bitmap())
2966 +- vmx_set_msr_bitmap(vcpu);
2967 ++ vmx_update_msr_bitmap(vcpu);
2968 +
2969 + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
2970 + vmcs12->vm_exit_msr_load_count))
2971 +@@ -11522,10 +11648,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
2972 + vm_exit_controls_reset_shadow(vmx);
2973 + vmx_segment_cache_clear(vmx);
2974 +
2975 +- /* if no vmcs02 cache requested, remove the one we used */
2976 +- if (VMCS02_POOL_SIZE == 0)
2977 +- nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
2978 +-
2979 + /* Update any VMCS fields that might have changed while L2 ran */
2980 + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
2981 + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
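The bit arithmetic in msr_write_intercepted(), msr_write_intercepted_l01() and
vmx_enable/disable_intercept_for_msr() follows the hardware MSR-bitmap layout
noted in the comment above: one 4K page with read-low at 0x000, read-high at
0x400, write-low at 0x800 and write-high at 0xc00, one bit per MSR. A sketch
that computes the same write-intercept bit position for a given MSR
(IA32_SPEC_CTRL is MSR 0x48):

    /* Sketch: locate the write-intercept bit for an MSR in a VMX MSR bitmap,
     * mirroring the offsets used by msr_write_intercepted() above. */
    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stddef.h>

    static bool msr_write_bit(uint32_t msr, size_t *byte, unsigned int *bit)
    {
            uint32_t base;

            if (msr <= 0x1fff) {
                    base = 0x800;                   /* write-low */
            } else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
                    base = 0xc00;                   /* write-high */
                    msr &= 0x1fff;
            } else {
                    return false;                   /* not covered: always intercepted */
            }

            *byte = base + msr / 8;
            *bit  = msr % 8;
            return true;
    }

    int main(void)
    {
            size_t byte;
            unsigned int bit;

            if (msr_write_bit(0x48 /* IA32_SPEC_CTRL */, &byte, &bit))
                    printf("SPEC_CTRL write bit: byte 0x%zx, bit %u\n", byte, bit);
            return 0;
    }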
2982 +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2983 +index 8c28023a43b1..f97358423f9c 100644
2984 +--- a/arch/x86/kvm/x86.c
2985 ++++ b/arch/x86/kvm/x86.c
2986 +@@ -1006,6 +1006,7 @@ static u32 msrs_to_save[] = {
2987 + #endif
2988 + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
2989 + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
2990 ++ MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
2991 + };
2992 +
2993 + static unsigned num_msrs_to_save;
2994 +diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
2995 +index d435c89875c1..d0a3170e6804 100644
2996 +--- a/arch/x86/lib/Makefile
2997 ++++ b/arch/x86/lib/Makefile
2998 +@@ -27,6 +27,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
2999 + lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
3000 + lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
3001 + lib-$(CONFIG_RETPOLINE) += retpoline.o
3002 ++OBJECT_FILES_NON_STANDARD_retpoline.o :=y
3003 +
3004 + obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
3005 +
3006 +diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
3007 +index c97d935a29e8..49b167f73215 100644
3008 +--- a/arch/x86/lib/getuser.S
3009 ++++ b/arch/x86/lib/getuser.S
3010 +@@ -40,6 +40,8 @@ ENTRY(__get_user_1)
3011 + mov PER_CPU_VAR(current_task), %_ASM_DX
3012 + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
3013 + jae bad_get_user
3014 ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
3015 ++ and %_ASM_DX, %_ASM_AX
3016 + ASM_STAC
3017 + 1: movzbl (%_ASM_AX),%edx
3018 + xor %eax,%eax
3019 +@@ -54,6 +56,8 @@ ENTRY(__get_user_2)
3020 + mov PER_CPU_VAR(current_task), %_ASM_DX
3021 + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
3022 + jae bad_get_user
3023 ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
3024 ++ and %_ASM_DX, %_ASM_AX
3025 + ASM_STAC
3026 + 2: movzwl -1(%_ASM_AX),%edx
3027 + xor %eax,%eax
3028 +@@ -68,6 +72,8 @@ ENTRY(__get_user_4)
3029 + mov PER_CPU_VAR(current_task), %_ASM_DX
3030 + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
3031 + jae bad_get_user
3032 ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
3033 ++ and %_ASM_DX, %_ASM_AX
3034 + ASM_STAC
3035 + 3: movl -3(%_ASM_AX),%edx
3036 + xor %eax,%eax
3037 +@@ -83,6 +89,8 @@ ENTRY(__get_user_8)
3038 + mov PER_CPU_VAR(current_task), %_ASM_DX
3039 + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
3040 + jae bad_get_user
3041 ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
3042 ++ and %_ASM_DX, %_ASM_AX
3043 + ASM_STAC
3044 + 4: movq -7(%_ASM_AX),%rdx
3045 + xor %eax,%eax
3046 +@@ -94,6 +102,8 @@ ENTRY(__get_user_8)
3047 + mov PER_CPU_VAR(current_task), %_ASM_DX
3048 + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
3049 + jae bad_get_user_8
3050 ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
3051 ++ and %_ASM_DX, %_ASM_AX
3052 + ASM_STAC
3053 + 4: movl -7(%_ASM_AX),%edx
3054 + 5: movl -3(%_ASM_AX),%ecx
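The sbb/and pair added after each bounds check turns the carry flag left by the cmp into a mask: on the fall-through path (address below the limit) sbb of a register with itself produces all ones, otherwise zero, so the and leaves a valid pointer untouched and forces an out-of-range one to zero even if the jae was mispredicted. A rough C analog of that masking, for illustration only; a compiler may still emit a conditional branch for the comparison, which is precisely what the hand-written sbb avoids:

    #include <stdio.h>

    /* Branchless analog of "cmp limit, addr; sbb mask, mask; and mask, addr":
     * keep addr when addr < limit, otherwise collapse it to 0. */
    static unsigned long clamp_user_addr(unsigned long addr, unsigned long limit)
    {
            unsigned long mask = 0UL - (unsigned long)(addr < limit); /* ~0UL or 0UL */

            return addr & mask;
    }

    int main(void)
    {
            unsigned long limit = 0x7ffffffff000UL;

            printf("%#lx\n", clamp_user_addr(0x1000UL, limit));             /* 0x1000 */
            printf("%#lx\n", clamp_user_addr(0xffff888000000000UL, limit)); /* 0      */
            return 0;
    }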
3055 +diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
3056 +index dfb2ba91b670..480edc3a5e03 100644
3057 +--- a/arch/x86/lib/retpoline.S
3058 ++++ b/arch/x86/lib/retpoline.S
3059 +@@ -7,6 +7,7 @@
3060 + #include <asm/alternative-asm.h>
3061 + #include <asm/export.h>
3062 + #include <asm/nospec-branch.h>
3063 ++#include <asm/bitsperlong.h>
3064 +
3065 + .macro THUNK reg
3066 + .section .text.__x86.indirect_thunk
3067 +@@ -36,7 +37,6 @@ GENERATE_THUNK(_ASM_DX)
3068 + GENERATE_THUNK(_ASM_SI)
3069 + GENERATE_THUNK(_ASM_DI)
3070 + GENERATE_THUNK(_ASM_BP)
3071 +-GENERATE_THUNK(_ASM_SP)
3072 + #ifdef CONFIG_64BIT
3073 + GENERATE_THUNK(r8)
3074 + GENERATE_THUNK(r9)
3075 +@@ -47,3 +47,58 @@ GENERATE_THUNK(r13)
3076 + GENERATE_THUNK(r14)
3077 + GENERATE_THUNK(r15)
3078 + #endif
3079 ++
3080 ++/*
3081 ++ * Fill the CPU return stack buffer.
3082 ++ *
3083 ++ * Each entry in the RSB, if used for a speculative 'ret', contains an
3084 ++ * infinite 'pause; lfence; jmp' loop to capture speculative execution.
3085 ++ *
3086 ++ * This is required in various cases for retpoline and IBRS-based
3087 ++ * mitigations for the Spectre variant 2 vulnerability. Sometimes to
3088 ++ * eliminate potentially bogus entries from the RSB, and sometimes
3089 ++ * purely to ensure that it doesn't get empty, which on some CPUs would
3090 ++ * allow predictions from other (unwanted!) sources to be used.
3091 ++ *
3092 ++ * Google experimented with loop-unrolling and this turned out to be
3093 ++ * the optimal version - two calls, each with their own speculation
3094 ++ * trap should their return address end up getting used, in a loop.
3095 ++ */
3096 ++.macro STUFF_RSB nr:req sp:req
3097 ++ mov $(\nr / 2), %_ASM_BX
3098 ++ .align 16
3099 ++771:
3100 ++ call 772f
3101 ++773: /* speculation trap */
3102 ++ pause
3103 ++ lfence
3104 ++ jmp 773b
3105 ++ .align 16
3106 ++772:
3107 ++ call 774f
3108 ++775: /* speculation trap */
3109 ++ pause
3110 ++ lfence
3111 ++ jmp 775b
3112 ++ .align 16
3113 ++774:
3114 ++ dec %_ASM_BX
3115 ++ jnz 771b
3116 ++ add $((BITS_PER_LONG/8) * \nr), \sp
3117 ++.endm
3118 ++
3119 ++#define RSB_FILL_LOOPS 16 /* To avoid underflow */
3120 ++
3121 ++ENTRY(__fill_rsb)
3122 ++ STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP
3123 ++ ret
3124 ++END(__fill_rsb)
3125 ++EXPORT_SYMBOL_GPL(__fill_rsb)
3126 ++
3127 ++#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
3128 ++
3129 ++ENTRY(__clear_rsb)
3130 ++ STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP
3131 ++ ret
3132 ++END(__clear_rsb)
3133 ++EXPORT_SYMBOL_GPL(__clear_rsb)
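In STUFF_RSB the loop counter is nr/2 because each iteration issues two calls, so nr return addresses end up on the stack and the trailing add reclaims (BITS_PER_LONG/8) * nr bytes in one step. A quick check of that arithmetic for the two entry points defined above, assuming a 64-bit build:

    #include <stdio.h>

    #define BITS_PER_LONG   (8 * (int)sizeof(long))
    #define RSB_FILL_LOOPS  16
    #define RSB_CLEAR_LOOPS 32

    int main(void)
    {
            /* nr/2 iterations x 2 calls = nr pushed return addresses,
             * reclaimed by "add $((BITS_PER_LONG/8) * \nr), \sp" */
            printf("__fill_rsb:  %d iterations, add %d bytes back\n",
                   RSB_FILL_LOOPS / 2, (BITS_PER_LONG / 8) * RSB_FILL_LOOPS);
            printf("__clear_rsb: %d iterations, add %d bytes back\n",
                   RSB_CLEAR_LOOPS / 2, (BITS_PER_LONG / 8) * RSB_CLEAR_LOOPS);
            return 0;
    }

On x86-64 that is 128 bytes for __fill_rsb and 256 bytes for __clear_rsb.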
3134 +diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
3135 +index 1b377f734e64..7add8ba06887 100644
3136 +--- a/arch/x86/lib/usercopy_32.c
3137 ++++ b/arch/x86/lib/usercopy_32.c
3138 +@@ -331,12 +331,12 @@ do { \
3139 +
3140 + unsigned long __copy_user_ll(void *to, const void *from, unsigned long n)
3141 + {
3142 +- stac();
3143 ++ __uaccess_begin_nospec();
3144 + if (movsl_is_ok(to, from, n))
3145 + __copy_user(to, from, n);
3146 + else
3147 + n = __copy_user_intel(to, from, n);
3148 +- clac();
3149 ++ __uaccess_end();
3150 + return n;
3151 + }
3152 + EXPORT_SYMBOL(__copy_user_ll);
3153 +@@ -344,7 +344,7 @@ EXPORT_SYMBOL(__copy_user_ll);
3154 + unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
3155 + unsigned long n)
3156 + {
3157 +- stac();
3158 ++ __uaccess_begin_nospec();
3159 + #ifdef CONFIG_X86_INTEL_USERCOPY
3160 + if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
3161 + n = __copy_user_intel_nocache(to, from, n);
3162 +@@ -353,7 +353,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
3163 + #else
3164 + __copy_user(to, from, n);
3165 + #endif
3166 +- clac();
3167 ++ __uaccess_end();
3168 + return n;
3169 + }
3170 + EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
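Switching these copy routines from stac()/clac() to __uaccess_begin_nospec()/__uaccess_end() pairs the SMAP override with a speculation barrier, so a mispredicted access_ok() check cannot speculatively begin copying from a user-supplied address. The same barrier idea can be sketched in plain C with the lfence intrinsic; this illustrates the technique only and is not the kernel's implementation (the array and function names are made up):

    #include <emmintrin.h>  /* _mm_lfence(), SSE2, x86 only */
    #include <stdio.h>

    #define MAX_ELEMS 16
    static int table[MAX_ELEMS];

    /* Barrier variant of a bounds check: the lfence keeps younger loads
     * from executing until the branch has actually resolved. */
    static int load_checked(unsigned int index)
    {
            if (index >= MAX_ELEMS)
                    return 0;
            _mm_lfence();
            return table[index];
    }

    int main(void)
    {
            table[3] = 42;
            printf("%d %d\n", load_checked(3), load_checked(1000));  /* 42 0 */
            return 0;
    }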
3171 +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
3172 +index 5bfe61a5e8e3..012d02624848 100644
3173 +--- a/arch/x86/mm/tlb.c
3174 ++++ b/arch/x86/mm/tlb.c
3175 +@@ -6,13 +6,14 @@
3176 + #include <linux/interrupt.h>
3177 + #include <linux/export.h>
3178 + #include <linux/cpu.h>
3179 ++#include <linux/debugfs.h>
3180 +
3181 + #include <asm/tlbflush.h>
3182 + #include <asm/mmu_context.h>
3183 ++#include <asm/nospec-branch.h>
3184 + #include <asm/cache.h>
3185 + #include <asm/apic.h>
3186 + #include <asm/uv/uv.h>
3187 +-#include <linux/debugfs.h>
3188 +
3189 + /*
3190 + * TLB flushing, formerly SMP-only
3191 +@@ -247,6 +248,27 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3192 + } else {
3193 + u16 new_asid;
3194 + bool need_flush;
3195 ++ u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
3196 ++
3197 ++ /*
3198 ++ * Avoid user/user BTB poisoning by flushing the branch
3199 ++ * predictor when switching between processes. This stops
3200 ++ * one process from doing Spectre-v2 attacks on another.
3201 ++ *
3202 ++ * As an optimization, flush indirect branches only when
3203 ++ * switching into processes that disable dumping. This
3204 ++ * protects high value processes like gpg, without having
3205 ++ * too high performance overhead. IBPB is *expensive*!
3206 ++ *
3207 ++ * This will not flush branches when switching into kernel
3208 ++ * threads. It will also not flush if we switch to idle
3209 ++ * thread and back to the same process. It will flush if we
3210 ++ * switch to a different non-dumpable process.
3211 ++ */
3212 ++ if (tsk && tsk->mm &&
3213 ++ tsk->mm->context.ctx_id != last_ctx_id &&
3214 ++ get_dumpable(tsk->mm) != SUID_DUMP_USER)
3215 ++ indirect_branch_prediction_barrier();
3216 +
3217 + if (IS_ENABLED(CONFIG_VMAP_STACK)) {
3218 + /*
3219 +@@ -292,6 +314,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3220 + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
3221 + }
3222 +
3223 ++ /*
3224 ++ * Record last user mm's context id, so we can avoid
3225 ++ * flushing branch buffer with IBPB if we switch back
3226 ++ * to the same user.
3227 ++ */
3228 ++ if (next != &init_mm)
3229 ++ this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
3230 ++
3231 + this_cpu_write(cpu_tlbstate.loaded_mm, next);
3232 + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
3233 + }
3234 +@@ -369,6 +399,7 @@ void initialize_tlbstate_and_flush(void)
3235 + write_cr3(build_cr3(mm->pgd, 0));
3236 +
3237 + /* Reinitialize tlbstate. */
3238 ++ this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
3239 + this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
3240 + this_cpu_write(cpu_tlbstate.next_asid, 1);
3241 + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
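The comments in the tlb.c hunk describe the IBPB policy: fire the barrier only when switching to a different user mm, and only when that mm disables dumping, with the last user ctx_id remembered so that bouncing through a kernel or idle thread and back does not cost an extra barrier. A stripped-down sketch of that predicate, using stub types and assumed SUID_DUMP_* values rather than the kernel's definitions:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    enum { SUID_DUMP_DISABLE = 0, SUID_DUMP_USER = 1 };  /* assumed values */

    struct mm { uint64_t ctx_id; int dumpable; };

    /* Mirror of the condition guarding indirect_branch_prediction_barrier():
     * a real mm, not the one we last ran user code from, and not dumpable. */
    static bool needs_ibpb(const struct mm *next, uint64_t last_ctx_id)
    {
            return next && next->ctx_id != last_ctx_id &&
                   next->dumpable != SUID_DUMP_USER;
    }

    int main(void)
    {
            struct mm gpg = { .ctx_id = 7, .dumpable = SUID_DUMP_DISABLE };

            printf("%d\n", needs_ibpb(&gpg, 3)); /* 1: different, non-dumpable mm */
            printf("%d\n", needs_ibpb(&gpg, 7)); /* 0: same mm we came from */
            return 0;
    }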
3242 +diff --git a/drivers/auxdisplay/img-ascii-lcd.c b/drivers/auxdisplay/img-ascii-lcd.c
3243 +index a9020f82eea7..58403052514f 100644
3244 +--- a/drivers/auxdisplay/img-ascii-lcd.c
3245 ++++ b/drivers/auxdisplay/img-ascii-lcd.c
3246 +@@ -443,3 +443,7 @@ static struct platform_driver img_ascii_lcd_driver = {
3247 + .remove = img_ascii_lcd_remove,
3248 + };
3249 + module_platform_driver(img_ascii_lcd_driver);
3250 ++
3251 ++MODULE_DESCRIPTION("Imagination Technologies ASCII LCD Display");
3252 ++MODULE_AUTHOR("Paul Burton <paul.burton@××××.com>");
3253 ++MODULE_LICENSE("GPL");
3254 +diff --git a/drivers/fpga/fpga-region.c b/drivers/fpga/fpga-region.c
3255 +index d9ab7c75b14f..e0c73ceba2ed 100644
3256 +--- a/drivers/fpga/fpga-region.c
3257 ++++ b/drivers/fpga/fpga-region.c
3258 +@@ -147,6 +147,7 @@ static struct fpga_manager *fpga_region_get_manager(struct fpga_region *region)
3259 + mgr_node = of_parse_phandle(np, "fpga-mgr", 0);
3260 + if (mgr_node) {
3261 + mgr = of_fpga_mgr_get(mgr_node);
3262 ++ of_node_put(mgr_node);
3263 + of_node_put(np);
3264 + return mgr;
3265 + }
3266 +@@ -192,10 +193,13 @@ static int fpga_region_get_bridges(struct fpga_region *region,
3267 + parent_br = region_np->parent;
3268 +
3269 + /* If overlay has a list of bridges, use it. */
3270 +- if (of_parse_phandle(overlay, "fpga-bridges", 0))
3271 ++ br = of_parse_phandle(overlay, "fpga-bridges", 0);
3272 ++ if (br) {
3273 ++ of_node_put(br);
3274 + np = overlay;
3275 +- else
3276 ++ } else {
3277 + np = region_np;
3278 ++ }
3279 +
3280 + for (i = 0; ; i++) {
3281 + br = of_parse_phandle(np, "fpga-bridges", i);
3282 +@@ -203,12 +207,15 @@ static int fpga_region_get_bridges(struct fpga_region *region,
3283 + break;
3284 +
3285 + /* If parent bridge is in list, skip it. */
3286 +- if (br == parent_br)
3287 ++ if (br == parent_br) {
3288 ++ of_node_put(br);
3289 + continue;
3290 ++ }
3291 +
3292 + /* If node is a bridge, get it and add to list */
3293 + ret = fpga_bridge_get_to_list(br, region->info,
3294 + &region->bridge_list);
3295 ++ of_node_put(br);
3296 +
3297 + /* If any of the bridges are in use, give up */
3298 + if (ret == -EBUSY) {
3299 +diff --git a/drivers/iio/accel/kxsd9-i2c.c b/drivers/iio/accel/kxsd9-i2c.c
3300 +index 98fbb628d5bd..38411e1c155b 100644
3301 +--- a/drivers/iio/accel/kxsd9-i2c.c
3302 ++++ b/drivers/iio/accel/kxsd9-i2c.c
3303 +@@ -63,3 +63,6 @@ static struct i2c_driver kxsd9_i2c_driver = {
3304 + .id_table = kxsd9_i2c_id,
3305 + };
3306 + module_i2c_driver(kxsd9_i2c_driver);
3307 ++
3308 ++MODULE_LICENSE("GPL v2");
3309 ++MODULE_DESCRIPTION("KXSD9 accelerometer I2C interface");
3310 +diff --git a/drivers/iio/adc/qcom-vadc-common.c b/drivers/iio/adc/qcom-vadc-common.c
3311 +index 47d24ae5462f..fe3d7826783c 100644
3312 +--- a/drivers/iio/adc/qcom-vadc-common.c
3313 ++++ b/drivers/iio/adc/qcom-vadc-common.c
3314 +@@ -5,6 +5,7 @@
3315 + #include <linux/math64.h>
3316 + #include <linux/log2.h>
3317 + #include <linux/err.h>
3318 ++#include <linux/module.h>
3319 +
3320 + #include "qcom-vadc-common.h"
3321 +
3322 +@@ -229,3 +230,6 @@ int qcom_vadc_decimation_from_dt(u32 value)
3323 + return __ffs64(value / VADC_DECIMATION_MIN);
3324 + }
3325 + EXPORT_SYMBOL(qcom_vadc_decimation_from_dt);
3326 ++
3327 ++MODULE_LICENSE("GPL v2");
3328 ++MODULE_DESCRIPTION("Qualcomm ADC common functionality");
3329 +diff --git a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
3330 +index 866aa3ce1ac9..6cf0006d4c8d 100644
3331 +--- a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
3332 ++++ b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
3333 +@@ -436,3 +436,7 @@ int pxa2xx_pinctrl_exit(struct platform_device *pdev)
3334 + return 0;
3335 + }
3336 + EXPORT_SYMBOL_GPL(pxa2xx_pinctrl_exit);
3337 ++
3338 ++MODULE_AUTHOR("Robert Jarzmik <robert.jarzmik@××××.fr>");
3339 ++MODULE_DESCRIPTION("Marvell PXA2xx pinctrl driver");
3340 ++MODULE_LICENSE("GPL v2");
3341 +diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
3342 +index 3a14cccbd7ff..7948acf14601 100644
3343 +--- a/drivers/tty/serial/serial_core.c
3344 ++++ b/drivers/tty/serial/serial_core.c
3345 +@@ -987,6 +987,8 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port,
3346 + }
3347 + } else {
3348 + retval = uart_startup(tty, state, 1);
3349 ++ if (retval == 0)
3350 ++ tty_port_set_initialized(port, true);
3351 + if (retval > 0)
3352 + retval = 0;
3353 + }
3354 +diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
3355 +index 1c65817673db..41615f38bcff 100644
3356 +--- a/include/linux/fdtable.h
3357 ++++ b/include/linux/fdtable.h
3358 +@@ -10,6 +10,7 @@
3359 + #include <linux/compiler.h>
3360 + #include <linux/spinlock.h>
3361 + #include <linux/rcupdate.h>
3362 ++#include <linux/nospec.h>
3363 + #include <linux/types.h>
3364 + #include <linux/init.h>
3365 + #include <linux/fs.h>
3366 +@@ -82,8 +83,10 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i
3367 + {
3368 + struct fdtable *fdt = rcu_dereference_raw(files->fdt);
3369 +
3370 +- if (fd < fdt->max_fds)
3371 ++ if (fd < fdt->max_fds) {
3372 ++ fd = array_index_nospec(fd, fdt->max_fds);
3373 + return rcu_dereference_raw(fdt->fd[fd]);
3374 ++ }
3375 + return NULL;
3376 + }
3377 +
3378 +diff --git a/include/linux/init.h b/include/linux/init.h
3379 +index f38b993edacb..943139a563e3 100644
3380 +--- a/include/linux/init.h
3381 ++++ b/include/linux/init.h
3382 +@@ -5,6 +5,13 @@
3383 + #include <linux/compiler.h>
3384 + #include <linux/types.h>
3385 +
3386 ++/* Built-in __init functions needn't be compiled with retpoline */
3387 ++#if defined(RETPOLINE) && !defined(MODULE)
3388 ++#define __noretpoline __attribute__((indirect_branch("keep")))
3389 ++#else
3390 ++#define __noretpoline
3391 ++#endif
3392 ++
3393 + /* These macros are used to mark some functions or
3394 + * initialized data (doesn't apply to uninitialized data)
3395 + * as `initialization' functions. The kernel can take this
3396 +@@ -40,7 +47,7 @@
3397 +
3398 + /* These are for everybody (although not all archs will actually
3399 + discard it in modules) */
3400 +-#define __init __section(.init.text) __cold __inittrace __latent_entropy
3401 ++#define __init __section(.init.text) __cold __inittrace __latent_entropy __noretpoline
3402 + #define __initdata __section(.init.data)
3403 + #define __initconst __section(.init.rodata)
3404 + #define __exitdata __section(.exit.data)
3405 +diff --git a/include/linux/module.h b/include/linux/module.h
3406 +index fe5aa3736707..b1cc541f2ddf 100644
3407 +--- a/include/linux/module.h
3408 ++++ b/include/linux/module.h
3409 +@@ -794,6 +794,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr,
3410 + static inline void module_bug_cleanup(struct module *mod) {}
3411 + #endif /* CONFIG_GENERIC_BUG */
3412 +
3413 ++#ifdef RETPOLINE
3414 ++extern bool retpoline_module_ok(bool has_retpoline);
3415 ++#else
3416 ++static inline bool retpoline_module_ok(bool has_retpoline)
3417 ++{
3418 ++ return true;
3419 ++}
3420 ++#endif
3421 ++
3422 + #ifdef CONFIG_MODULE_SIG
3423 + static inline bool module_sig_ok(struct module *module)
3424 + {
3425 +diff --git a/include/linux/nospec.h b/include/linux/nospec.h
3426 +new file mode 100644
3427 +index 000000000000..b99bced39ac2
3428 +--- /dev/null
3429 ++++ b/include/linux/nospec.h
3430 +@@ -0,0 +1,72 @@
3431 ++// SPDX-License-Identifier: GPL-2.0
3432 ++// Copyright(c) 2018 Linus Torvalds. All rights reserved.
3433 ++// Copyright(c) 2018 Alexei Starovoitov. All rights reserved.
3434 ++// Copyright(c) 2018 Intel Corporation. All rights reserved.
3435 ++
3436 ++#ifndef _LINUX_NOSPEC_H
3437 ++#define _LINUX_NOSPEC_H
3438 ++
3439 ++/**
3440 ++ * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
3441 ++ * @index: array element index
3442 ++ * @size: number of elements in array
3443 ++ *
3444 ++ * When @index is out of bounds (@index >= @size), the sign bit will be
3445 ++ * set. Extend the sign bit to all bits and invert, giving a result of
3446 ++ * zero for an out of bounds index, or ~0 if within bounds [0, @size).
3447 ++ */
3448 ++#ifndef array_index_mask_nospec
3449 ++static inline unsigned long array_index_mask_nospec(unsigned long index,
3450 ++ unsigned long size)
3451 ++{
3452 ++ /*
3453 ++ * Warn developers about inappropriate array_index_nospec() usage.
3454 ++ *
3455 ++ * Even if the CPU speculates past the WARN_ONCE branch, the
3456 ++ * sign bit of @index is taken into account when generating the
3457 ++ * mask.
3458 ++ *
3459 ++ * This warning is compiled out when the compiler can infer that
3460 ++ * @index and @size are less than LONG_MAX.
3461 ++ */
3462 ++ if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX,
3463 ++ "array_index_nospec() limited to range of [0, LONG_MAX]\n"))
3464 ++ return 0;
3465 ++
3466 ++ /*
3467 ++ * Always calculate and emit the mask even if the compiler
3468 ++ * thinks the mask is not needed. The compiler does not take
3469 ++ * into account the value of @index under speculation.
3470 ++ */
3471 ++ OPTIMIZER_HIDE_VAR(index);
3472 ++ return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
3473 ++}
3474 ++#endif
3475 ++
3476 ++/*
3477 ++ * array_index_nospec - sanitize an array index after a bounds check
3478 ++ *
3479 ++ * For a code sequence like:
3480 ++ *
3481 ++ * if (index < size) {
3482 ++ * index = array_index_nospec(index, size);
3483 ++ * val = array[index];
3484 ++ * }
3485 ++ *
3486 ++ * ...if the CPU speculates past the bounds check then
3487 ++ * array_index_nospec() will clamp the index within the range of [0,
3488 ++ * size).
3489 ++ */
3490 ++#define array_index_nospec(index, size) \
3491 ++({ \
3492 ++ typeof(index) _i = (index); \
3493 ++ typeof(size) _s = (size); \
3494 ++ unsigned long _mask = array_index_mask_nospec(_i, _s); \
3495 ++ \
3496 ++ BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \
3497 ++ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \
3498 ++ \
3499 ++ _i &= _mask; \
3500 ++ _i; \
3501 ++})
3502 ++#endif /* _LINUX_NOSPEC_H */
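The new include/linux/nospec.h is small enough to exercise in user space. The sketch below lifts the mask formula verbatim from the hunk above and applies it the way the header comment describes: bounds check first, then clamp the index before the dependent load. BITS_PER_LONG is defined locally here, and the arithmetic right shift of a negative value is relied on just as the kernel relies on it:

    #include <stdio.h>

    #define BITS_PER_LONG (8 * (int)sizeof(long))

    /* Same formula as the patch: ~0UL when index is in [0, size), 0 otherwise. */
    static unsigned long index_mask(unsigned long index, unsigned long size)
    {
            return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
    }

    #define MAX_ELEMS 8
    static int array[MAX_ELEMS] = { 10, 11, 12, 13, 14, 15, 16, 17 };

    static int load_array(unsigned int index)
    {
            if (index >= MAX_ELEMS)
                    return 0;
            index &= index_mask(index, MAX_ELEMS); /* stays in [0, MAX_ELEMS) under speculation */
            return array[index];
    }

    int main(void)
    {
            printf("%d %d\n", load_array(5), load_array(123456));  /* 15 0 */
            return 0;
    }

This is the same pattern the patch applies in __fcheck_files and parse_txq_params further down: validate the untrusted index, then mask it before it feeds a dependent memory access.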
3503 +diff --git a/kernel/module.c b/kernel/module.c
3504 +index de66ec825992..690c0651c40f 100644
3505 +--- a/kernel/module.c
3506 ++++ b/kernel/module.c
3507 +@@ -2855,6 +2855,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
3508 + }
3509 + #endif /* CONFIG_LIVEPATCH */
3510 +
3511 ++static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
3512 ++{
3513 ++ if (retpoline_module_ok(get_modinfo(info, "retpoline")))
3514 ++ return;
3515 ++
3516 ++ pr_warn("%s: loading module not compiled with retpoline compiler.\n",
3517 ++ mod->name);
3518 ++}
3519 ++
3520 + /* Sets info->hdr and info->len. */
3521 + static int copy_module_from_user(const void __user *umod, unsigned long len,
3522 + struct load_info *info)
3523 +@@ -3021,6 +3030,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
3524 + add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
3525 + }
3526 +
3527 ++ check_modinfo_retpoline(mod, info);
3528 ++
3529 + if (get_modinfo(info, "staging")) {
3530 + add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
3531 + pr_warn("%s: module is from the staging directory, the quality "
3532 +diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
3533 +index d396cb61a280..81bef0676e1d 100644
3534 +--- a/net/wireless/nl80211.c
3535 ++++ b/net/wireless/nl80211.c
3536 +@@ -16,6 +16,7 @@
3537 + #include <linux/nl80211.h>
3538 + #include <linux/rtnetlink.h>
3539 + #include <linux/netlink.h>
3540 ++#include <linux/nospec.h>
3541 + #include <linux/etherdevice.h>
3542 + #include <net/net_namespace.h>
3543 + #include <net/genetlink.h>
3544 +@@ -2056,20 +2057,22 @@ static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = {
3545 + static int parse_txq_params(struct nlattr *tb[],
3546 + struct ieee80211_txq_params *txq_params)
3547 + {
3548 ++ u8 ac;
3549 ++
3550 + if (!tb[NL80211_TXQ_ATTR_AC] || !tb[NL80211_TXQ_ATTR_TXOP] ||
3551 + !tb[NL80211_TXQ_ATTR_CWMIN] || !tb[NL80211_TXQ_ATTR_CWMAX] ||
3552 + !tb[NL80211_TXQ_ATTR_AIFS])
3553 + return -EINVAL;
3554 +
3555 +- txq_params->ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]);
3556 ++ ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]);
3557 + txq_params->txop = nla_get_u16(tb[NL80211_TXQ_ATTR_TXOP]);
3558 + txq_params->cwmin = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMIN]);
3559 + txq_params->cwmax = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMAX]);
3560 + txq_params->aifs = nla_get_u8(tb[NL80211_TXQ_ATTR_AIFS]);
3561 +
3562 +- if (txq_params->ac >= NL80211_NUM_ACS)
3563 ++ if (ac >= NL80211_NUM_ACS)
3564 + return -EINVAL;
3565 +-
3566 ++ txq_params->ac = array_index_nospec(ac, NL80211_NUM_ACS);
3567 + return 0;
3568 + }
3569 +
3570 +diff --git a/scripts/faddr2line b/scripts/faddr2line
3571 +index 39e07d8574dd..7721d5b2b0c0 100755
3572 +--- a/scripts/faddr2line
3573 ++++ b/scripts/faddr2line
3574 +@@ -44,10 +44,10 @@
3575 + set -o errexit
3576 + set -o nounset
3577 +
3578 +-READELF="${CROSS_COMPILE}readelf"
3579 +-ADDR2LINE="${CROSS_COMPILE}addr2line"
3580 +-SIZE="${CROSS_COMPILE}size"
3581 +-NM="${CROSS_COMPILE}nm"
3582 ++READELF="${CROSS_COMPILE:-}readelf"
3583 ++ADDR2LINE="${CROSS_COMPILE:-}addr2line"
3584 ++SIZE="${CROSS_COMPILE:-}size"
3585 ++NM="${CROSS_COMPILE:-}nm"
3586 +
3587 + command -v awk >/dev/null 2>&1 || die "awk isn't installed"
3588 + command -v ${READELF} >/dev/null 2>&1 || die "readelf isn't installed"
3589 +diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
3590 +index 98314b400a95..54deaa1066cf 100644
3591 +--- a/scripts/mod/modpost.c
3592 ++++ b/scripts/mod/modpost.c
3593 +@@ -2165,6 +2165,14 @@ static void add_intree_flag(struct buffer *b, int is_intree)
3594 + buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n");
3595 + }
3596 +
3597 ++/* Cannot check for assembler */
3598 ++static void add_retpoline(struct buffer *b)
3599 ++{
3600 ++ buf_printf(b, "\n#ifdef RETPOLINE\n");
3601 ++ buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n");
3602 ++ buf_printf(b, "#endif\n");
3603 ++}
3604 ++
3605 + static void add_staging_flag(struct buffer *b, const char *name)
3606 + {
3607 + static const char *staging_dir = "drivers/staging";
3608 +@@ -2506,6 +2514,7 @@ int main(int argc, char **argv)
3609 + err |= check_modname_len(mod);
3610 + add_header(&buf, mod);
3611 + add_intree_flag(&buf, !external_module);
3612 ++ add_retpoline(&buf);
3613 + add_staging_flag(&buf, mod->name);
3614 + err |= add_versions(&buf, mod);
3615 + add_depends(&buf, mod, modules);
3616 +diff --git a/sound/soc/codecs/pcm512x-spi.c b/sound/soc/codecs/pcm512x-spi.c
3617 +index 712ed6598c48..ebdf9bd5a64c 100644
3618 +--- a/sound/soc/codecs/pcm512x-spi.c
3619 ++++ b/sound/soc/codecs/pcm512x-spi.c
3620 +@@ -70,3 +70,7 @@ static struct spi_driver pcm512x_spi_driver = {
3621 + };
3622 +
3623 + module_spi_driver(pcm512x_spi_driver);
3624 ++
3625 ++MODULE_DESCRIPTION("ASoC PCM512x codec driver - SPI");
3626 ++MODULE_AUTHOR("Mark Brown <broonie@××××××.org>");
3627 ++MODULE_LICENSE("GPL v2");
3628 +diff --git a/tools/objtool/check.c b/tools/objtool/check.c
3629 +index f40d46e24bcc..9cd028aa1509 100644
3630 +--- a/tools/objtool/check.c
3631 ++++ b/tools/objtool/check.c
3632 +@@ -543,18 +543,14 @@ static int add_call_destinations(struct objtool_file *file)
3633 + dest_off = insn->offset + insn->len + insn->immediate;
3634 + insn->call_dest = find_symbol_by_offset(insn->sec,
3635 + dest_off);
3636 +- /*
3637 +- * FIXME: Thanks to retpolines, it's now considered
3638 +- * normal for a function to call within itself. So
3639 +- * disable this warning for now.
3640 +- */
3641 +-#if 0
3642 +- if (!insn->call_dest) {
3643 +- WARN_FUNC("can't find call dest symbol at offset 0x%lx",
3644 +- insn->sec, insn->offset, dest_off);
3645 ++
3646 ++ if (!insn->call_dest && !insn->ignore) {
3647 ++ WARN_FUNC("unsupported intra-function call",
3648 ++ insn->sec, insn->offset);
3649 ++ WARN("If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE.");
3650 + return -1;
3651 + }
3652 +-#endif
3653 ++
3654 + } else if (rela->sym->type == STT_SECTION) {
3655 + insn->call_dest = find_symbol_by_offset(rela->sym->sec,
3656 + rela->addend+4);
3657 +@@ -598,7 +594,7 @@ static int handle_group_alt(struct objtool_file *file,
3658 + struct instruction *orig_insn,
3659 + struct instruction **new_insn)
3660 + {
3661 +- struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump;
3662 ++ struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump = NULL;
3663 + unsigned long dest_off;
3664 +
3665 + last_orig_insn = NULL;
3666 +@@ -614,28 +610,30 @@ static int handle_group_alt(struct objtool_file *file,
3667 + last_orig_insn = insn;
3668 + }
3669 +
3670 +- if (!next_insn_same_sec(file, last_orig_insn)) {
3671 +- WARN("%s: don't know how to handle alternatives at end of section",
3672 +- special_alt->orig_sec->name);
3673 +- return -1;
3674 +- }
3675 +-
3676 +- fake_jump = malloc(sizeof(*fake_jump));
3677 +- if (!fake_jump) {
3678 +- WARN("malloc failed");
3679 +- return -1;
3680 ++ if (next_insn_same_sec(file, last_orig_insn)) {
3681 ++ fake_jump = malloc(sizeof(*fake_jump));
3682 ++ if (!fake_jump) {
3683 ++ WARN("malloc failed");
3684 ++ return -1;
3685 ++ }
3686 ++ memset(fake_jump, 0, sizeof(*fake_jump));
3687 ++ INIT_LIST_HEAD(&fake_jump->alts);
3688 ++ clear_insn_state(&fake_jump->state);
3689 ++
3690 ++ fake_jump->sec = special_alt->new_sec;
3691 ++ fake_jump->offset = -1;
3692 ++ fake_jump->type = INSN_JUMP_UNCONDITIONAL;
3693 ++ fake_jump->jump_dest = list_next_entry(last_orig_insn, list);
3694 ++ fake_jump->ignore = true;
3695 + }
3696 +- memset(fake_jump, 0, sizeof(*fake_jump));
3697 +- INIT_LIST_HEAD(&fake_jump->alts);
3698 +- clear_insn_state(&fake_jump->state);
3699 +-
3700 +- fake_jump->sec = special_alt->new_sec;
3701 +- fake_jump->offset = -1;
3702 +- fake_jump->type = INSN_JUMP_UNCONDITIONAL;
3703 +- fake_jump->jump_dest = list_next_entry(last_orig_insn, list);
3704 +- fake_jump->ignore = true;
3705 +
3706 + if (!special_alt->new_len) {
3707 ++ if (!fake_jump) {
3708 ++ WARN("%s: empty alternative at end of section",
3709 ++ special_alt->orig_sec->name);
3710 ++ return -1;
3711 ++ }
3712 ++
3713 + *new_insn = fake_jump;
3714 + return 0;
3715 + }
3716 +@@ -648,6 +646,8 @@ static int handle_group_alt(struct objtool_file *file,
3717 +
3718 + last_new_insn = insn;
3719 +
3720 ++ insn->ignore = orig_insn->ignore_alts;
3721 ++
3722 + if (insn->type != INSN_JUMP_CONDITIONAL &&
3723 + insn->type != INSN_JUMP_UNCONDITIONAL)
3724 + continue;
3725 +@@ -656,8 +656,14 @@ static int handle_group_alt(struct objtool_file *file,
3726 + continue;
3727 +
3728 + dest_off = insn->offset + insn->len + insn->immediate;
3729 +- if (dest_off == special_alt->new_off + special_alt->new_len)
3730 ++ if (dest_off == special_alt->new_off + special_alt->new_len) {
3731 ++ if (!fake_jump) {
3732 ++ WARN("%s: alternative jump to end of section",
3733 ++ special_alt->orig_sec->name);
3734 ++ return -1;
3735 ++ }
3736 + insn->jump_dest = fake_jump;
3737 ++ }
3738 +
3739 + if (!insn->jump_dest) {
3740 + WARN_FUNC("can't find alternative jump destination",
3741 +@@ -672,7 +678,8 @@ static int handle_group_alt(struct objtool_file *file,
3742 + return -1;
3743 + }
3744 +
3745 +- list_add(&fake_jump->list, &last_new_insn->list);
3746 ++ if (fake_jump)
3747 ++ list_add(&fake_jump->list, &last_new_insn->list);
3748 +
3749 + return 0;
3750 + }
3751 +@@ -729,10 +736,6 @@ static int add_special_section_alts(struct objtool_file *file)
3752 + goto out;
3753 + }
3754 +
3755 +- /* Ignore retpoline alternatives. */
3756 +- if (orig_insn->ignore_alts)
3757 +- continue;
3758 +-
3759 + new_insn = NULL;
3760 + if (!special_alt->group || special_alt->new_len) {
3761 + new_insn = find_insn(file, special_alt->new_sec,
3762 +@@ -1089,11 +1092,11 @@ static int decode_sections(struct objtool_file *file)
3763 + if (ret)
3764 + return ret;
3765 +
3766 +- ret = add_call_destinations(file);
3767 ++ ret = add_special_section_alts(file);
3768 + if (ret)
3769 + return ret;
3770 +
3771 +- ret = add_special_section_alts(file);
3772 ++ ret = add_call_destinations(file);
3773 + if (ret)
3774 + return ret;
3775 +
3776 +@@ -1720,10 +1723,12 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
3777 +
3778 + insn->visited = true;
3779 +
3780 +- list_for_each_entry(alt, &insn->alts, list) {
3781 +- ret = validate_branch(file, alt->insn, state);
3782 +- if (ret)
3783 +- return 1;
3784 ++ if (!insn->ignore_alts) {
3785 ++ list_for_each_entry(alt, &insn->alts, list) {
3786 ++ ret = validate_branch(file, alt->insn, state);
3787 ++ if (ret)
3788 ++ return 1;
3789 ++ }
3790 + }
3791 +
3792 + switch (insn->type) {
3793 +diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
3794 +index e61fe703197b..18384d9be4e1 100644
3795 +--- a/tools/objtool/orc_gen.c
3796 ++++ b/tools/objtool/orc_gen.c
3797 +@@ -98,6 +98,11 @@ static int create_orc_entry(struct section *u_sec, struct section *ip_relasec,
3798 + struct orc_entry *orc;
3799 + struct rela *rela;
3800 +
3801 ++ if (!insn_sec->sym) {
3802 ++ WARN("missing symbol for section %s", insn_sec->name);
3803 ++ return -1;
3804 ++ }
3805 ++
3806 + /* populate ORC data */
3807 + orc = (struct orc_entry *)u_sec->data->d_buf + idx;
3808 + memcpy(orc, o, sizeof(*orc));