commit:     e758eda21d87b283279b863edee5538d33067904
Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
AuthorDate: Sat Nov 16 10:54:45 2019 +0000
Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
CommitDate: Sat Nov 16 10:54:45 2019 +0000
URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=e758eda2

Linux patch 4.9.202

Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>

 0000_README              |    4 +
 1201_linux-4.9.202.patch | 3319 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 3323 insertions(+)

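For reference, once this patch is applied and the kernel rebuilt, the two new
sysfs vulnerability files it introduces can be queried directly. A sketch of
what that looks like (example output only — the exact strings depend on CPU,
microcode and command line; see the tables in the patched documentation):

    $ cat /sys/devices/system/cpu/vulnerabilities/tsx_async_abort
    Mitigation: TSX disabled
    $ cat /sys/devices/system/cpu/vulnerabilities/itlb_multihit
    KVM: Mitigation: Split huge pages
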
diff --git a/0000_README b/0000_README
index 7d079d7..33f5858 100644
--- a/0000_README
+++ b/0000_README
@@ -847,6 +847,10 @@ Patch:  1200_linux-4.9.201.patch
 From:   http://www.kernel.org
 Desc:   Linux 4.9.201
 
+Patch:  1201_linux-4.9.202.patch
+From:   http://www.kernel.org
+Desc:   Linux 4.9.202
+
 Patch:  1500_XATTR_USER_PREFIX.patch
 From:   https://bugs.gentoo.org/show_bug.cgi?id=470644
 Desc:   Support for namespace user.pax.* on tmpfs.
 
diff --git a/1201_linux-4.9.202.patch b/1201_linux-4.9.202.patch
new file mode 100644
index 0000000..bbc562b
--- /dev/null
+++ b/1201_linux-4.9.202.patch
@@ -0,0 +1,3319 @@
+diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
+index cadb7a9a5218..b41046b5713b 100644
+--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
++++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
+@@ -358,6 +358,8 @@ What:		/sys/devices/system/cpu/vulnerabilities
+ 		/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
+ 		/sys/devices/system/cpu/vulnerabilities/l1tf
+ 		/sys/devices/system/cpu/vulnerabilities/mds
++		/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
++		/sys/devices/system/cpu/vulnerabilities/itlb_multihit
+ Date:		January 2018
+ Contact:	Linux kernel mailing list <linux-kernel@×××××××××××.org>
+ Description:	Information about CPU vulnerabilities
+diff --git a/Documentation/hw-vuln/index.rst b/Documentation/hw-vuln/index.rst
+index ffc064c1ec68..24f53c501366 100644
+--- a/Documentation/hw-vuln/index.rst
++++ b/Documentation/hw-vuln/index.rst
+@@ -11,3 +11,5 @@ are configurable at compile, boot or run time.
+ 
+    l1tf
+    mds
++   tsx_async_abort
++   multihit.rst
+diff --git a/Documentation/hw-vuln/multihit.rst b/Documentation/hw-vuln/multihit.rst
+new file mode 100644
+index 000000000000..ba9988d8bce5
+--- /dev/null
++++ b/Documentation/hw-vuln/multihit.rst
+@@ -0,0 +1,163 @@
++iTLB multihit
++=============
++
++iTLB multihit is an erratum where some processors may incur a machine check
++error, possibly resulting in an unrecoverable CPU lockup, when an
++instruction fetch hits multiple entries in the instruction TLB. This can
++occur when the page size is changed along with either the physical address
++or cache type. A malicious guest running on a virtualized system can
++exploit this erratum to perform a denial of service attack.
++
++
++Affected processors
++-------------------
++
++Variations of this erratum are present on most Intel Core and Xeon processor
++models. The erratum is not present on:
++
++   - non-Intel processors
++
++   - Some Atoms (Airmont, Bonnell, Goldmont, GoldmontPlus, Saltwell, Silvermont)
++
++   - Intel processors that have the PSCHANGE_MC_NO bit set in the
++     IA32_ARCH_CAPABILITIES MSR.
++
++
++Related CVEs
++------------
++
++The following CVE entry is related to this issue:
++
++   ============== =================================================
++   CVE-2018-12207 Machine Check Error Avoidance on Page Size Change
++   ============== =================================================
++
++
++Problem
++-------
++
++Privileged software, including the OS and virtual machine managers (VMM), is in
++charge of memory management. A key component in memory management is the control
++of the page tables. Modern processors use virtual memory, a technique that creates
++the illusion of a very large memory for processors. This virtual space is split
++into pages of a given size. Page tables translate virtual addresses to physical
++addresses.
++
++To reduce latency when performing a virtual to physical address translation,
++processors include a structure, called TLB, that caches recent translations.
++There are separate TLBs for instruction (iTLB) and data (dTLB).
++
++Under this erratum, instructions are fetched from a linear address translated
++using a 4 KB translation cached in the iTLB. Privileged software modifies the
++paging structure so that the same linear address is mapped using a large page
++size (2 MB, 4 MB, 1 GB) with a different physical address or memory type. After the page
++structure modification but before the software invalidates any iTLB entries for
++the linear address, a code fetch that happens on the same linear address may
++cause a machine-check error which can result in a system hang or shutdown.
++
++
++Attack scenarios
++----------------
++
++Attacks against the iTLB multihit erratum can be mounted from malicious
++guests in a virtualized system.
++
++
++iTLB multihit system information
++--------------------------------
++
++The Linux kernel provides a sysfs interface to enumerate the current iTLB
++multihit status of the system: whether the system is vulnerable and which
++mitigations are active. The relevant sysfs file is:
++
++/sys/devices/system/cpu/vulnerabilities/itlb_multihit
++
++The possible values in this file are:
++
++.. list-table::
++
++   * - Not affected
++     - The processor is not vulnerable.
++   * - KVM: Mitigation: Split huge pages
++     - Software changes mitigate this issue.
++   * - KVM: Vulnerable
++     - The processor is vulnerable, but no mitigation enabled
++
++
++Enumeration of the erratum
++--------------------------------
++
++A new bit has been allocated in the IA32_ARCH_CAPABILITIES (PSCHANGE_MC_NO) MSR
++and will be set on CPUs which are mitigated against this issue.
++
++   ======================================= =========== ===============================
++   IA32_ARCH_CAPABILITIES MSR              Not present Possibly vulnerable,check model
++   IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO]  '0'         Likely vulnerable,check model
++   IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO]  '1'         Not vulnerable
++   ======================================= =========== ===============================
++
++
++Mitigation mechanism
++-------------------------
++
++This erratum can be mitigated by restricting the use of large page sizes to
++non-executable pages. This forces all iTLB entries to be 4K, and removes
++the possibility of multiple hits.
++
++In order to mitigate the vulnerability, KVM initially marks all huge pages
++as non-executable. If the guest attempts to execute in one of those pages,
++the page is broken down into 4K pages, which are then marked executable.
++
++If EPT is disabled or not available on the host, KVM is in control of TLB
++flushes and the problematic situation cannot happen. However, the shadow
++EPT paging mechanism used by nested virtualization is vulnerable, because
++the nested guest can trigger multiple iTLB hits by modifying its own
++(non-nested) page tables. For simplicity, KVM will make large pages
++non-executable in all shadow paging modes.
++
++Mitigation control on the kernel command line and KVM - module parameter
++------------------------------------------------------------------------
++
++The KVM hypervisor mitigation mechanism for marking huge pages as
++non-executable can be controlled with a module parameter "nx_huge_pages=".
++The kernel command line allows to control the iTLB multihit mitigations at
++boot time with the option "kvm.nx_huge_pages=".
++
++The valid arguments for these options are:
++
++   ========== ================================================================
++   force      Mitigation is enabled. In this case, the mitigation implements
++              non-executable huge pages in Linux kernel KVM module. All huge
++              pages in the EPT are marked as non-executable.
++              If a guest attempts to execute in one of those pages, the page is
++              broken down into 4K pages, which are then marked executable.
++
++   off        Mitigation is disabled.
++
++   auto       Enable mitigation only if the platform is affected and the kernel
++              was not booted with the "mitigations=off" command line parameter.
++              This is the default option.
++   ========== ================================================================
++
++
++Mitigation selection guide
++--------------------------
++
++1. No virtualization in use
++^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++   The system is protected by the kernel unconditionally and no further
++   action is required.
++
++2. Virtualization with trusted guests
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++   If the guest comes from a trusted source, you may assume that the guest will
++   not attempt to maliciously exploit these errata and no further action is
++   required.
++
++3. Virtualization with untrusted guests
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++   If the guest comes from an untrusted source, the guest host kernel will need
++   to apply iTLB multihit mitigation via the kernel command line or kvm
++   module parameter.
+diff --git a/Documentation/hw-vuln/tsx_async_abort.rst b/Documentation/hw-vuln/tsx_async_abort.rst
+new file mode 100644
+index 000000000000..fddbd7579c53
+--- /dev/null
++++ b/Documentation/hw-vuln/tsx_async_abort.rst
+@@ -0,0 +1,276 @@
++.. SPDX-License-Identifier: GPL-2.0
++
++TAA - TSX Asynchronous Abort
++======================================
++
++TAA is a hardware vulnerability that allows unprivileged speculative access to
++data which is available in various CPU internal buffers by using asynchronous
++aborts within an Intel TSX transactional region.
++
++Affected processors
++-------------------
++
++This vulnerability only affects Intel processors that support Intel
++Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8)
++is 0 in the IA32_ARCH_CAPABILITIES MSR. On processors where the MDS_NO bit
++(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations
++also mitigate against TAA.
++
++Whether a processor is affected or not can be read out from the TAA
++vulnerability file in sysfs. See :ref:`tsx_async_abort_sys_info`.
++
++Related CVEs
++------------
++
++The following CVE entry is related to this TAA issue:
++
++   ============== ===== ===================================================
++   CVE-2019-11135 TAA   TSX Asynchronous Abort (TAA) condition on some
++                        microprocessors utilizing speculative execution may
++                        allow an authenticated user to potentially enable
++                        information disclosure via a side channel with
++                        local access.
++   ============== ===== ===================================================
++
++Problem
++-------
++
++When performing store, load or L1 refill operations, processors write
++data into temporary microarchitectural structures (buffers). The data in
++those buffers can be forwarded to load operations as an optimization.
++
++Intel TSX is an extension to the x86 instruction set architecture that adds
++hardware transactional memory support to improve performance of multi-threaded
++software. TSX lets the processor expose and exploit concurrency hidden in an
++application due to dynamically avoiding unnecessary synchronization.
++
++TSX supports atomic memory transactions that are either committed (success) or
++aborted. During an abort, operations that happened within the transactional region
++are rolled back. An asynchronous abort takes place, among other options, when a
++different thread accesses a cache line that is also used within the transactional
++region when that access might lead to a data race.
++
++Immediately after an uncompleted asynchronous abort, certain speculatively
++executed loads may read data from those internal buffers and pass it to dependent
++operations. This can be then used to infer the value via a cache side channel
++attack.
++
++Because the buffers are potentially shared between Hyper-Threads, cross
++Hyper-Thread attacks are possible.
++
++The victim of a malicious actor does not need to make use of TSX. Only the
++attacker needs to begin a TSX transaction and raise an asynchronous abort
++which in turn potentially leaks data stored in the buffers.
++
++More detailed technical information is available in the TAA specific x86
++architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`.
++
++
++Attack scenarios
++----------------
++
++Attacks against the TAA vulnerability can be implemented from unprivileged
++applications running on hosts or guests.
++
++As for MDS, the attacker has no control over the memory addresses that can
++be leaked. Only the victim is responsible for bringing data to the CPU. As
++a result, the malicious actor has to sample as much data as possible and
++then postprocess it to try to infer any useful information from it.
++
++A potential attacker only has read access to the data. Also, there is no direct
++privilege escalation by using this technique.
++
++
++.. _tsx_async_abort_sys_info:
++
++TAA system information
++-----------------------
++
++The Linux kernel provides a sysfs interface to enumerate the current TAA status
++of mitigated systems. The relevant sysfs file is:
++
++/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
++
++The possible values in this file are:
++
++.. list-table::
++
++   * - 'Vulnerable'
++     - The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied.
++   * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
++     - The system tries to clear the buffers but the microcode might not support the operation.
++   * - 'Mitigation: Clear CPU buffers'
++     - The microcode has been updated to clear the buffers. TSX is still enabled.
++   * - 'Mitigation: TSX disabled'
++     - TSX is disabled.
++   * - 'Not affected'
++     - The CPU is not affected by this issue.
++
++.. _ucode_needed:
++
++Best effort mitigation mode
++^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++If the processor is vulnerable, but the availability of the microcode-based
++mitigation mechanism is not advertised via CPUID the kernel selects a best
++effort mitigation mode. This mode invokes the mitigation instructions
++without a guarantee that they clear the CPU buffers.
++
++This is done to address virtualization scenarios where the host has the
++microcode update applied, but the hypervisor is not yet updated to expose the
++CPUID to the guest. If the host has updated microcode the protection takes
++effect; otherwise a few CPU cycles are wasted pointlessly.
++
++The state in the tsx_async_abort sysfs file reflects this situation
++accordingly.
++
++
++Mitigation mechanism
++--------------------
++
++The kernel detects the affected CPUs and the presence of the microcode which is
++required. If a CPU is affected and the microcode is available, then the kernel
++enables the mitigation by default.
++
++
++The mitigation can be controlled at boot time via a kernel command line option.
++See :ref:`taa_mitigation_control_command_line`.
++
++.. _virt_mechanism:
++
++Virtualization mitigation
++^^^^^^^^^^^^^^^^^^^^^^^^^
++
++Affected systems where the host has TAA microcode and TAA is mitigated by
++having disabled TSX previously, are not vulnerable regardless of the status
++of the VMs.
++
++In all other cases, if the host either does not have the TAA microcode or
++the kernel is not mitigated, the system might be vulnerable.
++
++
++.. _taa_mitigation_control_command_line:
++
++Mitigation control on the kernel command line
++---------------------------------------------
++
++The kernel command line allows to control the TAA mitigations at boot time with
++the option "tsx_async_abort=". The valid arguments for this option are:
++
++   ============ =============================================================
++   off          This option disables the TAA mitigation on affected platforms.
++                If the system has TSX enabled (see next parameter) and the CPU
++                is affected, the system is vulnerable.
++
++   full         TAA mitigation is enabled. If TSX is enabled, on an affected
++                system it will clear CPU buffers on ring transitions. On
++                systems which are MDS-affected and deploy MDS mitigation,
++                TAA is also mitigated. Specifying this option on those
++                systems will have no effect.
++
++   full,nosmt   The same as tsx_async_abort=full, with SMT disabled on
++                vulnerable CPUs that have TSX enabled. This is the complete
++                mitigation. When TSX is disabled, SMT is not disabled because
++                CPU is not vulnerable to cross-thread TAA attacks.
++   ============ =============================================================
++
++Not specifying this option is equivalent to "tsx_async_abort=full".
++
++The kernel command line also allows to control the TSX feature using the
++parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used
++to control the TSX feature and the enumeration of the TSX feature bits (RTM
++and HLE) in CPUID.
++
++The valid options are:
++
++   ============ =============================================================
++   off          Disables TSX on the system.
++
++                Note that this option takes effect only on newer CPUs which are
++                not vulnerable to MDS, i.e., have MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1
++                and which get the new IA32_TSX_CTRL MSR through a microcode
++                update. This new MSR allows for the reliable deactivation of
++                the TSX functionality.
++
++   on           Enables TSX.
++
++                Although there are mitigations for all known security
++                vulnerabilities, TSX has been known to be an accelerator for
++                several previous speculation-related CVEs, and so there may be
++                unknown security risks associated with leaving it enabled.
++
++   auto         Disables TSX if X86_BUG_TAA is present, otherwise enables TSX
++                on the system.
++   ============ =============================================================
++
++Not specifying this option is equivalent to "tsx=off".
++
++The following combinations of the "tsx_async_abort" and "tsx" are possible. For
++affected platforms tsx=auto is equivalent to tsx=off and the result will be:
++
++   ========= ========================== =========================================
++   tsx=on    tsx_async_abort=full       The system will use VERW to clear CPU
++                                        buffers. Cross-thread attacks are still
++                                        possible on SMT machines.
++   tsx=on    tsx_async_abort=full,nosmt As above, cross-thread attacks on SMT
++                                        mitigated.
++   tsx=on    tsx_async_abort=off        The system is vulnerable.
++   tsx=off   tsx_async_abort=full       TSX might be disabled if microcode
++                                        provides a TSX control MSR. If so,
++                                        system is not vulnerable.
++   tsx=off   tsx_async_abort=full,nosmt Ditto
++   tsx=off   tsx_async_abort=off        Ditto
++   ========= ========================== =========================================
++
++
++For unaffected platforms "tsx=on" and "tsx_async_abort=full" does not clear CPU
++buffers. For platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0)
++"tsx" command line argument has no effect.
++
++For the affected platforms, the table below indicates the mitigation status for the
++combinations of CPUID bit MD_CLEAR and IA32_ARCH_CAPABILITIES MSR bits MDS_NO
++and TSX_CTRL_MSR.
++
++   ======= ========= ============= ========================================
++   MDS_NO  MD_CLEAR  TSX_CTRL_MSR  Status
++   ======= ========= ============= ========================================
++    0       0         0            Vulnerable (needs microcode)
++    0       1         0            MDS and TAA mitigated via VERW
++    1       1         0            MDS fixed, TAA vulnerable if TSX enabled
++                                   because MD_CLEAR has no meaning and
++                                   VERW is not guaranteed to clear buffers
++    1       X         1            MDS fixed, TAA can be mitigated by
++                                   VERW or TSX_CTRL_MSR
++   ======= ========= ============= ========================================
++
++Mitigation selection guide
++--------------------------
++
++1. Trusted userspace and guests
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++If all user space applications are from a trusted source and do not execute
++untrusted code which is supplied externally, then the mitigation can be
++disabled. The same applies to virtualized environments with trusted guests.
++
++
++2. Untrusted userspace and guests
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++If there are untrusted applications or guests on the system, enabling TSX
++might allow a malicious actor to leak data from the host or from other
++processes running on the same physical core.
++
++If the microcode is available and the TSX is disabled on the host, attacks
++are prevented in a virtualized environment as well, even if the VMs do not
++explicitly enable the mitigation.
++
++
++.. _taa_default_mitigations:
++
++Default mitigations
++-------------------
++
++The kernel's default action for vulnerable processors is:
++
++  - Deploy TSX disable mitigation (tsx_async_abort=full tsx=off).
+diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
+index 61b73e42f488..c81a008d6512 100644
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -1975,6 +1975,25 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
+ 			KVM MMU at runtime.
+ 			Default is 0 (off)
+ 
++	kvm.nx_huge_pages=
++			[KVM] Controls the software workaround for the
++			X86_BUG_ITLB_MULTIHIT bug.
++			force	: Always deploy workaround.
++			off	: Never deploy workaround.
++			auto	: Deploy workaround based on the presence of
++				  X86_BUG_ITLB_MULTIHIT.
++
++			Default is 'auto'.
++
++			If the software workaround is enabled for the host,
++			guests do not need to enable it for nested guests.
++
++	kvm.nx_huge_pages_recovery_ratio=
++			[KVM] Controls how many 4KiB pages are periodically zapped
++			back to huge pages. 0 disables the recovery, otherwise if
++			the value is N KVM will zap 1/Nth of the 4KiB pages every
++			minute. The default is 60.
++
+ 	kvm-amd.nested=	[KVM,AMD] Allow nested virtualization in KVM/SVM.
+ 			Default is 1 (enabled)
+ 
+@@ -2490,6 +2509,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
+ 				spec_store_bypass_disable=off [X86]
+ 				l1tf=off [X86]
+ 				mds=off [X86]
++				tsx_async_abort=off [X86]
++				kvm.nx_huge_pages=off [X86]
++
++				Exceptions:
++					This does not have any effect on
++					kvm.nx_huge_pages when
++					kvm.nx_huge_pages=force.
+ 
+ 			auto (default)
+ 				Mitigate all CPU vulnerabilities, but leave SMT
+@@ -2505,6 +2531,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
+ 				be fully mitigated, even if it means losing SMT.
+ 				Equivalent to: l1tf=flush,nosmt [X86]
+ 					       mds=full,nosmt [X86]
++					       tsx_async_abort=full,nosmt [X86]
+ 
+ 	mminit_loglevel=
+ 			[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
+@@ -4516,6 +4543,71 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
+ 			platforms where RDTSC is slow and this accounting
+ 			can add overhead.
+ 
++	tsx=		[X86] Control Transactional Synchronization
++			Extensions (TSX) feature in Intel processors that
++			support TSX control.
++
++			This parameter controls the TSX feature. The options are:
++
++			on	- Enable TSX on the system. Although there are
++				mitigations for all known security vulnerabilities,
++				TSX has been known to be an accelerator for
++				several previous speculation-related CVEs, and
++				so there may be unknown security risks associated
++				with leaving it enabled.
++
++			off	- Disable TSX on the system. (Note that this
++				option takes effect only on newer CPUs which are
++				not vulnerable to MDS, i.e., have
++				MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get
++				the new IA32_TSX_CTRL MSR through a microcode
++				update. This new MSR allows for the reliable
++				deactivation of the TSX functionality.)
++
++			auto	- Disable TSX if X86_BUG_TAA is present,
++				otherwise enable TSX on the system.
++
++			Not specifying this option is equivalent to tsx=off.
++
++			See Documentation/hw-vuln/tsx_async_abort.rst
++			for more details.
++
++	tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async
++			Abort (TAA) vulnerability.
++
++			Similar to Micro-architectural Data Sampling (MDS)
++			certain CPUs that support Transactional
++			Synchronization Extensions (TSX) are vulnerable to an
++			exploit against CPU internal buffers which can forward
++			information to a disclosure gadget under certain
++			conditions.
++
++			In vulnerable processors, the speculatively forwarded
++			data can be used in a cache side channel attack, to
++			access data to which the attacker does not have direct
++			access.
++
++			This parameter controls the TAA mitigation. The
++			options are:
++
++			full       - Enable TAA mitigation on vulnerable CPUs
++				     if TSX is enabled.
++
++			full,nosmt - Enable TAA mitigation and disable SMT on
++				     vulnerable CPUs. If TSX is disabled, SMT
++				     is not disabled because CPU is not
++				     vulnerable to cross-thread TAA attacks.
++			off        - Unconditionally disable TAA mitigation
++
++			Not specifying this option is equivalent to
++			tsx_async_abort=full. On CPUs which are MDS affected
++			and deploy MDS mitigation, TAA mitigation is not
++			required and doesn't provide any additional
++			mitigation.
++
++			For details see:
++			Documentation/hw-vuln/tsx_async_abort.rst
++
+ 	turbografx.map[2|3]=	[HW,JOY]
+ 			TurboGraFX parallel port interface
+ 			Format:
+diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
+index e5dd9f4d6100..46ef3680c8ab 100644
+--- a/Documentation/virtual/kvm/locking.txt
++++ b/Documentation/virtual/kvm/locking.txt
+@@ -13,8 +13,8 @@ The acquisition orders for mutexes are as follows:
+ - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
+   them together is quite rare.
+ 
+-For spinlocks, kvm_lock is taken outside kvm->mmu_lock. Everything
+-else is a leaf: no other lock is taken inside the critical sections.
++Everything else is a leaf: no other lock is taken inside the critical
++sections.
+ 
+ 2: Exception
+ ------------
+@@ -142,7 +142,7 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update().
+ ------------
+ 
+ Name:		kvm_lock
+-Type:		spinlock_t
++Type:		mutex
+ Arch:		any
+ Protects:	- vm_list
+ 
+diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
+index ef389dcf1b1d..0780d55c5aa8 100644
+--- a/Documentation/x86/index.rst
++++ b/Documentation/x86/index.rst
+@@ -6,3 +6,4 @@ x86 architecture specifics
+    :maxdepth: 1
+ 
+    mds
++   tsx_async_abort
+diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/x86/tsx_async_abort.rst
+new file mode 100644
+index 000000000000..4a4336a89372
+--- /dev/null
++++ b/Documentation/x86/tsx_async_abort.rst
+@@ -0,0 +1,117 @@
++.. SPDX-License-Identifier: GPL-2.0
++
++TSX Async Abort (TAA) mitigation
++================================
++
++.. _tsx_async_abort:
++
++Overview
++--------
++
++TSX Async Abort (TAA) is a side channel attack on internal buffers in some
++Intel processors similar to Microarchitectural Data Sampling (MDS). In this
++case certain loads may speculatively pass invalid data to dependent operations
++when an asynchronous abort condition is pending in a Transactional
++Synchronization Extensions (TSX) transaction. This includes loads with no
++fault or assist condition. Such loads may speculatively expose stale data from
++the same uarch data structures as in MDS, with same scope of exposure i.e.
++same-thread and cross-thread. This issue affects all current processors that
++support TSX.
++
++Mitigation strategy
++-------------------
++
++a) TSX disable - one of the mitigations is to disable TSX. A new MSR
++IA32_TSX_CTRL will be available in future and current processors after
++microcode update which can be used to disable TSX. In addition, it
++controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID.
++
++b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this
++vulnerability. More details on this approach can be found in
++:ref:`Documentation/hw-vuln/mds.rst <mds>`.
++
++Kernel internal mitigation modes
++--------------------------------
++
++   ============= ============================================================
++   off           Mitigation is disabled. Either the CPU is not affected or
++                 tsx_async_abort=off is supplied on the kernel command line.
++
++   tsx disabled  Mitigation is enabled. TSX feature is disabled by default at
++                 bootup on processors that support TSX control.
++
++   verw          Mitigation is enabled. CPU is affected and MD_CLEAR is
++                 advertised in CPUID.
++
++   ucode needed  Mitigation is enabled. CPU is affected and MD_CLEAR is not
++                 advertised in CPUID. That is mainly for virtualization
++                 scenarios where the host has the updated microcode but the
++                 hypervisor does not expose MD_CLEAR in CPUID. It's a best
++                 effort approach without guarantee.
++   ============= ============================================================
++
++If the CPU is affected and the "tsx_async_abort" kernel command line parameter is
++not provided then the kernel selects an appropriate mitigation depending on the
++status of RTM and MD_CLEAR CPUID bits.
++
++The tables below indicate the impact of the tsx=on|off|auto cmdline options on the state of
++TAA mitigation, VERW behavior and TSX feature for various combinations of
++MSR_IA32_ARCH_CAPABILITIES bits.
++
++1. "tsx=off"
++
++========= ========= ============ ============ ============== =================== ======================
++MSR_IA32_ARCH_CAPABILITIES bits    Result with cmdline tsx=off
++---------------------------------- -------------------------------------------------------------------------
++TAA_NO    MDS_NO    TSX_CTRL_MSR TSX state    VERW can clear TAA mitigation      TAA mitigation
++                                 after bootup CPU buffers    tsx_async_abort=off tsx_async_abort=full
++========= ========= ============ ============ ============== =================== ======================
++ 0        0         0            HW default   Yes            Same as MDS         Same as MDS
++ 0        0         1            Invalid case Invalid case   Invalid case        Invalid case
++ 0        1         0            HW default   No             Need ucode update   Need ucode update
++ 0        1         1            Disabled     Yes            TSX disabled        TSX disabled
++ 1        X         1            Disabled     X              None needed         None needed
++========= ========= ============ ============ ============== =================== ======================
++
++2. "tsx=on"
++
++========= ========= ============ ============ ============== =================== ======================
++MSR_IA32_ARCH_CAPABILITIES bits    Result with cmdline tsx=on
++---------------------------------- -------------------------------------------------------------------------
++TAA_NO    MDS_NO    TSX_CTRL_MSR TSX state    VERW can clear TAA mitigation      TAA mitigation
++                                 after bootup CPU buffers    tsx_async_abort=off tsx_async_abort=full
++========= ========= ============ ============ ============== =================== ======================
++ 0        0         0            HW default   Yes            Same as MDS         Same as MDS
++ 0        0         1            Invalid case Invalid case   Invalid case        Invalid case
++ 0        1         0            HW default   No             Need ucode update   Need ucode update
++ 0        1         1            Enabled      Yes            None                Same as MDS
++ 1        X         1            Enabled      X              None needed         None needed
++========= ========= ============ ============ ============== =================== ======================
++
++3. "tsx=auto"
++
++========= ========= ============ ============ ============== =================== ======================
++MSR_IA32_ARCH_CAPABILITIES bits    Result with cmdline tsx=auto
++---------------------------------- -------------------------------------------------------------------------
++TAA_NO    MDS_NO    TSX_CTRL_MSR TSX state    VERW can clear TAA mitigation      TAA mitigation
++                                 after bootup CPU buffers    tsx_async_abort=off tsx_async_abort=full
++========= ========= ============ ============ ============== =================== ======================
++ 0        0         0            HW default   Yes            Same as MDS         Same as MDS
++ 0        0         1            Invalid case Invalid case   Invalid case        Invalid case
++ 0        1         0            HW default   No             Need ucode update   Need ucode update
++ 0        1         1            Disabled     Yes            TSX disabled        TSX disabled
++ 1        X         1            Enabled      X              None needed         None needed
++========= ========= ============ ============ ============== =================== ======================
++
++In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that
++indicates whether MSR_IA32_TSX_CTRL is supported.
++
++There are two control bits in IA32_TSX_CTRL MSR:
++
++   Bit 0: When set it disables the Restricted Transactional Memory (RTM)
++          sub-feature of TSX (will force all transactions to abort on the
++          XBEGIN instruction).
++
++   Bit 1: When set it disables the enumeration of the RTM and HLE feature
++          (i.e. it will make CPUID(EAX=7).EBX{bit4} and
++          CPUID(EAX=7).EBX{bit11} read as 0).
+diff --git a/Makefile b/Makefile
+index 4741bbdfaa10..1e322e669301 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,6 +1,6 @@
+ VERSION = 4
+ PATCHLEVEL = 9
+-SUBLEVEL = 201
++SUBLEVEL = 202
+ EXTRAVERSION =
+ NAME = Roaring Lionus
+ 
+diff --git a/arch/mips/bcm63xx/reset.c b/arch/mips/bcm63xx/reset.c
+index d1fe51edf5e6..4d411da2497b 100644
+--- a/arch/mips/bcm63xx/reset.c
++++ b/arch/mips/bcm63xx/reset.c
+@@ -119,7 +119,7 @@
+ #define BCM6368_RESET_DSL	0
+ #define BCM6368_RESET_SAR	SOFTRESET_6368_SAR_MASK
+ #define BCM6368_RESET_EPHY	SOFTRESET_6368_EPHY_MASK
+-#define BCM6368_RESET_ENETSW	0
++#define BCM6368_RESET_ENETSW	SOFTRESET_6368_ENETSW_MASK
+ #define BCM6368_RESET_PCM	SOFTRESET_6368_PCM_MASK
+ #define BCM6368_RESET_MPI	SOFTRESET_6368_MPI_MASK
+ #define BCM6368_RESET_PCIE	0
+diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
+index 3dc96b455e0c..37c254677ccd 100644
+--- a/arch/s390/kvm/kvm-s390.c
++++ b/arch/s390/kvm/kvm-s390.c
+@@ -1422,13 +1422,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+ 	kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
+ 	if (!kvm->arch.sca)
+ 		goto out_err;
+-	spin_lock(&kvm_lock);
++	mutex_lock(&kvm_lock);
+ 	sca_offset += 16;
+ 	if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE)
+ 		sca_offset = 0;
+ 	kvm->arch.sca = (struct bsca_block *)
+ 			((char *) kvm->arch.sca + sca_offset);
+-	spin_unlock(&kvm_lock);
++	mutex_unlock(&kvm_lock);
+ 
+ 	sprintf(debug_name, "kvm-%u", current->pid);
+ 
+diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
+index e0055b4302d6..1067f7668c4e 100644
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -1755,6 +1755,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
+ 
+ 	  If unsure, say y.
+ 
++choice
++	prompt "TSX enable mode"
++	depends on CPU_SUP_INTEL
++	default X86_INTEL_TSX_MODE_OFF
++	help
++	  Intel's TSX (Transactional Synchronization Extensions) feature
++	  allows to optimize locking protocols through lock elision which
++	  can lead to a noticeable performance boost.
++
++	  On the other hand it has been shown that TSX can be exploited
++	  to form side channel attacks (e.g. TAA) and chances are there
++	  will be more of those attacks discovered in the future.
++
++	  Therefore TSX is not enabled by default (aka tsx=off). An admin
++	  might override this decision via the tsx=on command line parameter.
++	  Even with TSX enabled, the kernel will attempt to enable the best
++	  possible TAA mitigation setting depending on the microcode available
++	  for the particular machine.
++
++	  This option allows to set the default tsx mode between tsx=on, =off
++	  and =auto. See Documentation/kernel-parameters.txt for more
++	  details.
++
++	  Say off if not sure, auto if TSX is in use but it should be used on safe
++	  platforms or on if TSX is in use and the security aspect of tsx is not
++	  relevant.
++
++config X86_INTEL_TSX_MODE_OFF
++	bool "off"
++	help
++	  TSX is disabled if possible - equals to tsx=off command line parameter.
++
++config X86_INTEL_TSX_MODE_ON
++	bool "on"
++	help
++	  TSX is always enabled on TSX capable HW - equals the tsx=on command
++	  line parameter.
++
++config X86_INTEL_TSX_MODE_AUTO
++	bool "auto"
++	help
++	  TSX is enabled on TSX capable HW that is believed to be safe against
++	  side channel attacks - equals the tsx=auto command line parameter.
++endchoice
++
+ config EFI
+ 	bool "EFI runtime service support"
+ 	depends on ACPI
+diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
+index 3a972da155d6..ccc4420f051b 100644
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -357,5 +357,7 @@
+ #define X86_BUG_MDS		X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
+ #define X86_BUG_MSBDS_ONLY	X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */
+ #define X86_BUG_SWAPGS		X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
++#define X86_BUG_TAA		X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
++#define X86_BUG_ITLB_MULTIHIT	X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
+ 
+ #endif /* _ASM_X86_CPUFEATURES_H */
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 222cb69e1219..d2c14a96ec28 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -261,6 +261,7 @@ struct kvm_rmap_head {
+ struct kvm_mmu_page {
+ 	struct list_head link;
+ 	struct hlist_node hash_link;
++	struct list_head lpage_disallowed_link;
+ 
+ 	/*
+ 	 * The following two entries are used to key the shadow page in the
+@@ -273,6 +274,7 @@ struct kvm_mmu_page {
+ 	/* hold the gfn of each spte inside spt */
+ 	gfn_t *gfns;
+ 	bool unsync;
++	bool lpage_disallowed; /* Can't be replaced by an equiv large page */
+ 	int root_count;          /* Currently serving as active root */
+ 	unsigned int unsync_children;
+ 	struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+@@ -724,6 +726,7 @@ struct kvm_arch {
+ 	 */
+ 	struct list_head active_mmu_pages;
+ 	struct list_head zapped_obsolete_pages;
++	struct list_head lpage_disallowed_mmu_pages;
+ 	struct kvm_page_track_notifier_node mmu_sp_tracker;
+ 	struct kvm_page_track_notifier_head track_notifier_head;
+ 
+@@ -798,6 +801,8 @@ struct kvm_arch {
+ 
+ 	bool x2apic_format;
+ 	bool x2apic_broadcast_quirk_disabled;
++
++	struct task_struct *nx_lpage_recovery_thread;
+ };
+ 
+ struct kvm_vm_stat {
+@@ -811,6 +816,7 @@ struct kvm_vm_stat {
+ 	ulong mmu_unsync;
+ 	ulong remote_tlb_flush;
+ 	ulong lpages;
++	ulong nx_lpage_splits;
+ };
+ 
+ struct kvm_vcpu_stat {
+diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
+index 86166868db8c..8d162e0f2881 100644
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -77,6 +77,18 @@
+ 						  * Microarchitectural Data
+ 						  * Sampling (MDS) vulnerabilities.
+ 						  */
++#define ARCH_CAP_PSCHANGE_MC_NO		BIT(6)	 /*
++						  * The processor is not susceptible to a
++						  * machine check error due to modifying the
++						  * code page size along with either the
++						  * physical address or cache type
++						  * without TLB invalidation.
++						  */
++#define ARCH_CAP_TSX_CTRL_MSR		BIT(7)	/* MSR for TSX control is available. */
++#define ARCH_CAP_TAA_NO			BIT(8)	 /*
++						  * Not susceptible to
++						  * TSX Async Abort (TAA) vulnerabilities.
++						  */
+ 
+ #define MSR_IA32_FLUSH_CMD		0x0000010b
+ #define L1D_FLUSH			BIT(0)	/*
+@@ -87,6 +99,10 @@
+ #define MSR_IA32_BBL_CR_CTL		0x00000119
+ #define MSR_IA32_BBL_CR_CTL3		0x0000011e
+ 
++#define MSR_IA32_TSX_CTRL		0x00000122
++#define TSX_CTRL_RTM_DISABLE		BIT(0)	/* Disable RTM feature */
++#define TSX_CTRL_CPUID_CLEAR		BIT(1)	/* Disable TSX enumeration */
++
+ #define MSR_IA32_SYSENTER_CS		0x00000174
+ #define MSR_IA32_SYSENTER_ESP		0x00000175
+ #define MSR_IA32_SYSENTER_EIP		0x00000176
+diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
+index 10a48505abb5..8d56d701b5f7 100644
+--- a/arch/x86/include/asm/nospec-branch.h
++++ b/arch/x86/include/asm/nospec-branch.h
+@@ -314,7 +314,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
+ #include <asm/segment.h>
+ 
+ /**
+- * mds_clear_cpu_buffers - Mitigation for MDS vulnerability
++ * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
+  *
+  * This uses the otherwise unused and obsolete VERW instruction in
+  * combination with microcode which triggers a CPU buffer flush when the
+@@ -337,7 +337,7 @@ static inline void mds_clear_cpu_buffers(void)
+ }
+ 
+ /**
+- * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability
++ * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
+  *
+  * Clear CPU buffers if the corresponding static key is enabled
+  */
+diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
+index 155e49fc7010..92703fa09c19 100644
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -880,4 +880,11 @@ enum mds_mitigations {
+ 	MDS_MITIGATION_VMWERV,
+ };
+ 
++enum taa_mitigations {
++	TAA_MITIGATION_OFF,
++	TAA_MITIGATION_UCODE_NEEDED,
++	TAA_MITIGATION_VERW,
++	TAA_MITIGATION_TSX_DISABLED,
++};
++
+ #endif /* _ASM_X86_PROCESSOR_H */
+diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
+index 33b63670bf09..f6e386fe510c 100644
+--- a/arch/x86/kernel/cpu/Makefile
++++ b/arch/x86/kernel/cpu/Makefile
+@@ -25,7 +25,7 @@ obj-y			+= bugs.o
+ obj-$(CONFIG_PROC_FS)	+= proc.o
+ obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
+ 
+-obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o
++obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o tsx.o
+ obj-$(CONFIG_CPU_SUP_AMD)		+= amd.o
+ obj-$(CONFIG_CPU_SUP_CYRIX_32)		+= cyrix.o
+ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index 2a42fef275ad..827fc38df97a 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -31,11 +31,14 @@
+ #include <asm/intel-family.h>
+ #include <asm/e820.h>
+ 
++#include "cpu.h"
++
+ static void __init spectre_v1_select_mitigation(void);
+ static void __init spectre_v2_select_mitigation(void);
+ static void __init ssb_select_mitigation(void);
+ static void __init l1tf_select_mitigation(void);
+ static void __init mds_select_mitigation(void);
++static void __init taa_select_mitigation(void);
+ 
+ /* The base value of the SPEC_CTRL MSR that always has to be preserved. */
+ u64 x86_spec_ctrl_base;
+@@ -102,6 +105,7 @@ void __init check_bugs(void)
+ 	ssb_select_mitigation();
+ 	l1tf_select_mitigation();
+ 	mds_select_mitigation();
++	taa_select_mitigation();
+ 
+ 	arch_smt_update();
+ 
+@@ -265,6 +269,100 @@ static int __init mds_cmdline(char *str)
+ }
+ early_param("mds", mds_cmdline);
+ 
++#undef pr_fmt
++#define pr_fmt(fmt)	"TAA: " fmt
++
++/* Default mitigation for TAA-affected CPUs */
++static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
++static bool taa_nosmt __ro_after_init;
++
++static const char * const taa_strings[] = {
++	[TAA_MITIGATION_OFF]		= "Vulnerable",
++	[TAA_MITIGATION_UCODE_NEEDED]	= "Vulnerable: Clear CPU buffers attempted, no microcode",
++	[TAA_MITIGATION_VERW]		= "Mitigation: Clear CPU buffers",
++	[TAA_MITIGATION_TSX_DISABLED]	= "Mitigation: TSX disabled",
++};
++
++static void __init taa_select_mitigation(void)
++{
++	u64 ia32_cap;
++
++	if (!boot_cpu_has_bug(X86_BUG_TAA)) {
++		taa_mitigation = TAA_MITIGATION_OFF;
++		return;
++	}
++
++	/* TSX previously disabled by tsx=off */
++	if (!boot_cpu_has(X86_FEATURE_RTM)) {
++		taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
++		goto out;
++	}
++
++	if (cpu_mitigations_off()) {
++		taa_mitigation = TAA_MITIGATION_OFF;
++		return;
++	}
++
++	/* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */
++	if (taa_mitigation == TAA_MITIGATION_OFF)
++		goto out;
++
++	if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
++		taa_mitigation = TAA_MITIGATION_VERW;
++	else
++		taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
++
++	/*
++	 * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
++	 * A microcode update fixes this behavior to clear CPU buffers. It also
++	 * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
++	 * ARCH_CAP_TSX_CTRL_MSR bit.
++	 *
++	 * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
++	 * update is required.
++	 */
++	ia32_cap = x86_read_arch_cap_msr();
++	if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
++	    !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
++		taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
++
++	/*
++	 * TSX is enabled, select alternate mitigation for TAA which is
++	 * the same as MDS. Enable MDS static branch to clear CPU buffers.
++	 *
++	 * For guests that can't determine whether the correct microcode is
++	 * present on host, enable the mitigation for UCODE_NEEDED as well.
++	 */
++	static_branch_enable(&mds_user_clear);
++
++	if (taa_nosmt || cpu_mitigations_auto_nosmt())
++		cpu_smt_disable(false);
++
++out:
++	pr_info("%s\n", taa_strings[taa_mitigation]);
++}
++
++static int __init tsx_async_abort_parse_cmdline(char *str)
++{
++	if (!boot_cpu_has_bug(X86_BUG_TAA))
++		return 0;
++
++	if (!str)
++		return -EINVAL;
++
++	if (!strcmp(str, "off")) {
++		taa_mitigation = TAA_MITIGATION_OFF;
++	} else if (!strcmp(str, "full")) {
++		taa_mitigation = TAA_MITIGATION_VERW;
++	} else if (!strcmp(str, "full,nosmt")) {
++		taa_mitigation = TAA_MITIGATION_VERW;
++		taa_nosmt = true;
++	}
++
++	return 0;
++}
++early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
++
+ #undef pr_fmt
+ #define pr_fmt(fmt)	"Spectre V1 : " fmt
+ 
+@@ -780,13 +878,10 @@ static void update_mds_branch_idle(void)
+ }
+ 
+ #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
++#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
+ 
+ void arch_smt_update(void)
+ {
+-	/* Enhanced IBRS implies STIBP. No update required. */
+-	if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+-		return;
+-
+ 	mutex_lock(&spec_ctrl_mutex);
+ 
+ 	switch (spectre_v2_user) {
+@@ -812,6 +907,17 @@ void arch_smt_update(void)
+ 		break;
+ 	}
+ 
++	switch (taa_mitigation) {
++	case TAA_MITIGATION_VERW:
++	case TAA_MITIGATION_UCODE_NEEDED:
++		if (sched_smt_active())
++			pr_warn_once(TAA_MSG_SMT);
++		break;
++	case TAA_MITIGATION_TSX_DISABLED:
++	case TAA_MITIGATION_OFF:
++		break;
++	}
++
+ 	mutex_unlock(&spec_ctrl_mutex);
+ }
+ 
+@@ -1127,6 +1233,9 @@ void x86_spec_ctrl_setup_ap(void)
+ 		x86_amd_ssb_disable();
+ }
+ 
++bool itlb_multihit_kvm_mitigation;
++EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
++
+ #undef pr_fmt
+ #define pr_fmt(fmt)	"L1TF: " fmt
+ 
+@@ -1282,11 +1391,24 @@ static ssize_t l1tf_show_state(char *buf)
+ 		       l1tf_vmx_states[l1tf_vmx_mitigation],
+ 		       sched_smt_active() ? "vulnerable" : "disabled");
+ }
++
++static ssize_t itlb_multihit_show_state(char *buf)
++{
++	if (itlb_multihit_kvm_mitigation)
++		return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
++	else
++		return sprintf(buf, "KVM: Vulnerable\n");
++}
+ #else
+ static ssize_t l1tf_show_state(char *buf)
+ {
+ 	return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+ }
++
++static ssize_t itlb_multihit_show_state(char *buf)
++{
++	return sprintf(buf, "Processor vulnerable\n");
++}
+ #endif
+ 
+ static ssize_t mds_show_state(char *buf)
+@@ -1308,6 +1430,21 @@ static ssize_t mds_show_state(char *buf)
+ 		       sched_smt_active() ? "vulnerable" : "disabled");
+ }
+ 
++static ssize_t tsx_async_abort_show_state(char *buf)
++{
++	if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
++	    (taa_mitigation == TAA_MITIGATION_OFF))
++		return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
++
++	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
++		return sprintf(buf, "%s; SMT Host state unknown\n",
++			       taa_strings[taa_mitigation]);
++	}
++
++	return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
++		       sched_smt_active() ? "vulnerable" : "disabled");
++}
++
+ static char *stibp_state(void)
+ {
+ 	if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+@@ -1373,6 +1510,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
+ 	case X86_BUG_MDS:
+ 		return mds_show_state(buf);
+ 
++	case X86_BUG_TAA:
++		return tsx_async_abort_show_state(buf);
++
++	case X86_BUG_ITLB_MULTIHIT:
++		return itlb_multihit_show_state(buf);
++
+ 	default:
+ 		break;
+ 	}
+@@ -1409,4 +1552,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu
+ {
+ 	return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
+ }
++
++ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
++{
++	return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
++}
++
++ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
++{
++	return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
++}
+ #endif
1281 |
+diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c |
1282 |
+index 12fa16051871..477df9782fdf 100644 |
1283 |
+--- a/arch/x86/kernel/cpu/common.c |
1284 |
++++ b/arch/x86/kernel/cpu/common.c |
1285 |
+@@ -891,13 +891,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) |
1286 |
+ c->x86_cache_bits = c->x86_phys_bits; |
1287 |
+ } |
1288 |
+ |
1289 |
+-#define NO_SPECULATION BIT(0) |
1290 |
+-#define NO_MELTDOWN BIT(1) |
1291 |
+-#define NO_SSB BIT(2) |
1292 |
+-#define NO_L1TF BIT(3) |
1293 |
+-#define NO_MDS BIT(4) |
1294 |
+-#define MSBDS_ONLY BIT(5) |
1295 |
+-#define NO_SWAPGS BIT(6) |
1296 |
++#define NO_SPECULATION BIT(0) |
1297 |
++#define NO_MELTDOWN BIT(1) |
1298 |
++#define NO_SSB BIT(2) |
1299 |
++#define NO_L1TF BIT(3) |
1300 |
++#define NO_MDS BIT(4) |
1301 |
++#define MSBDS_ONLY BIT(5) |
1302 |
++#define NO_SWAPGS BIT(6) |
1303 |
++#define NO_ITLB_MULTIHIT BIT(7) |
1304 |
+ |
1305 |
+ #define VULNWL(_vendor, _family, _model, _whitelist) \ |
1306 |
+ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } |
1307 |
+@@ -915,26 +916,26 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { |
1308 |
+ VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), |
1309 |
+ |
1310 |
+ /* Intel Family 6 */ |
1311 |
+- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), |
1312 |
+- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), |
1313 |
+- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), |
1314 |
+- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), |
1315 |
+- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), |
1316 |
+- |
1317 |
+- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), |
1318 |
+- VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), |
1319 |
+- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), |
1320 |
+- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), |
1321 |
+- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), |
1322 |
+- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), |
1323 |
++ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), |
1324 |
++ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), |
1325 |
++ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), |
1326 |
++ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), |
1327 |
++ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), |
1328 |
++ |
1329 |
++ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1330 |
++ VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1331 |
++ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1332 |
++ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1333 |
++ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1334 |
++ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1335 |
+ |
1336 |
+ VULNWL_INTEL(CORE_YONAH, NO_SSB), |
1337 |
+ |
1338 |
+- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), |
1339 |
++ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1340 |
+ |
1341 |
+- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), |
1342 |
+- VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS), |
1343 |
+- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), |
1344 |
++ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1345 |
++ VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1346 |
++ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1347 |
+ |
1348 |
+ /* |
1349 |
+ * Technically, swapgs isn't serializing on AMD (despite it previously |
1350 |
+@@ -945,13 +946,13 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { |
1351 |
+ */ |
1352 |
+ |
1353 |
+ /* AMD Family 0xf - 0x12 */ |
1354 |
+- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), |
1355 |
+- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), |
1356 |
+- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), |
1357 |
+- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), |
1358 |
++ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1359 |
++ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1360 |
++ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1361 |
++ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1362 |
+ |
1363 |
+ /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ |
1364 |
+- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), |
1365 |
++ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), |
1366 |
+ {} |
1367 |
+ }; |
1368 |
+ |
1369 |
+@@ -962,19 +963,30 @@ static bool __init cpu_matches(unsigned long which) |
1370 |
+ return m && !!(m->driver_data & which); |
1371 |
+ } |
1372 |
+ |
1373 |
+-static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) |
1374 |
++u64 x86_read_arch_cap_msr(void) |
1375 |
+ { |
1376 |
+ u64 ia32_cap = 0; |
1377 |
+ |
1378 |
++ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) |
1379 |
++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); |
1380 |
++ |
1381 |
++ return ia32_cap; |
1382 |
++} |
1383 |
++ |
1384 |
++static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) |
1385 |
++{ |
1386 |
++ u64 ia32_cap = x86_read_arch_cap_msr(); |
1387 |
++ |
1388 |
++ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ |
1389 |
++ if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) |
1390 |
++ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); |
1391 |
++ |
1392 |
+ if (cpu_matches(NO_SPECULATION)) |
1393 |
+ return; |
1394 |
+ |
1395 |
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V1); |
1396 |
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2); |
1397 |
+ |
1398 |
+- if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) |
1399 |
+- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); |
1400 |
+- |
1401 |
+ if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && |
1402 |
+ !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) |
1403 |
+ setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); |
1404 |
+@@ -991,6 +1003,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) |
1405 |
+ if (!cpu_matches(NO_SWAPGS)) |
1406 |
+ setup_force_cpu_bug(X86_BUG_SWAPGS); |
1407 |
+ |
1408 |
++ /* |
1409 |
++ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: |
1410 |
++ * - TSX is supported or |
1411 |
++ * - TSX_CTRL is present |
1412 |
++ * |
1413 |
++ * TSX_CTRL check is needed for cases when TSX could be disabled before |
1414 |
++ * the kernel boot e.g. kexec. |
1415 |
++ * TSX_CTRL check alone is not sufficient for cases when the microcode |
1416 |
++ * update is not present or running as guest that don't get TSX_CTRL. |
1417 |
++ */ |
1418 |
++ if (!(ia32_cap & ARCH_CAP_TAA_NO) && |
1419 |
++ (cpu_has(c, X86_FEATURE_RTM) || |
1420 |
++ (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) |
1421 |
++ setup_force_cpu_bug(X86_BUG_TAA); |
1422 |
++ |
1423 |
+ if (cpu_matches(NO_MELTDOWN)) |
1424 |
+ return; |
1425 |
+ |
1426 |
+@@ -1409,6 +1436,8 @@ void __init identify_boot_cpu(void) |
1427 |
+ enable_sep_cpu(); |
1428 |
+ #endif |
1429 |
+ cpu_detect_tlb(&boot_cpu_data); |
1430 |
++ |
1431 |
++ tsx_init(); |
1432 |
+ } |
1433 |
+ |
1434 |
+ void identify_secondary_cpu(struct cpuinfo_x86 *c) |
1435 |
+diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h |
1436 |
+index 2275900d4d1b..4350f50b5deb 100644 |
1437 |
+--- a/arch/x86/kernel/cpu/cpu.h |
1438 |
++++ b/arch/x86/kernel/cpu/cpu.h |
1439 |
+@@ -44,6 +44,22 @@ struct _tlb_table { |
1440 |
+ extern const struct cpu_dev *const __x86_cpu_dev_start[], |
1441 |
+ *const __x86_cpu_dev_end[]; |
1442 |
+ |
1443 |
++#ifdef CONFIG_CPU_SUP_INTEL |
1444 |
++enum tsx_ctrl_states { |
1445 |
++ TSX_CTRL_ENABLE, |
1446 |
++ TSX_CTRL_DISABLE, |
1447 |
++ TSX_CTRL_NOT_SUPPORTED, |
1448 |
++}; |
1449 |
++ |
1450 |
++extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; |
1451 |
++ |
1452 |
++extern void __init tsx_init(void); |
1453 |
++extern void tsx_enable(void); |
1454 |
++extern void tsx_disable(void); |
1455 |
++#else |
1456 |
++static inline void tsx_init(void) { } |
1457 |
++#endif /* CONFIG_CPU_SUP_INTEL */ |
1458 |
++ |
1459 |
+ extern void get_cpu_cap(struct cpuinfo_x86 *c); |
1460 |
+ extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); |
1461 |
+ extern int detect_extended_topology_early(struct cpuinfo_x86 *c); |
1462 |
+@@ -51,4 +67,6 @@ extern int detect_ht_early(struct cpuinfo_x86 *c); |
1463 |
+ |
1464 |
+ extern void x86_spec_ctrl_setup_ap(void); |
1465 |
+ |
1466 |
++extern u64 x86_read_arch_cap_msr(void); |
1467 |
++ |
1468 |
+ #endif /* ARCH_X86_CPU_H */ |
1469 |
+diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c |
1470 |
+index 860f2fd9f540..476a9d5c2f35 100644 |
1471 |
+--- a/arch/x86/kernel/cpu/intel.c |
1472 |
++++ b/arch/x86/kernel/cpu/intel.c |
1473 |
+@@ -642,6 +642,11 @@ static void init_intel(struct cpuinfo_x86 *c) |
1474 |
+ detect_vmx_virtcap(c); |
1475 |
+ |
1476 |
+ init_intel_energy_perf(c); |
1477 |
++ |
1478 |
++ if (tsx_ctrl_state == TSX_CTRL_ENABLE) |
1479 |
++ tsx_enable(); |
1480 |
++ if (tsx_ctrl_state == TSX_CTRL_DISABLE) |
1481 |
++ tsx_disable(); |
1482 |
+ } |
1483 |
+ |
1484 |
+ #ifdef CONFIG_X86_32 |
1485 |
+diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c |
1486 |
+new file mode 100644 |
1487 |
+index 000000000000..3e20d322bc98 |
1488 |
+--- /dev/null |
1489 |
++++ b/arch/x86/kernel/cpu/tsx.c |
1490 |
+@@ -0,0 +1,140 @@ |
1491 |
++// SPDX-License-Identifier: GPL-2.0 |
1492 |
++/* |
1493 |
++ * Intel Transactional Synchronization Extensions (TSX) control. |
1494 |
++ * |
1495 |
++ * Copyright (C) 2019 Intel Corporation |
1496 |
++ * |
1497 |
++ * Author: |
1498 |
++ * Pawan Gupta <pawan.kumar.gupta@×××××××××××.com> |
1499 |
++ */ |
1500 |
++ |
1501 |
++#include <linux/cpufeature.h> |
1502 |
++ |
1503 |
++#include <asm/cmdline.h> |
1504 |
++ |
1505 |
++#include "cpu.h" |
1506 |
++ |
1507 |
++enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; |
1508 |
++ |
1509 |
++void tsx_disable(void) |
1510 |
++{ |
1511 |
++ u64 tsx; |
1512 |
++ |
1513 |
++ rdmsrl(MSR_IA32_TSX_CTRL, tsx); |
1514 |
++ |
1515 |
++ /* Force all transactions to immediately abort */ |
1516 |
++ tsx |= TSX_CTRL_RTM_DISABLE; |
1517 |
++ |
1518 |
++ /* |
1519 |
++ * Ensure TSX support is not enumerated in CPUID. |
1520 |
++ * This is visible to userspace and will ensure they |
1521 |
++ * do not waste resources trying TSX transactions that |
1522 |
++ * will always abort. |
1523 |
++ */ |
1524 |
++ tsx |= TSX_CTRL_CPUID_CLEAR; |
1525 |
++ |
1526 |
++ wrmsrl(MSR_IA32_TSX_CTRL, tsx); |
1527 |
++} |
1528 |
++ |
1529 |
++void tsx_enable(void) |
1530 |
++{ |
1531 |
++ u64 tsx; |
1532 |
++ |
1533 |
++ rdmsrl(MSR_IA32_TSX_CTRL, tsx); |
1534 |
++ |
1535 |
++ /* Enable the RTM feature in the cpu */ |
1536 |
++ tsx &= ~TSX_CTRL_RTM_DISABLE; |
1537 |
++ |
1538 |
++ /* |
1539 |
++ * Ensure TSX support is enumerated in CPUID. |
1540 |
++ * This is visible to userspace and will ensure they |
1541 |
++ * can enumerate and use the TSX feature. |
1542 |
++ */ |
1543 |
++ tsx &= ~TSX_CTRL_CPUID_CLEAR; |
1544 |
++ |
1545 |
++ wrmsrl(MSR_IA32_TSX_CTRL, tsx); |
1546 |
++} |
1547 |
++ |
1548 |
++static bool __init tsx_ctrl_is_supported(void) |
1549 |
++{ |
1550 |
++ u64 ia32_cap = x86_read_arch_cap_msr(); |
1551 |
++ |
1552 |
++ /* |
1553 |
++ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this |
1554 |
++ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. |
1555 |
++ * |
1556 |
++ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a |
1557 |
++ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES |
1558 |
++ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get |
1559 |
++ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, |
1560 |
++ * tsx= cmdline requests will do nothing on CPUs without |
1561 |
++ * MSR_IA32_TSX_CTRL support. |
1562 |
++ */ |
1563 |
++ return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); |
1564 |
++} |
1565 |
++ |
1566 |
++static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) |
1567 |
++{ |
1568 |
++ if (boot_cpu_has_bug(X86_BUG_TAA)) |
1569 |
++ return TSX_CTRL_DISABLE; |
1570 |
++ |
1571 |
++ return TSX_CTRL_ENABLE; |
1572 |
++} |
1573 |
++ |
1574 |
++void __init tsx_init(void) |
1575 |
++{ |
1576 |
++ char arg[5] = {}; |
1577 |
++ int ret; |
1578 |
++ |
1579 |
++ if (!tsx_ctrl_is_supported()) |
1580 |
++ return; |
1581 |
++ |
1582 |
++ ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); |
1583 |
++ if (ret >= 0) { |
1584 |
++ if (!strcmp(arg, "on")) { |
1585 |
++ tsx_ctrl_state = TSX_CTRL_ENABLE; |
1586 |
++ } else if (!strcmp(arg, "off")) { |
1587 |
++ tsx_ctrl_state = TSX_CTRL_DISABLE; |
1588 |
++ } else if (!strcmp(arg, "auto")) { |
1589 |
++ tsx_ctrl_state = x86_get_tsx_auto_mode(); |
1590 |
++ } else { |
1591 |
++ tsx_ctrl_state = TSX_CTRL_DISABLE; |
1592 |
++ pr_err("tsx: invalid option, defaulting to off\n"); |
1593 |
++ } |
1594 |
++ } else { |
1595 |
++ /* tsx= not provided */ |
1596 |
++ if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) |
1597 |
++ tsx_ctrl_state = x86_get_tsx_auto_mode(); |
1598 |
++ else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) |
1599 |
++ tsx_ctrl_state = TSX_CTRL_DISABLE; |
1600 |
++ else |
1601 |
++ tsx_ctrl_state = TSX_CTRL_ENABLE; |
1602 |
++ } |
1603 |
++ |
1604 |
++ if (tsx_ctrl_state == TSX_CTRL_DISABLE) { |
1605 |
++ tsx_disable(); |
1606 |
++ |
1607 |
++ /* |
1608 |
++ * tsx_disable() will change the state of the |
1609 |
++ * RTM CPUID bit. Clear it here since it is now |
1610 |
++ * expected to be not set. |
1611 |
++ */ |
1612 |
++ setup_clear_cpu_cap(X86_FEATURE_RTM); |
1613 |
++ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { |
1614 |
++ |
1615 |
++ /* |
1616 |
++ * HW defaults TSX to be enabled at bootup. |
1617 |
++ * We may still need the TSX enable support |
1618 |
++ * during init for special cases like |
1619 |
++ * kexec after TSX is disabled. |
1620 |
++ */ |
1621 |
++ tsx_enable(); |
1622 |
++ |
1623 |
++ /* |
1624 |
++ * tsx_enable() will change the state of the |
1625 |
++ * RTM CPUID bit. Force it here since it is now |
1626 |
++ * expected to be set. |
1627 |
++ */ |
1628 |
++ setup_force_cpu_cap(X86_FEATURE_RTM); |
1629 |
++ } |
1630 |
++} |
1631 |
+diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c |
1632 |
+index fc8236fd2495..18c5b4920e92 100644 |
1633 |
+--- a/arch/x86/kvm/cpuid.c |
1634 |
++++ b/arch/x86/kvm/cpuid.c |
1635 |
+@@ -466,8 +466,16 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, |
1636 |
+ /* PKU is not yet implemented for shadow paging. */ |
1637 |
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) |
1638 |
+ entry->ecx &= ~F(PKU); |
1639 |
++ |
1640 |
+ entry->edx &= kvm_cpuid_7_0_edx_x86_features; |
1641 |
+ cpuid_mask(&entry->edx, CPUID_7_EDX); |
1642 |
++ if (boot_cpu_has(X86_FEATURE_IBPB) && |
1643 |
++ boot_cpu_has(X86_FEATURE_IBRS)) |
1644 |
++ entry->edx |= F(SPEC_CTRL); |
1645 |
++ if (boot_cpu_has(X86_FEATURE_STIBP)) |
1646 |
++ entry->edx |= F(INTEL_STIBP); |
1647 |
++ if (boot_cpu_has(X86_FEATURE_SSBD)) |
1648 |
++ entry->edx |= F(SPEC_CTRL_SSBD); |
1649 |
+ /* |
1650 |
+ * We emulate ARCH_CAPABILITIES in software even |
1651 |
+ * if the host doesn't support it. |
1652 |
+diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c |
1653 |
+index 676edfc19a95..f0f180158c26 100644 |
1654 |
+--- a/arch/x86/kvm/mmu.c |
1655 |
++++ b/arch/x86/kvm/mmu.c |
1656 |
+@@ -37,6 +37,7 @@ |
1657 |
+ #include <linux/srcu.h> |
1658 |
+ #include <linux/slab.h> |
1659 |
+ #include <linux/uaccess.h> |
1660 |
++#include <linux/kthread.h> |
1661 |
+ |
1662 |
+ #include <asm/page.h> |
1663 |
+ #include <asm/cmpxchg.h> |
1664 |
+@@ -44,6 +45,30 @@ |
1665 |
+ #include <asm/vmx.h> |
1666 |
+ #include <asm/kvm_page_track.h> |
1667 |
+ |
1668 |
++extern bool itlb_multihit_kvm_mitigation; |
1669 |
++ |
1670 |
++static int __read_mostly nx_huge_pages = -1; |
1671 |
++static uint __read_mostly nx_huge_pages_recovery_ratio = 60; |
1672 |
++ |
1673 |
++static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); |
1674 |
++static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); |
1675 |
++ |
1676 |
++static struct kernel_param_ops nx_huge_pages_ops = { |
1677 |
++ .set = set_nx_huge_pages, |
1678 |
++ .get = param_get_bool, |
1679 |
++}; |
1680 |
++ |
1681 |
++static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { |
1682 |
++ .set = set_nx_huge_pages_recovery_ratio, |
1683 |
++ .get = param_get_uint, |
1684 |
++}; |
1685 |
++ |
1686 |
++module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); |
1687 |
++__MODULE_PARM_TYPE(nx_huge_pages, "bool"); |
1688 |
++module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, |
1689 |
++ &nx_huge_pages_recovery_ratio, 0644); |
1690 |
++__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); |
1691 |
++ |
1692 |
+ /* |
1693 |
+ * When setting this variable to true it enables Two-Dimensional-Paging |
1694 |
+ * where the hardware walks 2 page tables: |
1695 |
+@@ -131,9 +156,6 @@ module_param(dbg, bool, 0644); |
1696 |
+ |
1697 |
+ #include <trace/events/kvm.h> |
1698 |
+ |
1699 |
+-#define CREATE_TRACE_POINTS |
1700 |
+-#include "mmutrace.h" |
1701 |
+- |
1702 |
+ #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) |
1703 |
+ #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) |
1704 |
+ |
1705 |
+@@ -142,6 +164,20 @@ module_param(dbg, bool, 0644); |
1706 |
+ /* make pte_list_desc fit well in cache line */ |
1707 |
+ #define PTE_LIST_EXT 3 |
1708 |
+ |
1709 |
++/* |
1710 |
++ * Return values of handle_mmio_page_fault and mmu.page_fault: |
1711 |
++ * RET_PF_RETRY: let CPU fault again on the address. |
1712 |
++ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. |
1713 |
++ * |
1714 |
++ * For handle_mmio_page_fault only: |
1715 |
++ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. |
1716 |
++ */ |
1717 |
++enum { |
1718 |
++ RET_PF_RETRY = 0, |
1719 |
++ RET_PF_EMULATE = 1, |
1720 |
++ RET_PF_INVALID = 2, |
1721 |
++}; |
1722 |
++ |
1723 |
+ struct pte_list_desc { |
1724 |
+ u64 *sptes[PTE_LIST_EXT]; |
1725 |
+ struct pte_list_desc *more; |
1726 |
+@@ -179,14 +215,23 @@ static u64 __read_mostly shadow_mmio_mask; |
1727 |
+ static u64 __read_mostly shadow_present_mask; |
1728 |
+ |
1729 |
+ static void mmu_spte_set(u64 *sptep, u64 spte); |
1730 |
++static bool is_executable_pte(u64 spte); |
1731 |
+ static void mmu_free_roots(struct kvm_vcpu *vcpu); |
1732 |
+ |
1733 |
++#define CREATE_TRACE_POINTS |
1734 |
++#include "mmutrace.h" |
1735 |
++ |
1736 |
+ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) |
1737 |
+ { |
1738 |
+ shadow_mmio_mask = mmio_mask; |
1739 |
+ } |
1740 |
+ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); |
1741 |
+ |
1742 |
++static bool is_nx_huge_page_enabled(void) |
1743 |
++{ |
1744 |
++ return READ_ONCE(nx_huge_pages); |
1745 |
++} |
1746 |
++ |
1747 |
+ /* |
1748 |
+ * the low bit of the generation number is always presumed to be zero. |
1749 |
+ * This disables mmio caching during memslot updates. The concept is |
1750 |
+@@ -324,6 +369,11 @@ static int is_last_spte(u64 pte, int level) |
1751 |
+ return 0; |
1752 |
+ } |
1753 |
+ |
1754 |
++static bool is_executable_pte(u64 spte) |
1755 |
++{ |
1756 |
++ return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; |
1757 |
++} |
1758 |
++ |
1759 |
+ static kvm_pfn_t spte_to_pfn(u64 pte) |
1760 |
+ { |
1761 |
+ return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
1762 |
+@@ -767,10 +817,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) |
1763 |
+ |
1764 |
+ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) |
1765 |
+ { |
1766 |
+- if (sp->role.direct) |
1767 |
+- BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); |
1768 |
+- else |
1769 |
++ if (!sp->role.direct) { |
1770 |
+ sp->gfns[index] = gfn; |
1771 |
++ return; |
1772 |
++ } |
1773 |
++ |
1774 |
++ if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) |
1775 |
++ pr_err_ratelimited("gfn mismatch under direct page %llx " |
1776 |
++ "(expected %llx, got %llx)\n", |
1777 |
++ sp->gfn, |
1778 |
++ kvm_mmu_page_get_gfn(sp, index), gfn); |
1779 |
+ } |
1780 |
+ |
1781 |
+ /* |
1782 |
+@@ -829,6 +885,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) |
1783 |
+ kvm_mmu_gfn_disallow_lpage(slot, gfn); |
1784 |
+ } |
1785 |
+ |
1786 |
++static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
1787 |
++{ |
1788 |
++ if (sp->lpage_disallowed) |
1789 |
++ return; |
1790 |
++ |
1791 |
++ ++kvm->stat.nx_lpage_splits; |
1792 |
++ list_add_tail(&sp->lpage_disallowed_link, |
1793 |
++ &kvm->arch.lpage_disallowed_mmu_pages); |
1794 |
++ sp->lpage_disallowed = true; |
1795 |
++} |
1796 |
++ |
1797 |
+ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) |
1798 |
+ { |
1799 |
+ struct kvm_memslots *slots; |
1800 |
+@@ -846,6 +913,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) |
1801 |
+ kvm_mmu_gfn_allow_lpage(slot, gfn); |
1802 |
+ } |
1803 |
+ |
1804 |
++static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
1805 |
++{ |
1806 |
++ --kvm->stat.nx_lpage_splits; |
1807 |
++ sp->lpage_disallowed = false; |
1808 |
++ list_del(&sp->lpage_disallowed_link); |
1809 |
++} |
1810 |
++ |
1811 |
+ static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, |
1812 |
+ struct kvm_memory_slot *slot) |
1813 |
+ { |
1814 |
+@@ -2382,6 +2456,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, |
1815 |
+ kvm_reload_remote_mmus(kvm); |
1816 |
+ } |
1817 |
+ |
1818 |
++ if (sp->lpage_disallowed) |
1819 |
++ unaccount_huge_nx_page(kvm, sp); |
1820 |
++ |
1821 |
+ sp->role.invalid = 1; |
1822 |
+ return ret; |
1823 |
+ } |
1824 |
+@@ -2533,6 +2610,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
1825 |
+ if (!speculative) |
1826 |
+ spte |= shadow_accessed_mask; |
1827 |
+ |
1828 |
++ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && |
1829 |
++ is_nx_huge_page_enabled()) { |
1830 |
++ pte_access &= ~ACC_EXEC_MASK; |
1831 |
++ } |
1832 |
++ |
1833 |
+ if (pte_access & ACC_EXEC_MASK) |
1834 |
+ spte |= shadow_x_mask; |
1835 |
+ else |
1836 |
+@@ -2598,13 +2680,13 @@ done: |
1837 |
+ return ret; |
1838 |
+ } |
1839 |
+ |
1840 |
+-static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, |
1841 |
+- int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, |
1842 |
+- bool speculative, bool host_writable) |
1843 |
++static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, |
1844 |
++ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, |
1845 |
++ bool speculative, bool host_writable) |
1846 |
+ { |
1847 |
+ int was_rmapped = 0; |
1848 |
+ int rmap_count; |
1849 |
+- bool emulate = false; |
1850 |
++ int ret = RET_PF_RETRY; |
1851 |
+ |
1852 |
+ pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, |
1853 |
+ *sptep, write_fault, gfn); |
1854 |
+@@ -2634,18 +2716,15 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, |
1855 |
+ if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, |
1856 |
+ true, host_writable)) { |
1857 |
+ if (write_fault) |
1858 |
+- emulate = true; |
1859 |
++ ret = RET_PF_EMULATE; |
1860 |
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); |
1861 |
+ } |
1862 |
+ |
1863 |
+ if (unlikely(is_mmio_spte(*sptep))) |
1864 |
+- emulate = true; |
1865 |
++ ret = RET_PF_EMULATE; |
1866 |
+ |
1867 |
+ pgprintk("%s: setting spte %llx\n", __func__, *sptep); |
1868 |
+- pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", |
1869 |
+- is_large_pte(*sptep)? "2MB" : "4kB", |
1870 |
+- *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, |
1871 |
+- *sptep, sptep); |
1872 |
++ trace_kvm_mmu_set_spte(level, gfn, sptep); |
1873 |
+ if (!was_rmapped && is_large_pte(*sptep)) |
1874 |
+ ++vcpu->kvm->stat.lpages; |
1875 |
+ |
1876 |
+@@ -2657,9 +2736,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, |
1877 |
+ } |
1878 |
+ } |
1879 |
+ |
1880 |
+- kvm_release_pfn_clean(pfn); |
1881 |
+- |
1882 |
+- return emulate; |
1883 |
++ return ret; |
1884 |
+ } |
1885 |
+ |
1886 |
+ static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, |
1887 |
+@@ -2693,9 +2770,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, |
1888 |
+ if (ret <= 0) |
1889 |
+ return -1; |
1890 |
+ |
1891 |
+- for (i = 0; i < ret; i++, gfn++, start++) |
1892 |
++ for (i = 0; i < ret; i++, gfn++, start++) { |
1893 |
+ mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, |
1894 |
+ page_to_pfn(pages[i]), true, true); |
1895 |
++ put_page(pages[i]); |
1896 |
++ } |
1897 |
+ |
1898 |
+ return 0; |
1899 |
+ } |
1900 |
+@@ -2743,40 +2822,71 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) |
1901 |
+ __direct_pte_prefetch(vcpu, sp, sptep); |
1902 |
+ } |
1903 |
+ |
1904 |
+-static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, |
1905 |
+- int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) |
1906 |
++static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, |
1907 |
++ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) |
1908 |
+ { |
1909 |
+- struct kvm_shadow_walk_iterator iterator; |
1910 |
++ int level = *levelp; |
1911 |
++ u64 spte = *it.sptep; |
1912 |
++ |
1913 |
++ if (it.level == level && level > PT_PAGE_TABLE_LEVEL && |
1914 |
++ is_nx_huge_page_enabled() && |
1915 |
++ is_shadow_present_pte(spte) && |
1916 |
++ !is_large_pte(spte)) { |
1917 |
++ /* |
1918 |
++ * A small SPTE exists for this pfn, but FNAME(fetch) |
1919 |
++ * and __direct_map would like to create a large PTE |
1920 |
++ * instead: just force them to go down another level, |
1921 |
++ * patching back for them into pfn the next 9 bits of |
1922 |
++ * the address. |
1923 |
++ */ |
1924 |
++ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); |
1925 |
++ *pfnp |= gfn & page_mask; |
1926 |
++ (*levelp)--; |
1927 |
++ } |
1928 |
++} |
1929 |
++ |
1930 |
++static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, |
1931 |
++ int map_writable, int level, kvm_pfn_t pfn, |
1932 |
++ bool prefault, bool lpage_disallowed) |
1933 |
++{ |
1934 |
++ struct kvm_shadow_walk_iterator it; |
1935 |
+ struct kvm_mmu_page *sp; |
1936 |
+- int emulate = 0; |
1937 |
+- gfn_t pseudo_gfn; |
1938 |
++ int ret; |
1939 |
++ gfn_t gfn = gpa >> PAGE_SHIFT; |
1940 |
++ gfn_t base_gfn = gfn; |
1941 |
+ |
1942 |
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
1943 |
+- return 0; |
1944 |
++ return RET_PF_RETRY; |
1945 |
+ |
1946 |
+- for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { |
1947 |
+- if (iterator.level == level) { |
1948 |
+- emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, |
1949 |
+- write, level, gfn, pfn, prefault, |
1950 |
+- map_writable); |
1951 |
+- direct_pte_prefetch(vcpu, iterator.sptep); |
1952 |
+- ++vcpu->stat.pf_fixed; |
1953 |
+- break; |
1954 |
+- } |
1955 |
++ trace_kvm_mmu_spte_requested(gpa, level, pfn); |
1956 |
++ for_each_shadow_entry(vcpu, gpa, it) { |
1957 |
++ /* |
1958 |
++ * We cannot overwrite existing page tables with an NX |
1959 |
++ * large page, as the leaf could be executable. |
1960 |
++ */ |
1961 |
++ disallowed_hugepage_adjust(it, gfn, &pfn, &level); |
1962 |
+ |
1963 |
+- drop_large_spte(vcpu, iterator.sptep); |
1964 |
+- if (!is_shadow_present_pte(*iterator.sptep)) { |
1965 |
+- u64 base_addr = iterator.addr; |
1966 |
++ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); |
1967 |
++ if (it.level == level) |
1968 |
++ break; |
1969 |
+ |
1970 |
+- base_addr &= PT64_LVL_ADDR_MASK(iterator.level); |
1971 |
+- pseudo_gfn = base_addr >> PAGE_SHIFT; |
1972 |
+- sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, |
1973 |
+- iterator.level - 1, 1, ACC_ALL); |
1974 |
++ drop_large_spte(vcpu, it.sptep); |
1975 |
++ if (!is_shadow_present_pte(*it.sptep)) { |
1976 |
++ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, |
1977 |
++ it.level - 1, true, ACC_ALL); |
1978 |
+ |
1979 |
+- link_shadow_page(vcpu, iterator.sptep, sp); |
1980 |
++ link_shadow_page(vcpu, it.sptep, sp); |
1981 |
++ if (lpage_disallowed) |
1982 |
++ account_huge_nx_page(vcpu->kvm, sp); |
1983 |
+ } |
1984 |
+ } |
1985 |
+- return emulate; |
1986 |
++ |
1987 |
++ ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, |
1988 |
++ write, level, base_gfn, pfn, prefault, |
1989 |
++ map_writable); |
1990 |
++ direct_pte_prefetch(vcpu, it.sptep); |
1991 |
++ ++vcpu->stat.pf_fixed; |
1992 |
++ return ret; |
1993 |
+ } |
1994 |
+ |
1995 |
+ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) |
1996 |
+@@ -2798,25 +2908,23 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) |
1997 |
+ * Do not cache the mmio info caused by writing the readonly gfn |
1998 |
+ * into the spte otherwise read access on readonly gfn also can |
1999 |
+ * caused mmio page fault and treat it as mmio access. |
2000 |
+- * Return 1 to tell kvm to emulate it. |
2001 |
+ */ |
2002 |
+ if (pfn == KVM_PFN_ERR_RO_FAULT) |
2003 |
+- return 1; |
2004 |
++ return RET_PF_EMULATE; |
2005 |
+ |
2006 |
+ if (pfn == KVM_PFN_ERR_HWPOISON) { |
2007 |
+ kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); |
2008 |
+- return 0; |
2009 |
++ return RET_PF_RETRY; |
2010 |
+ } |
2011 |
+ |
2012 |
+ return -EFAULT; |
2013 |
+ } |
2014 |
+ |
2015 |
+ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, |
2016 |
+- gfn_t *gfnp, kvm_pfn_t *pfnp, |
2017 |
++ gfn_t gfn, kvm_pfn_t *pfnp, |
2018 |
+ int *levelp) |
2019 |
+ { |
2020 |
+ kvm_pfn_t pfn = *pfnp; |
2021 |
+- gfn_t gfn = *gfnp; |
2022 |
+ int level = *levelp; |
2023 |
+ |
2024 |
+ /* |
2025 |
+@@ -2843,8 +2951,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, |
2026 |
+ mask = KVM_PAGES_PER_HPAGE(level) - 1; |
2027 |
+ VM_BUG_ON((gfn & mask) != (pfn & mask)); |
2028 |
+ if (pfn & mask) { |
2029 |
+- gfn &= ~mask; |
2030 |
+- *gfnp = gfn; |
2031 |
+ kvm_release_pfn_clean(pfn); |
2032 |
+ pfn &= ~mask; |
2033 |
+ kvm_get_pfn(pfn); |
2034 |
+@@ -3012,11 +3118,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, |
2035 |
+ { |
2036 |
+ int r; |
2037 |
+ int level; |
2038 |
+- bool force_pt_level = false; |
2039 |
++ bool force_pt_level; |
2040 |
+ kvm_pfn_t pfn; |
2041 |
+ unsigned long mmu_seq; |
2042 |
+ bool map_writable, write = error_code & PFERR_WRITE_MASK; |
2043 |
++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && |
2044 |
++ is_nx_huge_page_enabled(); |
2045 |
+ |
2046 |
++ force_pt_level = lpage_disallowed; |
2047 |
+ level = mapping_level(vcpu, gfn, &force_pt_level); |
2048 |
+ if (likely(!force_pt_level)) { |
2049 |
+ /* |
2050 |
+@@ -3031,32 +3140,30 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, |
2051 |
+ } |
2052 |
+ |
2053 |
+ if (fast_page_fault(vcpu, v, level, error_code)) |
2054 |
+- return 0; |
2055 |
++ return RET_PF_RETRY; |
2056 |
+ |
2057 |
+ mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2058 |
+ smp_rmb(); |
2059 |
+ |
2060 |
+ if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) |
2061 |
+- return 0; |
2062 |
++ return RET_PF_RETRY; |
2063 |
+ |
2064 |
+ if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) |
2065 |
+ return r; |
2066 |
+ |
2067 |
++ r = RET_PF_RETRY; |
2068 |
+ spin_lock(&vcpu->kvm->mmu_lock); |
2069 |
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) |
2070 |
+ goto out_unlock; |
2071 |
+ make_mmu_pages_available(vcpu); |
2072 |
+ if (likely(!force_pt_level)) |
2073 |
+- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); |
2074 |
+- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); |
2075 |
+- spin_unlock(&vcpu->kvm->mmu_lock); |
2076 |
+- |
2077 |
+- return r; |
2078 |
+- |
2079 |
++ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); |
2080 |
++ r = __direct_map(vcpu, v, write, map_writable, level, pfn, |
2081 |
++ prefault, false); |
2082 |
+ out_unlock: |
2083 |
+ spin_unlock(&vcpu->kvm->mmu_lock); |
2084 |
+ kvm_release_pfn_clean(pfn); |
2085 |
+- return 0; |
2086 |
++ return r; |
2087 |
+ } |
2088 |
+ |
2089 |
+ |
2090 |
+@@ -3383,38 +3490,38 @@ exit: |
2091 |
+ return reserved; |
2092 |
+ } |
2093 |
+ |
2094 |
+-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) |
2095 |
++static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) |
2096 |
+ { |
2097 |
+ u64 spte; |
2098 |
+ bool reserved; |
2099 |
+ |
2100 |
+ if (mmio_info_in_cache(vcpu, addr, direct)) |
2101 |
+- return RET_MMIO_PF_EMULATE; |
2102 |
++ return RET_PF_EMULATE; |
2103 |
+ |
2104 |
+ reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); |
2105 |
+ if (WARN_ON(reserved)) |
2106 |
+- return RET_MMIO_PF_BUG; |
2107 |
++ return -EINVAL; |
2108 |
+ |
2109 |
+ if (is_mmio_spte(spte)) { |
2110 |
+ gfn_t gfn = get_mmio_spte_gfn(spte); |
2111 |
+ unsigned access = get_mmio_spte_access(spte); |
2112 |
+ |
2113 |
+ if (!check_mmio_spte(vcpu, spte)) |
2114 |
+- return RET_MMIO_PF_INVALID; |
2115 |
++ return RET_PF_INVALID; |
2116 |
+ |
2117 |
+ if (direct) |
2118 |
+ addr = 0; |
2119 |
+ |
2120 |
+ trace_handle_mmio_page_fault(addr, gfn, access); |
2121 |
+ vcpu_cache_mmio_info(vcpu, addr, gfn, access); |
2122 |
+- return RET_MMIO_PF_EMULATE; |
2123 |
++ return RET_PF_EMULATE; |
2124 |
+ } |
2125 |
+ |
2126 |
+ /* |
2127 |
+ * If the page table is zapped by other cpus, let CPU fault again on |
2128 |
+ * the address. |
2129 |
+ */ |
2130 |
+- return RET_MMIO_PF_RETRY; |
2131 |
++ return RET_PF_RETRY; |
2132 |
+ } |
2133 |
+ EXPORT_SYMBOL_GPL(handle_mmio_page_fault); |
2134 |
+ |
2135 |
+@@ -3464,7 +3571,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, |
2136 |
+ pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); |
2137 |
+ |
2138 |
+ if (page_fault_handle_page_track(vcpu, error_code, gfn)) |
2139 |
+- return 1; |
2140 |
++ return RET_PF_EMULATE; |
2141 |
+ |
2142 |
+ r = mmu_topup_memory_caches(vcpu); |
2143 |
+ if (r) |
2144 |
+@@ -3548,18 +3655,21 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, |
2145 |
+ unsigned long mmu_seq; |
2146 |
+ int write = error_code & PFERR_WRITE_MASK; |
2147 |
+ bool map_writable; |
2148 |
++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && |
2149 |
++ is_nx_huge_page_enabled(); |
2150 |
+ |
2151 |
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
2152 |
+ |
2153 |
+ if (page_fault_handle_page_track(vcpu, error_code, gfn)) |
2154 |
+- return 1; |
2155 |
++ return RET_PF_EMULATE; |
2156 |
+ |
2157 |
+ r = mmu_topup_memory_caches(vcpu); |
2158 |
+ if (r) |
2159 |
+ return r; |
2160 |
+ |
2161 |
+- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, |
2162 |
+- PT_DIRECTORY_LEVEL); |
2163 |
++ force_pt_level = |
2164 |
++ lpage_disallowed || |
2165 |
++ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); |
2166 |
+ level = mapping_level(vcpu, gfn, &force_pt_level); |
2167 |
+ if (likely(!force_pt_level)) { |
2168 |
+ if (level > PT_DIRECTORY_LEVEL && |
2169 |
+@@ -3569,32 +3679,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, |
2170 |
+ } |
2171 |
+ |
2172 |
+ if (fast_page_fault(vcpu, gpa, level, error_code)) |
2173 |
+- return 0; |
2174 |
++ return RET_PF_RETRY; |
2175 |
+ |
2176 |
+ mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2177 |
+ smp_rmb(); |
2178 |
+ |
2179 |
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) |
2180 |
+- return 0; |
2181 |
++ return RET_PF_RETRY; |
2182 |
+ |
2183 |
+ if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) |
2184 |
+ return r; |
2185 |
+ |
2186 |
++ r = RET_PF_RETRY; |
2187 |
+ spin_lock(&vcpu->kvm->mmu_lock); |
2188 |
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) |
2189 |
+ goto out_unlock; |
2190 |
+ make_mmu_pages_available(vcpu); |
2191 |
+ if (likely(!force_pt_level)) |
2192 |
+- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); |
2193 |
+- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); |
2194 |
+- spin_unlock(&vcpu->kvm->mmu_lock); |
2195 |
+- |
2196 |
+- return r; |
2197 |
+- |
2198 |
++ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); |
2199 |
++ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, |
2200 |
++ prefault, lpage_disallowed); |
2201 |
+ out_unlock: |
2202 |
+ spin_unlock(&vcpu->kvm->mmu_lock); |
2203 |
+ kvm_release_pfn_clean(pfn); |
2204 |
+- return 0; |
2205 |
++ return r; |
2206 |
+ } |
2207 |
+ |
2208 |
+ static void nonpaging_init_context(struct kvm_vcpu *vcpu, |
2209 |
+@@ -4510,23 +4618,24 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, |
2210 |
+ enum emulation_result er; |
2211 |
+ bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu); |
2212 |
+ |
2213 |
++ r = RET_PF_INVALID; |
2214 |
+ if (unlikely(error_code & PFERR_RSVD_MASK)) { |
2215 |
+ r = handle_mmio_page_fault(vcpu, cr2, direct); |
2216 |
+- if (r == RET_MMIO_PF_EMULATE) { |
2217 |
++ if (r == RET_PF_EMULATE) { |
2218 |
+ emulation_type = 0; |
2219 |
+ goto emulate; |
2220 |
+ } |
2221 |
+- if (r == RET_MMIO_PF_RETRY) |
2222 |
+- return 1; |
2223 |
+- if (r < 0) |
2224 |
+- return r; |
2225 |
+ } |
2226 |
+ |
2227 |
+- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); |
2228 |
++ if (r == RET_PF_INVALID) { |
2229 |
++ r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); |
2230 |
++ WARN_ON(r == RET_PF_INVALID); |
2231 |
++ } |
2232 |
++ |
2233 |
++ if (r == RET_PF_RETRY) |
2234 |
++ return 1; |
2235 |
+ if (r < 0) |
2236 |
+ return r; |
2237 |
+- if (!r) |
2238 |
+- return 1; |
2239 |
+ |
2240 |
+ if (mmio_info_in_cache(vcpu, cr2, direct)) |
2241 |
+ emulation_type = 0; |
2242 |
+@@ -4965,7 +5074,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) |
2243 |
+ int nr_to_scan = sc->nr_to_scan; |
2244 |
+ unsigned long freed = 0; |
2245 |
+ |
2246 |
+- spin_lock(&kvm_lock); |
2247 |
++ mutex_lock(&kvm_lock); |
2248 |
+ |
2249 |
+ list_for_each_entry(kvm, &vm_list, vm_list) { |
2250 |
+ int idx; |
2251 |
+@@ -5015,7 +5124,7 @@ unlock: |
2252 |
+ break; |
2253 |
+ } |
2254 |
+ |
2255 |
+- spin_unlock(&kvm_lock); |
2256 |
++ mutex_unlock(&kvm_lock); |
2257 |
+ return freed; |
2258 |
+ } |
2259 |
+ |
2260 |
+@@ -5039,8 +5148,58 @@ static void mmu_destroy_caches(void) |
2261 |
+ kmem_cache_destroy(mmu_page_header_cache); |
2262 |
+ } |
2263 |
+ |
2264 |
++static bool get_nx_auto_mode(void) |
2265 |
++{ |
2266 |
++ /* Return true when CPU has the bug, and mitigations are ON */ |
2267 |
++ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); |
2268 |
++} |
2269 |
++ |
2270 |
++static void __set_nx_huge_pages(bool val) |
2271 |
++{ |
2272 |
++ nx_huge_pages = itlb_multihit_kvm_mitigation = val; |
2273 |
++} |
2274 |
++ |
2275 |
++static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) |
2276 |
++{ |
2277 |
++ bool old_val = nx_huge_pages; |
2278 |
++ bool new_val; |
2279 |
++ |
2280 |
++ /* In "auto" mode deploy workaround only if CPU has the bug. */ |
2281 |
++ if (sysfs_streq(val, "off")) |
2282 |
++ new_val = 0; |
2283 |
++ else if (sysfs_streq(val, "force")) |
2284 |
++ new_val = 1; |
2285 |
++ else if (sysfs_streq(val, "auto")) |
2286 |
++ new_val = get_nx_auto_mode(); |
2287 |
++ else if (strtobool(val, &new_val) < 0) |
2288 |
++ return -EINVAL; |
2289 |
++ |
2290 |
++ __set_nx_huge_pages(new_val); |
2291 |
++ |
2292 |
++ if (new_val != old_val) { |
2293 |
++ struct kvm *kvm; |
2294 |
++ int idx; |
2295 |
++ |
2296 |
++ mutex_lock(&kvm_lock); |
2297 |
++ |
2298 |
++ list_for_each_entry(kvm, &vm_list, vm_list) { |
2299 |
++ idx = srcu_read_lock(&kvm->srcu); |
2300 |
++ kvm_mmu_invalidate_zap_all_pages(kvm); |
2301 |
++ srcu_read_unlock(&kvm->srcu, idx); |
2302 |
++ |
2303 |
++ wake_up_process(kvm->arch.nx_lpage_recovery_thread); |
2304 |
++ } |
2305 |
++ mutex_unlock(&kvm_lock); |
2306 |
++ } |
2307 |
++ |
2308 |
++ return 0; |
2309 |
++} |
2310 |
++ |
2311 |
+ int kvm_mmu_module_init(void) |
2312 |
+ { |
2313 |
++ if (nx_huge_pages == -1) |
2314 |
++ __set_nx_huge_pages(get_nx_auto_mode()); |
2315 |
++ |
2316 |
+ pte_list_desc_cache = kmem_cache_create("pte_list_desc", |
2317 |
+ sizeof(struct pte_list_desc), |
2318 |
+ 0, SLAB_ACCOUNT, NULL); |
2319 |
+@@ -5104,3 +5263,116 @@ void kvm_mmu_module_exit(void) |
2320 |
+ unregister_shrinker(&mmu_shrinker); |
2321 |
+ mmu_audit_disable(); |
2322 |
+ } |
2323 |
++ |
2324 |
++static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) |
2325 |
++{ |
2326 |
++ unsigned int old_val; |
2327 |
++ int err; |
2328 |
++ |
2329 |
++ old_val = nx_huge_pages_recovery_ratio; |
2330 |
++ err = param_set_uint(val, kp); |
2331 |
++ if (err) |
2332 |
++ return err; |
2333 |
++ |
2334 |
++ if (READ_ONCE(nx_huge_pages) && |
2335 |
++ !old_val && nx_huge_pages_recovery_ratio) { |
2336 |
++ struct kvm *kvm; |
2337 |
++ |
2338 |
++ mutex_lock(&kvm_lock); |
2339 |
++ |
2340 |
++ list_for_each_entry(kvm, &vm_list, vm_list) |
2341 |
++ wake_up_process(kvm->arch.nx_lpage_recovery_thread); |
2342 |
++ |
2343 |
++ mutex_unlock(&kvm_lock); |
2344 |
++ } |
2345 |
++ |
2346 |
++ return err; |
2347 |
++} |
2348 |
++ |
2349 |
++static void kvm_recover_nx_lpages(struct kvm *kvm) |
2350 |
++{ |
2351 |
++ int rcu_idx; |
2352 |
++ struct kvm_mmu_page *sp; |
2353 |
++ unsigned int ratio; |
2354 |
++ LIST_HEAD(invalid_list); |
2355 |
++ ulong to_zap; |
2356 |
++ |
2357 |
++ rcu_idx = srcu_read_lock(&kvm->srcu); |
2358 |
++ spin_lock(&kvm->mmu_lock); |
2359 |
++ |
2360 |
++ ratio = READ_ONCE(nx_huge_pages_recovery_ratio); |
2361 |
++ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; |
2362 |
++ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { |
2363 |
++ /* |
2364 |
++ * We use a separate list instead of just using active_mmu_pages |
2365 |
++ * because the number of lpage_disallowed pages is expected to |
2366 |
++ * be relatively small compared to the total. |
2367 |
++ */ |
2368 |
++ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, |
2369 |
++ struct kvm_mmu_page, |
2370 |
++ lpage_disallowed_link); |
2371 |
++ WARN_ON_ONCE(!sp->lpage_disallowed); |
2372 |
++ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); |
2373 |
++ WARN_ON_ONCE(sp->lpage_disallowed); |
2374 |
++ |
2375 |
++ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { |
2376 |
++ kvm_mmu_commit_zap_page(kvm, &invalid_list); |
2377 |
++ if (to_zap) |
2378 |
++ cond_resched_lock(&kvm->mmu_lock); |
2379 |
++ } |
2380 |
++ } |
2381 |
++ |
2382 |
++ spin_unlock(&kvm->mmu_lock); |
2383 |
++ srcu_read_unlock(&kvm->srcu, rcu_idx); |
2384 |
++} |
2385 |
++ |
2386 |
++static long get_nx_lpage_recovery_timeout(u64 start_time) |
2387 |
++{ |
2388 |
++ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) |
2389 |
++ ? start_time + 60 * HZ - get_jiffies_64() |
2390 |
++ : MAX_SCHEDULE_TIMEOUT; |
2391 |
++} |
2392 |
++ |
2393 |
++static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) |
2394 |
++{ |
2395 |
++ u64 start_time; |
2396 |
++ long remaining_time; |
2397 |
++ |
2398 |
++ while (true) { |
2399 |
++ start_time = get_jiffies_64(); |
2400 |
++ remaining_time = get_nx_lpage_recovery_timeout(start_time); |
2401 |
++ |
2402 |
++ set_current_state(TASK_INTERRUPTIBLE); |
2403 |
++ while (!kthread_should_stop() && remaining_time > 0) { |
2404 |
++ schedule_timeout(remaining_time); |
2405 |
++ remaining_time = get_nx_lpage_recovery_timeout(start_time); |
2406 |
++ set_current_state(TASK_INTERRUPTIBLE); |
2407 |
++ } |
2408 |
++ |
2409 |
++ set_current_state(TASK_RUNNING); |
2410 |
++ |
2411 |
++ if (kthread_should_stop()) |
2412 |
++ return 0; |
2413 |
++ |
2414 |
++ kvm_recover_nx_lpages(kvm); |
2415 |
++ } |
2416 |
++} |
2417 |
++ |
2418 |
++int kvm_mmu_post_init_vm(struct kvm *kvm) |
2419 |
++{ |
2420 |
++ int err; |
2421 |
++ |
2422 |
++ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, |
2423 |
++ "kvm-nx-lpage-recovery", |
2424 |
++ &kvm->arch.nx_lpage_recovery_thread); |
2425 |
++ if (!err) |
2426 |
++ kthread_unpark(kvm->arch.nx_lpage_recovery_thread); |
2427 |
++ |
2428 |
++ return err; |
2429 |
++} |
2430 |
++ |
2431 |
++void kvm_mmu_pre_destroy_vm(struct kvm *kvm) |
2432 |
++{ |
2433 |
++ if (kvm->arch.nx_lpage_recovery_thread) |
2434 |
++ kthread_stop(kvm->arch.nx_lpage_recovery_thread); |
2435 |
++} |
2436 |
+diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h |
2437 |
+index c92834c55c59..e584689e7d46 100644 |
2438 |
+--- a/arch/x86/kvm/mmu.h |
2439 |
++++ b/arch/x86/kvm/mmu.h |
2440 |
+@@ -56,23 +56,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); |
2441 |
+ void |
2442 |
+ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); |
2443 |
+ |
2444 |
+-/* |
2445 |
+- * Return values of handle_mmio_page_fault: |
2446 |
+- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction |
2447 |
+- * directly. |
2448 |
+- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page |
2449 |
+- * fault path update the mmio spte. |
2450 |
+- * RET_MMIO_PF_RETRY: let CPU fault again on the address. |
2451 |
+- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). |
2452 |
+- */ |
2453 |
+-enum { |
2454 |
+- RET_MMIO_PF_EMULATE = 1, |
2455 |
+- RET_MMIO_PF_INVALID = 2, |
2456 |
+- RET_MMIO_PF_RETRY = 0, |
2457 |
+- RET_MMIO_PF_BUG = -1 |
2458 |
+-}; |
2459 |
+- |
2460 |
+-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); |
2461 |
+ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); |
2462 |
+ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); |
2463 |
+ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); |
2464 |
+@@ -202,4 +185,8 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); |
2465 |
+ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); |
2466 |
+ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, |
2467 |
+ struct kvm_memory_slot *slot, u64 gfn); |
2468 |
++ |
2469 |
++int kvm_mmu_post_init_vm(struct kvm *kvm); |
2470 |
++void kvm_mmu_pre_destroy_vm(struct kvm *kvm); |
2471 |
++ |
2472 |
+ #endif |
2473 |
+diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h |
2474 |
+index 5a24b846a1cb..756b14ecc957 100644 |
2475 |
+--- a/arch/x86/kvm/mmutrace.h |
2476 |
++++ b/arch/x86/kvm/mmutrace.h |
2477 |
+@@ -322,6 +322,65 @@ TRACE_EVENT( |
2478 |
+ __entry->kvm_gen == __entry->spte_gen |
2479 |
+ ) |
2480 |
+ ); |
2481 |
++ |
2482 |
++TRACE_EVENT( |
2483 |
++ kvm_mmu_set_spte, |
2484 |
++ TP_PROTO(int level, gfn_t gfn, u64 *sptep), |
2485 |
++ TP_ARGS(level, gfn, sptep), |
2486 |
++ |
2487 |
++ TP_STRUCT__entry( |
2488 |
++ __field(u64, gfn) |
2489 |
++ __field(u64, spte) |
2490 |
++ __field(u64, sptep) |
2491 |
++ __field(u8, level) |
2492 |
++ /* These depend on page entry type, so compute them now. */ |
2493 |
++ __field(bool, r) |
2494 |
++ __field(bool, x) |
2495 |
++ __field(u8, u) |
2496 |
++ ), |
2497 |
++ |
2498 |
++ TP_fast_assign( |
2499 |
++ __entry->gfn = gfn; |
2500 |
++ __entry->spte = *sptep; |
2501 |
++ __entry->sptep = virt_to_phys(sptep); |
2502 |
++ __entry->level = level; |
2503 |
++ __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK); |
2504 |
++ __entry->x = is_executable_pte(__entry->spte); |
2505 |
++ __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1; |
2506 |
++ ), |
2507 |
++ |
2508 |
++ TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx", |
2509 |
++ __entry->gfn, __entry->spte, |
2510 |
++ __entry->r ? "r" : "-", |
2511 |
++ __entry->spte & PT_PRESENT_MASK ? "w" : "-", |
2512 |
++ __entry->x ? "x" : "-", |
2513 |
++ __entry->u == -1 ? "" : (__entry->u ? "u" : "-"), |
2514 |
++ __entry->level, __entry->sptep |
2515 |
++ ) |
2516 |
++); |
2517 |
++ |
2518 |
++TRACE_EVENT( |
2519 |
++ kvm_mmu_spte_requested, |
2520 |
++ TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), |
2521 |
++ TP_ARGS(addr, level, pfn), |
2522 |
++ |
2523 |
++ TP_STRUCT__entry( |
2524 |
++ __field(u64, gfn) |
2525 |
++ __field(u64, pfn) |
2526 |
++ __field(u8, level) |
2527 |
++ ), |
2528 |
++ |
2529 |
++ TP_fast_assign( |
2530 |
++ __entry->gfn = addr >> PAGE_SHIFT; |
2531 |
++ __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); |
2532 |
++ __entry->level = level; |
2533 |
++ ), |
2534 |
++ |
2535 |
++ TP_printk("gfn %llx pfn %llx level %d", |
2536 |
++ __entry->gfn, __entry->pfn, __entry->level |
2537 |
++ ) |
2538 |
++); |
2539 |
++ |
2540 |
+ #endif /* _TRACE_KVMMMU_H */ |
2541 |
+ |
2542 |
+ #undef TRACE_INCLUDE_PATH |
2543 |
+diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h |
2544 |
+index 37363900297d..e03225e707b2 100644 |
2545 |
+--- a/arch/x86/kvm/paging_tmpl.h |
2546 |
++++ b/arch/x86/kvm/paging_tmpl.h |
2547 |
+@@ -499,6 +499,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
2548 |
+ mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, |
2549 |
+ true, true); |
2550 |
+ |
2551 |
++ kvm_release_pfn_clean(pfn); |
2552 |
+ return true; |
2553 |
+ } |
2554 |
+ |
2555 |
+@@ -572,12 +573,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, |
2556 |
+ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
2557 |
+ struct guest_walker *gw, |
2558 |
+ int write_fault, int hlevel, |
2559 |
+- kvm_pfn_t pfn, bool map_writable, bool prefault) |
2560 |
++ kvm_pfn_t pfn, bool map_writable, bool prefault, |
2561 |
++ bool lpage_disallowed) |
2562 |
+ { |
2563 |
+ struct kvm_mmu_page *sp = NULL; |
2564 |
+ struct kvm_shadow_walk_iterator it; |
2565 |
+ unsigned direct_access, access = gw->pt_access; |
2566 |
+- int top_level, emulate; |
2567 |
++ int top_level, ret; |
2568 |
++ gfn_t gfn, base_gfn; |
2569 |
+ |
2570 |
+ direct_access = gw->pte_access; |
2571 |
+ |
2572 |
+@@ -622,36 +625,49 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
2573 |
+ link_shadow_page(vcpu, it.sptep, sp); |
2574 |
+ } |
2575 |
+ |
2576 |
+- for (; |
2577 |
+- shadow_walk_okay(&it) && it.level > hlevel; |
2578 |
+- shadow_walk_next(&it)) { |
2579 |
+- gfn_t direct_gfn; |
2580 |
++ /* |
2581 |
++ * FNAME(page_fault) might have clobbered the bottom bits of |
2582 |
++ * gw->gfn, restore them from the virtual address. |
2583 |
++ */ |
2584 |
++ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); |
2585 |
++ base_gfn = gfn; |
2586 |
+ |
2587 |
++ trace_kvm_mmu_spte_requested(addr, gw->level, pfn); |
2588 |
++ |
2589 |
++ for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { |
2590 |
+ clear_sp_write_flooding_count(it.sptep); |
2591 |
+- validate_direct_spte(vcpu, it.sptep, direct_access); |
2592 |
+ |
2593 |
+- drop_large_spte(vcpu, it.sptep); |
2594 |
++ /* |
2595 |
++ * We cannot overwrite existing page tables with an NX |
2596 |
++ * large page, as the leaf could be executable.
++ */
++ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
+
+- if (is_shadow_present_pte(*it.sptep))
+- continue;
++ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
++ if (it.level == hlevel)
++ break;
+
+- direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
++ validate_direct_spte(vcpu, it.sptep, direct_access);
+
+- sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
+- true, direct_access);
+- link_shadow_page(vcpu, it.sptep, sp);
++ drop_large_spte(vcpu, it.sptep);
++
++ if (!is_shadow_present_pte(*it.sptep)) {
++ sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
++ it.level - 1, true, direct_access);
++ link_shadow_page(vcpu, it.sptep, sp);
++ if (lpage_disallowed)
++ account_huge_nx_page(vcpu->kvm, sp);
++ }
+ }
+
+- clear_sp_write_flooding_count(it.sptep);
+- emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
+- it.level, gw->gfn, pfn, prefault, map_writable);
++ ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
++ it.level, base_gfn, pfn, prefault, map_writable);
+ FNAME(pte_prefetch)(vcpu, gw, it.sptep);
+-
+- return emulate;
++ ++vcpu->stat.pf_fixed;
++ return ret;
+
+ out_gpte_changed:
+- kvm_release_pfn_clean(pfn);
+- return 0;
++ return RET_PF_RETRY;
+ }
+
+ /*
+@@ -717,9 +733,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+ int r;
+ kvm_pfn_t pfn;
+ int level = PT_PAGE_TABLE_LEVEL;
+- bool force_pt_level = false;
+ unsigned long mmu_seq;
+ bool map_writable, is_self_change_mapping;
++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
++ is_nx_huge_page_enabled();
++ bool force_pt_level = lpage_disallowed;
+
+ pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
+
+@@ -746,12 +764,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+ if (!prefault)
+ inject_page_fault(vcpu, &walker.fault);
+
+- return 0;
++ return RET_PF_RETRY;
+ }
+
+ if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
+ shadow_page_table_clear_flood(vcpu, addr);
+- return 1;
++ return RET_PF_EMULATE;
+ }
+
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
+@@ -773,7 +791,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+
+ if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
+ &map_writable))
+- return 0;
++ return RET_PF_RETRY;
+
+ if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
+ walker.gfn, pfn, walker.pte_access, &r))
+@@ -799,6 +817,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+ walker.pte_access &= ~ACC_EXEC_MASK;
+ }
+
++ r = RET_PF_RETRY;
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ goto out_unlock;
+@@ -806,19 +825,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+ kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
+ make_mmu_pages_available(vcpu);
+ if (!force_pt_level)
+- transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
++ transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
+ r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
+- level, pfn, map_writable, prefault);
+- ++vcpu->stat.pf_fixed;
++ level, pfn, map_writable, prefault, lpage_disallowed);
+ kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
+- spin_unlock(&vcpu->kvm->mmu_lock);
+-
+- return r;
+
+ out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+- return 0;
++ return r;
+ }
+
+ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
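
The paging_tmpl.h hunks above replace FNAME(page_fault)'s bare 0/1 return values with named RET_PF_RETRY/RET_PF_EMULATE codes. A minimal user-space sketch of that return-code convention follows; the enum values and the handle_fault() helper are illustrative stand-ins, not the kernel's definitions:

#include <stdio.h>

enum ret_pf { RET_PF_RETRY = 0, RET_PF_EMULATE = 1, RET_PF_INVALID = 2 };

/* Hypothetical fault handler: retry on transient state, emulate on MMIO. */
static enum ret_pf handle_fault(int is_mmio, int state_changed)
{
	if (state_changed)
		return RET_PF_RETRY;      /* caller re-enters the guest */
	if (is_mmio)
		return RET_PF_EMULATE;    /* caller emulates the instruction */
	return RET_PF_INVALID;
}

int main(void)
{
	/* Named codes make the caller's dispatch self-documenting. */
	switch (handle_fault(1, 0)) {
	case RET_PF_RETRY:   puts("retry");   break;
	case RET_PF_EMULATE: puts("emulate"); break;
	default:             puts("invalid"); break;
	}
	return 0;
}
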
+diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
+index f7a7b98b3271..1079228e4fef 100644
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -590,8 +590,14 @@ static int get_npt_level(void)
+ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+ {
+ vcpu->arch.efer = efer;
+- if (!npt_enabled && !(efer & EFER_LMA))
+- efer &= ~EFER_LME;
++
++ if (!npt_enabled) {
++ /* Shadow paging assumes NX to be available. */
++ efer |= EFER_NX;
++
++ if (!(efer & EFER_LMA))
++ efer &= ~EFER_LME;
++ }
+
+ to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
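
The svm.c hunk forces EFER.NX on whenever NPT is disabled, since KVM's shadow paging relies on NX to make disallowed huge-page mappings non-executable. A minimal user-space sketch of the same bit logic, using the architectural EFER bit positions; the adjust_efer() helper is hypothetical:

#include <stdint.h>
#include <stdio.h>

#define EFER_LME (1ULL << 8)   /* long mode enable */
#define EFER_LMA (1ULL << 10)  /* long mode active */
#define EFER_NX  (1ULL << 11)  /* no-execute enable */

/* Mirrors the shape of the fix: under shadow paging, force NX on and
 * drop LME whenever LMA is clear. */
static uint64_t adjust_efer(uint64_t efer, int npt_enabled)
{
	if (!npt_enabled) {
		efer |= EFER_NX;
		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}
	return efer;
}

int main(void)
{
	uint64_t efer = EFER_LME;  /* LME set, LMA clear, NX clear */
	printf("0x%llx\n", (unsigned long long)adjust_efer(efer, 0));
	/* prints 0x800: LME dropped, NX forced on */
	return 0;
}
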
+diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
+index 6b66d1f0d185..4c0d6d0d6337 100644
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -2219,17 +2219,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
+ u64 guest_efer = vmx->vcpu.arch.efer;
+ u64 ignore_bits = 0;
+
+- if (!enable_ept) {
+- /*
+- * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
+- * host CPUID is more efficient than testing guest CPUID
+- * or CR4. Host SMEP is anyway a requirement for guest SMEP.
+- */
+- if (boot_cpu_has(X86_FEATURE_SMEP))
+- guest_efer |= EFER_NX;
+- else if (!(guest_efer & EFER_NX))
+- ignore_bits |= EFER_NX;
+- }
++ /* Shadow paging assumes NX to be available. */
++ if (!enable_ept)
++ guest_efer |= EFER_NX;
+
+ /*
+ * LMA and LME handled by hardware; SCE meaningless outside long mode.
+@@ -6556,16 +6548,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
+ NULL, 0) == EMULATE_DONE;
+ }
+
+- ret = handle_mmio_page_fault(vcpu, gpa, true);
+- if (likely(ret == RET_MMIO_PF_EMULATE))
+- return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
+- EMULATE_DONE;
+-
+- if (unlikely(ret == RET_MMIO_PF_INVALID))
+- return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
+-
+- if (unlikely(ret == RET_MMIO_PF_RETRY))
+- return 1;
++ ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
++ if (ret >= 0)
++ return ret;
+
+ /* It is the real ept misconfig */
+ WARN_ON(1);
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 0b6517f5821b..06cd710e1d45 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -191,6 +191,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
+ { "mmu_unsync", VM_STAT(mmu_unsync) },
+ { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+ { "largepages", VM_STAT(lpages) },
++ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits) },
+ { NULL }
+ };
+
+@@ -587,7 +588,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+ gfn_t gfn;
+ int r;
+
+- if (is_long_mode(vcpu) || !is_pae(vcpu))
++ if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
+ return false;
+
+ if (!test_bit(VCPU_EXREG_PDPTR,
+@@ -1031,6 +1032,14 @@ u64 kvm_get_arch_capabilities(void)
+
+ rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
+
++ /*
++ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
++ * the nested hypervisor runs with NX huge pages. If it is not,
++ * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
++ * L1 guests, so it need not worry about its own (L2) guests.
++ */
++ data |= ARCH_CAP_PSCHANGE_MC_NO;
++
+ /*
+ * If we're doing cache flushes (either "always" or "cond")
+ * we will do one whenever the guest does a vmlaunch/vmresume.
+@@ -1043,8 +1052,35 @@ u64 kvm_get_arch_capabilities(void)
+ if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
+ data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+
++ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
++ data |= ARCH_CAP_RDCL_NO;
++ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
++ data |= ARCH_CAP_SSB_NO;
++ if (!boot_cpu_has_bug(X86_BUG_MDS))
++ data |= ARCH_CAP_MDS_NO;
++
++ /*
++ * On TAA affected systems, export MDS_NO=0 when:
++ * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
++ * - Updated microcode is present. This is detected by
++ * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
++ * that VERW clears CPU buffers.
++ *
++ * When MDS_NO=0 is exported, guests deploy clear CPU buffer
++ * mitigation and don't complain:
++ *
++ * "Vulnerable: Clear CPU buffers attempted, no microcode"
++ *
++ * If TSX is disabled on the system, guests are also mitigated against
++ * TAA and clear CPU buffer mitigation is not required for guests.
++ */
++ if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) &&
++ (data & ARCH_CAP_TSX_CTRL_MSR))
++ data &= ~ARCH_CAP_MDS_NO;
++
+ return data;
+ }
++
+ EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
+
+ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
+@@ -5951,17 +5987,17 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
+
+ smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+
+- spin_lock(&kvm_lock);
++ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->cpu != freq->cpu)
+ continue;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+- if (vcpu->cpu != smp_processor_id())
++ if (vcpu->cpu != raw_smp_processor_id())
+ send_ipi = 1;
+ }
+ }
+- spin_unlock(&kvm_lock);
++ mutex_unlock(&kvm_lock);
+
+ if (freq->old < freq->new && send_ipi) {
+ /*
+@@ -6099,12 +6135,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
+ struct kvm_vcpu *vcpu;
+ int i;
+
+- spin_lock(&kvm_lock);
++ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ atomic_set(&kvm_guest_has_master_clock, 0);
+- spin_unlock(&kvm_lock);
++ mutex_unlock(&kvm_lock);
+ }
+
+ static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
+@@ -7491,7 +7527,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ kvm_update_cpuid(vcpu);
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+- if (!is_long_mode(vcpu) && is_pae(vcpu)) {
++ if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) {
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ mmu_reset_needed = 1;
+ }
+@@ -8072,6 +8108,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+ INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
++ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
+ atomic_set(&kvm->arch.noncoherent_dma_count, 0);
+
+@@ -8100,6 +8137,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+ return 0;
+ }
+
++int kvm_arch_post_init_vm(struct kvm *kvm)
++{
++ return kvm_mmu_post_init_vm(kvm);
++}
++
+ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+ {
+ int r;
+@@ -8206,6 +8248,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
+ }
+ EXPORT_SYMBOL_GPL(x86_set_memory_region);
+
++void kvm_arch_pre_destroy_vm(struct kvm *kvm)
++{
++ kvm_mmu_pre_destroy_vm(kvm);
++}
++
+ void kvm_arch_destroy_vm(struct kvm *kvm)
+ {
+ if (current->mm == kvm->mm) {
+diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
+index 3b123735a1c4..677c5f36674b 100644
+--- a/drivers/base/cpu.c
++++ b/drivers/base/cpu.c
+@@ -537,12 +537,27 @@ ssize_t __weak cpu_show_mds(struct device *dev,
+ return sprintf(buf, "Not affected\n");
+ }
+
++ssize_t __weak cpu_show_tsx_async_abort(struct device *dev,
++ struct device_attribute *attr,
++ char *buf)
++{
++ return sprintf(buf, "Not affected\n");
++}
++
++ssize_t __weak cpu_show_itlb_multihit(struct device *dev,
++ struct device_attribute *attr, char *buf)
++{
++ return sprintf(buf, "Not affected\n");
++}
++
+ static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
+ static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
+ static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
+ static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
+ static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
+ static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);
++static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL);
++static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL);
+
+ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
+ &dev_attr_meltdown.attr,
+@@ -551,6 +566,8 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
+ &dev_attr_spec_store_bypass.attr,
+ &dev_attr_l1tf.attr,
+ &dev_attr_mds.attr,
++ &dev_attr_tsx_async_abort.attr,
++ &dev_attr_itlb_multihit.attr,
+ NULL
+ };
+
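
The drivers/base/cpu.c hunk adds weak "Not affected" defaults for the two new sysfs vulnerability files; architectures that are actually affected override them with strong definitions. A compilable sketch of that weak-symbol pattern, with a simplified signature and illustrative name:

#include <stdio.h>

/* Weak default: used unless another translation unit provides a strong
 * definition, which is how arch code overrides the "Not affected" stub. */
__attribute__((weak)) int show_itlb_multihit(char *buf, int len)
{
	return snprintf(buf, len, "Not affected\n");
}

int main(void)
{
	char buf[64];

	show_itlb_multihit(buf, sizeof(buf));
	fputs(buf, stdout);
	return 0;
}
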
+diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
+index a2f6953a86f5..0a21fb86fd67 100644
+--- a/drivers/bluetooth/hci_ldisc.c
++++ b/drivers/bluetooth/hci_ldisc.c
+@@ -653,15 +653,14 @@ static int hci_uart_set_proto(struct hci_uart *hu, int id)
+ return err;
+
+ hu->proto = p;
+- set_bit(HCI_UART_PROTO_READY, &hu->flags);
+
+ err = hci_uart_register_dev(hu);
+ if (err) {
+- clear_bit(HCI_UART_PROTO_READY, &hu->flags);
+ p->close(hu);
+ return err;
+ }
+
++ set_bit(HCI_UART_PROTO_READY, &hu->flags);
+ return 0;
+ }
+
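
The hci_ldisc.c fix moves the HCI_UART_PROTO_READY set_bit() to after successful registration, so the flag can never be observed for a device whose setup may still fail. A small sketch of that publish-last ordering, with an illustrative struct device standing in for the hci_uart flags:

#include <stdio.h>

struct device { int ready; };                /* illustrative stand-in */

static int register_dev(void) { return 0; }  /* pretend success */

/* Mirrors the reordering above: READY is published only after
 * registration succeeds, so the failure path no longer has to
 * unwind a flag that concurrent readers might already have seen. */
static int set_proto(struct device *dev)
{
	int err = register_dev();

	if (err)
		return err;  /* dev->ready never set, nothing to clear */
	dev->ready = 1;      /* publish last */
	return 0;
}

int main(void)
{
	struct device d = { 0 };
	int err = set_proto(&d);

	printf("err=%d ready=%d\n", err, d.ready);
	return 0;
}
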
+diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
+index 95e28ecfde0a..99c7cf4822c3 100644
+--- a/drivers/usb/gadget/udc/core.c
++++ b/drivers/usb/gadget/udc/core.c
+@@ -817,6 +817,8 @@ int usb_gadget_map_request_by_dev(struct device *dev,
+ dev_err(dev, "failed to map buffer\n");
+ return -EFAULT;
+ }
++
++ req->dma_mapped = 1;
+ }
+
+ return 0;
+@@ -841,9 +843,10 @@ void usb_gadget_unmap_request_by_dev(struct device *dev,
+ is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+
+ req->num_mapped_sgs = 0;
+- } else {
++ } else if (req->dma_mapped) {
+ dma_unmap_single(dev, req->dma, req->length,
+ is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
++ req->dma_mapped = 0;
+ }
+ }
+ EXPORT_SYMBOL_GPL(usb_gadget_unmap_request_by_dev);
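
The gadget-core hunks record in the new dma_mapped bit whether usb_gadget_map_request_by_dev() actually took the single-buffer mapping path, so the unmap path only undoes work that was really done. A user-space analog, with malloc/free standing in for dma_map_single()/dma_unmap_single():

#include <stdio.h>
#include <stdlib.h>

/* Illustrative request: tracks whether map_request() took the
 * single-buffer path, as the new dma_mapped bit does above. */
struct request { void *buf; int dma_mapped; };

static void map_request(struct request *req)
{
	req->buf = malloc(64);
	if (req->buf)
		req->dma_mapped = 1;   /* record that unmap is required */
}

static void unmap_request(struct request *req)
{
	if (!req->dma_mapped)          /* never mapped: skip the teardown */
		return;
	free(req->buf);
	req->dma_mapped = 0;           /* make a second unmap harmless */
}

int main(void)
{
	struct request req = { 0 };

	unmap_request(&req);   /* safe even though map was never called */
	map_request(&req);
	unmap_request(&req);
	unmap_request(&req);   /* double unmap is now a no-op */
	puts("ok");
	return 0;
}
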
+diff --git a/include/linux/cpu.h b/include/linux/cpu.h
+index b27c9b2e683f..e19bbc38a722 100644
+--- a/include/linux/cpu.h
++++ b/include/linux/cpu.h
+@@ -56,6 +56,11 @@ extern ssize_t cpu_show_l1tf(struct device *dev,
+ struct device_attribute *attr, char *buf);
+ extern ssize_t cpu_show_mds(struct device *dev,
+ struct device_attribute *attr, char *buf);
++extern ssize_t cpu_show_tsx_async_abort(struct device *dev,
++ struct device_attribute *attr,
++ char *buf);
++extern ssize_t cpu_show_itlb_multihit(struct device *dev,
++ struct device_attribute *attr, char *buf);
+
+ extern __printf(4, 5)
+ struct device *cpu_device_create(struct device *parent, void *drvdata,
+@@ -282,28 +287,7 @@ static inline int cpuhp_smt_enable(void) { return 0; }
+ static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; }
+ #endif
+
+-/*
+- * These are used for a global "mitigations=" cmdline option for toggling
+- * optional CPU mitigations.
+- */
+-enum cpu_mitigations {
+- CPU_MITIGATIONS_OFF,
+- CPU_MITIGATIONS_AUTO,
+- CPU_MITIGATIONS_AUTO_NOSMT,
+-};
+-
+-extern enum cpu_mitigations cpu_mitigations;
+-
+-/* mitigations=off */
+-static inline bool cpu_mitigations_off(void)
+-{
+- return cpu_mitigations == CPU_MITIGATIONS_OFF;
+-}
+-
+-/* mitigations=auto,nosmt */
+-static inline bool cpu_mitigations_auto_nosmt(void)
+-{
+- return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
+-}
++extern bool cpu_mitigations_off(void);
++extern bool cpu_mitigations_auto_nosmt(void);
+
+ #endif /* _LINUX_CPU_H_ */
+diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
+index eb55374b73f3..0590e7d47b02 100644
+--- a/include/linux/kvm_host.h
++++ b/include/linux/kvm_host.h
+@@ -129,7 +129,7 @@ static inline bool is_error_page(struct page *page)
+
+ extern struct kmem_cache *kvm_vcpu_cache;
+
+-extern spinlock_t kvm_lock;
++extern struct mutex kvm_lock;
+ extern struct list_head vm_list;
+
+ struct kvm_io_range {
+@@ -1208,4 +1208,10 @@ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu)
+ }
+ #endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */
+
++typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
++
++int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
++ uintptr_t data, const char *name,
++ struct task_struct **thread_ptr);
++
+ #endif
+diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
+index e4516e9ded0f..4b810bc7ae63 100644
+--- a/include/linux/usb/gadget.h
++++ b/include/linux/usb/gadget.h
+@@ -48,6 +48,7 @@ struct usb_ep;
+ * by adding a zero length packet as needed;
+ * @short_not_ok: When reading data, makes short packets be
+ * treated as errors (queue stops advancing till cleanup).
++ * @dma_mapped: Indicates if request has been mapped to DMA (internal)
+ * @complete: Function called when request completes, so this request and
+ * its buffer may be re-used. The function will always be called with
+ * interrupts disabled, and it must not sleep.
+@@ -103,6 +104,7 @@ struct usb_request {
+ unsigned no_interrupt:1;
+ unsigned zero:1;
+ unsigned short_not_ok:1;
++ unsigned dma_mapped:1;
+
+ void (*complete)(struct usb_ep *ep,
+ struct usb_request *req);
+diff --git a/kernel/cpu.c b/kernel/cpu.c
+index c947bb35b89f..0ed3e9deda30 100644
+--- a/kernel/cpu.c
++++ b/kernel/cpu.c
+@@ -2235,7 +2235,18 @@ void __init boot_cpu_hotplug_init(void)
+ this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
+ }
+
+-enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
++/*
++ * These are used for a global "mitigations=" cmdline option for toggling
++ * optional CPU mitigations.
++ */
++enum cpu_mitigations {
++ CPU_MITIGATIONS_OFF,
++ CPU_MITIGATIONS_AUTO,
++ CPU_MITIGATIONS_AUTO_NOSMT,
++};
++
++static enum cpu_mitigations cpu_mitigations __ro_after_init =
++ CPU_MITIGATIONS_AUTO;
+
+ static int __init mitigations_parse_cmdline(char *arg)
+ {
+@@ -2252,3 +2263,17 @@ static int __init mitigations_parse_cmdline(char *arg)
+ return 0;
+ }
+ early_param("mitigations", mitigations_parse_cmdline);
++
++/* mitigations=off */
++bool cpu_mitigations_off(void)
++{
++ return cpu_mitigations == CPU_MITIGATIONS_OFF;
++}
++EXPORT_SYMBOL_GPL(cpu_mitigations_off);
++
++/* mitigations=auto,nosmt */
++bool cpu_mitigations_auto_nosmt(void)
++{
++ return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
++}
++EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
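
kernel/cpu.c now keeps the cpu_mitigations enum private and exports just two boolean accessors, which is what lets modular code (KVM here) test the global "mitigations=" setting. A self-contained sketch of that encapsulation pattern with simplified names in place of the kernel's cpu_mitigations_* symbols:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* The enum stays private to this file; callers only see the two
 * boolean accessors below. */
enum mitigations { MITIGATIONS_OFF, MITIGATIONS_AUTO, MITIGATIONS_AUTO_NOSMT };
static enum mitigations mitigations = MITIGATIONS_AUTO;

static void parse_cmdline(const char *arg)
{
	if (!strcmp(arg, "off"))
		mitigations = MITIGATIONS_OFF;
	else if (!strcmp(arg, "auto,nosmt"))
		mitigations = MITIGATIONS_AUTO_NOSMT;
}

bool mitigations_off(void)        { return mitigations == MITIGATIONS_OFF; }
bool mitigations_auto_nosmt(void) { return mitigations == MITIGATIONS_AUTO_NOSMT; }

int main(void)
{
	parse_cmdline("auto,nosmt");
	printf("off=%d nosmt=%d\n", mitigations_off(), mitigations_auto_nosmt());
	return 0;
}
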
+diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
+index c72586a094ed..0fc93519e63e 100644
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -49,6 +49,7 @@
+ #include <linux/slab.h>
+ #include <linux/sort.h>
+ #include <linux/bsearch.h>
++#include <linux/kthread.h>
+
+ #include <asm/processor.h>
+ #include <asm/io.h>
+@@ -87,7 +88,7 @@ module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
+ * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
+ */
+
+-DEFINE_SPINLOCK(kvm_lock);
++DEFINE_MUTEX(kvm_lock);
+ static DEFINE_RAW_SPINLOCK(kvm_count_lock);
+ LIST_HEAD(vm_list);
+
+@@ -612,6 +613,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
+ return 0;
+ }
+
++/*
++ * Called after the VM is otherwise initialized, but just before adding it to
++ * the vm_list.
++ */
++int __weak kvm_arch_post_init_vm(struct kvm *kvm)
++{
++ return 0;
++}
++
++/*
++ * Called just after removing the VM from the vm_list, but before doing any
++ * other destruction.
++ */
++void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
++{
++}
++
+ static struct kvm *kvm_create_vm(unsigned long type)
+ {
+ int r, i;
+@@ -659,22 +677,31 @@ static struct kvm *kvm_create_vm(unsigned long type)
+ kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
+ GFP_KERNEL);
+ if (!kvm->buses[i])
+- goto out_err;
++ goto out_err_no_mmu_notifier;
+ }
+
+ r = kvm_init_mmu_notifier(kvm);
++ if (r)
++ goto out_err_no_mmu_notifier;
++
++ r = kvm_arch_post_init_vm(kvm);
+ if (r)
+ goto out_err;
+
+- spin_lock(&kvm_lock);
++ mutex_lock(&kvm_lock);
+ list_add(&kvm->vm_list, &vm_list);
+- spin_unlock(&kvm_lock);
++ mutex_unlock(&kvm_lock);
+
+ preempt_notifier_inc();
+
+ return kvm;
+
+ out_err:
++#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
++ if (kvm->mmu_notifier.ops)
++ mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
++#endif
++out_err_no_mmu_notifier:
+ cleanup_srcu_struct(&kvm->irq_srcu);
+ out_err_no_irq_srcu:
+ cleanup_srcu_struct(&kvm->srcu);
+@@ -724,9 +751,11 @@ static void kvm_destroy_vm(struct kvm *kvm)
+
+ kvm_destroy_vm_debugfs(kvm);
+ kvm_arch_sync_events(kvm);
+- spin_lock(&kvm_lock);
++ mutex_lock(&kvm_lock);
+ list_del(&kvm->vm_list);
+- spin_unlock(&kvm_lock);
++ mutex_unlock(&kvm_lock);
++ kvm_arch_pre_destroy_vm(kvm);
++
+ kvm_free_irq_routing(kvm);
+ for (i = 0; i < KVM_NR_BUSES; i++) {
+ if (kvm->buses[i])
+@@ -3752,13 +3781,13 @@ static int vm_stat_get(void *_offset, u64 *val)
+ u64 tmp_val;
+
+ *val = 0;
+- spin_lock(&kvm_lock);
++ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ stat_tmp.kvm = kvm;
+ vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+ *val += tmp_val;
+ }
+- spin_unlock(&kvm_lock);
++ mutex_unlock(&kvm_lock);
+ return 0;
+ }
+
+@@ -3772,13 +3801,13 @@ static int vcpu_stat_get(void *_offset, u64 *val)
+ u64 tmp_val;
+
+ *val = 0;
+- spin_lock(&kvm_lock);
++ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ stat_tmp.kvm = kvm;
+ vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+ *val += tmp_val;
+ }
+- spin_unlock(&kvm_lock);
++ mutex_unlock(&kvm_lock);
+ return 0;
+ }
+
+@@ -3987,3 +4016,86 @@ void kvm_exit(void)
+ kvm_vfio_ops_exit();
+ }
+ EXPORT_SYMBOL_GPL(kvm_exit);
++
++struct kvm_vm_worker_thread_context {
++ struct kvm *kvm;
++ struct task_struct *parent;
++ struct completion init_done;
++ kvm_vm_thread_fn_t thread_fn;
++ uintptr_t data;
++ int err;
++};
++
++static int kvm_vm_worker_thread(void *context)
++{
++ /*
++ * The init_context is allocated on the stack of the parent thread, so
++ * we have to locally copy anything that is needed beyond initialization
++ */
++ struct kvm_vm_worker_thread_context *init_context = context;
++ struct kvm *kvm = init_context->kvm;
++ kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
++ uintptr_t data = init_context->data;
++ int err;
++
++ err = kthread_park(current);
++ /* kthread_park(current) is never supposed to return an error */
++ WARN_ON(err != 0);
++ if (err)
++ goto init_complete;
++
++ err = cgroup_attach_task_all(init_context->parent, current);
++ if (err) {
++ kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
++ __func__, err);
++ goto init_complete;
++ }
++
++ set_user_nice(current, task_nice(init_context->parent));
++
++init_complete:
++ init_context->err = err;
++ complete(&init_context->init_done);
++ init_context = NULL;
++
++ if (err)
++ return err;
++
++ /* Wait to be woken up by the spawner before proceeding. */
++ kthread_parkme();
++
++ if (!kthread_should_stop())
++ err = thread_fn(kvm, data);
++
++ return err;
++}
++
++int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
++ uintptr_t data, const char *name,
++ struct task_struct **thread_ptr)
++{
++ struct kvm_vm_worker_thread_context init_context = {};
++ struct task_struct *thread;
++
++ *thread_ptr = NULL;
++ init_context.kvm = kvm;
++ init_context.parent = current;
++ init_context.thread_fn = thread_fn;
++ init_context.data = data;
++ init_completion(&init_context.init_done);
++
++ thread = kthread_run(kvm_vm_worker_thread, &init_context,
++ "%s-%d", name, task_pid_nr(current));
++ if (IS_ERR(thread))
++ return PTR_ERR(thread);
++
++ /* kthread_run is never supposed to return NULL */
++ WARN_ON(thread == NULL);
++
++ wait_for_completion(&init_context.init_done);
++
++ if (!init_context.err)
++ *thread_ptr = thread;
++
++ return init_context.err;
++}
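
kvm_vm_create_worker_thread() hands the worker a context that lives on the spawner's stack; the worker copies what it needs, reports its init result, and signals init_done before the spawner's frame can disappear. A user-space analog of that handshake using pthreads; the names and signatures are illustrative, not the kernel API:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct worker_ctx {
	int (*fn)(uintptr_t data);
	uintptr_t data;
	int err;
	int initialized;
	pthread_mutex_t lock;
	pthread_cond_t init_done;
};

static void *worker(void *arg)
{
	struct worker_ctx *ctx = arg;
	/* Local copies: the spawner's stack frame may vanish after init. */
	int (*fn)(uintptr_t) = ctx->fn;
	uintptr_t data = ctx->data;

	pthread_mutex_lock(&ctx->lock);
	ctx->err = 0;                    /* report init success */
	ctx->initialized = 1;
	pthread_cond_signal(&ctx->init_done);
	pthread_mutex_unlock(&ctx->lock);
	ctx = NULL;                      /* no further spawner-stack access */

	return (void *)(intptr_t)fn(data);
}

static int print_fn(uintptr_t d)
{
	printf("worker ran with data %lu\n", (unsigned long)d);
	return 0;
}

int main(void)
{
	struct worker_ctx ctx = {
		.fn = print_fn, .data = 42,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.init_done = PTHREAD_COND_INITIALIZER,
	};
	pthread_t t;

	if (pthread_create(&t, NULL, worker, &ctx))
		return 1;

	/* wait_for_completion() analog: block until the worker reports. */
	pthread_mutex_lock(&ctx.lock);
	while (!ctx.initialized)
		pthread_cond_wait(&ctx.init_done, &ctx.lock);
	pthread_mutex_unlock(&ctx.lock);

	pthread_join(t, NULL);
	return ctx.err;
}
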