From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.9 commit in: /
Date: Fri, 28 Feb 2020 15:29:34
Message-Id: 1582903748.017c60e562fe805f91e2f4ab9bb47f59d61bbbb1.mpagano@gentoo
1 commit: 017c60e562fe805f91e2f4ab9bb47f59d61bbbb1
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Fri Feb 28 15:29:08 2020 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Fri Feb 28 15:29:08 2020 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=017c60e5
7
8 Linux patch 4.9.215
9
10 Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>
11
12 0000_README | 4 +
13 1214_linux-4.9.215.patch | 12974 +++++++++++++++++++++++++++++++++++++++++++++
14 2 files changed, 12978 insertions(+)
15
16 diff --git a/0000_README b/0000_README
17 index bc32b07..76947fa 100644
18 --- a/0000_README
19 +++ b/0000_README
20 @@ -899,6 +899,10 @@ Patch: 1213_linux-4.9.214.patch
21 From: http://www.kernel.org
22 Desc: Linux 4.9.214
23
24 +Patch: 1214_linux-4.9.215.patch
25 +From: http://www.kernel.org
26 +Desc: Linux 4.9.215
27 +
28 Patch: 1500_XATTR_USER_PREFIX.patch
29 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
30 Desc: Support for namespace user.pax.* on tmpfs.
31
32 diff --git a/1214_linux-4.9.215.patch b/1214_linux-4.9.215.patch
33 new file mode 100644
34 index 0000000..4f2ddb5
35 --- /dev/null
36 +++ b/1214_linux-4.9.215.patch
37 @@ -0,0 +1,12974 @@
38 +diff --git a/Makefile b/Makefile
39 +index 9a6aa41a9ec1..b594484788a8 100644
40 +--- a/Makefile
41 ++++ b/Makefile
42 +@@ -1,6 +1,6 @@
43 + VERSION = 4
44 + PATCHLEVEL = 9
45 +-SUBLEVEL = 214
46 ++SUBLEVEL = 215
47 + EXTRAVERSION =
48 + NAME = Roaring Lionus
49 +
50 +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
51 +index 74a70f91b01a..56bd9beb6a35 100644
52 +--- a/arch/arm/Kconfig
53 ++++ b/arch/arm/Kconfig
54 +@@ -2020,7 +2020,7 @@ config XIP_PHYS_ADDR
55 + config KEXEC
56 + bool "Kexec system call (EXPERIMENTAL)"
57 + depends on (!SMP || PM_SLEEP_SMP)
58 +- depends on !CPU_V7M
59 ++ depends on MMU
60 + select KEXEC_CORE
61 + help
62 + kexec is a system call that implements the ability to shutdown your
63 +diff --git a/arch/arm/boot/dts/r8a7779.dtsi b/arch/arm/boot/dts/r8a7779.dtsi
64 +index b9bbcce69dfb..6c6d4893e92d 100644
65 +--- a/arch/arm/boot/dts/r8a7779.dtsi
66 ++++ b/arch/arm/boot/dts/r8a7779.dtsi
67 +@@ -67,6 +67,14 @@
68 + <0xf0000100 0x100>;
69 + };
70 +
71 ++ timer@f0000200 {
72 ++ compatible = "arm,cortex-a9-global-timer";
73 ++ reg = <0xf0000200 0x100>;
74 ++ interrupts = <GIC_PPI 11
75 ++ (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
76 ++ clocks = <&cpg_clocks R8A7779_CLK_ZS>;
77 ++ };
78 ++
79 + timer@f0000600 {
80 + compatible = "arm,cortex-a9-twd-timer";
81 + reg = <0xf0000600 0x20>;
82 +diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
83 +index 7e842dcae450..3626655175a2 100644
84 +--- a/arch/arm64/include/asm/alternative.h
85 ++++ b/arch/arm64/include/asm/alternative.h
86 +@@ -29,13 +29,16 @@ typedef void (*alternative_cb_t)(struct alt_instr *alt,
87 + void __init apply_alternatives_all(void);
88 + void apply_alternatives(void *start, size_t length);
89 +
90 +-#define ALTINSTR_ENTRY(feature,cb) \
91 ++#define ALTINSTR_ENTRY(feature) \
92 + " .word 661b - .\n" /* label */ \
93 +- " .if " __stringify(cb) " == 0\n" \
94 + " .word 663f - .\n" /* new instruction */ \
95 +- " .else\n" \
96 ++ " .hword " __stringify(feature) "\n" /* feature bit */ \
97 ++ " .byte 662b-661b\n" /* source len */ \
98 ++ " .byte 664f-663f\n" /* replacement len */
99 ++
100 ++#define ALTINSTR_ENTRY_CB(feature, cb) \
101 ++ " .word 661b - .\n" /* label */ \
102 + " .word " __stringify(cb) "- .\n" /* callback */ \
103 +- " .endif\n" \
104 + " .hword " __stringify(feature) "\n" /* feature bit */ \
105 + " .byte 662b-661b\n" /* source len */ \
106 + " .byte 664f-663f\n" /* replacement len */
107 +@@ -56,15 +59,14 @@ void apply_alternatives(void *start, size_t length);
108 + *
109 + * Alternatives with callbacks do not generate replacement instructions.
110 + */
111 +-#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled, cb) \
112 ++#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled) \
113 + ".if "__stringify(cfg_enabled)" == 1\n" \
114 + "661:\n\t" \
115 + oldinstr "\n" \
116 + "662:\n" \
117 + ".pushsection .altinstructions,\"a\"\n" \
118 +- ALTINSTR_ENTRY(feature,cb) \
119 ++ ALTINSTR_ENTRY(feature) \
120 + ".popsection\n" \
121 +- " .if " __stringify(cb) " == 0\n" \
122 + ".pushsection .altinstr_replacement, \"a\"\n" \
123 + "663:\n\t" \
124 + newinstr "\n" \
125 +@@ -72,17 +74,25 @@ void apply_alternatives(void *start, size_t length);
126 + ".popsection\n\t" \
127 + ".org . - (664b-663b) + (662b-661b)\n\t" \
128 + ".org . - (662b-661b) + (664b-663b)\n" \
129 +- ".else\n\t" \
130 ++ ".endif\n"
131 ++
132 ++#define __ALTERNATIVE_CFG_CB(oldinstr, feature, cfg_enabled, cb) \
133 ++ ".if "__stringify(cfg_enabled)" == 1\n" \
134 ++ "661:\n\t" \
135 ++ oldinstr "\n" \
136 ++ "662:\n" \
137 ++ ".pushsection .altinstructions,\"a\"\n" \
138 ++ ALTINSTR_ENTRY_CB(feature, cb) \
139 ++ ".popsection\n" \
140 + "663:\n\t" \
141 + "664:\n\t" \
142 +- ".endif\n" \
143 + ".endif\n"
144 +
145 + #define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...) \
146 +- __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg), 0)
147 ++ __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg))
148 +
149 + #define ALTERNATIVE_CB(oldinstr, cb) \
150 +- __ALTERNATIVE_CFG(oldinstr, "NOT_AN_INSTRUCTION", ARM64_CB_PATCH, 1, cb)
151 ++ __ALTERNATIVE_CFG_CB(oldinstr, ARM64_CB_PATCH, 1, cb)
152 + #else
153 +
154 + #include <asm/assembler.h>
155 +diff --git a/arch/microblaze/kernel/cpu/cache.c b/arch/microblaze/kernel/cpu/cache.c
156 +index 0bde47e4fa69..dcba53803fa5 100644
157 +--- a/arch/microblaze/kernel/cpu/cache.c
158 ++++ b/arch/microblaze/kernel/cpu/cache.c
159 +@@ -92,7 +92,8 @@ static inline void __disable_dcache_nomsr(void)
160 + #define CACHE_LOOP_LIMITS(start, end, cache_line_length, cache_size) \
161 + do { \
162 + int align = ~(cache_line_length - 1); \
163 +- end = min(start + cache_size, end); \
164 ++ if (start < UINT_MAX - cache_size) \
165 ++ end = min(start + cache_size, end); \
166 + start &= align; \
167 + } while (0)
168 +
169 +diff --git a/arch/mips/loongson64/loongson-3/platform.c b/arch/mips/loongson64/loongson-3/platform.c
170 +index 25a97cc0ee33..0db4cc3196eb 100644
171 +--- a/arch/mips/loongson64/loongson-3/platform.c
172 ++++ b/arch/mips/loongson64/loongson-3/platform.c
173 +@@ -31,6 +31,9 @@ static int __init loongson3_platform_init(void)
174 + continue;
175 +
176 + pdev = kzalloc(sizeof(struct platform_device), GFP_KERNEL);
177 ++ if (!pdev)
178 ++ return -ENOMEM;
179 ++
180 + pdev->name = loongson_sysconf.sensors[i].name;
181 + pdev->id = loongson_sysconf.sensors[i].id;
182 + pdev->dev.platform_data = &loongson_sysconf.sensors[i];
183 +diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
184 +index 620e08d4eb6e..adac3dee4c57 100644
185 +--- a/arch/powerpc/kernel/eeh_driver.c
186 ++++ b/arch/powerpc/kernel/eeh_driver.c
187 +@@ -520,12 +520,6 @@ static void *eeh_rmv_device(void *data, void *userdata)
188 +
189 + pci_iov_remove_virtfn(edev->physfn, pdn->vf_index, 0);
190 + edev->pdev = NULL;
191 +-
192 +- /*
193 +- * We have to set the VF PE number to invalid one, which is
194 +- * required to plug the VF successfully.
195 +- */
196 +- pdn->pe_number = IODA_INVALID_PE;
197 + #endif
198 + if (rmv_data)
199 + list_add(&edev->rmv_list, &rmv_data->edev_list);
200 +diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
201 +index 592693437070..c8f1b78fbd0e 100644
202 +--- a/arch/powerpc/kernel/pci_dn.c
203 ++++ b/arch/powerpc/kernel/pci_dn.c
204 +@@ -271,9 +271,22 @@ void remove_dev_pci_data(struct pci_dev *pdev)
205 + continue;
206 +
207 + #ifdef CONFIG_EEH
208 +- /* Release EEH device for the VF */
209 ++ /*
210 ++ * Release EEH state for this VF. The PCI core
211 ++ * has already torn down the pci_dev for this VF, but
212 ++ * we're responsible for removing the eeh_dev since it
213 ++ * has the same lifetime as the pci_dn that spawned it.
214 ++ */
215 + edev = pdn_to_eeh_dev(pdn);
216 + if (edev) {
217 ++ /*
218 ++ * We allocate pci_dn's for the totalvfs count,
219 ++ * but only the vfs that were activated
220 ++ * have a configured PE.
221 ++ */
222 ++ if (edev->pe)
223 ++ eeh_rmv_from_parent_pe(edev);
224 ++
225 + pdn->edev = NULL;
226 + kfree(edev);
227 + }
228 +diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
229 +index 3ec673b4ca6c..b787a669a1e2 100644
230 +--- a/arch/powerpc/platforms/powernv/pci-ioda.c
231 ++++ b/arch/powerpc/platforms/powernv/pci-ioda.c
232 +@@ -1524,6 +1524,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
233 +
234 + /* Reserve PE for each VF */
235 + for (vf_index = 0; vf_index < num_vfs; vf_index++) {
236 ++ int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
237 ++ int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
238 ++ struct pci_dn *vf_pdn;
239 ++
240 + if (pdn->m64_single_mode)
241 + pe_num = pdn->pe_num_map[vf_index];
242 + else
243 +@@ -1536,13 +1540,11 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
244 + pe->pbus = NULL;
245 + pe->parent_dev = pdev;
246 + pe->mve_number = -1;
247 +- pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
248 +- pci_iov_virtfn_devfn(pdev, vf_index);
249 ++ pe->rid = (vf_bus << 8) | vf_devfn;
250 +
251 + pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n",
252 + hose->global_number, pdev->bus->number,
253 +- PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
254 +- PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
255 ++ PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
256 +
257 + if (pnv_ioda_configure_pe(phb, pe)) {
258 + /* XXX What do we do here ? */
259 +@@ -1556,6 +1558,15 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
260 + list_add_tail(&pe->list, &phb->ioda.pe_list);
261 + mutex_unlock(&phb->ioda.pe_list_mutex);
262 +
263 ++ /* associate this pe to its pdn */
264 ++ list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
265 ++ if (vf_pdn->busno == vf_bus &&
266 ++ vf_pdn->devfn == vf_devfn) {
267 ++ vf_pdn->pe_number = pe_num;
268 ++ break;
269 ++ }
270 ++ }
271 ++
272 + pnv_pci_ioda2_setup_dma_pe(phb, pe);
273 + }
274 + }
275 +diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
276 +index 00dbf1e895a9..2ed7627e991e 100644
277 +--- a/arch/powerpc/platforms/powernv/pci.c
278 ++++ b/arch/powerpc/platforms/powernv/pci.c
279 +@@ -856,16 +856,12 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
280 + struct pnv_phb *phb = hose->private_data;
281 + #ifdef CONFIG_PCI_IOV
282 + struct pnv_ioda_pe *pe;
283 +- struct pci_dn *pdn;
284 +
285 + /* Fix the VF pdn PE number */
286 + if (pdev->is_virtfn) {
287 +- pdn = pci_get_pdn(pdev);
288 +- WARN_ON(pdn->pe_number != IODA_INVALID_PE);
289 + list_for_each_entry(pe, &phb->ioda.pe_list, list) {
290 + if (pe->rid == ((pdev->bus->number << 8) |
291 + (pdev->devfn & 0xff))) {
292 +- pdn->pe_number = pe->pe_number;
293 + pe->pdev = pdev;
294 + break;
295 + }
296 +diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
297 +index 69b8a41fca84..e094c0cf6936 100644
298 +--- a/arch/s390/include/asm/page.h
299 ++++ b/arch/s390/include/asm/page.h
300 +@@ -35,7 +35,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end);
301 +
302 + static inline void storage_key_init_range(unsigned long start, unsigned long end)
303 + {
304 +- if (PAGE_DEFAULT_KEY)
305 ++ if (PAGE_DEFAULT_KEY != 0)
306 + __storage_key_init_range(start, end);
307 + }
308 +
309 +diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
310 +index 0bb08f341c09..f1330245b584 100644
311 +--- a/arch/s390/include/asm/timex.h
312 ++++ b/arch/s390/include/asm/timex.h
313 +@@ -146,7 +146,7 @@ static inline void get_tod_clock_ext(char *clk)
314 +
315 + static inline unsigned long long get_tod_clock(void)
316 + {
317 +- unsigned char clk[STORE_CLOCK_EXT_SIZE];
318 ++ char clk[STORE_CLOCK_EXT_SIZE];
319 +
320 + get_tod_clock_ext(clk);
321 + return *((unsigned long long *)&clk[1]);
322 +diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S
323 +index be75e8e49e43..802a4ded9a62 100644
324 +--- a/arch/s390/kernel/mcount.S
325 ++++ b/arch/s390/kernel/mcount.S
326 +@@ -24,6 +24,12 @@ ENTRY(ftrace_stub)
327 + #define STACK_PTREGS (STACK_FRAME_OVERHEAD)
328 + #define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS)
329 + #define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW)
330 ++#ifdef __PACK_STACK
331 ++/* allocate just enough for r14, r15 and backchain */
332 ++#define TRACED_FUNC_FRAME_SIZE 24
333 ++#else
334 ++#define TRACED_FUNC_FRAME_SIZE STACK_FRAME_OVERHEAD
335 ++#endif
336 +
337 + ENTRY(_mcount)
338 + BR_EX %r14
339 +@@ -37,9 +43,16 @@ ENTRY(ftrace_caller)
340 + #ifndef CC_USING_HOTPATCH
341 + aghi %r0,MCOUNT_RETURN_FIXUP
342 + #endif
343 +- aghi %r15,-STACK_FRAME_SIZE
344 ++ # allocate stack frame for ftrace_caller to contain traced function
345 ++ aghi %r15,-TRACED_FUNC_FRAME_SIZE
346 + stg %r1,__SF_BACKCHAIN(%r15)
347 ++ stg %r0,(__SF_GPRS+8*8)(%r15)
348 ++ stg %r15,(__SF_GPRS+9*8)(%r15)
349 ++ # allocate pt_regs and stack frame for ftrace_trace_function
350 ++ aghi %r15,-STACK_FRAME_SIZE
351 + stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15)
352 ++ aghi %r1,-TRACED_FUNC_FRAME_SIZE
353 ++ stg %r1,__SF_BACKCHAIN(%r15)
354 + stg %r0,(STACK_PTREGS_PSW+8)(%r15)
355 + stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15)
356 + #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
357 +diff --git a/arch/sh/include/cpu-sh2a/cpu/sh7269.h b/arch/sh/include/cpu-sh2a/cpu/sh7269.h
358 +index 2a0ca8780f0d..e4caddd443da 100644
359 +--- a/arch/sh/include/cpu-sh2a/cpu/sh7269.h
360 ++++ b/arch/sh/include/cpu-sh2a/cpu/sh7269.h
361 +@@ -79,8 +79,15 @@ enum {
362 + GPIO_FN_WDTOVF,
363 +
364 + /* CAN */
365 +- GPIO_FN_CTX1, GPIO_FN_CRX1, GPIO_FN_CTX0, GPIO_FN_CTX0_CTX1,
366 +- GPIO_FN_CRX0, GPIO_FN_CRX0_CRX1, GPIO_FN_CRX0_CRX1_CRX2,
367 ++ GPIO_FN_CTX2, GPIO_FN_CRX2,
368 ++ GPIO_FN_CTX1, GPIO_FN_CRX1,
369 ++ GPIO_FN_CTX0, GPIO_FN_CRX0,
370 ++ GPIO_FN_CTX0_CTX1, GPIO_FN_CRX0_CRX1,
371 ++ GPIO_FN_CTX0_CTX1_CTX2, GPIO_FN_CRX0_CRX1_CRX2,
372 ++ GPIO_FN_CTX2_PJ21, GPIO_FN_CRX2_PJ20,
373 ++ GPIO_FN_CTX1_PJ23, GPIO_FN_CRX1_PJ22,
374 ++ GPIO_FN_CTX0_CTX1_PJ23, GPIO_FN_CRX0_CRX1_PJ22,
375 ++ GPIO_FN_CTX0_CTX1_CTX2_PJ21, GPIO_FN_CRX0_CRX1_CRX2_PJ20,
376 +
377 + /* DMAC */
378 + GPIO_FN_TEND0, GPIO_FN_DACK0, GPIO_FN_DREQ0,
379 +diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
380 +index 572db686f845..385d6d04564d 100644
381 +--- a/arch/sparc/kernel/vmlinux.lds.S
382 ++++ b/arch/sparc/kernel/vmlinux.lds.S
383 +@@ -151,12 +151,14 @@ SECTIONS
384 + }
385 + PERCPU_SECTION(SMP_CACHE_BYTES)
386 +
387 +-#ifdef CONFIG_JUMP_LABEL
388 + . = ALIGN(PAGE_SIZE);
389 + .exit.text : {
390 + EXIT_TEXT
391 + }
392 +-#endif
393 ++
394 ++ .exit.data : {
395 ++ EXIT_DATA
396 ++ }
397 +
398 + . = ALIGN(PAGE_SIZE);
399 + __init_end = .;
400 +diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
401 +index 3f9d1a83891a..50c1f77cab15 100644
402 +--- a/arch/x86/entry/vdso/vdso32-setup.c
403 ++++ b/arch/x86/entry/vdso/vdso32-setup.c
404 +@@ -10,6 +10,7 @@
405 + #include <linux/smp.h>
406 + #include <linux/kernel.h>
407 + #include <linux/mm_types.h>
408 ++#include <linux/elf.h>
409 +
410 + #include <asm/processor.h>
411 + #include <asm/vdso.h>
412 +diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
413 +index 00b56cc69d37..836b7e4a2005 100644
414 +--- a/arch/x86/events/amd/core.c
415 ++++ b/arch/x86/events/amd/core.c
416 +@@ -239,6 +239,7 @@ static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] =
417 + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
418 + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
419 + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60,
420 ++ [PERF_COUNT_HW_CACHE_MISSES] = 0x0964,
421 + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
422 + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
423 + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287,
424 +diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
425 +index ad31c01f810f..f562ddbeb20c 100644
426 +--- a/arch/x86/events/intel/ds.c
427 ++++ b/arch/x86/events/intel/ds.c
428 +@@ -1326,6 +1326,8 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
429 + old = ((s64)(prev_raw_count << shift) >> shift);
430 + local64_add(new - old + count * period, &event->count);
431 +
432 ++ local64_set(&hwc->period_left, -new);
433 ++
434 + perf_event_update_userpage(event);
435 +
436 + return 0;
437 +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
438 +index ccc4420f051b..fb457ba8ccc6 100644
439 +--- a/arch/x86/include/asm/cpufeatures.h
440 ++++ b/arch/x86/include/asm/cpufeatures.h
441 +@@ -305,6 +305,7 @@
442 + /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
443 + #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
444 + #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
445 ++#define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */
446 +
447 + /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
448 + #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
449 +diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
450 +index e728699db774..3a01996db58f 100644
451 +--- a/arch/x86/include/asm/vgtod.h
452 ++++ b/arch/x86/include/asm/vgtod.h
453 +@@ -89,8 +89,13 @@ static inline unsigned int __getcpu(void)
454 + * works on all CPUs. This is volatile so that it orders
455 + * correctly wrt barrier() and to keep gcc from cleverly
456 + * hoisting it out of the calling function.
457 ++ *
458 ++ * If RDPID is available, use it.
459 + */
460 +- asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
461 ++ alternative_io ("lsl %[p],%[seg]",
462 ++ ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */
463 ++ X86_FEATURE_RDPID,
464 ++ [p] "=a" (p), [seg] "r" (__PER_CPU_SEG));
465 +
466 + return p;
467 + }
468 +diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
469 +index 2a473cda3977..775d5f028fe8 100644
470 +--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
471 ++++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
472 +@@ -846,9 +846,12 @@ static const struct sysfs_ops threshold_ops = {
473 + .store = store,
474 + };
475 +
476 ++static void threshold_block_release(struct kobject *kobj);
477 ++
478 + static struct kobj_type threshold_ktype = {
479 + .sysfs_ops = &threshold_ops,
480 + .default_attrs = default_attrs,
481 ++ .release = threshold_block_release,
482 + };
483 +
484 + static const char *get_name(unsigned int bank, struct threshold_block *b)
485 +@@ -879,8 +882,9 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
486 + return buf_mcatype;
487 + }
488 +
489 +-static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
490 +- unsigned int block, u32 address)
491 ++static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb,
492 ++ unsigned int bank, unsigned int block,
493 ++ u32 address)
494 + {
495 + struct threshold_block *b = NULL;
496 + u32 low, high;
497 +@@ -924,16 +928,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
498 +
499 + INIT_LIST_HEAD(&b->miscj);
500 +
501 +- if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
502 +- list_add(&b->miscj,
503 +- &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
504 +- } else {
505 +- per_cpu(threshold_banks, cpu)[bank]->blocks = b;
506 +- }
507 ++ if (tb->blocks)
508 ++ list_add(&b->miscj, &tb->blocks->miscj);
509 ++ else
510 ++ tb->blocks = b;
511 +
512 +- err = kobject_init_and_add(&b->kobj, &threshold_ktype,
513 +- per_cpu(threshold_banks, cpu)[bank]->kobj,
514 +- get_name(bank, b));
515 ++ err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b));
516 + if (err)
517 + goto out_free;
518 + recurse:
519 +@@ -941,7 +941,7 @@ recurse:
520 + if (!address)
521 + return 0;
522 +
523 +- err = allocate_threshold_blocks(cpu, bank, block, address);
524 ++ err = allocate_threshold_blocks(cpu, tb, bank, block, address);
525 + if (err)
526 + goto out_free;
527 +
528 +@@ -1026,8 +1026,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
529 + goto out_free;
530 + }
531 +
532 +- per_cpu(threshold_banks, cpu)[bank] = b;
533 +-
534 + if (is_shared_bank(bank)) {
535 + atomic_set(&b->cpus, 1);
536 +
537 +@@ -1038,9 +1036,13 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
538 + }
539 + }
540 +
541 +- err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
542 +- if (!err)
543 +- goto out;
544 ++ err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
545 ++ if (err)
546 ++ goto out_free;
547 ++
548 ++ per_cpu(threshold_banks, cpu)[bank] = b;
549 ++
550 ++ return 0;
551 +
552 + out_free:
553 + kfree(b);
554 +@@ -1074,8 +1076,12 @@ static int threshold_create_device(unsigned int cpu)
555 + return err;
556 + }
557 +
558 +-static void deallocate_threshold_block(unsigned int cpu,
559 +- unsigned int bank)
560 ++static void threshold_block_release(struct kobject *kobj)
561 ++{
562 ++ kfree(to_block(kobj));
563 ++}
564 ++
565 ++static void deallocate_threshold_block(unsigned int cpu, unsigned int bank)
566 + {
567 + struct threshold_block *pos = NULL;
568 + struct threshold_block *tmp = NULL;
569 +@@ -1085,13 +1091,11 @@ static void deallocate_threshold_block(unsigned int cpu,
570 + return;
571 +
572 + list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
573 +- kobject_put(&pos->kobj);
574 + list_del(&pos->miscj);
575 +- kfree(pos);
576 ++ kobject_put(&pos->kobj);
577 + }
578 +
579 +- kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
580 +- per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
581 ++ kobject_put(&head->blocks->kobj);
582 + }
583 +
584 + static void __threshold_remove_blocks(struct threshold_bank *b)
585 +diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
586 +index 85195d447a92..f3215346e47f 100644
587 +--- a/arch/x86/kernel/sysfb_simplefb.c
588 ++++ b/arch/x86/kernel/sysfb_simplefb.c
589 +@@ -94,11 +94,11 @@ __init int create_simplefb(const struct screen_info *si,
590 + if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
591 + size <<= 16;
592 + length = mode->height * mode->stride;
593 +- length = PAGE_ALIGN(length);
594 + if (length > size) {
595 + printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
596 + return -EINVAL;
597 + }
598 ++ length = PAGE_ALIGN(length);
599 +
600 + /* setup IORESOURCE_MEM as framebuffer memory */
601 + memset(&res, 0, sizeof(res));
602 +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
603 +index 242ad06fbe1a..c57dab0884fe 100644
604 +--- a/arch/x86/kvm/cpuid.c
605 ++++ b/arch/x86/kvm/cpuid.c
606 +@@ -279,13 +279,18 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
607 + {
608 + switch (func) {
609 + case 0:
610 +- entry->eax = 1; /* only one leaf currently */
611 ++ entry->eax = 7;
612 + ++*nent;
613 + break;
614 + case 1:
615 + entry->ecx = F(MOVBE);
616 + ++*nent;
617 + break;
618 ++ case 7:
619 ++ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
620 ++ if (index == 0)
621 ++ entry->ecx = F(RDPID);
622 ++ ++*nent;
623 + default:
624 + break;
625 + }
626 +diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
627 +index c456a9dbade8..e9c7090858d6 100644
628 +--- a/arch/x86/kvm/emulate.c
629 ++++ b/arch/x86/kvm/emulate.c
630 +@@ -3531,6 +3531,16 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
631 + return X86EMUL_CONTINUE;
632 + }
633 +
634 ++static int em_rdpid(struct x86_emulate_ctxt *ctxt)
635 ++{
636 ++ u64 tsc_aux = 0;
637 ++
638 ++ if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
639 ++ return emulate_gp(ctxt, 0);
640 ++ ctxt->dst.val = tsc_aux;
641 ++ return X86EMUL_CONTINUE;
642 ++}
643 ++
644 + static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
645 + {
646 + u64 tsc = 0;
647 +@@ -4391,10 +4401,20 @@ static const struct opcode group8[] = {
648 + F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
649 + };
650 +
651 ++/*
652 ++ * The "memory" destination is actually always a register, since we come
653 ++ * from the register case of group9.
654 ++ */
655 ++static const struct gprefix pfx_0f_c7_7 = {
656 ++ N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp),
657 ++};
658 ++
659 ++
660 + static const struct group_dual group9 = { {
661 + N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
662 + }, {
663 +- N, N, N, N, N, N, N, N,
664 ++ N, N, N, N, N, N, N,
665 ++ GP(0, &pfx_0f_c7_7),
666 + } };
667 +
668 + static const struct opcode group11[] = {
669 +diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
670 +index 6c0191615f23..cf8b3c17657a 100644
671 +--- a/arch/x86/kvm/irq_comm.c
672 ++++ b/arch/x86/kvm/irq_comm.c
673 +@@ -436,7 +436,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
674 +
675 + kvm_set_msi_irq(vcpu->kvm, entry, &irq);
676 +
677 +- if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
678 ++ if (irq.trig_mode && kvm_apic_match_dest(vcpu, NULL, 0,
679 + irq.dest_id, irq.dest_mode))
680 + __set_bit(irq.vector, ioapic_handled_vectors);
681 + }
682 +diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
683 +index caa17f8d4221..3988e26af3b5 100644
684 +--- a/arch/x86/kvm/lapic.c
685 ++++ b/arch/x86/kvm/lapic.c
686 +@@ -532,9 +532,11 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
687 + static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
688 + {
689 + u8 val;
690 +- if (pv_eoi_get_user(vcpu, &val) < 0)
691 ++ if (pv_eoi_get_user(vcpu, &val) < 0) {
692 + apic_debug("Can't read EOI MSR value: 0x%llx\n",
693 + (unsigned long long)vcpu->arch.pv_eoi.msr_val);
694 ++ return false;
695 ++ }
696 + return val & 0x1;
697 + }
698 +
699 +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
700 +index 67cdb08a736f..8bd336651de5 100644
701 +--- a/arch/x86/kvm/vmx.c
702 ++++ b/arch/x86/kvm/vmx.c
703 +@@ -4641,6 +4641,26 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
704 + (ss.selector & SEGMENT_RPL_MASK));
705 + }
706 +
707 ++static bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu,
708 ++ unsigned int port, int size);
709 ++static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
710 ++ struct vmcs12 *vmcs12)
711 ++{
712 ++ unsigned long exit_qualification;
713 ++ unsigned short port;
714 ++ int size;
715 ++
716 ++ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
717 ++ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
718 ++
719 ++ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
720 ++
721 ++ port = exit_qualification >> 16;
722 ++ size = (exit_qualification & 7) + 1;
723 ++
724 ++ return nested_vmx_check_io_bitmaps(vcpu, port, size);
725 ++}
726 ++
727 + /*
728 + * Check if guest state is valid. Returns true if valid, false if
729 + * not.
730 +@@ -8026,23 +8046,17 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
731 + static const int kvm_vmx_max_exit_handlers =
732 + ARRAY_SIZE(kvm_vmx_exit_handlers);
733 +
734 +-static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
735 +- struct vmcs12 *vmcs12)
736 ++/*
737 ++ * Return true if an IO instruction with the specified port and size should cause
738 ++ * a VM-exit into L1.
739 ++ */
740 ++bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
741 ++ int size)
742 + {
743 +- unsigned long exit_qualification;
744 ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
745 + gpa_t bitmap, last_bitmap;
746 +- unsigned int port;
747 +- int size;
748 + u8 b;
749 +
750 +- if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
751 +- return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
752 +-
753 +- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
754 +-
755 +- port = exit_qualification >> 16;
756 +- size = (exit_qualification & 7) + 1;
757 +-
758 + last_bitmap = (gpa_t)-1;
759 + b = -1;
760 +
761 +@@ -11335,11 +11349,71 @@ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
762 + to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
763 + }
764 +
765 ++static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
766 ++ struct x86_instruction_info *info)
767 ++{
768 ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
769 ++ unsigned short port;
770 ++ bool intercept;
771 ++ int size;
772 ++
773 ++ if (info->intercept == x86_intercept_in ||
774 ++ info->intercept == x86_intercept_ins) {
775 ++ port = info->src_val;
776 ++ size = info->dst_bytes;
777 ++ } else {
778 ++ port = info->dst_val;
779 ++ size = info->src_bytes;
780 ++ }
781 ++
782 ++ /*
783 ++ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
784 ++ * VM-exits depend on the 'unconditional IO exiting' VM-execution
785 ++ * control.
786 ++ *
787 ++ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
788 ++ */
789 ++ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
790 ++ intercept = nested_cpu_has(vmcs12,
791 ++ CPU_BASED_UNCOND_IO_EXITING);
792 ++ else
793 ++ intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
794 ++
795 ++ return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
796 ++}
797 ++
798 + static int vmx_check_intercept(struct kvm_vcpu *vcpu,
799 + struct x86_instruction_info *info,
800 + enum x86_intercept_stage stage)
801 + {
802 +- return X86EMUL_CONTINUE;
803 ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
804 ++ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
805 ++
806 ++ switch (info->intercept) {
807 ++ /*
808 ++ * RDPID causes #UD if disabled through secondary execution controls.
809 ++ * Because it is marked as EmulateOnUD, we need to intercept it here.
810 ++ */
811 ++ case x86_intercept_rdtscp:
812 ++ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
813 ++ ctxt->exception.vector = UD_VECTOR;
814 ++ ctxt->exception.error_code_valid = false;
815 ++ return X86EMUL_PROPAGATE_FAULT;
816 ++ }
817 ++ break;
818 ++
819 ++ case x86_intercept_in:
820 ++ case x86_intercept_ins:
821 ++ case x86_intercept_out:
822 ++ case x86_intercept_outs:
823 ++ return vmx_check_intercept_io(vcpu, info);
824 ++
825 ++ /* TODO: check more intercepts... */
826 ++ default:
827 ++ break;
828 ++ }
829 ++
830 ++ return X86EMUL_UNHANDLEABLE;
831 + }
832 +
833 + #ifdef CONFIG_X86_64
834 +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
835 +deleted file mode 100644
836 +index 3791ce8d269e..000000000000
837 +--- a/arch/x86/kvm/vmx/vmx.c
838 ++++ /dev/null
839 +@@ -1,8033 +0,0 @@
840 +-// SPDX-License-Identifier: GPL-2.0-only
841 +-/*
842 +- * Kernel-based Virtual Machine driver for Linux
843 +- *
844 +- * This module enables machines with Intel VT-x extensions to run virtual
845 +- * machines without emulation or binary translation.
846 +- *
847 +- * Copyright (C) 2006 Qumranet, Inc.
848 +- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
849 +- *
850 +- * Authors:
851 +- * Avi Kivity <avi@××××××××.com>
852 +- * Yaniv Kamay <yaniv@××××××××.com>
853 +- */
854 +-
855 +-#include <linux/frame.h>
856 +-#include <linux/highmem.h>
857 +-#include <linux/hrtimer.h>
858 +-#include <linux/kernel.h>
859 +-#include <linux/kvm_host.h>
860 +-#include <linux/module.h>
861 +-#include <linux/moduleparam.h>
862 +-#include <linux/mod_devicetable.h>
863 +-#include <linux/mm.h>
864 +-#include <linux/sched.h>
865 +-#include <linux/sched/smt.h>
866 +-#include <linux/slab.h>
867 +-#include <linux/tboot.h>
868 +-#include <linux/trace_events.h>
869 +-
870 +-#include <asm/apic.h>
871 +-#include <asm/asm.h>
872 +-#include <asm/cpu.h>
873 +-#include <asm/debugreg.h>
874 +-#include <asm/desc.h>
875 +-#include <asm/fpu/internal.h>
876 +-#include <asm/io.h>
877 +-#include <asm/irq_remapping.h>
878 +-#include <asm/kexec.h>
879 +-#include <asm/perf_event.h>
880 +-#include <asm/mce.h>
881 +-#include <asm/mmu_context.h>
882 +-#include <asm/mshyperv.h>
883 +-#include <asm/spec-ctrl.h>
884 +-#include <asm/virtext.h>
885 +-#include <asm/vmx.h>
886 +-
887 +-#include "capabilities.h"
888 +-#include "cpuid.h"
889 +-#include "evmcs.h"
890 +-#include "irq.h"
891 +-#include "kvm_cache_regs.h"
892 +-#include "lapic.h"
893 +-#include "mmu.h"
894 +-#include "nested.h"
895 +-#include "ops.h"
896 +-#include "pmu.h"
897 +-#include "trace.h"
898 +-#include "vmcs.h"
899 +-#include "vmcs12.h"
900 +-#include "vmx.h"
901 +-#include "x86.h"
902 +-
903 +-MODULE_AUTHOR("Qumranet");
904 +-MODULE_LICENSE("GPL");
905 +-
906 +-static const struct x86_cpu_id vmx_cpu_id[] = {
907 +- X86_FEATURE_MATCH(X86_FEATURE_VMX),
908 +- {}
909 +-};
910 +-MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
911 +-
912 +-bool __read_mostly enable_vpid = 1;
913 +-module_param_named(vpid, enable_vpid, bool, 0444);
914 +-
915 +-static bool __read_mostly enable_vnmi = 1;
916 +-module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
917 +-
918 +-bool __read_mostly flexpriority_enabled = 1;
919 +-module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
920 +-
921 +-bool __read_mostly enable_ept = 1;
922 +-module_param_named(ept, enable_ept, bool, S_IRUGO);
923 +-
924 +-bool __read_mostly enable_unrestricted_guest = 1;
925 +-module_param_named(unrestricted_guest,
926 +- enable_unrestricted_guest, bool, S_IRUGO);
927 +-
928 +-bool __read_mostly enable_ept_ad_bits = 1;
929 +-module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
930 +-
931 +-static bool __read_mostly emulate_invalid_guest_state = true;
932 +-module_param(emulate_invalid_guest_state, bool, S_IRUGO);
933 +-
934 +-static bool __read_mostly fasteoi = 1;
935 +-module_param(fasteoi, bool, S_IRUGO);
936 +-
937 +-static bool __read_mostly enable_apicv = 1;
938 +-module_param(enable_apicv, bool, S_IRUGO);
939 +-
940 +-/*
941 +- * If nested=1, nested virtualization is supported, i.e., guests may use
942 +- * VMX and be a hypervisor for its own guests. If nested=0, guests may not
943 +- * use VMX instructions.
944 +- */
945 +-static bool __read_mostly nested = 1;
946 +-module_param(nested, bool, S_IRUGO);
947 +-
948 +-bool __read_mostly enable_pml = 1;
949 +-module_param_named(pml, enable_pml, bool, S_IRUGO);
950 +-
951 +-static bool __read_mostly dump_invalid_vmcs = 0;
952 +-module_param(dump_invalid_vmcs, bool, 0644);
953 +-
954 +-#define MSR_BITMAP_MODE_X2APIC 1
955 +-#define MSR_BITMAP_MODE_X2APIC_APICV 2
956 +-
957 +-#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
958 +-
959 +-/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
960 +-static int __read_mostly cpu_preemption_timer_multi;
961 +-static bool __read_mostly enable_preemption_timer = 1;
962 +-#ifdef CONFIG_X86_64
963 +-module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
964 +-#endif
965 +-
966 +-#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
967 +-#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
968 +-#define KVM_VM_CR0_ALWAYS_ON \
969 +- (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
970 +- X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
971 +-#define KVM_CR4_GUEST_OWNED_BITS \
972 +- (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
973 +- | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
974 +-
975 +-#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
976 +-#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
977 +-#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
978 +-
979 +-#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
980 +-
981 +-#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
982 +- RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
983 +- RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
984 +- RTIT_STATUS_BYTECNT))
985 +-
986 +-#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
987 +- (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
988 +-
989 +-/*
990 +- * These 2 parameters are used to config the controls for Pause-Loop Exiting:
991 +- * ple_gap: upper bound on the amount of time between two successive
992 +- * executions of PAUSE in a loop. Also indicate if ple enabled.
993 +- * According to test, this time is usually smaller than 128 cycles.
994 +- * ple_window: upper bound on the amount of time a guest is allowed to execute
995 +- * in a PAUSE loop. Tests indicate that most spinlocks are held for
996 +- * less than 2^12 cycles
997 +- * Time is measured based on a counter that runs at the same rate as the TSC,
998 +- * refer SDM volume 3b section 21.6.13 & 22.1.3.
999 +- */
1000 +-static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
1001 +-module_param(ple_gap, uint, 0444);
1002 +-
1003 +-static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
1004 +-module_param(ple_window, uint, 0444);
1005 +-
1006 +-/* Default doubles per-vcpu window every exit. */
1007 +-static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
1008 +-module_param(ple_window_grow, uint, 0444);
1009 +-
1010 +-/* Default resets per-vcpu window every exit to ple_window. */
1011 +-static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
1012 +-module_param(ple_window_shrink, uint, 0444);
1013 +-
1014 +-/* Default is to compute the maximum so we can never overflow. */
1015 +-static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
1016 +-module_param(ple_window_max, uint, 0444);
1017 +-
1018 +-/* Default is SYSTEM mode, 1 for host-guest mode */
1019 +-int __read_mostly pt_mode = PT_MODE_SYSTEM;
1020 +-module_param(pt_mode, int, S_IRUGO);
1021 +-
1022 +-static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
1023 +-static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
1024 +-static DEFINE_MUTEX(vmx_l1d_flush_mutex);
1025 +-
1026 +-/* Storage for pre module init parameter parsing */
1027 +-static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
1028 +-
1029 +-static const struct {
1030 +- const char *option;
1031 +- bool for_parse;
1032 +-} vmentry_l1d_param[] = {
1033 +- [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
1034 +- [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
1035 +- [VMENTER_L1D_FLUSH_COND] = {"cond", true},
1036 +- [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
1037 +- [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
1038 +- [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
1039 +-};
1040 +-
1041 +-#define L1D_CACHE_ORDER 4
1042 +-static void *vmx_l1d_flush_pages;
1043 +-
1044 +-static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
1045 +-{
1046 +- struct page *page;
1047 +- unsigned int i;
1048 +-
1049 +- if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
1050 +- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
1051 +- return 0;
1052 +- }
1053 +-
1054 +- if (!enable_ept) {
1055 +- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
1056 +- return 0;
1057 +- }
1058 +-
1059 +- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
1060 +- u64 msr;
1061 +-
1062 +- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
1063 +- if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
1064 +- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
1065 +- return 0;
1066 +- }
1067 +- }
1068 +-
1069 +- /* If set to auto use the default l1tf mitigation method */
1070 +- if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
1071 +- switch (l1tf_mitigation) {
1072 +- case L1TF_MITIGATION_OFF:
1073 +- l1tf = VMENTER_L1D_FLUSH_NEVER;
1074 +- break;
1075 +- case L1TF_MITIGATION_FLUSH_NOWARN:
1076 +- case L1TF_MITIGATION_FLUSH:
1077 +- case L1TF_MITIGATION_FLUSH_NOSMT:
1078 +- l1tf = VMENTER_L1D_FLUSH_COND;
1079 +- break;
1080 +- case L1TF_MITIGATION_FULL:
1081 +- case L1TF_MITIGATION_FULL_FORCE:
1082 +- l1tf = VMENTER_L1D_FLUSH_ALWAYS;
1083 +- break;
1084 +- }
1085 +- } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
1086 +- l1tf = VMENTER_L1D_FLUSH_ALWAYS;
1087 +- }
1088 +-
1089 +- if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
1090 +- !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
1091 +- /*
1092 +- * This allocation for vmx_l1d_flush_pages is not tied to a VM
1093 +- * lifetime and so should not be charged to a memcg.
1094 +- */
1095 +- page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
1096 +- if (!page)
1097 +- return -ENOMEM;
1098 +- vmx_l1d_flush_pages = page_address(page);
1099 +-
1100 +- /*
1101 +- * Initialize each page with a different pattern in
1102 +- * order to protect against KSM in the nested
1103 +- * virtualization case.
1104 +- */
1105 +- for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
1106 +- memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
1107 +- PAGE_SIZE);
1108 +- }
1109 +- }
1110 +-
1111 +- l1tf_vmx_mitigation = l1tf;
1112 +-
1113 +- if (l1tf != VMENTER_L1D_FLUSH_NEVER)
1114 +- static_branch_enable(&vmx_l1d_should_flush);
1115 +- else
1116 +- static_branch_disable(&vmx_l1d_should_flush);
1117 +-
1118 +- if (l1tf == VMENTER_L1D_FLUSH_COND)
1119 +- static_branch_enable(&vmx_l1d_flush_cond);
1120 +- else
1121 +- static_branch_disable(&vmx_l1d_flush_cond);
1122 +- return 0;
1123 +-}
1124 +-
1125 +-static int vmentry_l1d_flush_parse(const char *s)
1126 +-{
1127 +- unsigned int i;
1128 +-
1129 +- if (s) {
1130 +- for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
1131 +- if (vmentry_l1d_param[i].for_parse &&
1132 +- sysfs_streq(s, vmentry_l1d_param[i].option))
1133 +- return i;
1134 +- }
1135 +- }
1136 +- return -EINVAL;
1137 +-}
1138 +-
1139 +-static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
1140 +-{
1141 +- int l1tf, ret;
1142 +-
1143 +- l1tf = vmentry_l1d_flush_parse(s);
1144 +- if (l1tf < 0)
1145 +- return l1tf;
1146 +-
1147 +- if (!boot_cpu_has(X86_BUG_L1TF))
1148 +- return 0;
1149 +-
1150 +- /*
1151 +- * Has vmx_init() run already? If not then this is the pre init
1152 +- * parameter parsing. In that case just store the value and let
1153 +- * vmx_init() do the proper setup after enable_ept has been
1154 +- * established.
1155 +- */
1156 +- if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
1157 +- vmentry_l1d_flush_param = l1tf;
1158 +- return 0;
1159 +- }
1160 +-
1161 +- mutex_lock(&vmx_l1d_flush_mutex);
1162 +- ret = vmx_setup_l1d_flush(l1tf);
1163 +- mutex_unlock(&vmx_l1d_flush_mutex);
1164 +- return ret;
1165 +-}
1166 +-
1167 +-static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
1168 +-{
1169 +- if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
1170 +- return sprintf(s, "???\n");
1171 +-
1172 +- return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
1173 +-}
1174 +-
1175 +-static const struct kernel_param_ops vmentry_l1d_flush_ops = {
1176 +- .set = vmentry_l1d_flush_set,
1177 +- .get = vmentry_l1d_flush_get,
1178 +-};
1179 +-module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
1180 +-
1181 +-static bool guest_state_valid(struct kvm_vcpu *vcpu);
1182 +-static u32 vmx_segment_access_rights(struct kvm_segment *var);
1183 +-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
1184 +- u32 msr, int type);
1185 +-
1186 +-void vmx_vmexit(void);
1187 +-
1188 +-#define vmx_insn_failed(fmt...) \
1189 +-do { \
1190 +- WARN_ONCE(1, fmt); \
1191 +- pr_warn_ratelimited(fmt); \
1192 +-} while (0)
1193 +-
1194 +-asmlinkage void vmread_error(unsigned long field, bool fault)
1195 +-{
1196 +- if (fault)
1197 +- kvm_spurious_fault();
1198 +- else
1199 +- vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
1200 +-}
1201 +-
1202 +-noinline void vmwrite_error(unsigned long field, unsigned long value)
1203 +-{
1204 +- vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
1205 +- field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1206 +-}
1207 +-
1208 +-noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
1209 +-{
1210 +- vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
1211 +-}
1212 +-
1213 +-noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
1214 +-{
1215 +- vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
1216 +-}
1217 +-
1218 +-noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
1219 +-{
1220 +- vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
1221 +- ext, vpid, gva);
1222 +-}
1223 +-
1224 +-noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
1225 +-{
1226 +- vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
1227 +- ext, eptp, gpa);
1228 +-}
1229 +-
1230 +-static DEFINE_PER_CPU(struct vmcs *, vmxarea);
1231 +-DEFINE_PER_CPU(struct vmcs *, current_vmcs);
1232 +-/*
1233 +- * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
1234 +- * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
1235 +- */
1236 +-static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
1237 +-
1238 +-/*
1239 +- * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
1240 +- * can find which vCPU should be waken up.
1241 +- */
1242 +-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
1243 +-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
1244 +-
1245 +-static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
1246 +-static DEFINE_SPINLOCK(vmx_vpid_lock);
1247 +-
1248 +-struct vmcs_config vmcs_config;
1249 +-struct vmx_capability vmx_capability;
1250 +-
1251 +-#define VMX_SEGMENT_FIELD(seg) \
1252 +- [VCPU_SREG_##seg] = { \
1253 +- .selector = GUEST_##seg##_SELECTOR, \
1254 +- .base = GUEST_##seg##_BASE, \
1255 +- .limit = GUEST_##seg##_LIMIT, \
1256 +- .ar_bytes = GUEST_##seg##_AR_BYTES, \
1257 +- }
1258 +-
1259 +-static const struct kvm_vmx_segment_field {
1260 +- unsigned selector;
1261 +- unsigned base;
1262 +- unsigned limit;
1263 +- unsigned ar_bytes;
1264 +-} kvm_vmx_segment_fields[] = {
1265 +- VMX_SEGMENT_FIELD(CS),
1266 +- VMX_SEGMENT_FIELD(DS),
1267 +- VMX_SEGMENT_FIELD(ES),
1268 +- VMX_SEGMENT_FIELD(FS),
1269 +- VMX_SEGMENT_FIELD(GS),
1270 +- VMX_SEGMENT_FIELD(SS),
1271 +- VMX_SEGMENT_FIELD(TR),
1272 +- VMX_SEGMENT_FIELD(LDTR),
1273 +-};
1274 +-
1275 +-u64 host_efer;
1276 +-static unsigned long host_idt_base;
1277 +-
1278 +-/*
1279 +- * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
1280 +- * will emulate SYSCALL in legacy mode if the vendor string in guest
1281 +- * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
1282 +- * support this emulation, IA32_STAR must always be included in
1283 +- * vmx_msr_index[], even in i386 builds.
1284 +- */
1285 +-const u32 vmx_msr_index[] = {
1286 +-#ifdef CONFIG_X86_64
1287 +- MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
1288 +-#endif
1289 +- MSR_EFER, MSR_TSC_AUX, MSR_STAR,
1290 +- MSR_IA32_TSX_CTRL,
1291 +-};
1292 +-
1293 +-#if IS_ENABLED(CONFIG_HYPERV)
1294 +-static bool __read_mostly enlightened_vmcs = true;
1295 +-module_param(enlightened_vmcs, bool, 0444);
1296 +-
1297 +-/* check_ept_pointer() should be under protection of ept_pointer_lock. */
1298 +-static void check_ept_pointer_match(struct kvm *kvm)
1299 +-{
1300 +- struct kvm_vcpu *vcpu;
1301 +- u64 tmp_eptp = INVALID_PAGE;
1302 +- int i;
1303 +-
1304 +- kvm_for_each_vcpu(i, vcpu, kvm) {
1305 +- if (!VALID_PAGE(tmp_eptp)) {
1306 +- tmp_eptp = to_vmx(vcpu)->ept_pointer;
1307 +- } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
1308 +- to_kvm_vmx(kvm)->ept_pointers_match
1309 +- = EPT_POINTERS_MISMATCH;
1310 +- return;
1311 +- }
1312 +- }
1313 +-
1314 +- to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
1315 +-}
1316 +-
1317 +-static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
1318 +- void *data)
1319 +-{
1320 +- struct kvm_tlb_range *range = data;
1321 +-
1322 +- return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
1323 +- range->pages);
1324 +-}
1325 +-
1326 +-static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
1327 +- struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
1328 +-{
1329 +- u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
1330 +-
1331 +- /*
1332 +- * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
1333 +- * of the base of EPT PML4 table, strip off EPT configuration
1334 +- * information.
1335 +- */
1336 +- if (range)
1337 +- return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
1338 +- kvm_fill_hv_flush_list_func, (void *)range);
1339 +- else
1340 +- return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
1341 +-}
1342 +-
1343 +-static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
1344 +- struct kvm_tlb_range *range)
1345 +-{
1346 +- struct kvm_vcpu *vcpu;
1347 +- int ret = 0, i;
1348 +-
1349 +- spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1350 +-
1351 +- if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
1352 +- check_ept_pointer_match(kvm);
1353 +-
1354 +- if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
1355 +- kvm_for_each_vcpu(i, vcpu, kvm) {
1356 +- /* If ept_pointer is invalid pointer, bypass flush request. */
1357 +- if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
1358 +- ret |= __hv_remote_flush_tlb_with_range(
1359 +- kvm, vcpu, range);
1360 +- }
1361 +- } else {
1362 +- ret = __hv_remote_flush_tlb_with_range(kvm,
1363 +- kvm_get_vcpu(kvm, 0), range);
1364 +- }
1365 +-
1366 +- spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1367 +- return ret;
1368 +-}
1369 +-static int hv_remote_flush_tlb(struct kvm *kvm)
1370 +-{
1371 +- return hv_remote_flush_tlb_with_range(kvm, NULL);
1372 +-}
1373 +-
1374 +-static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
1375 +-{
1376 +- struct hv_enlightened_vmcs *evmcs;
1377 +- struct hv_partition_assist_pg **p_hv_pa_pg =
1378 +- &vcpu->kvm->arch.hyperv.hv_pa_pg;
1379 +- /*
1380 +- * Synthetic VM-Exit is not enabled in current code and so All
1381 +- * evmcs in singe VM shares same assist page.
1382 +- */
1383 +- if (!*p_hv_pa_pg)
1384 +- *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
1385 +-
1386 +- if (!*p_hv_pa_pg)
1387 +- return -ENOMEM;
1388 +-
1389 +- evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
1390 +-
1391 +- evmcs->partition_assist_page =
1392 +- __pa(*p_hv_pa_pg);
1393 +- evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
1394 +- evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
1395 +-
1396 +- return 0;
1397 +-}
1398 +-
1399 +-#endif /* IS_ENABLED(CONFIG_HYPERV) */
1400 +-
1401 +-/*
1402 +- * Comment's format: document - errata name - stepping - processor name.
1403 +- * Refer from
1404 +- * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
1405 +- */
1406 +-static u32 vmx_preemption_cpu_tfms[] = {
1407 +-/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
1408 +-0x000206E6,
1409 +-/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
1410 +-/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
1411 +-/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
1412 +-0x00020652,
1413 +-/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
1414 +-0x00020655,
1415 +-/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
1416 +-/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
1417 +-/*
1418 +- * 320767.pdf - AAP86 - B1 -
1419 +- * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
1420 +- */
1421 +-0x000106E5,
1422 +-/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
1423 +-0x000106A0,
1424 +-/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
1425 +-0x000106A1,
1426 +-/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
1427 +-0x000106A4,
1428 +- /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
1429 +- /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
1430 +- /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
1431 +-0x000106A5,
1432 +- /* Xeon E3-1220 V2 */
1433 +-0x000306A8,
1434 +-};
1435 +-
1436 +-static inline bool cpu_has_broken_vmx_preemption_timer(void)
1437 +-{
1438 +- u32 eax = cpuid_eax(0x00000001), i;
1439 +-
1440 +- /* Clear the reserved bits */
1441 +- eax &= ~(0x3U << 14 | 0xfU << 28);
1442 +- for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
1443 +- if (eax == vmx_preemption_cpu_tfms[i])
1444 +- return true;
1445 +-
1446 +- return false;
1447 +-}
1448 +-
1449 +-static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
1450 +-{
1451 +- return flexpriority_enabled && lapic_in_kernel(vcpu);
1452 +-}
1453 +-
1454 +-static inline bool report_flexpriority(void)
1455 +-{
1456 +- return flexpriority_enabled;
1457 +-}
1458 +-
1459 +-static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
1460 +-{
1461 +- int i;
1462 +-
1463 +- for (i = 0; i < vmx->nmsrs; ++i)
1464 +- if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
1465 +- return i;
1466 +- return -1;
1467 +-}
1468 +-
1469 +-struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
1470 +-{
1471 +- int i;
1472 +-
1473 +- i = __find_msr_index(vmx, msr);
1474 +- if (i >= 0)
1475 +- return &vmx->guest_msrs[i];
1476 +- return NULL;
1477 +-}
1478 +-
1479 +-static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
1480 +-{
1481 +- int ret = 0;
1482 +-
1483 +- u64 old_msr_data = msr->data;
1484 +- msr->data = data;
1485 +- if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
1486 +- preempt_disable();
1487 +- ret = kvm_set_shared_msr(msr->index, msr->data,
1488 +- msr->mask);
1489 +- preempt_enable();
1490 +- if (ret)
1491 +- msr->data = old_msr_data;
1492 +- }
1493 +- return ret;
1494 +-}
1495 +-
1496 +-void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
1497 +-{
1498 +- vmcs_clear(loaded_vmcs->vmcs);
1499 +- if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
1500 +- vmcs_clear(loaded_vmcs->shadow_vmcs);
1501 +- loaded_vmcs->cpu = -1;
1502 +- loaded_vmcs->launched = 0;
1503 +-}
1504 +-
1505 +-#ifdef CONFIG_KEXEC_CORE
1506 +-/*
1507 +- * This bitmap is used to indicate whether the vmclear
1508 +- * operation is enabled on all cpus. All disabled by
1509 +- * default.
1510 +- */
1511 +-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
1512 +-
1513 +-static inline void crash_enable_local_vmclear(int cpu)
1514 +-{
1515 +- cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1516 +-}
1517 +-
1518 +-static inline void crash_disable_local_vmclear(int cpu)
1519 +-{
1520 +- cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1521 +-}
1522 +-
1523 +-static inline int crash_local_vmclear_enabled(int cpu)
1524 +-{
1525 +- return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1526 +-}
1527 +-
1528 +-static void crash_vmclear_local_loaded_vmcss(void)
1529 +-{
1530 +- int cpu = raw_smp_processor_id();
1531 +- struct loaded_vmcs *v;
1532 +-
1533 +- if (!crash_local_vmclear_enabled(cpu))
1534 +- return;
1535 +-
1536 +- list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1537 +- loaded_vmcss_on_cpu_link)
1538 +- vmcs_clear(v->vmcs);
1539 +-}
1540 +-#else
1541 +-static inline void crash_enable_local_vmclear(int cpu) { }
1542 +-static inline void crash_disable_local_vmclear(int cpu) { }
1543 +-#endif /* CONFIG_KEXEC_CORE */
1544 +-
1545 +-static void __loaded_vmcs_clear(void *arg)
1546 +-{
1547 +- struct loaded_vmcs *loaded_vmcs = arg;
1548 +- int cpu = raw_smp_processor_id();
1549 +-
1550 +- if (loaded_vmcs->cpu != cpu)
1551 +- return; /* vcpu migration can race with cpu offline */
1552 +- if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1553 +- per_cpu(current_vmcs, cpu) = NULL;
1554 +- crash_disable_local_vmclear(cpu);
1555 +- list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1556 +-
1557 +- /*
1558 +- * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
1559 +- * is before setting loaded_vmcs->vcpu to -1 which is done in
1560 +- * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
1561 +- * then adds the vmcs into percpu list before it is deleted.
1562 +- */
1563 +- smp_wmb();
1564 +-
1565 +- loaded_vmcs_init(loaded_vmcs);
1566 +- crash_enable_local_vmclear(cpu);
1567 +-}
1568 +-
1569 +-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1570 +-{
1571 +- int cpu = loaded_vmcs->cpu;
1572 +-
1573 +- if (cpu != -1)
1574 +- smp_call_function_single(cpu,
1575 +- __loaded_vmcs_clear, loaded_vmcs, 1);
1576 +-}
1577 +-
1578 +-static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
1579 +- unsigned field)
1580 +-{
1581 +- bool ret;
1582 +- u32 mask = 1 << (seg * SEG_FIELD_NR + field);
1583 +-
1584 +- if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
1585 +- kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
1586 +- vmx->segment_cache.bitmask = 0;
1587 +- }
1588 +- ret = vmx->segment_cache.bitmask & mask;
1589 +- vmx->segment_cache.bitmask |= mask;
1590 +- return ret;
1591 +-}
1592 +-
1593 +-static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
1594 +-{
1595 +- u16 *p = &vmx->segment_cache.seg[seg].selector;
1596 +-
1597 +- if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
1598 +- *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
1599 +- return *p;
1600 +-}
1601 +-
1602 +-static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
1603 +-{
1604 +- ulong *p = &vmx->segment_cache.seg[seg].base;
1605 +-
1606 +- if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
1607 +- *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
1608 +- return *p;
1609 +-}
1610 +-
1611 +-static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
1612 +-{
1613 +- u32 *p = &vmx->segment_cache.seg[seg].limit;
1614 +-
1615 +- if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
1616 +- *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
1617 +- return *p;
1618 +-}
1619 +-
1620 +-static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
1621 +-{
1622 +- u32 *p = &vmx->segment_cache.seg[seg].ar;
1623 +-
1624 +- if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
1625 +- *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
1626 +- return *p;
1627 +-}
1628 +-
1629 +-void update_exception_bitmap(struct kvm_vcpu *vcpu)
1630 +-{
1631 +- u32 eb;
1632 +-
1633 +- eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1634 +- (1u << DB_VECTOR) | (1u << AC_VECTOR);
1635 +- /*
1636 +- * Guest access to VMware backdoor ports could legitimately
1637 +- * trigger #GP because of TSS I/O permission bitmap.
1638 +- * We intercept those #GP and allow access to them anyway
1639 +- * as VMware does.
1640 +- */
1641 +- if (enable_vmware_backdoor)
1642 +- eb |= (1u << GP_VECTOR);
1643 +- if ((vcpu->guest_debug &
1644 +- (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1645 +- (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
1646 +- eb |= 1u << BP_VECTOR;
1647 +- if (to_vmx(vcpu)->rmode.vm86_active)
1648 +- eb = ~0;
1649 +- if (enable_ept)
1650 +- eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1651 +-
1652 +- /* When we are running a nested L2 guest and L1 specified for it a
1653 +- * certain exception bitmap, we must trap the same exceptions and pass
1654 +- * them to L1. When running L2, we will only handle the exceptions
1655 +- * specified above if L1 did not want them.
1656 +- */
1657 +- if (is_guest_mode(vcpu))
1658 +- eb |= get_vmcs12(vcpu)->exception_bitmap;
1659 +-
1660 +- vmcs_write32(EXCEPTION_BITMAP, eb);
1661 +-}
1662 +-
1663 +-/*
1664 +- * Check if MSR is intercepted for currently loaded MSR bitmap.
1665 +- */
1666 +-static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
1667 +-{
1668 +- unsigned long *msr_bitmap;
1669 +- int f = sizeof(unsigned long);
1670 +-
1671 +- if (!cpu_has_vmx_msr_bitmap())
1672 +- return true;
1673 +-
1674 +- msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
1675 +-
1676 +- if (msr <= 0x1fff) {
1677 +- return !!test_bit(msr, msr_bitmap + 0x800 / f);
1678 +- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1679 +- msr &= 0x1fff;
1680 +- return !!test_bit(msr, msr_bitmap + 0xc00 / f);
1681 +- }
1682 +-
1683 +- return true;
1684 +-}
1685 +-
1686 +-static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1687 +- unsigned long entry, unsigned long exit)
1688 +-{
1689 +- vm_entry_controls_clearbit(vmx, entry);
1690 +- vm_exit_controls_clearbit(vmx, exit);
1691 +-}
1692 +-
1693 +-int vmx_find_msr_index(struct vmx_msrs *m, u32 msr)
1694 +-{
1695 +- unsigned int i;
1696 +-
1697 +- for (i = 0; i < m->nr; ++i) {
1698 +- if (m->val[i].index == msr)
1699 +- return i;
1700 +- }
1701 +- return -ENOENT;
1702 +-}
1703 +-
1704 +-static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1705 +-{
1706 +- int i;
1707 +- struct msr_autoload *m = &vmx->msr_autoload;
1708 +-
1709 +- switch (msr) {
1710 +- case MSR_EFER:
1711 +- if (cpu_has_load_ia32_efer()) {
1712 +- clear_atomic_switch_msr_special(vmx,
1713 +- VM_ENTRY_LOAD_IA32_EFER,
1714 +- VM_EXIT_LOAD_IA32_EFER);
1715 +- return;
1716 +- }
1717 +- break;
1718 +- case MSR_CORE_PERF_GLOBAL_CTRL:
1719 +- if (cpu_has_load_perf_global_ctrl()) {
1720 +- clear_atomic_switch_msr_special(vmx,
1721 +- VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1722 +- VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1723 +- return;
1724 +- }
1725 +- break;
1726 +- }
1727 +- i = vmx_find_msr_index(&m->guest, msr);
1728 +- if (i < 0)
1729 +- goto skip_guest;
1730 +- --m->guest.nr;
1731 +- m->guest.val[i] = m->guest.val[m->guest.nr];
1732 +- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1733 +-
1734 +-skip_guest:
1735 +- i = vmx_find_msr_index(&m->host, msr);
1736 +- if (i < 0)
1737 +- return;
1738 +-
1739 +- --m->host.nr;
1740 +- m->host.val[i] = m->host.val[m->host.nr];
1741 +- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1742 +-}
1743 +-
1744 +-static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1745 +- unsigned long entry, unsigned long exit,
1746 +- unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1747 +- u64 guest_val, u64 host_val)
1748 +-{
1749 +- vmcs_write64(guest_val_vmcs, guest_val);
1750 +- if (host_val_vmcs != HOST_IA32_EFER)
1751 +- vmcs_write64(host_val_vmcs, host_val);
1752 +- vm_entry_controls_setbit(vmx, entry);
1753 +- vm_exit_controls_setbit(vmx, exit);
1754 +-}
1755 +-
1756 +-static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1757 +- u64 guest_val, u64 host_val, bool entry_only)
1758 +-{
1759 +- int i, j = 0;
1760 +- struct msr_autoload *m = &vmx->msr_autoload;
1761 +-
1762 +- switch (msr) {
1763 +- case MSR_EFER:
1764 +- if (cpu_has_load_ia32_efer()) {
1765 +- add_atomic_switch_msr_special(vmx,
1766 +- VM_ENTRY_LOAD_IA32_EFER,
1767 +- VM_EXIT_LOAD_IA32_EFER,
1768 +- GUEST_IA32_EFER,
1769 +- HOST_IA32_EFER,
1770 +- guest_val, host_val);
1771 +- return;
1772 +- }
1773 +- break;
1774 +- case MSR_CORE_PERF_GLOBAL_CTRL:
1775 +- if (cpu_has_load_perf_global_ctrl()) {
1776 +- add_atomic_switch_msr_special(vmx,
1777 +- VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1778 +- VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1779 +- GUEST_IA32_PERF_GLOBAL_CTRL,
1780 +- HOST_IA32_PERF_GLOBAL_CTRL,
1781 +- guest_val, host_val);
1782 +- return;
1783 +- }
1784 +- break;
1785 +- case MSR_IA32_PEBS_ENABLE:
1786 +- /* PEBS needs a quiescent period after being disabled (to write
1787 +- * a record). Disabling PEBS through VMX MSR swapping doesn't
1788 +- * provide that period, so a CPU could write host's record into
1789 +- * guest's memory.
1790 +- */
1791 +- wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
1792 +- }
1793 +-
1794 +- i = vmx_find_msr_index(&m->guest, msr);
1795 +- if (!entry_only)
1796 +- j = vmx_find_msr_index(&m->host, msr);
1797 +-
1798 +- if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) ||
1799 +- (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) {
1800 +- printk_once(KERN_WARNING "Not enough msr switch entries. "
1801 +- "Can't add msr %x\n", msr);
1802 +- return;
1803 +- }
1804 +- if (i < 0) {
1805 +- i = m->guest.nr++;
1806 +- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1807 +- }
1808 +- m->guest.val[i].index = msr;
1809 +- m->guest.val[i].value = guest_val;
1810 +-
1811 +- if (entry_only)
1812 +- return;
1813 +-
1814 +- if (j < 0) {
1815 +- j = m->host.nr++;
1816 +- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1817 +- }
1818 +- m->host.val[j].index = msr;
1819 +- m->host.val[j].value = host_val;
1820 +-}
1821 +-
1822 +-static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
1823 +-{
1824 +- u64 guest_efer = vmx->vcpu.arch.efer;
1825 +- u64 ignore_bits = 0;
1826 +-
1827 +- /* Shadow paging assumes NX to be available. */
1828 +- if (!enable_ept)
1829 +- guest_efer |= EFER_NX;
1830 +-
1831 +- /*
1832 +- * LMA and LME handled by hardware; SCE meaningless outside long mode.
1833 +- */
1834 +- ignore_bits |= EFER_SCE;
1835 +-#ifdef CONFIG_X86_64
1836 +- ignore_bits |= EFER_LMA | EFER_LME;
1837 +- /* SCE is meaningful only in long mode on Intel */
1838 +- if (guest_efer & EFER_LMA)
1839 +- ignore_bits &= ~(u64)EFER_SCE;
1840 +-#endif
1841 +-
1842 +- /*
1843 +- * On EPT, we can't emulate NX, so we must switch EFER atomically.
1844 +- * On CPUs that support "load IA32_EFER", always switch EFER
1845 +- * atomically, since it's faster than switching it manually.
1846 +- */
1847 +- if (cpu_has_load_ia32_efer() ||
1848 +- (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
1849 +- if (!(guest_efer & EFER_LMA))
1850 +- guest_efer &= ~EFER_LME;
1851 +- if (guest_efer != host_efer)
1852 +- add_atomic_switch_msr(vmx, MSR_EFER,
1853 +- guest_efer, host_efer, false);
1854 +- else
1855 +- clear_atomic_switch_msr(vmx, MSR_EFER);
1856 +- return false;
1857 +- } else {
1858 +- clear_atomic_switch_msr(vmx, MSR_EFER);
1859 +-
1860 +- guest_efer &= ~ignore_bits;
1861 +- guest_efer |= host_efer & ignore_bits;
1862 +-
1863 +- vmx->guest_msrs[efer_offset].data = guest_efer;
1864 +- vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
1865 +-
1866 +- return true;
1867 +- }
1868 +-}
1869 +-
1870 +-#ifdef CONFIG_X86_32
1871 +-/*
1872 +- * On 32-bit kernels, VM exits still load the FS and GS bases from the
1873 +- * VMCS rather than the segment table. KVM uses this helper to figure
1874 +- * out the current bases to poke them into the VMCS before entry.
1875 +- */
1876 +-static unsigned long segment_base(u16 selector)
1877 +-{
1878 +- struct desc_struct *table;
1879 +- unsigned long v;
1880 +-
1881 +- if (!(selector & ~SEGMENT_RPL_MASK))
1882 +- return 0;
1883 +-
1884 +- table = get_current_gdt_ro();
1885 +-
1886 +- if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1887 +- u16 ldt_selector = kvm_read_ldt();
1888 +-
1889 +- if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1890 +- return 0;
1891 +-
1892 +- table = (struct desc_struct *)segment_base(ldt_selector);
1893 +- }
1894 +- v = get_desc_base(&table[selector >> 3]);
1895 +- return v;
1896 +-}
1897 +-#endif
1898 +-
1899 +-static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1900 +-{
1901 +- u32 i;
1902 +-
1903 +- wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1904 +- wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1905 +- wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1906 +- wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1907 +- for (i = 0; i < addr_range; i++) {
1908 +- wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1909 +- wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1910 +- }
1911 +-}
1912 +-
1913 +-static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1914 +-{
1915 +- u32 i;
1916 +-
1917 +- rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1918 +- rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1919 +- rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1920 +- rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1921 +- for (i = 0; i < addr_range; i++) {
1922 +- rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1923 +- rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1924 +- }
1925 +-}
1926 +-
1927 +-static void pt_guest_enter(struct vcpu_vmx *vmx)
1928 +-{
1929 +- if (pt_mode == PT_MODE_SYSTEM)
1930 +- return;
1931 +-
1932 +- /*
1933 +- * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1934 +- * Save host state before VM entry.
1935 +- */
1936 +- rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1937 +- if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1938 +- wrmsrl(MSR_IA32_RTIT_CTL, 0);
1939 +- pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1940 +- pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1941 +- }
1942 +-}
1943 +-
1944 +-static void pt_guest_exit(struct vcpu_vmx *vmx)
1945 +-{
1946 +- if (pt_mode == PT_MODE_SYSTEM)
1947 +- return;
1948 +-
1949 +- if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1950 +- pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1951 +- pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1952 +- }
1953 +-
1954 +- /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
1955 +- wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1956 +-}
1957 +-
1958 +-void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1959 +- unsigned long fs_base, unsigned long gs_base)
1960 +-{
1961 +- if (unlikely(fs_sel != host->fs_sel)) {
1962 +- if (!(fs_sel & 7))
1963 +- vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1964 +- else
1965 +- vmcs_write16(HOST_FS_SELECTOR, 0);
1966 +- host->fs_sel = fs_sel;
1967 +- }
1968 +- if (unlikely(gs_sel != host->gs_sel)) {
1969 +- if (!(gs_sel & 7))
1970 +- vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1971 +- else
1972 +- vmcs_write16(HOST_GS_SELECTOR, 0);
1973 +- host->gs_sel = gs_sel;
1974 +- }
1975 +- if (unlikely(fs_base != host->fs_base)) {
1976 +- vmcs_writel(HOST_FS_BASE, fs_base);
1977 +- host->fs_base = fs_base;
1978 +- }
1979 +- if (unlikely(gs_base != host->gs_base)) {
1980 +- vmcs_writel(HOST_GS_BASE, gs_base);
1981 +- host->gs_base = gs_base;
1982 +- }
1983 +-}
1984 +-
1985 +-void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1986 +-{
1987 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
1988 +- struct vmcs_host_state *host_state;
1989 +-#ifdef CONFIG_X86_64
1990 +- int cpu = raw_smp_processor_id();
1991 +-#endif
1992 +- unsigned long fs_base, gs_base;
1993 +- u16 fs_sel, gs_sel;
1994 +- int i;
1995 +-
1996 +- vmx->req_immediate_exit = false;
1997 +-
1998 +- /*
1999 +- * Note that guest MSRs to be saved/restored can also be changed
2000 +- * when guest state is loaded. This happens when guest transitions
2001 +- * to/from long-mode by setting MSR_EFER.LMA.
2002 +- */
2003 +- if (!vmx->guest_msrs_ready) {
2004 +- vmx->guest_msrs_ready = true;
2005 +- for (i = 0; i < vmx->save_nmsrs; ++i)
2006 +- kvm_set_shared_msr(vmx->guest_msrs[i].index,
2007 +- vmx->guest_msrs[i].data,
2008 +- vmx->guest_msrs[i].mask);
2009 +-
2010 +- }
2011 +- if (vmx->guest_state_loaded)
2012 +- return;
2013 +-
2014 +- host_state = &vmx->loaded_vmcs->host_state;
2015 +-
2016 +- /*
2017 +- * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
2018 +- * allow segment selectors with cpl > 0 or ti == 1.
2019 +- */
2020 +- host_state->ldt_sel = kvm_read_ldt();
2021 +-
2022 +-#ifdef CONFIG_X86_64
2023 +- savesegment(ds, host_state->ds_sel);
2024 +- savesegment(es, host_state->es_sel);
2025 +-
2026 +- gs_base = cpu_kernelmode_gs_base(cpu);
2027 +- if (likely(is_64bit_mm(current->mm))) {
2028 +- save_fsgs_for_kvm();
2029 +- fs_sel = current->thread.fsindex;
2030 +- gs_sel = current->thread.gsindex;
2031 +- fs_base = current->thread.fsbase;
2032 +- vmx->msr_host_kernel_gs_base = current->thread.gsbase;
2033 +- } else {
2034 +- savesegment(fs, fs_sel);
2035 +- savesegment(gs, gs_sel);
2036 +- fs_base = read_msr(MSR_FS_BASE);
2037 +- vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
2038 +- }
2039 +-
2040 +- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2041 +-#else
2042 +- savesegment(fs, fs_sel);
2043 +- savesegment(gs, gs_sel);
2044 +- fs_base = segment_base(fs_sel);
2045 +- gs_base = segment_base(gs_sel);
2046 +-#endif
2047 +-
2048 +- vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
2049 +- vmx->guest_state_loaded = true;
2050 +-}
2051 +-
2052 +-static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
2053 +-{
2054 +- struct vmcs_host_state *host_state;
2055 +-
2056 +- if (!vmx->guest_state_loaded)
2057 +- return;
2058 +-
2059 +- host_state = &vmx->loaded_vmcs->host_state;
2060 +-
2061 +- ++vmx->vcpu.stat.host_state_reload;
2062 +-
2063 +-#ifdef CONFIG_X86_64
2064 +- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2065 +-#endif
2066 +- if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
2067 +- kvm_load_ldt(host_state->ldt_sel);
2068 +-#ifdef CONFIG_X86_64
2069 +- load_gs_index(host_state->gs_sel);
2070 +-#else
2071 +- loadsegment(gs, host_state->gs_sel);
2072 +-#endif
2073 +- }
2074 +- if (host_state->fs_sel & 7)
2075 +- loadsegment(fs, host_state->fs_sel);
2076 +-#ifdef CONFIG_X86_64
2077 +- if (unlikely(host_state->ds_sel | host_state->es_sel)) {
2078 +- loadsegment(ds, host_state->ds_sel);
2079 +- loadsegment(es, host_state->es_sel);
2080 +- }
2081 +-#endif
2082 +- invalidate_tss_limit();
2083 +-#ifdef CONFIG_X86_64
2084 +- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2085 +-#endif
2086 +- load_fixmap_gdt(raw_smp_processor_id());
2087 +- vmx->guest_state_loaded = false;
2088 +- vmx->guest_msrs_ready = false;
2089 +-}
2090 +-
2091 +-#ifdef CONFIG_X86_64
2092 +-static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
2093 +-{
2094 +- preempt_disable();
2095 +- if (vmx->guest_state_loaded)
2096 +- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2097 +- preempt_enable();
2098 +- return vmx->msr_guest_kernel_gs_base;
2099 +-}
2100 +-
2101 +-static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
2102 +-{
2103 +- preempt_disable();
2104 +- if (vmx->guest_state_loaded)
2105 +- wrmsrl(MSR_KERNEL_GS_BASE, data);
2106 +- preempt_enable();
2107 +- vmx->msr_guest_kernel_gs_base = data;
2108 +-}
2109 +-#endif
2110 +-
2111 +-static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
2112 +-{
2113 +- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2114 +- struct pi_desc old, new;
2115 +- unsigned int dest;
2116 +-
2117 +- /*
2118 +- * In case of hot-plug or hot-unplug, we may have to undo
2119 +- * vmx_vcpu_pi_put even if there is no assigned device. And we
2120 +- * always keep PI.NDST up to date for simplicity: it makes the
2121 +- * code easier, and CPU migration is not a fast path.
2122 +- */
2123 +- if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
2124 +- return;
2125 +-
2126 +- /*
2127 +- * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
2128 +- * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
2129 +- * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
2130 +- * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
2131 +- * correctly.
2132 +- */
2133 +- if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
2134 +- pi_clear_sn(pi_desc);
2135 +- goto after_clear_sn;
2136 +- }
2137 +-
2138 +- /* The full case. */
2139 +- do {
2140 +- old.control = new.control = pi_desc->control;
2141 +-
2142 +- dest = cpu_physical_id(cpu);
2143 +-
2144 +- if (x2apic_enabled())
2145 +- new.ndst = dest;
2146 +- else
2147 +- new.ndst = (dest << 8) & 0xFF00;
2148 +-
2149 +- new.sn = 0;
2150 +- } while (cmpxchg64(&pi_desc->control, old.control,
2151 +- new.control) != old.control);
2152 +-
2153 +-after_clear_sn:
2154 +-
2155 +- /*
2156 +- * Clear SN before reading the bitmap. The VT-d firmware
2157 +- * writes the bitmap and reads SN atomically (5.2.3 in the
2158 +- * spec), so it doesn't really have a memory barrier that
2159 +- * pairs with this, but we cannot do that and we need one.
2160 +- */
2161 +- smp_mb__after_atomic();
2162 +-
2163 +- if (!pi_is_pir_empty(pi_desc))
2164 +- pi_set_on(pi_desc);
2165 +-}
2166 +-
2167 +-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
2168 +-{
2169 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
2170 +- bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
2171 +-
2172 +- if (!already_loaded) {
2173 +- loaded_vmcs_clear(vmx->loaded_vmcs);
2174 +- local_irq_disable();
2175 +- crash_disable_local_vmclear(cpu);
2176 +-
2177 +- /*
2178 +- * Read loaded_vmcs->cpu should be before fetching
2179 +- * loaded_vmcs->loaded_vmcss_on_cpu_link.
2180 +- * See the comments in __loaded_vmcs_clear().
2181 +- */
2182 +- smp_rmb();
2183 +-
2184 +- list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
2185 +- &per_cpu(loaded_vmcss_on_cpu, cpu));
2186 +- crash_enable_local_vmclear(cpu);
2187 +- local_irq_enable();
2188 +- }
2189 +-
2190 +- if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
2191 +- per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
2192 +- vmcs_load(vmx->loaded_vmcs->vmcs);
2193 +- indirect_branch_prediction_barrier();
2194 +- }
2195 +-
2196 +- if (!already_loaded) {
2197 +- void *gdt = get_current_gdt_ro();
2198 +- unsigned long sysenter_esp;
2199 +-
2200 +- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2201 +-
2202 +- /*
2203 +- * Linux uses per-cpu TSS and GDT, so set these when switching
2204 +- * processors. See 22.2.4.
2205 +- */
2206 +- vmcs_writel(HOST_TR_BASE,
2207 +- (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
2208 +- vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
2209 +-
2210 +- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
2211 +- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
2212 +-
2213 +- vmx->loaded_vmcs->cpu = cpu;
2214 +- }
2215 +-
2216 +- /* Setup TSC multiplier */
2217 +- if (kvm_has_tsc_control &&
2218 +- vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
2219 +- decache_tsc_multiplier(vmx);
2220 +-}
2221 +-
2222 +-/*
2223 +- * Switches to specified vcpu, until a matching vcpu_put(), but assumes
2224 +- * vcpu mutex is already taken.
2225 +- */
2226 +-void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2227 +-{
2228 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
2229 +-
2230 +- vmx_vcpu_load_vmcs(vcpu, cpu);
2231 +-
2232 +- vmx_vcpu_pi_load(vcpu, cpu);
2233 +-
2234 +- vmx->host_pkru = read_pkru();
2235 +- vmx->host_debugctlmsr = get_debugctlmsr();
2236 +-}
2237 +-
2238 +-static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
2239 +-{
2240 +- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2241 +-
2242 +- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
2243 +- !irq_remapping_cap(IRQ_POSTING_CAP) ||
2244 +- !kvm_vcpu_apicv_active(vcpu))
2245 +- return;
2246 +-
2247 +- /* Set SN when the vCPU is preempted */
2248 +- if (vcpu->preempted)
2249 +- pi_set_sn(pi_desc);
2250 +-}
2251 +-
2252 +-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
2253 +-{
2254 +- vmx_vcpu_pi_put(vcpu);
2255 +-
2256 +- vmx_prepare_switch_to_host(to_vmx(vcpu));
2257 +-}
2258 +-
2259 +-static bool emulation_required(struct kvm_vcpu *vcpu)
2260 +-{
2261 +- return emulate_invalid_guest_state && !guest_state_valid(vcpu);
2262 +-}
2263 +-
2264 +-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
2265 +-
2266 +-unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
2267 +-{
2268 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
2269 +- unsigned long rflags, save_rflags;
2270 +-
2271 +- if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
2272 +- kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
2273 +- rflags = vmcs_readl(GUEST_RFLAGS);
2274 +- if (vmx->rmode.vm86_active) {
2275 +- rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2276 +- save_rflags = vmx->rmode.save_rflags;
2277 +- rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2278 +- }
2279 +- vmx->rflags = rflags;
2280 +- }
2281 +- return vmx->rflags;
2282 +-}
2283 +-
2284 +-void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
2285 +-{
2286 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
2287 +- unsigned long old_rflags;
2288 +-
2289 +- if (enable_unrestricted_guest) {
2290 +- kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
2291 +- vmx->rflags = rflags;
2292 +- vmcs_writel(GUEST_RFLAGS, rflags);
2293 +- return;
2294 +- }
2295 +-
2296 +- old_rflags = vmx_get_rflags(vcpu);
2297 +- vmx->rflags = rflags;
2298 +- if (vmx->rmode.vm86_active) {
2299 +- vmx->rmode.save_rflags = rflags;
2300 +- rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2301 +- }
2302 +- vmcs_writel(GUEST_RFLAGS, rflags);
2303 +-
2304 +- if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
2305 +- vmx->emulation_required = emulation_required(vcpu);
2306 +-}
2307 +-
2308 +-u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2309 +-{
2310 +- u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2311 +- int ret = 0;
2312 +-
2313 +- if (interruptibility & GUEST_INTR_STATE_STI)
2314 +- ret |= KVM_X86_SHADOW_INT_STI;
2315 +- if (interruptibility & GUEST_INTR_STATE_MOV_SS)
2316 +- ret |= KVM_X86_SHADOW_INT_MOV_SS;
2317 +-
2318 +- return ret;
2319 +-}
2320 +-
2321 +-void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
2322 +-{
2323 +- u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2324 +- u32 interruptibility = interruptibility_old;
2325 +-
2326 +- interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
2327 +-
2328 +- if (mask & KVM_X86_SHADOW_INT_MOV_SS)
2329 +- interruptibility |= GUEST_INTR_STATE_MOV_SS;
2330 +- else if (mask & KVM_X86_SHADOW_INT_STI)
2331 +- interruptibility |= GUEST_INTR_STATE_STI;
2332 +-
2333 +- if ((interruptibility != interruptibility_old))
2334 +- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
2335 +-}
2336 +-
2337 +-static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
2338 +-{
2339 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
2340 +- unsigned long value;
2341 +-
2342 +- /*
2343 +- * Any MSR write that attempts to change bits marked reserved will
2344 +- * cause a #GP fault.
2345 +- */
2346 +- if (data & vmx->pt_desc.ctl_bitmask)
2347 +- return 1;
2348 +-
2349 +- /*
2350 +- * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
2351 +- * result in a #GP unless the same write also clears TraceEn.
2352 +- */
2353 +- if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
2354 +- ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
2355 +- return 1;
2356 +-
2357 +- /*
2358 +- * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
2359 +- * and FabricEn would cause #GP, if
2360 +- * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
2361 +- */
2362 +- if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
2363 +- !(data & RTIT_CTL_FABRIC_EN) &&
2364 +- !intel_pt_validate_cap(vmx->pt_desc.caps,
2365 +- PT_CAP_single_range_output))
2366 +- return 1;
2367 +-
2368 +- /*
2369 +- * MTCFreq, CycThresh and PSBFreq encodings check: any MSR write that
2370 +- * utilizes encodings marked reserved will cause a #GP fault.
2371 +- */
2372 +- value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
2373 +- if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
2374 +- !test_bit((data & RTIT_CTL_MTC_RANGE) >>
2375 +- RTIT_CTL_MTC_RANGE_OFFSET, &value))
2376 +- return 1;
2377 +- value = intel_pt_validate_cap(vmx->pt_desc.caps,
2378 +- PT_CAP_cycle_thresholds);
2379 +- if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
2380 +- !test_bit((data & RTIT_CTL_CYC_THRESH) >>
2381 +- RTIT_CTL_CYC_THRESH_OFFSET, &value))
2382 +- return 1;
2383 +- value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
2384 +- if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
2385 +- !test_bit((data & RTIT_CTL_PSB_FREQ) >>
2386 +- RTIT_CTL_PSB_FREQ_OFFSET, &value))
2387 +- return 1;
2388 +-
2389 +- /*
2390 +- * If ADDRx_CFG is reserved or the encoding is >2, it will
2391 +- * cause a #GP fault.
2392 +- */
2393 +- value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
2394 +- if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
2395 +- return 1;
2396 +- value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
2397 +- if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
2398 +- return 1;
2399 +- value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
2400 +- if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
2401 +- return 1;
2402 +- value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
2403 +- if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
2404 +- return 1;
2405 +-
2406 +- return 0;
2407 +-}
2408 +-
2409 +-static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
2410 +-{
2411 +- unsigned long rip;
2412 +-
2413 +- /*
2414 +- * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
2415 +- * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
2416 +- * set when EPT misconfig occurs. In practice, real hardware updates
2417 +- * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
2418 +- * (namely Hyper-V) don't set it due to it being undefined behavior,
2419 +- * i.e. we end up advancing IP with some random value.
2420 +- */
2421 +- if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
2422 +- to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
2423 +- rip = kvm_rip_read(vcpu);
2424 +- rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2425 +- kvm_rip_write(vcpu, rip);
2426 +- } else {
2427 +- if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
2428 +- return 0;
2429 +- }
2430 +-
2431 +- /* skipping an emulated instruction also counts */
2432 +- vmx_set_interrupt_shadow(vcpu, 0);
2433 +-
2434 +- return 1;
2435 +-}
2436 +-
2437 +-static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
2438 +-{
2439 +- /*
2440 +- * Ensure that we clear the HLT state in the VMCS. We don't need to
2441 +- * explicitly skip the instruction because if the HLT state is set,
2442 +- * then the instruction is already executing and RIP has already been
2443 +- * advanced.
2444 +- */
2445 +- if (kvm_hlt_in_guest(vcpu->kvm) &&
2446 +- vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
2447 +- vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
2448 +-}
2449 +-
2450 +-static void vmx_queue_exception(struct kvm_vcpu *vcpu)
2451 +-{
2452 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
2453 +- unsigned nr = vcpu->arch.exception.nr;
2454 +- bool has_error_code = vcpu->arch.exception.has_error_code;
2455 +- u32 error_code = vcpu->arch.exception.error_code;
2456 +- u32 intr_info = nr | INTR_INFO_VALID_MASK;
2457 +-
2458 +- kvm_deliver_exception_payload(vcpu);
2459 +-
2460 +- if (has_error_code) {
2461 +- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2462 +- intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2463 +- }
2464 +-
2465 +- if (vmx->rmode.vm86_active) {
2466 +- int inc_eip = 0;
2467 +- if (kvm_exception_is_soft(nr))
2468 +- inc_eip = vcpu->arch.event_exit_inst_len;
2469 +- kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
2470 +- return;
2471 +- }
2472 +-
2473 +- WARN_ON_ONCE(vmx->emulation_required);
2474 +-
2475 +- if (kvm_exception_is_soft(nr)) {
2476 +- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2477 +- vmx->vcpu.arch.event_exit_inst_len);
2478 +- intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2479 +- } else
2480 +- intr_info |= INTR_TYPE_HARD_EXCEPTION;
2481 +-
2482 +- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
2483 +-
2484 +- vmx_clear_hlt(vcpu);
2485 +-}
2486 +-
2487 +-static bool vmx_rdtscp_supported(void)
2488 +-{
2489 +- return cpu_has_vmx_rdtscp();
2490 +-}
2491 +-
2492 +-static bool vmx_invpcid_supported(void)
2493 +-{
2494 +- return cpu_has_vmx_invpcid();
2495 +-}
2496 +-
2497 +-/*
2498 +- * Swap MSR entry in host/guest MSR entry array.
2499 +- */
2500 +-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2501 +-{
2502 +- struct shared_msr_entry tmp;
2503 +-
2504 +- tmp = vmx->guest_msrs[to];
2505 +- vmx->guest_msrs[to] = vmx->guest_msrs[from];
2506 +- vmx->guest_msrs[from] = tmp;
2507 +-}
2508 +-
2509 +-/*
2510 +- * Set up the vmcs to automatically save and restore system
2511 +- * msrs. Don't touch the 64-bit msrs if the guest is in legacy
2512 +- * mode, as fiddling with msrs is very expensive.
2513 +- */
2514 +-static void setup_msrs(struct vcpu_vmx *vmx)
2515 +-{
2516 +- int save_nmsrs, index;
2517 +-
2518 +- save_nmsrs = 0;
2519 +-#ifdef CONFIG_X86_64
2520 +- /*
2521 +- * The SYSCALL MSRs are only needed on long mode guests, and only
2522 +- * when EFER.SCE is set.
2523 +- */
2524 +- if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
2525 +- index = __find_msr_index(vmx, MSR_STAR);
2526 +- if (index >= 0)
2527 +- move_msr_up(vmx, index, save_nmsrs++);
2528 +- index = __find_msr_index(vmx, MSR_LSTAR);
2529 +- if (index >= 0)
2530 +- move_msr_up(vmx, index, save_nmsrs++);
2531 +- index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
2532 +- if (index >= 0)
2533 +- move_msr_up(vmx, index, save_nmsrs++);
2534 +- }
2535 +-#endif
2536 +- index = __find_msr_index(vmx, MSR_EFER);
2537 +- if (index >= 0 && update_transition_efer(vmx, index))
2538 +- move_msr_up(vmx, index, save_nmsrs++);
2539 +- index = __find_msr_index(vmx, MSR_TSC_AUX);
2540 +- if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
2541 +- move_msr_up(vmx, index, save_nmsrs++);
2542 +- index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
2543 +- if (index >= 0)
2544 +- move_msr_up(vmx, index, save_nmsrs++);
2545 +-
2546 +- vmx->save_nmsrs = save_nmsrs;
2547 +- vmx->guest_msrs_ready = false;
2548 +-
2549 +- if (cpu_has_vmx_msr_bitmap())
2550 +- vmx_update_msr_bitmap(&vmx->vcpu);
2551 +-}
2552 +-
2553 +-static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
2554 +-{
2555 +- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2556 +-
2557 +- if (is_guest_mode(vcpu) &&
2558 +- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
2559 +- return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
2560 +-
2561 +- return vcpu->arch.tsc_offset;
2562 +-}
2563 +-
2564 +-static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2565 +-{
2566 +- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2567 +- u64 g_tsc_offset = 0;
2568 +-
2569 +- /*
2570 +- * We're here if L1 chose not to trap WRMSR to TSC. According
2571 +- * to the spec, this should set L1's TSC; The offset that L1
2572 +- * set for L2 remains unchanged, and still needs to be added
2573 +- * to the newly set TSC to get L2's TSC.
2574 +- */
2575 +- if (is_guest_mode(vcpu) &&
2576 +- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
2577 +- g_tsc_offset = vmcs12->tsc_offset;
2578 +-
2579 +- trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2580 +- vcpu->arch.tsc_offset - g_tsc_offset,
2581 +- offset);
2582 +- vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
2583 +- return offset + g_tsc_offset;
2584 +-}
2585 +-
2586 +-/*
2587 +- * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
2588 +- * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
2589 +- * all guests if the "nested" module option is off, and can also be disabled
2590 +- * for a single guest by disabling its VMX cpuid bit.
2591 +- */
2592 +-bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2593 +-{
2594 +- return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
2595 +-}
2596 +-
2597 +-static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
2598 +- uint64_t val)
2599 +-{
2600 +- uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
2601 +-
2602 +- return !(val & ~valid_bits);
2603 +-}
2604 +-
2605 +-static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
2606 +-{
2607 +- switch (msr->index) {
2608 +- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2609 +- if (!nested)
2610 +- return 1;
2611 +- return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
2612 +- default:
2613 +- return 1;
2614 +- }
2615 +-}
2616 +-
2617 +-/*
2618 +- * Reads an msr value (of 'msr_index') into 'pdata'.
2619 +- * Returns 0 on success, non-0 otherwise.
2620 +- * Assumes vcpu_load() was already called.
2621 +- */
2622 +-static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2623 +-{
2624 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
2625 +- struct shared_msr_entry *msr;
2626 +- u32 index;
2627 +-
2628 +- switch (msr_info->index) {
2629 +-#ifdef CONFIG_X86_64
2630 +- case MSR_FS_BASE:
2631 +- msr_info->data = vmcs_readl(GUEST_FS_BASE);
2632 +- break;
2633 +- case MSR_GS_BASE:
2634 +- msr_info->data = vmcs_readl(GUEST_GS_BASE);
2635 +- break;
2636 +- case MSR_KERNEL_GS_BASE:
2637 +- msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
2638 +- break;
2639 +-#endif
2640 +- case MSR_EFER:
2641 +- return kvm_get_msr_common(vcpu, msr_info);
2642 +- case MSR_IA32_TSX_CTRL:
2643 +- if (!msr_info->host_initiated &&
2644 +- !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2645 +- return 1;
2646 +- goto find_shared_msr;
2647 +- case MSR_IA32_UMWAIT_CONTROL:
2648 +- if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2649 +- return 1;
2650 +-
2651 +- msr_info->data = vmx->msr_ia32_umwait_control;
2652 +- break;
2653 +- case MSR_IA32_SPEC_CTRL:
2654 +- if (!msr_info->host_initiated &&
2655 +- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2656 +- return 1;
2657 +-
2658 +- msr_info->data = to_vmx(vcpu)->spec_ctrl;
2659 +- break;
2660 +- case MSR_IA32_SYSENTER_CS:
2661 +- msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2662 +- break;
2663 +- case MSR_IA32_SYSENTER_EIP:
2664 +- msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
2665 +- break;
2666 +- case MSR_IA32_SYSENTER_ESP:
2667 +- msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
2668 +- break;
2669 +- case MSR_IA32_BNDCFGS:
2670 +- if (!kvm_mpx_supported() ||
2671 +- (!msr_info->host_initiated &&
2672 +- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2673 +- return 1;
2674 +- msr_info->data = vmcs_read64(GUEST_BNDCFGS);
2675 +- break;
2676 +- case MSR_IA32_MCG_EXT_CTL:
2677 +- if (!msr_info->host_initiated &&
2678 +- !(vmx->msr_ia32_feature_control &
2679 +- FEATURE_CONTROL_LMCE))
2680 +- return 1;
2681 +- msr_info->data = vcpu->arch.mcg_ext_ctl;
2682 +- break;
2683 +- case MSR_IA32_FEATURE_CONTROL:
2684 +- msr_info->data = vmx->msr_ia32_feature_control;
2685 +- break;
2686 +- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2687 +- if (!nested_vmx_allowed(vcpu))
2688 +- return 1;
2689 +- return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
2690 +- &msr_info->data);
2691 +- case MSR_IA32_RTIT_CTL:
2692 +- if (pt_mode != PT_MODE_HOST_GUEST)
2693 +- return 1;
2694 +- msr_info->data = vmx->pt_desc.guest.ctl;
2695 +- break;
2696 +- case MSR_IA32_RTIT_STATUS:
2697 +- if (pt_mode != PT_MODE_HOST_GUEST)
2698 +- return 1;
2699 +- msr_info->data = vmx->pt_desc.guest.status;
2700 +- break;
2701 +- case MSR_IA32_RTIT_CR3_MATCH:
2702 +- if ((pt_mode != PT_MODE_HOST_GUEST) ||
2703 +- !intel_pt_validate_cap(vmx->pt_desc.caps,
2704 +- PT_CAP_cr3_filtering))
2705 +- return 1;
2706 +- msr_info->data = vmx->pt_desc.guest.cr3_match;
2707 +- break;
2708 +- case MSR_IA32_RTIT_OUTPUT_BASE:
2709 +- if ((pt_mode != PT_MODE_HOST_GUEST) ||
2710 +- (!intel_pt_validate_cap(vmx->pt_desc.caps,
2711 +- PT_CAP_topa_output) &&
2712 +- !intel_pt_validate_cap(vmx->pt_desc.caps,
2713 +- PT_CAP_single_range_output)))
2714 +- return 1;
2715 +- msr_info->data = vmx->pt_desc.guest.output_base;
2716 +- break;
2717 +- case MSR_IA32_RTIT_OUTPUT_MASK:
2718 +- if ((pt_mode != PT_MODE_HOST_GUEST) ||
2719 +- (!intel_pt_validate_cap(vmx->pt_desc.caps,
2720 +- PT_CAP_topa_output) &&
2721 +- !intel_pt_validate_cap(vmx->pt_desc.caps,
2722 +- PT_CAP_single_range_output)))
2723 +- return 1;
2724 +- msr_info->data = vmx->pt_desc.guest.output_mask;
2725 +- break;
2726 +- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2727 +- index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2728 +- if ((pt_mode != PT_MODE_HOST_GUEST) ||
2729 +- (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
2730 +- PT_CAP_num_address_ranges)))
2731 +- return 1;
2732 +- if (is_noncanonical_address(data, vcpu))
2733 +- return 1;
2734 +- if (index % 2)
2735 +- msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
2736 +- else
2737 +- msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
2738 +- break;
2739 +- case MSR_TSC_AUX:
2740 +- if (!msr_info->host_initiated &&
2741 +- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
2742 +- return 1;
2743 +- goto find_shared_msr;
2744 +- default:
2745 +- find_shared_msr:
2746 +- msr = find_msr_entry(vmx, msr_info->index);
2747 +- if (msr) {
2748 +- msr_info->data = msr->data;
2749 +- break;
2750 +- }
2751 +- return kvm_get_msr_common(vcpu, msr_info);
2752 +- }
2753 +-
2754 +- return 0;
2755 +-}
2756 +-
2757 +-/*
2758 +- * Writes msr value into the appropriate "register".
2759 +- * Returns 0 on success, non-0 otherwise.
2760 +- * Assumes vcpu_load() was already called.
2761 +- */
2762 +-static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2763 +-{
2764 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
2765 +- struct shared_msr_entry *msr;
2766 +- int ret = 0;
2767 +- u32 msr_index = msr_info->index;
2768 +- u64 data = msr_info->data;
2769 +- u32 index;
2770 +-
2771 +- switch (msr_index) {
2772 +- case MSR_EFER:
2773 +- ret = kvm_set_msr_common(vcpu, msr_info);
2774 +- break;
2775 +-#ifdef CONFIG_X86_64
2776 +- case MSR_FS_BASE:
2777 +- vmx_segment_cache_clear(vmx);
2778 +- vmcs_writel(GUEST_FS_BASE, data);
2779 +- break;
2780 +- case MSR_GS_BASE:
2781 +- vmx_segment_cache_clear(vmx);
2782 +- vmcs_writel(GUEST_GS_BASE, data);
2783 +- break;
2784 +- case MSR_KERNEL_GS_BASE:
2785 +- vmx_write_guest_kernel_gs_base(vmx, data);
2786 +- break;
2787 +-#endif
2788 +- case MSR_IA32_SYSENTER_CS:
2789 +- if (is_guest_mode(vcpu))
2790 +- get_vmcs12(vcpu)->guest_sysenter_cs = data;
2791 +- vmcs_write32(GUEST_SYSENTER_CS, data);
2792 +- break;
2793 +- case MSR_IA32_SYSENTER_EIP:
2794 +- if (is_guest_mode(vcpu))
2795 +- get_vmcs12(vcpu)->guest_sysenter_eip = data;
2796 +- vmcs_writel(GUEST_SYSENTER_EIP, data);
2797 +- break;
2798 +- case MSR_IA32_SYSENTER_ESP:
2799 +- if (is_guest_mode(vcpu))
2800 +- get_vmcs12(vcpu)->guest_sysenter_esp = data;
2801 +- vmcs_writel(GUEST_SYSENTER_ESP, data);
2802 +- break;
2803 +- case MSR_IA32_DEBUGCTLMSR:
2804 +- if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2805 +- VM_EXIT_SAVE_DEBUG_CONTROLS)
2806 +- get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2807 +-
2808 +- ret = kvm_set_msr_common(vcpu, msr_info);
2809 +- break;
2810 +-
2811 +- case MSR_IA32_BNDCFGS:
2812 +- if (!kvm_mpx_supported() ||
2813 +- (!msr_info->host_initiated &&
2814 +- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2815 +- return 1;
2816 +- if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
2817 +- (data & MSR_IA32_BNDCFGS_RSVD))
2818 +- return 1;
2819 +- vmcs_write64(GUEST_BNDCFGS, data);
2820 +- break;
2821 +- case MSR_IA32_UMWAIT_CONTROL:
2822 +- if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2823 +- return 1;
2824 +-
2825 +- /* The reserved bit 1 and non-32 bit [63:32] should be zero */
2826 +- if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2827 +- return 1;
2828 +-
2829 +- vmx->msr_ia32_umwait_control = data;
2830 +- break;
2831 +- case MSR_IA32_SPEC_CTRL:
2832 +- if (!msr_info->host_initiated &&
2833 +- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2834 +- return 1;
2835 +-
2836 +- /* The STIBP bit doesn't fault even if it's not advertised */
2837 +- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
2838 +- return 1;
2839 +-
2840 +- vmx->spec_ctrl = data;
2841 +-
2842 +- if (!data)
2843 +- break;
2844 +-
2845 +- /*
2846 +- * For non-nested:
2847 +- * When it's written (to non-zero) for the first time, pass
2848 +- * it through.
2849 +- *
2850 +- * For nested:
2851 +- * The handling of the MSR bitmap for L2 guests is done in
2852 +- * nested_vmx_prepare_msr_bitmap. We should not touch the
2853 +- * vmcs02.msr_bitmap here since it gets completely overwritten
2854 +- * in the merging. We update the vmcs01 here for L1 as well
2855 +- * since it will end up touching the MSR anyway now.
2856 +- */
2857 +- vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
2858 +- MSR_IA32_SPEC_CTRL,
2859 +- MSR_TYPE_RW);
2860 +- break;
2861 +- case MSR_IA32_TSX_CTRL:
2862 +- if (!msr_info->host_initiated &&
2863 +- !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2864 +- return 1;
2865 +- if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2866 +- return 1;
2867 +- goto find_shared_msr;
2868 +- case MSR_IA32_PRED_CMD:
2869 +- if (!msr_info->host_initiated &&
2870 +- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2871 +- return 1;
2872 +-
2873 +- if (data & ~PRED_CMD_IBPB)
2874 +- return 1;
2875 +-
2876 +- if (!data)
2877 +- break;
2878 +-
2879 +- wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2880 +-
2881 +- /*
2882 +- * For non-nested:
2883 +- * When it's written (to non-zero) for the first time, pass
2884 +- * it through.
2885 +- *
2886 +- * For nested:
2887 +- * The handling of the MSR bitmap for L2 guests is done in
2888 +- * nested_vmx_prepare_msr_bitmap. We should not touch the
2889 +- * vmcs02.msr_bitmap here since it gets completely overwritten
2890 +- * in the merging.
2891 +- */
2892 +- vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
2893 +- MSR_TYPE_W);
2894 +- break;
2895 +- case MSR_IA32_CR_PAT:
2896 +- if (!kvm_pat_valid(data))
2897 +- return 1;
2898 +-
2899 +- if (is_guest_mode(vcpu) &&
2900 +- get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2901 +- get_vmcs12(vcpu)->guest_ia32_pat = data;
2902 +-
2903 +- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2904 +- vmcs_write64(GUEST_IA32_PAT, data);
2905 +- vcpu->arch.pat = data;
2906 +- break;
2907 +- }
2908 +- ret = kvm_set_msr_common(vcpu, msr_info);
2909 +- break;
2910 +- case MSR_IA32_TSC_ADJUST:
2911 +- ret = kvm_set_msr_common(vcpu, msr_info);
2912 +- break;
2913 +- case MSR_IA32_MCG_EXT_CTL:
2914 +- if ((!msr_info->host_initiated &&
2915 +- !(to_vmx(vcpu)->msr_ia32_feature_control &
2916 +- FEATURE_CONTROL_LMCE)) ||
2917 +- (data & ~MCG_EXT_CTL_LMCE_EN))
2918 +- return 1;
2919 +- vcpu->arch.mcg_ext_ctl = data;
2920 +- break;
2921 +- case MSR_IA32_FEATURE_CONTROL:
2922 +- if (!vmx_feature_control_msr_valid(vcpu, data) ||
2923 +- (to_vmx(vcpu)->msr_ia32_feature_control &
2924 +- FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
2925 +- return 1;
2926 +- vmx->msr_ia32_feature_control = data;
2927 +- if (msr_info->host_initiated && data == 0)
2928 +- vmx_leave_nested(vcpu);
2929 +- break;
2930 +- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2931 +- if (!msr_info->host_initiated)
2932 +- return 1; /* they are read-only */
2933 +- if (!nested_vmx_allowed(vcpu))
2934 +- return 1;
2935 +- return vmx_set_vmx_msr(vcpu, msr_index, data);
2936 +- case MSR_IA32_RTIT_CTL:
2937 +- if ((pt_mode != PT_MODE_HOST_GUEST) ||
2938 +- vmx_rtit_ctl_check(vcpu, data) ||
2939 +- vmx->nested.vmxon)
2940 +- return 1;
2941 +- vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2942 +- vmx->pt_desc.guest.ctl = data;
2943 +- pt_update_intercept_for_msr(vmx);
2944 +- break;
2945 +- case MSR_IA32_RTIT_STATUS:
2946 +- if ((pt_mode != PT_MODE_HOST_GUEST) ||
2947 +- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
2948 +- (data & MSR_IA32_RTIT_STATUS_MASK))
2949 +- return 1;
2950 +- vmx->pt_desc.guest.status = data;
2951 +- break;
2952 +- case MSR_IA32_RTIT_CR3_MATCH:
2953 +- if ((pt_mode != PT_MODE_HOST_GUEST) ||
2954 +- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
2955 +- !intel_pt_validate_cap(vmx->pt_desc.caps,
2956 +- PT_CAP_cr3_filtering))
2957 +- return 1;
2958 +- vmx->pt_desc.guest.cr3_match = data;
2959 +- break;
2960 +- case MSR_IA32_RTIT_OUTPUT_BASE:
2961 +- if ((pt_mode != PT_MODE_HOST_GUEST) ||
2962 +- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
2963 +- (!intel_pt_validate_cap(vmx->pt_desc.caps,
2964 +- PT_CAP_topa_output) &&
2965 +- !intel_pt_validate_cap(vmx->pt_desc.caps,
2966 +- PT_CAP_single_range_output)) ||
2967 +- (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
2968 +- return 1;
2969 +- vmx->pt_desc.guest.output_base = data;
2970 +- break;
2971 +- case MSR_IA32_RTIT_OUTPUT_MASK:
2972 +- if ((pt_mode != PT_MODE_HOST_GUEST) ||
2973 +- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
2974 +- (!intel_pt_validate_cap(vmx->pt_desc.caps,
2975 +- PT_CAP_topa_output) &&
2976 +- !intel_pt_validate_cap(vmx->pt_desc.caps,
2977 +- PT_CAP_single_range_output)))
2978 +- return 1;
2979 +- vmx->pt_desc.guest.output_mask = data;
2980 +- break;
2981 +- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2982 +- index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2983 +- if ((pt_mode != PT_MODE_HOST_GUEST) ||
2984 +- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
2985 +- (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
2986 +- PT_CAP_num_address_ranges)))
2987 +- return 1;
2988 +- if (is_noncanonical_address(data, vcpu))
2989 +- return 1;
2990 +- if (index % 2)
2991 +- vmx->pt_desc.guest.addr_b[index / 2] = data;
2992 +- else
2993 +- vmx->pt_desc.guest.addr_a[index / 2] = data;
2994 +- break;
2995 +- case MSR_TSC_AUX:
2996 +- if (!msr_info->host_initiated &&
2997 +- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
2998 +- return 1;
2999 +- /* Check reserved bit, higher 32 bits should be zero */
3000 +- if ((data >> 32) != 0)
3001 +- return 1;
3002 +- goto find_shared_msr;
3003 +-
3004 +- default:
3005 +- find_shared_msr:
3006 +- msr = find_msr_entry(vmx, msr_index);
3007 +- if (msr)
3008 +- ret = vmx_set_guest_msr(vmx, msr, data);
3009 +- else
3010 +- ret = kvm_set_msr_common(vcpu, msr_info);
3011 +- }
3012 +-
3013 +- return ret;
3014 +-}
3015 +-
3016 +-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
3017 +-{
3018 +- kvm_register_mark_available(vcpu, reg);
3019 +-
3020 +- switch (reg) {
3021 +- case VCPU_REGS_RSP:
3022 +- vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
3023 +- break;
3024 +- case VCPU_REGS_RIP:
3025 +- vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
3026 +- break;
3027 +- case VCPU_EXREG_PDPTR:
3028 +- if (enable_ept)
3029 +- ept_save_pdptrs(vcpu);
3030 +- break;
3031 +- case VCPU_EXREG_CR3:
3032 +- if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
3033 +- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3034 +- break;
3035 +- default:
3036 +- WARN_ON_ONCE(1);
3037 +- break;
3038 +- }
3039 +-}
3040 +-
3041 +-static __init int cpu_has_kvm_support(void)
3042 +-{
3043 +- return cpu_has_vmx();
3044 +-}
3045 +-
3046 +-static __init int vmx_disabled_by_bios(void)
3047 +-{
3048 +- u64 msr;
3049 +-
3050 +- rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
3051 +- if (msr & FEATURE_CONTROL_LOCKED) {
3052 +- /* launched w/ TXT and VMX disabled */
3053 +- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
3054 +- && tboot_enabled())
3055 +- return 1;
3056 +- /* launched w/o TXT and VMX only enabled w/ TXT */
3057 +- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
3058 +- && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
3059 +- && !tboot_enabled()) {
3060 +- printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
3061 +- "activate TXT before enabling KVM\n");
3062 +- return 1;
3063 +- }
3064 +- /* launched w/o TXT and VMX disabled */
3065 +- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
3066 +- && !tboot_enabled())
3067 +- return 1;
3068 +- }
3069 +-
3070 +- return 0;
3071 +-}
3072 +-
3073 +-static void kvm_cpu_vmxon(u64 addr)
3074 +-{
3075 +- cr4_set_bits(X86_CR4_VMXE);
3076 +- intel_pt_handle_vmx(1);
3077 +-
3078 +- asm volatile ("vmxon %0" : : "m"(addr));
3079 +-}
3080 +-
3081 +-static int hardware_enable(void)
3082 +-{
3083 +- int cpu = raw_smp_processor_id();
3084 +- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
3085 +- u64 old, test_bits;
3086 +-
3087 +- if (cr4_read_shadow() & X86_CR4_VMXE)
3088 +- return -EBUSY;
3089 +-
3090 +- /*
3091 +- * This can happen if we hot-added a CPU but failed to allocate
3092 +- * VP assist page for it.
3093 +- */
3094 +- if (static_branch_unlikely(&enable_evmcs) &&
3095 +- !hv_get_vp_assist_page(cpu))
3096 +- return -EFAULT;
3097 +-
3098 +- INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
3099 +- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
3100 +- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
3101 +-
3102 +- /*
3103 +- * Now we can enable the vmclear operation in kdump
3104 +- * since the loaded_vmcss_on_cpu list on this cpu
3105 +- * has been initialized.
3106 +- *
3107 +- * Though the cpu is not in VMX operation now, there
3108 +- * is no problem enabling the vmclear operation, since
3109 +- * the loaded_vmcss_on_cpu list is empty!
3110 +- */
3111 +- crash_enable_local_vmclear(cpu);
3112 +-
3113 +- rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
3114 +-
3115 +- test_bits = FEATURE_CONTROL_LOCKED;
3116 +- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
3117 +- if (tboot_enabled())
3118 +- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
3119 +-
3120 +- if ((old & test_bits) != test_bits) {
3121 +- /* enable and lock */
3122 +- wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
3123 +- }
3124 +- kvm_cpu_vmxon(phys_addr);
3125 +- if (enable_ept)
3126 +- ept_sync_global();
3127 +-
3128 +- return 0;
3129 +-}
3130 +-
3131 +-static void vmclear_local_loaded_vmcss(void)
3132 +-{
3133 +- int cpu = raw_smp_processor_id();
3134 +- struct loaded_vmcs *v, *n;
3135 +-
3136 +- list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
3137 +- loaded_vmcss_on_cpu_link)
3138 +- __loaded_vmcs_clear(v);
3139 +-}
3140 +-
3141 +-
3142 +-/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
3143 +- * tricks.
3144 +- */
3145 +-static void kvm_cpu_vmxoff(void)
3146 +-{
3147 +- asm volatile (__ex("vmxoff"));
3148 +-
3149 +- intel_pt_handle_vmx(0);
3150 +- cr4_clear_bits(X86_CR4_VMXE);
3151 +-}
3152 +-
3153 +-static void hardware_disable(void)
3154 +-{
3155 +- vmclear_local_loaded_vmcss();
3156 +- kvm_cpu_vmxoff();
3157 +-}
3158 +-
3159 +-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
3160 +- u32 msr, u32 *result)
3161 +-{
3162 +- u32 vmx_msr_low, vmx_msr_high;
3163 +- u32 ctl = ctl_min | ctl_opt;
3164 +-
3165 +- rdmsr(msr, vmx_msr_low, vmx_msr_high);
3166 +-
3167 +- ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
3168 +- ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
3169 +-
3170 +- /* Ensure minimum (required) set of control bits are supported. */
3171 +- if (ctl_min & ~ctl)
3172 +- return -EIO;
3173 +-
3174 +- *result = ctl;
3175 +- return 0;
3176 +-}
3177 +-
3178 +-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
3179 +- struct vmx_capability *vmx_cap)
3180 +-{
3181 +- u32 vmx_msr_low, vmx_msr_high;
3182 +- u32 min, opt, min2, opt2;
3183 +- u32 _pin_based_exec_control = 0;
3184 +- u32 _cpu_based_exec_control = 0;
3185 +- u32 _cpu_based_2nd_exec_control = 0;
3186 +- u32 _vmexit_control = 0;
3187 +- u32 _vmentry_control = 0;
3188 +-
3189 +- memset(vmcs_conf, 0, sizeof(*vmcs_conf));
3190 +- min = CPU_BASED_HLT_EXITING |
3191 +-#ifdef CONFIG_X86_64
3192 +- CPU_BASED_CR8_LOAD_EXITING |
3193 +- CPU_BASED_CR8_STORE_EXITING |
3194 +-#endif
3195 +- CPU_BASED_CR3_LOAD_EXITING |
3196 +- CPU_BASED_CR3_STORE_EXITING |
3197 +- CPU_BASED_UNCOND_IO_EXITING |
3198 +- CPU_BASED_MOV_DR_EXITING |
3199 +- CPU_BASED_USE_TSC_OFFSETTING |
3200 +- CPU_BASED_MWAIT_EXITING |
3201 +- CPU_BASED_MONITOR_EXITING |
3202 +- CPU_BASED_INVLPG_EXITING |
3203 +- CPU_BASED_RDPMC_EXITING;
3204 +-
3205 +- opt = CPU_BASED_TPR_SHADOW |
3206 +- CPU_BASED_USE_MSR_BITMAPS |
3207 +- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
3208 +- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
3209 +- &_cpu_based_exec_control) < 0)
3210 +- return -EIO;
3211 +-#ifdef CONFIG_X86_64
3212 +- if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3213 +- _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
3214 +- ~CPU_BASED_CR8_STORE_EXITING;
3215 +-#endif
3216 +- if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
3217 +- min2 = 0;
3218 +- opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
3219 +- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3220 +- SECONDARY_EXEC_WBINVD_EXITING |
3221 +- SECONDARY_EXEC_ENABLE_VPID |
3222 +- SECONDARY_EXEC_ENABLE_EPT |
3223 +- SECONDARY_EXEC_UNRESTRICTED_GUEST |
3224 +- SECONDARY_EXEC_PAUSE_LOOP_EXITING |
3225 +- SECONDARY_EXEC_DESC |
3226 +- SECONDARY_EXEC_RDTSCP |
3227 +- SECONDARY_EXEC_ENABLE_INVPCID |
3228 +- SECONDARY_EXEC_APIC_REGISTER_VIRT |
3229 +- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3230 +- SECONDARY_EXEC_SHADOW_VMCS |
3231 +- SECONDARY_EXEC_XSAVES |
3232 +- SECONDARY_EXEC_RDSEED_EXITING |
3233 +- SECONDARY_EXEC_RDRAND_EXITING |
3234 +- SECONDARY_EXEC_ENABLE_PML |
3235 +- SECONDARY_EXEC_TSC_SCALING |
3236 +- SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
3237 +- SECONDARY_EXEC_PT_USE_GPA |
3238 +- SECONDARY_EXEC_PT_CONCEAL_VMX |
3239 +- SECONDARY_EXEC_ENABLE_VMFUNC |
3240 +- SECONDARY_EXEC_ENCLS_EXITING;
3241 +- if (adjust_vmx_controls(min2, opt2,
3242 +- MSR_IA32_VMX_PROCBASED_CTLS2,
3243 +- &_cpu_based_2nd_exec_control) < 0)
3244 +- return -EIO;
3245 +- }
3246 +-#ifndef CONFIG_X86_64
3247 +- if (!(_cpu_based_2nd_exec_control &
3248 +- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
3249 +- _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
3250 +-#endif
3251 +-
3252 +- if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3253 +- _cpu_based_2nd_exec_control &= ~(
3254 +- SECONDARY_EXEC_APIC_REGISTER_VIRT |
3255 +- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3256 +- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3257 +-
3258 +- rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
3259 +- &vmx_cap->ept, &vmx_cap->vpid);
3260 +-
3261 +- if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
3262 +- /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
3263 +- enabled */
3264 +- _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
3265 +- CPU_BASED_CR3_STORE_EXITING |
3266 +- CPU_BASED_INVLPG_EXITING);
3267 +- } else if (vmx_cap->ept) {
3268 +- vmx_cap->ept = 0;
3269 +- pr_warn_once("EPT CAP should not exist if not support "
3270 +- "1-setting enable EPT VM-execution control\n");
3271 +- }
3272 +- if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
3273 +- vmx_cap->vpid) {
3274 +- vmx_cap->vpid = 0;
3275 +- pr_warn_once("VPID CAP should not exist if not support "
3276 +- "1-setting enable VPID VM-execution control\n");
3277 +- }
3278 +-
3279 +- min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
3280 +-#ifdef CONFIG_X86_64
3281 +- min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
3282 +-#endif
3283 +- opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
3284 +- VM_EXIT_LOAD_IA32_PAT |
3285 +- VM_EXIT_LOAD_IA32_EFER |
3286 +- VM_EXIT_CLEAR_BNDCFGS |
3287 +- VM_EXIT_PT_CONCEAL_PIP |
3288 +- VM_EXIT_CLEAR_IA32_RTIT_CTL;
3289 +- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
3290 +- &_vmexit_control) < 0)
3291 +- return -EIO;
3292 +-
3293 +- min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
3294 +- opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
3295 +- PIN_BASED_VMX_PREEMPTION_TIMER;
3296 +- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
3297 +- &_pin_based_exec_control) < 0)
3298 +- return -EIO;
3299 +-
3300 +- if (cpu_has_broken_vmx_preemption_timer())
3301 +- _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
3302 +- if (!(_cpu_based_2nd_exec_control &
3303 +- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
3304 +- _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
3305 +-
3306 +- min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
3307 +- opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
3308 +- VM_ENTRY_LOAD_IA32_PAT |
3309 +- VM_ENTRY_LOAD_IA32_EFER |
3310 +- VM_ENTRY_LOAD_BNDCFGS |
3311 +- VM_ENTRY_PT_CONCEAL_PIP |
3312 +- VM_ENTRY_LOAD_IA32_RTIT_CTL;
3313 +- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
3314 +- &_vmentry_control) < 0)
3315 +- return -EIO;
3316 +-
3317 +- /*
3318 +- * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
3319 +- * can't be used due to an errata where VM Exit may incorrectly clear
3320 +- * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
3321 +- * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
3322 +- */
3323 +- if (boot_cpu_data.x86 == 0x6) {
3324 +- switch (boot_cpu_data.x86_model) {
3325 +- case 26: /* AAK155 */
3326 +- case 30: /* AAP115 */
3327 +- case 37: /* AAT100 */
3328 +- case 44: /* BC86,AAY89,BD102 */
3329 +- case 46: /* BA97 */
3330 +- _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
3331 +- _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
3332 +- pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
3333 +- "does not work properly. Using workaround\n");
3334 +- break;
3335 +- default:
3336 +- break;
3337 +- }
3338 +- }
3339 +-
3340 +-
3341 +- rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
3342 +-
3343 +- /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
3344 +- if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
3345 +- return -EIO;
3346 +-
3347 +-#ifdef CONFIG_X86_64
3348 +- /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
3349 +- if (vmx_msr_high & (1u<<16))
3350 +- return -EIO;
3351 +-#endif
3352 +-
3353 +- /* Require Write-Back (WB) memory type for VMCS accesses. */
3354 +- if (((vmx_msr_high >> 18) & 15) != 6)
3355 +- return -EIO;
3356 +-
3357 +- vmcs_conf->size = vmx_msr_high & 0x1fff;
3358 +- vmcs_conf->order = get_order(vmcs_conf->size);
3359 +- vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
3360 +-
3361 +- vmcs_conf->revision_id = vmx_msr_low;
3362 +-
3363 +- vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
3364 +- vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
3365 +- vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
3366 +- vmcs_conf->vmexit_ctrl = _vmexit_control;
3367 +- vmcs_conf->vmentry_ctrl = _vmentry_control;
3368 +-
3369 +- if (static_branch_unlikely(&enable_evmcs))
3370 +- evmcs_sanitize_exec_ctrls(vmcs_conf);
3371 +-
3372 +- return 0;
3373 +-}
3374 +-
3375 +-struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
3376 +-{
3377 +- int node = cpu_to_node(cpu);
3378 +- struct page *pages;
3379 +- struct vmcs *vmcs;
3380 +-
3381 +- pages = __alloc_pages_node(node, flags, vmcs_config.order);
3382 +- if (!pages)
3383 +- return NULL;
3384 +- vmcs = page_address(pages);
3385 +- memset(vmcs, 0, vmcs_config.size);
3386 +-
3387 +- /* KVM supports Enlightened VMCS v1 only */
3388 +- if (static_branch_unlikely(&enable_evmcs))
3389 +- vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
3390 +- else
3391 +- vmcs->hdr.revision_id = vmcs_config.revision_id;
3392 +-
3393 +- if (shadow)
3394 +- vmcs->hdr.shadow_vmcs = 1;
3395 +- return vmcs;
3396 +-}
3397 +-
3398 +-void free_vmcs(struct vmcs *vmcs)
3399 +-{
3400 +- free_pages((unsigned long)vmcs, vmcs_config.order);
3401 +-}
3402 +-
3403 +-/*
3404 +- * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
3405 +- */
3406 +-void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3407 +-{
3408 +- if (!loaded_vmcs->vmcs)
3409 +- return;
3410 +- loaded_vmcs_clear(loaded_vmcs);
3411 +- free_vmcs(loaded_vmcs->vmcs);
3412 +- loaded_vmcs->vmcs = NULL;
3413 +- if (loaded_vmcs->msr_bitmap)
3414 +- free_page((unsigned long)loaded_vmcs->msr_bitmap);
3415 +- WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
3416 +-}
3417 +-
3418 +-int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3419 +-{
3420 +- loaded_vmcs->vmcs = alloc_vmcs(false);
3421 +- if (!loaded_vmcs->vmcs)
3422 +- return -ENOMEM;
3423 +-
3424 +- loaded_vmcs->shadow_vmcs = NULL;
3425 +- loaded_vmcs->hv_timer_soft_disabled = false;
3426 +- loaded_vmcs_init(loaded_vmcs);
3427 +-
3428 +- if (cpu_has_vmx_msr_bitmap()) {
3429 +- loaded_vmcs->msr_bitmap = (unsigned long *)
3430 +- __get_free_page(GFP_KERNEL_ACCOUNT);
3431 +- if (!loaded_vmcs->msr_bitmap)
3432 +- goto out_vmcs;
3433 +- memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
3434 +-
3435 +- if (IS_ENABLED(CONFIG_HYPERV) &&
3436 +- static_branch_unlikely(&enable_evmcs) &&
3437 +- (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
3438 +- struct hv_enlightened_vmcs *evmcs =
3439 +- (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
3440 +-
3441 +- evmcs->hv_enlightenments_control.msr_bitmap = 1;
3442 +- }
3443 +- }
3444 +-
3445 +- memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
3446 +- memset(&loaded_vmcs->controls_shadow, 0,
3447 +- sizeof(struct vmcs_controls_shadow));
3448 +-
3449 +- return 0;
3450 +-
3451 +-out_vmcs:
3452 +- free_loaded_vmcs(loaded_vmcs);
3453 +- return -ENOMEM;
3454 +-}
3455 +-
3456 +-static void free_kvm_area(void)
3457 +-{
3458 +- int cpu;
3459 +-
3460 +- for_each_possible_cpu(cpu) {
3461 +- free_vmcs(per_cpu(vmxarea, cpu));
3462 +- per_cpu(vmxarea, cpu) = NULL;
3463 +- }
3464 +-}
3465 +-
3466 +-static __init int alloc_kvm_area(void)
3467 +-{
3468 +- int cpu;
3469 +-
3470 +- for_each_possible_cpu(cpu) {
3471 +- struct vmcs *vmcs;
3472 +-
3473 +- vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
3474 +- if (!vmcs) {
3475 +- free_kvm_area();
3476 +- return -ENOMEM;
3477 +- }
3478 +-
3479 +- /*
3480 +- * When eVMCS is enabled, alloc_vmcs_cpu() sets
3481 +- * vmcs->revision_id to KVM_EVMCS_VERSION instead of
3482 +- * revision_id reported by MSR_IA32_VMX_BASIC.
3483 +- *
3484 +- * However, even though not explicitly documented by
3485 +- * TLFS, VMXArea passed as VMXON argument should
3486 +- * still be marked with revision_id reported by
3487 +- * physical CPU.
3488 +- */
3489 +- if (static_branch_unlikely(&enable_evmcs))
3490 +- vmcs->hdr.revision_id = vmcs_config.revision_id;
3491 +-
3492 +- per_cpu(vmxarea, cpu) = vmcs;
3493 +- }
3494 +- return 0;
3495 +-}
3496 +-
3497 +-static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3498 +- struct kvm_segment *save)
3499 +-{
3500 +- if (!emulate_invalid_guest_state) {
3501 +- /*
3502 +- * CS and SS RPL should be equal during guest entry according
3503 +- * to VMX spec, but in reality it is not always so. Since vcpu
3504 +- * is in the middle of the transition from real mode to
3505 +- * protected mode it is safe to assume that RPL 0 is a good
3506 +- * default value.
3507 +- */
3508 +- if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3509 +- save->selector &= ~SEGMENT_RPL_MASK;
3510 +- save->dpl = save->selector & SEGMENT_RPL_MASK;
3511 +- save->s = 1;
3512 +- }
3513 +- vmx_set_segment(vcpu, save, seg);
3514 +-}
3515 +-
3516 +-static void enter_pmode(struct kvm_vcpu *vcpu)
3517 +-{
3518 +- unsigned long flags;
3519 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
3520 +-
3521 +- /*
3522 +- * Update real mode segment cache. It may be not up-to-date if sement
3523 +- * register was written while vcpu was in a guest mode.
3524 +- */
3525 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3526 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3527 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3528 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3529 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3530 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3531 +-
3532 +- vmx->rmode.vm86_active = 0;
3533 +-
3534 +- vmx_segment_cache_clear(vmx);
3535 +-
3536 +- vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3537 +-
3538 +- flags = vmcs_readl(GUEST_RFLAGS);
3539 +- flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3540 +- flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3541 +- vmcs_writel(GUEST_RFLAGS, flags);
3542 +-
3543 +- vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3544 +- (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3545 +-
3546 +- update_exception_bitmap(vcpu);
3547 +-
3548 +- fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3549 +- fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3550 +- fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3551 +- fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3552 +- fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3553 +- fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3554 +-}
3555 +-
3556 +-static void fix_rmode_seg(int seg, struct kvm_segment *save)
3557 +-{
3558 +- const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3559 +- struct kvm_segment var = *save;
3560 +-
3561 +- var.dpl = 0x3;
3562 +- if (seg == VCPU_SREG_CS)
3563 +- var.type = 0x3;
3564 +-
3565 +- if (!emulate_invalid_guest_state) {
3566 +- var.selector = var.base >> 4;
3567 +- var.base = var.base & 0xffff0;
3568 +- var.limit = 0xffff;
3569 +- var.g = 0;
3570 +- var.db = 0;
3571 +- var.present = 1;
3572 +- var.s = 1;
3573 +- var.l = 0;
3574 +- var.unusable = 0;
3575 +- var.type = 0x3;
3576 +- var.avl = 0;
3577 +- if (save->base & 0xf)
3578 +- printk_once(KERN_WARNING "kvm: segment base is not "
3579 +- "paragraph aligned when entering "
3580 +- "protected mode (seg=%d)", seg);
3581 +- }
3582 +-
3583 +- vmcs_write16(sf->selector, var.selector);
3584 +- vmcs_writel(sf->base, var.base);
3585 +- vmcs_write32(sf->limit, var.limit);
3586 +- vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3587 +-}
3588 +-
3589 +-static void enter_rmode(struct kvm_vcpu *vcpu)
3590 +-{
3591 +- unsigned long flags;
3592 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
3593 +- struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
3594 +-
3595 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3596 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3597 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3598 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3599 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3600 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3601 +- vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3602 +-
3603 +- vmx->rmode.vm86_active = 1;
3604 +-
3605 +- /*
3606 +- * Very old userspace does not call KVM_SET_TSS_ADDR before entering
3607 +- * vcpu. Warn the user that an update is overdue.
3608 +- */
3609 +- if (!kvm_vmx->tss_addr)
3610 +- printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
3611 +- "called before entering vcpu\n");
3612 +-
3613 +- vmx_segment_cache_clear(vmx);
3614 +-
3615 +- vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
3616 +- vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3617 +- vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3618 +-
3619 +- flags = vmcs_readl(GUEST_RFLAGS);
3620 +- vmx->rmode.save_rflags = flags;
3621 +-
3622 +- flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3623 +-
3624 +- vmcs_writel(GUEST_RFLAGS, flags);
3625 +- vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3626 +- update_exception_bitmap(vcpu);
3627 +-
3628 +- fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3629 +- fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3630 +- fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3631 +- fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3632 +- fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3633 +- fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3634 +-
3635 +- kvm_mmu_reset_context(vcpu);
3636 +-}
3637 +-
3638 +-void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3639 +-{
3640 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
3641 +- struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
3642 +-
3643 +- if (!msr)
3644 +- return;
3645 +-
3646 +- vcpu->arch.efer = efer;
3647 +- if (efer & EFER_LMA) {
3648 +- vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3649 +- msr->data = efer;
3650 +- } else {
3651 +- vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3652 +-
3653 +- msr->data = efer & ~EFER_LME;
3654 +- }
3655 +- setup_msrs(vmx);
3656 +-}
3657 +-
3658 +-#ifdef CONFIG_X86_64
3659 +-
3660 +-static void enter_lmode(struct kvm_vcpu *vcpu)
3661 +-{
3662 +- u32 guest_tr_ar;
3663 +-
3664 +- vmx_segment_cache_clear(to_vmx(vcpu));
3665 +-
3666 +- guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3667 +- if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
3668 +- pr_debug_ratelimited("%s: tss fixup for long mode. \n",
3669 +- __func__);
3670 +- vmcs_write32(GUEST_TR_AR_BYTES,
3671 +- (guest_tr_ar & ~VMX_AR_TYPE_MASK)
3672 +- | VMX_AR_TYPE_BUSY_64_TSS);
3673 +- }
3674 +- vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3675 +-}
3676 +-
3677 +-static void exit_lmode(struct kvm_vcpu *vcpu)
3678 +-{
3679 +- vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3680 +- vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3681 +-}
3682 +-
3683 +-#endif
3684 +-
3685 +-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3686 +-{
3687 +- int vpid = to_vmx(vcpu)->vpid;
3688 +-
3689 +- if (!vpid_sync_vcpu_addr(vpid, addr))
3690 +- vpid_sync_context(vpid);
3691 +-
3692 +- /*
3693 +- * If VPIDs are not supported or enabled, then the above is a no-op.
3694 +- * But we don't really need a TLB flush in that case anyway, because
3695 +- * each VM entry/exit includes an implicit flush when VPID is 0.
3696 +- */
3697 +-}
3698 +-
3699 +-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
3700 +-{
3701 +- ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
3702 +-
3703 +- vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
3704 +- vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
3705 +-}
3706 +-
3707 +-static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
3708 +-{
3709 +- ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
3710 +-
3711 +- vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
3712 +- vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
3713 +-}
3714 +-
3715 +-static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
3716 +-{
3717 +- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3718 +-
3719 +- if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
3720 +- return;
3721 +-
3722 +- if (is_pae_paging(vcpu)) {
3723 +- vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3724 +- vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3725 +- vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3726 +- vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3727 +- }
3728 +-}
3729 +-
3730 +-void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3731 +-{
3732 +- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3733 +-
3734 +- if (is_pae_paging(vcpu)) {
3735 +- mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3736 +- mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3737 +- mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3738 +- mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3739 +- }
3740 +-
3741 +- kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
3742 +-}
3743 +-
3744 +-static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
3745 +- unsigned long cr0,
3746 +- struct kvm_vcpu *vcpu)
3747 +-{
3748 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
3749 +-
3750 +- if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3751 +- vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
3752 +- if (!(cr0 & X86_CR0_PG)) {
3753 +- /* From paging/starting to nonpaging */
3754 +- exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
3755 +- CPU_BASED_CR3_STORE_EXITING);
3756 +- vcpu->arch.cr0 = cr0;
3757 +- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3758 +- } else if (!is_paging(vcpu)) {
3759 +- /* From nonpaging to paging */
3760 +- exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
3761 +- CPU_BASED_CR3_STORE_EXITING);
3762 +- vcpu->arch.cr0 = cr0;
3763 +- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3764 +- }
3765 +-
3766 +- if (!(cr0 & X86_CR0_WP))
3767 +- *hw_cr0 &= ~X86_CR0_WP;
3768 +-}
3769 +-
3770 +-void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3771 +-{
3772 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
3773 +- unsigned long hw_cr0;
3774 +-
3775 +- hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3776 +- if (enable_unrestricted_guest)
3777 +- hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3778 +- else {
3779 +- hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3780 +-
3781 +- if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3782 +- enter_pmode(vcpu);
3783 +-
3784 +- if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3785 +- enter_rmode(vcpu);
3786 +- }
3787 +-
3788 +-#ifdef CONFIG_X86_64
3789 +- if (vcpu->arch.efer & EFER_LME) {
3790 +- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
3791 +- enter_lmode(vcpu);
3792 +- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
3793 +- exit_lmode(vcpu);
3794 +- }
3795 +-#endif
3796 +-
3797 +- if (enable_ept && !enable_unrestricted_guest)
3798 +- ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
3799 +-
3800 +- vmcs_writel(CR0_READ_SHADOW, cr0);
3801 +- vmcs_writel(GUEST_CR0, hw_cr0);
3802 +- vcpu->arch.cr0 = cr0;
3803 +-
3804 +- /* depends on vcpu->arch.cr0 to be set to a new value */
3805 +- vmx->emulation_required = emulation_required(vcpu);
3806 +-}
3807 +-
3808 +-static int get_ept_level(struct kvm_vcpu *vcpu)
3809 +-{
3810 +- if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
3811 +- return 5;
3812 +- return 4;
3813 +-}
3814 +-
3815 +-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
3816 +-{
3817 +- u64 eptp = VMX_EPTP_MT_WB;
3818 +-
3819 +- eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3820 +-
3821 +- if (enable_ept_ad_bits &&
3822 +- (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
3823 +- eptp |= VMX_EPTP_AD_ENABLE_BIT;
3824 +- eptp |= (root_hpa & PAGE_MASK);
3825 +-
3826 +- return eptp;
3827 +-}
3828 +-
3829 +-void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
3830 +-{
3831 +- struct kvm *kvm = vcpu->kvm;
3832 +- bool update_guest_cr3 = true;
3833 +- unsigned long guest_cr3;
3834 +- u64 eptp;
3835 +-
3836 +- guest_cr3 = cr3;
3837 +- if (enable_ept) {
3838 +- eptp = construct_eptp(vcpu, cr3);
3839 +- vmcs_write64(EPT_POINTER, eptp);
3840 +-
3841 +- if (kvm_x86_ops->tlb_remote_flush) {
3842 +- spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
3843 +- to_vmx(vcpu)->ept_pointer = eptp;
3844 +- to_kvm_vmx(kvm)->ept_pointers_match
3845 +- = EPT_POINTERS_CHECK;
3846 +- spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
3847 +- }
3848 +-
3849 +- /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
3850 +- if (is_guest_mode(vcpu))
3851 +- update_guest_cr3 = false;
3852 +- else if (!enable_unrestricted_guest && !is_paging(vcpu))
3853 +- guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3854 +- else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3855 +- guest_cr3 = vcpu->arch.cr3;
3856 +- else /* vmcs01.GUEST_CR3 is already up-to-date. */
3857 +- update_guest_cr3 = false;
3858 +- ept_load_pdptrs(vcpu);
3859 +- }
3860 +-
3861 +- if (update_guest_cr3)
3862 +- vmcs_writel(GUEST_CR3, guest_cr3);
3863 +-}
3864 +-
3865 +-int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3866 +-{
3867 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
3868 +- /*
3869 +- * Pass through host's Machine Check Enable value to hw_cr4, which
3870 +- * is in force while we are in guest mode. Do not let guests control
3871 +- * this bit, even if host CR4.MCE == 0.
3872 +- */
3873 +- unsigned long hw_cr4;
3874 +-
3875 +- hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3876 +- if (enable_unrestricted_guest)
3877 +- hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3878 +- else if (vmx->rmode.vm86_active)
3879 +- hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3880 +- else
3881 +- hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
3882 +-
3883 +- if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
3884 +- if (cr4 & X86_CR4_UMIP) {
3885 +- secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3886 +- hw_cr4 &= ~X86_CR4_UMIP;
3887 +- } else if (!is_guest_mode(vcpu) ||
3888 +- !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3889 +- secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3890 +- }
3891 +- }
3892 +-
3893 +- if (cr4 & X86_CR4_VMXE) {
3894 +- /*
3895 +- * To use VMXON (and later other VMX instructions), a guest
3896 +- * must first be able to turn on cr4.VMXE (see handle_vmon()).
3897 +- * So basically the check on whether to allow nested VMX
3898 +- * is here. We operate under the default treatment of SMM,
3899 +- * so VMX cannot be enabled under SMM.
3900 +- */
3901 +- if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
3902 +- return 1;
3903 +- }
3904 +-
3905 +- if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3906 +- return 1;
3907 +-
3908 +- vcpu->arch.cr4 = cr4;
3909 +-
3910 +- if (!enable_unrestricted_guest) {
3911 +- if (enable_ept) {
3912 +- if (!is_paging(vcpu)) {
3913 +- hw_cr4 &= ~X86_CR4_PAE;
3914 +- hw_cr4 |= X86_CR4_PSE;
3915 +- } else if (!(cr4 & X86_CR4_PAE)) {
3916 +- hw_cr4 &= ~X86_CR4_PAE;
3917 +- }
3918 +- }
3919 +-
3920 +- /*
3921 +- * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3922 +- * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
3923 +- * to be manually disabled when guest switches to non-paging
3924 +- * mode.
3925 +- *
3926 +- * If !enable_unrestricted_guest, the CPU is always running
3927 +- * with CR0.PG=1 and CR4 needs to be modified.
3928 +- * If enable_unrestricted_guest, the CPU automatically
3929 +- * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3930 +- */
3931 +- if (!is_paging(vcpu))
3932 +- hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3933 +- }
3934 +-
3935 +- vmcs_writel(CR4_READ_SHADOW, cr4);
3936 +- vmcs_writel(GUEST_CR4, hw_cr4);
3937 +- return 0;
3938 +-}
3939 +-
3940 +-void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3941 +-{
3942 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
3943 +- u32 ar;
3944 +-
3945 +- if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3946 +- *var = vmx->rmode.segs[seg];
3947 +- if (seg == VCPU_SREG_TR
3948 +- || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3949 +- return;
3950 +- var->base = vmx_read_guest_seg_base(vmx, seg);
3951 +- var->selector = vmx_read_guest_seg_selector(vmx, seg);
3952 +- return;
3953 +- }
3954 +- var->base = vmx_read_guest_seg_base(vmx, seg);
3955 +- var->limit = vmx_read_guest_seg_limit(vmx, seg);
3956 +- var->selector = vmx_read_guest_seg_selector(vmx, seg);
3957 +- ar = vmx_read_guest_seg_ar(vmx, seg);
3958 +- var->unusable = (ar >> 16) & 1;
3959 +- var->type = ar & 15;
3960 +- var->s = (ar >> 4) & 1;
3961 +- var->dpl = (ar >> 5) & 3;
3962 +- /*
3963 +- * Some userspaces do not preserve unusable property. Since usable
3964 +- * segment has to be present according to VMX spec we can use present
3965 +- * property to amend userspace bug by making unusable segment always
3966 +- * nonpresent. vmx_segment_access_rights() already marks nonpresent
3967 +- * segment as unusable.
3968 +- */
3969 +- var->present = !var->unusable;
3970 +- var->avl = (ar >> 12) & 1;
3971 +- var->l = (ar >> 13) & 1;
3972 +- var->db = (ar >> 14) & 1;
3973 +- var->g = (ar >> 15) & 1;
3974 +-}
3975 +-
3976 +-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3977 +-{
3978 +- struct kvm_segment s;
3979 +-
3980 +- if (to_vmx(vcpu)->rmode.vm86_active) {
3981 +- vmx_get_segment(vcpu, &s, seg);
3982 +- return s.base;
3983 +- }
3984 +- return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3985 +-}
3986 +-
3987 +-int vmx_get_cpl(struct kvm_vcpu *vcpu)
3988 +-{
3989 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
3990 +-
3991 +- if (unlikely(vmx->rmode.vm86_active))
3992 +- return 0;
3993 +- else {
3994 +- int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3995 +- return VMX_AR_DPL(ar);
3996 +- }
3997 +-}
3998 +-
3999 +-static u32 vmx_segment_access_rights(struct kvm_segment *var)
4000 +-{
4001 +- u32 ar;
4002 +-
4003 +- if (var->unusable || !var->present)
4004 +- ar = 1 << 16;
4005 +- else {
4006 +- ar = var->type & 15;
4007 +- ar |= (var->s & 1) << 4;
4008 +- ar |= (var->dpl & 3) << 5;
4009 +- ar |= (var->present & 1) << 7;
4010 +- ar |= (var->avl & 1) << 12;
4011 +- ar |= (var->l & 1) << 13;
4012 +- ar |= (var->db & 1) << 14;
4013 +- ar |= (var->g & 1) << 15;
4014 +- }
4015 +-
4016 +- return ar;
4017 +-}
4018 +-
4019 +-void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
4020 +-{
4021 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
4022 +- const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4023 +-
4024 +- vmx_segment_cache_clear(vmx);
4025 +-
4026 +- if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
4027 +- vmx->rmode.segs[seg] = *var;
4028 +- if (seg == VCPU_SREG_TR)
4029 +- vmcs_write16(sf->selector, var->selector);
4030 +- else if (var->s)
4031 +- fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
4032 +- goto out;
4033 +- }
4034 +-
4035 +- vmcs_writel(sf->base, var->base);
4036 +- vmcs_write32(sf->limit, var->limit);
4037 +- vmcs_write16(sf->selector, var->selector);
4038 +-
4039 +- /*
4040 +- * Fix the "Accessed" bit in AR field of segment registers for older
4041 +- * qemu binaries.
4042 +- * IA32 arch specifies that at the time of processor reset the
4043 +- * "Accessed" bit in the AR field of segment registers is 1. And qemu
4044 +- * is setting it to 0 in the userland code. This causes invalid guest
4045 +- * state vmexit when "unrestricted guest" mode is turned on.
4046 +- * Fix for this setup issue in cpu_reset is being pushed in the qemu
4047 +- * tree. Newer qemu binaries with that qemu fix would not need this
4048 +- * kvm hack.
4049 +- */
4050 +- if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
4051 +- var->type |= 0x1; /* Accessed */
4052 +-
4053 +- vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
4054 +-
4055 +-out:
4056 +- vmx->emulation_required = emulation_required(vcpu);
4057 +-}
4058 +-
4059 +-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4060 +-{
4061 +- u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
4062 +-
4063 +- *db = (ar >> 14) & 1;
4064 +- *l = (ar >> 13) & 1;
4065 +-}
4066 +-
4067 +-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4068 +-{
4069 +- dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
4070 +- dt->address = vmcs_readl(GUEST_IDTR_BASE);
4071 +-}
4072 +-
4073 +-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4074 +-{
4075 +- vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
4076 +- vmcs_writel(GUEST_IDTR_BASE, dt->address);
4077 +-}
4078 +-
4079 +-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4080 +-{
4081 +- dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
4082 +- dt->address = vmcs_readl(GUEST_GDTR_BASE);
4083 +-}
4084 +-
4085 +-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4086 +-{
4087 +- vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
4088 +- vmcs_writel(GUEST_GDTR_BASE, dt->address);
4089 +-}
4090 +-
4091 +-static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
4092 +-{
4093 +- struct kvm_segment var;
4094 +- u32 ar;
4095 +-
4096 +- vmx_get_segment(vcpu, &var, seg);
4097 +- var.dpl = 0x3;
4098 +- if (seg == VCPU_SREG_CS)
4099 +- var.type = 0x3;
4100 +- ar = vmx_segment_access_rights(&var);
4101 +-
4102 +- if (var.base != (var.selector << 4))
4103 +- return false;
4104 +- if (var.limit != 0xffff)
4105 +- return false;
4106 +- if (ar != 0xf3)
4107 +- return false;
4108 +-
4109 +- return true;
4110 +-}
4111 +-
4112 +-static bool code_segment_valid(struct kvm_vcpu *vcpu)
4113 +-{
4114 +- struct kvm_segment cs;
4115 +- unsigned int cs_rpl;
4116 +-
4117 +- vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4118 +- cs_rpl = cs.selector & SEGMENT_RPL_MASK;
4119 +-
4120 +- if (cs.unusable)
4121 +- return false;
4122 +- if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
4123 +- return false;
4124 +- if (!cs.s)
4125 +- return false;
4126 +- if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
4127 +- if (cs.dpl > cs_rpl)
4128 +- return false;
4129 +- } else {
4130 +- if (cs.dpl != cs_rpl)
4131 +- return false;
4132 +- }
4133 +- if (!cs.present)
4134 +- return false;
4135 +-
4136 +- /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
4137 +- return true;
4138 +-}
4139 +-
4140 +-static bool stack_segment_valid(struct kvm_vcpu *vcpu)
4141 +-{
4142 +- struct kvm_segment ss;
4143 +- unsigned int ss_rpl;
4144 +-
4145 +- vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
4146 +- ss_rpl = ss.selector & SEGMENT_RPL_MASK;
4147 +-
4148 +- if (ss.unusable)
4149 +- return true;
4150 +- if (ss.type != 3 && ss.type != 7)
4151 +- return false;
4152 +- if (!ss.s)
4153 +- return false;
4154 +- if (ss.dpl != ss_rpl) /* DPL != RPL */
4155 +- return false;
4156 +- if (!ss.present)
4157 +- return false;
4158 +-
4159 +- return true;
4160 +-}
4161 +-
4162 +-static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
4163 +-{
4164 +- struct kvm_segment var;
4165 +- unsigned int rpl;
4166 +-
4167 +- vmx_get_segment(vcpu, &var, seg);
4168 +- rpl = var.selector & SEGMENT_RPL_MASK;
4169 +-
4170 +- if (var.unusable)
4171 +- return true;
4172 +- if (!var.s)
4173 +- return false;
4174 +- if (!var.present)
4175 +- return false;
4176 +- if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
4177 +- if (var.dpl < rpl) /* DPL < RPL */
4178 +- return false;
4179 +- }
4180 +-
4181 +- /* TODO: Add other members to kvm_segment_field to allow checking for other access
4182 +- * rights flags
4183 +- */
4184 +- return true;
4185 +-}
4186 +-
4187 +-static bool tr_valid(struct kvm_vcpu *vcpu)
4188 +-{
4189 +- struct kvm_segment tr;
4190 +-
4191 +- vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
4192 +-
4193 +- if (tr.unusable)
4194 +- return false;
4195 +- if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
4196 +- return false;
4197 +- if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
4198 +- return false;
4199 +- if (!tr.present)
4200 +- return false;
4201 +-
4202 +- return true;
4203 +-}
4204 +-
4205 +-static bool ldtr_valid(struct kvm_vcpu *vcpu)
4206 +-{
4207 +- struct kvm_segment ldtr;
4208 +-
4209 +- vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
4210 +-
4211 +- if (ldtr.unusable)
4212 +- return true;
4213 +- if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
4214 +- return false;
4215 +- if (ldtr.type != 2)
4216 +- return false;
4217 +- if (!ldtr.present)
4218 +- return false;
4219 +-
4220 +- return true;
4221 +-}
4222 +-
4223 +-static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
4224 +-{
4225 +- struct kvm_segment cs, ss;
4226 +-
4227 +- vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4228 +- vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
4229 +-
4230 +- return ((cs.selector & SEGMENT_RPL_MASK) ==
4231 +- (ss.selector & SEGMENT_RPL_MASK));
4232 +-}
4233 +-
4234 +-/*
4235 +- * Check if guest state is valid. Returns true if valid, false if
4236 +- * not.
4237 +- * We assume that registers are always usable
4238 +- */
4239 +-static bool guest_state_valid(struct kvm_vcpu *vcpu)
4240 +-{
4241 +- if (enable_unrestricted_guest)
4242 +- return true;
4243 +-
4244 +- /* real mode guest state checks */
4245 +- if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
4246 +- if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
4247 +- return false;
4248 +- if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
4249 +- return false;
4250 +- if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
4251 +- return false;
4252 +- if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
4253 +- return false;
4254 +- if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
4255 +- return false;
4256 +- if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
4257 +- return false;
4258 +- } else {
4259 +- /* protected mode guest state checks */
4260 +- if (!cs_ss_rpl_check(vcpu))
4261 +- return false;
4262 +- if (!code_segment_valid(vcpu))
4263 +- return false;
4264 +- if (!stack_segment_valid(vcpu))
4265 +- return false;
4266 +- if (!data_segment_valid(vcpu, VCPU_SREG_DS))
4267 +- return false;
4268 +- if (!data_segment_valid(vcpu, VCPU_SREG_ES))
4269 +- return false;
4270 +- if (!data_segment_valid(vcpu, VCPU_SREG_FS))
4271 +- return false;
4272 +- if (!data_segment_valid(vcpu, VCPU_SREG_GS))
4273 +- return false;
4274 +- if (!tr_valid(vcpu))
4275 +- return false;
4276 +- if (!ldtr_valid(vcpu))
4277 +- return false;
4278 +- }
4279 +- /* TODO:
4280 +- * - Add checks on RIP
4281 +- * - Add checks on RFLAGS
4282 +- */
4283 +-
4284 +- return true;
4285 +-}
4286 +-
4287 +-static int init_rmode_tss(struct kvm *kvm)
4288 +-{
4289 +- gfn_t fn;
4290 +- u16 data = 0;
4291 +- int idx, r;
4292 +-
4293 +- idx = srcu_read_lock(&kvm->srcu);
4294 +- fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
4295 +- r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4296 +- if (r < 0)
4297 +- goto out;
4298 +- data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
4299 +- r = kvm_write_guest_page(kvm, fn++, &data,
4300 +- TSS_IOPB_BASE_OFFSET, sizeof(u16));
4301 +- if (r < 0)
4302 +- goto out;
4303 +- r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
4304 +- if (r < 0)
4305 +- goto out;
4306 +- r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4307 +- if (r < 0)
4308 +- goto out;
4309 +- data = ~0;
4310 +- r = kvm_write_guest_page(kvm, fn, &data,
4311 +- RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
4312 +- sizeof(u8));
4313 +-out:
4314 +- srcu_read_unlock(&kvm->srcu, idx);
4315 +- return r;
4316 +-}
4317 +-
4318 +-static int init_rmode_identity_map(struct kvm *kvm)
4319 +-{
4320 +- struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4321 +- int i, idx, r = 0;
4322 +- kvm_pfn_t identity_map_pfn;
4323 +- u32 tmp;
4324 +-
4325 +- /* Protect kvm_vmx->ept_identity_pagetable_done. */
4326 +- mutex_lock(&kvm->slots_lock);
4327 +-
4328 +- if (likely(kvm_vmx->ept_identity_pagetable_done))
4329 +- goto out2;
4330 +-
4331 +- if (!kvm_vmx->ept_identity_map_addr)
4332 +- kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4333 +- identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
4334 +-
4335 +- r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
4336 +- kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
4337 +- if (r < 0)
4338 +- goto out2;
4339 +-
4340 +- idx = srcu_read_lock(&kvm->srcu);
4341 +- r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
4342 +- if (r < 0)
4343 +- goto out;
4344 +- /* Set up identity-mapping pagetable for EPT in real mode */
4345 +- for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
4346 +- tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
4347 +- _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
4348 +- r = kvm_write_guest_page(kvm, identity_map_pfn,
4349 +- &tmp, i * sizeof(tmp), sizeof(tmp));
4350 +- if (r < 0)
4351 +- goto out;
4352 +- }
4353 +- kvm_vmx->ept_identity_pagetable_done = true;
4354 +-
4355 +-out:
4356 +- srcu_read_unlock(&kvm->srcu, idx);
4357 +-
4358 +-out2:
4359 +- mutex_unlock(&kvm->slots_lock);
4360 +- return r;
4361 +-}
4362 +-
4363 +-static void seg_setup(int seg)
4364 +-{
4365 +- const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4366 +- unsigned int ar;
4367 +-
4368 +- vmcs_write16(sf->selector, 0);
4369 +- vmcs_writel(sf->base, 0);
4370 +- vmcs_write32(sf->limit, 0xffff);
4371 +- ar = 0x93;
4372 +- if (seg == VCPU_SREG_CS)
4373 +- ar |= 0x08; /* code segment */
4374 +-
4375 +- vmcs_write32(sf->ar_bytes, ar);
4376 +-}
4377 +-
4378 +-static int alloc_apic_access_page(struct kvm *kvm)
4379 +-{
4380 +- struct page *page;
4381 +- int r = 0;
4382 +-
4383 +- mutex_lock(&kvm->slots_lock);
4384 +- if (kvm->arch.apic_access_page_done)
4385 +- goto out;
4386 +- r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
4387 +- APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
4388 +- if (r)
4389 +- goto out;
4390 +-
4391 +- page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
4392 +- if (is_error_page(page)) {
4393 +- r = -EFAULT;
4394 +- goto out;
4395 +- }
4396 +-
4397 +- /*
4398 +- * Do not pin the page in memory, so that memory hot-unplug
4399 +- * is able to migrate it.
4400 +- */
4401 +- put_page(page);
4402 +- kvm->arch.apic_access_page_done = true;
4403 +-out:
4404 +- mutex_unlock(&kvm->slots_lock);
4405 +- return r;
4406 +-}
4407 +-
4408 +-int allocate_vpid(void)
4409 +-{
4410 +- int vpid;
4411 +-
4412 +- if (!enable_vpid)
4413 +- return 0;
4414 +- spin_lock(&vmx_vpid_lock);
4415 +- vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
4416 +- if (vpid < VMX_NR_VPIDS)
4417 +- __set_bit(vpid, vmx_vpid_bitmap);
4418 +- else
4419 +- vpid = 0;
4420 +- spin_unlock(&vmx_vpid_lock);
4421 +- return vpid;
4422 +-}
4423 +-
4424 +-void free_vpid(int vpid)
4425 +-{
4426 +- if (!enable_vpid || vpid == 0)
4427 +- return;
4428 +- spin_lock(&vmx_vpid_lock);
4429 +- __clear_bit(vpid, vmx_vpid_bitmap);
4430 +- spin_unlock(&vmx_vpid_lock);
4431 +-}
4432 +-
4433 +-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
4434 +- u32 msr, int type)
4435 +-{
4436 +- int f = sizeof(unsigned long);
4437 +-
4438 +- if (!cpu_has_vmx_msr_bitmap())
4439 +- return;
4440 +-
4441 +- if (static_branch_unlikely(&enable_evmcs))
4442 +- evmcs_touch_msr_bitmap();
4443 +-
4444 +- /*
4445 +- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4446 +- * have the write-low and read-high bitmap offsets the wrong way round.
4447 +- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4448 +- */
4449 +- if (msr <= 0x1fff) {
4450 +- if (type & MSR_TYPE_R)
4451 +- /* read-low */
4452 +- __clear_bit(msr, msr_bitmap + 0x000 / f);
4453 +-
4454 +- if (type & MSR_TYPE_W)
4455 +- /* write-low */
4456 +- __clear_bit(msr, msr_bitmap + 0x800 / f);
4457 +-
4458 +- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4459 +- msr &= 0x1fff;
4460 +- if (type & MSR_TYPE_R)
4461 +- /* read-high */
4462 +- __clear_bit(msr, msr_bitmap + 0x400 / f);
4463 +-
4464 +- if (type & MSR_TYPE_W)
4465 +- /* write-high */
4466 +- __clear_bit(msr, msr_bitmap + 0xc00 / f);
4467 +-
4468 +- }
4469 +-}
4470 +-
4471 +-static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
4472 +- u32 msr, int type)
4473 +-{
4474 +- int f = sizeof(unsigned long);
4475 +-
4476 +- if (!cpu_has_vmx_msr_bitmap())
4477 +- return;
4478 +-
4479 +- if (static_branch_unlikely(&enable_evmcs))
4480 +- evmcs_touch_msr_bitmap();
4481 +-
4482 +- /*
4483 +- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4484 +- * have the write-low and read-high bitmap offsets the wrong way round.
4485 +- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4486 +- */
4487 +- if (msr <= 0x1fff) {
4488 +- if (type & MSR_TYPE_R)
4489 +- /* read-low */
4490 +- __set_bit(msr, msr_bitmap + 0x000 / f);
4491 +-
4492 +- if (type & MSR_TYPE_W)
4493 +- /* write-low */
4494 +- __set_bit(msr, msr_bitmap + 0x800 / f);
4495 +-
4496 +- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4497 +- msr &= 0x1fff;
4498 +- if (type & MSR_TYPE_R)
4499 +- /* read-high */
4500 +- __set_bit(msr, msr_bitmap + 0x400 / f);
4501 +-
4502 +- if (type & MSR_TYPE_W)
4503 +- /* write-high */
4504 +- __set_bit(msr, msr_bitmap + 0xc00 / f);
4505 +-
4506 +- }
4507 +-}
4508 +-
4509 +-static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
4510 +- u32 msr, int type, bool value)
4511 +-{
4512 +- if (value)
4513 +- vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
4514 +- else
4515 +- vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
4516 +-}
4517 +-
4518 +-static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
4519 +-{
4520 +- u8 mode = 0;
4521 +-
4522 +- if (cpu_has_secondary_exec_ctrls() &&
4523 +- (secondary_exec_controls_get(to_vmx(vcpu)) &
4524 +- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
4525 +- mode |= MSR_BITMAP_MODE_X2APIC;
4526 +- if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
4527 +- mode |= MSR_BITMAP_MODE_X2APIC_APICV;
4528 +- }
4529 +-
4530 +- return mode;
4531 +-}
4532 +-
4533 +-static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
4534 +- u8 mode)
4535 +-{
4536 +- int msr;
4537 +-
4538 +- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
4539 +- unsigned word = msr / BITS_PER_LONG;
4540 +- msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
4541 +- msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
4542 +- }
4543 +-
4544 +- if (mode & MSR_BITMAP_MODE_X2APIC) {
4545 +- /*
4546 +- * TPR reads and writes can be virtualized even if virtual interrupt
4547 +- * delivery is not in use.
4548 +- */
4549 +- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
4550 +- if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
4551 +- vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
4552 +- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
4553 +- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
4554 +- }
4555 +- }
4556 +-}
4557 +-
4558 +-void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
4559 +-{
4560 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
4561 +- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
4562 +- u8 mode = vmx_msr_bitmap_mode(vcpu);
4563 +- u8 changed = mode ^ vmx->msr_bitmap_mode;
4564 +-
4565 +- if (!changed)
4566 +- return;
4567 +-
4568 +- if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
4569 +- vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
4570 +-
4571 +- vmx->msr_bitmap_mode = mode;
4572 +-}
4573 +-
4574 +-void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
4575 +-{
4576 +- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
4577 +- bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4578 +- u32 i;
4579 +-
4580 +- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
4581 +- MSR_TYPE_RW, flag);
4582 +- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
4583 +- MSR_TYPE_RW, flag);
4584 +- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
4585 +- MSR_TYPE_RW, flag);
4586 +- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
4587 +- MSR_TYPE_RW, flag);
4588 +- for (i = 0; i < vmx->pt_desc.addr_range; i++) {
4589 +- vmx_set_intercept_for_msr(msr_bitmap,
4590 +- MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
4591 +- vmx_set_intercept_for_msr(msr_bitmap,
4592 +- MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
4593 +- }
4594 +-}
4595 +-
4596 +-static bool vmx_get_enable_apicv(struct kvm *kvm)
4597 +-{
4598 +- return enable_apicv;
4599 +-}
4600 +-
4601 +-static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
4602 +-{
4603 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
4604 +- void *vapic_page;
4605 +- u32 vppr;
4606 +- int rvi;
4607 +-
4608 +- if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
4609 +- !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
4610 +- WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
4611 +- return false;
4612 +-
4613 +- rvi = vmx_get_rvi();
4614 +-
4615 +- vapic_page = vmx->nested.virtual_apic_map.hva;
4616 +- vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
4617 +-
4618 +- return ((rvi & 0xf0) > (vppr & 0xf0));
4619 +-}
4620 +-
4621 +-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
4622 +- bool nested)
4623 +-{
4624 +-#ifdef CONFIG_SMP
4625 +- int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
4626 +-
4627 +- if (vcpu->mode == IN_GUEST_MODE) {
4628 +- /*
4629 +- * The vector of interrupt to be delivered to vcpu had
4630 +- * been set in PIR before this function.
4631 +- *
4632 +- * Following cases will be reached in this block, and
4633 +- * we always send a notification event in all cases as
4634 +- * explained below.
4635 +- *
4636 +- * Case 1: vcpu keeps in non-root mode. Sending a
4637 +- * notification event posts the interrupt to vcpu.
4638 +- *
4639 +- * Case 2: vcpu exits to root mode and is still
4640 +- * runnable. PIR will be synced to vIRR before the
4641 +- * next vcpu entry. Sending a notification event in
4642 +- * this case has no effect, as vcpu is not in root
4643 +- * mode.
4644 +- *
4645 +- * Case 3: vcpu exits to root mode and is blocked.
4646 +- * vcpu_block() has already synced PIR to vIRR and
4647 +- * never blocks vcpu if vIRR is not cleared. Therefore,
4648 +- * a blocked vcpu here does not wait for any requested
4649 +- * interrupts in PIR, and sending a notification event
4650 +- * which has no effect is safe here.
4651 +- */
4652 +-
4653 +- apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
4654 +- return true;
4655 +- }
4656 +-#endif
4657 +- return false;
4658 +-}
4659 +-
4660 +-static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4661 +- int vector)
4662 +-{
4663 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
4664 +-
4665 +- if (is_guest_mode(vcpu) &&
4666 +- vector == vmx->nested.posted_intr_nv) {
4667 +- /*
4668 +- * If a posted intr is not recognized by hardware,
4669 +- * we will accomplish it in the next vmentry.
4670 +- */
4671 +- vmx->nested.pi_pending = true;
4672 +- kvm_make_request(KVM_REQ_EVENT, vcpu);
4673 +- /* the PIR and ON have been set by L1. */
4674 +- if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
4675 +- kvm_vcpu_kick(vcpu);
4676 +- return 0;
4677 +- }
4678 +- return -1;
4679 +-}
4680 +-/*
4681 +- * Send interrupt to vcpu via posted interrupt way.
4682 +- * 1. If target vcpu is running(non-root mode), send posted interrupt
4683 +- * notification to vcpu and hardware will sync PIR to vIRR atomically.
4684 +- * 2. If target vcpu isn't running(root mode), kick it to pick up the
4685 +- * interrupt from PIR in next vmentry.
4686 +- */
4687 +-static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4688 +-{
4689 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
4690 +- int r;
4691 +-
4692 +- r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4693 +- if (!r)
4694 +- return;
4695 +-
4696 +- if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4697 +- return;
4698 +-
4699 +- /* If a previous notification has sent the IPI, nothing to do. */
4700 +- if (pi_test_and_set_on(&vmx->pi_desc))
4701 +- return;
4702 +-
4703 +- if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
4704 +- kvm_vcpu_kick(vcpu);
4705 +-}
4706 +-
4707 +-/*
4708 +- * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4709 +- * will not change in the lifetime of the guest.
4710 +- * Note that host-state that does change is set elsewhere. E.g., host-state
4711 +- * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4712 +- */
4713 +-void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4714 +-{
4715 +- u32 low32, high32;
4716 +- unsigned long tmpl;
4717 +- unsigned long cr0, cr3, cr4;
4718 +-
4719 +- cr0 = read_cr0();
4720 +- WARN_ON(cr0 & X86_CR0_TS);
4721 +- vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
4722 +-
4723 +- /*
4724 +- * Save the most likely value for this task's CR3 in the VMCS.
4725 +- * We can't use __get_current_cr3_fast() because we're not atomic.
4726 +- */
4727 +- cr3 = __read_cr3();
4728 +- vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
4729 +- vmx->loaded_vmcs->host_state.cr3 = cr3;
4730 +-
4731 +- /* Save the most likely value for this task's CR4 in the VMCS. */
4732 +- cr4 = cr4_read_shadow();
4733 +- vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
4734 +- vmx->loaded_vmcs->host_state.cr4 = cr4;
4735 +-
4736 +- vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
4737 +-#ifdef CONFIG_X86_64
4738 +- /*
4739 +- * Load null selectors, so we can avoid reloading them in
4740 +- * vmx_prepare_switch_to_host(), in case userspace uses
4741 +- * the null selectors too (the expected case).
4742 +- */
4743 +- vmcs_write16(HOST_DS_SELECTOR, 0);
4744 +- vmcs_write16(HOST_ES_SELECTOR, 0);
4745 +-#else
4746 +- vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4747 +- vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4748 +-#endif
4749 +- vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4750 +- vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
4751 +-
4752 +- vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */
4753 +-
4754 +- vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
4755 +-
4756 +- rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4757 +- vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4758 +- rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4759 +- vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
4760 +-
4761 +- if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4762 +- rdmsr(MSR_IA32_CR_PAT, low32, high32);
4763 +- vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4764 +- }
4765 +-
4766 +- if (cpu_has_load_ia32_efer())
4767 +- vmcs_write64(HOST_IA32_EFER, host_efer);
4768 +-}
4769 +-
4770 +-void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4771 +-{
4772 +- vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
4773 +- if (enable_ept)
4774 +- vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
4775 +- if (is_guest_mode(&vmx->vcpu))
4776 +- vmx->vcpu.arch.cr4_guest_owned_bits &=
4777 +- ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
4778 +- vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
4779 +-}
4780 +-
4781 +-u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4782 +-{
4783 +- u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4784 +-
4785 +- if (!kvm_vcpu_apicv_active(&vmx->vcpu))
4786 +- pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4787 +-
4788 +- if (!enable_vnmi)
4789 +- pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4790 +-
4791 +- if (!enable_preemption_timer)
4792 +- pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4793 +-
4794 +- return pin_based_exec_ctrl;
4795 +-}
4796 +-
4797 +-static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4798 +-{
4799 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
4800 +-
4801 +- pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4802 +- if (cpu_has_secondary_exec_ctrls()) {
4803 +- if (kvm_vcpu_apicv_active(vcpu))
4804 +- secondary_exec_controls_setbit(vmx,
4805 +- SECONDARY_EXEC_APIC_REGISTER_VIRT |
4806 +- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4807 +- else
4808 +- secondary_exec_controls_clearbit(vmx,
4809 +- SECONDARY_EXEC_APIC_REGISTER_VIRT |
4810 +- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4811 +- }
4812 +-
4813 +- if (cpu_has_vmx_msr_bitmap())
4814 +- vmx_update_msr_bitmap(vcpu);
4815 +-}
4816 +-
4817 +-u32 vmx_exec_control(struct vcpu_vmx *vmx)
4818 +-{
4819 +- u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4820 +-
4821 +- if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4822 +- exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4823 +-
4824 +- if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
4825 +- exec_control &= ~CPU_BASED_TPR_SHADOW;
4826 +-#ifdef CONFIG_X86_64
4827 +- exec_control |= CPU_BASED_CR8_STORE_EXITING |
4828 +- CPU_BASED_CR8_LOAD_EXITING;
4829 +-#endif
4830 +- }
4831 +- if (!enable_ept)
4832 +- exec_control |= CPU_BASED_CR3_STORE_EXITING |
4833 +- CPU_BASED_CR3_LOAD_EXITING |
4834 +- CPU_BASED_INVLPG_EXITING;
4835 +- if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4836 +- exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4837 +- CPU_BASED_MONITOR_EXITING);
4838 +- if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4839 +- exec_control &= ~CPU_BASED_HLT_EXITING;
4840 +- return exec_control;
4841 +-}
4842 +-
4843 +-
4844 +-static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
4845 +-{
4846 +- struct kvm_vcpu *vcpu = &vmx->vcpu;
4847 +-
4848 +- u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4849 +-
4850 +- if (pt_mode == PT_MODE_SYSTEM)
4851 +- exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
4852 +- if (!cpu_need_virtualize_apic_accesses(vcpu))
4853 +- exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4854 +- if (vmx->vpid == 0)
4855 +- exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4856 +- if (!enable_ept) {
4857 +- exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4858 +- enable_unrestricted_guest = 0;
4859 +- }
4860 +- if (!enable_unrestricted_guest)
4861 +- exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4862 +- if (kvm_pause_in_guest(vmx->vcpu.kvm))
4863 +- exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4864 +- if (!kvm_vcpu_apicv_active(vcpu))
4865 +- exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4866 +- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4867 +- exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4868 +-
4869 +- /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4870 +- * in vmx_set_cr4. */
4871 +- exec_control &= ~SECONDARY_EXEC_DESC;
4872 +-
4873 +- /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4874 +- (handle_vmptrld).
4875 +- We can NOT enable shadow_vmcs here because we don't have yet
4876 +- a current VMCS12
4877 +- */
4878 +- exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4879 +-
4880 +- if (!enable_pml)
4881 +- exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4882 +-
4883 +- if (vmx_xsaves_supported()) {
4884 +- /* Exposing XSAVES only when XSAVE is exposed */
4885 +- bool xsaves_enabled =
4886 +- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4887 +- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
4888 +-
4889 +- vcpu->arch.xsaves_enabled = xsaves_enabled;
4890 +-
4891 +- if (!xsaves_enabled)
4892 +- exec_control &= ~SECONDARY_EXEC_XSAVES;
4893 +-
4894 +- if (nested) {
4895 +- if (xsaves_enabled)
4896 +- vmx->nested.msrs.secondary_ctls_high |=
4897 +- SECONDARY_EXEC_XSAVES;
4898 +- else
4899 +- vmx->nested.msrs.secondary_ctls_high &=
4900 +- ~SECONDARY_EXEC_XSAVES;
4901 +- }
4902 +- }
4903 +-
4904 +- if (vmx_rdtscp_supported()) {
4905 +- bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
4906 +- if (!rdtscp_enabled)
4907 +- exec_control &= ~SECONDARY_EXEC_RDTSCP;
4908 +-
4909 +- if (nested) {
4910 +- if (rdtscp_enabled)
4911 +- vmx->nested.msrs.secondary_ctls_high |=
4912 +- SECONDARY_EXEC_RDTSCP;
4913 +- else
4914 +- vmx->nested.msrs.secondary_ctls_high &=
4915 +- ~SECONDARY_EXEC_RDTSCP;
4916 +- }
4917 +- }
4918 +-
4919 +- if (vmx_invpcid_supported()) {
4920 +- /* Exposing INVPCID only when PCID is exposed */
4921 +- bool invpcid_enabled =
4922 +- guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
4923 +- guest_cpuid_has(vcpu, X86_FEATURE_PCID);
4924 +-
4925 +- if (!invpcid_enabled) {
4926 +- exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
4927 +- guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
4928 +- }
4929 +-
4930 +- if (nested) {
4931 +- if (invpcid_enabled)
4932 +- vmx->nested.msrs.secondary_ctls_high |=
4933 +- SECONDARY_EXEC_ENABLE_INVPCID;
4934 +- else
4935 +- vmx->nested.msrs.secondary_ctls_high &=
4936 +- ~SECONDARY_EXEC_ENABLE_INVPCID;
4937 +- }
4938 +- }
4939 +-
4940 +- if (vmx_rdrand_supported()) {
4941 +- bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
4942 +- if (rdrand_enabled)
4943 +- exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
4944 +-
4945 +- if (nested) {
4946 +- if (rdrand_enabled)
4947 +- vmx->nested.msrs.secondary_ctls_high |=
4948 +- SECONDARY_EXEC_RDRAND_EXITING;
4949 +- else
4950 +- vmx->nested.msrs.secondary_ctls_high &=
4951 +- ~SECONDARY_EXEC_RDRAND_EXITING;
4952 +- }
4953 +- }
4954 +-
4955 +- if (vmx_rdseed_supported()) {
4956 +- bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
4957 +- if (rdseed_enabled)
4958 +- exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
4959 +-
4960 +- if (nested) {
4961 +- if (rdseed_enabled)
4962 +- vmx->nested.msrs.secondary_ctls_high |=
4963 +- SECONDARY_EXEC_RDSEED_EXITING;
4964 +- else
4965 +- vmx->nested.msrs.secondary_ctls_high &=
4966 +- ~SECONDARY_EXEC_RDSEED_EXITING;
4967 +- }
4968 +- }
4969 +-
4970 +- if (vmx_waitpkg_supported()) {
4971 +- bool waitpkg_enabled =
4972 +- guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG);
4973 +-
4974 +- if (!waitpkg_enabled)
4975 +- exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
4976 +-
4977 +- if (nested) {
4978 +- if (waitpkg_enabled)
4979 +- vmx->nested.msrs.secondary_ctls_high |=
4980 +- SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
4981 +- else
4982 +- vmx->nested.msrs.secondary_ctls_high &=
4983 +- ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
4984 +- }
4985 +- }
4986 +-
4987 +- vmx->secondary_exec_control = exec_control;
4988 +-}
4989 +-
4990 +-static void ept_set_mmio_spte_mask(void)
4991 +-{
4992 +- /*
4993 +- * EPT Misconfigurations can be generated if the value of bits 2:0
4994 +- * of an EPT paging-structure entry is 110b (write/execute).
4995 +- */
4996 +- kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
4997 +- VMX_EPT_MISCONFIG_WX_VALUE, 0);
4998 +-}
4999 +-
5000 +-#define VMX_XSS_EXIT_BITMAP 0
5001 +-
5002 +-/*
5003 +- * Noting that the initialization of Guest-state Area of VMCS is in
5004 +- * vmx_vcpu_reset().
5005 +- */
5006 +-static void init_vmcs(struct vcpu_vmx *vmx)
5007 +-{
5008 +- if (nested)
5009 +- nested_vmx_set_vmcs_shadowing_bitmap();
5010 +-
5011 +- if (cpu_has_vmx_msr_bitmap())
5012 +- vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
5013 +-
5014 +- vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
5015 +-
5016 +- /* Control */
5017 +- pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
5018 +-
5019 +- exec_controls_set(vmx, vmx_exec_control(vmx));
5020 +-
5021 +- if (cpu_has_secondary_exec_ctrls()) {
5022 +- vmx_compute_secondary_exec_control(vmx);
5023 +- secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
5024 +- }
5025 +-
5026 +- if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
5027 +- vmcs_write64(EOI_EXIT_BITMAP0, 0);
5028 +- vmcs_write64(EOI_EXIT_BITMAP1, 0);
5029 +- vmcs_write64(EOI_EXIT_BITMAP2, 0);
5030 +- vmcs_write64(EOI_EXIT_BITMAP3, 0);
5031 +-
5032 +- vmcs_write16(GUEST_INTR_STATUS, 0);
5033 +-
5034 +- vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
5035 +- vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
5036 +- }
5037 +-
5038 +- if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
5039 +- vmcs_write32(PLE_GAP, ple_gap);
5040 +- vmx->ple_window = ple_window;
5041 +- vmx->ple_window_dirty = true;
5042 +- }
5043 +-
5044 +- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
5045 +- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
5046 +- vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
5047 +-
5048 +- vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
5049 +- vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
5050 +- vmx_set_constant_host_state(vmx);
5051 +- vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
5052 +- vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
5053 +-
5054 +- if (cpu_has_vmx_vmfunc())
5055 +- vmcs_write64(VM_FUNCTION_CONTROL, 0);
5056 +-
5057 +- vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
5058 +- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
5059 +- vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
5060 +- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
5061 +- vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
5062 +-
5063 +- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
5064 +- vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
5065 +-
5066 +- vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
5067 +-
5068 +- /* 22.2.1, 20.8.1 */
5069 +- vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
5070 +-
5071 +- vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
5072 +- vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
5073 +-
5074 +- set_cr4_guest_host_mask(vmx);
5075 +-
5076 +- if (vmx->vpid != 0)
5077 +- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
5078 +-
5079 +- if (vmx_xsaves_supported())
5080 +- vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
5081 +-
5082 +- if (enable_pml) {
5083 +- vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
5084 +- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
5085 +- }
5086 +-
5087 +- if (cpu_has_vmx_encls_vmexit())
5088 +- vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
5089 +-
5090 +- if (pt_mode == PT_MODE_HOST_GUEST) {
5091 +- memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
5092 +- /* Bit[6~0] are forced to 1, writes are ignored. */
5093 +- vmx->pt_desc.guest.output_mask = 0x7F;
5094 +- vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
5095 +- }
5096 +-}
5097 +-
5098 +-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
5099 +-{
5100 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
5101 +- struct msr_data apic_base_msr;
5102 +- u64 cr0;
5103 +-
5104 +- vmx->rmode.vm86_active = 0;
5105 +- vmx->spec_ctrl = 0;
5106 +-
5107 +- vmx->msr_ia32_umwait_control = 0;
5108 +-
5109 +- vcpu->arch.microcode_version = 0x100000000ULL;
5110 +- vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
5111 +- vmx->hv_deadline_tsc = -1;
5112 +- kvm_set_cr8(vcpu, 0);
5113 +-
5114 +- if (!init_event) {
5115 +- apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
5116 +- MSR_IA32_APICBASE_ENABLE;
5117 +- if (kvm_vcpu_is_reset_bsp(vcpu))
5118 +- apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
5119 +- apic_base_msr.host_initiated = true;
5120 +- kvm_set_apic_base(vcpu, &apic_base_msr);
5121 +- }
5122 +-
5123 +- vmx_segment_cache_clear(vmx);
5124 +-
5125 +- seg_setup(VCPU_SREG_CS);
5126 +- vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
5127 +- vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
5128 +-
5129 +- seg_setup(VCPU_SREG_DS);
5130 +- seg_setup(VCPU_SREG_ES);
5131 +- seg_setup(VCPU_SREG_FS);
5132 +- seg_setup(VCPU_SREG_GS);
5133 +- seg_setup(VCPU_SREG_SS);
5134 +-
5135 +- vmcs_write16(GUEST_TR_SELECTOR, 0);
5136 +- vmcs_writel(GUEST_TR_BASE, 0);
5137 +- vmcs_write32(GUEST_TR_LIMIT, 0xffff);
5138 +- vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
5139 +-
5140 +- vmcs_write16(GUEST_LDTR_SELECTOR, 0);
5141 +- vmcs_writel(GUEST_LDTR_BASE, 0);
5142 +- vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
5143 +- vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
5144 +-
5145 +- if (!init_event) {
5146 +- vmcs_write32(GUEST_SYSENTER_CS, 0);
5147 +- vmcs_writel(GUEST_SYSENTER_ESP, 0);
5148 +- vmcs_writel(GUEST_SYSENTER_EIP, 0);
5149 +- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
5150 +- }
5151 +-
5152 +- kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
5153 +- kvm_rip_write(vcpu, 0xfff0);
5154 +-
5155 +- vmcs_writel(GUEST_GDTR_BASE, 0);
5156 +- vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
5157 +-
5158 +- vmcs_writel(GUEST_IDTR_BASE, 0);
5159 +- vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
5160 +-
5161 +- vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
5162 +- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
5163 +- vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
5164 +- if (kvm_mpx_supported())
5165 +- vmcs_write64(GUEST_BNDCFGS, 0);
5166 +-
5167 +- setup_msrs(vmx);
5168 +-
5169 +- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
5170 +-
5171 +- if (cpu_has_vmx_tpr_shadow() && !init_event) {
5172 +- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
5173 +- if (cpu_need_tpr_shadow(vcpu))
5174 +- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
5175 +- __pa(vcpu->arch.apic->regs));
5176 +- vmcs_write32(TPR_THRESHOLD, 0);
5177 +- }
5178 +-
5179 +- kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
5180 +-
5181 +- cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
5182 +- vmx->vcpu.arch.cr0 = cr0;
5183 +- vmx_set_cr0(vcpu, cr0); /* enter rmode */
5184 +- vmx_set_cr4(vcpu, 0);
5185 +- vmx_set_efer(vcpu, 0);
5186 +-
5187 +- update_exception_bitmap(vcpu);
5188 +-
5189 +- vpid_sync_context(vmx->vpid);
5190 +- if (init_event)
5191 +- vmx_clear_hlt(vcpu);
5192 +-}
5193 +-
5194 +-static void enable_irq_window(struct kvm_vcpu *vcpu)
5195 +-{
5196 +- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5197 +-}
5198 +-
5199 +-static void enable_nmi_window(struct kvm_vcpu *vcpu)
5200 +-{
5201 +- if (!enable_vnmi ||
5202 +- vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
5203 +- enable_irq_window(vcpu);
5204 +- return;
5205 +- }
5206 +-
5207 +- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5208 +-}
5209 +-
5210 +-static void vmx_inject_irq(struct kvm_vcpu *vcpu)
5211 +-{
5212 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
5213 +- uint32_t intr;
5214 +- int irq = vcpu->arch.interrupt.nr;
5215 +-
5216 +- trace_kvm_inj_virq(irq);
5217 +-
5218 +- ++vcpu->stat.irq_injections;
5219 +- if (vmx->rmode.vm86_active) {
5220 +- int inc_eip = 0;
5221 +- if (vcpu->arch.interrupt.soft)
5222 +- inc_eip = vcpu->arch.event_exit_inst_len;
5223 +- kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
5224 +- return;
5225 +- }
5226 +- intr = irq | INTR_INFO_VALID_MASK;
5227 +- if (vcpu->arch.interrupt.soft) {
5228 +- intr |= INTR_TYPE_SOFT_INTR;
5229 +- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
5230 +- vmx->vcpu.arch.event_exit_inst_len);
5231 +- } else
5232 +- intr |= INTR_TYPE_EXT_INTR;
5233 +- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
5234 +-
5235 +- vmx_clear_hlt(vcpu);
5236 +-}
5237 +-
5238 +-static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
5239 +-{
5240 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
5241 +-
5242 +- if (!enable_vnmi) {
5243 +- /*
5244 +- * Tracking the NMI-blocked state in software is built upon
5245 +- * finding the next open IRQ window. This, in turn, depends on
5246 +- * well-behaving guests: They have to keep IRQs disabled at
5247 +- * least as long as the NMI handler runs. Otherwise we may
5248 +- * cause NMI nesting, maybe breaking the guest. But as this is
5249 +- * highly unlikely, we can live with the residual risk.
5250 +- */
5251 +- vmx->loaded_vmcs->soft_vnmi_blocked = 1;
5252 +- vmx->loaded_vmcs->vnmi_blocked_time = 0;
5253 +- }
5254 +-
5255 +- ++vcpu->stat.nmi_injections;
5256 +- vmx->loaded_vmcs->nmi_known_unmasked = false;
5257 +-
5258 +- if (vmx->rmode.vm86_active) {
5259 +- kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
5260 +- return;
5261 +- }
5262 +-
5263 +- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
5264 +- INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
5265 +-
5266 +- vmx_clear_hlt(vcpu);
5267 +-}
5268 +-
5269 +-bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
5270 +-{
5271 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
5272 +- bool masked;
5273 +-
5274 +- if (!enable_vnmi)
5275 +- return vmx->loaded_vmcs->soft_vnmi_blocked;
5276 +- if (vmx->loaded_vmcs->nmi_known_unmasked)
5277 +- return false;
5278 +- masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
5279 +- vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5280 +- return masked;
5281 +-}
5282 +-
5283 +-void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
5284 +-{
5285 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
5286 +-
5287 +- if (!enable_vnmi) {
5288 +- if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
5289 +- vmx->loaded_vmcs->soft_vnmi_blocked = masked;
5290 +- vmx->loaded_vmcs->vnmi_blocked_time = 0;
5291 +- }
5292 +- } else {
5293 +- vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5294 +- if (masked)
5295 +- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5296 +- GUEST_INTR_STATE_NMI);
5297 +- else
5298 +- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
5299 +- GUEST_INTR_STATE_NMI);
5300 +- }
5301 +-}
5302 +-
5303 +-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
5304 +-{
5305 +- if (to_vmx(vcpu)->nested.nested_run_pending)
5306 +- return 0;
5307 +-
5308 +- if (!enable_vnmi &&
5309 +- to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5310 +- return 0;
5311 +-
5312 +- return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5313 +- (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
5314 +- | GUEST_INTR_STATE_NMI));
5315 +-}
5316 +-
5317 +-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
5318 +-{
5319 +- return (!to_vmx(vcpu)->nested.nested_run_pending &&
5320 +- vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
5321 +- !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5322 +- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
5323 +-}
5324 +-
5325 +-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
5326 +-{
5327 +- int ret;
5328 +-
5329 +- if (enable_unrestricted_guest)
5330 +- return 0;
5331 +-
5332 +- ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
5333 +- PAGE_SIZE * 3);
5334 +- if (ret)
5335 +- return ret;
5336 +- to_kvm_vmx(kvm)->tss_addr = addr;
5337 +- return init_rmode_tss(kvm);
5338 +-}
5339 +-
5340 +-static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5341 +-{
5342 +- to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
5343 +- return 0;
5344 +-}
5345 +-
5346 +-static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
5347 +-{
5348 +- switch (vec) {
5349 +- case BP_VECTOR:
5350 +- /*
5351 +- * Update instruction length as we may reinject the exception
5352 +- * from user space while in guest debugging mode.
5353 +- */
5354 +- to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5355 +- vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5356 +- if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5357 +- return false;
5358 +- /* fall through */
5359 +- case DB_VECTOR:
5360 +- if (vcpu->guest_debug &
5361 +- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5362 +- return false;
5363 +- /* fall through */
5364 +- case DE_VECTOR:
5365 +- case OF_VECTOR:
5366 +- case BR_VECTOR:
5367 +- case UD_VECTOR:
5368 +- case DF_VECTOR:
5369 +- case SS_VECTOR:
5370 +- case GP_VECTOR:
5371 +- case MF_VECTOR:
5372 +- return true;
5373 +- break;
5374 +- }
5375 +- return false;
5376 +-}
5377 +-
5378 +-static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5379 +- int vec, u32 err_code)
5380 +-{
5381 +- /*
5382 +- * Instruction with address size override prefix opcode 0x67
5383 +- * Cause the #SS fault with 0 error code in VM86 mode.
5384 +- */
5385 +- if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
5386 +- if (kvm_emulate_instruction(vcpu, 0)) {
5387 +- if (vcpu->arch.halt_request) {
5388 +- vcpu->arch.halt_request = 0;
5389 +- return kvm_vcpu_halt(vcpu);
5390 +- }
5391 +- return 1;
5392 +- }
5393 +- return 0;
5394 +- }
5395 +-
5396 +- /*
5397 +- * Forward all other exceptions that are valid in real mode.
5398 +- * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5399 +- * the required debugging infrastructure rework.
5400 +- */
5401 +- kvm_queue_exception(vcpu, vec);
5402 +- return 1;
5403 +-}
5404 +-
5405 +-/*
5406 +- * Trigger machine check on the host. We assume all the MSRs are already set up
5407 +- * by the CPU and that we still run on the same CPU as the MCE occurred on.
5408 +- * We pass a fake environment to the machine check handler because we want
5409 +- * the guest to be always treated like user space, no matter what context
5410 +- * it used internally.
5411 +- */
5412 +-static void kvm_machine_check(void)
5413 +-{
5414 +-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
5415 +- struct pt_regs regs = {
5416 +- .cs = 3, /* Fake ring 3 no matter what the guest ran on */
5417 +- .flags = X86_EFLAGS_IF,
5418 +- };
5419 +-
5420 +- do_machine_check(&regs, 0);
5421 +-#endif
5422 +-}
5423 +-
5424 +-static int handle_machine_check(struct kvm_vcpu *vcpu)
5425 +-{
5426 +- /* handled by vmx_vcpu_run() */
5427 +- return 1;
5428 +-}
5429 +-
5430 +-static int handle_exception_nmi(struct kvm_vcpu *vcpu)
5431 +-{
5432 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
5433 +- struct kvm_run *kvm_run = vcpu->run;
5434 +- u32 intr_info, ex_no, error_code;
5435 +- unsigned long cr2, rip, dr6;
5436 +- u32 vect_info;
5437 +-
5438 +- vect_info = vmx->idt_vectoring_info;
5439 +- intr_info = vmx->exit_intr_info;
5440 +-
5441 +- if (is_machine_check(intr_info) || is_nmi(intr_info))
5442 +- return 1; /* handled by handle_exception_nmi_irqoff() */
5443 +-
5444 +- if (is_invalid_opcode(intr_info))
5445 +- return handle_ud(vcpu);
5446 +-
5447 +- error_code = 0;
5448 +- if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
5449 +- error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5450 +-
5451 +- if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
5452 +- WARN_ON_ONCE(!enable_vmware_backdoor);
5453 +-
5454 +- /*
5455 +- * VMware backdoor emulation on #GP interception only handles
5456 +- * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
5457 +- * error code on #GP.
5458 +- */
5459 +- if (error_code) {
5460 +- kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
5461 +- return 1;
5462 +- }
5463 +- return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
5464 +- }
5465 +-
5466 +- /*
5467 +- * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
5468 +- * MMIO, it is better to report an internal error.
5469 +- * See the comments in vmx_handle_exit.
5470 +- */
5471 +- if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5472 +- !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5473 +- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5474 +- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5475 +- vcpu->run->internal.ndata = 3;
5476 +- vcpu->run->internal.data[0] = vect_info;
5477 +- vcpu->run->internal.data[1] = intr_info;
5478 +- vcpu->run->internal.data[2] = error_code;
5479 +- return 0;
5480 +- }
5481 +-
5482 +- if (is_page_fault(intr_info)) {
5483 +- cr2 = vmcs_readl(EXIT_QUALIFICATION);
5484 +- /* EPT won't cause page fault directly */
5485 +- WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
5486 +- return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
5487 +- }
5488 +-
5489 +- ex_no = intr_info & INTR_INFO_VECTOR_MASK;
5490 +-
5491 +- if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
5492 +- return handle_rmode_exception(vcpu, ex_no, error_code);
5493 +-
5494 +- switch (ex_no) {
5495 +- case AC_VECTOR:
5496 +- kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5497 +- return 1;
5498 +- case DB_VECTOR:
5499 +- dr6 = vmcs_readl(EXIT_QUALIFICATION);
5500 +- if (!(vcpu->guest_debug &
5501 +- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
5502 +- vcpu->arch.dr6 &= ~DR_TRAP_BITS;
5503 +- vcpu->arch.dr6 |= dr6 | DR6_RTM;
5504 +- if (is_icebp(intr_info))
5505 +- WARN_ON(!skip_emulated_instruction(vcpu));
5506 +-
5507 +- kvm_queue_exception(vcpu, DB_VECTOR);
5508 +- return 1;
5509 +- }
5510 +- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
5511 +- kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
5512 +- /* fall through */
5513 +- case BP_VECTOR:
5514 +- /*
5515 +- * Update instruction length as we may reinject #BP from
5516 +- * user space while in guest debugging mode. Reading it for
5517 +- * #DB as well causes no harm, it is not used in that case.
5518 +- */
5519 +- vmx->vcpu.arch.event_exit_inst_len =
5520 +- vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5521 +- kvm_run->exit_reason = KVM_EXIT_DEBUG;
5522 +- rip = kvm_rip_read(vcpu);
5523 +- kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
5524 +- kvm_run->debug.arch.exception = ex_no;
5525 +- break;
5526 +- default:
5527 +- kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5528 +- kvm_run->ex.exception = ex_no;
5529 +- kvm_run->ex.error_code = error_code;
5530 +- break;
5531 +- }
5532 +- return 0;
5533 +-}
5534 +-
5535 +-static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
5536 +-{
5537 +- ++vcpu->stat.irq_exits;
5538 +- return 1;
5539 +-}
5540 +-
5541 +-static int handle_triple_fault(struct kvm_vcpu *vcpu)
5542 +-{
5543 +- vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5544 +- vcpu->mmio_needed = 0;
5545 +- return 0;
5546 +-}
5547 +-
5548 +-static int handle_io(struct kvm_vcpu *vcpu)
5549 +-{
5550 +- unsigned long exit_qualification;
5551 +- int size, in, string;
5552 +- unsigned port;
5553 +-
5554 +- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5555 +- string = (exit_qualification & 16) != 0;
5556 +-
5557 +- ++vcpu->stat.io_exits;
5558 +-
5559 +- if (string)
5560 +- return kvm_emulate_instruction(vcpu, 0);
5561 +-
5562 +- port = exit_qualification >> 16;
5563 +- size = (exit_qualification & 7) + 1;
5564 +- in = (exit_qualification & 8) != 0;
5565 +-
5566 +- return kvm_fast_pio(vcpu, size, port, in);
5567 +-}
5568 +-
5569 +-static void
5570 +-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5571 +-{
5572 +- /*
5573 +- * Patch in the VMCALL instruction:
5574 +- */
5575 +- hypercall[0] = 0x0f;
5576 +- hypercall[1] = 0x01;
5577 +- hypercall[2] = 0xc1;
5578 +-}
5579 +-
5580 +-/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
5581 +-static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5582 +-{
5583 +- if (is_guest_mode(vcpu)) {
5584 +- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5585 +- unsigned long orig_val = val;
5586 +-
5587 +- /*
5588 +- * We get here when L2 changed cr0 in a way that did not change
5589 +- * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5590 +- * but did change L0 shadowed bits. So we first calculate the
5591 +- * effective cr0 value that L1 would like to write into the
5592 +- * hardware. It consists of the L2-owned bits from the new
5593 +- * value combined with the L1-owned bits from L1's guest_cr0.
5594 +- */
5595 +- val = (val & ~vmcs12->cr0_guest_host_mask) |
5596 +- (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5597 +-
5598 +- if (!nested_guest_cr0_valid(vcpu, val))
5599 +- return 1;
5600 +-
5601 +- if (kvm_set_cr0(vcpu, val))
5602 +- return 1;
5603 +- vmcs_writel(CR0_READ_SHADOW, orig_val);
5604 +- return 0;
5605 +- } else {
5606 +- if (to_vmx(vcpu)->nested.vmxon &&
5607 +- !nested_host_cr0_valid(vcpu, val))
5608 +- return 1;
5609 +-
5610 +- return kvm_set_cr0(vcpu, val);
5611 +- }
5612 +-}
5613 +-
5614 +-static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5615 +-{
5616 +- if (is_guest_mode(vcpu)) {
5617 +- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5618 +- unsigned long orig_val = val;
5619 +-
5620 +- /* analogously to handle_set_cr0 */
5621 +- val = (val & ~vmcs12->cr4_guest_host_mask) |
5622 +- (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5623 +- if (kvm_set_cr4(vcpu, val))
5624 +- return 1;
5625 +- vmcs_writel(CR4_READ_SHADOW, orig_val);
5626 +- return 0;
5627 +- } else
5628 +- return kvm_set_cr4(vcpu, val);
5629 +-}
5630 +-
5631 +-static int handle_desc(struct kvm_vcpu *vcpu)
5632 +-{
5633 +- WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
5634 +- return kvm_emulate_instruction(vcpu, 0);
5635 +-}
5636 +-
5637 +-static int handle_cr(struct kvm_vcpu *vcpu)
5638 +-{
5639 +- unsigned long exit_qualification, val;
5640 +- int cr;
5641 +- int reg;
5642 +- int err;
5643 +- int ret;
5644 +-
5645 +- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5646 +- cr = exit_qualification & 15;
5647 +- reg = (exit_qualification >> 8) & 15;
5648 +- switch ((exit_qualification >> 4) & 3) {
5649 +- case 0: /* mov to cr */
5650 +- val = kvm_register_readl(vcpu, reg);
5651 +- trace_kvm_cr_write(cr, val);
5652 +- switch (cr) {
5653 +- case 0:
5654 +- err = handle_set_cr0(vcpu, val);
5655 +- return kvm_complete_insn_gp(vcpu, err);
5656 +- case 3:
5657 +- WARN_ON_ONCE(enable_unrestricted_guest);
5658 +- err = kvm_set_cr3(vcpu, val);
5659 +- return kvm_complete_insn_gp(vcpu, err);
5660 +- case 4:
5661 +- err = handle_set_cr4(vcpu, val);
5662 +- return kvm_complete_insn_gp(vcpu, err);
5663 +- case 8: {
5664 +- u8 cr8_prev = kvm_get_cr8(vcpu);
5665 +- u8 cr8 = (u8)val;
5666 +- err = kvm_set_cr8(vcpu, cr8);
5667 +- ret = kvm_complete_insn_gp(vcpu, err);
5668 +- if (lapic_in_kernel(vcpu))
5669 +- return ret;
5670 +- if (cr8_prev <= cr8)
5671 +- return ret;
5672 +- /*
5673 +- * TODO: we might be squashing a
5674 +- * KVM_GUESTDBG_SINGLESTEP-triggered
5675 +- * KVM_EXIT_DEBUG here.
5676 +- */
5677 +- vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5678 +- return 0;
5679 +- }
5680 +- }
5681 +- break;
5682 +- case 2: /* clts */
5683 +- WARN_ONCE(1, "Guest should always own CR0.TS");
5684 +- vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
5685 +- trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
5686 +- return kvm_skip_emulated_instruction(vcpu);
5687 +- case 1: /*mov from cr*/
5688 +- switch (cr) {
5689 +- case 3:
5690 +- WARN_ON_ONCE(enable_unrestricted_guest);
5691 +- val = kvm_read_cr3(vcpu);
5692 +- kvm_register_write(vcpu, reg, val);
5693 +- trace_kvm_cr_read(cr, val);
5694 +- return kvm_skip_emulated_instruction(vcpu);
5695 +- case 8:
5696 +- val = kvm_get_cr8(vcpu);
5697 +- kvm_register_write(vcpu, reg, val);
5698 +- trace_kvm_cr_read(cr, val);
5699 +- return kvm_skip_emulated_instruction(vcpu);
5700 +- }
5701 +- break;
5702 +- case 3: /* lmsw */
5703 +- val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5704 +- trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
5705 +- kvm_lmsw(vcpu, val);
5706 +-
5707 +- return kvm_skip_emulated_instruction(vcpu);
5708 +- default:
5709 +- break;
5710 +- }
5711 +- vcpu->run->exit_reason = 0;
5712 +- vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5713 +- (int)(exit_qualification >> 4) & 3, cr);
5714 +- return 0;
5715 +-}
5716 +-
5717 +-static int handle_dr(struct kvm_vcpu *vcpu)
5718 +-{
5719 +- unsigned long exit_qualification;
5720 +- int dr, dr7, reg;
5721 +-
5722 +- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5723 +- dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5724 +-
5725 +- /* First, if DR does not exist, trigger UD */
5726 +- if (!kvm_require_dr(vcpu, dr))
5727 +- return 1;
5728 +-
5729 +- /* Do not handle if the CPL > 0, will trigger GP on re-entry */
5730 +- if (!kvm_require_cpl(vcpu, 0))
5731 +- return 1;
5732 +- dr7 = vmcs_readl(GUEST_DR7);
5733 +- if (dr7 & DR7_GD) {
5734 +- /*
5735 +- * As the vm-exit takes precedence over the debug trap, we
5736 +- * need to emulate the latter, either for the host or the
5737 +- * guest debugging itself.
5738 +- */
5739 +- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5740 +- vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
5741 +- vcpu->run->debug.arch.dr7 = dr7;
5742 +- vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5743 +- vcpu->run->debug.arch.exception = DB_VECTOR;
5744 +- vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5745 +- return 0;
5746 +- } else {
5747 +- vcpu->arch.dr6 &= ~DR_TRAP_BITS;
5748 +- vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
5749 +- kvm_queue_exception(vcpu, DB_VECTOR);
5750 +- return 1;
5751 +- }
5752 +- }
5753 +-
5754 +- if (vcpu->guest_debug == 0) {
5755 +- exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5756 +-
5757 +- /*
5758 +- * No more DR vmexits; force a reload of the debug registers
5759 +- * and reenter on this instruction. The next vmexit will
5760 +- * retrieve the full state of the debug registers.
5761 +- */
5762 +- vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5763 +- return 1;
5764 +- }
5765 +-
5766 +- reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5767 +- if (exit_qualification & TYPE_MOV_FROM_DR) {
5768 +- unsigned long val;
5769 +-
5770 +- if (kvm_get_dr(vcpu, dr, &val))
5771 +- return 1;
5772 +- kvm_register_write(vcpu, reg, val);
5773 +- } else
5774 +- if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
5775 +- return 1;
5776 +-
5777 +- return kvm_skip_emulated_instruction(vcpu);
5778 +-}
5779 +-
5780 +-static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
5781 +-{
5782 +- return vcpu->arch.dr6;
5783 +-}
5784 +-
5785 +-static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
5786 +-{
5787 +-}
5788 +-
5789 +-static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5790 +-{
5791 +- get_debugreg(vcpu->arch.db[0], 0);
5792 +- get_debugreg(vcpu->arch.db[1], 1);
5793 +- get_debugreg(vcpu->arch.db[2], 2);
5794 +- get_debugreg(vcpu->arch.db[3], 3);
5795 +- get_debugreg(vcpu->arch.dr6, 6);
5796 +- vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5797 +-
5798 +- vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5799 +- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5800 +-}
5801 +-
5802 +-static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5803 +-{
5804 +- vmcs_writel(GUEST_DR7, val);
5805 +-}
5806 +-
5807 +-static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5808 +-{
5809 +- kvm_apic_update_ppr(vcpu);
5810 +- return 1;
5811 +-}
5812 +-
5813 +-static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5814 +-{
5815 +- exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5816 +-
5817 +- kvm_make_request(KVM_REQ_EVENT, vcpu);
5818 +-
5819 +- ++vcpu->stat.irq_window_exits;
5820 +- return 1;
5821 +-}
5822 +-
5823 +-static int handle_vmcall(struct kvm_vcpu *vcpu)
5824 +-{
5825 +- return kvm_emulate_hypercall(vcpu);
5826 +-}
5827 +-
5828 +-static int handle_invd(struct kvm_vcpu *vcpu)
5829 +-{
5830 +- return kvm_emulate_instruction(vcpu, 0);
5831 +-}
5832 +-
5833 +-static int handle_invlpg(struct kvm_vcpu *vcpu)
5834 +-{
5835 +- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5836 +-
5837 +- kvm_mmu_invlpg(vcpu, exit_qualification);
5838 +- return kvm_skip_emulated_instruction(vcpu);
5839 +-}
5840 +-
5841 +-static int handle_rdpmc(struct kvm_vcpu *vcpu)
5842 +-{
5843 +- int err;
5844 +-
5845 +- err = kvm_rdpmc(vcpu);
5846 +- return kvm_complete_insn_gp(vcpu, err);
5847 +-}
5848 +-
5849 +-static int handle_wbinvd(struct kvm_vcpu *vcpu)
5850 +-{
5851 +- return kvm_emulate_wbinvd(vcpu);
5852 +-}
5853 +-
5854 +-static int handle_xsetbv(struct kvm_vcpu *vcpu)
5855 +-{
5856 +- u64 new_bv = kvm_read_edx_eax(vcpu);
5857 +- u32 index = kvm_rcx_read(vcpu);
5858 +-
5859 +- if (kvm_set_xcr(vcpu, index, new_bv) == 0)
5860 +- return kvm_skip_emulated_instruction(vcpu);
5861 +- return 1;
5862 +-}
5863 +-
5864 +-static int handle_apic_access(struct kvm_vcpu *vcpu)
5865 +-{
5866 +- if (likely(fasteoi)) {
5867 +- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5868 +- int access_type, offset;
5869 +-
5870 +- access_type = exit_qualification & APIC_ACCESS_TYPE;
5871 +- offset = exit_qualification & APIC_ACCESS_OFFSET;
5872 +- /*
5873 +- * Sane guest uses MOV to write EOI, with written value
5874 +- * not cared. So make a short-circuit here by avoiding
5875 +- * heavy instruction emulation.
5876 +- */
5877 +- if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5878 +- (offset == APIC_EOI)) {
5879 +- kvm_lapic_set_eoi(vcpu);
5880 +- return kvm_skip_emulated_instruction(vcpu);
5881 +- }
5882 +- }
5883 +- return kvm_emulate_instruction(vcpu, 0);
5884 +-}
5885 +-
5886 +-static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5887 +-{
5888 +- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5889 +- int vector = exit_qualification & 0xff;
5890 +-
5891 +- /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5892 +- kvm_apic_set_eoi_accelerated(vcpu, vector);
5893 +- return 1;
5894 +-}
5895 +-
5896 +-static int handle_apic_write(struct kvm_vcpu *vcpu)
5897 +-{
5898 +- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5899 +- u32 offset = exit_qualification & 0xfff;
5900 +-
5901 +- /* APIC-write VM exit is trap-like and thus no need to adjust IP */
5902 +- kvm_apic_write_nodecode(vcpu, offset);
5903 +- return 1;
5904 +-}
5905 +-
5906 +-static int handle_task_switch(struct kvm_vcpu *vcpu)
5907 +-{
5908 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
5909 +- unsigned long exit_qualification;
5910 +- bool has_error_code = false;
5911 +- u32 error_code = 0;
5912 +- u16 tss_selector;
5913 +- int reason, type, idt_v, idt_index;
5914 +-
5915 +- idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5916 +- idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5917 +- type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5918 +-
5919 +- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5920 +-
5921 +- reason = (u32)exit_qualification >> 30;
5922 +- if (reason == TASK_SWITCH_GATE && idt_v) {
5923 +- switch (type) {
5924 +- case INTR_TYPE_NMI_INTR:
5925 +- vcpu->arch.nmi_injected = false;
5926 +- vmx_set_nmi_mask(vcpu, true);
5927 +- break;
5928 +- case INTR_TYPE_EXT_INTR:
5929 +- case INTR_TYPE_SOFT_INTR:
5930 +- kvm_clear_interrupt_queue(vcpu);
5931 +- break;
5932 +- case INTR_TYPE_HARD_EXCEPTION:
5933 +- if (vmx->idt_vectoring_info &
5934 +- VECTORING_INFO_DELIVER_CODE_MASK) {
5935 +- has_error_code = true;
5936 +- error_code =
5937 +- vmcs_read32(IDT_VECTORING_ERROR_CODE);
5938 +- }
5939 +- /* fall through */
5940 +- case INTR_TYPE_SOFT_EXCEPTION:
5941 +- kvm_clear_exception_queue(vcpu);
5942 +- break;
5943 +- default:
5944 +- break;
5945 +- }
5946 +- }
5947 +- tss_selector = exit_qualification;
5948 +-
5949 +- if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5950 +- type != INTR_TYPE_EXT_INTR &&
5951 +- type != INTR_TYPE_NMI_INTR))
5952 +- WARN_ON(!skip_emulated_instruction(vcpu));
5953 +-
5954 +- /*
5955 +- * TODO: What about debug traps on tss switch?
5956 +- * Are we supposed to inject them and update dr6?
5957 +- */
5958 +- return kvm_task_switch(vcpu, tss_selector,
5959 +- type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
5960 +- reason, has_error_code, error_code);
5961 +-}
5962 +-
5963 +-static int handle_ept_violation(struct kvm_vcpu *vcpu)
5964 +-{
5965 +- unsigned long exit_qualification;
5966 +- gpa_t gpa;
5967 +- u64 error_code;
5968 +-
5969 +- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5970 +-
5971 +- /*
5972 +- * EPT violation happened while executing iret from NMI,
5973 +- * "blocked by NMI" bit has to be set before next VM entry.
5974 +- * There are errata that may cause this bit to not be set:
5975 +- * AAK134, BY25.
5976 +- */
5977 +- if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5978 +- enable_vnmi &&
5979 +- (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5980 +- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5981 +-
5982 +- gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5983 +- trace_kvm_page_fault(gpa, exit_qualification);
5984 +-
5985 +- /* Is it a read fault? */
5986 +- error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
5987 +- ? PFERR_USER_MASK : 0;
5988 +- /* Is it a write fault? */
5989 +- error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
5990 +- ? PFERR_WRITE_MASK : 0;
5991 +- /* Is it a fetch fault? */
5992 +- error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
5993 +- ? PFERR_FETCH_MASK : 0;
5994 +- /* ept page table entry is present? */
5995 +- error_code |= (exit_qualification &
5996 +- (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
5997 +- EPT_VIOLATION_EXECUTABLE))
5998 +- ? PFERR_PRESENT_MASK : 0;
5999 +-
6000 +- error_code |= (exit_qualification & 0x100) != 0 ?
6001 +- PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
6002 +-
6003 +- vcpu->arch.exit_qualification = exit_qualification;
6004 +- return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
6005 +-}
6006 +-
6007 +-static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
6008 +-{
6009 +- gpa_t gpa;
6010 +-
6011 +- /*
6012 +- * A nested guest cannot optimize MMIO vmexits, because we have an
6013 +- * nGPA here instead of the required GPA.
6014 +- */
6015 +- gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6016 +- if (!is_guest_mode(vcpu) &&
6017 +- !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
6018 +- trace_kvm_fast_mmio(gpa);
6019 +- return kvm_skip_emulated_instruction(vcpu);
6020 +- }
6021 +-
6022 +- return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
6023 +-}
6024 +-
6025 +-static int handle_nmi_window(struct kvm_vcpu *vcpu)
6026 +-{
6027 +- WARN_ON_ONCE(!enable_vnmi);
6028 +- exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
6029 +- ++vcpu->stat.nmi_window_exits;
6030 +- kvm_make_request(KVM_REQ_EVENT, vcpu);
6031 +-
6032 +- return 1;
6033 +-}
6034 +-
6035 +-static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
6036 +-{
6037 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
6038 +- bool intr_window_requested;
6039 +- unsigned count = 130;
6040 +-
6041 +- /*
6042 +- * We should never reach the point where we are emulating L2
6043 +- * due to invalid guest state as that means we incorrectly
6044 +- * allowed a nested VMEntry with an invalid vmcs12.
6045 +- */
6046 +- WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
6047 +-
6048 +- intr_window_requested = exec_controls_get(vmx) &
6049 +- CPU_BASED_INTR_WINDOW_EXITING;
6050 +-
6051 +- while (vmx->emulation_required && count-- != 0) {
6052 +- if (intr_window_requested && vmx_interrupt_allowed(vcpu))
6053 +- return handle_interrupt_window(&vmx->vcpu);
6054 +-
6055 +- if (kvm_test_request(KVM_REQ_EVENT, vcpu))
6056 +- return 1;
6057 +-
6058 +- if (!kvm_emulate_instruction(vcpu, 0))
6059 +- return 0;
6060 +-
6061 +- if (vmx->emulation_required && !vmx->rmode.vm86_active &&
6062 +- vcpu->arch.exception.pending) {
6063 +- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6064 +- vcpu->run->internal.suberror =
6065 +- KVM_INTERNAL_ERROR_EMULATION;
6066 +- vcpu->run->internal.ndata = 0;
6067 +- return 0;
6068 +- }
6069 +-
6070 +- if (vcpu->arch.halt_request) {
6071 +- vcpu->arch.halt_request = 0;
6072 +- return kvm_vcpu_halt(vcpu);
6073 +- }
6074 +-
6075 +- /*
6076 +- * Note, return 1 and not 0, vcpu_run() is responsible for
6077 +- * morphing the pending signal into the proper return code.
6078 +- */
6079 +- if (signal_pending(current))
6080 +- return 1;
6081 +-
6082 +- if (need_resched())
6083 +- schedule();
6084 +- }
6085 +-
6086 +- return 1;
6087 +-}
6088 +-
6089 +-static void grow_ple_window(struct kvm_vcpu *vcpu)
6090 +-{
6091 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
6092 +- unsigned int old = vmx->ple_window;
6093 +-
6094 +- vmx->ple_window = __grow_ple_window(old, ple_window,
6095 +- ple_window_grow,
6096 +- ple_window_max);
6097 +-
6098 +- if (vmx->ple_window != old) {
6099 +- vmx->ple_window_dirty = true;
6100 +- trace_kvm_ple_window_update(vcpu->vcpu_id,
6101 +- vmx->ple_window, old);
6102 +- }
6103 +-}
6104 +-
6105 +-static void shrink_ple_window(struct kvm_vcpu *vcpu)
6106 +-{
6107 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
6108 +- unsigned int old = vmx->ple_window;
6109 +-
6110 +- vmx->ple_window = __shrink_ple_window(old, ple_window,
6111 +- ple_window_shrink,
6112 +- ple_window);
6113 +-
6114 +- if (vmx->ple_window != old) {
6115 +- vmx->ple_window_dirty = true;
6116 +- trace_kvm_ple_window_update(vcpu->vcpu_id,
6117 +- vmx->ple_window, old);
6118 +- }
6119 +-}
6120 +-
6121 +-/*
6122 +- * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
6123 +- */
6124 +-static void wakeup_handler(void)
6125 +-{
6126 +- struct kvm_vcpu *vcpu;
6127 +- int cpu = smp_processor_id();
6128 +-
6129 +- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6130 +- list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
6131 +- blocked_vcpu_list) {
6132 +- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
6133 +-
6134 +- if (pi_test_on(pi_desc) == 1)
6135 +- kvm_vcpu_kick(vcpu);
6136 +- }
6137 +- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6138 +-}
6139 +-
6140 +-static void vmx_enable_tdp(void)
6141 +-{
6142 +- kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
6143 +- enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
6144 +- enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
6145 +- 0ull, VMX_EPT_EXECUTABLE_MASK,
6146 +- cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
6147 +- VMX_EPT_RWX_MASK, 0ull);
6148 +-
6149 +- ept_set_mmio_spte_mask();
6150 +- kvm_enable_tdp();
6151 +-}
6152 +-
6153 +-/*
6154 +- * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
6155 +- * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
6156 +- */
6157 +-static int handle_pause(struct kvm_vcpu *vcpu)
6158 +-{
6159 +- if (!kvm_pause_in_guest(vcpu->kvm))
6160 +- grow_ple_window(vcpu);
6161 +-
6162 +- /*
6163 +- * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
6164 +- * VM-execution control is ignored if CPL > 0. OTOH, KVM
6165 +- * never set PAUSE_EXITING and just set PLE if supported,
6166 +- * so the vcpu must be CPL=0 if it gets a PAUSE exit.
6167 +- */
6168 +- kvm_vcpu_on_spin(vcpu, true);
6169 +- return kvm_skip_emulated_instruction(vcpu);
6170 +-}
6171 +-
6172 +-static int handle_nop(struct kvm_vcpu *vcpu)
6173 +-{
6174 +- return kvm_skip_emulated_instruction(vcpu);
6175 +-}
6176 +-
6177 +-static int handle_mwait(struct kvm_vcpu *vcpu)
6178 +-{
6179 +- printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
6180 +- return handle_nop(vcpu);
6181 +-}
6182 +-
6183 +-static int handle_invalid_op(struct kvm_vcpu *vcpu)
6184 +-{
6185 +- kvm_queue_exception(vcpu, UD_VECTOR);
6186 +- return 1;
6187 +-}
6188 +-
6189 +-static int handle_monitor_trap(struct kvm_vcpu *vcpu)
6190 +-{
6191 +- return 1;
6192 +-}
6193 +-
6194 +-static int handle_monitor(struct kvm_vcpu *vcpu)
6195 +-{
6196 +- printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
6197 +- return handle_nop(vcpu);
6198 +-}
6199 +-
6200 +-static int handle_invpcid(struct kvm_vcpu *vcpu)
6201 +-{
6202 +- u32 vmx_instruction_info;
6203 +- unsigned long type;
6204 +- bool pcid_enabled;
6205 +- gva_t gva;
6206 +- struct x86_exception e;
6207 +- unsigned i;
6208 +- unsigned long roots_to_free = 0;
6209 +- struct {
6210 +- u64 pcid;
6211 +- u64 gla;
6212 +- } operand;
6213 +-
6214 +- if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
6215 +- kvm_queue_exception(vcpu, UD_VECTOR);
6216 +- return 1;
6217 +- }
6218 +-
6219 +- vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6220 +- type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
6221 +-
6222 +- if (type > 3) {
6223 +- kvm_inject_gp(vcpu, 0);
6224 +- return 1;
6225 +- }
6226 +-
6227 +- /* According to the Intel instruction reference, the memory operand
6228 +- * is read even if it isn't needed (e.g., for type==all)
6229 +- */
6230 +- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
6231 +- vmx_instruction_info, false,
6232 +- sizeof(operand), &gva))
6233 +- return 1;
6234 +-
6235 +- if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
6236 +- kvm_inject_page_fault(vcpu, &e);
6237 +- return 1;
6238 +- }
6239 +-
6240 +- if (operand.pcid >> 12 != 0) {
6241 +- kvm_inject_gp(vcpu, 0);
6242 +- return 1;
6243 +- }
6244 +-
6245 +- pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
6246 +-
6247 +- switch (type) {
6248 +- case INVPCID_TYPE_INDIV_ADDR:
6249 +- if ((!pcid_enabled && (operand.pcid != 0)) ||
6250 +- is_noncanonical_address(operand.gla, vcpu)) {
6251 +- kvm_inject_gp(vcpu, 0);
6252 +- return 1;
6253 +- }
6254 +- kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
6255 +- return kvm_skip_emulated_instruction(vcpu);
6256 +-
6257 +- case INVPCID_TYPE_SINGLE_CTXT:
6258 +- if (!pcid_enabled && (operand.pcid != 0)) {
6259 +- kvm_inject_gp(vcpu, 0);
6260 +- return 1;
6261 +- }
6262 +-
6263 +- if (kvm_get_active_pcid(vcpu) == operand.pcid) {
6264 +- kvm_mmu_sync_roots(vcpu);
6265 +- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
6266 +- }
6267 +-
6268 +- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
6269 +- if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
6270 +- == operand.pcid)
6271 +- roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
6272 +-
6273 +- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
6274 +- /*
6275 +- * If neither the current cr3 nor any of the prev_roots use the
6276 +- * given PCID, then nothing needs to be done here because a
6277 +- * resync will happen anyway before switching to any other CR3.
6278 +- */
6279 +-
6280 +- return kvm_skip_emulated_instruction(vcpu);
6281 +-
6282 +- case INVPCID_TYPE_ALL_NON_GLOBAL:
6283 +- /*
6284 +- * Currently, KVM doesn't mark global entries in the shadow
6285 +- * page tables, so a non-global flush just degenerates to a
6286 +- * global flush. If needed, we could optimize this later by
6287 +- * keeping track of global entries in shadow page tables.
6288 +- */
6289 +-
6290 +- /* fall-through */
6291 +- case INVPCID_TYPE_ALL_INCL_GLOBAL:
6292 +- kvm_mmu_unload(vcpu);
6293 +- return kvm_skip_emulated_instruction(vcpu);
6294 +-
6295 +- default:
6296 +- BUG(); /* We have already checked above that type <= 3 */
6297 +- }
6298 +-}
6299 +-
6300 +-static int handle_pml_full(struct kvm_vcpu *vcpu)
6301 +-{
6302 +- unsigned long exit_qualification;
6303 +-
6304 +- trace_kvm_pml_full(vcpu->vcpu_id);
6305 +-
6306 +- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6307 +-
6308 +- /*
6309 +- * PML buffer FULL happened while executing iret from NMI,
6310 +- * "blocked by NMI" bit has to be set before next VM entry.
6311 +- */
6312 +- if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
6313 +- enable_vnmi &&
6314 +- (exit_qualification & INTR_INFO_UNBLOCK_NMI))
6315 +- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6316 +- GUEST_INTR_STATE_NMI);
6317 +-
6318 +- /*
6319 +- * PML buffer already flushed at beginning of VMEXIT. Nothing to do
6320 +- * here.., and there's no userspace involvement needed for PML.
6321 +- */
6322 +- return 1;
6323 +-}
6324 +-
6325 +-static int handle_preemption_timer(struct kvm_vcpu *vcpu)
6326 +-{
6327 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
6328 +-
6329 +- if (!vmx->req_immediate_exit &&
6330 +- !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
6331 +- kvm_lapic_expired_hv_timer(vcpu);
6332 +-
6333 +- return 1;
6334 +-}
6335 +-
6336 +-/*
6337 +- * When nested=0, all VMX instruction VM Exits filter here. The handlers
6338 +- * are overwritten by nested_vmx_setup() when nested=1.
6339 +- */
6340 +-static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
6341 +-{
6342 +- kvm_queue_exception(vcpu, UD_VECTOR);
6343 +- return 1;
6344 +-}
6345 +-
6346 +-static int handle_encls(struct kvm_vcpu *vcpu)
6347 +-{
6348 +- /*
6349 +- * SGX virtualization is not yet supported. There is no software
6350 +- * enable bit for SGX, so we have to trap ENCLS and inject a #UD
6351 +- * to prevent the guest from executing ENCLS.
6352 +- */
6353 +- kvm_queue_exception(vcpu, UD_VECTOR);
6354 +- return 1;
6355 +-}
6356 +-
6357 +-/*
6358 +- * The exit handlers return 1 if the exit was handled fully and guest execution
6359 +- * may resume. Otherwise they set the kvm_run parameter to indicate what needs
6360 +- * to be done to userspace and return 0.
6361 +- */
6362 +-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6363 +- [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
6364 +- [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
6365 +- [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
6366 +- [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
6367 +- [EXIT_REASON_IO_INSTRUCTION] = handle_io,
6368 +- [EXIT_REASON_CR_ACCESS] = handle_cr,
6369 +- [EXIT_REASON_DR_ACCESS] = handle_dr,
6370 +- [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
6371 +- [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
6372 +- [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
6373 +- [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
6374 +- [EXIT_REASON_HLT] = kvm_emulate_halt,
6375 +- [EXIT_REASON_INVD] = handle_invd,
6376 +- [EXIT_REASON_INVLPG] = handle_invlpg,
6377 +- [EXIT_REASON_RDPMC] = handle_rdpmc,
6378 +- [EXIT_REASON_VMCALL] = handle_vmcall,
6379 +- [EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
6380 +- [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
6381 +- [EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
6382 +- [EXIT_REASON_VMPTRST] = handle_vmx_instruction,
6383 +- [EXIT_REASON_VMREAD] = handle_vmx_instruction,
6384 +- [EXIT_REASON_VMRESUME] = handle_vmx_instruction,
6385 +- [EXIT_REASON_VMWRITE] = handle_vmx_instruction,
6386 +- [EXIT_REASON_VMOFF] = handle_vmx_instruction,
6387 +- [EXIT_REASON_VMON] = handle_vmx_instruction,
6388 +- [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
6389 +- [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
6390 +- [EXIT_REASON_APIC_WRITE] = handle_apic_write,
6391 +- [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
6392 +- [EXIT_REASON_WBINVD] = handle_wbinvd,
6393 +- [EXIT_REASON_XSETBV] = handle_xsetbv,
6394 +- [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
6395 +- [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
6396 +- [EXIT_REASON_GDTR_IDTR] = handle_desc,
6397 +- [EXIT_REASON_LDTR_TR] = handle_desc,
6398 +- [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
6399 +- [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
6400 +- [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
6401 +- [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
6402 +- [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
6403 +- [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
6404 +- [EXIT_REASON_INVEPT] = handle_vmx_instruction,
6405 +- [EXIT_REASON_INVVPID] = handle_vmx_instruction,
6406 +- [EXIT_REASON_RDRAND] = handle_invalid_op,
6407 +- [EXIT_REASON_RDSEED] = handle_invalid_op,
6408 +- [EXIT_REASON_PML_FULL] = handle_pml_full,
6409 +- [EXIT_REASON_INVPCID] = handle_invpcid,
6410 +- [EXIT_REASON_VMFUNC] = handle_vmx_instruction,
6411 +- [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
6412 +- [EXIT_REASON_ENCLS] = handle_encls,
6413 +-};
6414 +-
6415 +-static const int kvm_vmx_max_exit_handlers =
6416 +- ARRAY_SIZE(kvm_vmx_exit_handlers);
6417 +-
6418 +-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
6419 +-{
6420 +- *info1 = vmcs_readl(EXIT_QUALIFICATION);
6421 +- *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
6422 +-}
6423 +-
6424 +-static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
6425 +-{
6426 +- if (vmx->pml_pg) {
6427 +- __free_page(vmx->pml_pg);
6428 +- vmx->pml_pg = NULL;
6429 +- }
6430 +-}
6431 +-
6432 +-static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
6433 +-{
6434 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
6435 +- u64 *pml_buf;
6436 +- u16 pml_idx;
6437 +-
6438 +- pml_idx = vmcs_read16(GUEST_PML_INDEX);
6439 +-
6440 +- /* Do nothing if PML buffer is empty */
6441 +- if (pml_idx == (PML_ENTITY_NUM - 1))
6442 +- return;
6443 +-
6444 +- /* PML index always points to next available PML buffer entity */
6445 +- if (pml_idx >= PML_ENTITY_NUM)
6446 +- pml_idx = 0;
6447 +- else
6448 +- pml_idx++;
6449 +-
6450 +- pml_buf = page_address(vmx->pml_pg);
6451 +- for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
6452 +- u64 gpa;
6453 +-
6454 +- gpa = pml_buf[pml_idx];
6455 +- WARN_ON(gpa & (PAGE_SIZE - 1));
6456 +- kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
6457 +- }
6458 +-
6459 +- /* reset PML index */
6460 +- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
6461 +-}
6462 +-
6463 +-/*
6464 +- * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
6465 +- * Called before reporting dirty_bitmap to userspace.
6466 +- */
6467 +-static void kvm_flush_pml_buffers(struct kvm *kvm)
6468 +-{
6469 +- int i;
6470 +- struct kvm_vcpu *vcpu;
6471 +- /*
6472 +- * We only need to kick vcpu out of guest mode here, as PML buffer
6473 +- * is flushed at beginning of all VMEXITs, and it's obvious that only
6474 +- * vcpus running in guest are possible to have unflushed GPAs in PML
6475 +- * buffer.
6476 +- */
6477 +- kvm_for_each_vcpu(i, vcpu, kvm)
6478 +- kvm_vcpu_kick(vcpu);
6479 +-}
6480 +-
6481 +-static void vmx_dump_sel(char *name, uint32_t sel)
6482 +-{
6483 +- pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
6484 +- name, vmcs_read16(sel),
6485 +- vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
6486 +- vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
6487 +- vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
6488 +-}
6489 +-
6490 +-static void vmx_dump_dtsel(char *name, uint32_t limit)
6491 +-{
6492 +- pr_err("%s limit=0x%08x, base=0x%016lx\n",
6493 +- name, vmcs_read32(limit),
6494 +- vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
6495 +-}
6496 +-
6497 +-void dump_vmcs(void)
6498 +-{
6499 +- u32 vmentry_ctl, vmexit_ctl;
6500 +- u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
6501 +- unsigned long cr4;
6502 +- u64 efer;
6503 +- int i, n;
6504 +-
6505 +- if (!dump_invalid_vmcs) {
6506 +- pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
6507 +- return;
6508 +- }
6509 +-
6510 +- vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
6511 +- vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
6512 +- cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6513 +- pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
6514 +- cr4 = vmcs_readl(GUEST_CR4);
6515 +- efer = vmcs_read64(GUEST_IA32_EFER);
6516 +- secondary_exec_control = 0;
6517 +- if (cpu_has_secondary_exec_ctrls())
6518 +- secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6519 +-
6520 +- pr_err("*** Guest State ***\n");
6521 +- pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6522 +- vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
6523 +- vmcs_readl(CR0_GUEST_HOST_MASK));
6524 +- pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6525 +- cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
6526 +- pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
6527 +- if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
6528 +- (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
6529 +- {
6530 +- pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
6531 +- vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
6532 +- pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
6533 +- vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
6534 +- }
6535 +- pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
6536 +- vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
6537 +- pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
6538 +- vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
6539 +- pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6540 +- vmcs_readl(GUEST_SYSENTER_ESP),
6541 +- vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
6542 +- vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
6543 +- vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
6544 +- vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
6545 +- vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
6546 +- vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
6547 +- vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
6548 +- vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
6549 +- vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
6550 +- vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
6551 +- vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
6552 +- if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
6553 +- (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
6554 +- pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
6555 +- efer, vmcs_read64(GUEST_IA32_PAT));
6556 +- pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
6557 +- vmcs_read64(GUEST_IA32_DEBUGCTL),
6558 +- vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
6559 +- if (cpu_has_load_perf_global_ctrl() &&
6560 +- vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
6561 +- pr_err("PerfGlobCtl = 0x%016llx\n",
6562 +- vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
6563 +- if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
6564 +- pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
6565 +- pr_err("Interruptibility = %08x ActivityState = %08x\n",
6566 +- vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
6567 +- vmcs_read32(GUEST_ACTIVITY_STATE));
6568 +- if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
6569 +- pr_err("InterruptStatus = %04x\n",
6570 +- vmcs_read16(GUEST_INTR_STATUS));
6571 +-
6572 +- pr_err("*** Host State ***\n");
6573 +- pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
6574 +- vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6575 +- pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6576 +- vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6577 +- vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6578 +- vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6579 +- vmcs_read16(HOST_TR_SELECTOR));
6580 +- pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6581 +- vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6582 +- vmcs_readl(HOST_TR_BASE));
6583 +- pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6584 +- vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6585 +- pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6586 +- vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6587 +- vmcs_readl(HOST_CR4));
6588 +- pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6589 +- vmcs_readl(HOST_IA32_SYSENTER_ESP),
6590 +- vmcs_read32(HOST_IA32_SYSENTER_CS),
6591 +- vmcs_readl(HOST_IA32_SYSENTER_EIP));
6592 +- if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
6593 +- pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
6594 +- vmcs_read64(HOST_IA32_EFER),
6595 +- vmcs_read64(HOST_IA32_PAT));
6596 +- if (cpu_has_load_perf_global_ctrl() &&
6597 +- vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6598 +- pr_err("PerfGlobCtl = 0x%016llx\n",
6599 +- vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
6600 +-
6601 +- pr_err("*** Control State ***\n");
6602 +- pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
6603 +- pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
6604 +- pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
6605 +- pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6606 +- vmcs_read32(EXCEPTION_BITMAP),
6607 +- vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6608 +- vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6609 +- pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6610 +- vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6611 +- vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6612 +- vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6613 +- pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6614 +- vmcs_read32(VM_EXIT_INTR_INFO),
6615 +- vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6616 +- vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6617 +- pr_err(" reason=%08x qualification=%016lx\n",
6618 +- vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6619 +- pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6620 +- vmcs_read32(IDT_VECTORING_INFO_FIELD),
6621 +- vmcs_read32(IDT_VECTORING_ERROR_CODE));
6622 +- pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6623 +- if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6624 +- pr_err("TSC Multiplier = 0x%016llx\n",
6625 +- vmcs_read64(TSC_MULTIPLIER));
6626 +- if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6627 +- if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6628 +- u16 status = vmcs_read16(GUEST_INTR_STATUS);
6629 +- pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
6630 +- }
6631 +- pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
6632 +- if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6633 +- pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
6634 +- pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
6635 +- }
6636 +- if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6637 +- pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6638 +- if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6639 +- pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
6640 +- n = vmcs_read32(CR3_TARGET_COUNT);
6641 +- for (i = 0; i + 1 < n; i += 4)
6642 +- pr_err("CR3 target%u=%016lx target%u=%016lx\n",
6643 +- i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
6644 +- i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
6645 +- if (i < n)
6646 +- pr_err("CR3 target%u=%016lx\n",
6647 +- i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
6648 +- if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6649 +- pr_err("PLE Gap=%08x Window=%08x\n",
6650 +- vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6651 +- if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6652 +- pr_err("Virtual processor ID = 0x%04x\n",
6653 +- vmcs_read16(VIRTUAL_PROCESSOR_ID));
6654 +-}
6655 +-
6656 +-/*
6657 +- * The guest has exited. See if we can fix it or if we need userspace
6658 +- * assistance.
6659 +- */
6660 +-static int vmx_handle_exit(struct kvm_vcpu *vcpu,
6661 +- enum exit_fastpath_completion exit_fastpath)
6662 +-{
6663 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
6664 +- u32 exit_reason = vmx->exit_reason;
6665 +- u32 vectoring_info = vmx->idt_vectoring_info;
6666 +-
6667 +- trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
6668 +-
6669 +- /*
6670 +- * Flush logged GPAs PML buffer, this will make dirty_bitmap more
6671 +- * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
6672 +- * querying dirty_bitmap, we only need to kick all vcpus out of guest
6673 +- * mode as if vcpus is in root mode, the PML buffer must has been
6674 +- * flushed already.
6675 +- */
6676 +- if (enable_pml)
6677 +- vmx_flush_pml_buffer(vcpu);
6678 +-
6679 +- /* If guest state is invalid, start emulating */
6680 +- if (vmx->emulation_required)
6681 +- return handle_invalid_guest_state(vcpu);
6682 +-
6683 +- if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
6684 +- return nested_vmx_reflect_vmexit(vcpu, exit_reason);
6685 +-
6686 +- if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
6687 +- dump_vmcs();
6688 +- vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6689 +- vcpu->run->fail_entry.hardware_entry_failure_reason
6690 +- = exit_reason;
6691 +- return 0;
6692 +- }
6693 +-
6694 +- if (unlikely(vmx->fail)) {
6695 +- dump_vmcs();
6696 +- vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6697 +- vcpu->run->fail_entry.hardware_entry_failure_reason
6698 +- = vmcs_read32(VM_INSTRUCTION_ERROR);
6699 +- return 0;
6700 +- }
6701 +-
6702 +- /*
6703 +- * Note:
6704 +- * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
6705 +- * delivery event since it indicates guest is accessing MMIO.
6706 +- * The vm-exit can be triggered again after return to guest that
6707 +- * will cause infinite loop.
6708 +- */
6709 +- if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6710 +- (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
6711 +- exit_reason != EXIT_REASON_EPT_VIOLATION &&
6712 +- exit_reason != EXIT_REASON_PML_FULL &&
6713 +- exit_reason != EXIT_REASON_TASK_SWITCH)) {
6714 +- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6715 +- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6716 +- vcpu->run->internal.ndata = 3;
6717 +- vcpu->run->internal.data[0] = vectoring_info;
6718 +- vcpu->run->internal.data[1] = exit_reason;
6719 +- vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
6720 +- if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
6721 +- vcpu->run->internal.ndata++;
6722 +- vcpu->run->internal.data[3] =
6723 +- vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6724 +- }
6725 +- return 0;
6726 +- }
6727 +-
6728 +- if (unlikely(!enable_vnmi &&
6729 +- vmx->loaded_vmcs->soft_vnmi_blocked)) {
6730 +- if (vmx_interrupt_allowed(vcpu)) {
6731 +- vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6732 +- } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6733 +- vcpu->arch.nmi_pending) {
6734 +- /*
6735 +- * This CPU don't support us in finding the end of an
6736 +- * NMI-blocked window if the guest runs with IRQs
6737 +- * disabled. So we pull the trigger after 1 s of
6738 +- * futile waiting, but inform the user about this.
6739 +- */
6740 +- printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6741 +- "state on VCPU %d after 1 s timeout\n",
6742 +- __func__, vcpu->vcpu_id);
6743 +- vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6744 +- }
6745 +- }
6746 +-
6747 +- if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
6748 +- kvm_skip_emulated_instruction(vcpu);
6749 +- return 1;
6750 +- } else if (exit_reason < kvm_vmx_max_exit_handlers
6751 +- && kvm_vmx_exit_handlers[exit_reason]) {
6752 +-#ifdef CONFIG_RETPOLINE
6753 +- if (exit_reason == EXIT_REASON_MSR_WRITE)
6754 +- return kvm_emulate_wrmsr(vcpu);
6755 +- else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
6756 +- return handle_preemption_timer(vcpu);
6757 +- else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
6758 +- return handle_interrupt_window(vcpu);
6759 +- else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
6760 +- return handle_external_interrupt(vcpu);
6761 +- else if (exit_reason == EXIT_REASON_HLT)
6762 +- return kvm_emulate_halt(vcpu);
6763 +- else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
6764 +- return handle_ept_misconfig(vcpu);
6765 +-#endif
6766 +- return kvm_vmx_exit_handlers[exit_reason](vcpu);
6767 +- } else {
6768 +- vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6769 +- exit_reason);
6770 +- dump_vmcs();
6771 +- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6772 +- vcpu->run->internal.suberror =
6773 +- KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
6774 +- vcpu->run->internal.ndata = 1;
6775 +- vcpu->run->internal.data[0] = exit_reason;
6776 +- return 0;
6777 +- }
6778 +-}
6779 +-
6780 +-/*
6781 +- * Software based L1D cache flush which is used when microcode providing
6782 +- * the cache control MSR is not loaded.
6783 +- *
6784 +- * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
6785 +- * flush it is required to read in 64 KiB because the replacement algorithm
6786 +- * is not exactly LRU. This could be sized at runtime via topology
6787 +- * information but as all relevant affected CPUs have 32KiB L1D cache size
6788 +- * there is no point in doing so.
6789 +- */
6790 +-static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
6791 +-{
6792 +- int size = PAGE_SIZE << L1D_CACHE_ORDER;
6793 +-
6794 +- /*
6795 +- * This code is only executed when the the flush mode is 'cond' or
6796 +- * 'always'
6797 +- */
6798 +- if (static_branch_likely(&vmx_l1d_flush_cond)) {
6799 +- bool flush_l1d;
6800 +-
6801 +- /*
6802 +- * Clear the per-vcpu flush bit, it gets set again
6803 +- * either from vcpu_run() or from one of the unsafe
6804 +- * VMEXIT handlers.
6805 +- */
6806 +- flush_l1d = vcpu->arch.l1tf_flush_l1d;
6807 +- vcpu->arch.l1tf_flush_l1d = false;
6808 +-
6809 +- /*
6810 +- * Clear the per-cpu flush bit, it gets set again from
6811 +- * the interrupt handlers.
6812 +- */
6813 +- flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
6814 +- kvm_clear_cpu_l1tf_flush_l1d();
6815 +-
6816 +- if (!flush_l1d)
6817 +- return;
6818 +- }
6819 +-
6820 +- vcpu->stat.l1d_flush++;
6821 +-
6822 +- if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
6823 +- wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
6824 +- return;
6825 +- }
6826 +-
6827 +- asm volatile(
6828 +- /* First ensure the pages are in the TLB */
6829 +- "xorl %%eax, %%eax\n"
6830 +- ".Lpopulate_tlb:\n\t"
6831 +- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6832 +- "addl $4096, %%eax\n\t"
6833 +- "cmpl %%eax, %[size]\n\t"
6834 +- "jne .Lpopulate_tlb\n\t"
6835 +- "xorl %%eax, %%eax\n\t"
6836 +- "cpuid\n\t"
6837 +- /* Now fill the cache */
6838 +- "xorl %%eax, %%eax\n"
6839 +- ".Lfill_cache:\n"
6840 +- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6841 +- "addl $64, %%eax\n\t"
6842 +- "cmpl %%eax, %[size]\n\t"
6843 +- "jne .Lfill_cache\n\t"
6844 +- "lfence\n"
6845 +- :: [flush_pages] "r" (vmx_l1d_flush_pages),
6846 +- [size] "r" (size)
6847 +- : "eax", "ebx", "ecx", "edx");
6848 +-}
6849 +-
6850 +-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6851 +-{
6852 +- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6853 +- int tpr_threshold;
6854 +-
6855 +- if (is_guest_mode(vcpu) &&
6856 +- nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6857 +- return;
6858 +-
6859 +- tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6860 +- if (is_guest_mode(vcpu))
6861 +- to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6862 +- else
6863 +- vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6864 +-}
6865 +-
6866 +-void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6867 +-{
6868 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
6869 +- u32 sec_exec_control;
6870 +-
6871 +- if (!lapic_in_kernel(vcpu))
6872 +- return;
6873 +-
6874 +- if (!flexpriority_enabled &&
6875 +- !cpu_has_vmx_virtualize_x2apic_mode())
6876 +- return;
6877 +-
6878 +- /* Postpone execution until vmcs01 is the current VMCS. */
6879 +- if (is_guest_mode(vcpu)) {
6880 +- vmx->nested.change_vmcs01_virtual_apic_mode = true;
6881 +- return;
6882 +- }
6883 +-
6884 +- sec_exec_control = secondary_exec_controls_get(vmx);
6885 +- sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6886 +- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6887 +-
6888 +- switch (kvm_get_apic_mode(vcpu)) {
6889 +- case LAPIC_MODE_INVALID:
6890 +- WARN_ONCE(true, "Invalid local APIC state");
6891 +- case LAPIC_MODE_DISABLED:
6892 +- break;
6893 +- case LAPIC_MODE_XAPIC:
6894 +- if (flexpriority_enabled) {
6895 +- sec_exec_control |=
6896 +- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6897 +- vmx_flush_tlb(vcpu, true);
6898 +- }
6899 +- break;
6900 +- case LAPIC_MODE_X2APIC:
6901 +- if (cpu_has_vmx_virtualize_x2apic_mode())
6902 +- sec_exec_control |=
6903 +- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6904 +- break;
6905 +- }
6906 +- secondary_exec_controls_set(vmx, sec_exec_control);
6907 +-
6908 +- vmx_update_msr_bitmap(vcpu);
6909 +-}
6910 +-
6911 +-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
6912 +-{
6913 +- if (!is_guest_mode(vcpu)) {
6914 +- vmcs_write64(APIC_ACCESS_ADDR, hpa);
6915 +- vmx_flush_tlb(vcpu, true);
6916 +- }
6917 +-}
6918 +-
6919 +-static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
6920 +-{
6921 +- u16 status;
6922 +- u8 old;
6923 +-
6924 +- if (max_isr == -1)
6925 +- max_isr = 0;
6926 +-
6927 +- status = vmcs_read16(GUEST_INTR_STATUS);
6928 +- old = status >> 8;
6929 +- if (max_isr != old) {
6930 +- status &= 0xff;
6931 +- status |= max_isr << 8;
6932 +- vmcs_write16(GUEST_INTR_STATUS, status);
6933 +- }
6934 +-}
6935 +-
6936 +-static void vmx_set_rvi(int vector)
6937 +-{
6938 +- u16 status;
6939 +- u8 old;
6940 +-
6941 +- if (vector == -1)
6942 +- vector = 0;
6943 +-
6944 +- status = vmcs_read16(GUEST_INTR_STATUS);
6945 +- old = (u8)status & 0xff;
6946 +- if ((u8)vector != old) {
6947 +- status &= ~0xff;
6948 +- status |= (u8)vector;
6949 +- vmcs_write16(GUEST_INTR_STATUS, status);
6950 +- }
6951 +-}
6952 +-
6953 +-static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6954 +-{
6955 +- /*
6956 +- * When running L2, updating RVI is only relevant when
6957 +- * vmcs12 virtual-interrupt-delivery enabled.
6958 +- * However, it can be enabled only when L1 also
6959 +- * intercepts external-interrupts and in that case
6960 +- * we should not update vmcs02 RVI but instead intercept
6961 +- * interrupt. Therefore, do nothing when running L2.
6962 +- */
6963 +- if (!is_guest_mode(vcpu))
6964 +- vmx_set_rvi(max_irr);
6965 +-}
6966 +-
6967 +-static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6968 +-{
6969 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
6970 +- int max_irr;
6971 +- bool max_irr_updated;
6972 +-
6973 +- WARN_ON(!vcpu->arch.apicv_active);
6974 +- if (pi_test_on(&vmx->pi_desc)) {
6975 +- pi_clear_on(&vmx->pi_desc);
6976 +- /*
6977 +- * IOMMU can write to PID.ON, so the barrier matters even on UP.
6978 +- * But on x86 this is just a compiler barrier anyway.
6979 +- */
6980 +- smp_mb__after_atomic();
6981 +- max_irr_updated =
6982 +- kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
6983 +-
6984 +- /*
6985 +- * If we are running L2 and L1 has a new pending interrupt
6986 +- * which can be injected, we should re-evaluate
6987 +- * what should be done with this new L1 interrupt.
6988 +- * If L1 intercepts external-interrupts, we should
6989 +- * exit from L2 to L1. Otherwise, interrupt should be
6990 +- * delivered directly to L2.
6991 +- */
6992 +- if (is_guest_mode(vcpu) && max_irr_updated) {
6993 +- if (nested_exit_on_intr(vcpu))
6994 +- kvm_vcpu_exiting_guest_mode(vcpu);
6995 +- else
6996 +- kvm_make_request(KVM_REQ_EVENT, vcpu);
6997 +- }
6998 +- } else {
6999 +- max_irr = kvm_lapic_find_highest_irr(vcpu);
7000 +- }
7001 +- vmx_hwapic_irr_update(vcpu, max_irr);
7002 +- return max_irr;
7003 +-}
7004 +-
7005 +-static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
7006 +-{
7007 +- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
7008 +-
7009 +- return pi_test_on(pi_desc) ||
7010 +- (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
7011 +-}
7012 +-
7013 +-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
7014 +-{
7015 +- if (!kvm_vcpu_apicv_active(vcpu))
7016 +- return;
7017 +-
7018 +- vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
7019 +- vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
7020 +- vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
7021 +- vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
7022 +-}
7023 +-
7024 +-static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
7025 +-{
7026 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
7027 +-
7028 +- pi_clear_on(&vmx->pi_desc);
7029 +- memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
7030 +-}
7031 +-
7032 +-static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
7033 +-{
7034 +- vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7035 +-
7036 +- /* if exit due to PF check for async PF */
7037 +- if (is_page_fault(vmx->exit_intr_info))
7038 +- vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
7039 +-
7040 +- /* Handle machine checks before interrupts are enabled */
7041 +- if (is_machine_check(vmx->exit_intr_info))
7042 +- kvm_machine_check();
7043 +-
7044 +- /* We need to handle NMIs before interrupts are enabled */
7045 +- if (is_nmi(vmx->exit_intr_info)) {
7046 +- kvm_before_interrupt(&vmx->vcpu);
7047 +- asm("int $2");
7048 +- kvm_after_interrupt(&vmx->vcpu);
7049 +- }
7050 +-}
7051 +-
7052 +-static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
7053 +-{
7054 +- unsigned int vector;
7055 +- unsigned long entry;
7056 +-#ifdef CONFIG_X86_64
7057 +- unsigned long tmp;
7058 +-#endif
7059 +- gate_desc *desc;
7060 +- u32 intr_info;
7061 +-
7062 +- intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7063 +- if (WARN_ONCE(!is_external_intr(intr_info),
7064 +- "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
7065 +- return;
7066 +-
7067 +- vector = intr_info & INTR_INFO_VECTOR_MASK;
7068 +- desc = (gate_desc *)host_idt_base + vector;
7069 +- entry = gate_offset(desc);
7070 +-
7071 +- kvm_before_interrupt(vcpu);
7072 +-
7073 +- asm volatile(
7074 +-#ifdef CONFIG_X86_64
7075 +- "mov %%" _ASM_SP ", %[sp]\n\t"
7076 +- "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
7077 +- "push $%c[ss]\n\t"
7078 +- "push %[sp]\n\t"
7079 +-#endif
7080 +- "pushf\n\t"
7081 +- __ASM_SIZE(push) " $%c[cs]\n\t"
7082 +- CALL_NOSPEC
7083 +- :
7084 +-#ifdef CONFIG_X86_64
7085 +- [sp]"=&r"(tmp),
7086 +-#endif
7087 +- ASM_CALL_CONSTRAINT
7088 +- :
7089 +- THUNK_TARGET(entry),
7090 +- [ss]"i"(__KERNEL_DS),
7091 +- [cs]"i"(__KERNEL_CS)
7092 +- );
7093 +-
7094 +- kvm_after_interrupt(vcpu);
7095 +-}
7096 +-STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
7097 +-
7098 +-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
7099 +- enum exit_fastpath_completion *exit_fastpath)
7100 +-{
7101 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
7102 +-
7103 +- if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
7104 +- handle_external_interrupt_irqoff(vcpu);
7105 +- else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
7106 +- handle_exception_nmi_irqoff(vmx);
7107 +- else if (!is_guest_mode(vcpu) &&
7108 +- vmx->exit_reason == EXIT_REASON_MSR_WRITE)
7109 +- *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
7110 +-}
7111 +-
7112 +-static bool vmx_has_emulated_msr(int index)
7113 +-{
7114 +- switch (index) {
7115 +- case MSR_IA32_SMBASE:
7116 +- /*
7117 +- * We cannot do SMM unless we can run the guest in big
7118 +- * real mode.
7119 +- */
7120 +- return enable_unrestricted_guest || emulate_invalid_guest_state;
7121 +- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
7122 +- return nested;
7123 +- case MSR_AMD64_VIRT_SPEC_CTRL:
7124 +- /* This is AMD only. */
7125 +- return false;
7126 +- default:
7127 +- return true;
7128 +- }
7129 +-}
7130 +-
7131 +-static bool vmx_pt_supported(void)
7132 +-{
7133 +- return pt_mode == PT_MODE_HOST_GUEST;
7134 +-}
7135 +-
7136 +-static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
7137 +-{
7138 +- u32 exit_intr_info;
7139 +- bool unblock_nmi;
7140 +- u8 vector;
7141 +- bool idtv_info_valid;
7142 +-
7143 +- idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7144 +-
7145 +- if (enable_vnmi) {
7146 +- if (vmx->loaded_vmcs->nmi_known_unmasked)
7147 +- return;
7148 +- /*
7149 +- * Can't use vmx->exit_intr_info since we're not sure what
7150 +- * the exit reason is.
7151 +- */
7152 +- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7153 +- unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
7154 +- vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
7155 +- /*
7156 +- * SDM 3: 27.7.1.2 (September 2008)
7157 +- * Re-set bit "block by NMI" before VM entry if vmexit caused by
7158 +- * a guest IRET fault.
7159 +- * SDM 3: 23.2.2 (September 2008)
7160 +- * Bit 12 is undefined in any of the following cases:
7161 +- * If the VM exit sets the valid bit in the IDT-vectoring
7162 +- * information field.
7163 +- * If the VM exit is due to a double fault.
7164 +- */
7165 +- if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
7166 +- vector != DF_VECTOR && !idtv_info_valid)
7167 +- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7168 +- GUEST_INTR_STATE_NMI);
7169 +- else
7170 +- vmx->loaded_vmcs->nmi_known_unmasked =
7171 +- !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
7172 +- & GUEST_INTR_STATE_NMI);
7173 +- } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7174 +- vmx->loaded_vmcs->vnmi_blocked_time +=
7175 +- ktime_to_ns(ktime_sub(ktime_get(),
7176 +- vmx->loaded_vmcs->entry_time));
7177 +-}
7178 +-
7179 +-static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7180 +- u32 idt_vectoring_info,
7181 +- int instr_len_field,
7182 +- int error_code_field)
7183 +-{
7184 +- u8 vector;
7185 +- int type;
7186 +- bool idtv_info_valid;
7187 +-
7188 +- idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7189 +-
7190 +- vcpu->arch.nmi_injected = false;
7191 +- kvm_clear_exception_queue(vcpu);
7192 +- kvm_clear_interrupt_queue(vcpu);
7193 +-
7194 +- if (!idtv_info_valid)
7195 +- return;
7196 +-
7197 +- kvm_make_request(KVM_REQ_EVENT, vcpu);
7198 +-
7199 +- vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7200 +- type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
7201 +-
7202 +- switch (type) {
7203 +- case INTR_TYPE_NMI_INTR:
7204 +- vcpu->arch.nmi_injected = true;
7205 +- /*
7206 +- * SDM 3: 27.7.1.2 (September 2008)
7207 +- * Clear bit "block by NMI" before VM entry if a NMI
7208 +- * delivery faulted.
7209 +- */
7210 +- vmx_set_nmi_mask(vcpu, false);
7211 +- break;
7212 +- case INTR_TYPE_SOFT_EXCEPTION:
7213 +- vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7214 +- /* fall through */
7215 +- case INTR_TYPE_HARD_EXCEPTION:
7216 +- if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
7217 +- u32 err = vmcs_read32(error_code_field);
7218 +- kvm_requeue_exception_e(vcpu, vector, err);
7219 +- } else
7220 +- kvm_requeue_exception(vcpu, vector);
7221 +- break;
7222 +- case INTR_TYPE_SOFT_INTR:
7223 +- vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7224 +- /* fall through */
7225 +- case INTR_TYPE_EXT_INTR:
7226 +- kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
7227 +- break;
7228 +- default:
7229 +- break;
7230 +- }
7231 +-}
7232 +-
7233 +-static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
7234 +-{
7235 +- __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
7236 +- VM_EXIT_INSTRUCTION_LEN,
7237 +- IDT_VECTORING_ERROR_CODE);
7238 +-}
7239 +-
7240 +-static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
7241 +-{
7242 +- __vmx_complete_interrupts(vcpu,
7243 +- vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7244 +- VM_ENTRY_INSTRUCTION_LEN,
7245 +- VM_ENTRY_EXCEPTION_ERROR_CODE);
7246 +-
7247 +- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
7248 +-}
7249 +-
7250 +-static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
7251 +-{
7252 +- int i, nr_msrs;
7253 +- struct perf_guest_switch_msr *msrs;
7254 +-
7255 +- msrs = perf_guest_get_msrs(&nr_msrs);
7256 +-
7257 +- if (!msrs)
7258 +- return;
7259 +-
7260 +- for (i = 0; i < nr_msrs; i++)
7261 +- if (msrs[i].host == msrs[i].guest)
7262 +- clear_atomic_switch_msr(vmx, msrs[i].msr);
7263 +- else
7264 +- add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
7265 +- msrs[i].host, false);
7266 +-}
7267 +-
7268 +-static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx)
7269 +-{
7270 +- u32 host_umwait_control;
7271 +-
7272 +- if (!vmx_has_waitpkg(vmx))
7273 +- return;
7274 +-
7275 +- host_umwait_control = get_umwait_control_msr();
7276 +-
7277 +- if (vmx->msr_ia32_umwait_control != host_umwait_control)
7278 +- add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL,
7279 +- vmx->msr_ia32_umwait_control,
7280 +- host_umwait_control, false);
7281 +- else
7282 +- clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL);
7283 +-}
7284 +-
7285 +-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
7286 +-{
7287 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
7288 +- u64 tscl;
7289 +- u32 delta_tsc;
7290 +-
7291 +- if (vmx->req_immediate_exit) {
7292 +- vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
7293 +- vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7294 +- } else if (vmx->hv_deadline_tsc != -1) {
7295 +- tscl = rdtsc();
7296 +- if (vmx->hv_deadline_tsc > tscl)
7297 +- /* set_hv_timer ensures the delta fits in 32-bits */
7298 +- delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
7299 +- cpu_preemption_timer_multi);
7300 +- else
7301 +- delta_tsc = 0;
7302 +-
7303 +- vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
7304 +- vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7305 +- } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
7306 +- vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
7307 +- vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7308 +- }
7309 +-}
7310 +-
7311 +-void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
7312 +-{
7313 +- if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
7314 +- vmx->loaded_vmcs->host_state.rsp = host_rsp;
7315 +- vmcs_writel(HOST_RSP, host_rsp);
7316 +- }
7317 +-}
7318 +-
7319 +-bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
7320 +-
7321 +-static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
7322 +-{
7323 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
7324 +- unsigned long cr3, cr4;
7325 +-
7326 +- /* Record the guest's net vcpu time for enforced NMI injections. */
7327 +- if (unlikely(!enable_vnmi &&
7328 +- vmx->loaded_vmcs->soft_vnmi_blocked))
7329 +- vmx->loaded_vmcs->entry_time = ktime_get();
7330 +-
7331 +- /* Don't enter VMX if guest state is invalid, let the exit handler
7332 +- start emulation until we arrive back to a valid state */
7333 +- if (vmx->emulation_required)
7334 +- return;
7335 +-
7336 +- if (vmx->ple_window_dirty) {
7337 +- vmx->ple_window_dirty = false;
7338 +- vmcs_write32(PLE_WINDOW, vmx->ple_window);
7339 +- }
7340 +-
7341 +- if (vmx->nested.need_vmcs12_to_shadow_sync)
7342 +- nested_sync_vmcs12_to_shadow(vcpu);
7343 +-
7344 +- if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
7345 +- vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
7346 +- if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
7347 +- vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
7348 +-
7349 +- cr3 = __get_current_cr3_fast();
7350 +- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7351 +- vmcs_writel(HOST_CR3, cr3);
7352 +- vmx->loaded_vmcs->host_state.cr3 = cr3;
7353 +- }
7354 +-
7355 +- cr4 = cr4_read_shadow();
7356 +- if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7357 +- vmcs_writel(HOST_CR4, cr4);
7358 +- vmx->loaded_vmcs->host_state.cr4 = cr4;
7359 +- }
7360 +-
7361 +- /* When single-stepping over STI and MOV SS, we must clear the
7362 +- * corresponding interruptibility bits in the guest state. Otherwise
7363 +- * vmentry fails as it then expects bit 14 (BS) in pending debug
7364 +- * exceptions being set, but that's not correct for the guest debugging
7365 +- * case. */
7366 +- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7367 +- vmx_set_interrupt_shadow(vcpu, 0);
7368 +-
7369 +- kvm_load_guest_xsave_state(vcpu);
7370 +-
7371 +- if (static_cpu_has(X86_FEATURE_PKU) &&
7372 +- kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
7373 +- vcpu->arch.pkru != vmx->host_pkru)
7374 +- __write_pkru(vcpu->arch.pkru);
7375 +-
7376 +- pt_guest_enter(vmx);
7377 +-
7378 +- atomic_switch_perf_msrs(vmx);
7379 +- atomic_switch_umwait_control_msr(vmx);
7380 +-
7381 +- if (enable_preemption_timer)
7382 +- vmx_update_hv_timer(vcpu);
7383 +-
7384 +- if (lapic_in_kernel(vcpu) &&
7385 +- vcpu->arch.apic->lapic_timer.timer_advance_ns)
7386 +- kvm_wait_lapic_expire(vcpu);
7387 +-
7388 +- /*
7389 +- * If this vCPU has touched SPEC_CTRL, restore the guest's value if
7390 +- * it's non-zero. Since vmentry is serialising on affected CPUs, there
7391 +- * is no need to worry about the conditional branch over the wrmsr
7392 +- * being speculatively taken.
7393 +- */
7394 +- x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
7395 +-
7396 +- /* L1D Flush includes CPU buffer clear to mitigate MDS */
7397 +- if (static_branch_unlikely(&vmx_l1d_should_flush))
7398 +- vmx_l1d_flush(vcpu);
7399 +- else if (static_branch_unlikely(&mds_user_clear))
7400 +- mds_clear_cpu_buffers();
7401 +-
7402 +- if (vcpu->arch.cr2 != read_cr2())
7403 +- write_cr2(vcpu->arch.cr2);
7404 +-
7405 +- vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
7406 +- vmx->loaded_vmcs->launched);
7407 +-
7408 +- vcpu->arch.cr2 = read_cr2();
7409 +-
7410 +- /*
7411 +- * We do not use IBRS in the kernel. If this vCPU has used the
7412 +- * SPEC_CTRL MSR it may have left it on; save the value and
7413 +- * turn it off. This is much more efficient than blindly adding
7414 +- * it to the atomic save/restore list. Especially as the former
7415 +- * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
7416 +- *
7417 +- * For non-nested case:
7418 +- * If the L01 MSR bitmap does not intercept the MSR, then we need to
7419 +- * save it.
7420 +- *
7421 +- * For nested case:
7422 +- * If the L02 MSR bitmap does not intercept the MSR, then we need to
7423 +- * save it.
7424 +- */
7425 +- if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
7426 +- vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
7427 +-
7428 +- x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
7429 +-
7430 +- /* All fields are clean at this point */
7431 +- if (static_branch_unlikely(&enable_evmcs))
7432 +- current_evmcs->hv_clean_fields |=
7433 +- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
7434 +-
7435 +- if (static_branch_unlikely(&enable_evmcs))
7436 +- current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
7437 +-
7438 +- /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7439 +- if (vmx->host_debugctlmsr)
7440 +- update_debugctlmsr(vmx->host_debugctlmsr);
7441 +-
7442 +-#ifndef CONFIG_X86_64
7443 +- /*
7444 +- * The sysexit path does not restore ds/es, so we must set them to
7445 +- * a reasonable value ourselves.
7446 +- *
7447 +- * We can't defer this to vmx_prepare_switch_to_host() since that
7448 +- * function may be executed in interrupt context, which saves and
7449 +- * restore segments around it, nullifying its effect.
7450 +- */
7451 +- loadsegment(ds, __USER_DS);
7452 +- loadsegment(es, __USER_DS);
7453 +-#endif
7454 +-
7455 +- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
7456 +- | (1 << VCPU_EXREG_RFLAGS)
7457 +- | (1 << VCPU_EXREG_PDPTR)
7458 +- | (1 << VCPU_EXREG_SEGMENTS)
7459 +- | (1 << VCPU_EXREG_CR3));
7460 +- vcpu->arch.regs_dirty = 0;
7461 +-
7462 +- pt_guest_exit(vmx);
7463 +-
7464 +- /*
7465 +- * eager fpu is enabled if PKEY is supported and CR4 is switched
7466 +- * back on host, so it is safe to read guest PKRU from current
7467 +- * XSAVE.
7468 +- */
7469 +- if (static_cpu_has(X86_FEATURE_PKU) &&
7470 +- kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
7471 +- vcpu->arch.pkru = rdpkru();
7472 +- if (vcpu->arch.pkru != vmx->host_pkru)
7473 +- __write_pkru(vmx->host_pkru);
7474 +- }
7475 +-
7476 +- kvm_load_host_xsave_state(vcpu);
7477 +-
7478 +- vmx->nested.nested_run_pending = 0;
7479 +- vmx->idt_vectoring_info = 0;
7480 +-
7481 +- vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
7482 +- if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
7483 +- kvm_machine_check();
7484 +-
7485 +- if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
7486 +- return;
7487 +-
7488 +- vmx->loaded_vmcs->launched = 1;
7489 +- vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
7490 +-
7491 +- vmx_recover_nmi_blocking(vmx);
7492 +- vmx_complete_interrupts(vmx);
7493 +-}
7494 +-
7495 +-static struct kvm *vmx_vm_alloc(void)
7496 +-{
7497 +- struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
7498 +- GFP_KERNEL_ACCOUNT | __GFP_ZERO,
7499 +- PAGE_KERNEL);
7500 +- return &kvm_vmx->kvm;
7501 +-}
7502 +-
7503 +-static void vmx_vm_free(struct kvm *kvm)
7504 +-{
7505 +- kfree(kvm->arch.hyperv.hv_pa_pg);
7506 +- vfree(to_kvm_vmx(kvm));
7507 +-}
7508 +-
7509 +-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
7510 +-{
7511 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
7512 +-
7513 +- if (enable_pml)
7514 +- vmx_destroy_pml_buffer(vmx);
7515 +- free_vpid(vmx->vpid);
7516 +- nested_vmx_free_vcpu(vcpu);
7517 +- free_loaded_vmcs(vmx->loaded_vmcs);
7518 +- kvm_vcpu_uninit(vcpu);
7519 +- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
7520 +- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
7521 +- kmem_cache_free(kvm_vcpu_cache, vmx);
7522 +-}
7523 +-
7524 +-static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
7525 +-{
7526 +- int err;
7527 +- struct vcpu_vmx *vmx;
7528 +- unsigned long *msr_bitmap;
7529 +- int i, cpu;
7530 +-
7531 +- BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0,
7532 +- "struct kvm_vcpu must be at offset 0 for arch usercopy region");
7533 +-
7534 +- vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
7535 +- if (!vmx)
7536 +- return ERR_PTR(-ENOMEM);
7537 +-
7538 +- vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
7539 +- GFP_KERNEL_ACCOUNT);
7540 +- if (!vmx->vcpu.arch.user_fpu) {
7541 +- printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
7542 +- err = -ENOMEM;
7543 +- goto free_partial_vcpu;
7544 +- }
7545 +-
7546 +- vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
7547 +- GFP_KERNEL_ACCOUNT);
7548 +- if (!vmx->vcpu.arch.guest_fpu) {
7549 +- printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
7550 +- err = -ENOMEM;
7551 +- goto free_user_fpu;
7552 +- }
7553 +-
7554 +- vmx->vpid = allocate_vpid();
7555 +-
7556 +- err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
7557 +- if (err)
7558 +- goto free_vcpu;
7559 +-
7560 +- err = -ENOMEM;
7561 +-
7562 +- /*
7563 +- * If PML is turned on, failure on enabling PML just results in failure
7564 +- * of creating the vcpu, therefore we can simplify PML logic (by
7565 +- * avoiding dealing with cases, such as enabling PML partially on vcpus
7566 +- * for the guest), etc.
7567 +- */
7568 +- if (enable_pml) {
7569 +- vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7570 +- if (!vmx->pml_pg)
7571 +- goto uninit_vcpu;
7572 +- }
7573 +-
7574 +- BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS);
7575 +-
7576 +- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
7577 +- u32 index = vmx_msr_index[i];
7578 +- u32 data_low, data_high;
7579 +- int j = vmx->nmsrs;
7580 +-
7581 +- if (rdmsr_safe(index, &data_low, &data_high) < 0)
7582 +- continue;
7583 +- if (wrmsr_safe(index, data_low, data_high) < 0)
7584 +- continue;
7585 +-
7586 +- vmx->guest_msrs[j].index = i;
7587 +- vmx->guest_msrs[j].data = 0;
7588 +- switch (index) {
7589 +- case MSR_IA32_TSX_CTRL:
7590 +- /*
7591 +- * No need to pass TSX_CTRL_CPUID_CLEAR through, so
7592 +- * let's avoid changing CPUID bits under the host
7593 +- * kernel's feet.
7594 +- */
7595 +- vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7596 +- break;
7597 +- default:
7598 +- vmx->guest_msrs[j].mask = -1ull;
7599 +- break;
7600 +- }
7601 +- ++vmx->nmsrs;
7602 +- }
7603 +-
7604 +- err = alloc_loaded_vmcs(&vmx->vmcs01);
7605 +- if (err < 0)
7606 +- goto free_pml;
7607 +-
7608 +- msr_bitmap = vmx->vmcs01.msr_bitmap;
7609 +- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
7610 +- vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
7611 +- vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
7612 +- vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
7613 +- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
7614 +- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
7615 +- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
7616 +- if (kvm_cstate_in_guest(kvm)) {
7617 +- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
7618 +- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
7619 +- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
7620 +- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
7621 +- }
7622 +- vmx->msr_bitmap_mode = 0;
7623 +-
7624 +- vmx->loaded_vmcs = &vmx->vmcs01;
7625 +- cpu = get_cpu();
7626 +- vmx_vcpu_load(&vmx->vcpu, cpu);
7627 +- vmx->vcpu.cpu = cpu;
7628 +- init_vmcs(vmx);
7629 +- vmx_vcpu_put(&vmx->vcpu);
7630 +- put_cpu();
7631 +- if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
7632 +- err = alloc_apic_access_page(kvm);
7633 +- if (err)
7634 +- goto free_vmcs;
7635 +- }
7636 +-
7637 +- if (enable_ept && !enable_unrestricted_guest) {
7638 +- err = init_rmode_identity_map(kvm);
7639 +- if (err)
7640 +- goto free_vmcs;
7641 +- }
7642 +-
7643 +- if (nested)
7644 +- nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
7645 +- vmx_capability.ept,
7646 +- kvm_vcpu_apicv_active(&vmx->vcpu));
7647 +- else
7648 +- memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
7649 +-
7650 +- vmx->nested.posted_intr_nv = -1;
7651 +- vmx->nested.current_vmptr = -1ull;
7652 +-
7653 +- vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
7654 +-
7655 +- /*
7656 +- * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
7657 +- * or POSTED_INTR_WAKEUP_VECTOR.
7658 +- */
7659 +- vmx->pi_desc.nv = POSTED_INTR_VECTOR;
7660 +- vmx->pi_desc.sn = 1;
7661 +-
7662 +- vmx->ept_pointer = INVALID_PAGE;
7663 +-
7664 +- return &vmx->vcpu;
7665 +-
7666 +-free_vmcs:
7667 +- free_loaded_vmcs(vmx->loaded_vmcs);
7668 +-free_pml:
7669 +- vmx_destroy_pml_buffer(vmx);
7670 +-uninit_vcpu:
7671 +- kvm_vcpu_uninit(&vmx->vcpu);
7672 +-free_vcpu:
7673 +- free_vpid(vmx->vpid);
7674 +- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
7675 +-free_user_fpu:
7676 +- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
7677 +-free_partial_vcpu:
7678 +- kmem_cache_free(kvm_vcpu_cache, vmx);
7679 +- return ERR_PTR(err);
7680 +-}
7681 +-
7682 +-#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7683 +-#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7684 +-
7685 +-static int vmx_vm_init(struct kvm *kvm)
7686 +-{
7687 +- spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
7688 +-
7689 +- if (!ple_gap)
7690 +- kvm->arch.pause_in_guest = true;
7691 +-
7692 +- if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7693 +- switch (l1tf_mitigation) {
7694 +- case L1TF_MITIGATION_OFF:
7695 +- case L1TF_MITIGATION_FLUSH_NOWARN:
7696 +- /* 'I explicitly don't care' is set */
7697 +- break;
7698 +- case L1TF_MITIGATION_FLUSH:
7699 +- case L1TF_MITIGATION_FLUSH_NOSMT:
7700 +- case L1TF_MITIGATION_FULL:
7701 +- /*
7702 +- * Warn upon starting the first VM in a potentially
7703 +- * insecure environment.
7704 +- */
7705 +- if (sched_smt_active())
7706 +- pr_warn_once(L1TF_MSG_SMT);
7707 +- if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7708 +- pr_warn_once(L1TF_MSG_L1D);
7709 +- break;
7710 +- case L1TF_MITIGATION_FULL_FORCE:
7711 +- /* Flush is enforced */
7712 +- break;
7713 +- }
7714 +- }
7715 +- return 0;
7716 +-}
7717 +-
7718 +-static int __init vmx_check_processor_compat(void)
7719 +-{
7720 +- struct vmcs_config vmcs_conf;
7721 +- struct vmx_capability vmx_cap;
7722 +-
7723 +- if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
7724 +- return -EIO;
7725 +- if (nested)
7726 +- nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
7727 +- enable_apicv);
7728 +- if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
7729 +- printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
7730 +- smp_processor_id());
7731 +- return -EIO;
7732 +- }
7733 +- return 0;
7734 +-}
7735 +-
7736 +-static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7737 +-{
7738 +- u8 cache;
7739 +- u64 ipat = 0;
7740 +-
7741 +- /* For VT-d and EPT combination
7742 +- * 1. MMIO: always map as UC
7743 +- * 2. EPT with VT-d:
7744 +- * a. VT-d without snooping control feature: can't guarantee the
7745 +- * result, try to trust guest.
7746 +- * b. VT-d with snooping control feature: snooping control feature of
7747 +- * VT-d engine can guarantee the cache correctness. Just set it
7748 +- * to WB to keep consistent with host. So the same as item 3.
7749 +- * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
7750 +- * consistent with host MTRR
7751 +- */
7752 +- if (is_mmio) {
7753 +- cache = MTRR_TYPE_UNCACHABLE;
7754 +- goto exit;
7755 +- }
7756 +-
7757 +- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
7758 +- ipat = VMX_EPT_IPAT_BIT;
7759 +- cache = MTRR_TYPE_WRBACK;
7760 +- goto exit;
7761 +- }
7762 +-
7763 +- if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
7764 +- ipat = VMX_EPT_IPAT_BIT;
7765 +- if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
7766 +- cache = MTRR_TYPE_WRBACK;
7767 +- else
7768 +- cache = MTRR_TYPE_UNCACHABLE;
7769 +- goto exit;
7770 +- }
7771 +-
7772 +- cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
7773 +-
7774 +-exit:
7775 +- return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
7776 +-}
7777 +-
7778 +-static int vmx_get_lpage_level(void)
7779 +-{
7780 +- if (enable_ept && !cpu_has_vmx_ept_1g_page())
7781 +- return PT_DIRECTORY_LEVEL;
7782 +- else
7783 +- /* For shadow and EPT supported 1GB page */
7784 +- return PT_PDPE_LEVEL;
7785 +-}
7786 +-
7787 +-static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
7788 +-{
7789 +- /*
7790 +- * These bits in the secondary execution controls field
7791 +- * are dynamic, the others are mostly based on the hypervisor
7792 +- * architecture and the guest's CPUID. Do not touch the
7793 +- * dynamic bits.
7794 +- */
7795 +- u32 mask =
7796 +- SECONDARY_EXEC_SHADOW_VMCS |
7797 +- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7798 +- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7799 +- SECONDARY_EXEC_DESC;
7800 +-
7801 +- u32 new_ctl = vmx->secondary_exec_control;
7802 +- u32 cur_ctl = secondary_exec_controls_get(vmx);
7803 +-
7804 +- secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
7805 +-}
7806 +-
7807 +-/*
7808 +- * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7809 +- * (indicating "allowed-1") if they are supported in the guest's CPUID.
7810 +- */
7811 +-static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7812 +-{
7813 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
7814 +- struct kvm_cpuid_entry2 *entry;
7815 +-
7816 +- vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7817 +- vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
7818 +-
7819 +-#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
7820 +- if (entry && (entry->_reg & (_cpuid_mask))) \
7821 +- vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
7822 +-} while (0)
7823 +-
7824 +- entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
7825 +- cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
7826 +- cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
7827 +- cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
7828 +- cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
7829 +- cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
7830 +- cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
7831 +- cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
7832 +- cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
7833 +- cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
7834 +- cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
7835 +- cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
7836 +- cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
7837 +- cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
7838 +- cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
7839 +-
7840 +- entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
7841 +- cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
7842 +- cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
7843 +- cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
7844 +- cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
7845 +- cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
7846 +- cr4_fixed1_update(X86_CR4_LA57, ecx, bit(X86_FEATURE_LA57));
7847 +-
7848 +-#undef cr4_fixed1_update
7849 +-}
7850 +-
7851 +-static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
7852 +-{
7853 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
7854 +-
7855 +- if (kvm_mpx_supported()) {
7856 +- bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
7857 +-
7858 +- if (mpx_enabled) {
7859 +- vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
7860 +- vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
7861 +- } else {
7862 +- vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
7863 +- vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
7864 +- }
7865 +- }
7866 +-}
7867 +-
7868 +-static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7869 +-{
7870 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
7871 +- struct kvm_cpuid_entry2 *best = NULL;
7872 +- int i;
7873 +-
7874 +- for (i = 0; i < PT_CPUID_LEAVES; i++) {
7875 +- best = kvm_find_cpuid_entry(vcpu, 0x14, i);
7876 +- if (!best)
7877 +- return;
7878 +- vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7879 +- vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7880 +- vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7881 +- vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7882 +- }
7883 +-
7884 +- /* Get the number of configurable Address Ranges for filtering */
7885 +- vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
7886 +- PT_CAP_num_address_ranges);
7887 +-
7888 +- /* Initialize and clear the no dependency bits */
7889 +- vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7890 +- RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
7891 +-
7892 +- /*
7893 +- * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
7894 +- * will inject an #GP
7895 +- */
7896 +- if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7897 +- vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7898 +-
7899 +- /*
7900 +- * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7901 +- * PSBFreq can be set
7902 +- */
7903 +- if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7904 +- vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7905 +- RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7906 +-
7907 +- /*
7908 +- * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
7909 +- * MTCFreq can be set
7910 +- */
7911 +- if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7912 +- vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7913 +- RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
7914 +-
7915 +- /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7916 +- if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7917 +- vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7918 +- RTIT_CTL_PTW_EN);
7919 +-
7920 +- /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7921 +- if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7922 +- vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7923 +-
7924 +- /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7925 +- if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7926 +- vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7927 +-
7928 +- /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */
7929 +- if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7930 +- vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7931 +-
7932 +- /* unmask address range configure area */
7933 +- for (i = 0; i < vmx->pt_desc.addr_range; i++)
7934 +- vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
7935 +-}
7936 +-
7937 +-static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
7938 +-{
7939 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
7940 +-
7941 +- /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
7942 +- vcpu->arch.xsaves_enabled = false;
7943 +-
7944 +- if (cpu_has_secondary_exec_ctrls()) {
7945 +- vmx_compute_secondary_exec_control(vmx);
7946 +- vmcs_set_secondary_exec_control(vmx);
7947 +- }
7948 +-
7949 +- if (nested_vmx_allowed(vcpu))
7950 +- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
7951 +- FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX |
7952 +- FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
7953 +- else
7954 +- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
7955 +- ~(FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX |
7956 +- FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX);
7957 +-
7958 +- if (nested_vmx_allowed(vcpu)) {
7959 +- nested_vmx_cr_fixed1_bits_update(vcpu);
7960 +- nested_vmx_entry_exit_ctls_update(vcpu);
7961 +- }
7962 +-
7963 +- if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7964 +- guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7965 +- update_intel_pt_cfg(vcpu);
7966 +-
7967 +- if (boot_cpu_has(X86_FEATURE_RTM)) {
7968 +- struct shared_msr_entry *msr;
7969 +- msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
7970 +- if (msr) {
7971 +- bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7972 +- vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
7973 +- }
7974 +- }
7975 +-}
7976 +-
7977 +-static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
7978 +-{
7979 +- if (func == 1 && nested)
7980 +- entry->ecx |= bit(X86_FEATURE_VMX);
7981 +-}
7982 +-
7983 +-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
7984 +-{
7985 +- to_vmx(vcpu)->req_immediate_exit = true;
7986 +-}
7987 +-
7988 +-static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7989 +- struct x86_instruction_info *info,
7990 +- enum x86_intercept_stage stage)
7991 +-{
7992 +- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7993 +- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7994 +-
7995 +- /*
7996 +- * RDPID causes #UD if disabled through secondary execution controls.
7997 +- * Because it is marked as EmulateOnUD, we need to intercept it here.
7998 +- */
7999 +- if (info->intercept == x86_intercept_rdtscp &&
8000 +- !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
8001 +- ctxt->exception.vector = UD_VECTOR;
8002 +- ctxt->exception.error_code_valid = false;
8003 +- return X86EMUL_PROPAGATE_FAULT;
8004 +- }
8005 +-
8006 +- /* TODO: check more intercepts... */
8007 +- return X86EMUL_CONTINUE;
8008 +-}
8009 +-
8010 +-#ifdef CONFIG_X86_64
8011 +-/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
8012 +-static inline int u64_shl_div_u64(u64 a, unsigned int shift,
8013 +- u64 divisor, u64 *result)
8014 +-{
8015 +- u64 low = a << shift, high = a >> (64 - shift);
8016 +-
8017 +- /* To avoid the overflow on divq */
8018 +- if (high >= divisor)
8019 +- return 1;
8020 +-
8021 +- /* Low hold the result, high hold rem which is discarded */
8022 +- asm("divq %2\n\t" : "=a" (low), "=d" (high) :
8023 +- "rm" (divisor), "0" (low), "1" (high));
8024 +- *result = low;
8025 +-
8026 +- return 0;
8027 +-}
8028 +-
8029 +-static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
8030 +- bool *expired)
8031 +-{
8032 +- struct vcpu_vmx *vmx;
8033 +- u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
8034 +- struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
8035 +-
8036 +- if (kvm_mwait_in_guest(vcpu->kvm) ||
8037 +- kvm_can_post_timer_interrupt(vcpu))
8038 +- return -EOPNOTSUPP;
8039 +-
8040 +- vmx = to_vmx(vcpu);
8041 +- tscl = rdtsc();
8042 +- guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
8043 +- delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
8044 +- lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
8045 +- ktimer->timer_advance_ns);
8046 +-
8047 +- if (delta_tsc > lapic_timer_advance_cycles)
8048 +- delta_tsc -= lapic_timer_advance_cycles;
8049 +- else
8050 +- delta_tsc = 0;
8051 +-
8052 +- /* Convert to host delta tsc if tsc scaling is enabled */
8053 +- if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
8054 +- delta_tsc && u64_shl_div_u64(delta_tsc,
8055 +- kvm_tsc_scaling_ratio_frac_bits,
8056 +- vcpu->arch.tsc_scaling_ratio, &delta_tsc))
8057 +- return -ERANGE;
8058 +-
8059 +- /*
8060 +- * If the delta tsc can't fit in the 32 bit after the multi shift,
8061 +- * we can't use the preemption timer.
8062 +- * It's possible that it fits on later vmentries, but checking
8063 +- * on every vmentry is costly so we just use an hrtimer.
8064 +- */
8065 +- if (delta_tsc >> (cpu_preemption_timer_multi + 32))
8066 +- return -ERANGE;
8067 +-
8068 +- vmx->hv_deadline_tsc = tscl + delta_tsc;
8069 +- *expired = !delta_tsc;
8070 +- return 0;
8071 +-}
8072 +-
8073 +-static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8074 +-{
8075 +- to_vmx(vcpu)->hv_deadline_tsc = -1;
8076 +-}
8077 +-#endif
8078 +-
8079 +-static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
8080 +-{
8081 +- if (!kvm_pause_in_guest(vcpu->kvm))
8082 +- shrink_ple_window(vcpu);
8083 +-}
8084 +-
8085 +-static void vmx_slot_enable_log_dirty(struct kvm *kvm,
8086 +- struct kvm_memory_slot *slot)
8087 +-{
8088 +- kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
8089 +- kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
8090 +-}
8091 +-
8092 +-static void vmx_slot_disable_log_dirty(struct kvm *kvm,
8093 +- struct kvm_memory_slot *slot)
8094 +-{
8095 +- kvm_mmu_slot_set_dirty(kvm, slot);
8096 +-}
8097 +-
8098 +-static void vmx_flush_log_dirty(struct kvm *kvm)
8099 +-{
8100 +- kvm_flush_pml_buffers(kvm);
8101 +-}
8102 +-
8103 +-static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
8104 +-{
8105 +- struct vmcs12 *vmcs12;
8106 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
8107 +- gpa_t gpa, dst;
8108 +-
8109 +- if (is_guest_mode(vcpu)) {
8110 +- WARN_ON_ONCE(vmx->nested.pml_full);
8111 +-
8112 +- /*
8113 +- * Check if PML is enabled for the nested guest.
8114 +- * Whether eptp bit 6 is set is already checked
8115 +- * as part of A/D emulation.
8116 +- */
8117 +- vmcs12 = get_vmcs12(vcpu);
8118 +- if (!nested_cpu_has_pml(vmcs12))
8119 +- return 0;
8120 +-
8121 +- if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
8122 +- vmx->nested.pml_full = true;
8123 +- return 1;
8124 +- }
8125 +-
8126 +- gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
8127 +- dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
8128 +-
8129 +- if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
8130 +- offset_in_page(dst), sizeof(gpa)))
8131 +- return 0;
8132 +-
8133 +- vmcs12->guest_pml_index--;
8134 +- }
8135 +-
8136 +- return 0;
8137 +-}
8138 +-
8139 +-static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
8140 +- struct kvm_memory_slot *memslot,
8141 +- gfn_t offset, unsigned long mask)
8142 +-{
8143 +- kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
8144 +-}
8145 +-
8146 +-static void __pi_post_block(struct kvm_vcpu *vcpu)
8147 +-{
8148 +- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
8149 +- struct pi_desc old, new;
8150 +- unsigned int dest;
8151 +-
8152 +- do {
8153 +- old.control = new.control = pi_desc->control;
8154 +- WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
8155 +- "Wakeup handler not enabled while the VCPU is blocked\n");
8156 +-
8157 +- dest = cpu_physical_id(vcpu->cpu);
8158 +-
8159 +- if (x2apic_enabled())
8160 +- new.ndst = dest;
8161 +- else
8162 +- new.ndst = (dest << 8) & 0xFF00;
8163 +-
8164 +- /* set 'NV' to 'notification vector' */
8165 +- new.nv = POSTED_INTR_VECTOR;
8166 +- } while (cmpxchg64(&pi_desc->control, old.control,
8167 +- new.control) != old.control);
8168 +-
8169 +- if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
8170 +- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
8171 +- list_del(&vcpu->blocked_vcpu_list);
8172 +- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
8173 +- vcpu->pre_pcpu = -1;
8174 +- }
8175 +-}
8176 +-
8177 +-/*
8178 +- * This routine does the following things for vCPU which is going
8179 +- * to be blocked if VT-d PI is enabled.
8180 +- * - Store the vCPU to the wakeup list, so when interrupts happen
8181 +- * we can find the right vCPU to wake up.
8182 +- * - Change the Posted-interrupt descriptor as below:
8183 +- * 'NDST' <-- vcpu->pre_pcpu
8184 +- * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
8185 +- * - If 'ON' is set during this process, which means at least one
8186 +- * interrupt is posted for this vCPU, we cannot block it, in
8187 +- * this case, return 1, otherwise, return 0.
8188 +- *
8189 +- */
8190 +-static int pi_pre_block(struct kvm_vcpu *vcpu)
8191 +-{
8192 +- unsigned int dest;
8193 +- struct pi_desc old, new;
8194 +- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
8195 +-
8196 +- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
8197 +- !irq_remapping_cap(IRQ_POSTING_CAP) ||
8198 +- !kvm_vcpu_apicv_active(vcpu))
8199 +- return 0;
8200 +-
8201 +- WARN_ON(irqs_disabled());
8202 +- local_irq_disable();
8203 +- if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
8204 +- vcpu->pre_pcpu = vcpu->cpu;
8205 +- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
8206 +- list_add_tail(&vcpu->blocked_vcpu_list,
8207 +- &per_cpu(blocked_vcpu_on_cpu,
8208 +- vcpu->pre_pcpu));
8209 +- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
8210 +- }
8211 +-
8212 +- do {
8213 +- old.control = new.control = pi_desc->control;
8214 +-
8215 +- WARN((pi_desc->sn == 1),
8216 +- "Warning: SN field of posted-interrupts "
8217 +- "is set before blocking\n");
8218 +-
8219 +- /*
8220 +- * Since vCPU can be preempted during this process,
8221 +- * vcpu->cpu could be different with pre_pcpu, we
8222 +- * need to set pre_pcpu as the destination of wakeup
8223 +- * notification event, then we can find the right vCPU
8224 +- * to wakeup in wakeup handler if interrupts happen
8225 +- * when the vCPU is in blocked state.
8226 +- */
8227 +- dest = cpu_physical_id(vcpu->pre_pcpu);
8228 +-
8229 +- if (x2apic_enabled())
8230 +- new.ndst = dest;
8231 +- else
8232 +- new.ndst = (dest << 8) & 0xFF00;
8233 +-
8234 +- /* set 'NV' to 'wakeup vector' */
8235 +- new.nv = POSTED_INTR_WAKEUP_VECTOR;
8236 +- } while (cmpxchg64(&pi_desc->control, old.control,
8237 +- new.control) != old.control);
8238 +-
8239 +- /* We should not block the vCPU if an interrupt is posted for it. */
8240 +- if (pi_test_on(pi_desc) == 1)
8241 +- __pi_post_block(vcpu);
8242 +-
8243 +- local_irq_enable();
8244 +- return (vcpu->pre_pcpu == -1);
8245 +-}
8246 +-
8247 +-static int vmx_pre_block(struct kvm_vcpu *vcpu)
8248 +-{
8249 +- if (pi_pre_block(vcpu))
8250 +- return 1;
8251 +-
8252 +- if (kvm_lapic_hv_timer_in_use(vcpu))
8253 +- kvm_lapic_switch_to_sw_timer(vcpu);
8254 +-
8255 +- return 0;
8256 +-}
8257 +-
8258 +-static void pi_post_block(struct kvm_vcpu *vcpu)
8259 +-{
8260 +- if (vcpu->pre_pcpu == -1)
8261 +- return;
8262 +-
8263 +- WARN_ON(irqs_disabled());
8264 +- local_irq_disable();
8265 +- __pi_post_block(vcpu);
8266 +- local_irq_enable();
8267 +-}
8268 +-
8269 +-static void vmx_post_block(struct kvm_vcpu *vcpu)
8270 +-{
8271 +- if (kvm_x86_ops->set_hv_timer)
8272 +- kvm_lapic_switch_to_hv_timer(vcpu);
8273 +-
8274 +- pi_post_block(vcpu);
8275 +-}
8276 +-
8277 +-/*
8278 +- * vmx_update_pi_irte - set IRTE for Posted-Interrupts
8279 +- *
8280 +- * @kvm: kvm
8281 +- * @host_irq: host irq of the interrupt
8282 +- * @guest_irq: gsi of the interrupt
8283 +- * @set: set or unset PI
8284 +- * returns 0 on success, < 0 on failure
8285 +- */
8286 +-static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
8287 +- uint32_t guest_irq, bool set)
8288 +-{
8289 +- struct kvm_kernel_irq_routing_entry *e;
8290 +- struct kvm_irq_routing_table *irq_rt;
8291 +- struct kvm_lapic_irq irq;
8292 +- struct kvm_vcpu *vcpu;
8293 +- struct vcpu_data vcpu_info;
8294 +- int idx, ret = 0;
8295 +-
8296 +- if (!kvm_arch_has_assigned_device(kvm) ||
8297 +- !irq_remapping_cap(IRQ_POSTING_CAP) ||
8298 +- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
8299 +- return 0;
8300 +-
8301 +- idx = srcu_read_lock(&kvm->irq_srcu);
8302 +- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
8303 +- if (guest_irq >= irq_rt->nr_rt_entries ||
8304 +- hlist_empty(&irq_rt->map[guest_irq])) {
8305 +- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
8306 +- guest_irq, irq_rt->nr_rt_entries);
8307 +- goto out;
8308 +- }
8309 +-
8310 +- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
8311 +- if (e->type != KVM_IRQ_ROUTING_MSI)
8312 +- continue;
8313 +- /*
8314 +- * VT-d PI cannot support posting multicast/broadcast
8315 +- * interrupts to a vCPU, we still use interrupt remapping
8316 +- * for these kind of interrupts.
8317 +- *
8318 +- * For lowest-priority interrupts, we only support
8319 +- * those with single CPU as the destination, e.g. user
8320 +- * configures the interrupts via /proc/irq or uses
8321 +- * irqbalance to make the interrupts single-CPU.
8322 +- *
8323 +- * We will support full lowest-priority interrupt later.
8324 +- *
8325 +- * In addition, we can only inject generic interrupts using
8326 +- * the PI mechanism, refuse to route others through it.
8327 +- */
8328 +-
8329 +- kvm_set_msi_irq(kvm, e, &irq);
8330 +- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
8331 +- !kvm_irq_is_postable(&irq)) {
8332 +- /*
8333 +- * Make sure the IRTE is in remapped mode if
8334 +- * we don't handle it in posted mode.
8335 +- */
8336 +- ret = irq_set_vcpu_affinity(host_irq, NULL);
8337 +- if (ret < 0) {
8338 +- printk(KERN_INFO
8339 +- "failed to back to remapped mode, irq: %u\n",
8340 +- host_irq);
8341 +- goto out;
8342 +- }
8343 +-
8344 +- continue;
8345 +- }
8346 +-
8347 +- vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
8348 +- vcpu_info.vector = irq.vector;
8349 +-
8350 +- trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
8351 +- vcpu_info.vector, vcpu_info.pi_desc_addr, set);
8352 +-
8353 +- if (set)
8354 +- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
8355 +- else
8356 +- ret = irq_set_vcpu_affinity(host_irq, NULL);
8357 +-
8358 +- if (ret < 0) {
8359 +- printk(KERN_INFO "%s: failed to update PI IRTE\n",
8360 +- __func__);
8361 +- goto out;
8362 +- }
8363 +- }
8364 +-
8365 +- ret = 0;
8366 +-out:
8367 +- srcu_read_unlock(&kvm->irq_srcu, idx);
8368 +- return ret;
8369 +-}
8370 +-
8371 +-static void vmx_setup_mce(struct kvm_vcpu *vcpu)
8372 +-{
8373 +- if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8374 +- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
8375 +- FEATURE_CONTROL_LMCE;
8376 +- else
8377 +- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
8378 +- ~FEATURE_CONTROL_LMCE;
8379 +-}
8380 +-
8381 +-static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
8382 +-{
8383 +- /* we need a nested vmexit to enter SMM, postpone if run is pending */
8384 +- if (to_vmx(vcpu)->nested.nested_run_pending)
8385 +- return 0;
8386 +- return 1;
8387 +-}
8388 +-
8389 +-static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
8390 +-{
8391 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
8392 +-
8393 +- vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8394 +- if (vmx->nested.smm.guest_mode)
8395 +- nested_vmx_vmexit(vcpu, -1, 0, 0);
8396 +-
8397 +- vmx->nested.smm.vmxon = vmx->nested.vmxon;
8398 +- vmx->nested.vmxon = false;
8399 +- vmx_clear_hlt(vcpu);
8400 +- return 0;
8401 +-}
8402 +-
8403 +-static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
8404 +-{
8405 +- struct vcpu_vmx *vmx = to_vmx(vcpu);
8406 +- int ret;
8407 +-
8408 +- if (vmx->nested.smm.vmxon) {
8409 +- vmx->nested.vmxon = true;
8410 +- vmx->nested.smm.vmxon = false;
8411 +- }
8412 +-
8413 +- if (vmx->nested.smm.guest_mode) {
8414 +- ret = nested_vmx_enter_non_root_mode(vcpu, false);
8415 +- if (ret)
8416 +- return ret;
8417 +-
8418 +- vmx->nested.smm.guest_mode = false;
8419 +- }
8420 +- return 0;
8421 +-}
8422 +-
8423 +-static int enable_smi_window(struct kvm_vcpu *vcpu)
8424 +-{
8425 +- return 0;
8426 +-}
8427 +-
8428 +-static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
8429 +-{
8430 +- return false;
8431 +-}
8432 +-
8433 +-static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8434 +-{
8435 +- return to_vmx(vcpu)->nested.vmxon;
8436 +-}
8437 +-
8438 +-static __init int hardware_setup(void)
8439 +-{
8440 +- unsigned long host_bndcfgs;
8441 +- struct desc_ptr dt;
8442 +- int r, i;
8443 +-
8444 +- rdmsrl_safe(MSR_EFER, &host_efer);
8445 +-
8446 +- store_idt(&dt);
8447 +- host_idt_base = dt.address;
8448 +-
8449 +- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
8450 +- kvm_define_shared_msr(i, vmx_msr_index[i]);
8451 +-
8452 +- if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
8453 +- return -EIO;
8454 +-
8455 +- if (boot_cpu_has(X86_FEATURE_NX))
8456 +- kvm_enable_efer_bits(EFER_NX);
8457 +-
8458 +- if (boot_cpu_has(X86_FEATURE_MPX)) {
8459 +- rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
8460 +- WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
8461 +- }
8462 +-
8463 +- if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
8464 +- !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
8465 +- enable_vpid = 0;
8466 +-
8467 +- if (!cpu_has_vmx_ept() ||
8468 +- !cpu_has_vmx_ept_4levels() ||
8469 +- !cpu_has_vmx_ept_mt_wb() ||
8470 +- !cpu_has_vmx_invept_global())
8471 +- enable_ept = 0;
8472 +-
8473 +- if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
8474 +- enable_ept_ad_bits = 0;
8475 +-
8476 +- if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
8477 +- enable_unrestricted_guest = 0;
8478 +-
8479 +- if (!cpu_has_vmx_flexpriority())
8480 +- flexpriority_enabled = 0;
8481 +-
8482 +- if (!cpu_has_virtual_nmis())
8483 +- enable_vnmi = 0;
8484 +-
8485 +- /*
8486 +- * set_apic_access_page_addr() is used to reload apic access
8487 +- * page upon invalidation. No need to do anything if not
8488 +- * using the APIC_ACCESS_ADDR VMCS field.
8489 +- */
8490 +- if (!flexpriority_enabled)
8491 +- kvm_x86_ops->set_apic_access_page_addr = NULL;
8492 +-
8493 +- if (!cpu_has_vmx_tpr_shadow())
8494 +- kvm_x86_ops->update_cr8_intercept = NULL;
8495 +-
8496 +- if (enable_ept && !cpu_has_vmx_ept_2m_page())
8497 +- kvm_disable_largepages();
8498 +-
8499 +-#if IS_ENABLED(CONFIG_HYPERV)
8500 +- if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
8501 +- && enable_ept) {
8502 +- kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb;
8503 +- kvm_x86_ops->tlb_remote_flush_with_range =
8504 +- hv_remote_flush_tlb_with_range;
8505 +- }
8506 +-#endif
8507 +-
8508 +- if (!cpu_has_vmx_ple()) {
8509 +- ple_gap = 0;
8510 +- ple_window = 0;
8511 +- ple_window_grow = 0;
8512 +- ple_window_max = 0;
8513 +- ple_window_shrink = 0;
8514 +- }
8515 +-
8516 +- if (!cpu_has_vmx_apicv()) {
8517 +- enable_apicv = 0;
8518 +- kvm_x86_ops->sync_pir_to_irr = NULL;
8519 +- }
8520 +-
8521 +- if (cpu_has_vmx_tsc_scaling()) {
8522 +- kvm_has_tsc_control = true;
8523 +- kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8524 +- kvm_tsc_scaling_ratio_frac_bits = 48;
8525 +- }
8526 +-
8527 +- set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8528 +-
8529 +- if (enable_ept)
8530 +- vmx_enable_tdp();
8531 +- else
8532 +- kvm_disable_tdp();
8533 +-
8534 +- /*
8535 +- * Only enable PML when hardware supports PML feature, and both EPT
8536 +- * and EPT A/D bit features are enabled -- PML depends on them to work.
8537 +- */
8538 +- if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8539 +- enable_pml = 0;
8540 +-
8541 +- if (!enable_pml) {
8542 +- kvm_x86_ops->slot_enable_log_dirty = NULL;
8543 +- kvm_x86_ops->slot_disable_log_dirty = NULL;
8544 +- kvm_x86_ops->flush_log_dirty = NULL;
8545 +- kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
8546 +- }
8547 +-
8548 +- if (!cpu_has_vmx_preemption_timer())
8549 +- enable_preemption_timer = false;
8550 +-
8551 +- if (enable_preemption_timer) {
8552 +- u64 use_timer_freq = 5000ULL * 1000 * 1000;
8553 +- u64 vmx_msr;
8554 +-
8555 +- rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
8556 +- cpu_preemption_timer_multi =
8557 +- vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
8558 +-
8559 +- if (tsc_khz)
8560 +- use_timer_freq = (u64)tsc_khz * 1000;
8561 +- use_timer_freq >>= cpu_preemption_timer_multi;
8562 +-
8563 +- /*
8564 +- * KVM "disables" the preemption timer by setting it to its max
8565 +- * value. Don't use the timer if it might cause spurious exits
8566 +- * at a rate faster than 0.1 Hz (of uninterrupted guest time).
8567 +- */
8568 +- if (use_timer_freq > 0xffffffffu / 10)
8569 +- enable_preemption_timer = false;
8570 +- }
8571 +-
8572 +- if (!enable_preemption_timer) {
8573 +- kvm_x86_ops->set_hv_timer = NULL;
8574 +- kvm_x86_ops->cancel_hv_timer = NULL;
8575 +- kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
8576 +- }
8577 +-
8578 +- kvm_set_posted_intr_wakeup_handler(wakeup_handler);
8579 +-
8580 +- kvm_mce_cap_supported |= MCG_LMCE_P;
8581 +-
8582 +- if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8583 +- return -EINVAL;
8584 +- if (!enable_ept || !cpu_has_vmx_intel_pt())
8585 +- pt_mode = PT_MODE_SYSTEM;
8586 +-
8587 +- if (nested) {
8588 +- nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
8589 +- vmx_capability.ept, enable_apicv);
8590 +-
8591 +- r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
8592 +- if (r)
8593 +- return r;
8594 +- }
8595 +-
8596 +- r = alloc_kvm_area();
8597 +- if (r)
8598 +- nested_vmx_hardware_unsetup();
8599 +- return r;
8600 +-}
8601 +-
8602 +-static __exit void hardware_unsetup(void)
8603 +-{
8604 +- if (nested)
8605 +- nested_vmx_hardware_unsetup();
8606 +-
8607 +- free_kvm_area();
8608 +-}
8609 +-
8610 +-static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
8611 +- .cpu_has_kvm_support = cpu_has_kvm_support,
8612 +- .disabled_by_bios = vmx_disabled_by_bios,
8613 +- .hardware_setup = hardware_setup,
8614 +- .hardware_unsetup = hardware_unsetup,
8615 +- .check_processor_compatibility = vmx_check_processor_compat,
8616 +- .hardware_enable = hardware_enable,
8617 +- .hardware_disable = hardware_disable,
8618 +- .cpu_has_accelerated_tpr = report_flexpriority,
8619 +- .has_emulated_msr = vmx_has_emulated_msr,
8620 +-
8621 +- .vm_init = vmx_vm_init,
8622 +- .vm_alloc = vmx_vm_alloc,
8623 +- .vm_free = vmx_vm_free,
8624 +-
8625 +- .vcpu_create = vmx_create_vcpu,
8626 +- .vcpu_free = vmx_free_vcpu,
8627 +- .vcpu_reset = vmx_vcpu_reset,
8628 +-
8629 +- .prepare_guest_switch = vmx_prepare_switch_to_guest,
8630 +- .vcpu_load = vmx_vcpu_load,
8631 +- .vcpu_put = vmx_vcpu_put,
8632 +-
8633 +- .update_bp_intercept = update_exception_bitmap,
8634 +- .get_msr_feature = vmx_get_msr_feature,
8635 +- .get_msr = vmx_get_msr,
8636 +- .set_msr = vmx_set_msr,
8637 +- .get_segment_base = vmx_get_segment_base,
8638 +- .get_segment = vmx_get_segment,
8639 +- .set_segment = vmx_set_segment,
8640 +- .get_cpl = vmx_get_cpl,
8641 +- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
8642 +- .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
8643 +- .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
8644 +- .set_cr0 = vmx_set_cr0,
8645 +- .set_cr3 = vmx_set_cr3,
8646 +- .set_cr4 = vmx_set_cr4,
8647 +- .set_efer = vmx_set_efer,
8648 +- .get_idt = vmx_get_idt,
8649 +- .set_idt = vmx_set_idt,
8650 +- .get_gdt = vmx_get_gdt,
8651 +- .set_gdt = vmx_set_gdt,
8652 +- .get_dr6 = vmx_get_dr6,
8653 +- .set_dr6 = vmx_set_dr6,
8654 +- .set_dr7 = vmx_set_dr7,
8655 +- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
8656 +- .cache_reg = vmx_cache_reg,
8657 +- .get_rflags = vmx_get_rflags,
8658 +- .set_rflags = vmx_set_rflags,
8659 +-
8660 +- .tlb_flush = vmx_flush_tlb,
8661 +- .tlb_flush_gva = vmx_flush_tlb_gva,
8662 +-
8663 +- .run = vmx_vcpu_run,
8664 +- .handle_exit = vmx_handle_exit,
8665 +- .skip_emulated_instruction = skip_emulated_instruction,
8666 +- .set_interrupt_shadow = vmx_set_interrupt_shadow,
8667 +- .get_interrupt_shadow = vmx_get_interrupt_shadow,
8668 +- .patch_hypercall = vmx_patch_hypercall,
8669 +- .set_irq = vmx_inject_irq,
8670 +- .set_nmi = vmx_inject_nmi,
8671 +- .queue_exception = vmx_queue_exception,
8672 +- .cancel_injection = vmx_cancel_injection,
8673 +- .interrupt_allowed = vmx_interrupt_allowed,
8674 +- .nmi_allowed = vmx_nmi_allowed,
8675 +- .get_nmi_mask = vmx_get_nmi_mask,
8676 +- .set_nmi_mask = vmx_set_nmi_mask,
8677 +- .enable_nmi_window = enable_nmi_window,
8678 +- .enable_irq_window = enable_irq_window,
8679 +- .update_cr8_intercept = update_cr8_intercept,
8680 +- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
8681 +- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
8682 +- .get_enable_apicv = vmx_get_enable_apicv,
8683 +- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
8684 +- .load_eoi_exitmap = vmx_load_eoi_exitmap,
8685 +- .apicv_post_state_restore = vmx_apicv_post_state_restore,
8686 +- .hwapic_irr_update = vmx_hwapic_irr_update,
8687 +- .hwapic_isr_update = vmx_hwapic_isr_update,
8688 +- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
8689 +- .sync_pir_to_irr = vmx_sync_pir_to_irr,
8690 +- .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
8691 +- .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
8692 +-
8693 +- .set_tss_addr = vmx_set_tss_addr,
8694 +- .set_identity_map_addr = vmx_set_identity_map_addr,
8695 +- .get_tdp_level = get_ept_level,
8696 +- .get_mt_mask = vmx_get_mt_mask,
8697 +-
8698 +- .get_exit_info = vmx_get_exit_info,
8699 +-
8700 +- .get_lpage_level = vmx_get_lpage_level,
8701 +-
8702 +- .cpuid_update = vmx_cpuid_update,
8703 +-
8704 +- .rdtscp_supported = vmx_rdtscp_supported,
8705 +- .invpcid_supported = vmx_invpcid_supported,
8706 +-
8707 +- .set_supported_cpuid = vmx_set_supported_cpuid,
8708 +-
8709 +- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
8710 +-
8711 +- .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
8712 +- .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
8713 +-
8714 +- .set_tdp_cr3 = vmx_set_cr3,
8715 +-
8716 +- .check_intercept = vmx_check_intercept,
8717 +- .handle_exit_irqoff = vmx_handle_exit_irqoff,
8718 +- .mpx_supported = vmx_mpx_supported,
8719 +- .xsaves_supported = vmx_xsaves_supported,
8720 +- .umip_emulated = vmx_umip_emulated,
8721 +- .pt_supported = vmx_pt_supported,
8722 +-
8723 +- .request_immediate_exit = vmx_request_immediate_exit,
8724 +-
8725 +- .sched_in = vmx_sched_in,
8726 +-
8727 +- .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
8728 +- .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
8729 +- .flush_log_dirty = vmx_flush_log_dirty,
8730 +- .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
8731 +- .write_log_dirty = vmx_write_pml_buffer,
8732 +-
8733 +- .pre_block = vmx_pre_block,
8734 +- .post_block = vmx_post_block,
8735 +-
8736 +- .pmu_ops = &intel_pmu_ops,
8737 +-
8738 +- .update_pi_irte = vmx_update_pi_irte,
8739 +-
8740 +-#ifdef CONFIG_X86_64
8741 +- .set_hv_timer = vmx_set_hv_timer,
8742 +- .cancel_hv_timer = vmx_cancel_hv_timer,
8743 +-#endif
8744 +-
8745 +- .setup_mce = vmx_setup_mce,
8746 +-
8747 +- .smi_allowed = vmx_smi_allowed,
8748 +- .pre_enter_smm = vmx_pre_enter_smm,
8749 +- .pre_leave_smm = vmx_pre_leave_smm,
8750 +- .enable_smi_window = enable_smi_window,
8751 +-
8752 +- .check_nested_events = NULL,
8753 +- .get_nested_state = NULL,
8754 +- .set_nested_state = NULL,
8755 +- .get_vmcs12_pages = NULL,
8756 +- .nested_enable_evmcs = NULL,
8757 +- .nested_get_evmcs_version = NULL,
8758 +- .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
8759 +- .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
8760 +-};
8761 +-
8762 +-static void vmx_cleanup_l1d_flush(void)
8763 +-{
8764 +- if (vmx_l1d_flush_pages) {
8765 +- free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
8766 +- vmx_l1d_flush_pages = NULL;
8767 +- }
8768 +- /* Restore state so sysfs ignores VMX */
8769 +- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
8770 +-}
8771 +-
8772 +-static void vmx_exit(void)
8773 +-{
8774 +-#ifdef CONFIG_KEXEC_CORE
8775 +- RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
8776 +- synchronize_rcu();
8777 +-#endif
8778 +-
8779 +- kvm_exit();
8780 +-
8781 +-#if IS_ENABLED(CONFIG_HYPERV)
8782 +- if (static_branch_unlikely(&enable_evmcs)) {
8783 +- int cpu;
8784 +- struct hv_vp_assist_page *vp_ap;
8785 +- /*
8786 +- * Reset everything to support using non-enlightened VMCS
8787 +- * access later (e.g. when we reload the module with
8788 +- * enlightened_vmcs=0)
8789 +- */
8790 +- for_each_online_cpu(cpu) {
8791 +- vp_ap = hv_get_vp_assist_page(cpu);
8792 +-
8793 +- if (!vp_ap)
8794 +- continue;
8795 +-
8796 +- vp_ap->nested_control.features.directhypercall = 0;
8797 +- vp_ap->current_nested_vmcs = 0;
8798 +- vp_ap->enlighten_vmentry = 0;
8799 +- }
8800 +-
8801 +- static_branch_disable(&enable_evmcs);
8802 +- }
8803 +-#endif
8804 +- vmx_cleanup_l1d_flush();
8805 +-}
8806 +-module_exit(vmx_exit);
8807 +-
8808 +-static int __init vmx_init(void)
8809 +-{
8810 +- int r;
8811 +-
8812 +-#if IS_ENABLED(CONFIG_HYPERV)
8813 +- /*
8814 +- * Enlightened VMCS usage should be recommended and the host needs
8815 +- * to support eVMCS v1 or above. We can also disable eVMCS support
8816 +- * with module parameter.
8817 +- */
8818 +- if (enlightened_vmcs &&
8819 +- ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
8820 +- (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
8821 +- KVM_EVMCS_VERSION) {
8822 +- int cpu;
8823 +-
8824 +- /* Check that we have assist pages on all online CPUs */
8825 +- for_each_online_cpu(cpu) {
8826 +- if (!hv_get_vp_assist_page(cpu)) {
8827 +- enlightened_vmcs = false;
8828 +- break;
8829 +- }
8830 +- }
8831 +-
8832 +- if (enlightened_vmcs) {
8833 +- pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
8834 +- static_branch_enable(&enable_evmcs);
8835 +- }
8836 +-
8837 +- if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
8838 +- vmx_x86_ops.enable_direct_tlbflush
8839 +- = hv_enable_direct_tlbflush;
8840 +-
8841 +- } else {
8842 +- enlightened_vmcs = false;
8843 +- }
8844 +-#endif
8845 +-
8846 +- r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
8847 +- __alignof__(struct vcpu_vmx), THIS_MODULE);
8848 +- if (r)
8849 +- return r;
8850 +-
8851 +- /*
8852 +- * Must be called after kvm_init() so enable_ept is properly set
8853 +- * up. Hand the parameter mitigation value in which was stored in
8854 +- * the pre module init parser. If no parameter was given, it will
8855 +- * contain 'auto' which will be turned into the default 'cond'
8856 +- * mitigation mode.
8857 +- */
8858 +- r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
8859 +- if (r) {
8860 +- vmx_exit();
8861 +- return r;
8862 +- }
8863 +-
8864 +-#ifdef CONFIG_KEXEC_CORE
8865 +- rcu_assign_pointer(crash_vmclear_loaded_vmcss,
8866 +- crash_vmclear_local_loaded_vmcss);
8867 +-#endif
8868 +- vmx_check_vmcs12_offsets();
8869 +-
8870 +- return 0;
8871 +-}
8872 +-module_init(vmx_init);
8873 +diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
8874 +index 0f7eb4f5bdb7..82e105b284e0 100644
8875 +--- a/arch/x86/lib/x86-opcode-map.txt
8876 ++++ b/arch/x86/lib/x86-opcode-map.txt
8877 +@@ -909,7 +909,7 @@ EndTable
8878 +
8879 + GrpTable: Grp3_2
8880 + 0: TEST Ev,Iz
8881 +-1:
8882 ++1: TEST Ev,Iz
8883 + 2: NOT Ev
8884 + 3: NEG Ev
8885 + 4: MUL rAX,Ev
8886 +diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
8887 +index b6669d326545..f08abdf8bb67 100644
8888 +--- a/arch/x86/platform/efi/efi.c
8889 ++++ b/arch/x86/platform/efi/efi.c
8890 +@@ -478,7 +478,6 @@ void __init efi_init(void)
8891 + efi_char16_t *c16;
8892 + char vendor[100] = "unknown";
8893 + int i = 0;
8894 +- void *tmp;
8895 +
8896 + #ifdef CONFIG_X86_32
8897 + if (boot_params.efi_info.efi_systab_hi ||
8898 +@@ -503,14 +502,16 @@ void __init efi_init(void)
8899 + /*
8900 + * Show what we know for posterity
8901 + */
8902 +- c16 = tmp = early_memremap(efi.systab->fw_vendor, 2);
8903 ++ c16 = early_memremap_ro(efi.systab->fw_vendor,
8904 ++ sizeof(vendor) * sizeof(efi_char16_t));
8905 + if (c16) {
8906 +- for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
8907 +- vendor[i] = *c16++;
8908 ++ for (i = 0; i < sizeof(vendor) - 1 && c16[i]; ++i)
8909 ++ vendor[i] = c16[i];
8910 + vendor[i] = '\0';
8911 +- } else
8912 ++ early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t));
8913 ++ } else {
8914 + pr_err("Could not map the firmware vendor!\n");
8915 +- early_memunmap(tmp, 2);
8916 ++ }
8917 +
8918 + pr_info("EFI v%u.%.02u by %s\n",
8919 + efi.systab->hdr.revision >> 16,
8920 +diff --git a/drivers/acpi/acpica/dsfield.c b/drivers/acpi/acpica/dsfield.c
8921 +index 6a4b603d0e83..10bbf6ca082a 100644
8922 +--- a/drivers/acpi/acpica/dsfield.c
8923 ++++ b/drivers/acpi/acpica/dsfield.c
8924 +@@ -272,7 +272,7 @@ cleanup:
8925 + * FUNCTION: acpi_ds_get_field_names
8926 + *
8927 + * PARAMETERS: info - create_field info structure
8928 +- * ` walk_state - Current method state
8929 ++ * walk_state - Current method state
8930 + * arg - First parser arg for the field name list
8931 + *
8932 + * RETURN: Status
8933 +diff --git a/drivers/acpi/acpica/dswload.c b/drivers/acpi/acpica/dswload.c
8934 +index fd34040d4f44..9c41d2153d0f 100644
8935 +--- a/drivers/acpi/acpica/dswload.c
8936 ++++ b/drivers/acpi/acpica/dswload.c
8937 +@@ -440,6 +440,27 @@ acpi_status acpi_ds_load1_end_op(struct acpi_walk_state *walk_state)
8938 + ACPI_DEBUG_PRINT((ACPI_DB_DISPATCH, "Op=%p State=%p\n", op,
8939 + walk_state));
8940 +
8941 ++ /*
8942 ++ * Disassembler: handle create field operators here.
8943 ++ *
8944 ++ * create_buffer_field is a deferred op that is typically processed in load
8945 ++ * pass 2. However, disassembly of control method contents walks the parse
8946 ++ * tree with ACPI_PARSE_LOAD_PASS1, and AML_CREATE operators are processed
8947 ++ * in a later walk. This is a problem when there is a control method that
8948 ++ * has the same name as the AML_CREATE object. In this case, any use of the
8949 ++ * name segment will be detected as a method call rather than a reference
8950 ++ * to a buffer field.
8951 ++ *
8952 ++ * This earlier creation during disassembly solves this issue by inserting
8953 ++ * the named object in the ACPI namespace so that references to this name
8954 ++ * would be a name string rather than a method call.
8955 ++ */
8956 ++ if ((walk_state->parse_flags & ACPI_PARSE_DISASSEMBLE) &&
8957 ++ (walk_state->op_info->flags & AML_CREATE)) {
8958 ++ status = acpi_ds_create_buffer_field(op, walk_state);
8959 ++ return_ACPI_STATUS(status);
8960 ++ }
8961 ++
8962 + /* We are only interested in opcodes that have an associated name */
8963 +
8964 + if (!(walk_state->op_info->flags & (AML_NAMED | AML_FIELD))) {
8965 +diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
8966 +index 5408a292078b..89e62043d02e 100644
8967 +--- a/drivers/ata/ahci.c
8968 ++++ b/drivers/ata/ahci.c
8969 +@@ -86,6 +86,7 @@ enum board_ids {
8970 +
8971 + static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent);
8972 + static void ahci_remove_one(struct pci_dev *dev);
8973 ++static void ahci_shutdown_one(struct pci_dev *dev);
8974 + static int ahci_vt8251_hardreset(struct ata_link *link, unsigned int *class,
8975 + unsigned long deadline);
8976 + static int ahci_avn_hardreset(struct ata_link *link, unsigned int *class,
8977 +@@ -582,6 +583,7 @@ static struct pci_driver ahci_pci_driver = {
8978 + .id_table = ahci_pci_tbl,
8979 + .probe = ahci_init_one,
8980 + .remove = ahci_remove_one,
8981 ++ .shutdown = ahci_shutdown_one,
8982 + .driver = {
8983 + .pm = &ahci_pci_pm_ops,
8984 + },
8985 +@@ -1775,6 +1777,11 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
8986 + return 0;
8987 + }
8988 +
8989 ++static void ahci_shutdown_one(struct pci_dev *pdev)
8990 ++{
8991 ++ ata_pci_shutdown_one(pdev);
8992 ++}
8993 ++
8994 + static void ahci_remove_one(struct pci_dev *pdev)
8995 + {
8996 + pm_runtime_get_noresume(&pdev->dev);
8997 +diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
8998 +index b1582f161171..ba0cffbd0bb6 100644
8999 +--- a/drivers/ata/libata-core.c
9000 ++++ b/drivers/ata/libata-core.c
9001 +@@ -6580,6 +6580,26 @@ void ata_pci_remove_one(struct pci_dev *pdev)
9002 + ata_host_detach(host);
9003 + }
9004 +
9005 ++void ata_pci_shutdown_one(struct pci_dev *pdev)
9006 ++{
9007 ++ struct ata_host *host = pci_get_drvdata(pdev);
9008 ++ int i;
9009 ++
9010 ++ for (i = 0; i < host->n_ports; i++) {
9011 ++ struct ata_port *ap = host->ports[i];
9012 ++
9013 ++ ap->pflags |= ATA_PFLAG_FROZEN;
9014 ++
9015 ++ /* Disable port interrupts */
9016 ++ if (ap->ops->freeze)
9017 ++ ap->ops->freeze(ap);
9018 ++
9019 ++ /* Stop the port DMA engines */
9020 ++ if (ap->ops->port_stop)
9021 ++ ap->ops->port_stop(ap);
9022 ++ }
9023 ++}
9024 ++
9025 + /* move to PCI subsystem */
9026 + int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits)
9027 + {
9028 +@@ -7200,6 +7220,7 @@ EXPORT_SYMBOL_GPL(ata_timing_cycle2mode);
9029 +
9030 + #ifdef CONFIG_PCI
9031 + EXPORT_SYMBOL_GPL(pci_test_config_bits);
9032 ++EXPORT_SYMBOL_GPL(ata_pci_shutdown_one);
9033 + EXPORT_SYMBOL_GPL(ata_pci_remove_one);
9034 + #ifdef CONFIG_PM
9035 + EXPORT_SYMBOL_GPL(ata_pci_device_do_suspend);
9036 +diff --git a/drivers/base/dd.c b/drivers/base/dd.c
9037 +index ee25a69630c3..854d218ea76a 100644
9038 +--- a/drivers/base/dd.c
9039 ++++ b/drivers/base/dd.c
9040 +@@ -341,7 +341,10 @@ static int really_probe(struct device *dev, struct device_driver *drv)
9041 + atomic_inc(&probe_count);
9042 + pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
9043 + drv->bus->name, __func__, drv->name, dev_name(dev));
9044 +- WARN_ON(!list_empty(&dev->devres_head));
9045 ++ if (!list_empty(&dev->devres_head)) {
9046 ++ dev_crit(dev, "Resources present before probing\n");
9047 ++ return -EBUSY;
9048 ++ }
9049 +
9050 + re_probe:
9051 + dev->driver = drv;
9052 +diff --git a/drivers/base/platform.c b/drivers/base/platform.c
9053 +index f90b1b9bbad0..bef299ef6227 100644
9054 +--- a/drivers/base/platform.c
9055 ++++ b/drivers/base/platform.c
9056 +@@ -28,6 +28,7 @@
9057 + #include <linux/limits.h>
9058 + #include <linux/property.h>
9059 + #include <linux/kmemleak.h>
9060 ++#include <linux/types.h>
9061 +
9062 + #include "base.h"
9063 + #include "power/power.h"
9064 +@@ -68,7 +69,7 @@ void __weak arch_setup_pdev_archdata(struct platform_device *pdev)
9065 + struct resource *platform_get_resource(struct platform_device *dev,
9066 + unsigned int type, unsigned int num)
9067 + {
9068 +- int i;
9069 ++ u32 i;
9070 +
9071 + for (i = 0; i < dev->num_resources; i++) {
9072 + struct resource *r = &dev->resource[i];
9073 +@@ -153,7 +154,7 @@ struct resource *platform_get_resource_byname(struct platform_device *dev,
9074 + unsigned int type,
9075 + const char *name)
9076 + {
9077 +- int i;
9078 ++ u32 i;
9079 +
9080 + for (i = 0; i < dev->num_resources; i++) {
9081 + struct resource *r = &dev->resource[i];
9082 +@@ -350,7 +351,8 @@ EXPORT_SYMBOL_GPL(platform_device_add_properties);
9083 + */
9084 + int platform_device_add(struct platform_device *pdev)
9085 + {
9086 +- int i, ret;
9087 ++ u32 i;
9088 ++ int ret;
9089 +
9090 + if (!pdev)
9091 + return -EINVAL;
9092 +@@ -416,7 +418,7 @@ int platform_device_add(struct platform_device *pdev)
9093 + pdev->id = PLATFORM_DEVID_AUTO;
9094 + }
9095 +
9096 +- while (--i >= 0) {
9097 ++ while (i--) {
9098 + struct resource *r = &pdev->resource[i];
9099 + if (r->parent)
9100 + release_resource(r);
9101 +@@ -437,7 +439,7 @@ EXPORT_SYMBOL_GPL(platform_device_add);
9102 + */
9103 + void platform_device_del(struct platform_device *pdev)
9104 + {
9105 +- int i;
9106 ++ u32 i;
9107 +
9108 + if (pdev) {
9109 + device_remove_properties(&pdev->dev);
9110 +diff --git a/drivers/block/brd.c b/drivers/block/brd.c
9111 +index 0c76d4016eeb..7e35574a17df 100644
9112 +--- a/drivers/block/brd.c
9113 ++++ b/drivers/block/brd.c
9114 +@@ -581,6 +581,25 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data)
9115 + return kobj;
9116 + }
9117 +
9118 ++static inline void brd_check_and_reset_par(void)
9119 ++{
9120 ++ if (unlikely(!max_part))
9121 ++ max_part = 1;
9122 ++
9123 ++ /*
9124 ++ * make sure 'max_part' divides (1U << MINORBITS) exactly,
9125 ++ * otherwise it is possible to get the same dev_t when adding partitions.
9126 ++ */
9127 ++ if ((1U << MINORBITS) % max_part != 0)
9128 ++ max_part = 1UL << fls(max_part);
9129 ++
9130 ++ if (max_part > DISK_MAX_PARTS) {
9131 ++ pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
9132 ++ DISK_MAX_PARTS, DISK_MAX_PARTS);
9133 ++ max_part = DISK_MAX_PARTS;
9134 ++ }
9135 ++}
9136 ++
9137 + static int __init brd_init(void)
9138 + {
9139 + struct brd_device *brd, *next;
9140 +@@ -604,8 +623,7 @@ static int __init brd_init(void)
9141 + if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
9142 + return -EIO;
9143 +
9144 +- if (unlikely(!max_part))
9145 +- max_part = 1;
9146 ++ brd_check_and_reset_par();
9147 +
9148 + for (i = 0; i < rd_nr; i++) {
9149 + brd = brd_alloc(i);
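
The brd_check_and_reset_par() helper added above rounds a max_part value that does not divide (1U << MINORBITS) up to the next power of two with 1UL << fls(max_part). The standalone sketch below walks through that rounding for a few sample values; it assumes MINORBITS is 20 as in this kernel's kdev_t.h, and its fls() is a simplified stand-in for the kernel helper.

  #include <stdio.h>

  #define MINORBITS 20

  /* Simplified fls(): 1-based index of the most significant set bit, 0 for 0. */
  static unsigned int fls(unsigned long x)
  {
          unsigned int r = 0;

          while (x) {
                  x >>= 1;
                  r++;
          }
          return r;
  }

  int main(void)
  {
          unsigned long vals[] = { 1, 3, 12, 16, 100 };
          unsigned int i;

          for (i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
                  unsigned long max_part = vals[i];

                  /* Only non-power-of-two values fail the divisibility test. */
                  if ((1U << MINORBITS) % max_part != 0)
                          max_part = 1UL << fls(max_part);

                  /* e.g. 3 -> 4, 12 -> 16, 100 -> 128; powers of two stay put. */
                  printf("max_part %lu -> %lu\n", vals[i], max_part);
          }
          return 0;
  }
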
9150 +diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
9151 +index ece4f706b38f..4496e7a49235 100644
9152 +--- a/drivers/block/floppy.c
9153 ++++ b/drivers/block/floppy.c
9154 +@@ -848,14 +848,17 @@ static void reset_fdc_info(int mode)
9155 + /* selects the fdc and drive, and enables the fdc's input/dma. */
9156 + static void set_fdc(int drive)
9157 + {
9158 ++ unsigned int new_fdc = fdc;
9159 ++
9160 + if (drive >= 0 && drive < N_DRIVE) {
9161 +- fdc = FDC(drive);
9162 ++ new_fdc = FDC(drive);
9163 + current_drive = drive;
9164 + }
9165 +- if (fdc != 1 && fdc != 0) {
9166 ++ if (new_fdc >= N_FDC) {
9167 + pr_info("bad fdc value\n");
9168 + return;
9169 + }
9170 ++ fdc = new_fdc;
9171 + set_dor(fdc, ~0, 8);
9172 + #if N_FDC > 1
9173 + set_dor(1 - fdc, ~8, 0);
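
The set_fdc() fix above validates the requested controller index in a local new_fdc and only then commits it to the global fdc, so an out-of-range value can no longer clobber the current selection. A rough userspace sketch of that validate-then-commit pattern follows; N_FDC and N_DRIVE echo the driver, while the FDC() mapping and everything else are invented for illustration.

  #include <stdio.h>

  #define N_FDC 2
  #define N_DRIVE 8

  /* Illustrative drive-to-controller mapping, not the driver's macro. */
  #define FDC(drive) ((unsigned int)(drive) / 4)

  static unsigned int fdc;        /* currently selected controller */
  static int current_drive;

  static void set_fdc(int drive)
  {
          unsigned int new_fdc = fdc;     /* start from the current selection */

          if (drive >= 0 && drive < N_DRIVE) {
                  new_fdc = FDC(drive);
                  current_drive = drive;
          }
          /* Reject bad values before touching the global. */
          if (new_fdc >= N_FDC) {
                  printf("bad fdc value\n");
                  return;
          }
          fdc = new_fdc;
  }

  int main(void)
  {
          set_fdc(5);     /* drive 5 -> controller 1 */
          printf("fdc = %u, drive = %d\n", fdc, current_drive);
          set_fdc(42);    /* out of range: the current selection is kept */
          printf("fdc = %u, drive = %d\n", fdc, current_drive);
          return 0;
  }
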
9174 +diff --git a/drivers/clk/qcom/clk-rcg2.c b/drivers/clk/qcom/clk-rcg2.c
9175 +index 0ae1b0a66eb5..d8601b138dc1 100644
9176 +--- a/drivers/clk/qcom/clk-rcg2.c
9177 ++++ b/drivers/clk/qcom/clk-rcg2.c
9178 +@@ -194,6 +194,9 @@ static int _freq_tbl_determine_rate(struct clk_hw *hw,
9179 +
9180 + clk_flags = clk_hw_get_flags(hw);
9181 + p = clk_hw_get_parent_by_index(hw, index);
9182 ++ if (!p)
9183 ++ return -EINVAL;
9184 ++
9185 + if (clk_flags & CLK_SET_RATE_PARENT) {
9186 + if (f->pre_div) {
9187 + if (!rate)
9188 +diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig
9189 +index 41254e702f1e..2ce7cc94d78b 100644
9190 +--- a/drivers/devfreq/Kconfig
9191 ++++ b/drivers/devfreq/Kconfig
9192 +@@ -102,7 +102,8 @@ config ARM_TEGRA_DEVFREQ
9193 +
9194 + config ARM_RK3399_DMC_DEVFREQ
9195 + tristate "ARM RK3399 DMC DEVFREQ Driver"
9196 +- depends on ARCH_ROCKCHIP
9197 ++ depends on (ARCH_ROCKCHIP && HAVE_ARM_SMCCC) || \
9198 ++ (COMPILE_TEST && HAVE_ARM_SMCCC)
9199 + select DEVFREQ_EVENT_ROCKCHIP_DFI
9200 + select DEVFREQ_GOV_SIMPLE_ONDEMAND
9201 + select PM_DEVFREQ_EVENT
9202 +diff --git a/drivers/devfreq/event/Kconfig b/drivers/devfreq/event/Kconfig
9203 +index cd949800eed9..8851bc4e8e3e 100644
9204 +--- a/drivers/devfreq/event/Kconfig
9205 ++++ b/drivers/devfreq/event/Kconfig
9206 +@@ -33,7 +33,7 @@ config DEVFREQ_EVENT_EXYNOS_PPMU
9207 +
9208 + config DEVFREQ_EVENT_ROCKCHIP_DFI
9209 + tristate "ROCKCHIP DFI DEVFREQ event Driver"
9210 +- depends on ARCH_ROCKCHIP
9211 ++ depends on ARCH_ROCKCHIP || COMPILE_TEST
9212 + help
9213 + This add the devfreq-event driver for Rockchip SoC. It provides DFI
9214 + (DDR Monitor Module) driver to count ddr load.
9215 +diff --git a/drivers/gpio/gpio-grgpio.c b/drivers/gpio/gpio-grgpio.c
9216 +index 7847dd34f86f..036a78b70427 100644
9217 +--- a/drivers/gpio/gpio-grgpio.c
9218 ++++ b/drivers/gpio/gpio-grgpio.c
9219 +@@ -259,17 +259,16 @@ static int grgpio_irq_map(struct irq_domain *d, unsigned int irq,
9220 + lirq->irq = irq;
9221 + uirq = &priv->uirqs[lirq->index];
9222 + if (uirq->refcnt == 0) {
9223 ++ spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags);
9224 + ret = request_irq(uirq->uirq, grgpio_irq_handler, 0,
9225 + dev_name(priv->dev), priv);
9226 + if (ret) {
9227 + dev_err(priv->dev,
9228 + "Could not request underlying irq %d\n",
9229 + uirq->uirq);
9230 +-
9231 +- spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags);
9232 +-
9233 + return ret;
9234 + }
9235 ++ spin_lock_irqsave(&priv->gc.bgpio_lock, flags);
9236 + }
9237 + uirq->refcnt++;
9238 +
9239 +@@ -315,8 +314,11 @@ static void grgpio_irq_unmap(struct irq_domain *d, unsigned int irq)
9240 + if (index >= 0) {
9241 + uirq = &priv->uirqs[lirq->index];
9242 + uirq->refcnt--;
9243 +- if (uirq->refcnt == 0)
9244 ++ if (uirq->refcnt == 0) {
9245 ++ spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags);
9246 + free_irq(uirq->uirq, priv);
9247 ++ return;
9248 ++ }
9249 + }
9250 +
9251 + spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags);
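
Both grgpio hunks above exist because request_irq() and free_irq() can sleep and therefore must not be called under a spinlock taken with interrupts disabled; the fix drops bgpio_lock around those calls and reacquires it (or returns) afterwards. The sketch below mimics that shape in userspace with a pthread spinlock and a fake_request_irq() that pretends to sleep; apart from the pattern itself, all names are illustrative.

  #include <pthread.h>
  #include <stdio.h>
  #include <unistd.h>

  static pthread_spinlock_t lock;

  /* Stand-in for request_irq(): may block, so it must not run under the lock. */
  static int fake_request_irq(void)
  {
          usleep(1000);
          return 0;
  }

  static int map_irq(void)
  {
          int ret;

          pthread_spin_lock(&lock);
          /* ... update state that the lock protects ... */

          /* Drop the lock around the call that may sleep, as the hunk above does. */
          pthread_spin_unlock(&lock);
          ret = fake_request_irq();
          if (ret)
                  return ret;
          pthread_spin_lock(&lock);

          /* ... finish the update with the lock held again ... */
          pthread_spin_unlock(&lock);
          return 0;
  }

  int main(void)
  {
          pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
          printf("map_irq() -> %d\n", map_irq());
          pthread_spin_destroy(&lock);
          return 0;
  }
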
9252 +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
9253 +index 26afdffab5a0..ac8885562919 100644
9254 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
9255 ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
9256 +@@ -336,17 +336,9 @@ bool amdgpu_atombios_get_connector_info_from_object_table(struct amdgpu_device *
9257 + path_size += le16_to_cpu(path->usSize);
9258 +
9259 + if (device_support & le16_to_cpu(path->usDeviceTag)) {
9260 +- uint8_t con_obj_id, con_obj_num, con_obj_type;
9261 +-
9262 +- con_obj_id =
9263 ++ uint8_t con_obj_id =
9264 + (le16_to_cpu(path->usConnObjectId) & OBJECT_ID_MASK)
9265 + >> OBJECT_ID_SHIFT;
9266 +- con_obj_num =
9267 +- (le16_to_cpu(path->usConnObjectId) & ENUM_ID_MASK)
9268 +- >> ENUM_ID_SHIFT;
9269 +- con_obj_type =
9270 +- (le16_to_cpu(path->usConnObjectId) &
9271 +- OBJECT_TYPE_MASK) >> OBJECT_TYPE_SHIFT;
9272 +
9273 + /* Skip TV/CV support */
9274 + if ((le16_to_cpu(path->usDeviceTag) ==
9275 +@@ -371,14 +363,7 @@ bool amdgpu_atombios_get_connector_info_from_object_table(struct amdgpu_device *
9276 + router.ddc_valid = false;
9277 + router.cd_valid = false;
9278 + for (j = 0; j < ((le16_to_cpu(path->usSize) - 8) / 2); j++) {
9279 +- uint8_t grph_obj_id, grph_obj_num, grph_obj_type;
9280 +-
9281 +- grph_obj_id =
9282 +- (le16_to_cpu(path->usGraphicObjIds[j]) &
9283 +- OBJECT_ID_MASK) >> OBJECT_ID_SHIFT;
9284 +- grph_obj_num =
9285 +- (le16_to_cpu(path->usGraphicObjIds[j]) &
9286 +- ENUM_ID_MASK) >> ENUM_ID_SHIFT;
9287 ++ uint8_t grph_obj_type;
9288 + grph_obj_type =
9289 + (le16_to_cpu(path->usGraphicObjIds[j]) &
9290 + OBJECT_TYPE_MASK) >> OBJECT_TYPE_SHIFT;
9291 +diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c
9292 +index 3a44e705db53..d224fc12b757 100644
9293 +--- a/drivers/gpu/drm/gma500/framebuffer.c
9294 ++++ b/drivers/gpu/drm/gma500/framebuffer.c
9295 +@@ -516,6 +516,7 @@ static int psbfb_probe(struct drm_fb_helper *helper,
9296 + container_of(helper, struct psb_fbdev, psb_fb_helper);
9297 + struct drm_device *dev = psb_fbdev->psb_fb_helper.dev;
9298 + struct drm_psb_private *dev_priv = dev->dev_private;
9299 ++ unsigned int fb_size;
9300 + int bytespp;
9301 +
9302 + bytespp = sizes->surface_bpp / 8;
9303 +@@ -525,8 +526,11 @@ static int psbfb_probe(struct drm_fb_helper *helper,
9304 + /* If the mode will not fit in 32bit then switch to 16bit to get
9305 + a console on full resolution. The X mode setting server will
9306 + allocate its own 32bit GEM framebuffer */
9307 +- if (ALIGN(sizes->fb_width * bytespp, 64) * sizes->fb_height >
9308 +- dev_priv->vram_stolen_size) {
9309 ++ fb_size = ALIGN(sizes->surface_width * bytespp, 64) *
9310 ++ sizes->surface_height;
9311 ++ fb_size = ALIGN(fb_size, PAGE_SIZE);
9312 ++
9313 ++ if (fb_size > dev_priv->vram_stolen_size) {
9314 + sizes->surface_bpp = 16;
9315 + sizes->surface_depth = 16;
9316 + }
9317 +diff --git a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c
9318 +index 01a21dd835b5..1ed60da76a0c 100644
9319 +--- a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c
9320 ++++ b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c
9321 +@@ -306,6 +306,7 @@ err_pm_runtime_put:
9322 + static void mtk_crtc_ddp_hw_fini(struct mtk_drm_crtc *mtk_crtc)
9323 + {
9324 + struct drm_device *drm = mtk_crtc->base.dev;
9325 ++ struct drm_crtc *crtc = &mtk_crtc->base;
9326 + int i;
9327 +
9328 + DRM_DEBUG_DRIVER("%s\n", __func__);
9329 +@@ -327,6 +328,13 @@ static void mtk_crtc_ddp_hw_fini(struct mtk_drm_crtc *mtk_crtc)
9330 + mtk_disp_mutex_unprepare(mtk_crtc->mutex);
9331 +
9332 + pm_runtime_put(drm->dev);
9333 ++
9334 ++ if (crtc->state->event && !crtc->state->active) {
9335 ++ spin_lock_irq(&crtc->dev->event_lock);
9336 ++ drm_crtc_send_vblank_event(crtc, crtc->state->event);
9337 ++ crtc->state->event = NULL;
9338 ++ spin_unlock_irq(&crtc->dev->event_lock);
9339 ++ }
9340 + }
9341 +
9342 + static void mtk_drm_crtc_enable(struct drm_crtc *crtc)
9343 +diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
9344 +index 4bb9ab892ae1..78e521d00251 100644
9345 +--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
9346 ++++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
9347 +@@ -158,7 +158,7 @@ nouveau_fence_wait_uevent_handler(struct nvif_notify *notify)
9348 +
9349 + fence = list_entry(fctx->pending.next, typeof(*fence), head);
9350 + chan = rcu_dereference_protected(fence->channel, lockdep_is_held(&fctx->lock));
9351 +- if (nouveau_fence_update(fence->channel, fctx))
9352 ++ if (nouveau_fence_update(chan, fctx))
9353 + ret = NVIF_NOTIFY_DROP;
9354 + }
9355 + spin_unlock_irqrestore(&fctx->lock, flags);
9356 +diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.c
9357 +index 9d90d8b4b7e6..f5a8db1bb8b7 100644
9358 +--- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.c
9359 ++++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.c
9360 +@@ -72,6 +72,8 @@ nv50_disp_chan_mthd(struct nv50_disp_chan *chan, int debug)
9361 +
9362 + if (debug > subdev->debug)
9363 + return;
9364 ++ if (!mthd)
9365 ++ return;
9366 +
9367 + for (i = 0; (list = mthd->data[i].mthd) != NULL; i++) {
9368 + u32 base = chan->head * mthd->addr;
9369 +diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c
9370 +index de8b806b88fd..7618b2eb4fdf 100644
9371 +--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c
9372 ++++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c
9373 +@@ -143,23 +143,24 @@ gk20a_gr_av_to_method(struct gf100_gr *gr, const char *fw_name,
9374 +
9375 + nent = (fuc.size / sizeof(struct gk20a_fw_av));
9376 +
9377 +- pack = vzalloc((sizeof(*pack) * max_classes) +
9378 +- (sizeof(*init) * (nent + 1)));
9379 ++ pack = vzalloc((sizeof(*pack) * (max_classes + 1)) +
9380 ++ (sizeof(*init) * (nent + max_classes + 1)));
9381 + if (!pack) {
9382 + ret = -ENOMEM;
9383 + goto end;
9384 + }
9385 +
9386 +- init = (void *)(pack + max_classes);
9387 ++ init = (void *)(pack + max_classes + 1);
9388 +
9389 +- for (i = 0; i < nent; i++) {
9390 +- struct gf100_gr_init *ent = &init[i];
9391 ++ for (i = 0; i < nent; i++, init++) {
9392 + struct gk20a_fw_av *av = &((struct gk20a_fw_av *)fuc.data)[i];
9393 + u32 class = av->addr & 0xffff;
9394 + u32 addr = (av->addr & 0xffff0000) >> 14;
9395 +
9396 + if (prevclass != class) {
9397 +- pack[classidx].init = ent;
9398 ++ if (prevclass) /* Add terminator to the method list. */
9399 ++ init++;
9400 ++ pack[classidx].init = init;
9401 + pack[classidx].type = class;
9402 + prevclass = class;
9403 + if (++classidx >= max_classes) {
9404 +@@ -169,10 +170,10 @@ gk20a_gr_av_to_method(struct gf100_gr *gr, const char *fw_name,
9405 + }
9406 + }
9407 +
9408 +- ent->addr = addr;
9409 +- ent->data = av->data;
9410 +- ent->count = 1;
9411 +- ent->pitch = 1;
9412 ++ init->addr = addr;
9413 ++ init->data = av->data;
9414 ++ init->count = 1;
9415 ++ init->pitch = 1;
9416 + }
9417 +
9418 + *ppack = pack;
9419 +diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
9420 +index 8b6f8aa23806..432ad7d73cb9 100644
9421 +--- a/drivers/gpu/drm/radeon/radeon_display.c
9422 ++++ b/drivers/gpu/drm/radeon/radeon_display.c
9423 +@@ -110,6 +110,8 @@ static void dce5_crtc_load_lut(struct drm_crtc *crtc)
9424 +
9425 + DRM_DEBUG_KMS("%d\n", radeon_crtc->crtc_id);
9426 +
9427 ++ msleep(10);
9428 ++
9429 + WREG32(NI_INPUT_CSC_CONTROL + radeon_crtc->crtc_offset,
9430 + (NI_INPUT_CSC_GRPH_MODE(NI_INPUT_CSC_BYPASS) |
9431 + NI_INPUT_CSC_OVL_MODE(NI_INPUT_CSC_BYPASS)));
9432 +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c
9433 +index 1f013d45c9e9..0c7c3005594c 100644
9434 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c
9435 ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c
9436 +@@ -210,8 +210,10 @@ int vmw_cmdbuf_res_add(struct vmw_cmdbuf_res_manager *man,
9437 +
9438 + cres->hash.key = user_key | (res_type << 24);
9439 + ret = drm_ht_insert_item(&man->resources, &cres->hash);
9440 +- if (unlikely(ret != 0))
9441 ++ if (unlikely(ret != 0)) {
9442 ++ kfree(cres);
9443 + goto out_invalid_key;
9444 ++ }
9445 +
9446 + cres->state = VMW_CMDBUF_RES_ADD;
9447 + cres->res = vmw_resource_reference(res);
9448 +diff --git a/drivers/hwmon/pmbus/ltc2978.c b/drivers/hwmon/pmbus/ltc2978.c
9449 +index 58b789c28b48..94eea2ac6251 100644
9450 +--- a/drivers/hwmon/pmbus/ltc2978.c
9451 ++++ b/drivers/hwmon/pmbus/ltc2978.c
9452 +@@ -89,8 +89,8 @@ enum chips { ltc2974, ltc2975, ltc2977, ltc2978, ltc2980, ltc3880, ltc3882,
9453 +
9454 + #define LTC_POLL_TIMEOUT 100 /* in milli-seconds */
9455 +
9456 +-#define LTC_NOT_BUSY BIT(5)
9457 +-#define LTC_NOT_PENDING BIT(4)
9458 ++#define LTC_NOT_BUSY BIT(6)
9459 ++#define LTC_NOT_PENDING BIT(5)
9460 +
9461 + /*
9462 + * LTC2978 clears peak data whenever the CLEAR_FAULTS command is executed, which
9463 +diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c
9464 +index b127ed60c733..9dde8390da09 100644
9465 +--- a/drivers/ide/cmd64x.c
9466 ++++ b/drivers/ide/cmd64x.c
9467 +@@ -65,6 +65,9 @@ static void cmd64x_program_timings(ide_drive_t *drive, u8 mode)
9468 + struct ide_timing t;
9469 + u8 arttim = 0;
9470 +
9471 ++ if (drive->dn >= ARRAY_SIZE(drwtim_regs))
9472 ++ return;
9473 ++
9474 + ide_timing_compute(drive, mode, &t, T, 0);
9475 +
9476 + /*
9477 +diff --git a/drivers/ide/serverworks.c b/drivers/ide/serverworks.c
9478 +index a97affca18ab..0f57d45484d1 100644
9479 +--- a/drivers/ide/serverworks.c
9480 ++++ b/drivers/ide/serverworks.c
9481 +@@ -114,6 +114,9 @@ static void svwks_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
9482 + struct pci_dev *dev = to_pci_dev(hwif->dev);
9483 + const u8 pio = drive->pio_mode - XFER_PIO_0;
9484 +
9485 ++ if (drive->dn >= ARRAY_SIZE(drive_pci))
9486 ++ return;
9487 ++
9488 + pci_write_config_byte(dev, drive_pci[drive->dn], pio_modes[pio]);
9489 +
9490 + if (svwks_csb_check(dev)) {
9491 +@@ -140,6 +143,9 @@ static void svwks_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
9492 +
9493 + u8 ultra_enable = 0, ultra_timing = 0, dma_timing = 0;
9494 +
9495 ++ if (drive->dn >= ARRAY_SIZE(drive_pci2))
9496 ++ return;
9497 ++
9498 + pci_read_config_byte(dev, (0x56|hwif->channel), &ultra_timing);
9499 + pci_read_config_byte(dev, 0x54, &ultra_enable);
9500 +
9501 +diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
9502 +index 47003d2a4a46..dee3853163b6 100644
9503 +--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
9504 ++++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
9505 +@@ -422,7 +422,7 @@ struct rxe_dev {
9506 + struct list_head pending_mmaps;
9507 +
9508 + spinlock_t mmap_offset_lock; /* guard mmap_offset */
9509 +- int mmap_offset;
9510 ++ u64 mmap_offset;
9511 +
9512 + struct rxe_port port;
9513 + struct list_head list;
9514 +diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
9515 +index 0d2ab9a2cf44..02a5e2d7e574 100644
9516 +--- a/drivers/infiniband/ulp/isert/ib_isert.c
9517 ++++ b/drivers/infiniband/ulp/isert/ib_isert.c
9518 +@@ -2555,6 +2555,17 @@ isert_wait4logout(struct isert_conn *isert_conn)
9519 + }
9520 + }
9521 +
9522 ++static void
9523 ++isert_wait4cmds(struct iscsi_conn *conn)
9524 ++{
9525 ++ isert_info("iscsi_conn %p\n", conn);
9526 ++
9527 ++ if (conn->sess) {
9528 ++ target_sess_cmd_list_set_waiting(conn->sess->se_sess);
9529 ++ target_wait_for_sess_cmds(conn->sess->se_sess);
9530 ++ }
9531 ++}
9532 ++
9533 + /**
9534 + * isert_put_unsol_pending_cmds() - Drop commands waiting for
9535 + * unsolicitate dataout
9536 +@@ -2602,6 +2613,7 @@ static void isert_wait_conn(struct iscsi_conn *conn)
9537 +
9538 + ib_drain_qp(isert_conn->qp);
9539 + isert_put_unsol_pending_cmds(conn);
9540 ++ isert_wait4cmds(conn);
9541 + isert_wait4logout(isert_conn);
9542 +
9543 + queue_work(isert_release_wq, &isert_conn->release_work);
9544 +diff --git a/drivers/input/touchscreen/edt-ft5x06.c b/drivers/input/touchscreen/edt-ft5x06.c
9545 +index 28466e358fee..22c8d2070faa 100644
9546 +--- a/drivers/input/touchscreen/edt-ft5x06.c
9547 ++++ b/drivers/input/touchscreen/edt-ft5x06.c
9548 +@@ -887,6 +887,7 @@ static int edt_ft5x06_ts_probe(struct i2c_client *client,
9549 + {
9550 + const struct edt_i2c_chip_data *chip_data;
9551 + struct edt_ft5x06_ts_data *tsdata;
9552 ++ u8 buf[2] = { 0xfc, 0x00 };
9553 + struct input_dev *input;
9554 + unsigned long irq_flags;
9555 + int error;
9556 +@@ -956,6 +957,12 @@ static int edt_ft5x06_ts_probe(struct i2c_client *client,
9557 + return error;
9558 + }
9559 +
9560 ++ /*
9561 ++ * Dummy read access. EP0700MLP1 returns bogus data on the first
9562 ++ * register read access and ignores writes.
9563 ++ */
9564 ++ edt_ft5x06_ts_readwrite(tsdata->client, 2, buf, 2, buf);
9565 ++
9566 + edt_ft5x06_ts_set_regs(tsdata);
9567 + edt_ft5x06_ts_get_defaults(&client->dev, tsdata);
9568 + edt_ft5x06_ts_get_parameters(tsdata);
9569 +diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
9570 +index 7bd98585d78d..48d382008788 100644
9571 +--- a/drivers/iommu/arm-smmu-v3.c
9572 ++++ b/drivers/iommu/arm-smmu-v3.c
9573 +@@ -1103,7 +1103,8 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
9574 + }
9575 +
9576 + arm_smmu_sync_ste_for_sid(smmu, sid);
9577 +- dst[0] = cpu_to_le64(val);
9578 ++ /* See comment in arm_smmu_write_ctx_desc() */
9579 ++ WRITE_ONCE(dst[0], cpu_to_le64(val));
9580 + arm_smmu_sync_ste_for_sid(smmu, sid);
9581 +
9582 + /* It's likely that we'll want to use the new STE soon */
9583 +diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
9584 +index 0c0cd2768d6e..d1efbb8dadc5 100644
9585 +--- a/drivers/irqchip/irq-gic-v3-its.c
9586 ++++ b/drivers/irqchip/irq-gic-v3-its.c
9587 +@@ -365,7 +365,7 @@ static struct its_collection *its_build_invall_cmd(struct its_cmd_block *cmd,
9588 + struct its_cmd_desc *desc)
9589 + {
9590 + its_encode_cmd(cmd, GITS_CMD_INVALL);
9591 +- its_encode_collection(cmd, desc->its_mapc_cmd.col->col_id);
9592 ++ its_encode_collection(cmd, desc->its_invall_cmd.col->col_id);
9593 +
9594 + its_fixup_cmd(cmd);
9595 +
9596 +diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
9597 +index f7b8681aed3f..2ab6060031a4 100644
9598 +--- a/drivers/irqchip/irq-gic-v3.c
9599 ++++ b/drivers/irqchip/irq-gic-v3.c
9600 +@@ -1195,6 +1195,7 @@ static struct
9601 + struct redist_region *redist_regs;
9602 + u32 nr_redist_regions;
9603 + bool single_redist;
9604 ++ int enabled_rdists;
9605 + u32 maint_irq;
9606 + int maint_irq_mode;
9607 + phys_addr_t vcpu_base;
9608 +@@ -1289,8 +1290,10 @@ static int __init gic_acpi_match_gicc(struct acpi_subtable_header *header,
9609 + * If GICC is enabled and has valid gicr base address, then it means
9610 + * GICR base is presented via GICC
9611 + */
9612 +- if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address)
9613 ++ if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address) {
9614 ++ acpi_data.enabled_rdists++;
9615 + return 0;
9616 ++ }
9617 +
9618 + /*
9619 + * It's perfectly valid firmware can pass disabled GICC entry, driver
9620 +@@ -1320,8 +1323,10 @@ static int __init gic_acpi_count_gicr_regions(void)
9621 +
9622 + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
9623 + gic_acpi_match_gicc, 0);
9624 +- if (count > 0)
9625 ++ if (count > 0) {
9626 + acpi_data.single_redist = true;
9627 ++ count = acpi_data.enabled_rdists;
9628 ++ }
9629 +
9630 + return count;
9631 + }
9632 +diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
9633 +index b935839ab79c..f483041eed98 100644
9634 +--- a/drivers/md/bcache/bset.h
9635 ++++ b/drivers/md/bcache/bset.h
9636 +@@ -380,7 +380,8 @@ void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);
9637 +
9638 + /* Bkey utility code */
9639 +
9640 +-#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys)
9641 ++#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, \
9642 ++ (unsigned int)(i)->keys)
9643 +
9644 + static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
9645 + {
9646 +diff --git a/drivers/media/i2c/mt9v032.c b/drivers/media/i2c/mt9v032.c
9647 +index 58eb62f1ba21..a018a76662df 100644
9648 +--- a/drivers/media/i2c/mt9v032.c
9649 ++++ b/drivers/media/i2c/mt9v032.c
9650 +@@ -423,10 +423,12 @@ static int mt9v032_enum_mbus_code(struct v4l2_subdev *subdev,
9651 + struct v4l2_subdev_pad_config *cfg,
9652 + struct v4l2_subdev_mbus_code_enum *code)
9653 + {
9654 ++ struct mt9v032 *mt9v032 = to_mt9v032(subdev);
9655 ++
9656 + if (code->index > 0)
9657 + return -EINVAL;
9658 +
9659 +- code->code = MEDIA_BUS_FMT_SGRBG10_1X10;
9660 ++ code->code = mt9v032->format.code;
9661 + return 0;
9662 + }
9663 +
9664 +@@ -434,7 +436,11 @@ static int mt9v032_enum_frame_size(struct v4l2_subdev *subdev,
9665 + struct v4l2_subdev_pad_config *cfg,
9666 + struct v4l2_subdev_frame_size_enum *fse)
9667 + {
9668 +- if (fse->index >= 3 || fse->code != MEDIA_BUS_FMT_SGRBG10_1X10)
9669 ++ struct mt9v032 *mt9v032 = to_mt9v032(subdev);
9670 ++
9671 ++ if (fse->index >= 3)
9672 ++ return -EINVAL;
9673 ++ if (mt9v032->format.code != fse->code)
9674 + return -EINVAL;
9675 +
9676 + fse->min_width = MT9V032_WINDOW_WIDTH_DEF / (1 << fse->index);
9677 +diff --git a/drivers/media/platform/sti/bdisp/bdisp-hw.c b/drivers/media/platform/sti/bdisp/bdisp-hw.c
9678 +index b7892f3efd98..5c4c3f0c57be 100644
9679 +--- a/drivers/media/platform/sti/bdisp/bdisp-hw.c
9680 ++++ b/drivers/media/platform/sti/bdisp/bdisp-hw.c
9681 +@@ -14,8 +14,8 @@
9682 + #define MAX_SRC_WIDTH 2048
9683 +
9684 + /* Reset & boot poll config */
9685 +-#define POLL_RST_MAX 50
9686 +-#define POLL_RST_DELAY_MS 20
9687 ++#define POLL_RST_MAX 500
9688 ++#define POLL_RST_DELAY_MS 2
9689 +
9690 + enum bdisp_target_plan {
9691 + BDISP_RGB,
9692 +@@ -382,7 +382,7 @@ int bdisp_hw_reset(struct bdisp_dev *bdisp)
9693 + for (i = 0; i < POLL_RST_MAX; i++) {
9694 + if (readl(bdisp->regs + BLT_STA1) & BLT_STA1_IDLE)
9695 + break;
9696 +- msleep(POLL_RST_DELAY_MS);
9697 ++ udelay(POLL_RST_DELAY_MS * 1000);
9698 + }
9699 + if (i == POLL_RST_MAX)
9700 + dev_err(bdisp->dev, "Reset timeout\n");
9701 +diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
9702 +index b73d9ba9496c..96290b83dfde 100644
9703 +--- a/drivers/net/ethernet/cisco/enic/enic_main.c
9704 ++++ b/drivers/net/ethernet/cisco/enic/enic_main.c
9705 +@@ -1806,10 +1806,10 @@ static int enic_stop(struct net_device *netdev)
9706 + }
9707 +
9708 + netif_carrier_off(netdev);
9709 +- netif_tx_disable(netdev);
9710 + if (vnic_dev_get_intr_mode(enic->vdev) == VNIC_DEV_INTR_MODE_MSIX)
9711 + for (i = 0; i < enic->wq_count; i++)
9712 + napi_disable(&enic->napi[enic_cq_wq(enic, i)]);
9713 ++ netif_tx_disable(netdev);
9714 +
9715 + if (!enic_is_dynamic(enic) && !enic_is_sriov_vf(enic))
9716 + enic_dev_del_station_addr(enic);
9717 +diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
9718 +index 60bd1b36df60..b665d27f8e29 100644
9719 +--- a/drivers/net/ethernet/freescale/gianfar.c
9720 ++++ b/drivers/net/ethernet/freescale/gianfar.c
9721 +@@ -2688,13 +2688,17 @@ static void gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
9722 + skb_dirtytx = tx_queue->skb_dirtytx;
9723 +
9724 + while ((skb = tx_queue->tx_skbuff[skb_dirtytx])) {
9725 ++ bool do_tstamp;
9726 ++
9727 ++ do_tstamp = (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
9728 ++ priv->hwts_tx_en;
9729 +
9730 + frags = skb_shinfo(skb)->nr_frags;
9731 +
9732 + /* When time stamping, one additional TxBD must be freed.
9733 + * Also, we need to dma_unmap_single() the TxPAL.
9734 + */
9735 +- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS))
9736 ++ if (unlikely(do_tstamp))
9737 + nr_txbds = frags + 2;
9738 + else
9739 + nr_txbds = frags + 1;
9740 +@@ -2708,7 +2712,7 @@ static void gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
9741 + (lstatus & BD_LENGTH_MASK))
9742 + break;
9743 +
9744 +- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) {
9745 ++ if (unlikely(do_tstamp)) {
9746 + next = next_txbd(bdp, base, tx_ring_size);
9747 + buflen = be16_to_cpu(next->length) +
9748 + GMAC_FCB_LEN + GMAC_TXPAL_LEN;
9749 +@@ -2718,7 +2722,7 @@ static void gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
9750 + dma_unmap_single(priv->dev, be32_to_cpu(bdp->bufPtr),
9751 + buflen, DMA_TO_DEVICE);
9752 +
9753 +- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) {
9754 ++ if (unlikely(do_tstamp)) {
9755 + struct skb_shared_hwtstamps shhwtstamps;
9756 + u64 *ns = (u64 *)(((uintptr_t)skb->data + 0x10) &
9757 + ~0x7UL);
9758 +diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
9759 +index af85a1b3135e..87bf05a81db5 100644
9760 +--- a/drivers/net/wan/fsl_ucc_hdlc.c
9761 ++++ b/drivers/net/wan/fsl_ucc_hdlc.c
9762 +@@ -209,6 +209,11 @@ static int uhdlc_init(struct ucc_hdlc_private *priv)
9763 + ret = -ENOMEM;
9764 + goto free_riptr;
9765 + }
9766 ++ if (riptr != (u16)riptr || tiptr != (u16)tiptr) {
9767 ++ dev_err(priv->dev, "MURAM allocation out of addressable range\n");
9768 ++ ret = -ENOMEM;
9769 ++ goto free_tiptr;
9770 ++ }
9771 +
9772 + /* Set RIPTR, TIPTR */
9773 + iowrite16be(riptr, &priv->ucc_pram->riptr);
9774 +diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c
9775 +index e7bbdb7af53a..97968e6a6a4e 100644
9776 +--- a/drivers/net/wan/ixp4xx_hss.c
9777 ++++ b/drivers/net/wan/ixp4xx_hss.c
9778 +@@ -261,7 +261,7 @@ struct port {
9779 + struct hss_plat_info *plat;
9780 + buffer_t *rx_buff_tab[RX_DESCS], *tx_buff_tab[TX_DESCS];
9781 + struct desc *desc_tab; /* coherent */
9782 +- u32 desc_tab_phys;
9783 ++ dma_addr_t desc_tab_phys;
9784 + unsigned int id;
9785 + unsigned int clock_type, clock_rate, loopback;
9786 + unsigned int initialized, carrier;
9787 +@@ -861,7 +861,7 @@ static int hss_hdlc_xmit(struct sk_buff *skb, struct net_device *dev)
9788 + dev->stats.tx_dropped++;
9789 + return NETDEV_TX_OK;
9790 + }
9791 +- memcpy_swab32(mem, (u32 *)((int)skb->data & ~3), bytes / 4);
9792 ++ memcpy_swab32(mem, (u32 *)((uintptr_t)skb->data & ~3), bytes / 4);
9793 + dev_kfree_skb(skb);
9794 + #endif
9795 +
9796 +diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c
9797 +index 83770d2ea057..9da8bd792702 100644
9798 +--- a/drivers/net/wireless/broadcom/b43legacy/main.c
9799 ++++ b/drivers/net/wireless/broadcom/b43legacy/main.c
9800 +@@ -1304,8 +1304,9 @@ static void handle_irq_ucode_debug(struct b43legacy_wldev *dev)
9801 + }
9802 +
9803 + /* Interrupt handler bottom-half */
9804 +-static void b43legacy_interrupt_tasklet(struct b43legacy_wldev *dev)
9805 ++static void b43legacy_interrupt_tasklet(unsigned long data)
9806 + {
9807 ++ struct b43legacy_wldev *dev = (struct b43legacy_wldev *)data;
9808 + u32 reason;
9809 + u32 dma_reason[ARRAY_SIZE(dev->dma_reason)];
9810 + u32 merged_dma_reason = 0;
9811 +@@ -3775,7 +3776,7 @@ static int b43legacy_one_core_attach(struct ssb_device *dev,
9812 + b43legacy_set_status(wldev, B43legacy_STAT_UNINIT);
9813 + wldev->bad_frames_preempt = modparam_bad_frames_preempt;
9814 + tasklet_init(&wldev->isr_tasklet,
9815 +- (void (*)(unsigned long))b43legacy_interrupt_tasklet,
9816 ++ b43legacy_interrupt_tasklet,
9817 + (unsigned long)wldev);
9818 + if (modparam_pio)
9819 + wldev->__using_pio = true;
9820 +diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
9821 +index de52d826eb24..998a4bd6db78 100644
9822 +--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
9823 ++++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
9824 +@@ -1921,6 +1921,7 @@ static uint brcmf_sdio_readframes(struct brcmf_sdio *bus, uint maxframes)
9825 + BRCMF_SDIO_FT_NORMAL)) {
9826 + rd->len = 0;
9827 + brcmu_pkt_buf_free_skb(pkt);
9828 ++ continue;
9829 + }
9830 + bus->sdcnt.rx_readahead_cnt++;
9831 + if (rd->len != roundup(rd_new.len, 16)) {
9832 +diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
9833 +index bfa542c8d6f1..86c84b11218d 100644
9834 +--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c
9835 ++++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
9836 +@@ -3220,8 +3220,9 @@ static void ipw2100_tx_send_data(struct ipw2100_priv *priv)
9837 + }
9838 + }
9839 +
9840 +-static void ipw2100_irq_tasklet(struct ipw2100_priv *priv)
9841 ++static void ipw2100_irq_tasklet(unsigned long data)
9842 + {
9843 ++ struct ipw2100_priv *priv = (struct ipw2100_priv *)data;
9844 + struct net_device *dev = priv->net_dev;
9845 + unsigned long flags;
9846 + u32 inta, tmp;
9847 +@@ -6029,7 +6030,7 @@ static void ipw2100_rf_kill(struct work_struct *work)
9848 + spin_unlock_irqrestore(&priv->low_lock, flags);
9849 + }
9850 +
9851 +-static void ipw2100_irq_tasklet(struct ipw2100_priv *priv);
9852 ++static void ipw2100_irq_tasklet(unsigned long data);
9853 +
9854 + static const struct net_device_ops ipw2100_netdev_ops = {
9855 + .ndo_open = ipw2100_open,
9856 +@@ -6158,7 +6159,7 @@ static struct net_device *ipw2100_alloc_device(struct pci_dev *pci_dev,
9857 + INIT_DELAYED_WORK(&priv->rf_kill, ipw2100_rf_kill);
9858 + INIT_DELAYED_WORK(&priv->scan_event, ipw2100_scan_event);
9859 +
9860 +- tasklet_init(&priv->irq_tasklet, (void (*)(unsigned long))
9861 ++ tasklet_init(&priv->irq_tasklet,
9862 + ipw2100_irq_tasklet, (unsigned long)priv);
9863 +
9864 + /* NOTE: We do not start the deferred work for status checks yet */
9865 +diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c
9866 +index bfd68612a535..48edb2b6eb7d 100644
9867 +--- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c
9868 ++++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c
9869 +@@ -1968,8 +1968,9 @@ static void notify_wx_assoc_event(struct ipw_priv *priv)
9870 + wireless_send_event(priv->net_dev, SIOCGIWAP, &wrqu, NULL);
9871 + }
9872 +
9873 +-static void ipw_irq_tasklet(struct ipw_priv *priv)
9874 ++static void ipw_irq_tasklet(unsigned long data)
9875 + {
9876 ++ struct ipw_priv *priv = (struct ipw_priv *)data;
9877 + u32 inta, inta_mask, handled = 0;
9878 + unsigned long flags;
9879 + int rc = 0;
9880 +@@ -10705,7 +10706,7 @@ static int ipw_setup_deferred_work(struct ipw_priv *priv)
9881 + INIT_WORK(&priv->qos_activate, ipw_bg_qos_activate);
9882 + #endif /* CONFIG_IPW2200_QOS */
9883 +
9884 +- tasklet_init(&priv->irq_tasklet, (void (*)(unsigned long))
9885 ++ tasklet_init(&priv->irq_tasklet,
9886 + ipw_irq_tasklet, (unsigned long)priv);
9887 +
9888 + return ret;
9889 +diff --git a/drivers/net/wireless/intel/iwlegacy/3945-mac.c b/drivers/net/wireless/intel/iwlegacy/3945-mac.c
9890 +index 466912eb2d87..d853ccbf74cb 100644
9891 +--- a/drivers/net/wireless/intel/iwlegacy/3945-mac.c
9892 ++++ b/drivers/net/wireless/intel/iwlegacy/3945-mac.c
9893 +@@ -1399,8 +1399,9 @@ il3945_dump_nic_error_log(struct il_priv *il)
9894 + }
9895 +
9896 + static void
9897 +-il3945_irq_tasklet(struct il_priv *il)
9898 ++il3945_irq_tasklet(unsigned long data)
9899 + {
9900 ++ struct il_priv *il = (struct il_priv *)data;
9901 + u32 inta, handled = 0;
9902 + u32 inta_fh;
9903 + unsigned long flags;
9904 +@@ -3432,7 +3433,7 @@ il3945_setup_deferred_work(struct il_priv *il)
9905 + setup_timer(&il->watchdog, il_bg_watchdog, (unsigned long)il);
9906 +
9907 + tasklet_init(&il->irq_tasklet,
9908 +- (void (*)(unsigned long))il3945_irq_tasklet,
9909 ++ il3945_irq_tasklet,
9910 + (unsigned long)il);
9911 + }
9912 +
9913 +diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
9914 +index a91d170a614b..6c2dcd236713 100644
9915 +--- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c
9916 ++++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
9917 +@@ -4361,8 +4361,9 @@ il4965_synchronize_irq(struct il_priv *il)
9918 + }
9919 +
9920 + static void
9921 +-il4965_irq_tasklet(struct il_priv *il)
9922 ++il4965_irq_tasklet(unsigned long data)
9923 + {
9924 ++ struct il_priv *il = (struct il_priv *)data;
9925 + u32 inta, handled = 0;
9926 + u32 inta_fh;
9927 + unsigned long flags;
9928 +@@ -6260,7 +6261,7 @@ il4965_setup_deferred_work(struct il_priv *il)
9929 + setup_timer(&il->watchdog, il_bg_watchdog, (unsigned long)il);
9930 +
9931 + tasklet_init(&il->irq_tasklet,
9932 +- (void (*)(unsigned long))il4965_irq_tasklet,
9933 ++ il4965_irq_tasklet,
9934 + (unsigned long)il);
9935 + }
9936 +
9937 +diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c
9938 +index 140b6ea8f7cc..db2373fe8ac3 100644
9939 +--- a/drivers/net/wireless/intel/iwlegacy/common.c
9940 ++++ b/drivers/net/wireless/intel/iwlegacy/common.c
9941 +@@ -717,7 +717,7 @@ il_eeprom_init(struct il_priv *il)
9942 + u32 gp = _il_rd(il, CSR_EEPROM_GP);
9943 + int sz;
9944 + int ret;
9945 +- u16 addr;
9946 ++ int addr;
9947 +
9948 + /* allocate eeprom */
9949 + sz = il->cfg->eeprom_size;
9950 +diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c
9951 +index c5203568a47a..f0f205c3aadb 100644
9952 +--- a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c
9953 ++++ b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c
9954 +@@ -736,7 +736,8 @@ static struct thermal_zone_device_ops tzone_ops = {
9955 + static void iwl_mvm_thermal_zone_register(struct iwl_mvm *mvm)
9956 + {
9957 + int i;
9958 +- char name[] = "iwlwifi";
9959 ++ char name[16];
9960 ++ static atomic_t counter = ATOMIC_INIT(0);
9961 +
9962 + if (!iwl_mvm_is_tt_in_fw(mvm)) {
9963 + mvm->tz_device.tzone = NULL;
9964 +@@ -746,6 +747,7 @@ static void iwl_mvm_thermal_zone_register(struct iwl_mvm *mvm)
9965 +
9966 + BUILD_BUG_ON(ARRAY_SIZE(name) >= THERMAL_NAME_LENGTH);
9967 +
9968 ++ sprintf(name, "iwlwifi_%u", atomic_inc_return(&counter) & 0xFF);
9969 + mvm->tz_device.tzone = thermal_zone_device_register(name,
9970 + IWL_MAX_DTS_TRIPS,
9971 + IWL_WRITABLE_TRIPS_MSK,
9972 +diff --git a/drivers/net/wireless/intersil/hostap/hostap_ap.c b/drivers/net/wireless/intersil/hostap/hostap_ap.c
9973 +index c995ace153ee..30171d4c4718 100644
9974 +--- a/drivers/net/wireless/intersil/hostap/hostap_ap.c
9975 ++++ b/drivers/net/wireless/intersil/hostap/hostap_ap.c
9976 +@@ -2570,7 +2570,7 @@ static int prism2_hostapd_add_sta(struct ap_data *ap,
9977 + sta->supported_rates[0] = 2;
9978 + if (sta->tx_supp_rates & WLAN_RATE_2M)
9979 + sta->supported_rates[1] = 4;
9980 +- if (sta->tx_supp_rates & WLAN_RATE_5M5)
9981 ++ if (sta->tx_supp_rates & WLAN_RATE_5M5)
9982 + sta->supported_rates[2] = 11;
9983 + if (sta->tx_supp_rates & WLAN_RATE_11M)
9984 + sta->supported_rates[3] = 22;
9985 +diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9986 +index 8244d8262951..4e91c74fcfad 100644
9987 +--- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9988 ++++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9989 +@@ -1351,7 +1351,8 @@ static int ezusb_init(struct hermes *hw)
9990 + int retval;
9991 +
9992 + BUG_ON(in_interrupt());
9993 +- BUG_ON(!upriv);
9994 ++ if (!upriv)
9995 ++ return -EINVAL;
9996 +
9997 + upriv->reply_count = 0;
9998 + /* Write the MAGIC number on the simulated registers to keep
9999 +diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c
10000 +index e15b462d096b..21b7cb845bf4 100644
10001 +--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
10002 ++++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
10003 +@@ -1095,13 +1095,15 @@ done:
10004 + return ret;
10005 + }
10006 +
10007 +-static void _rtl_pci_irq_tasklet(struct ieee80211_hw *hw)
10008 ++static void _rtl_pci_irq_tasklet(unsigned long data)
10009 + {
10010 ++ struct ieee80211_hw *hw = (struct ieee80211_hw *)data;
10011 + _rtl_pci_tx_chk_waitq(hw);
10012 + }
10013 +
10014 +-static void _rtl_pci_prepare_bcn_tasklet(struct ieee80211_hw *hw)
10015 ++static void _rtl_pci_prepare_bcn_tasklet(unsigned long data)
10016 + {
10017 ++ struct ieee80211_hw *hw = (struct ieee80211_hw *)data;
10018 + struct rtl_priv *rtlpriv = rtl_priv(hw);
10019 + struct rtl_pci *rtlpci = rtl_pcidev(rtl_pcipriv(hw));
10020 + struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
10021 +@@ -1223,10 +1225,10 @@ static void _rtl_pci_init_struct(struct ieee80211_hw *hw,
10022 +
10023 + /*task */
10024 + tasklet_init(&rtlpriv->works.irq_tasklet,
10025 +- (void (*)(unsigned long))_rtl_pci_irq_tasklet,
10026 ++ _rtl_pci_irq_tasklet,
10027 + (unsigned long)hw);
10028 + tasklet_init(&rtlpriv->works.irq_prepare_bcn_tasklet,
10029 +- (void (*)(unsigned long))_rtl_pci_prepare_bcn_tasklet,
10030 ++ _rtl_pci_prepare_bcn_tasklet,
10031 + (unsigned long)hw);
10032 + INIT_WORK(&rtlpriv->works.lps_change_work,
10033 + rtl_lps_change_work_callback);
10034 +diff --git a/drivers/nfc/port100.c b/drivers/nfc/port100.c
10035 +index 3cd995de1bbb..151b220381f9 100644
10036 +--- a/drivers/nfc/port100.c
10037 ++++ b/drivers/nfc/port100.c
10038 +@@ -573,7 +573,7 @@ static void port100_tx_update_payload_len(void *_frame, int len)
10039 + {
10040 + struct port100_frame *frame = _frame;
10041 +
10042 +- frame->datalen = cpu_to_le16(le16_to_cpu(frame->datalen) + len);
10043 ++ le16_add_cpu(&frame->datalen, len);
10044 + }
10045 +
10046 + static bool port100_rx_frame_is_valid(void *_frame)
10047 +diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
10048 +index 1d32fe2d97aa..9ec3cb628b0b 100644
10049 +--- a/drivers/pci/iov.c
10050 ++++ b/drivers/pci/iov.c
10051 +@@ -181,6 +181,7 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id, int reset)
10052 + failed2:
10053 + sysfs_remove_link(&dev->dev.kobj, buf);
10054 + failed1:
10055 ++ pci_stop_and_remove_bus_device(virtfn);
10056 + pci_dev_put(dev);
10057 + mutex_lock(&iov->dev->sriov->lock);
10058 + pci_stop_and_remove_bus_device(virtfn);
10059 +diff --git a/drivers/pinctrl/intel/pinctrl-baytrail.c b/drivers/pinctrl/intel/pinctrl-baytrail.c
10060 +index f83a2a60d9c9..1e945aa77734 100644
10061 +--- a/drivers/pinctrl/intel/pinctrl-baytrail.c
10062 ++++ b/drivers/pinctrl/intel/pinctrl-baytrail.c
10063 +@@ -958,7 +958,13 @@ static void byt_gpio_clear_triggering(struct byt_gpio *vg, unsigned int offset)
10064 +
10065 + raw_spin_lock_irqsave(&byt_lock, flags);
10066 + value = readl(reg);
10067 +- value &= ~(BYT_TRIG_POS | BYT_TRIG_NEG | BYT_TRIG_LVL);
10068 ++
10069 ++ /* Do not clear direct-irq enabled IRQs (from gpio_disable_free) */
10070 ++ if (value & BYT_DIRECT_IRQ_EN)
10071 ++ /* nothing to do */ ;
10072 ++ else
10073 ++ value &= ~(BYT_TRIG_POS | BYT_TRIG_NEG | BYT_TRIG_LVL);
10074 ++
10075 + writel(value, reg);
10076 + raw_spin_unlock_irqrestore(&byt_lock, flags);
10077 + }
10078 +diff --git a/drivers/pinctrl/sh-pfc/pfc-sh7264.c b/drivers/pinctrl/sh-pfc/pfc-sh7264.c
10079 +index e1c34e19222e..3ddb9565ed80 100644
10080 +--- a/drivers/pinctrl/sh-pfc/pfc-sh7264.c
10081 ++++ b/drivers/pinctrl/sh-pfc/pfc-sh7264.c
10082 +@@ -500,17 +500,15 @@ enum {
10083 + SD_WP_MARK, SD_CLK_MARK, SD_CMD_MARK,
10084 + CRX0_MARK, CRX1_MARK,
10085 + CTX0_MARK, CTX1_MARK,
10086 ++ CRX0_CRX1_MARK, CTX0_CTX1_MARK,
10087 +
10088 + PWM1A_MARK, PWM1B_MARK, PWM1C_MARK, PWM1D_MARK,
10089 + PWM1E_MARK, PWM1F_MARK, PWM1G_MARK, PWM1H_MARK,
10090 + PWM2A_MARK, PWM2B_MARK, PWM2C_MARK, PWM2D_MARK,
10091 + PWM2E_MARK, PWM2F_MARK, PWM2G_MARK, PWM2H_MARK,
10092 + IERXD_MARK, IETXD_MARK,
10093 +- CRX0_CRX1_MARK,
10094 + WDTOVF_MARK,
10095 +
10096 +- CRX0X1_MARK,
10097 +-
10098 + /* DMAC */
10099 + TEND0_MARK, DACK0_MARK, DREQ0_MARK,
10100 + TEND1_MARK, DACK1_MARK, DREQ1_MARK,
10101 +@@ -998,12 +996,12 @@ static const u16 pinmux_data[] = {
10102 +
10103 + PINMUX_DATA(PJ3_DATA, PJ3MD_00),
10104 + PINMUX_DATA(CRX1_MARK, PJ3MD_01),
10105 +- PINMUX_DATA(CRX0X1_MARK, PJ3MD_10),
10106 ++ PINMUX_DATA(CRX0_CRX1_MARK, PJ3MD_10),
10107 + PINMUX_DATA(IRQ1_PJ_MARK, PJ3MD_11),
10108 +
10109 + PINMUX_DATA(PJ2_DATA, PJ2MD_000),
10110 + PINMUX_DATA(CTX1_MARK, PJ2MD_001),
10111 +- PINMUX_DATA(CRX0_CRX1_MARK, PJ2MD_010),
10112 ++ PINMUX_DATA(CTX0_CTX1_MARK, PJ2MD_010),
10113 + PINMUX_DATA(CS2_MARK, PJ2MD_011),
10114 + PINMUX_DATA(SCK0_MARK, PJ2MD_100),
10115 + PINMUX_DATA(LCD_M_DISP_MARK, PJ2MD_101),
10116 +@@ -1248,6 +1246,7 @@ static const struct pinmux_func pinmux_func_gpios[] = {
10117 + GPIO_FN(CTX1),
10118 + GPIO_FN(CRX1),
10119 + GPIO_FN(CTX0),
10120 ++ GPIO_FN(CTX0_CTX1),
10121 + GPIO_FN(CRX0),
10122 + GPIO_FN(CRX0_CRX1),
10123 +
10124 +diff --git a/drivers/pinctrl/sh-pfc/pfc-sh7269.c b/drivers/pinctrl/sh-pfc/pfc-sh7269.c
10125 +index cfdb4fc177c3..3df0c0d139d0 100644
10126 +--- a/drivers/pinctrl/sh-pfc/pfc-sh7269.c
10127 ++++ b/drivers/pinctrl/sh-pfc/pfc-sh7269.c
10128 +@@ -740,13 +740,12 @@ enum {
10129 + CRX0_MARK, CTX0_MARK,
10130 + CRX1_MARK, CTX1_MARK,
10131 + CRX2_MARK, CTX2_MARK,
10132 +- CRX0_CRX1_MARK,
10133 +- CRX0_CRX1_CRX2_MARK,
10134 +- CTX0CTX1CTX2_MARK,
10135 ++ CRX0_CRX1_MARK, CTX0_CTX1_MARK,
10136 ++ CRX0_CRX1_CRX2_MARK, CTX0_CTX1_CTX2_MARK,
10137 + CRX1_PJ22_MARK, CTX1_PJ23_MARK,
10138 + CRX2_PJ20_MARK, CTX2_PJ21_MARK,
10139 +- CRX0CRX1_PJ22_MARK,
10140 +- CRX0CRX1CRX2_PJ20_MARK,
10141 ++ CRX0_CRX1_PJ22_MARK, CTX0_CTX1_PJ23_MARK,
10142 ++ CRX0_CRX1_CRX2_PJ20_MARK, CTX0_CTX1_CTX2_PJ21_MARK,
10143 +
10144 + /* VDC */
10145 + DV_CLK_MARK,
10146 +@@ -824,6 +823,7 @@ static const u16 pinmux_data[] = {
10147 + PINMUX_DATA(CS3_MARK, PC8MD_001),
10148 + PINMUX_DATA(TXD7_MARK, PC8MD_010),
10149 + PINMUX_DATA(CTX1_MARK, PC8MD_011),
10150 ++ PINMUX_DATA(CTX0_CTX1_MARK, PC8MD_100),
10151 +
10152 + PINMUX_DATA(PC7_DATA, PC7MD_000),
10153 + PINMUX_DATA(CKE_MARK, PC7MD_001),
10154 +@@ -836,11 +836,12 @@ static const u16 pinmux_data[] = {
10155 + PINMUX_DATA(CAS_MARK, PC6MD_001),
10156 + PINMUX_DATA(SCK7_MARK, PC6MD_010),
10157 + PINMUX_DATA(CTX0_MARK, PC6MD_011),
10158 ++ PINMUX_DATA(CTX0_CTX1_CTX2_MARK, PC6MD_100),
10159 +
10160 + PINMUX_DATA(PC5_DATA, PC5MD_000),
10161 + PINMUX_DATA(RAS_MARK, PC5MD_001),
10162 + PINMUX_DATA(CRX0_MARK, PC5MD_011),
10163 +- PINMUX_DATA(CTX0CTX1CTX2_MARK, PC5MD_100),
10164 ++ PINMUX_DATA(CTX0_CTX1_CTX2_MARK, PC5MD_100),
10165 + PINMUX_DATA(IRQ0_PC_MARK, PC5MD_101),
10166 +
10167 + PINMUX_DATA(PC4_DATA, PC4MD_00),
10168 +@@ -1292,30 +1293,32 @@ static const u16 pinmux_data[] = {
10169 + PINMUX_DATA(LCD_DATA23_PJ23_MARK, PJ23MD_010),
10170 + PINMUX_DATA(LCD_TCON6_MARK, PJ23MD_011),
10171 + PINMUX_DATA(IRQ3_PJ_MARK, PJ23MD_100),
10172 +- PINMUX_DATA(CTX1_MARK, PJ23MD_101),
10173 ++ PINMUX_DATA(CTX1_PJ23_MARK, PJ23MD_101),
10174 ++ PINMUX_DATA(CTX0_CTX1_PJ23_MARK, PJ23MD_110),
10175 +
10176 + PINMUX_DATA(PJ22_DATA, PJ22MD_000),
10177 + PINMUX_DATA(DV_DATA22_MARK, PJ22MD_001),
10178 + PINMUX_DATA(LCD_DATA22_PJ22_MARK, PJ22MD_010),
10179 + PINMUX_DATA(LCD_TCON5_MARK, PJ22MD_011),
10180 + PINMUX_DATA(IRQ2_PJ_MARK, PJ22MD_100),
10181 +- PINMUX_DATA(CRX1_MARK, PJ22MD_101),
10182 +- PINMUX_DATA(CRX0_CRX1_MARK, PJ22MD_110),
10183 ++ PINMUX_DATA(CRX1_PJ22_MARK, PJ22MD_101),
10184 ++ PINMUX_DATA(CRX0_CRX1_PJ22_MARK, PJ22MD_110),
10185 +
10186 + PINMUX_DATA(PJ21_DATA, PJ21MD_000),
10187 + PINMUX_DATA(DV_DATA21_MARK, PJ21MD_001),
10188 + PINMUX_DATA(LCD_DATA21_PJ21_MARK, PJ21MD_010),
10189 + PINMUX_DATA(LCD_TCON4_MARK, PJ21MD_011),
10190 + PINMUX_DATA(IRQ1_PJ_MARK, PJ21MD_100),
10191 +- PINMUX_DATA(CTX2_MARK, PJ21MD_101),
10192 ++ PINMUX_DATA(CTX2_PJ21_MARK, PJ21MD_101),
10193 ++ PINMUX_DATA(CTX0_CTX1_CTX2_PJ21_MARK, PJ21MD_110),
10194 +
10195 + PINMUX_DATA(PJ20_DATA, PJ20MD_000),
10196 + PINMUX_DATA(DV_DATA20_MARK, PJ20MD_001),
10197 + PINMUX_DATA(LCD_DATA20_PJ20_MARK, PJ20MD_010),
10198 + PINMUX_DATA(LCD_TCON3_MARK, PJ20MD_011),
10199 + PINMUX_DATA(IRQ0_PJ_MARK, PJ20MD_100),
10200 +- PINMUX_DATA(CRX2_MARK, PJ20MD_101),
10201 +- PINMUX_DATA(CRX0CRX1CRX2_PJ20_MARK, PJ20MD_110),
10202 ++ PINMUX_DATA(CRX2_PJ20_MARK, PJ20MD_101),
10203 ++ PINMUX_DATA(CRX0_CRX1_CRX2_PJ20_MARK, PJ20MD_110),
10204 +
10205 + PINMUX_DATA(PJ19_DATA, PJ19MD_000),
10206 + PINMUX_DATA(DV_DATA19_MARK, PJ19MD_001),
10207 +@@ -1666,12 +1669,24 @@ static const struct pinmux_func pinmux_func_gpios[] = {
10208 + GPIO_FN(WDTOVF),
10209 +
10210 + /* CAN */
10211 ++ GPIO_FN(CTX2),
10212 ++ GPIO_FN(CRX2),
10213 + GPIO_FN(CTX1),
10214 + GPIO_FN(CRX1),
10215 + GPIO_FN(CTX0),
10216 + GPIO_FN(CRX0),
10217 ++ GPIO_FN(CTX0_CTX1),
10218 + GPIO_FN(CRX0_CRX1),
10219 ++ GPIO_FN(CTX0_CTX1_CTX2),
10220 + GPIO_FN(CRX0_CRX1_CRX2),
10221 ++ GPIO_FN(CTX2_PJ21),
10222 ++ GPIO_FN(CRX2_PJ20),
10223 ++ GPIO_FN(CTX1_PJ23),
10224 ++ GPIO_FN(CRX1_PJ22),
10225 ++ GPIO_FN(CTX0_CTX1_PJ23),
10226 ++ GPIO_FN(CRX0_CRX1_PJ22),
10227 ++ GPIO_FN(CTX0_CTX1_CTX2_PJ21),
10228 ++ GPIO_FN(CRX0_CRX1_CRX2_PJ20),
10229 +
10230 + /* DMAC */
10231 + GPIO_FN(TEND0),
10232 +diff --git a/drivers/pwm/pwm-omap-dmtimer.c b/drivers/pwm/pwm-omap-dmtimer.c
10233 +index 5ad42f33e70c..2e15acf13893 100644
10234 +--- a/drivers/pwm/pwm-omap-dmtimer.c
10235 ++++ b/drivers/pwm/pwm-omap-dmtimer.c
10236 +@@ -337,6 +337,11 @@ static int pwm_omap_dmtimer_probe(struct platform_device *pdev)
10237 + static int pwm_omap_dmtimer_remove(struct platform_device *pdev)
10238 + {
10239 + struct pwm_omap_dmtimer_chip *omap = platform_get_drvdata(pdev);
10240 ++ int ret;
10241 ++
10242 ++ ret = pwmchip_remove(&omap->chip);
10243 ++ if (ret)
10244 ++ return ret;
10245 +
10246 + if (pm_runtime_active(&omap->dm_timer_pdev->dev))
10247 + omap->pdata->stop(omap->dm_timer);
10248 +@@ -345,7 +350,7 @@ static int pwm_omap_dmtimer_remove(struct platform_device *pdev)
10249 +
10250 + mutex_destroy(&omap->mutex);
10251 +
10252 +- return pwmchip_remove(&omap->chip);
10253 ++ return 0;
10254 + }
10255 +
10256 + static const struct of_device_id pwm_omap_dmtimer_of_match[] = {
10257 +diff --git a/drivers/regulator/rk808-regulator.c b/drivers/regulator/rk808-regulator.c
10258 +index dfa8d50a5d74..28646e4cf3ba 100644
10259 +--- a/drivers/regulator/rk808-regulator.c
10260 ++++ b/drivers/regulator/rk808-regulator.c
10261 +@@ -589,7 +589,7 @@ static int rk808_regulator_dt_parse_pdata(struct device *dev,
10262 + }
10263 +
10264 + if (!pdata->dvs_gpio[i]) {
10265 +- dev_warn(dev, "there is no dvs%d gpio\n", i);
10266 ++ dev_info(dev, "there is no dvs%d gpio\n", i);
10267 + continue;
10268 + }
10269 +
10270 +diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
10271 +index c6bfb3496684..b99780574044 100644
10272 +--- a/drivers/remoteproc/remoteproc_core.c
10273 ++++ b/drivers/remoteproc/remoteproc_core.c
10274 +@@ -1488,7 +1488,7 @@ static int __init remoteproc_init(void)
10275 +
10276 + return 0;
10277 + }
10278 +-module_init(remoteproc_init);
10279 ++subsys_initcall(remoteproc_init);
10280 +
10281 + static void __exit remoteproc_exit(void)
10282 + {
10283 +diff --git a/drivers/scsi/aic7xxx/aic7xxx_core.c b/drivers/scsi/aic7xxx/aic7xxx_core.c
10284 +index 64ab9eaec428..def3208dd290 100644
10285 +--- a/drivers/scsi/aic7xxx/aic7xxx_core.c
10286 ++++ b/drivers/scsi/aic7xxx/aic7xxx_core.c
10287 +@@ -2321,7 +2321,7 @@ ahc_find_syncrate(struct ahc_softc *ahc, u_int *period,
10288 + * At some speeds, we only support
10289 + * ST transfers.
10290 + */
10291 +- if ((syncrate->sxfr_u2 & ST_SXFR) != 0)
10292 ++ if ((syncrate->sxfr_u2 & ST_SXFR) != 0)
10293 + *ppr_options &= ~MSG_EXT_PPR_DT_REQ;
10294 + break;
10295 + }
10296 +diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
10297 +index d60564397be5..60c3e2bf8761 100644
10298 +--- a/drivers/scsi/iscsi_tcp.c
10299 ++++ b/drivers/scsi/iscsi_tcp.c
10300 +@@ -882,6 +882,10 @@ free_host:
10301 + static void iscsi_sw_tcp_session_destroy(struct iscsi_cls_session *cls_session)
10302 + {
10303 + struct Scsi_Host *shost = iscsi_session_to_shost(cls_session);
10304 ++ struct iscsi_session *session = cls_session->dd_data;
10305 ++
10306 ++ if (WARN_ON_ONCE(session->leadconn))
10307 ++ return;
10308 +
10309 + iscsi_tcp_r2tpool_free(cls_session->dd_data);
10310 + iscsi_session_teardown(cls_session);
10311 +diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
10312 +index e730aabc26d0..65bbca715f57 100644
10313 +--- a/drivers/scsi/qla2xxx/qla_os.c
10314 ++++ b/drivers/scsi/qla2xxx/qla_os.c
10315 +@@ -451,6 +451,12 @@ static int qla25xx_setup_mode(struct scsi_qla_host *vha)
10316 + goto fail;
10317 + }
10318 + if (ql2xmultique_tag) {
10319 ++ ha->wq = alloc_workqueue("qla2xxx_wq", WQ_MEM_RECLAIM, 1);
10320 ++ if (unlikely(!ha->wq)) {
10321 ++ ql_log(ql_log_warn, vha, 0x01e0,
10322 ++ "Failed to alloc workqueue.\n");
10323 ++ goto fail;
10324 ++ }
10325 + /* create a request queue for IO */
10326 + options |= BIT_7;
10327 + req = qla25xx_create_req_que(ha, options, 0, 0, -1,
10328 +@@ -458,9 +464,8 @@ static int qla25xx_setup_mode(struct scsi_qla_host *vha)
10329 + if (!req) {
10330 + ql_log(ql_log_warn, vha, 0x00e0,
10331 + "Failed to create request queue.\n");
10332 +- goto fail;
10333 ++ goto fail2;
10334 + }
10335 +- ha->wq = alloc_workqueue("qla2xxx_wq", WQ_MEM_RECLAIM, 1);
10336 + vha->req = ha->req_q_map[req];
10337 + options |= BIT_1;
10338 + for (ques = 1; ques < ha->max_rsp_queues; ques++) {
10339 +@@ -468,7 +473,7 @@ static int qla25xx_setup_mode(struct scsi_qla_host *vha)
10340 + if (!ret) {
10341 + ql_log(ql_log_warn, vha, 0x00e8,
10342 + "Failed to create response queue.\n");
10343 +- goto fail2;
10344 ++ goto fail3;
10345 + }
10346 + }
10347 + ha->flags.cpu_affinity_enabled = 1;
10348 +@@ -482,11 +487,13 @@ static int qla25xx_setup_mode(struct scsi_qla_host *vha)
10349 + ha->max_rsp_queues, ha->max_req_queues);
10350 + }
10351 + return 0;
10352 +-fail2:
10353 ++
10354 ++fail3:
10355 + qla25xx_delete_queues(vha);
10356 +- destroy_workqueue(ha->wq);
10357 +- ha->wq = NULL;
10358 + vha->req = ha->req_q_map[0];
10359 ++fail2:
10360 ++ destroy_workqueue(ha->wq);
10361 ++ ha->wq = NULL;
10362 + fail:
10363 + ha->mqenable = 0;
10364 + kfree(ha->req_q_map);
10365 +diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
10366 +index ab7bc4e63425..fff9c4d0f7c8 100644
10367 +--- a/drivers/scsi/scsi_transport_iscsi.c
10368 ++++ b/drivers/scsi/scsi_transport_iscsi.c
10369 +@@ -2964,6 +2964,24 @@ iscsi_set_path(struct iscsi_transport *transport, struct iscsi_uevent *ev)
10370 + return err;
10371 + }
10372 +
10373 ++static int iscsi_session_has_conns(int sid)
10374 ++{
10375 ++ struct iscsi_cls_conn *conn;
10376 ++ unsigned long flags;
10377 ++ int found = 0;
10378 ++
10379 ++ spin_lock_irqsave(&connlock, flags);
10380 ++ list_for_each_entry(conn, &connlist, conn_list) {
10381 ++ if (iscsi_conn_get_sid(conn) == sid) {
10382 ++ found = 1;
10383 ++ break;
10384 ++ }
10385 ++ }
10386 ++ spin_unlock_irqrestore(&connlock, flags);
10387 ++
10388 ++ return found;
10389 ++}
10390 ++
10391 + static int
10392 + iscsi_set_iface_params(struct iscsi_transport *transport,
10393 + struct iscsi_uevent *ev, uint32_t len)
10394 +@@ -3538,10 +3556,12 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group)
10395 + break;
10396 + case ISCSI_UEVENT_DESTROY_SESSION:
10397 + session = iscsi_session_lookup(ev->u.d_session.sid);
10398 +- if (session)
10399 +- transport->destroy_session(session);
10400 +- else
10401 ++ if (!session)
10402 + err = -EINVAL;
10403 ++ else if (iscsi_session_has_conns(ev->u.d_session.sid))
10404 ++ err = -EBUSY;
10405 ++ else
10406 ++ transport->destroy_session(session);
10407 + break;
10408 + case ISCSI_UEVENT_UNBIND_SESSION:
10409 + session = iscsi_session_lookup(ev->u.d_session.sid);
10410 +diff --git a/drivers/soc/tegra/fuse/tegra-apbmisc.c b/drivers/soc/tegra/fuse/tegra-apbmisc.c
10411 +index 5b18f6ffa45c..cd61c883c19f 100644
10412 +--- a/drivers/soc/tegra/fuse/tegra-apbmisc.c
10413 ++++ b/drivers/soc/tegra/fuse/tegra-apbmisc.c
10414 +@@ -134,7 +134,7 @@ void __init tegra_init_apbmisc(void)
10415 + apbmisc.flags = IORESOURCE_MEM;
10416 +
10417 + /* strapping options */
10418 +- if (tegra_get_chip_id() == TEGRA124) {
10419 ++ if (of_machine_is_compatible("nvidia,tegra124")) {
10420 + straps.start = 0x7000e864;
10421 + straps.end = 0x7000e867;
10422 + } else {
10423 +diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
10424 +index c6314d1552ea..99fd4f53c856 100644
10425 +--- a/drivers/staging/android/ashmem.c
10426 ++++ b/drivers/staging/android/ashmem.c
10427 +@@ -370,8 +370,23 @@ static inline vm_flags_t calc_vm_may_flags(unsigned long prot)
10428 + _calc_vm_trans(prot, PROT_EXEC, VM_MAYEXEC);
10429 + }
10430 +
10431 ++static int ashmem_vmfile_mmap(struct file *file, struct vm_area_struct *vma)
10432 ++{
10433 ++ /* do not allow to mmap ashmem backing shmem file directly */
10434 ++ return -EPERM;
10435 ++}
10436 ++
10437 ++static unsigned long
10438 ++ashmem_vmfile_get_unmapped_area(struct file *file, unsigned long addr,
10439 ++ unsigned long len, unsigned long pgoff,
10440 ++ unsigned long flags)
10441 ++{
10442 ++ return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
10443 ++}
10444 ++
10445 + static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
10446 + {
10447 ++ static struct file_operations vmfile_fops;
10448 + struct ashmem_area *asma = file->private_data;
10449 + int ret = 0;
10450 +
10451 +@@ -412,6 +427,19 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
10452 + }
10453 + vmfile->f_mode |= FMODE_LSEEK;
10454 + asma->file = vmfile;
10455 ++ /*
10456 ++ * override mmap operation of the vmfile so that it can't be
10457 ++ * remapped which would lead to creation of a new vma with no
10458 ++ * asma permission checks. Have to override get_unmapped_area
10459 ++ * as well to prevent VM_BUG_ON check for f_ops modification.
10460 ++ */
10461 ++ if (!vmfile_fops.mmap) {
10462 ++ vmfile_fops = *vmfile->f_op;
10463 ++ vmfile_fops.mmap = ashmem_vmfile_mmap;
10464 ++ vmfile_fops.get_unmapped_area =
10465 ++ ashmem_vmfile_get_unmapped_area;
10466 ++ }
10467 ++ vmfile->f_op = &vmfile_fops;
10468 + }
10469 + get_file(asma->file);
10470 +
10471 +diff --git a/drivers/staging/greybus/audio_manager.c b/drivers/staging/greybus/audio_manager.c
10472 +index aa6508b44fab..ed7c32542cb3 100644
10473 +--- a/drivers/staging/greybus/audio_manager.c
10474 ++++ b/drivers/staging/greybus/audio_manager.c
10475 +@@ -90,8 +90,8 @@ void gb_audio_manager_remove_all(void)
10476 +
10477 + list_for_each_entry_safe(module, next, &modules_list, list) {
10478 + list_del(&module->list);
10479 +- kobject_put(&module->kobj);
10480 + ida_simple_remove(&module_id, module->id);
10481 ++ kobject_put(&module->kobj);
10482 + }
10483 +
10484 + is_empty = list_empty(&modules_list);
10485 +diff --git a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c
10486 +index c7bf8ab26192..50793c9df1b3 100644
10487 +--- a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c
10488 ++++ b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c
10489 +@@ -2052,7 +2052,7 @@ static int wpa_supplicant_ioctl(struct net_device *dev, struct iw_point *p)
10490 + struct ieee_param *param;
10491 + uint ret = 0;
10492 +
10493 +- if (p->length < sizeof(struct ieee_param) || !p->pointer) {
10494 ++ if (!p->pointer || p->length != sizeof(struct ieee_param)) {
10495 + ret = -EINVAL;
10496 + goto out;
10497 + }
10498 +@@ -2859,7 +2859,7 @@ static int rtw_hostapd_ioctl(struct net_device *dev, struct iw_point *p)
10499 + goto out;
10500 + }
10501 +
10502 +- if (!p->pointer) {
10503 ++ if (!p->pointer || p->length != sizeof(struct ieee_param)) {
10504 + ret = -EINVAL;
10505 + goto out;
10506 + }
10507 +diff --git a/drivers/staging/vt6656/dpc.c b/drivers/staging/vt6656/dpc.c
10508 +index 655f0002f880..7b73fa2f8834 100644
10509 +--- a/drivers/staging/vt6656/dpc.c
10510 ++++ b/drivers/staging/vt6656/dpc.c
10511 +@@ -140,7 +140,7 @@ int vnt_rx_data(struct vnt_private *priv, struct vnt_rcb *ptr_rcb,
10512 +
10513 + vnt_rf_rssi_to_dbm(priv, *rssi, &rx_dbm);
10514 +
10515 +- priv->bb_pre_ed_rssi = (u8)rx_dbm + 1;
10516 ++ priv->bb_pre_ed_rssi = (u8)-rx_dbm + 1;
10517 + priv->current_rssi = priv->bb_pre_ed_rssi;
10518 +
10519 + frame = skb_data + 8;
10520 +diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
10521 +index 9636d8744347..b6c4f55f79e7 100644
10522 +--- a/drivers/target/iscsi/iscsi_target.c
10523 ++++ b/drivers/target/iscsi/iscsi_target.c
10524 +@@ -1168,9 +1168,7 @@ int iscsit_setup_scsi_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
10525 + hdr->cmdsn, be32_to_cpu(hdr->data_length), payload_length,
10526 + conn->cid);
10527 +
10528 +- if (target_get_sess_cmd(&cmd->se_cmd, true) < 0)
10529 +- return iscsit_add_reject_cmd(cmd,
10530 +- ISCSI_REASON_WAITING_FOR_LOGOUT, buf);
10531 ++ target_get_sess_cmd(&cmd->se_cmd, true);
10532 +
10533 + cmd->sense_reason = transport_lookup_cmd_lun(&cmd->se_cmd,
10534 + scsilun_to_int(&hdr->lun));
10535 +@@ -1988,9 +1986,7 @@ iscsit_handle_task_mgt_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
10536 + conn->sess->se_sess, 0, DMA_NONE,
10537 + TCM_SIMPLE_TAG, cmd->sense_buffer + 2);
10538 +
10539 +- if (target_get_sess_cmd(&cmd->se_cmd, true) < 0)
10540 +- return iscsit_add_reject_cmd(cmd,
10541 +- ISCSI_REASON_WAITING_FOR_LOGOUT, buf);
10542 ++ target_get_sess_cmd(&cmd->se_cmd, true);
10543 +
10544 + /*
10545 + * TASK_REASSIGN for ERL=2 / connection stays inside of
10546 +@@ -4162,6 +4158,9 @@ int iscsit_close_connection(
10547 + iscsit_stop_nopin_response_timer(conn);
10548 + iscsit_stop_nopin_timer(conn);
10549 +
10550 ++ if (conn->conn_transport->iscsit_wait_conn)
10551 ++ conn->conn_transport->iscsit_wait_conn(conn);
10552 ++
10553 + /*
10554 + * During Connection recovery drop unacknowledged out of order
10555 + * commands for this connection, and prepare the other commands
10556 +@@ -4244,11 +4243,6 @@ int iscsit_close_connection(
10557 + * must wait until they have completed.
10558 + */
10559 + iscsit_check_conn_usage_count(conn);
10560 +- target_sess_cmd_list_set_waiting(sess->se_sess);
10561 +- target_wait_for_sess_cmds(sess->se_sess);
10562 +-
10563 +- if (conn->conn_transport->iscsit_wait_conn)
10564 +- conn->conn_transport->iscsit_wait_conn(conn);
10565 +
10566 + ahash_request_free(conn->conn_tx_hash);
10567 + if (conn->conn_rx_hash) {
10568 +diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c
10569 +index 325f9db2da86..4a7eb85f7c85 100644
10570 +--- a/drivers/tty/serial/atmel_serial.c
10571 ++++ b/drivers/tty/serial/atmel_serial.c
10572 +@@ -501,7 +501,8 @@ static void atmel_stop_tx(struct uart_port *port)
10573 + atmel_uart_writel(port, ATMEL_US_IDR, atmel_port->tx_done_mask);
10574 +
10575 + if (atmel_uart_is_half_duplex(port))
10576 +- atmel_start_rx(port);
10577 ++ if (!atomic_read(&atmel_port->tasklet_shutdown))
10578 ++ atmel_start_rx(port);
10579 +
10580 + }
10581 +
10582 +diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
10583 +index e75bd8d7e6f6..325c38c9b451 100644
10584 +--- a/drivers/tty/serial/imx.c
10585 ++++ b/drivers/tty/serial/imx.c
10586 +@@ -532,7 +532,7 @@ static void imx_dma_tx(struct imx_port *sport)
10587 +
10588 + sport->tx_bytes = uart_circ_chars_pending(xmit);
10589 +
10590 +- if (xmit->tail < xmit->head) {
10591 ++ if (xmit->tail < xmit->head || xmit->head == 0) {
10592 + sport->dma_tx_nents = 1;
10593 + sg_init_one(sgl, xmit->buf + xmit->tail, sport->tx_bytes);
10594 + } else {
10595 +diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
10596 +index e645ee1cfd98..7446ce29f677 100644
10597 +--- a/drivers/tty/synclink_gt.c
10598 ++++ b/drivers/tty/synclink_gt.c
10599 +@@ -1349,10 +1349,10 @@ static void throttle(struct tty_struct * tty)
10600 + DBGINFO(("%s throttle\n", info->device_name));
10601 + if (I_IXOFF(tty))
10602 + send_xchar(tty, STOP_CHAR(tty));
10603 +- if (C_CRTSCTS(tty)) {
10604 ++ if (C_CRTSCTS(tty)) {
10605 + spin_lock_irqsave(&info->lock,flags);
10606 + info->signals &= ~SerialSignal_RTS;
10607 +- set_signals(info);
10608 ++ set_signals(info);
10609 + spin_unlock_irqrestore(&info->lock,flags);
10610 + }
10611 + }
10612 +@@ -1374,10 +1374,10 @@ static void unthrottle(struct tty_struct * tty)
10613 + else
10614 + send_xchar(tty, START_CHAR(tty));
10615 + }
10616 +- if (C_CRTSCTS(tty)) {
10617 ++ if (C_CRTSCTS(tty)) {
10618 + spin_lock_irqsave(&info->lock,flags);
10619 + info->signals |= SerialSignal_RTS;
10620 +- set_signals(info);
10621 ++ set_signals(info);
10622 + spin_unlock_irqrestore(&info->lock,flags);
10623 + }
10624 + }
10625 +@@ -2576,8 +2576,8 @@ static void change_params(struct slgt_info *info)
10626 + info->read_status_mask = IRQ_RXOVER;
10627 + if (I_INPCK(info->port.tty))
10628 + info->read_status_mask |= MASK_PARITY | MASK_FRAMING;
10629 +- if (I_BRKINT(info->port.tty) || I_PARMRK(info->port.tty))
10630 +- info->read_status_mask |= MASK_BREAK;
10631 ++ if (I_BRKINT(info->port.tty) || I_PARMRK(info->port.tty))
10632 ++ info->read_status_mask |= MASK_BREAK;
10633 + if (I_IGNPAR(info->port.tty))
10634 + info->ignore_status_mask |= MASK_PARITY | MASK_FRAMING;
10635 + if (I_IGNBRK(info->port.tty)) {
10636 +@@ -3208,7 +3208,7 @@ static int tiocmset(struct tty_struct *tty,
10637 + info->signals &= ~SerialSignal_DTR;
10638 +
10639 + spin_lock_irqsave(&info->lock,flags);
10640 +- set_signals(info);
10641 ++ set_signals(info);
10642 + spin_unlock_irqrestore(&info->lock,flags);
10643 + return 0;
10644 + }
10645 +@@ -3219,7 +3219,7 @@ static int carrier_raised(struct tty_port *port)
10646 + struct slgt_info *info = container_of(port, struct slgt_info, port);
10647 +
10648 + spin_lock_irqsave(&info->lock,flags);
10649 +- get_signals(info);
10650 ++ get_signals(info);
10651 + spin_unlock_irqrestore(&info->lock,flags);
10652 + return (info->signals & SerialSignal_DCD) ? 1 : 0;
10653 + }
10654 +@@ -3234,7 +3234,7 @@ static void dtr_rts(struct tty_port *port, int on)
10655 + info->signals |= SerialSignal_RTS | SerialSignal_DTR;
10656 + else
10657 + info->signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
10658 +- set_signals(info);
10659 ++ set_signals(info);
10660 + spin_unlock_irqrestore(&info->lock,flags);
10661 + }
10662 +
10663 +diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
10664 +index dec156586de1..2f6df8d74b4a 100644
10665 +--- a/drivers/tty/synclinkmp.c
10666 ++++ b/drivers/tty/synclinkmp.c
10667 +@@ -1467,10 +1467,10 @@ static void throttle(struct tty_struct * tty)
10668 + if (I_IXOFF(tty))
10669 + send_xchar(tty, STOP_CHAR(tty));
10670 +
10671 +- if (C_CRTSCTS(tty)) {
10672 ++ if (C_CRTSCTS(tty)) {
10673 + spin_lock_irqsave(&info->lock,flags);
10674 + info->serial_signals &= ~SerialSignal_RTS;
10675 +- set_signals(info);
10676 ++ set_signals(info);
10677 + spin_unlock_irqrestore(&info->lock,flags);
10678 + }
10679 + }
10680 +@@ -1496,10 +1496,10 @@ static void unthrottle(struct tty_struct * tty)
10681 + send_xchar(tty, START_CHAR(tty));
10682 + }
10683 +
10684 +- if (C_CRTSCTS(tty)) {
10685 ++ if (C_CRTSCTS(tty)) {
10686 + spin_lock_irqsave(&info->lock,flags);
10687 + info->serial_signals |= SerialSignal_RTS;
10688 +- set_signals(info);
10689 ++ set_signals(info);
10690 + spin_unlock_irqrestore(&info->lock,flags);
10691 + }
10692 + }
10693 +@@ -2485,7 +2485,7 @@ static void isr_io_pin( SLMP_INFO *info, u16 status )
10694 + if (status & SerialSignal_CTS) {
10695 + if ( debug_level >= DEBUG_LEVEL_ISR )
10696 + printk("CTS tx start...");
10697 +- info->port.tty->hw_stopped = 0;
10698 ++ info->port.tty->hw_stopped = 0;
10699 + tx_start(info);
10700 + info->pending_bh |= BH_TRANSMIT;
10701 + return;
10702 +@@ -2494,7 +2494,7 @@ static void isr_io_pin( SLMP_INFO *info, u16 status )
10703 + if (!(status & SerialSignal_CTS)) {
10704 + if ( debug_level >= DEBUG_LEVEL_ISR )
10705 + printk("CTS tx stop...");
10706 +- info->port.tty->hw_stopped = 1;
10707 ++ info->port.tty->hw_stopped = 1;
10708 + tx_stop(info);
10709 + }
10710 + }
10711 +@@ -2821,8 +2821,8 @@ static void change_params(SLMP_INFO *info)
10712 + info->read_status_mask2 = OVRN;
10713 + if (I_INPCK(info->port.tty))
10714 + info->read_status_mask2 |= PE | FRME;
10715 +- if (I_BRKINT(info->port.tty) || I_PARMRK(info->port.tty))
10716 +- info->read_status_mask1 |= BRKD;
10717 ++ if (I_BRKINT(info->port.tty) || I_PARMRK(info->port.tty))
10718 ++ info->read_status_mask1 |= BRKD;
10719 + if (I_IGNPAR(info->port.tty))
10720 + info->ignore_status_mask2 |= PE | FRME;
10721 + if (I_IGNBRK(info->port.tty)) {
10722 +@@ -3192,7 +3192,7 @@ static int tiocmget(struct tty_struct *tty)
10723 + unsigned long flags;
10724 +
10725 + spin_lock_irqsave(&info->lock,flags);
10726 +- get_signals(info);
10727 ++ get_signals(info);
10728 + spin_unlock_irqrestore(&info->lock,flags);
10729 +
10730 + result = ((info->serial_signals & SerialSignal_RTS) ? TIOCM_RTS : 0) |
10731 +@@ -3230,7 +3230,7 @@ static int tiocmset(struct tty_struct *tty,
10732 + info->serial_signals &= ~SerialSignal_DTR;
10733 +
10734 + spin_lock_irqsave(&info->lock,flags);
10735 +- set_signals(info);
10736 ++ set_signals(info);
10737 + spin_unlock_irqrestore(&info->lock,flags);
10738 +
10739 + return 0;
10740 +@@ -3242,7 +3242,7 @@ static int carrier_raised(struct tty_port *port)
10741 + unsigned long flags;
10742 +
10743 + spin_lock_irqsave(&info->lock,flags);
10744 +- get_signals(info);
10745 ++ get_signals(info);
10746 + spin_unlock_irqrestore(&info->lock,flags);
10747 +
10748 + return (info->serial_signals & SerialSignal_DCD) ? 1 : 0;
10749 +@@ -3258,7 +3258,7 @@ static void dtr_rts(struct tty_port *port, int on)
10750 + info->serial_signals |= SerialSignal_RTS | SerialSignal_DTR;
10751 + else
10752 + info->serial_signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
10753 +- set_signals(info);
10754 ++ set_signals(info);
10755 + spin_unlock_irqrestore(&info->lock,flags);
10756 + }
10757 +
10758 +diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c
10759 +index 368ce1803e8f..6ac05021c4a7 100644
10760 +--- a/drivers/tty/vt/selection.c
10761 ++++ b/drivers/tty/vt/selection.c
10762 +@@ -341,6 +341,7 @@ int paste_selection(struct tty_struct *tty)
10763 + unsigned int count;
10764 + struct tty_ldisc *ld;
10765 + DECLARE_WAITQUEUE(wait, current);
10766 ++ int ret = 0;
10767 +
10768 + console_lock();
10769 + poke_blanked_console();
10770 +@@ -354,6 +355,10 @@ int paste_selection(struct tty_struct *tty)
10771 + add_wait_queue(&vc->paste_wait, &wait);
10772 + while (sel_buffer && sel_buffer_lth > pasted) {
10773 + set_current_state(TASK_INTERRUPTIBLE);
10774 ++ if (signal_pending(current)) {
10775 ++ ret = -EINTR;
10776 ++ break;
10777 ++ }
10778 + if (tty_throttled(tty)) {
10779 + schedule();
10780 + continue;
10781 +@@ -369,5 +374,5 @@ int paste_selection(struct tty_struct *tty)
10782 +
10783 + tty_buffer_unlock_exclusive(&vc->port);
10784 + tty_ldisc_deref(ld);
10785 +- return 0;
10786 ++ return ret;
10787 + }
10788 +diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c
10789 +index 638eb9bbd59f..e8efb270dc8f 100644
10790 +--- a/drivers/tty/vt/vt_ioctl.c
10791 ++++ b/drivers/tty/vt/vt_ioctl.c
10792 +@@ -850,58 +850,49 @@ int vt_ioctl(struct tty_struct *tty,
10793 +
10794 + case VT_RESIZEX:
10795 + {
10796 +- struct vt_consize __user *vtconsize = up;
10797 +- ushort ll,cc,vlin,clin,vcol,ccol;
10798 ++ struct vt_consize v;
10799 + if (!perm)
10800 + return -EPERM;
10801 +- if (!access_ok(VERIFY_READ, vtconsize,
10802 +- sizeof(struct vt_consize))) {
10803 +- ret = -EFAULT;
10804 +- break;
10805 +- }
10806 ++ if (copy_from_user(&v, up, sizeof(struct vt_consize)))
10807 ++ return -EFAULT;
10808 + /* FIXME: Should check the copies properly */
10809 +- __get_user(ll, &vtconsize->v_rows);
10810 +- __get_user(cc, &vtconsize->v_cols);
10811 +- __get_user(vlin, &vtconsize->v_vlin);
10812 +- __get_user(clin, &vtconsize->v_clin);
10813 +- __get_user(vcol, &vtconsize->v_vcol);
10814 +- __get_user(ccol, &vtconsize->v_ccol);
10815 +- vlin = vlin ? vlin : vc->vc_scan_lines;
10816 +- if (clin) {
10817 +- if (ll) {
10818 +- if (ll != vlin/clin) {
10819 +- /* Parameters don't add up */
10820 +- ret = -EINVAL;
10821 +- break;
10822 +- }
10823 +- } else
10824 +- ll = vlin/clin;
10825 ++ if (!v.v_vlin)
10826 ++ v.v_vlin = vc->vc_scan_lines;
10827 ++ if (v.v_clin) {
10828 ++ int rows = v.v_vlin/v.v_clin;
10829 ++ if (v.v_rows != rows) {
10830 ++ if (v.v_rows) /* Parameters don't add up */
10831 ++ return -EINVAL;
10832 ++ v.v_rows = rows;
10833 ++ }
10834 + }
10835 +- if (vcol && ccol) {
10836 +- if (cc) {
10837 +- if (cc != vcol/ccol) {
10838 +- ret = -EINVAL;
10839 +- break;
10840 +- }
10841 +- } else
10842 +- cc = vcol/ccol;
10843 ++ if (v.v_vcol && v.v_ccol) {
10844 ++ int cols = v.v_vcol/v.v_ccol;
10845 ++ if (v.v_cols != cols) {
10846 ++ if (v.v_cols)
10847 ++ return -EINVAL;
10848 ++ v.v_cols = cols;
10849 ++ }
10850 + }
10851 +
10852 +- if (clin > 32) {
10853 +- ret = -EINVAL;
10854 +- break;
10855 +- }
10856 +-
10857 ++ if (v.v_clin > 32)
10858 ++ return -EINVAL;
10859 ++
10860 + for (i = 0; i < MAX_NR_CONSOLES; i++) {
10861 ++ struct vc_data *vcp;
10862 ++
10863 + if (!vc_cons[i].d)
10864 + continue;
10865 + console_lock();
10866 +- if (vlin)
10867 +- vc_cons[i].d->vc_scan_lines = vlin;
10868 +- if (clin)
10869 +- vc_cons[i].d->vc_font.height = clin;
10870 +- vc_cons[i].d->vc_resize_user = 1;
10871 +- vc_resize(vc_cons[i].d, cc, ll);
10872 ++ vcp = vc_cons[i].d;
10873 ++ if (vcp) {
10874 ++ if (v.v_vlin)
10875 ++ vcp->vc_scan_lines = v.v_vlin;
10876 ++ if (v.v_clin)
10877 ++ vcp->vc_font.height = v.v_clin;
10878 ++ vcp->vc_resize_user = 1;
10879 ++ vc_resize(vcp, v.v_cols, v.v_rows);
10880 ++ }
10881 + console_unlock();
10882 + }
10883 + break;
10884 +diff --git a/drivers/uio/uio_dmem_genirq.c b/drivers/uio/uio_dmem_genirq.c
10885 +index e1134a4d97f3..a00b4aee6c79 100644
10886 +--- a/drivers/uio/uio_dmem_genirq.c
10887 ++++ b/drivers/uio/uio_dmem_genirq.c
10888 +@@ -135,11 +135,13 @@ static int uio_dmem_genirq_irqcontrol(struct uio_info *dev_info, s32 irq_on)
10889 + if (irq_on) {
10890 + if (test_and_clear_bit(0, &priv->flags))
10891 + enable_irq(dev_info->irq);
10892 ++ spin_unlock_irqrestore(&priv->lock, flags);
10893 + } else {
10894 +- if (!test_and_set_bit(0, &priv->flags))
10895 ++ if (!test_and_set_bit(0, &priv->flags)) {
10896 ++ spin_unlock_irqrestore(&priv->lock, flags);
10897 + disable_irq(dev_info->irq);
10898 ++ }
10899 + }
10900 +- spin_unlock_irqrestore(&priv->lock, flags);
10901 +
10902 + return 0;
10903 + }
10904 +diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
10905 +index 9f05f9a81f69..3fcc3e74ae2e 100644
10906 +--- a/drivers/usb/core/hub.c
10907 ++++ b/drivers/usb/core/hub.c
10908 +@@ -1187,11 +1187,6 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type)
10909 + #ifdef CONFIG_PM
10910 + udev->reset_resume = 1;
10911 + #endif
10912 +- /* Don't set the change_bits when the device
10913 +- * was powered off.
10914 +- */
10915 +- if (test_bit(port1, hub->power_bits))
10916 +- set_bit(port1, hub->change_bits);
10917 +
10918 + } else {
10919 + /* The power session is gone; tell hub_wq */
10920 +diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
10921 +index 19e819aa2419..ad8307140df8 100644
10922 +--- a/drivers/usb/core/quirks.c
10923 ++++ b/drivers/usb/core/quirks.c
10924 +@@ -291,6 +291,9 @@ static const struct usb_device_id usb_quirk_list[] = {
10925 + /* INTEL VALUE SSD */
10926 + { USB_DEVICE(0x8086, 0xf1a5), .driver_info = USB_QUIRK_RESET_RESUME },
10927 +
10928 ++ /* novation SoundControl XL */
10929 ++ { USB_DEVICE(0x1235, 0x0061), .driver_info = USB_QUIRK_RESET_RESUME },
10930 ++
10931 + { } /* terminating entry must be last */
10932 + };
10933 +
10934 +diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
10935 +index 854c4ec0af2c..4d7df2f6caf5 100644
10936 +--- a/drivers/usb/gadget/composite.c
10937 ++++ b/drivers/usb/gadget/composite.c
10938 +@@ -437,12 +437,10 @@ static u8 encode_bMaxPower(enum usb_device_speed speed,
10939 + val = CONFIG_USB_GADGET_VBUS_DRAW;
10940 + if (!val)
10941 + return 0;
10942 +- switch (speed) {
10943 +- case USB_SPEED_SUPER:
10944 +- return DIV_ROUND_UP(val, 8);
10945 +- default:
10946 ++ if (speed < USB_SPEED_SUPER)
10947 + return DIV_ROUND_UP(val, 2);
10948 +- }
10949 ++ else
10950 ++ return DIV_ROUND_UP(val, 8);
10951 + }
10952 +
10953 + static int config_buf(struct usb_configuration *config,
10954 +diff --git a/drivers/usb/gadget/udc/gr_udc.c b/drivers/usb/gadget/udc/gr_udc.c
10955 +index 39b7136d31d9..9e246d2e55ca 100644
10956 +--- a/drivers/usb/gadget/udc/gr_udc.c
10957 ++++ b/drivers/usb/gadget/udc/gr_udc.c
10958 +@@ -2200,8 +2200,6 @@ static int gr_probe(struct platform_device *pdev)
10959 + return -ENOMEM;
10960 + }
10961 +
10962 +- spin_lock(&dev->lock);
10963 +-
10964 + /* Inside lock so that no gadget can use this udc until probe is done */
10965 + retval = usb_add_gadget_udc(dev->dev, &dev->gadget);
10966 + if (retval) {
10967 +@@ -2210,15 +2208,21 @@ static int gr_probe(struct platform_device *pdev)
10968 + }
10969 + dev->added = 1;
10970 +
10971 ++ spin_lock(&dev->lock);
10972 ++
10973 + retval = gr_udc_init(dev);
10974 +- if (retval)
10975 ++ if (retval) {
10976 ++ spin_unlock(&dev->lock);
10977 + goto out;
10978 +-
10979 +- gr_dfs_create(dev);
10980 ++ }
10981 +
10982 + /* Clear all interrupt enables that might be left on since last boot */
10983 + gr_disable_interrupts_and_pullup(dev);
10984 +
10985 ++ spin_unlock(&dev->lock);
10986 ++
10987 ++ gr_dfs_create(dev);
10988 ++
10989 + retval = gr_request_irq(dev, dev->irq);
10990 + if (retval) {
10991 + dev_err(dev->dev, "Failed to request irq %d\n", dev->irq);
10992 +@@ -2247,8 +2251,6 @@ static int gr_probe(struct platform_device *pdev)
10993 + dev_info(dev->dev, "regs: %p, irq %d\n", dev->regs, dev->irq);
10994 +
10995 + out:
10996 +- spin_unlock(&dev->lock);
10997 +-
10998 + if (retval)
10999 + gr_remove(pdev);
11000 +
11001 +diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
11002 +index aad64a26a767..3cca60b845a8 100644
11003 +--- a/drivers/usb/host/xhci-mem.c
11004 ++++ b/drivers/usb/host/xhci-mem.c
11005 +@@ -1532,9 +1532,15 @@ int xhci_endpoint_init(struct xhci_hcd *xhci,
11006 + /* Allow 3 retries for everything but isoc, set CErr = 3 */
11007 + if (!usb_endpoint_xfer_isoc(&ep->desc))
11008 + err_count = 3;
11009 +- /* Some devices get this wrong */
11010 +- if (usb_endpoint_xfer_bulk(&ep->desc) && udev->speed == USB_SPEED_HIGH)
11011 +- max_packet = 512;
11012 ++ /* HS bulk max packet should be 512, FS bulk supports 8, 16, 32 or 64 */
11013 ++ if (usb_endpoint_xfer_bulk(&ep->desc)) {
11014 ++ if (udev->speed == USB_SPEED_HIGH)
11015 ++ max_packet = 512;
11016 ++ if (udev->speed == USB_SPEED_FULL) {
11017 ++ max_packet = rounddown_pow_of_two(max_packet);
11018 ++ max_packet = clamp_val(max_packet, 8, 64);
11019 ++ }
11020 ++ }
11021 + /* xHCI 1.0 and 1.1 indicates that ctrl ep avg TRB Length should be 8 */
11022 + if (usb_endpoint_xfer_control(&ep->desc) && xhci->hci_version >= 0x100)
11023 + avg_trb_len = 8;
11024 +diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
11025 +index aec6b20262e9..4355fbc36fce 100644
11026 +--- a/drivers/usb/host/xhci-pci.c
11027 ++++ b/drivers/usb/host/xhci-pci.c
11028 +@@ -53,6 +53,7 @@
11029 + #define PCI_DEVICE_ID_INTEL_BROXTON_B_XHCI 0x1aa8
11030 + #define PCI_DEVICE_ID_INTEL_APL_XHCI 0x5aa8
11031 + #define PCI_DEVICE_ID_INTEL_DNV_XHCI 0x19d0
11032 ++#define PCI_DEVICE_ID_INTEL_CML_XHCI 0xa3af
11033 +
11034 + #define PCI_DEVICE_ID_ASMEDIA_1042A_XHCI 0x1142
11035 +
11036 +@@ -170,7 +171,8 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
11037 + pdev->device == PCI_DEVICE_ID_INTEL_BROXTON_M_XHCI ||
11038 + pdev->device == PCI_DEVICE_ID_INTEL_BROXTON_B_XHCI ||
11039 + pdev->device == PCI_DEVICE_ID_INTEL_APL_XHCI ||
11040 +- pdev->device == PCI_DEVICE_ID_INTEL_DNV_XHCI)) {
11041 ++ pdev->device == PCI_DEVICE_ID_INTEL_DNV_XHCI ||
11042 ++ pdev->device == PCI_DEVICE_ID_INTEL_CML_XHCI)) {
11043 + xhci->quirks |= XHCI_PME_STUCK_QUIRK;
11044 + }
11045 + if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
11046 +diff --git a/drivers/usb/musb/omap2430.c b/drivers/usb/musb/omap2430.c
11047 +index e8be8e39ab8f..457ad33f4caa 100644
11048 +--- a/drivers/usb/musb/omap2430.c
11049 ++++ b/drivers/usb/musb/omap2430.c
11050 +@@ -388,8 +388,6 @@ static const struct musb_platform_ops omap2430_ops = {
11051 + .init = omap2430_musb_init,
11052 + .exit = omap2430_musb_exit,
11053 +
11054 +- .set_vbus = omap2430_musb_set_vbus,
11055 +-
11056 + .enable = omap2430_musb_enable,
11057 + .disable = omap2430_musb_disable,
11058 +
11059 +diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c
11060 +index a6999042e7ad..d022b5ff4cd0 100644
11061 +--- a/drivers/usb/storage/uas.c
11062 ++++ b/drivers/usb/storage/uas.c
11063 +@@ -46,6 +46,7 @@ struct uas_dev_info {
11064 + struct scsi_cmnd *cmnd[MAX_CMNDS];
11065 + spinlock_t lock;
11066 + struct work_struct work;
11067 ++ struct work_struct scan_work; /* for async scanning */
11068 + };
11069 +
11070 + enum {
11071 +@@ -115,6 +116,17 @@ out:
11072 + spin_unlock_irqrestore(&devinfo->lock, flags);
11073 + }
11074 +
11075 ++static void uas_scan_work(struct work_struct *work)
11076 ++{
11077 ++ struct uas_dev_info *devinfo =
11078 ++ container_of(work, struct uas_dev_info, scan_work);
11079 ++ struct Scsi_Host *shost = usb_get_intfdata(devinfo->intf);
11080 ++
11081 ++ dev_dbg(&devinfo->intf->dev, "starting scan\n");
11082 ++ scsi_scan_host(shost);
11083 ++ dev_dbg(&devinfo->intf->dev, "scan complete\n");
11084 ++}
11085 ++
11086 + static void uas_add_work(struct uas_cmd_info *cmdinfo)
11087 + {
11088 + struct scsi_pointer *scp = (void *)cmdinfo;
11089 +@@ -989,6 +1001,7 @@ static int uas_probe(struct usb_interface *intf, const struct usb_device_id *id)
11090 + init_usb_anchor(&devinfo->data_urbs);
11091 + spin_lock_init(&devinfo->lock);
11092 + INIT_WORK(&devinfo->work, uas_do_work);
11093 ++ INIT_WORK(&devinfo->scan_work, uas_scan_work);
11094 +
11095 + result = uas_configure_endpoints(devinfo);
11096 + if (result)
11097 +@@ -1005,7 +1018,9 @@ static int uas_probe(struct usb_interface *intf, const struct usb_device_id *id)
11098 + if (result)
11099 + goto free_streams;
11100 +
11101 +- scsi_scan_host(shost);
11102 ++ /* Submit the delayed_work for SCSI-device scanning */
11103 ++ schedule_work(&devinfo->scan_work);
11104 ++
11105 + return result;
11106 +
11107 + free_streams:
11108 +@@ -1173,6 +1188,12 @@ static void uas_disconnect(struct usb_interface *intf)
11109 + usb_kill_anchored_urbs(&devinfo->data_urbs);
11110 + uas_zap_pending(devinfo, DID_NO_CONNECT);
11111 +
11112 ++ /*
11113 ++ * Prevent SCSI scanning (if it hasn't started yet)
11114 ++ * or wait for the SCSI-scanning routine to stop.
11115 ++ */
11116 ++ cancel_work_sync(&devinfo->scan_work);
11117 ++
11118 + scsi_remove_host(shost);
11119 + uas_free_streams(devinfo);
11120 + scsi_host_put(shost);
11121 +diff --git a/drivers/video/fbdev/pxa168fb.c b/drivers/video/fbdev/pxa168fb.c
11122 +index d059d04c63ac..20195d3dbf08 100644
11123 +--- a/drivers/video/fbdev/pxa168fb.c
11124 ++++ b/drivers/video/fbdev/pxa168fb.c
11125 +@@ -769,8 +769,8 @@ failed_free_cmap:
11126 + failed_free_clk:
11127 + clk_disable_unprepare(fbi->clk);
11128 + failed_free_fbmem:
11129 +- dma_free_coherent(fbi->dev, info->fix.smem_len,
11130 +- info->screen_base, fbi->fb_start_dma);
11131 ++ dma_free_wc(fbi->dev, info->fix.smem_len,
11132 ++ info->screen_base, fbi->fb_start_dma);
11133 + failed_free_info:
11134 + kfree(info);
11135 +
11136 +@@ -804,7 +804,7 @@ static int pxa168fb_remove(struct platform_device *pdev)
11137 +
11138 + irq = platform_get_irq(pdev, 0);
11139 +
11140 +- dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
11141 ++ dma_free_wc(fbi->dev, info->fix.smem_len,
11142 + info->screen_base, info->fix.smem_start);
11143 +
11144 + clk_disable_unprepare(fbi->clk);
11145 +diff --git a/drivers/vme/bridges/vme_fake.c b/drivers/vme/bridges/vme_fake.c
11146 +index 30b3acc93833..e81ec763b555 100644
11147 +--- a/drivers/vme/bridges/vme_fake.c
11148 ++++ b/drivers/vme/bridges/vme_fake.c
11149 +@@ -418,8 +418,9 @@ static void fake_lm_check(struct fake_driver *bridge, unsigned long long addr,
11150 + }
11151 + }
11152 +
11153 +-static u8 fake_vmeread8(struct fake_driver *bridge, unsigned long long addr,
11154 +- u32 aspace, u32 cycle)
11155 ++static noinline_for_stack u8 fake_vmeread8(struct fake_driver *bridge,
11156 ++ unsigned long long addr,
11157 ++ u32 aspace, u32 cycle)
11158 + {
11159 + u8 retval = 0xff;
11160 + int i;
11161 +@@ -450,8 +451,9 @@ static u8 fake_vmeread8(struct fake_driver *bridge, unsigned long long addr,
11162 + return retval;
11163 + }
11164 +
11165 +-static u16 fake_vmeread16(struct fake_driver *bridge, unsigned long long addr,
11166 +- u32 aspace, u32 cycle)
11167 ++static noinline_for_stack u16 fake_vmeread16(struct fake_driver *bridge,
11168 ++ unsigned long long addr,
11169 ++ u32 aspace, u32 cycle)
11170 + {
11171 + u16 retval = 0xffff;
11172 + int i;
11173 +@@ -482,8 +484,9 @@ static u16 fake_vmeread16(struct fake_driver *bridge, unsigned long long addr,
11174 + return retval;
11175 + }
11176 +
11177 +-static u32 fake_vmeread32(struct fake_driver *bridge, unsigned long long addr,
11178 +- u32 aspace, u32 cycle)
11179 ++static noinline_for_stack u32 fake_vmeread32(struct fake_driver *bridge,
11180 ++ unsigned long long addr,
11181 ++ u32 aspace, u32 cycle)
11182 + {
11183 + u32 retval = 0xffffffff;
11184 + int i;
11185 +@@ -613,8 +616,9 @@ out:
11186 + return retval;
11187 + }
11188 +
11189 +-static void fake_vmewrite8(struct fake_driver *bridge, u8 *buf,
11190 +- unsigned long long addr, u32 aspace, u32 cycle)
11191 ++static noinline_for_stack void fake_vmewrite8(struct fake_driver *bridge,
11192 ++ u8 *buf, unsigned long long addr,
11193 ++ u32 aspace, u32 cycle)
11194 + {
11195 + int i;
11196 + unsigned long long start, end, offset;
11197 +@@ -643,8 +647,9 @@ static void fake_vmewrite8(struct fake_driver *bridge, u8 *buf,
11198 +
11199 + }
11200 +
11201 +-static void fake_vmewrite16(struct fake_driver *bridge, u16 *buf,
11202 +- unsigned long long addr, u32 aspace, u32 cycle)
11203 ++static noinline_for_stack void fake_vmewrite16(struct fake_driver *bridge,
11204 ++ u16 *buf, unsigned long long addr,
11205 ++ u32 aspace, u32 cycle)
11206 + {
11207 + int i;
11208 + unsigned long long start, end, offset;
11209 +@@ -673,8 +678,9 @@ static void fake_vmewrite16(struct fake_driver *bridge, u16 *buf,
11210 +
11211 + }
11212 +
11213 +-static void fake_vmewrite32(struct fake_driver *bridge, u32 *buf,
11214 +- unsigned long long addr, u32 aspace, u32 cycle)
11215 ++static noinline_for_stack void fake_vmewrite32(struct fake_driver *bridge,
11216 ++ u32 *buf, unsigned long long addr,
11217 ++ u32 aspace, u32 cycle)
11218 + {
11219 + int i;
11220 + unsigned long long start, end, offset;
11221 +diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c
11222 +index 08cb419eb4e6..5f6b77ea34fb 100644
11223 +--- a/drivers/xen/preempt.c
11224 ++++ b/drivers/xen/preempt.c
11225 +@@ -37,7 +37,9 @@ asmlinkage __visible void xen_maybe_preempt_hcall(void)
11226 + * cpu.
11227 + */
11228 + __this_cpu_write(xen_in_preemptible_hcall, false);
11229 +- _cond_resched();
11230 ++ local_irq_enable();
11231 ++ cond_resched();
11232 ++ local_irq_disable();
11233 + __this_cpu_write(xen_in_preemptible_hcall, true);
11234 + }
11235 + }
11236 +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
11237 +index e3524ecce3d7..390053557d4d 100644
11238 +--- a/fs/btrfs/disk-io.c
11239 ++++ b/fs/btrfs/disk-io.c
11240 +@@ -2979,6 +2979,7 @@ retry_root_backup:
11241 + /* do not make disk changes in broken FS or nologreplay is given */
11242 + if (btrfs_super_log_root(disk_super) != 0 &&
11243 + !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) {
11244 ++ btrfs_info(fs_info, "start tree-log replay");
11245 + ret = btrfs_replay_log(fs_info, fs_devices);
11246 + if (ret) {
11247 + err = ret;
11248 +diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
11249 +index 26f9ac719d20..4f59b4089eb0 100644
11250 +--- a/fs/btrfs/extent_map.c
11251 ++++ b/fs/btrfs/extent_map.c
11252 +@@ -227,6 +227,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
11253 + struct extent_map *merge = NULL;
11254 + struct rb_node *rb;
11255 +
11256 ++ /*
11257 ++ * We can't modify an extent map that is in the tree and that is being
11258 ++ * used by another task, as it can cause that other task to see it in
11259 ++ * an inconsistent state during the merging. We always have 1 reference for
11260 ++ * the tree and 1 for this task (which is unpinning the extent map or
11261 ++ * clearing the logging flag), so anything > 2 means it's being used by
11262 ++ * other tasks too.
11263 ++ */
11264 ++ if (atomic_read(&em->refs) > 2)
11265 ++ return;
11266 ++
11267 + if (em->start != 0) {
11268 + rb = rb_prev(&em->rb_node);
11269 + if (rb)
11270 +diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
11271 +index b2d1e95de7be..7dc2284017fa 100644
11272 +--- a/fs/btrfs/ordered-data.c
11273 ++++ b/fs/btrfs/ordered-data.c
11274 +@@ -837,10 +837,15 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
11275 + }
11276 + btrfs_start_ordered_extent(inode, ordered, 1);
11277 + end = ordered->file_offset;
11278 ++ /*
11279 ++ * If the ordered extent had an error, save the error but don't
11280 ++ * exit without waiting first for all other ordered extents in
11281 ++ * the range to complete.
11282 ++ */
11283 + if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
11284 + ret = -EIO;
11285 + btrfs_put_ordered_extent(ordered);
11286 +- if (ret || end == 0 || end == start)
11287 ++ if (end == 0 || end == start)
11288 + break;
11289 + end--;
11290 + }
11291 +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
11292 +index 0c71cdd3f98b..9286603a6a98 100644
11293 +--- a/fs/btrfs/super.c
11294 ++++ b/fs/btrfs/super.c
11295 +@@ -1809,6 +1809,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
11296 + }
11297 +
11298 + if (btrfs_super_log_root(fs_info->super_copy) != 0) {
11299 ++ btrfs_warn(fs_info,
11300 ++ "mount required to replay tree-log, cannot remount read-write");
11301 + ret = -EINVAL;
11302 + goto restore;
11303 + }
11304 +diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
11305 +index 751bdde6515d..961fcb40183a 100644
11306 +--- a/fs/cifs/connect.c
11307 ++++ b/fs/cifs/connect.c
11308 +@@ -2927,8 +2927,10 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data)
11309 + {
11310 + struct cifs_sb_info *old = CIFS_SB(sb);
11311 + struct cifs_sb_info *new = mnt_data->cifs_sb;
11312 +- bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
11313 +- bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
11314 ++ bool old_set = (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) &&
11315 ++ old->prepath;
11316 ++ bool new_set = (new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) &&
11317 ++ new->prepath;
11318 +
11319 + if (old_set && new_set && !strcmp(new->prepath, old->prepath))
11320 + return 1;
11321 +diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
11322 +index cb77e7ee2c9f..ff6cf23be8a2 100644
11323 +--- a/fs/ecryptfs/crypto.c
11324 ++++ b/fs/ecryptfs/crypto.c
11325 +@@ -339,8 +339,10 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
11326 + struct extent_crypt_result ecr;
11327 + int rc = 0;
11328 +
11329 +- BUG_ON(!crypt_stat || !crypt_stat->tfm
11330 +- || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
11331 ++ if (!crypt_stat || !crypt_stat->tfm
11332 ++ || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
11333 ++ return -EINVAL;
11334 ++
11335 + if (unlikely(ecryptfs_verbosity > 0)) {
11336 + ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
11337 + crypt_stat->key_size);
11338 +diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
11339 +index fa218cd64f74..3f3ec50bf773 100644
11340 +--- a/fs/ecryptfs/keystore.c
11341 ++++ b/fs/ecryptfs/keystore.c
11342 +@@ -1285,7 +1285,7 @@ parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat,
11343 + printk(KERN_ERR "Enter w/ first byte != 0x%.2x\n",
11344 + ECRYPTFS_TAG_1_PACKET_TYPE);
11345 + rc = -EINVAL;
11346 +- goto out;
11347 ++ goto out_free;
11348 + }
11349 + /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or
11350 + * at end of function upon failure */
11351 +diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
11352 +index 4f457d5c4933..26464f9d9b76 100644
11353 +--- a/fs/ecryptfs/messaging.c
11354 ++++ b/fs/ecryptfs/messaging.c
11355 +@@ -397,6 +397,7 @@ int __init ecryptfs_init_messaging(void)
11356 + * ecryptfs_message_buf_len),
11357 + GFP_KERNEL);
11358 + if (!ecryptfs_msg_ctx_arr) {
11359 ++ kfree(ecryptfs_daemon_hash);
11360 + rc = -ENOMEM;
11361 + printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
11362 + goto out;
11363 +diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
11364 +index 7fb8df7b6a43..6b3a32f75dad 100644
11365 +--- a/fs/ext4/dir.c
11366 ++++ b/fs/ext4/dir.c
11367 +@@ -124,12 +124,14 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
11368 + if (err != ERR_BAD_DX_DIR) {
11369 + return err;
11370 + }
11371 +- /*
11372 +- * We don't set the inode dirty flag since it's not
11373 +- * critical that it get flushed back to the disk.
11374 +- */
11375 +- ext4_clear_inode_flag(file_inode(file),
11376 +- EXT4_INODE_INDEX);
11377 ++ /* Can we just clear INDEX flag to ignore htree information? */
11378 ++ if (!ext4_has_metadata_csum(sb)) {
11379 ++ /*
11380 ++ * We don't set the inode dirty flag since it's not
11381 ++ * critical that it gets flushed back to the disk.
11382 ++ */
11383 ++ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
11384 ++ }
11385 + }
11386 +
11387 + if (ext4_has_inline_data(inode)) {
11388 +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
11389 +index 567a6c7af677..9713d3d41412 100644
11390 +--- a/fs/ext4/ext4.h
11391 ++++ b/fs/ext4/ext4.h
11392 +@@ -1514,8 +1514,11 @@ struct ext4_sb_info {
11393 + struct ratelimit_state s_warning_ratelimit_state;
11394 + struct ratelimit_state s_msg_ratelimit_state;
11395 +
11396 +- /* Barrier between changing inodes' journal flags and writepages ops. */
11397 +- struct percpu_rw_semaphore s_journal_flag_rwsem;
11398 ++ /*
11399 ++ * Barrier between writepages ops and changing any inode's JOURNAL_DATA
11400 ++ * or EXTENTS flag.
11401 ++ */
11402 ++ struct percpu_rw_semaphore s_writepages_rwsem;
11403 +
11404 + /* Encryption support */
11405 + #ifdef CONFIG_EXT4_FS_ENCRYPTION
11406 +@@ -2375,8 +2378,11 @@ int ext4_insert_dentry(struct inode *dir,
11407 + struct ext4_filename *fname);
11408 + static inline void ext4_update_dx_flag(struct inode *inode)
11409 + {
11410 +- if (!ext4_has_feature_dir_index(inode->i_sb))
11411 ++ if (!ext4_has_feature_dir_index(inode->i_sb)) {
11412 ++ /* ext4_iget() should have caught this... */
11413 ++ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
11414 + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
11415 ++ }
11416 + }
11417 + static unsigned char ext4_filetype_table[] = {
11418 + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
11419 +@@ -2848,7 +2854,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
11420 + !inode_is_locked(inode));
11421 + down_write(&EXT4_I(inode)->i_data_sem);
11422 + if (newsize > EXT4_I(inode)->i_disksize)
11423 +- EXT4_I(inode)->i_disksize = newsize;
11424 ++ WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);
11425 + up_write(&EXT4_I(inode)->i_data_sem);
11426 + }
11427 +
11428 +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
11429 +index 8133e6529994..911a49e861d2 100644
11430 +--- a/fs/ext4/inode.c
11431 ++++ b/fs/ext4/inode.c
11432 +@@ -2475,7 +2475,7 @@ update_disksize:
11433 + * truncate are avoided by checking i_size under i_data_sem.
11434 + */
11435 + disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
11436 +- if (disksize > EXT4_I(inode)->i_disksize) {
11437 ++ if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
11438 + int err2;
11439 + loff_t i_size;
11440 +
11441 +@@ -2652,7 +2652,7 @@ static int ext4_writepages(struct address_space *mapping,
11442 + struct blk_plug plug;
11443 + bool give_up_on_write = false;
11444 +
11445 +- percpu_down_read(&sbi->s_journal_flag_rwsem);
11446 ++ percpu_down_read(&sbi->s_writepages_rwsem);
11447 + trace_ext4_writepages(inode, wbc);
11448 +
11449 + if (dax_mapping(mapping)) {
11450 +@@ -2853,7 +2853,7 @@ retry:
11451 + out_writepages:
11452 + trace_ext4_writepages_result(inode, wbc, ret,
11453 + nr_to_write - wbc->nr_to_write);
11454 +- percpu_up_read(&sbi->s_journal_flag_rwsem);
11455 ++ percpu_up_read(&sbi->s_writepages_rwsem);
11456 + return ret;
11457 + }
11458 +
11459 +@@ -4594,6 +4594,18 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
11460 + ret = -EFSCORRUPTED;
11461 + goto bad_inode;
11462 + }
11463 ++ /*
11464 ++ * If dir_index is not enabled but there's a dir with INDEX flag set,
11465 ++ * we'd normally treat htree data as empty space. But with metadata
11466 ++ * checksumming that corrupts checksums, so forbid that.
11467 ++ */
11468 ++ if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
11469 ++ ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
11470 ++ EXT4_ERROR_INODE(inode,
11471 ++ "iget: Dir with htree data on filesystem without dir_index feature.");
11472 ++ ret = -EFSCORRUPTED;
11473 ++ goto bad_inode;
11474 ++ }
11475 + ei->i_disksize = inode->i_size;
11476 + #ifdef CONFIG_QUOTA
11477 + ei->i_reserved_quota = 0;
11478 +@@ -5676,7 +5688,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
11479 + }
11480 + }
11481 +
11482 +- percpu_down_write(&sbi->s_journal_flag_rwsem);
11483 ++ percpu_down_write(&sbi->s_writepages_rwsem);
11484 + jbd2_journal_lock_updates(journal);
11485 +
11486 + /*
11487 +@@ -5693,7 +5705,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
11488 + err = jbd2_journal_flush(journal);
11489 + if (err < 0) {
11490 + jbd2_journal_unlock_updates(journal);
11491 +- percpu_up_write(&sbi->s_journal_flag_rwsem);
11492 ++ percpu_up_write(&sbi->s_writepages_rwsem);
11493 + ext4_inode_resume_unlocked_dio(inode);
11494 + return err;
11495 + }
11496 +@@ -5702,7 +5714,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
11497 + ext4_set_aops(inode);
11498 +
11499 + jbd2_journal_unlock_updates(journal);
11500 +- percpu_up_write(&sbi->s_journal_flag_rwsem);
11501 ++ percpu_up_write(&sbi->s_writepages_rwsem);
11502 +
11503 + if (val)
11504 + up_write(&EXT4_I(inode)->i_mmap_sem);
11505 +diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
11506 +index 364ea4d4a943..bce2d696d6b9 100644
11507 +--- a/fs/ext4/migrate.c
11508 ++++ b/fs/ext4/migrate.c
11509 +@@ -434,6 +434,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
11510 +
11511 + int ext4_ext_migrate(struct inode *inode)
11512 + {
11513 ++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
11514 + handle_t *handle;
11515 + int retval = 0, i;
11516 + __le32 *i_data;
11517 +@@ -458,6 +459,8 @@ int ext4_ext_migrate(struct inode *inode)
11518 + */
11519 + return retval;
11520 +
11521 ++ percpu_down_write(&sbi->s_writepages_rwsem);
11522 ++
11523 + /*
11524 + * Worst case we can touch the allocation bitmaps, a bgd
11525 + * block, and a block to link in the orphan list. We do need
11526 +@@ -468,7 +471,7 @@ int ext4_ext_migrate(struct inode *inode)
11527 +
11528 + if (IS_ERR(handle)) {
11529 + retval = PTR_ERR(handle);
11530 +- return retval;
11531 ++ goto out_unlock;
11532 + }
11533 + goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
11534 + EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
11535 +@@ -479,7 +482,7 @@ int ext4_ext_migrate(struct inode *inode)
11536 + if (IS_ERR(tmp_inode)) {
11537 + retval = PTR_ERR(tmp_inode);
11538 + ext4_journal_stop(handle);
11539 +- return retval;
11540 ++ goto out_unlock;
11541 + }
11542 + i_size_write(tmp_inode, i_size_read(inode));
11543 + /*
11544 +@@ -521,7 +524,7 @@ int ext4_ext_migrate(struct inode *inode)
11545 + */
11546 + ext4_orphan_del(NULL, tmp_inode);
11547 + retval = PTR_ERR(handle);
11548 +- goto out;
11549 ++ goto out_tmp_inode;
11550 + }
11551 +
11552 + ei = EXT4_I(inode);
11553 +@@ -602,10 +605,11 @@ err_out:
11554 + /* Reset the extent details */
11555 + ext4_ext_tree_init(handle, tmp_inode);
11556 + ext4_journal_stop(handle);
11557 +-out:
11558 ++out_tmp_inode:
11559 + unlock_new_inode(tmp_inode);
11560 + iput(tmp_inode);
11561 +-
11562 ++out_unlock:
11563 ++ percpu_up_write(&sbi->s_writepages_rwsem);
11564 + return retval;
11565 + }
11566 +
11567 +@@ -615,7 +619,8 @@ out:
11568 + int ext4_ind_migrate(struct inode *inode)
11569 + {
11570 + struct ext4_extent_header *eh;
11571 +- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
11572 ++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
11573 ++ struct ext4_super_block *es = sbi->s_es;
11574 + struct ext4_inode_info *ei = EXT4_I(inode);
11575 + struct ext4_extent *ex;
11576 + unsigned int i, len;
11577 +@@ -639,9 +644,13 @@ int ext4_ind_migrate(struct inode *inode)
11578 + if (test_opt(inode->i_sb, DELALLOC))
11579 + ext4_alloc_da_blocks(inode);
11580 +
11581 ++ percpu_down_write(&sbi->s_writepages_rwsem);
11582 ++
11583 + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
11584 +- if (IS_ERR(handle))
11585 +- return PTR_ERR(handle);
11586 ++ if (IS_ERR(handle)) {
11587 ++ ret = PTR_ERR(handle);
11588 ++ goto out_unlock;
11589 ++ }
11590 +
11591 + down_write(&EXT4_I(inode)->i_data_sem);
11592 + ret = ext4_ext_check_inode(inode);
11593 +@@ -676,5 +685,7 @@ int ext4_ind_migrate(struct inode *inode)
11594 + errout:
11595 + ext4_journal_stop(handle);
11596 + up_write(&EXT4_I(inode)->i_data_sem);
11597 ++out_unlock:
11598 ++ percpu_up_write(&sbi->s_writepages_rwsem);
11599 + return ret;
11600 + }
11601 +diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
11602 +index c2e830a6206d..fb1ad9510c5f 100644
11603 +--- a/fs/ext4/mmp.c
11604 ++++ b/fs/ext4/mmp.c
11605 +@@ -119,10 +119,10 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
11606 + {
11607 + __ext4_warning(sb, function, line, "%s", msg);
11608 + __ext4_warning(sb, function, line,
11609 +- "MMP failure info: last update time: %llu, last update "
11610 +- "node: %s, last update device: %s",
11611 +- (long long unsigned int) le64_to_cpu(mmp->mmp_time),
11612 +- mmp->mmp_nodename, mmp->mmp_bdevname);
11613 ++ "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s",
11614 ++ (unsigned long long)le64_to_cpu(mmp->mmp_time),
11615 ++ (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename,
11616 ++ (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname);
11617 + }
11618 +
11619 + /*
11620 +@@ -153,6 +153,7 @@ static int kmmpd(void *data)
11621 + mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
11622 + EXT4_MMP_MIN_CHECK_INTERVAL);
11623 + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
11624 ++ BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
11625 + bdevname(bh->b_bdev, mmp->mmp_bdevname);
11626 +
11627 + memcpy(mmp->mmp_nodename, init_utsname()->nodename,
11628 +@@ -377,7 +378,8 @@ skip:
11629 + /*
11630 + * Start a kernel thread to update the MMP block periodically.
11631 + */
11632 +- EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
11633 ++ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s",
11634 ++ (int)sizeof(mmp->mmp_bdevname),
11635 + bdevname(bh->b_bdev,
11636 + mmp->mmp_bdevname));
11637 + if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
11638 +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
11639 +index f0ce535d514c..339ede11896a 100644
11640 +--- a/fs/ext4/namei.c
11641 ++++ b/fs/ext4/namei.c
11642 +@@ -1445,6 +1445,7 @@ restart:
11643 + /*
11644 + * We deal with the read-ahead logic here.
11645 + */
11646 ++ cond_resched();
11647 + if (ra_ptr >= ra_max) {
11648 + /* Refill the readahead buffer */
11649 + ra_ptr = 0;
11650 +@@ -2148,6 +2149,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
11651 + retval = ext4_dx_add_entry(handle, &fname, dir, inode);
11652 + if (!retval || (retval != ERR_BAD_DX_DIR))
11653 + goto out;
11654 ++ /* Can we just ignore htree data? */
11655 ++ if (ext4_has_metadata_csum(sb)) {
11656 ++ EXT4_ERROR_INODE(dir,
11657 ++ "Directory has corrupted htree index.");
11658 ++ retval = -EFSCORRUPTED;
11659 ++ goto out;
11660 ++ }
11661 + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
11662 + dx_fallback++;
11663 + ext4_mark_inode_dirty(handle, dir);
11664 +diff --git a/fs/ext4/super.c b/fs/ext4/super.c
11665 +index 391ab55808c9..b69a78c061cb 100644
11666 +--- a/fs/ext4/super.c
11667 ++++ b/fs/ext4/super.c
11668 +@@ -865,7 +865,7 @@ static void ext4_put_super(struct super_block *sb)
11669 + percpu_counter_destroy(&sbi->s_freeinodes_counter);
11670 + percpu_counter_destroy(&sbi->s_dirs_counter);
11671 + percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
11672 +- percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
11673 ++ percpu_free_rwsem(&sbi->s_writepages_rwsem);
11674 + brelse(sbi->s_sbh);
11675 + #ifdef CONFIG_QUOTA
11676 + for (i = 0; i < EXT4_MAXQUOTAS; i++)
11677 +@@ -2743,17 +2743,11 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
11678 + return 0;
11679 + }
11680 +
11681 +-#ifndef CONFIG_QUOTA
11682 +- if (ext4_has_feature_quota(sb) && !readonly) {
11683 ++#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
11684 ++ if (!readonly && (ext4_has_feature_quota(sb) ||
11685 ++ ext4_has_feature_project(sb))) {
11686 + ext4_msg(sb, KERN_ERR,
11687 +- "Filesystem with quota feature cannot be mounted RDWR "
11688 +- "without CONFIG_QUOTA");
11689 +- return 0;
11690 +- }
11691 +- if (ext4_has_feature_project(sb) && !readonly) {
11692 +- ext4_msg(sb, KERN_ERR,
11693 +- "Filesystem with project quota feature cannot be mounted RDWR "
11694 +- "without CONFIG_QUOTA");
11695 ++ "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
11696 + return 0;
11697 + }
11698 + #endif /* CONFIG_QUOTA */
11699 +@@ -4229,7 +4223,7 @@ no_journal:
11700 + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
11701 + GFP_KERNEL);
11702 + if (!err)
11703 +- err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
11704 ++ err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
11705 +
11706 + if (err) {
11707 + ext4_msg(sb, KERN_ERR, "insufficient memory");
11708 +@@ -4328,7 +4322,7 @@ failed_mount6:
11709 + percpu_counter_destroy(&sbi->s_freeinodes_counter);
11710 + percpu_counter_destroy(&sbi->s_dirs_counter);
11711 + percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
11712 +- percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
11713 ++ percpu_free_rwsem(&sbi->s_writepages_rwsem);
11714 + failed_mount5:
11715 + ext4_ext_release(sb);
11716 + ext4_release_system_zone(sb);
11717 +diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
11718 +index 4d5a5a4cc017..addb0784dd1c 100644
11719 +--- a/fs/jbd2/checkpoint.c
11720 ++++ b/fs/jbd2/checkpoint.c
11721 +@@ -168,7 +168,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
11722 + "journal space in %s\n", __func__,
11723 + journal->j_devname);
11724 + WARN_ON(1);
11725 +- jbd2_journal_abort(journal, 0);
11726 ++ jbd2_journal_abort(journal, -EIO);
11727 + }
11728 + write_lock(&journal->j_state_lock);
11729 + } else {
11730 +diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
11731 +index d002b2b6895f..1d06f81ee8b4 100644
11732 +--- a/fs/jbd2/commit.c
11733 ++++ b/fs/jbd2/commit.c
11734 +@@ -779,7 +779,7 @@ start_journal_io:
11735 + err = journal_submit_commit_record(journal, commit_transaction,
11736 + &cbh, crc32_sum);
11737 + if (err)
11738 +- __jbd2_journal_abort_hard(journal);
11739 ++ jbd2_journal_abort(journal, err);
11740 + }
11741 +
11742 + blk_finish_plug(&plug);
11743 +@@ -872,7 +872,7 @@ start_journal_io:
11744 + err = journal_submit_commit_record(journal, commit_transaction,
11745 + &cbh, crc32_sum);
11746 + if (err)
11747 +- __jbd2_journal_abort_hard(journal);
11748 ++ jbd2_journal_abort(journal, err);
11749 + }
11750 + if (cbh)
11751 + err = journal_wait_on_commit_record(journal, cbh);
11752 +@@ -969,29 +969,33 @@ restart_loop:
11753 + * it. */
11754 +
11755 + /*
11756 +- * A buffer which has been freed while still being journaled by
11757 +- * a previous transaction.
11758 +- */
11759 +- if (buffer_freed(bh)) {
11760 ++ * A buffer which has been freed while still being journaled
11761 ++ * by a previous transaction, refile the buffer to BJ_Forget of
11762 ++ * the running transaction. If the just committed transaction
11763 ++ * contains "add to orphan" operation, we can completely
11764 ++ * invalidate the buffer now. We are rather thorough in that
11765 ++ * since the buffer may be still accessible when blocksize <
11766 ++ * pagesize and it is attached to the last partial page.
11767 ++ */
11768 ++ if (buffer_freed(bh) && !jh->b_next_transaction) {
11769 ++ struct address_space *mapping;
11770 ++
11771 ++ clear_buffer_freed(bh);
11772 ++ clear_buffer_jbddirty(bh);
11773 ++
11774 + /*
11775 +- * If the running transaction is the one containing
11776 +- * "add to orphan" operation (b_next_transaction !=
11777 +- * NULL), we have to wait for that transaction to
11778 +- * commit before we can really get rid of the buffer.
11779 +- * So just clear b_modified to not confuse transaction
11780 +- * credit accounting and refile the buffer to
11781 +- * BJ_Forget of the running transaction. If the just
11782 +- * committed transaction contains "add to orphan"
11783 +- * operation, we can completely invalidate the buffer
11784 +- * now. We are rather through in that since the
11785 +- * buffer may be still accessible when blocksize <
11786 +- * pagesize and it is attached to the last partial
11787 +- * page.
11788 ++ * Block device buffers need to stay mapped all the
11789 ++ * time, so it is enough to clear buffer_jbddirty and
11790 ++ * buffer_freed bits. For the file mapping buffers (i.e.
11791 ++ * journalled data) we need to unmap buffer and clear
11792 ++ * more bits. We also need to be careful about the check
11793 ++ * because the data page mapping can get cleared under
11794 ++ * our hands, in which case we need not clear more bits
11795 ++ * because the page and buffers will be freed and can
11796 ++ * never be reused once we are done with them.
11797 + */
11798 +- jh->b_modified = 0;
11799 +- if (!jh->b_next_transaction) {
11800 +- clear_buffer_freed(bh);
11801 +- clear_buffer_jbddirty(bh);
11802 ++ mapping = READ_ONCE(bh->b_page->mapping);
11803 ++ if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
11804 + clear_buffer_mapped(bh);
11805 + clear_buffer_new(bh);
11806 + clear_buffer_req(bh);
11807 +diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
11808 +index 3cbcf649ac66..efc8cfd06073 100644
11809 +--- a/fs/jbd2/journal.c
11810 ++++ b/fs/jbd2/journal.c
11811 +@@ -1670,6 +1670,11 @@ int jbd2_journal_load(journal_t *journal)
11812 + journal->j_devname);
11813 + return -EFSCORRUPTED;
11814 + }
11815 ++ /*
11816 ++ * clear JBD2_ABORT flag initialized in journal_init_common
11817 ++ * here to update log tail information with the newest seq.
11818 ++ */
11819 ++ journal->j_flags &= ~JBD2_ABORT;
11820 +
11821 + /* OK, we've finished with the dynamic journal bits:
11822 + * reinitialise the dynamic contents of the superblock in memory
11823 +@@ -1677,7 +1682,6 @@ int jbd2_journal_load(journal_t *journal)
11824 + if (journal_reset(journal))
11825 + goto recovery_error;
11826 +
11827 +- journal->j_flags &= ~JBD2_ABORT;
11828 + journal->j_flags |= JBD2_LOADED;
11829 + return 0;
11830 +
11831 +@@ -2096,12 +2100,10 @@ static void __journal_abort_soft (journal_t *journal, int errno)
11832 +
11833 + __jbd2_journal_abort_hard(journal);
11834 +
11835 +- if (errno) {
11836 +- jbd2_journal_update_sb_errno(journal);
11837 +- write_lock(&journal->j_state_lock);
11838 +- journal->j_flags |= JBD2_REC_ERR;
11839 +- write_unlock(&journal->j_state_lock);
11840 +- }
11841 ++ jbd2_journal_update_sb_errno(journal);
11842 ++ write_lock(&journal->j_state_lock);
11843 ++ journal->j_flags |= JBD2_REC_ERR;
11844 ++ write_unlock(&journal->j_state_lock);
11845 + }
11846 +
11847 + /**
11848 +@@ -2143,11 +2145,6 @@ static void __journal_abort_soft (journal_t *journal, int errno)
11849 + * failure to disk. ext3_error, for example, now uses this
11850 + * functionality.
11851 + *
11852 +- * Errors which originate from within the journaling layer will NOT
11853 +- * supply an errno; a null errno implies that absolutely no further
11854 +- * writes are done to the journal (unless there are any already in
11855 +- * progress).
11856 +- *
11857 + */
11858 +
11859 + void jbd2_journal_abort(journal_t *journal, int errno)
11860 +diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
11861 +index 799f96c67211..04dd0652bb5c 100644
11862 +--- a/fs/jbd2/transaction.c
11863 ++++ b/fs/jbd2/transaction.c
11864 +@@ -2213,14 +2213,16 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
11865 + return -EBUSY;
11866 + }
11867 + /*
11868 +- * OK, buffer won't be reachable after truncate. We just set
11869 +- * j_next_transaction to the running transaction (if there is
11870 +- * one) and mark buffer as freed so that commit code knows it
11871 +- * should clear dirty bits when it is done with the buffer.
11872 ++ * OK, buffer won't be reachable after truncate. We just clear
11873 ++ * b_modified to not confuse transaction credit accounting, and
11874 ++ * set j_next_transaction to the running transaction (if there
11875 ++ * is one) and mark buffer as freed so that commit code knows
11876 ++ * it should clear dirty bits when it is done with the buffer.
11877 + */
11878 + set_buffer_freed(bh);
11879 + if (journal->j_running_transaction && buffer_jbddirty(bh))
11880 + jh->b_next_transaction = journal->j_running_transaction;
11881 ++ jh->b_modified = 0;
11882 + jbd2_journal_put_journal_head(jh);
11883 + spin_unlock(&journal->j_list_lock);
11884 + jbd_unlock_bh_state(bh);
11885 +diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
11886 +index 497a4171ef61..bfb50fc51528 100644
11887 +--- a/fs/ocfs2/journal.h
11888 ++++ b/fs/ocfs2/journal.h
11889 +@@ -637,9 +637,11 @@ static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
11890 + {
11891 + struct ocfs2_inode_info *oi = OCFS2_I(inode);
11892 +
11893 +- oi->i_sync_tid = handle->h_transaction->t_tid;
11894 +- if (datasync)
11895 +- oi->i_datasync_tid = handle->h_transaction->t_tid;
11896 ++ if (!is_handle_aborted(handle)) {
11897 ++ oi->i_sync_tid = handle->h_transaction->t_tid;
11898 ++ if (datasync)
11899 ++ oi->i_datasync_tid = handle->h_transaction->t_tid;
11900 ++ }
11901 + }
11902 +
11903 + #endif /* OCFS2_JOURNAL_H */
11904 +diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
11905 +index 0748a26598fc..7d7df003f9d8 100644
11906 +--- a/fs/orangefs/orangefs-debugfs.c
11907 ++++ b/fs/orangefs/orangefs-debugfs.c
11908 +@@ -304,6 +304,7 @@ static void *help_start(struct seq_file *m, loff_t *pos)
11909 +
11910 + static void *help_next(struct seq_file *m, void *v, loff_t *pos)
11911 + {
11912 ++ (*pos)++;
11913 + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_next: start\n");
11914 +
11915 + return NULL;
11916 +diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
11917 +index a97e352d05d3..5f5fff068877 100644
11918 +--- a/fs/reiserfs/stree.c
11919 ++++ b/fs/reiserfs/stree.c
11920 +@@ -2249,7 +2249,8 @@ error_out:
11921 + /* also releases the path */
11922 + unfix_nodes(&s_ins_balance);
11923 + #ifdef REISERQUOTA_DEBUG
11924 +- reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
11925 ++ if (inode)
11926 ++ reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
11927 + "reiserquota insert_item(): freeing %u id=%u type=%c",
11928 + quota_bytes, inode->i_uid, head2type(ih));
11929 + #endif
11930 +diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
11931 +index bfed2a700015..677608a89b08 100644
11932 +--- a/fs/reiserfs/super.c
11933 ++++ b/fs/reiserfs/super.c
11934 +@@ -1928,7 +1928,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
11935 + if (!sbi->s_jdev) {
11936 + SWARN(silent, s, "", "Cannot allocate memory for "
11937 + "journal device name");
11938 +- goto error;
11939 ++ goto error_unlocked;
11940 + }
11941 + }
11942 + #ifdef CONFIG_QUOTA
11943 +diff --git a/fs/udf/super.c b/fs/udf/super.c
11944 +index 03369a89600e..4abdba453885 100644
11945 +--- a/fs/udf/super.c
11946 ++++ b/fs/udf/super.c
11947 +@@ -2460,17 +2460,29 @@ static unsigned int udf_count_free_table(struct super_block *sb,
11948 + static unsigned int udf_count_free(struct super_block *sb)
11949 + {
11950 + unsigned int accum = 0;
11951 +- struct udf_sb_info *sbi;
11952 ++ struct udf_sb_info *sbi = UDF_SB(sb);
11953 + struct udf_part_map *map;
11954 ++ unsigned int part = sbi->s_partition;
11955 ++ int ptype = sbi->s_partmaps[part].s_partition_type;
11956 ++
11957 ++ if (ptype == UDF_METADATA_MAP25) {
11958 ++ part = sbi->s_partmaps[part].s_type_specific.s_metadata.
11959 ++ s_phys_partition_ref;
11960 ++ } else if (ptype == UDF_VIRTUAL_MAP15 || ptype == UDF_VIRTUAL_MAP20) {
11961 ++ /*
11962 ++ * Filesystems with VAT are append-only and we cannot write to
11963 ++ * them. Let's just report 0 here.
11964 ++ */
11965 ++ return 0;
11966 ++ }
11967 +
11968 +- sbi = UDF_SB(sb);
11969 + if (sbi->s_lvid_bh) {
11970 + struct logicalVolIntegrityDesc *lvid =
11971 + (struct logicalVolIntegrityDesc *)
11972 + sbi->s_lvid_bh->b_data;
11973 +- if (le32_to_cpu(lvid->numOfPartitions) > sbi->s_partition) {
11974 ++ if (le32_to_cpu(lvid->numOfPartitions) > part) {
11975 + accum = le32_to_cpu(
11976 +- lvid->freeSpaceTable[sbi->s_partition]);
11977 ++ lvid->freeSpaceTable[part]);
11978 + if (accum == 0xFFFFFFFF)
11979 + accum = 0;
11980 + }
11981 +@@ -2479,7 +2491,7 @@ static unsigned int udf_count_free(struct super_block *sb)
11982 + if (accum)
11983 + return accum;
11984 +
11985 +- map = &sbi->s_partmaps[sbi->s_partition];
11986 ++ map = &sbi->s_partmaps[part];
11987 + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
11988 + accum += udf_count_free_bitmap(sb,
11989 + map->s_uspace.s_bitmap);
11990 +diff --git a/include/linux/libata.h b/include/linux/libata.h
11991 +index df58b01e6962..cdfb67b22317 100644
11992 +--- a/include/linux/libata.h
11993 ++++ b/include/linux/libata.h
11994 +@@ -1222,6 +1222,7 @@ struct pci_bits {
11995 + };
11996 +
11997 + extern int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits);
11998 ++extern void ata_pci_shutdown_one(struct pci_dev *pdev);
11999 + extern void ata_pci_remove_one(struct pci_dev *pdev);
12000 +
12001 + #ifdef CONFIG_PM
12002 +diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
12003 +index 87ff4f58a2f0..9e20bf7f46a2 100644
12004 +--- a/include/linux/list_nulls.h
12005 ++++ b/include/linux/list_nulls.h
12006 +@@ -71,10 +71,10 @@ static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
12007 + struct hlist_nulls_node *first = h->first;
12008 +
12009 + n->next = first;
12010 +- n->pprev = &h->first;
12011 ++ WRITE_ONCE(n->pprev, &h->first);
12012 + h->first = n;
12013 + if (!is_a_nulls(first))
12014 +- first->pprev = &n->next;
12015 ++ WRITE_ONCE(first->pprev, &n->next);
12016 + }
12017 +
12018 + static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
12019 +@@ -84,13 +84,13 @@ static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
12020 +
12021 + WRITE_ONCE(*pprev, next);
12022 + if (!is_a_nulls(next))
12023 +- next->pprev = pprev;
12024 ++ WRITE_ONCE(next->pprev, pprev);
12025 + }
12026 +
12027 + static inline void hlist_nulls_del(struct hlist_nulls_node *n)
12028 + {
12029 + __hlist_nulls_del(n);
12030 +- n->pprev = LIST_POISON2;
12031 ++ WRITE_ONCE(n->pprev, LIST_POISON2);
12032 + }
12033 +
12034 + /**
12035 +diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
12036 +index 106f4e0d7bd3..4d71e3687d1e 100644
12037 +--- a/include/linux/rculist_nulls.h
12038 ++++ b/include/linux/rculist_nulls.h
12039 +@@ -33,7 +33,7 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
12040 + {
12041 + if (!hlist_nulls_unhashed(n)) {
12042 + __hlist_nulls_del(n);
12043 +- n->pprev = NULL;
12044 ++ WRITE_ONCE(n->pprev, NULL);
12045 + }
12046 + }
12047 +
12048 +@@ -65,7 +65,7 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
12049 + static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
12050 + {
12051 + __hlist_nulls_del(n);
12052 +- n->pprev = LIST_POISON2;
12053 ++ WRITE_ONCE(n->pprev, LIST_POISON2);
12054 + }
12055 +
12056 + /**
12057 +@@ -93,10 +93,10 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
12058 + struct hlist_nulls_node *first = h->first;
12059 +
12060 + n->next = first;
12061 +- n->pprev = &h->first;
12062 ++ WRITE_ONCE(n->pprev, &h->first);
12063 + rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
12064 + if (!is_a_nulls(first))
12065 +- first->pprev = &n->next;
12066 ++ WRITE_ONCE(first->pprev, &n->next);
12067 + }
12068 +
12069 + /**
12070 +diff --git a/include/media/v4l2-device.h b/include/media/v4l2-device.h
12071 +index 8ffa94009d1a..76002416cead 100644
12072 +--- a/include/media/v4l2-device.h
12073 ++++ b/include/media/v4l2-device.h
12074 +@@ -268,7 +268,7 @@ static inline void v4l2_subdev_notify(struct v4l2_subdev *sd,
12075 + struct v4l2_subdev *__sd; \
12076 + \
12077 + __v4l2_device_call_subdevs_p(v4l2_dev, __sd, \
12078 +- !(grpid) || __sd->grp_id == (grpid), o, f , \
12079 ++ (grpid) == 0 || __sd->grp_id == (grpid), o, f , \
12080 + ##args); \
12081 + } while (0)
12082 +
12083 +@@ -280,7 +280,7 @@ static inline void v4l2_subdev_notify(struct v4l2_subdev *sd,
12084 + ({ \
12085 + struct v4l2_subdev *__sd; \
12086 + __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, \
12087 +- !(grpid) || __sd->grp_id == (grpid), o, f , \
12088 ++ (grpid) == 0 || __sd->grp_id == (grpid), o, f , \
12089 + ##args); \
12090 + })
12091 +
12092 +@@ -294,8 +294,8 @@ static inline void v4l2_subdev_notify(struct v4l2_subdev *sd,
12093 + struct v4l2_subdev *__sd; \
12094 + \
12095 + __v4l2_device_call_subdevs_p(v4l2_dev, __sd, \
12096 +- !(grpmsk) || (__sd->grp_id & (grpmsk)), o, f , \
12097 +- ##args); \
12098 ++ (grpmsk) == 0 || (__sd->grp_id & (grpmsk)), o, \
12099 ++ f , ##args); \
12100 + } while (0)
12101 +
12102 + /*
12103 +@@ -308,8 +308,8 @@ static inline void v4l2_subdev_notify(struct v4l2_subdev *sd,
12104 + ({ \
12105 + struct v4l2_subdev *__sd; \
12106 + __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, \
12107 +- !(grpmsk) || (__sd->grp_id & (grpmsk)), o, f , \
12108 +- ##args); \
12109 ++ (grpmsk) == 0 || (__sd->grp_id & (grpmsk)), o, \
12110 ++ f , ##args); \
12111 + })
12112 +
12113 + /*
12114 +diff --git a/include/scsi/iscsi_proto.h b/include/scsi/iscsi_proto.h
12115 +index 1a2ae0862e23..c1260d80ef30 100644
12116 +--- a/include/scsi/iscsi_proto.h
12117 ++++ b/include/scsi/iscsi_proto.h
12118 +@@ -638,7 +638,6 @@ struct iscsi_reject {
12119 + #define ISCSI_REASON_BOOKMARK_INVALID 9
12120 + #define ISCSI_REASON_BOOKMARK_NO_RESOURCES 10
12121 + #define ISCSI_REASON_NEGOTIATION_RESET 11
12122 +-#define ISCSI_REASON_WAITING_FOR_LOGOUT 12
12123 +
12124 + /* Max. number of Key=Value pairs in a text message */
12125 + #define MAX_KEY_VALUE_PAIRS 8192
12126 +diff --git a/include/sound/rawmidi.h b/include/sound/rawmidi.h
12127 +index f730b91e472f..5432111c8761 100644
12128 +--- a/include/sound/rawmidi.h
12129 ++++ b/include/sound/rawmidi.h
12130 +@@ -92,9 +92,9 @@ struct snd_rawmidi_substream {
12131 + struct list_head list; /* list of all substream for given stream */
12132 + int stream; /* direction */
12133 + int number; /* substream number */
12134 +- unsigned int opened: 1, /* open flag */
12135 +- append: 1, /* append flag (merge more streams) */
12136 +- active_sensing: 1; /* send active sensing when close */
12137 ++ bool opened; /* open flag */
12138 ++ bool append; /* append flag (merge more streams) */
12139 ++ bool active_sensing; /* send active sensing when close */
12140 + int use_count; /* use counter (for output) */
12141 + size_t bytes;
12142 + struct snd_rawmidi *rmidi;
12143 +diff --git a/ipc/sem.c b/ipc/sem.c
12144 +index 10b94bc59d4a..5cd9d802592f 100644
12145 +--- a/ipc/sem.c
12146 ++++ b/ipc/sem.c
12147 +@@ -2159,11 +2159,9 @@ void exit_sem(struct task_struct *tsk)
12148 + ipc_assert_locked_object(&sma->sem_perm);
12149 + list_del(&un->list_id);
12150 +
12151 +- /* we are the last process using this ulp, acquiring ulp->lock
12152 +- * isn't required. Besides that, we are also protected against
12153 +- * IPC_RMID as we hold sma->sem_perm lock now
12154 +- */
12155 ++ spin_lock(&ulp->lock);
12156 + list_del_rcu(&un->list_proc);
12157 ++ spin_unlock(&ulp->lock);
12158 +
12159 + /* perform adjustments registered in un */
12160 + for (i = 0; i < sma->sem_nsems; i++) {
12161 +diff --git a/kernel/cpu.c b/kernel/cpu.c
12162 +index c2573e858009..1fbe93fefc1f 100644
12163 +--- a/kernel/cpu.c
12164 ++++ b/kernel/cpu.c
12165 +@@ -515,8 +515,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
12166 + if (WARN_ON_ONCE((!cpu_online(cpu))))
12167 + return -ECANCELED;
12168 +
12169 +- /* Unpark the stopper thread and the hotplug thread of the target cpu */
12170 +- stop_machine_unpark(cpu);
12171 ++ /* Unpark the hotplug thread of the target cpu */
12172 + kthread_unpark(st->thread);
12173 +
12174 + /*
12175 +@@ -1115,8 +1114,8 @@ void notify_cpu_starting(unsigned int cpu)
12176 +
12177 + /*
12178 + * Called from the idle task. Wake up the controlling task which brings the
12179 +- * stopper and the hotplug thread of the upcoming CPU up and then delegates
12180 +- * the rest of the online bringup to the hotplug thread.
12181 ++ * hotplug thread of the upcoming CPU up and then delegates the rest of the
12182 ++ * online bringup to the hotplug thread.
12183 + */
12184 + void cpuhp_online_idle(enum cpuhp_state state)
12185 + {
12186 +@@ -1126,6 +1125,12 @@ void cpuhp_online_idle(enum cpuhp_state state)
12187 + if (state != CPUHP_AP_ONLINE_IDLE)
12188 + return;
12189 +
12190 ++ /*
12191 ++ * Unpart the stopper thread before we start the idle loop (and start
12192 ++ * scheduling); this ensures the stopper task is always available.
12193 ++ */
12194 ++ stop_machine_unpark(smp_processor_id());
12195 ++
12196 + st->state = CPUHP_AP_ONLINE_IDLE;
12197 + complete(&st->done);
12198 + }
12199 +diff --git a/kernel/padata.c b/kernel/padata.c
12200 +index 63449fc584da..286c5142a0f7 100644
12201 +--- a/kernel/padata.c
12202 ++++ b/kernel/padata.c
12203 +@@ -34,6 +34,8 @@
12204 +
12205 + #define MAX_OBJ_NUM 1000
12206 +
12207 ++static void padata_free_pd(struct parallel_data *pd);
12208 ++
12209 + static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
12210 + {
12211 + int cpu, target_cpu;
12212 +@@ -301,6 +303,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
12213 + struct padata_serial_queue *squeue;
12214 + struct parallel_data *pd;
12215 + LIST_HEAD(local_list);
12216 ++ int cnt;
12217 +
12218 + local_bh_disable();
12219 + squeue = container_of(serial_work, struct padata_serial_queue, work);
12220 +@@ -310,6 +313,8 @@ static void padata_serial_worker(struct work_struct *serial_work)
12221 + list_replace_init(&squeue->serial.list, &local_list);
12222 + spin_unlock(&squeue->serial.lock);
12223 +
12224 ++ cnt = 0;
12225 ++
12226 + while (!list_empty(&local_list)) {
12227 + struct padata_priv *padata;
12228 +
12229 +@@ -319,9 +324,12 @@ static void padata_serial_worker(struct work_struct *serial_work)
12230 + list_del_init(&padata->list);
12231 +
12232 + padata->serial(padata);
12233 +- atomic_dec(&pd->refcnt);
12234 ++ cnt++;
12235 + }
12236 + local_bh_enable();
12237 ++
12238 ++ if (atomic_sub_and_test(cnt, &pd->refcnt))
12239 ++ padata_free_pd(pd);
12240 + }
12241 +
12242 + /**
12243 +@@ -444,7 +452,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
12244 + setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
12245 + atomic_set(&pd->seq_nr, -1);
12246 + atomic_set(&pd->reorder_objects, 0);
12247 +- atomic_set(&pd->refcnt, 0);
12248 ++ atomic_set(&pd->refcnt, 1);
12249 + pd->pinst = pinst;
12250 + spin_lock_init(&pd->lock);
12251 +
12252 +@@ -469,31 +477,6 @@ static void padata_free_pd(struct parallel_data *pd)
12253 + kfree(pd);
12254 + }
12255 +
12256 +-/* Flush all objects out of the padata queues. */
12257 +-static void padata_flush_queues(struct parallel_data *pd)
12258 +-{
12259 +- int cpu;
12260 +- struct padata_parallel_queue *pqueue;
12261 +- struct padata_serial_queue *squeue;
12262 +-
12263 +- for_each_cpu(cpu, pd->cpumask.pcpu) {
12264 +- pqueue = per_cpu_ptr(pd->pqueue, cpu);
12265 +- flush_work(&pqueue->work);
12266 +- }
12267 +-
12268 +- del_timer_sync(&pd->timer);
12269 +-
12270 +- if (atomic_read(&pd->reorder_objects))
12271 +- padata_reorder(pd);
12272 +-
12273 +- for_each_cpu(cpu, pd->cpumask.cbcpu) {
12274 +- squeue = per_cpu_ptr(pd->squeue, cpu);
12275 +- flush_work(&squeue->work);
12276 +- }
12277 +-
12278 +- BUG_ON(atomic_read(&pd->refcnt) != 0);
12279 +-}
12280 +-
12281 + static void __padata_start(struct padata_instance *pinst)
12282 + {
12283 + pinst->flags |= PADATA_INIT;
12284 +@@ -507,10 +490,6 @@ static void __padata_stop(struct padata_instance *pinst)
12285 + pinst->flags &= ~PADATA_INIT;
12286 +
12287 + synchronize_rcu();
12288 +-
12289 +- get_online_cpus();
12290 +- padata_flush_queues(pinst->pd);
12291 +- put_online_cpus();
12292 + }
12293 +
12294 + /* Replace the internal control structure with a new one. */
12295 +@@ -531,8 +510,8 @@ static void padata_replace(struct padata_instance *pinst,
12296 + if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
12297 + notification_mask |= PADATA_CPU_SERIAL;
12298 +
12299 +- padata_flush_queues(pd_old);
12300 +- padata_free_pd(pd_old);
12301 ++ if (atomic_dec_and_test(&pd_old->refcnt))
12302 ++ padata_free_pd(pd_old);
12303 +
12304 + if (notification_mask)
12305 + blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
12306 +diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
12307 +index 71a40e5c3a9f..2ae98f8bce81 100644
12308 +--- a/kernel/trace/ftrace.c
12309 ++++ b/kernel/trace/ftrace.c
12310 +@@ -5455,9 +5455,10 @@ static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
12311 + struct trace_array *tr = m->private;
12312 + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
12313 +
12314 +- if (v == FTRACE_NO_PIDS)
12315 ++ if (v == FTRACE_NO_PIDS) {
12316 ++ (*pos)++;
12317 + return NULL;
12318 +-
12319 ++ }
12320 + return trace_pid_next(pid_list, v, pos);
12321 + }
12322 +
12323 +diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
12324 +index 7e6971ba9541..8a88e85c8c61 100644
12325 +--- a/kernel/trace/trace_events_trigger.c
12326 ++++ b/kernel/trace/trace_events_trigger.c
12327 +@@ -126,9 +126,10 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
12328 + {
12329 + struct trace_event_file *event_file = event_file_data(m->private);
12330 +
12331 +- if (t == SHOW_AVAILABLE_TRIGGERS)
12332 ++ if (t == SHOW_AVAILABLE_TRIGGERS) {
12333 ++ (*pos)++;
12334 + return NULL;
12335 +-
12336 ++ }
12337 + return seq_list_next(t, &event_file->triggers, pos);
12338 + }
12339 +
12340 +diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
12341 +index 413ff108fbd0..d19f2191960e 100644
12342 +--- a/kernel/trace/trace_stat.c
12343 ++++ b/kernel/trace/trace_stat.c
12344 +@@ -277,18 +277,22 @@ static int tracing_stat_init(void)
12345 +
12346 + d_tracing = tracing_init_dentry();
12347 + if (IS_ERR(d_tracing))
12348 +- return 0;
12349 ++ return -ENODEV;
12350 +
12351 + stat_dir = tracefs_create_dir("trace_stat", d_tracing);
12352 +- if (!stat_dir)
12353 ++ if (!stat_dir) {
12354 + pr_warn("Could not create tracefs 'trace_stat' entry\n");
12355 ++ return -ENOMEM;
12356 ++ }
12357 + return 0;
12358 + }
12359 +
12360 + static int init_stat_file(struct stat_session *session)
12361 + {
12362 +- if (!stat_dir && tracing_stat_init())
12363 +- return -ENODEV;
12364 ++ int ret;
12365 ++
12366 ++ if (!stat_dir && (ret = tracing_stat_init()))
12367 ++ return ret;
12368 +
12369 + session->file = tracefs_create_file(session->ts->name, 0644,
12370 + stat_dir,
12371 +@@ -301,7 +305,7 @@ static int init_stat_file(struct stat_session *session)
12372 + int register_stat_tracer(struct tracer_stat *trace)
12373 + {
12374 + struct stat_session *session, *node;
12375 +- int ret;
12376 ++ int ret = -EINVAL;
12377 +
12378 + if (!trace)
12379 + return -EINVAL;
12380 +@@ -312,17 +316,15 @@ int register_stat_tracer(struct tracer_stat *trace)
12381 + /* Already registered? */
12382 + mutex_lock(&all_stat_sessions_mutex);
12383 + list_for_each_entry(node, &all_stat_sessions, session_list) {
12384 +- if (node->ts == trace) {
12385 +- mutex_unlock(&all_stat_sessions_mutex);
12386 +- return -EINVAL;
12387 +- }
12388 ++ if (node->ts == trace)
12389 ++ goto out;
12390 + }
12391 +- mutex_unlock(&all_stat_sessions_mutex);
12392 +
12393 ++ ret = -ENOMEM;
12394 + /* Init the session */
12395 + session = kzalloc(sizeof(*session), GFP_KERNEL);
12396 + if (!session)
12397 +- return -ENOMEM;
12398 ++ goto out;
12399 +
12400 + session->ts = trace;
12401 + INIT_LIST_HEAD(&session->session_list);
12402 +@@ -331,15 +333,16 @@ int register_stat_tracer(struct tracer_stat *trace)
12403 + ret = init_stat_file(session);
12404 + if (ret) {
12405 + destroy_session(session);
12406 +- return ret;
12407 ++ goto out;
12408 + }
12409 +
12410 ++ ret = 0;
12411 + /* Register */
12412 +- mutex_lock(&all_stat_sessions_mutex);
12413 + list_add_tail(&session->session_list, &all_stat_sessions);
12414 ++ out:
12415 + mutex_unlock(&all_stat_sessions_mutex);
12416 +
12417 +- return 0;
12418 ++ return ret;
12419 + }
12420 +
12421 + void unregister_stat_tracer(struct tracer_stat *trace)
12422 +diff --git a/lib/scatterlist.c b/lib/scatterlist.c
12423 +index a854cc39f084..ef8c14a56d0a 100644
12424 +--- a/lib/scatterlist.c
12425 ++++ b/lib/scatterlist.c
12426 +@@ -317,7 +317,7 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents,
12427 + if (prv)
12428 + table->nents = ++table->orig_nents;
12429 +
12430 +- return -ENOMEM;
12431 ++ return -ENOMEM;
12432 + }
12433 +
12434 + sg_init_table(sg, alloc_size);
12435 +diff --git a/lib/stackdepot.c b/lib/stackdepot.c
12436 +index f87d138e9672..759ff419fe61 100644
12437 +--- a/lib/stackdepot.c
12438 ++++ b/lib/stackdepot.c
12439 +@@ -92,15 +92,19 @@ static bool init_stack_slab(void **prealloc)
12440 + return true;
12441 + if (stack_slabs[depot_index] == NULL) {
12442 + stack_slabs[depot_index] = *prealloc;
12443 ++ *prealloc = NULL;
12444 + } else {
12445 +- stack_slabs[depot_index + 1] = *prealloc;
12446 ++ /* If this is the last depot slab, do not touch the next one. */
12447 ++ if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) {
12448 ++ stack_slabs[depot_index + 1] = *prealloc;
12449 ++ *prealloc = NULL;
12450 ++ }
12451 + /*
12452 + * This smp_store_release pairs with smp_load_acquire() from
12453 + * |next_slab_inited| above and in depot_save_stack().
12454 + */
12455 + smp_store_release(&next_slab_inited, 1);
12456 + }
12457 +- *prealloc = NULL;
12458 + return true;
12459 + }
12460 +
12461 +diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
12462 +index dffee9d47ec4..7b993f25aab9 100644
12463 +--- a/net/netfilter/xt_bpf.c
12464 ++++ b/net/netfilter/xt_bpf.c
12465 +@@ -25,6 +25,9 @@ static int bpf_mt_check(const struct xt_mtchk_param *par)
12466 + struct xt_bpf_info *info = par->matchinfo;
12467 + struct sock_fprog_kern program;
12468 +
12469 ++ if (info->bpf_program_num_elem > XT_BPF_MAX_NUM_INSTR)
12470 ++ return -EINVAL;
12471 ++
12472 + program.len = info->bpf_program_num_elem;
12473 + program.filter = info->bpf_program;
12474 +
12475 +diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
12476 +index a1a29cdc58fc..140a9ae262ef 100644
12477 +--- a/net/netfilter/xt_hashlimit.c
12478 ++++ b/net/netfilter/xt_hashlimit.c
12479 +@@ -735,6 +735,8 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
12480 + return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2);
12481 + }
12482 +
12483 ++#define HASHLIMIT_MAX_SIZE 1048576
12484 ++
12485 + static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
12486 + struct xt_hashlimit_htable **hinfo,
12487 + struct hashlimit_cfg2 *cfg,
12488 +@@ -745,6 +747,14 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
12489 +
12490 + if (cfg->gc_interval == 0 || cfg->expire == 0)
12491 + return -EINVAL;
12492 ++ if (cfg->size > HASHLIMIT_MAX_SIZE) {
12493 ++ cfg->size = HASHLIMIT_MAX_SIZE;
12494 ++ pr_info_ratelimited("size too large, truncated to %u\n", cfg->size);
12495 ++ }
12496 ++ if (cfg->max > HASHLIMIT_MAX_SIZE) {
12497 ++ cfg->max = HASHLIMIT_MAX_SIZE;
12498 ++ pr_info_ratelimited("max too large, truncated to %u\n", cfg->max);
12499 ++ }
12500 + if (par->family == NFPROTO_IPV4) {
12501 + if (cfg->srcmask > 32 || cfg->dstmask > 32)
12502 + return -EINVAL;
12503 +diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
12504 +index eee299bb6bcf..de03b7b49e05 100644
12505 +--- a/net/sched/cls_flower.c
12506 ++++ b/net/sched/cls_flower.c
12507 +@@ -364,6 +364,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
12508 + [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 },
12509 + [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 },
12510 + [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 },
12511 ++ [TCA_FLOWER_FLAGS] = { .type = NLA_U32 },
12512 + };
12513 +
12514 + static void fl_set_key_val(struct nlattr **tb,
12515 +diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
12516 +index 61ddfbad2aae..fe29c576e494 100644
12517 +--- a/net/sched/cls_matchall.c
12518 ++++ b/net/sched/cls_matchall.c
12519 +@@ -111,6 +111,7 @@ static unsigned long mall_get(struct tcf_proto *tp, u32 handle)
12520 + static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
12521 + [TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC },
12522 + [TCA_MATCHALL_CLASSID] = { .type = NLA_U32 },
12523 ++ [TCA_MATCHALL_FLAGS] = { .type = NLA_U32 },
12524 + };
12525 +
12526 + static int mall_set_parms(struct net *net, struct tcf_proto *tp,
12527 +diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c
12528 +index 27aac273205b..fa423fcd1a92 100644
12529 +--- a/scripts/kconfig/confdata.c
12530 ++++ b/scripts/kconfig/confdata.c
12531 +@@ -1238,7 +1238,7 @@ bool conf_set_all_new_symbols(enum conf_def_mode mode)
12532 +
12533 + sym_calc_value(csym);
12534 + if (mode == def_random)
12535 +- has_changed = randomize_choice_values(csym);
12536 ++ has_changed |= randomize_choice_values(csym);
12537 + else {
12538 + set_all_choice_values(csym);
12539 + has_changed = true;
12540 +diff --git a/security/selinux/avc.c b/security/selinux/avc.c
12541 +index 52f3c550abcc..f3c473791b69 100644
12542 +--- a/security/selinux/avc.c
12543 ++++ b/security/selinux/avc.c
12544 +@@ -865,7 +865,7 @@ static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid,
12545 + if (orig->ae.xp_node) {
12546 + rc = avc_xperms_populate(node, orig->ae.xp_node);
12547 + if (rc) {
12548 +- kmem_cache_free(avc_node_cachep, node);
12549 ++ avc_node_kill(node);
12550 + goto out_unlock;
12551 + }
12552 + }
12553 +diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c
12554 +index eee4ea17a8f5..198eea5c8c2f 100644
12555 +--- a/sound/core/seq/seq_clientmgr.c
12556 ++++ b/sound/core/seq/seq_clientmgr.c
12557 +@@ -564,7 +564,7 @@ static int update_timestamp_of_queue(struct snd_seq_event *event,
12558 + event->queue = queue;
12559 + event->flags &= ~SNDRV_SEQ_TIME_STAMP_MASK;
12560 + if (real_time) {
12561 +- event->time.time = snd_seq_timer_get_cur_time(q->timer);
12562 ++ event->time.time = snd_seq_timer_get_cur_time(q->timer, true);
12563 + event->flags |= SNDRV_SEQ_TIME_STAMP_REAL;
12564 + } else {
12565 + event->time.tick = snd_seq_timer_get_cur_tick(q->timer);
12566 +@@ -1639,7 +1639,7 @@ static int snd_seq_ioctl_get_queue_status(struct snd_seq_client *client,
12567 + tmr = queue->timer;
12568 + status->events = queue->tickq->cells + queue->timeq->cells;
12569 +
12570 +- status->time = snd_seq_timer_get_cur_time(tmr);
12571 ++ status->time = snd_seq_timer_get_cur_time(tmr, true);
12572 + status->tick = snd_seq_timer_get_cur_tick(tmr);
12573 +
12574 + status->running = tmr->running;
12575 +diff --git a/sound/core/seq/seq_queue.c b/sound/core/seq/seq_queue.c
12576 +index 1a6dc4ff44a6..ea1aa0796276 100644
12577 +--- a/sound/core/seq/seq_queue.c
12578 ++++ b/sound/core/seq/seq_queue.c
12579 +@@ -261,6 +261,8 @@ void snd_seq_check_queue(struct snd_seq_queue *q, int atomic, int hop)
12580 + {
12581 + unsigned long flags;
12582 + struct snd_seq_event_cell *cell;
12583 ++ snd_seq_tick_time_t cur_tick;
12584 ++ snd_seq_real_time_t cur_time;
12585 +
12586 + if (q == NULL)
12587 + return;
12588 +@@ -277,17 +279,18 @@ void snd_seq_check_queue(struct snd_seq_queue *q, int atomic, int hop)
12589 +
12590 + __again:
12591 + /* Process tick queue... */
12592 ++ cur_tick = snd_seq_timer_get_cur_tick(q->timer);
12593 + for (;;) {
12594 +- cell = snd_seq_prioq_cell_out(q->tickq,
12595 +- &q->timer->tick.cur_tick);
12596 ++ cell = snd_seq_prioq_cell_out(q->tickq, &cur_tick);
12597 + if (!cell)
12598 + break;
12599 + snd_seq_dispatch_event(cell, atomic, hop);
12600 + }
12601 +
12602 + /* Process time queue... */
12603 ++ cur_time = snd_seq_timer_get_cur_time(q->timer, false);
12604 + for (;;) {
12605 +- cell = snd_seq_prioq_cell_out(q->timeq, &q->timer->cur_time);
12606 ++ cell = snd_seq_prioq_cell_out(q->timeq, &cur_time);
12607 + if (!cell)
12608 + break;
12609 + snd_seq_dispatch_event(cell, atomic, hop);
12610 +@@ -415,6 +418,7 @@ int snd_seq_queue_check_access(int queueid, int client)
12611 + int snd_seq_queue_set_owner(int queueid, int client, int locked)
12612 + {
12613 + struct snd_seq_queue *q = queueptr(queueid);
12614 ++ unsigned long flags;
12615 +
12616 + if (q == NULL)
12617 + return -EINVAL;
12618 +@@ -424,8 +428,10 @@ int snd_seq_queue_set_owner(int queueid, int client, int locked)
12619 + return -EPERM;
12620 + }
12621 +
12622 ++ spin_lock_irqsave(&q->owner_lock, flags);
12623 + q->locked = locked ? 1 : 0;
12624 + q->owner = client;
12625 ++ spin_unlock_irqrestore(&q->owner_lock, flags);
12626 + queue_access_unlock(q);
12627 + queuefree(q);
12628 +
12629 +@@ -564,15 +570,17 @@ void snd_seq_queue_client_termination(int client)
12630 + unsigned long flags;
12631 + int i;
12632 + struct snd_seq_queue *q;
12633 ++ bool matched;
12634 +
12635 + for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) {
12636 + if ((q = queueptr(i)) == NULL)
12637 + continue;
12638 + spin_lock_irqsave(&q->owner_lock, flags);
12639 +- if (q->owner == client)
12640 ++ matched = (q->owner == client);
12641 ++ if (matched)
12642 + q->klocked = 1;
12643 + spin_unlock_irqrestore(&q->owner_lock, flags);
12644 +- if (q->owner == client) {
12645 ++ if (matched) {
12646 + if (q->timer->running)
12647 + snd_seq_timer_stop(q->timer);
12648 + snd_seq_timer_reset(q->timer);
12649 +@@ -764,6 +772,8 @@ void snd_seq_info_queues_read(struct snd_info_entry *entry,
12650 + int i, bpm;
12651 + struct snd_seq_queue *q;
12652 + struct snd_seq_timer *tmr;
12653 ++ bool locked;
12654 ++ int owner;
12655 +
12656 + for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) {
12657 + if ((q = queueptr(i)) == NULL)
12658 +@@ -775,9 +785,14 @@ void snd_seq_info_queues_read(struct snd_info_entry *entry,
12659 + else
12660 + bpm = 0;
12661 +
12662 ++ spin_lock_irq(&q->owner_lock);
12663 ++ locked = q->locked;
12664 ++ owner = q->owner;
12665 ++ spin_unlock_irq(&q->owner_lock);
12666 ++
12667 + snd_iprintf(buffer, "queue %d: [%s]\n", q->queue, q->name);
12668 +- snd_iprintf(buffer, "owned by client : %d\n", q->owner);
12669 +- snd_iprintf(buffer, "lock status : %s\n", q->locked ? "Locked" : "Free");
12670 ++ snd_iprintf(buffer, "owned by client : %d\n", owner);
12671 ++ snd_iprintf(buffer, "lock status : %s\n", locked ? "Locked" : "Free");
12672 + snd_iprintf(buffer, "queued time events : %d\n", snd_seq_prioq_avail(q->timeq));
12673 + snd_iprintf(buffer, "queued tick events : %d\n", snd_seq_prioq_avail(q->tickq));
12674 + snd_iprintf(buffer, "timer state : %s\n", tmr->running ? "Running" : "Stopped");
12675 +diff --git a/sound/core/seq/seq_timer.c b/sound/core/seq/seq_timer.c
12676 +index 0e1feb597586..bd5e5a5d52a8 100644
12677 +--- a/sound/core/seq/seq_timer.c
12678 ++++ b/sound/core/seq/seq_timer.c
12679 +@@ -436,14 +436,15 @@ int snd_seq_timer_continue(struct snd_seq_timer *tmr)
12680 + }
12681 +
12682 + /* return current 'real' time. use timeofday() to get better granularity. */
12683 +-snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr)
12684 ++snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr,
12685 ++ bool adjust_ktime)
12686 + {
12687 + snd_seq_real_time_t cur_time;
12688 + unsigned long flags;
12689 +
12690 + spin_lock_irqsave(&tmr->lock, flags);
12691 + cur_time = tmr->cur_time;
12692 +- if (tmr->running) {
12693 ++ if (adjust_ktime && tmr->running) {
12694 + struct timespec64 tm;
12695 +
12696 + ktime_get_ts64(&tm);
12697 +@@ -460,7 +461,13 @@ snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr)
12698 + high PPQ values) */
12699 + snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr)
12700 + {
12701 +- return tmr->tick.cur_tick;
12702 ++ snd_seq_tick_time_t cur_tick;
12703 ++ unsigned long flags;
12704 ++
12705 ++ spin_lock_irqsave(&tmr->lock, flags);
12706 ++ cur_tick = tmr->tick.cur_tick;
12707 ++ spin_unlock_irqrestore(&tmr->lock, flags);
12708 ++ return cur_tick;
12709 + }
12710 +
12711 +
12712 +diff --git a/sound/core/seq/seq_timer.h b/sound/core/seq/seq_timer.h
12713 +index 9506b661fe5b..5d47d559465e 100644
12714 +--- a/sound/core/seq/seq_timer.h
12715 ++++ b/sound/core/seq/seq_timer.h
12716 +@@ -135,7 +135,8 @@ int snd_seq_timer_set_ppq(struct snd_seq_timer *tmr, int ppq);
12717 + int snd_seq_timer_set_position_tick(struct snd_seq_timer *tmr, snd_seq_tick_time_t position);
12718 + int snd_seq_timer_set_position_time(struct snd_seq_timer *tmr, snd_seq_real_time_t position);
12719 + int snd_seq_timer_set_skew(struct snd_seq_timer *tmr, unsigned int skew, unsigned int base);
12720 +-snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr);
12721 ++snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr,
12722 ++ bool adjust_ktime);
12723 + snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr);
12724 +
12725 + extern int seq_default_timer_class;
12726 +diff --git a/sound/hda/hdmi_chmap.c b/sound/hda/hdmi_chmap.c
12727 +index f21633cd9b38..acbe61b8db7b 100644
12728 +--- a/sound/hda/hdmi_chmap.c
12729 ++++ b/sound/hda/hdmi_chmap.c
12730 +@@ -249,7 +249,7 @@ void snd_hdac_print_channel_allocation(int spk_alloc, char *buf, int buflen)
12731 +
12732 + for (i = 0, j = 0; i < ARRAY_SIZE(cea_speaker_allocation_names); i++) {
12733 + if (spk_alloc & (1 << i))
12734 +- j += snprintf(buf + j, buflen - j, " %s",
12735 ++ j += scnprintf(buf + j, buflen - j, " %s",
12736 + cea_speaker_allocation_names[i]);
12737 + }
12738 + buf[j] = '\0'; /* necessary when j == 0 */
12739 +diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c
12740 +index 1b5e217d1bb2..2ad28ce7ff49 100644
12741 +--- a/sound/pci/hda/hda_codec.c
12742 ++++ b/sound/pci/hda/hda_codec.c
12743 +@@ -4104,7 +4104,7 @@ void snd_print_pcm_bits(int pcm, char *buf, int buflen)
12744 +
12745 + for (i = 0, j = 0; i < ARRAY_SIZE(bits); i++)
12746 + if (pcm & (AC_SUPPCM_BITS_8 << i))
12747 +- j += snprintf(buf + j, buflen - j, " %d", bits[i]);
12748 ++ j += scnprintf(buf + j, buflen - j, " %d", bits[i]);
12749 +
12750 + buf[j] = '\0'; /* necessary when j == 0 */
12751 + }
12752 +diff --git a/sound/pci/hda/hda_eld.c b/sound/pci/hda/hda_eld.c
12753 +index ba7fe9b6655c..864cc8c9ada0 100644
12754 +--- a/sound/pci/hda/hda_eld.c
12755 ++++ b/sound/pci/hda/hda_eld.c
12756 +@@ -373,7 +373,7 @@ static void hdmi_print_pcm_rates(int pcm, char *buf, int buflen)
12757 +
12758 + for (i = 0, j = 0; i < ARRAY_SIZE(alsa_rates); i++)
12759 + if (pcm & (1 << i))
12760 +- j += snprintf(buf + j, buflen - j, " %d",
12761 ++ j += scnprintf(buf + j, buflen - j, " %d",
12762 + alsa_rates[i]);
12763 +
12764 + buf[j] = '\0'; /* necessary when j == 0 */
12765 +diff --git a/sound/pci/hda/hda_sysfs.c b/sound/pci/hda/hda_sysfs.c
12766 +index 9739fce9e032..f3ac19d33bd4 100644
12767 +--- a/sound/pci/hda/hda_sysfs.c
12768 ++++ b/sound/pci/hda/hda_sysfs.c
12769 +@@ -221,7 +221,7 @@ static ssize_t init_verbs_show(struct device *dev,
12770 + mutex_lock(&codec->user_mutex);
12771 + for (i = 0; i < codec->init_verbs.used; i++) {
12772 + struct hda_verb *v = snd_array_elem(&codec->init_verbs, i);
12773 +- len += snprintf(buf + len, PAGE_SIZE - len,
12774 ++ len += scnprintf(buf + len, PAGE_SIZE - len,
12775 + "0x%02x 0x%03x 0x%04x\n",
12776 + v->nid, v->verb, v->param);
12777 + }
12778 +@@ -271,7 +271,7 @@ static ssize_t hints_show(struct device *dev,
12779 + mutex_lock(&codec->user_mutex);
12780 + for (i = 0; i < codec->hints.used; i++) {
12781 + struct hda_hint *hint = snd_array_elem(&codec->hints, i);
12782 +- len += snprintf(buf + len, PAGE_SIZE - len,
12783 ++ len += scnprintf(buf + len, PAGE_SIZE - len,
12784 + "%s = %s\n", hint->key, hint->val);
12785 + }
12786 + mutex_unlock(&codec->user_mutex);
12787 +diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
12788 +index 8557b94e462c..1e99500dbb6c 100644
12789 +--- a/sound/pci/hda/patch_conexant.c
12790 ++++ b/sound/pci/hda/patch_conexant.c
12791 +@@ -853,6 +853,7 @@ static const struct snd_pci_quirk cxt5066_fixups[] = {
12792 + SND_PCI_QUIRK(0x17aa, 0x215f, "Lenovo T510", CXT_PINCFG_LENOVO_TP410),
12793 + SND_PCI_QUIRK(0x17aa, 0x21ce, "Lenovo T420", CXT_PINCFG_LENOVO_TP410),
12794 + SND_PCI_QUIRK(0x17aa, 0x21cf, "Lenovo T520", CXT_PINCFG_LENOVO_TP410),
12795 ++ SND_PCI_QUIRK(0x17aa, 0x21d2, "Lenovo T420s", CXT_PINCFG_LENOVO_TP410),
12796 + SND_PCI_QUIRK(0x17aa, 0x21da, "Lenovo X220", CXT_PINCFG_LENOVO_TP410),
12797 + SND_PCI_QUIRK(0x17aa, 0x21db, "Lenovo X220-tablet", CXT_PINCFG_LENOVO_TP410),
12798 + SND_PCI_QUIRK(0x17aa, 0x38af, "Lenovo IdeaPad Z560", CXT_FIXUP_MUTE_LED_EAPD),
12799 +diff --git a/sound/sh/aica.c b/sound/sh/aica.c
12800 +index fbbc25279559..2a127feb8e29 100644
12801 +--- a/sound/sh/aica.c
12802 ++++ b/sound/sh/aica.c
12803 +@@ -117,10 +117,10 @@ static void spu_memset(u32 toi, u32 what, int length)
12804 + }
12805 +
12806 + /* spu_memload - write to SPU address space */
12807 +-static void spu_memload(u32 toi, void *from, int length)
12808 ++static void spu_memload(u32 toi, const void *from, int length)
12809 + {
12810 + unsigned long flags;
12811 +- u32 *froml = from;
12812 ++ const u32 *froml = from;
12813 + u32 __iomem *to = (u32 __iomem *) (SPU_MEMORY_BASE + toi);
12814 + int i;
12815 + u32 val;
12816 +diff --git a/sound/soc/atmel/Kconfig b/sound/soc/atmel/Kconfig
12817 +index 22aec9a1e9a4..838d03a138ca 100644
12818 +--- a/sound/soc/atmel/Kconfig
12819 ++++ b/sound/soc/atmel/Kconfig
12820 +@@ -25,6 +25,8 @@ config SND_ATMEL_SOC_DMA
12821 +
12822 + config SND_ATMEL_SOC_SSC_DMA
12823 + tristate
12824 ++ select SND_ATMEL_SOC_DMA
12825 ++ select SND_ATMEL_SOC_PDC
12826 +
12827 + config SND_ATMEL_SOC_SSC
12828 + tristate
12829 +diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
12830 +index a5299cbb09ba..064f3485a977 100644
12831 +--- a/sound/usb/quirks.c
12832 ++++ b/sound/usb/quirks.c
12833 +@@ -1149,6 +1149,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip)
12834 + case USB_ID(0x1de7, 0x0014): /* Phoenix Audio TMX320 */
12835 + case USB_ID(0x1de7, 0x0114): /* Phoenix Audio MT202pcs */
12836 + case USB_ID(0x21B4, 0x0081): /* AudioQuest DragonFly */
12837 ++ case USB_ID(0x2912, 0x30c8): /* Audioengine D1 */
12838 + return true;
12839 + }
12840 + return false;
12841 +diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c
12842 +index 0b34dbc8f302..7dcb33d3886b 100644
12843 +--- a/sound/usb/usx2y/usX2Yhwdep.c
12844 ++++ b/sound/usb/usx2y/usX2Yhwdep.c
12845 +@@ -132,7 +132,7 @@ static int snd_usX2Y_hwdep_dsp_status(struct snd_hwdep *hw,
12846 + info->num_dsps = 2; // 0: Prepad Data, 1: FPGA Code
12847 + if (us428->chip_status & USX2Y_STAT_CHIP_INIT)
12848 + info->chip_ready = 1;
12849 +- info->version = USX2Y_DRIVER_VERSION;
12850 ++ info->version = USX2Y_DRIVER_VERSION;
12851 + return 0;
12852 + }
12853 +
12854 +diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c
12855 +index f99f49e4a31e..21e714cf0126 100644
12856 +--- a/tools/lib/api/fs/fs.c
12857 ++++ b/tools/lib/api/fs/fs.c
12858 +@@ -194,6 +194,7 @@ static bool fs__env_override(struct fs *fs)
12859 + size_t name_len = strlen(fs->name);
12860 + /* name + "_PATH" + '\0' */
12861 + char upper_name[name_len + 5 + 1];
12862 ++
12863 + memcpy(upper_name, fs->name, name_len);
12864 + mem_toupper(upper_name, name_len);
12865 + strcpy(&upper_name[name_len], "_PATH");
12866 +@@ -203,7 +204,8 @@ static bool fs__env_override(struct fs *fs)
12867 + return false;
12868 +
12869 + fs->found = true;
12870 +- strncpy(fs->path, override_path, sizeof(fs->path));
12871 ++ strncpy(fs->path, override_path, sizeof(fs->path) - 1);
12872 ++ fs->path[sizeof(fs->path) - 1] = '\0';
12873 + return true;
12874 + }
12875 +
12876 +diff --git a/tools/objtool/arch/x86/lib/x86-opcode-map.txt b/tools/objtool/arch/x86/lib/x86-opcode-map.txt
12877 +index 0f7eb4f5bdb7..82e105b284e0 100644
12878 +--- a/tools/objtool/arch/x86/lib/x86-opcode-map.txt
12879 ++++ b/tools/objtool/arch/x86/lib/x86-opcode-map.txt
12880 +@@ -909,7 +909,7 @@ EndTable
12881 +
12882 + GrpTable: Grp3_2
12883 + 0: TEST Ev,Iz
12884 +-1:
12885 ++1: TEST Ev,Iz
12886 + 2: NOT Ev
12887 + 3: NEG Ev
12888 + 4: MUL rAX,Ev
12889 +diff --git a/tools/usb/usbip/src/usbip_network.c b/tools/usb/usbip/src/usbip_network.c
12890 +index b4c37e76a6e0..187dfaa67d0a 100644
12891 +--- a/tools/usb/usbip/src/usbip_network.c
12892 ++++ b/tools/usb/usbip/src/usbip_network.c
12893 +@@ -62,39 +62,39 @@ void usbip_setup_port_number(char *arg)
12894 + info("using port %d (\"%s\")", usbip_port, usbip_port_string);
12895 + }
12896 +
12897 +-void usbip_net_pack_uint32_t(int pack, uint32_t *num)
12898 ++uint32_t usbip_net_pack_uint32_t(int pack, uint32_t num)
12899 + {
12900 + uint32_t i;
12901 +
12902 + if (pack)
12903 +- i = htonl(*num);
12904 ++ i = htonl(num);
12905 + else
12906 +- i = ntohl(*num);
12907 ++ i = ntohl(num);
12908 +
12909 +- *num = i;
12910 ++ return i;
12911 + }
12912 +
12913 +-void usbip_net_pack_uint16_t(int pack, uint16_t *num)
12914 ++uint16_t usbip_net_pack_uint16_t(int pack, uint16_t num)
12915 + {
12916 + uint16_t i;
12917 +
12918 + if (pack)
12919 +- i = htons(*num);
12920 ++ i = htons(num);
12921 + else
12922 +- i = ntohs(*num);
12923 ++ i = ntohs(num);
12924 +
12925 +- *num = i;
12926 ++ return i;
12927 + }
12928 +
12929 + void usbip_net_pack_usb_device(int pack, struct usbip_usb_device *udev)
12930 + {
12931 +- usbip_net_pack_uint32_t(pack, &udev->busnum);
12932 +- usbip_net_pack_uint32_t(pack, &udev->devnum);
12933 +- usbip_net_pack_uint32_t(pack, &udev->speed);
12934 ++ udev->busnum = usbip_net_pack_uint32_t(pack, udev->busnum);
12935 ++ udev->devnum = usbip_net_pack_uint32_t(pack, udev->devnum);
12936 ++ udev->speed = usbip_net_pack_uint32_t(pack, udev->speed);
12937 +
12938 +- usbip_net_pack_uint16_t(pack, &udev->idVendor);
12939 +- usbip_net_pack_uint16_t(pack, &udev->idProduct);
12940 +- usbip_net_pack_uint16_t(pack, &udev->bcdDevice);
12941 ++ udev->idVendor = usbip_net_pack_uint16_t(pack, udev->idVendor);
12942 ++ udev->idProduct = usbip_net_pack_uint16_t(pack, udev->idProduct);
12943 ++ udev->bcdDevice = usbip_net_pack_uint16_t(pack, udev->bcdDevice);
12944 + }
12945 +
12946 + void usbip_net_pack_usb_interface(int pack __attribute__((unused)),
12947 +@@ -141,6 +141,14 @@ ssize_t usbip_net_send(int sockfd, void *buff, size_t bufflen)
12948 + return usbip_net_xmit(sockfd, buff, bufflen, 1);
12949 + }
12950 +
12951 ++static inline void usbip_net_pack_op_common(int pack,
12952 ++ struct op_common *op_common)
12953 ++{
12954 ++ op_common->version = usbip_net_pack_uint16_t(pack, op_common->version);
12955 ++ op_common->code = usbip_net_pack_uint16_t(pack, op_common->code);
12956 ++ op_common->status = usbip_net_pack_uint32_t(pack, op_common->status);
12957 ++}
12958 ++
12959 + int usbip_net_send_op_common(int sockfd, uint32_t code, uint32_t status)
12960 + {
12961 + struct op_common op_common;
12962 +@@ -152,7 +160,7 @@ int usbip_net_send_op_common(int sockfd, uint32_t code, uint32_t status)
12963 + op_common.code = code;
12964 + op_common.status = status;
12965 +
12966 +- PACK_OP_COMMON(1, &op_common);
12967 ++ usbip_net_pack_op_common(1, &op_common);
12968 +
12969 + rc = usbip_net_send(sockfd, &op_common, sizeof(op_common));
12970 + if (rc < 0) {
12971 +@@ -176,7 +184,7 @@ int usbip_net_recv_op_common(int sockfd, uint16_t *code)
12972 + goto err;
12973 + }
12974 +
12975 +- PACK_OP_COMMON(0, &op_common);
12976 ++ usbip_net_pack_op_common(0, &op_common);
12977 +
12978 + if (op_common.version != USBIP_VERSION) {
12979 + dbg("version mismatch: %d %d", op_common.version,
12980 +diff --git a/tools/usb/usbip/src/usbip_network.h b/tools/usb/usbip/src/usbip_network.h
12981 +index c1e875cf1078..573fa839b66b 100644
12982 +--- a/tools/usb/usbip/src/usbip_network.h
12983 ++++ b/tools/usb/usbip/src/usbip_network.h
12984 +@@ -33,12 +33,6 @@ struct op_common {
12985 +
12986 + } __attribute__((packed));
12987 +
12988 +-#define PACK_OP_COMMON(pack, op_common) do {\
12989 +- usbip_net_pack_uint16_t(pack, &(op_common)->version);\
12990 +- usbip_net_pack_uint16_t(pack, &(op_common)->code);\
12991 +- usbip_net_pack_uint32_t(pack, &(op_common)->status);\
12992 +-} while (0)
12993 +-
12994 + /* ---------------------------------------------------------------------- */
12995 + /* Dummy Code */
12996 + #define OP_UNSPEC 0x00
12997 +@@ -164,11 +158,11 @@ struct op_devlist_reply_extra {
12998 + } while (0)
12999 +
13000 + #define PACK_OP_DEVLIST_REPLY(pack, reply) do {\
13001 +- usbip_net_pack_uint32_t(pack, &(reply)->ndev);\
13002 ++ (reply)->ndev = usbip_net_pack_uint32_t(pack, (reply)->ndev);\
13003 + } while (0)
13004 +
13005 +-void usbip_net_pack_uint32_t(int pack, uint32_t *num);
13006 +-void usbip_net_pack_uint16_t(int pack, uint16_t *num);
13007 ++uint32_t usbip_net_pack_uint32_t(int pack, uint32_t num);
13008 ++uint16_t usbip_net_pack_uint16_t(int pack, uint16_t num);
13009 + void usbip_net_pack_usb_device(int pack, struct usbip_usb_device *udev);
13010 + void usbip_net_pack_usb_interface(int pack, struct usbip_usb_interface *uinf);
13011 +