Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.19 commit in: /
Date: Tue, 11 Feb 2020 16:20:58
Message-Id: 1581438035.7f8cc789b776e0cf768d27cff4118be5960094c0.mpagano@gentoo
1 commit: 7f8cc789b776e0cf768d27cff4118be5960094c0
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Tue Feb 11 16:20:35 2020 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Tue Feb 11 16:20:35 2020 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=7f8cc789
7
8 Linux patch 4.19.103
9
10 Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>
11
12 0000_README | 4 +
13 1102_linux-4.19.103.patch | 17138 ++++++++++++++++++++++++++++++++++++++++++++
14 2 files changed, 17142 insertions(+)
15
16 diff --git a/0000_README b/0000_README
17 index 0d9cae4..8ac0cbe 100644
18 --- a/0000_README
19 +++ b/0000_README
20 @@ -447,6 +447,10 @@ Patch: 1101_linux-4.19.102.patch
21 From: https://www.kernel.org
22 Desc: Linux 4.19.102
23
24 +Patch: 1102_linux-4.19.103.patch
25 +From: https://www.kernel.org
26 +Desc: Linux 4.19.103
27 +
28 Patch: 1500_XATTR_USER_PREFIX.patch
29 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
30 Desc: Support for namespace user.pax.* on tmpfs.
31
32 diff --git a/1102_linux-4.19.103.patch b/1102_linux-4.19.103.patch
33 new file mode 100644
34 index 0000000..7b0adc1
35 --- /dev/null
36 +++ b/1102_linux-4.19.103.patch
37 @@ -0,0 +1,17138 @@
38 +diff --git a/Makefile b/Makefile
39 +index 597a14e2127b..37f58becf5c2 100644
40 +--- a/Makefile
41 ++++ b/Makefile
42 +@@ -1,7 +1,7 @@
43 + # SPDX-License-Identifier: GPL-2.0
44 + VERSION = 4
45 + PATCHLEVEL = 19
46 +-SUBLEVEL = 102
47 ++SUBLEVEL = 103
48 + EXTRAVERSION =
49 + NAME = "People's Front"
50 +
51 +diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
52 +index 77121b713bef..7d2ca035d6c8 100644
53 +--- a/arch/arm/include/asm/kvm_emulate.h
54 ++++ b/arch/arm/include/asm/kvm_emulate.h
55 +@@ -26,13 +26,25 @@
56 + #include <asm/cputype.h>
57 +
58 + /* arm64 compatibility macros */
59 ++#define PSR_AA32_MODE_FIQ FIQ_MODE
60 ++#define PSR_AA32_MODE_SVC SVC_MODE
61 + #define PSR_AA32_MODE_ABT ABT_MODE
62 + #define PSR_AA32_MODE_UND UND_MODE
63 + #define PSR_AA32_T_BIT PSR_T_BIT
64 ++#define PSR_AA32_F_BIT PSR_F_BIT
65 + #define PSR_AA32_I_BIT PSR_I_BIT
66 + #define PSR_AA32_A_BIT PSR_A_BIT
67 + #define PSR_AA32_E_BIT PSR_E_BIT
68 + #define PSR_AA32_IT_MASK PSR_IT_MASK
69 ++#define PSR_AA32_GE_MASK 0x000f0000
70 ++#define PSR_AA32_DIT_BIT 0x00200000
71 ++#define PSR_AA32_PAN_BIT 0x00400000
72 ++#define PSR_AA32_SSBS_BIT 0x00800000
73 ++#define PSR_AA32_Q_BIT PSR_Q_BIT
74 ++#define PSR_AA32_V_BIT PSR_V_BIT
75 ++#define PSR_AA32_C_BIT PSR_C_BIT
76 ++#define PSR_AA32_Z_BIT PSR_Z_BIT
77 ++#define PSR_AA32_N_BIT PSR_N_BIT
78 +
79 + unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num);
80 +
81 +@@ -53,6 +65,11 @@ static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v)
82 + *__vcpu_spsr(vcpu) = v;
83 + }
84 +
85 ++static inline unsigned long host_spsr_to_spsr32(unsigned long spsr)
86 ++{
87 ++ return spsr;
88 ++}
89 ++
90 + static inline unsigned long vcpu_get_reg(struct kvm_vcpu *vcpu,
91 + u8 reg_num)
92 + {
93 +@@ -189,6 +206,11 @@ static inline bool kvm_vcpu_dabt_issext(struct kvm_vcpu *vcpu)
94 + return kvm_vcpu_get_hsr(vcpu) & HSR_SSE;
95 + }
96 +
97 ++static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu)
98 ++{
99 ++ return false;
100 ++}
101 ++
102 + static inline int kvm_vcpu_dabt_get_rd(struct kvm_vcpu *vcpu)
103 + {
104 + return (kvm_vcpu_get_hsr(vcpu) & HSR_SRT_MASK) >> HSR_SRT_SHIFT;
105 +diff --git a/arch/arm/include/asm/kvm_mmio.h b/arch/arm/include/asm/kvm_mmio.h
106 +index f3a7de71f515..848339d76f9a 100644
107 +--- a/arch/arm/include/asm/kvm_mmio.h
108 ++++ b/arch/arm/include/asm/kvm_mmio.h
109 +@@ -26,6 +26,8 @@
110 + struct kvm_decode {
111 + unsigned long rt;
112 + bool sign_extend;
113 ++ /* Not used on 32-bit arm */
114 ++ bool sixty_four;
115 + };
116 +
117 + void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
118 +diff --git a/arch/arm/mach-tegra/sleep-tegra30.S b/arch/arm/mach-tegra/sleep-tegra30.S
119 +index dd4a67dabd91..b7cd41461e7d 100644
120 +--- a/arch/arm/mach-tegra/sleep-tegra30.S
121 ++++ b/arch/arm/mach-tegra/sleep-tegra30.S
122 +@@ -382,6 +382,14 @@ _pll_m_c_x_done:
123 + pll_locked r1, r0, CLK_RESET_PLLC_BASE
124 + pll_locked r1, r0, CLK_RESET_PLLX_BASE
125 +
126 ++ tegra_get_soc_id TEGRA_APB_MISC_BASE, r1
127 ++ cmp r1, #TEGRA30
128 ++ beq 1f
129 ++ ldr r1, [r0, #CLK_RESET_PLLP_BASE]
130 ++ bic r1, r1, #(1<<31) @ disable PllP bypass
131 ++ str r1, [r0, #CLK_RESET_PLLP_BASE]
132 ++1:
133 ++
134 + mov32 r7, TEGRA_TMRUS_BASE
135 + ldr r1, [r7]
136 + add r1, r1, #LOCK_DELAY
137 +@@ -641,7 +649,10 @@ tegra30_switch_cpu_to_clk32k:
138 + str r0, [r4, #PMC_PLLP_WB0_OVERRIDE]
139 +
140 + /* disable PLLP, PLLA, PLLC and PLLX */
141 ++ tegra_get_soc_id TEGRA_APB_MISC_BASE, r1
142 ++ cmp r1, #TEGRA30
143 + ldr r0, [r5, #CLK_RESET_PLLP_BASE]
144 ++ orrne r0, r0, #(1 << 31) @ enable PllP bypass on fast cluster
145 + bic r0, r0, #(1 << 30)
146 + str r0, [r5, #CLK_RESET_PLLP_BASE]
147 + ldr r0, [r5, #CLK_RESET_PLLA_BASE]
148 +diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
149 +index 6106a85ae0be..778cb4f868d9 100644
150 +--- a/arch/arm64/include/asm/kvm_emulate.h
151 ++++ b/arch/arm64/include/asm/kvm_emulate.h
152 +@@ -202,6 +202,38 @@ static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v)
153 + vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1] = v;
154 + }
155 +
156 ++/*
157 ++ * The layout of SPSR for an AArch32 state is different when observed from an
158 ++ * AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32
159 ++ * view given an AArch64 view.
160 ++ *
161 ++ * In ARM DDI 0487E.a see:
162 ++ *
163 ++ * - The AArch64 view (SPSR_EL2) in section C5.2.18, page C5-426
164 ++ * - The AArch32 view (SPSR_abt) in section G8.2.126, page G8-6256
165 ++ * - The AArch32 view (SPSR_und) in section G8.2.132, page G8-6280
166 ++ *
167 ++ * Which show the following differences:
168 ++ *
169 ++ * | Bit | AA64 | AA32 | Notes |
170 ++ * +-----+------+------+-----------------------------|
171 ++ * | 24 | DIT | J | J is RES0 in ARMv8 |
172 ++ * | 21 | SS | DIT | SS doesn't exist in AArch32 |
173 ++ *
174 ++ * ... and all other bits are (currently) common.
175 ++ */
176 ++static inline unsigned long host_spsr_to_spsr32(unsigned long spsr)
177 ++{
178 ++ const unsigned long overlap = BIT(24) | BIT(21);
179 ++ unsigned long dit = !!(spsr & PSR_AA32_DIT_BIT);
180 ++
181 ++ spsr &= ~overlap;
182 ++
183 ++ spsr |= dit << 21;
184 ++
185 ++ return spsr;
186 ++}
187 ++
188 + static inline bool vcpu_mode_priv(const struct kvm_vcpu *vcpu)
189 + {
190 + u32 mode;
191 +@@ -261,6 +293,11 @@ static inline bool kvm_vcpu_dabt_issext(const struct kvm_vcpu *vcpu)
192 + return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SSE);
193 + }
194 +
195 ++static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu)
196 ++{
197 ++ return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SF);
198 ++}
199 ++
200 + static inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu)
201 + {
202 + return (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT;
203 +diff --git a/arch/arm64/include/asm/kvm_mmio.h b/arch/arm64/include/asm/kvm_mmio.h
204 +index 75ea42079757..0240290cf764 100644
205 +--- a/arch/arm64/include/asm/kvm_mmio.h
206 ++++ b/arch/arm64/include/asm/kvm_mmio.h
207 +@@ -21,13 +21,11 @@
208 + #include <linux/kvm_host.h>
209 + #include <asm/kvm_arm.h>
210 +
211 +-/*
212 +- * This is annoying. The mmio code requires this, even if we don't
213 +- * need any decoding. To be fixed.
214 +- */
215 + struct kvm_decode {
216 + unsigned long rt;
217 + bool sign_extend;
218 ++ /* Width of the register accessed by the faulting instruction is 64 bits */
219 ++ bool sixty_four;
220 + };
221 +
222 + void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
223 +diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
224 +index 6bc43889d11e..163970d5c4b1 100644
225 +--- a/arch/arm64/include/asm/ptrace.h
226 ++++ b/arch/arm64/include/asm/ptrace.h
227 +@@ -50,6 +50,7 @@
228 + #define PSR_AA32_I_BIT 0x00000080
229 + #define PSR_AA32_A_BIT 0x00000100
230 + #define PSR_AA32_E_BIT 0x00000200
231 ++#define PSR_AA32_PAN_BIT 0x00400000
232 + #define PSR_AA32_SSBS_BIT 0x00800000
233 + #define PSR_AA32_DIT_BIT 0x01000000
234 + #define PSR_AA32_Q_BIT 0x08000000
235 +diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
236 +index b0fd1d300154..978ff79fba2b 100644
237 +--- a/arch/arm64/include/uapi/asm/ptrace.h
238 ++++ b/arch/arm64/include/uapi/asm/ptrace.h
239 +@@ -49,6 +49,7 @@
240 + #define PSR_SSBS_BIT 0x00001000
241 + #define PSR_PAN_BIT 0x00400000
242 + #define PSR_UAO_BIT 0x00800000
243 ++#define PSR_DIT_BIT 0x01000000
244 + #define PSR_V_BIT 0x10000000
245 + #define PSR_C_BIT 0x20000000
246 + #define PSR_Z_BIT 0x40000000
247 +diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
248 +index a55e91dfcf8f..41c80c311367 100644
249 +--- a/arch/arm64/kvm/inject_fault.c
250 ++++ b/arch/arm64/kvm/inject_fault.c
251 +@@ -25,9 +25,6 @@
252 + #include <asm/kvm_emulate.h>
253 + #include <asm/esr.h>
254 +
255 +-#define PSTATE_FAULT_BITS_64 (PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | \
256 +- PSR_I_BIT | PSR_D_BIT)
257 +-
258 + #define CURRENT_EL_SP_EL0_VECTOR 0x0
259 + #define CURRENT_EL_SP_ELx_VECTOR 0x200
260 + #define LOWER_EL_AArch64_VECTOR 0x400
261 +@@ -61,6 +58,69 @@ static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type)
262 + return vcpu_read_sys_reg(vcpu, VBAR_EL1) + exc_offset + type;
263 + }
264 +
265 ++/*
266 ++ * When an exception is taken, most PSTATE fields are left unchanged in the
267 ++ * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all
268 ++ * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx
269 ++ * layouts, so we don't need to shuffle these for exceptions from AArch32 EL0.
270 ++ *
271 ++ * For the SPSR_ELx layout for AArch64, see ARM DDI 0487E.a page C5-429.
272 ++ * For the SPSR_ELx layout for AArch32, see ARM DDI 0487E.a page C5-426.
273 ++ *
274 ++ * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from
275 ++ * MSB to LSB.
276 ++ */
277 ++static unsigned long get_except64_pstate(struct kvm_vcpu *vcpu)
278 ++{
279 ++ unsigned long sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
280 ++ unsigned long old, new;
281 ++
282 ++ old = *vcpu_cpsr(vcpu);
283 ++ new = 0;
284 ++
285 ++ new |= (old & PSR_N_BIT);
286 ++ new |= (old & PSR_Z_BIT);
287 ++ new |= (old & PSR_C_BIT);
288 ++ new |= (old & PSR_V_BIT);
289 ++
290 ++ // TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
291 ++
292 ++ new |= (old & PSR_DIT_BIT);
293 ++
294 ++ // PSTATE.UAO is set to zero upon any exception to AArch64
295 ++ // See ARM DDI 0487E.a, page D5-2579.
296 ++
297 ++ // PSTATE.PAN is unchanged unless SCTLR_ELx.SPAN == 0b0
298 ++ // SCTLR_ELx.SPAN is RES1 when ARMv8.1-PAN is not implemented
299 ++ // See ARM DDI 0487E.a, page D5-2578.
300 ++ new |= (old & PSR_PAN_BIT);
301 ++ if (!(sctlr & SCTLR_EL1_SPAN))
302 ++ new |= PSR_PAN_BIT;
303 ++
304 ++ // PSTATE.SS is set to zero upon any exception to AArch64
305 ++ // See ARM DDI 0487E.a, page D2-2452.
306 ++
307 ++ // PSTATE.IL is set to zero upon any exception to AArch64
308 ++ // See ARM DDI 0487E.a, page D1-2306.
309 ++
310 ++ // PSTATE.SSBS is set to SCTLR_ELx.DSSBS upon any exception to AArch64
311 ++ // See ARM DDI 0487E.a, page D13-3258
312 ++ if (sctlr & SCTLR_ELx_DSSBS)
313 ++ new |= PSR_SSBS_BIT;
314 ++
315 ++ // PSTATE.BTYPE is set to zero upon any exception to AArch64
316 ++ // See ARM DDI 0487E.a, pages D1-2293 to D1-2294.
317 ++
318 ++ new |= PSR_D_BIT;
319 ++ new |= PSR_A_BIT;
320 ++ new |= PSR_I_BIT;
321 ++ new |= PSR_F_BIT;
322 ++
323 ++ new |= PSR_MODE_EL1h;
324 ++
325 ++ return new;
326 ++}
327 ++
328 + static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
329 + {
330 + unsigned long cpsr = *vcpu_cpsr(vcpu);
331 +@@ -70,7 +130,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
332 + vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
333 + *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
334 +
335 +- *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64;
336 ++ *vcpu_cpsr(vcpu) = get_except64_pstate(vcpu);
337 + vcpu_write_spsr(vcpu, cpsr);
338 +
339 + vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
340 +@@ -105,7 +165,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
341 + vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
342 + *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
343 +
344 +- *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64;
345 ++ *vcpu_cpsr(vcpu) = get_except64_pstate(vcpu);
346 + vcpu_write_spsr(vcpu, cpsr);
347 +
348 + /*
349 +diff --git a/arch/mips/Makefile.postlink b/arch/mips/Makefile.postlink
350 +index 4eea4188cb20..13e0beb9eee3 100644
351 +--- a/arch/mips/Makefile.postlink
352 ++++ b/arch/mips/Makefile.postlink
353 +@@ -12,7 +12,7 @@ __archpost:
354 + include scripts/Kbuild.include
355 +
356 + CMD_RELOCS = arch/mips/boot/tools/relocs
357 +-quiet_cmd_relocs = RELOCS $@
358 ++quiet_cmd_relocs = RELOCS $@
359 + cmd_relocs = $(CMD_RELOCS) $@
360 +
361 + # `@true` prevents complaint when there is nothing to be done
362 +diff --git a/arch/mips/boot/Makefile b/arch/mips/boot/Makefile
363 +index 35704c28a28b..0ccc20320099 100644
364 +--- a/arch/mips/boot/Makefile
365 ++++ b/arch/mips/boot/Makefile
366 +@@ -123,7 +123,7 @@ $(obj)/vmlinux.its.S: $(addprefix $(srctree)/arch/mips/$(PLATFORM)/,$(ITS_INPUTS
367 + targets += vmlinux.its
368 + targets += vmlinux.gz.its
369 + targets += vmlinux.bz2.its
370 +-targets += vmlinux.lzmo.its
371 ++targets += vmlinux.lzma.its
372 + targets += vmlinux.lzo.its
373 +
374 + quiet_cmd_cpp_its_S = ITS $@
375 +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
376 +index a80669209155..6f475dc5829b 100644
377 +--- a/arch/powerpc/Kconfig
378 ++++ b/arch/powerpc/Kconfig
379 +@@ -230,6 +230,7 @@ config PPC
380 + select NEED_SG_DMA_LENGTH
381 + select NO_BOOTMEM
382 + select OF
383 ++ select OF_DMA_DEFAULT_COHERENT if !NOT_COHERENT_CACHE
384 + select OF_EARLY_FLATTREE
385 + select OF_RESERVED_MEM
386 + select OLD_SIGACTION if PPC32
387 +diff --git a/arch/powerpc/boot/4xx.c b/arch/powerpc/boot/4xx.c
388 +index f7da65169124..3c8774163c7e 100644
389 +--- a/arch/powerpc/boot/4xx.c
390 ++++ b/arch/powerpc/boot/4xx.c
391 +@@ -232,7 +232,7 @@ void ibm4xx_denali_fixup_memsize(void)
392 + dpath = 8; /* 64 bits */
393 +
394 + /* get address pins (rows) */
395 +- val = SDRAM0_READ(DDR0_42);
396 ++ val = SDRAM0_READ(DDR0_42);
397 +
398 + row = DDR_GET_VAL(val, DDR_APIN, DDR_APIN_SHIFT);
399 + if (row > max_row)
400 +diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
401 +index dbfe32327212..5dc592fb4f5f 100644
402 +--- a/arch/powerpc/kvm/book3s_hv.c
403 ++++ b/arch/powerpc/kvm/book3s_hv.c
404 +@@ -2065,7 +2065,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
405 + mutex_unlock(&kvm->lock);
406 +
407 + if (!vcore)
408 +- goto free_vcpu;
409 ++ goto uninit_vcpu;
410 +
411 + spin_lock(&vcore->lock);
412 + ++vcore->num_threads;
413 +@@ -2082,6 +2082,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
414 +
415 + return vcpu;
416 +
417 ++uninit_vcpu:
418 ++ kvm_vcpu_uninit(vcpu);
419 + free_vcpu:
420 + kmem_cache_free(kvm_vcpu_cache, vcpu);
421 + out:
422 +diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
423 +index de9702219dee..7869112a8f3c 100644
424 +--- a/arch/powerpc/kvm/book3s_pr.c
425 ++++ b/arch/powerpc/kvm/book3s_pr.c
426 +@@ -1772,10 +1772,12 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm,
427 +
428 + err = kvmppc_mmu_init(vcpu);
429 + if (err < 0)
430 +- goto uninit_vcpu;
431 ++ goto free_shared_page;
432 +
433 + return vcpu;
434 +
435 ++free_shared_page:
436 ++ free_page((unsigned long)vcpu->arch.shared);
437 + uninit_vcpu:
438 + kvm_vcpu_uninit(vcpu);
439 + free_shadow_vcpu:
440 +diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
441 +index c2c6f32848e1..fc01a2c0f8ed 100644
442 +--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
443 ++++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
444 +@@ -366,8 +366,10 @@ static bool lmb_is_removable(struct drmem_lmb *lmb)
445 +
446 + for (i = 0; i < scns_per_block; i++) {
447 + pfn = PFN_DOWN(phys_addr);
448 +- if (!pfn_present(pfn))
449 ++ if (!pfn_present(pfn)) {
450 ++ phys_addr += MIN_MEMORY_BLOCK_SIZE;
451 + continue;
452 ++ }
453 +
454 + rc &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
455 + phys_addr += MIN_MEMORY_BLOCK_SIZE;
456 +diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
457 +index f0fa22e7d36c..3291e5fb94bc 100644
458 +--- a/arch/powerpc/xmon/xmon.c
459 ++++ b/arch/powerpc/xmon/xmon.c
460 +@@ -1878,15 +1878,14 @@ static void dump_300_sprs(void)
461 +
462 + printf("pidr = %.16lx tidr = %.16lx\n",
463 + mfspr(SPRN_PID), mfspr(SPRN_TIDR));
464 +- printf("asdr = %.16lx psscr = %.16lx\n",
465 +- mfspr(SPRN_ASDR), hv ? mfspr(SPRN_PSSCR)
466 +- : mfspr(SPRN_PSSCR_PR));
467 ++ printf("psscr = %.16lx\n",
468 ++ hv ? mfspr(SPRN_PSSCR) : mfspr(SPRN_PSSCR_PR));
469 +
470 + if (!hv)
471 + return;
472 +
473 +- printf("ptcr = %.16lx\n",
474 +- mfspr(SPRN_PTCR));
475 ++ printf("ptcr = %.16lx asdr = %.16lx\n",
476 ++ mfspr(SPRN_PTCR), mfspr(SPRN_ASDR));
477 + #endif
478 + }
479 +
480 +diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
481 +index 0d753291c43c..ac3c86b21d79 100644
482 +--- a/arch/s390/include/asm/page.h
483 ++++ b/arch/s390/include/asm/page.h
484 +@@ -33,6 +33,8 @@
485 + #define ARCH_HAS_PREPARE_HUGEPAGE
486 + #define ARCH_HAS_HUGEPAGE_CLEAR_FLUSH
487 +
488 ++#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
489 ++
490 + #include <asm/setup.h>
491 + #ifndef __ASSEMBLY__
492 +
493 +diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
494 +index db3196aebaa1..11c3cd906ab4 100644
495 +--- a/arch/s390/kvm/kvm-s390.c
496 ++++ b/arch/s390/kvm/kvm-s390.c
497 +@@ -2564,9 +2564,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
498 + vcpu->arch.sie_block->gcr[14] = CR14_UNUSED_32 |
499 + CR14_UNUSED_33 |
500 + CR14_EXTERNAL_DAMAGE_SUBMASK;
501 +- /* make sure the new fpc will be lazily loaded */
502 +- save_fpu_regs();
503 +- current->thread.fpu.fpc = 0;
504 ++ vcpu->run->s.regs.fpc = 0;
505 + vcpu->arch.sie_block->gbea = 1;
506 + vcpu->arch.sie_block->pp = 0;
507 + vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
508 +@@ -3994,7 +3992,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
509 + switch (ioctl) {
510 + case KVM_S390_STORE_STATUS:
511 + idx = srcu_read_lock(&vcpu->kvm->srcu);
512 +- r = kvm_s390_vcpu_store_status(vcpu, arg);
513 ++ r = kvm_s390_store_status_unloaded(vcpu, arg);
514 + srcu_read_unlock(&vcpu->kvm->srcu, idx);
515 + break;
516 + case KVM_S390_SET_INITIAL_PSW: {
517 +diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
518 +index b0246c705a19..5674710a4841 100644
519 +--- a/arch/s390/mm/hugetlbpage.c
520 ++++ b/arch/s390/mm/hugetlbpage.c
521 +@@ -2,7 +2,7 @@
522 + /*
523 + * IBM System z Huge TLB Page Support for Kernel.
524 + *
525 +- * Copyright IBM Corp. 2007,2016
526 ++ * Copyright IBM Corp. 2007,2020
527 + * Author(s): Gerald Schaefer <gerald.schaefer@××××××.com>
528 + */
529 +
530 +@@ -11,6 +11,9 @@
531 +
532 + #include <linux/mm.h>
533 + #include <linux/hugetlb.h>
534 ++#include <linux/mman.h>
535 ++#include <linux/sched/mm.h>
536 ++#include <linux/security.h>
537 +
538 + /*
539 + * If the bit selected by single-bit bitmask "a" is set within "x", move
540 +@@ -267,3 +270,98 @@ static __init int setup_hugepagesz(char *opt)
541 + return 1;
542 + }
543 + __setup("hugepagesz=", setup_hugepagesz);
544 ++
545 ++static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
546 ++ unsigned long addr, unsigned long len,
547 ++ unsigned long pgoff, unsigned long flags)
548 ++{
549 ++ struct hstate *h = hstate_file(file);
550 ++ struct vm_unmapped_area_info info;
551 ++
552 ++ info.flags = 0;
553 ++ info.length = len;
554 ++ info.low_limit = current->mm->mmap_base;
555 ++ info.high_limit = TASK_SIZE;
556 ++ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
557 ++ info.align_offset = 0;
558 ++ return vm_unmapped_area(&info);
559 ++}
560 ++
561 ++static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
562 ++ unsigned long addr0, unsigned long len,
563 ++ unsigned long pgoff, unsigned long flags)
564 ++{
565 ++ struct hstate *h = hstate_file(file);
566 ++ struct vm_unmapped_area_info info;
567 ++ unsigned long addr;
568 ++
569 ++ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
570 ++ info.length = len;
571 ++ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
572 ++ info.high_limit = current->mm->mmap_base;
573 ++ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
574 ++ info.align_offset = 0;
575 ++ addr = vm_unmapped_area(&info);
576 ++
577 ++ /*
578 ++ * A failed mmap() very likely causes application failure,
579 ++ * so fall back to the bottom-up function here. This scenario
580 ++ * can happen with large stack limits and large mmap()
581 ++ * allocations.
582 ++ */
583 ++ if (addr & ~PAGE_MASK) {
584 ++ VM_BUG_ON(addr != -ENOMEM);
585 ++ info.flags = 0;
586 ++ info.low_limit = TASK_UNMAPPED_BASE;
587 ++ info.high_limit = TASK_SIZE;
588 ++ addr = vm_unmapped_area(&info);
589 ++ }
590 ++
591 ++ return addr;
592 ++}
593 ++
594 ++unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
595 ++ unsigned long len, unsigned long pgoff, unsigned long flags)
596 ++{
597 ++ struct hstate *h = hstate_file(file);
598 ++ struct mm_struct *mm = current->mm;
599 ++ struct vm_area_struct *vma;
600 ++ int rc;
601 ++
602 ++ if (len & ~huge_page_mask(h))
603 ++ return -EINVAL;
604 ++ if (len > TASK_SIZE - mmap_min_addr)
605 ++ return -ENOMEM;
606 ++
607 ++ if (flags & MAP_FIXED) {
608 ++ if (prepare_hugepage_range(file, addr, len))
609 ++ return -EINVAL;
610 ++ goto check_asce_limit;
611 ++ }
612 ++
613 ++ if (addr) {
614 ++ addr = ALIGN(addr, huge_page_size(h));
615 ++ vma = find_vma(mm, addr);
616 ++ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
617 ++ (!vma || addr + len <= vm_start_gap(vma)))
618 ++ goto check_asce_limit;
619 ++ }
620 ++
621 ++ if (mm->get_unmapped_area == arch_get_unmapped_area)
622 ++ addr = hugetlb_get_unmapped_area_bottomup(file, addr, len,
623 ++ pgoff, flags);
624 ++ else
625 ++ addr = hugetlb_get_unmapped_area_topdown(file, addr, len,
626 ++ pgoff, flags);
627 ++ if (addr & ~PAGE_MASK)
628 ++ return addr;
629 ++
630 ++check_asce_limit:
631 ++ if (addr + len > current->mm->context.asce_limit &&
632 ++ addr + len <= TASK_SIZE) {
633 ++ rc = crst_table_upgrade(mm, addr + len);
634 ++ if (rc)
635 ++ return (unsigned long) rc;
636 ++ }
637 ++ return addr;
638 ++}
639 +diff --git a/arch/sparc/include/uapi/asm/ipcbuf.h b/arch/sparc/include/uapi/asm/ipcbuf.h
640 +index 9d0d125500e2..084b8949ddff 100644
641 +--- a/arch/sparc/include/uapi/asm/ipcbuf.h
642 ++++ b/arch/sparc/include/uapi/asm/ipcbuf.h
643 +@@ -15,19 +15,19 @@
644 +
645 + struct ipc64_perm
646 + {
647 +- __kernel_key_t key;
648 +- __kernel_uid_t uid;
649 +- __kernel_gid_t gid;
650 +- __kernel_uid_t cuid;
651 +- __kernel_gid_t cgid;
652 ++ __kernel_key_t key;
653 ++ __kernel_uid32_t uid;
654 ++ __kernel_gid32_t gid;
655 ++ __kernel_uid32_t cuid;
656 ++ __kernel_gid32_t cgid;
657 + #ifndef __arch64__
658 +- unsigned short __pad0;
659 ++ unsigned short __pad0;
660 + #endif
661 +- __kernel_mode_t mode;
662 +- unsigned short __pad1;
663 +- unsigned short seq;
664 +- unsigned long long __unused1;
665 +- unsigned long long __unused2;
666 ++ __kernel_mode_t mode;
667 ++ unsigned short __pad1;
668 ++ unsigned short seq;
669 ++ unsigned long long __unused1;
670 ++ unsigned long long __unused2;
671 + };
672 +
673 + #endif /* __SPARC_IPCBUF_H */
674 +diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
675 +index 050368db9d35..3c1e51ead072 100644
676 +--- a/arch/x86/include/asm/apic.h
677 ++++ b/arch/x86/include/asm/apic.h
678 +@@ -448,6 +448,14 @@ static inline void ack_APIC_irq(void)
679 + apic_eoi();
680 + }
681 +
682 ++
683 ++static inline bool lapic_vector_set_in_irr(unsigned int vector)
684 ++{
685 ++ u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
686 ++
687 ++ return !!(irr & (1U << (vector % 32)));
688 ++}
689 ++
690 + static inline unsigned default_get_apic_id(unsigned long x)
691 + {
692 + unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
693 +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
694 +index 155be8adb934..21a58fcc3dd4 100644
695 +--- a/arch/x86/include/asm/kvm_host.h
696 ++++ b/arch/x86/include/asm/kvm_host.h
697 +@@ -350,12 +350,12 @@ struct kvm_mmu {
698 + void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
699 + unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
700 + u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
701 +- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
702 ++ int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err,
703 + bool prefault);
704 + void (*inject_page_fault)(struct kvm_vcpu *vcpu,
705 + struct x86_exception *fault);
706 +- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
707 +- struct x86_exception *exception);
708 ++ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa,
709 ++ u32 access, struct x86_exception *exception);
710 + gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
711 + struct x86_exception *exception);
712 + int (*sync_page)(struct kvm_vcpu *vcpu,
713 +@@ -1354,7 +1354,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
714 +
715 + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
716 +
717 +-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
718 ++int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
719 + void *insn, int insn_len);
720 + void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
721 + void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
722 +diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
723 +index 72a94401f9e0..1f5df339e48f 100644
724 +--- a/arch/x86/kernel/apic/msi.c
725 ++++ b/arch/x86/kernel/apic/msi.c
726 +@@ -26,10 +26,8 @@
727 +
728 + static struct irq_domain *msi_default_domain;
729 +
730 +-static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
731 ++static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg)
732 + {
733 +- struct irq_cfg *cfg = irqd_cfg(data);
734 +-
735 + msg->address_hi = MSI_ADDR_BASE_HI;
736 +
737 + if (x2apic_enabled())
738 +@@ -50,6 +48,127 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
739 + MSI_DATA_VECTOR(cfg->vector);
740 + }
741 +
742 ++static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
743 ++{
744 ++ __irq_msi_compose_msg(irqd_cfg(data), msg);
745 ++}
746 ++
747 ++static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
748 ++{
749 ++ struct msi_msg msg[2] = { [1] = { }, };
750 ++
751 ++ __irq_msi_compose_msg(cfg, msg);
752 ++ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
753 ++}
754 ++
755 ++static int
756 ++msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
757 ++{
758 ++ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
759 ++ struct irq_data *parent = irqd->parent_data;
760 ++ unsigned int cpu;
761 ++ int ret;
762 ++
763 ++ /* Save the current configuration */
764 ++ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
765 ++ old_cfg = *cfg;
766 ++
767 ++ /* Allocate a new target vector */
768 ++ ret = parent->chip->irq_set_affinity(parent, mask, force);
769 ++ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
770 ++ return ret;
771 ++
772 ++ /*
773 ++ * For non-maskable and non-remapped MSI interrupts the migration
774 ++ * to a different destination CPU and a different vector has to be
775 ++ * done carefully to handle the possible stray interrupt which can be
776 ++ * caused by the non-atomic update of the address/data pair.
777 ++ *
778 ++ * Direct update is possible when:
779 ++ * - The MSI is maskable (remapped MSI does not use this code path).
780 ++ * The quirk bit is not set in this case.
781 ++ * - The new vector is the same as the old vector
782 ++ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
783 ++ * - The new destination CPU is the same as the old destination CPU
784 ++ */
785 ++ if (!irqd_msi_nomask_quirk(irqd) ||
786 ++ cfg->vector == old_cfg.vector ||
787 ++ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
788 ++ cfg->dest_apicid == old_cfg.dest_apicid) {
789 ++ irq_msi_update_msg(irqd, cfg);
790 ++ return ret;
791 ++ }
792 ++
793 ++ /*
794 ++ * Paranoia: Validate that the interrupt target is the local
795 ++ * CPU.
796 ++ */
797 ++ if (WARN_ON_ONCE(cpu != smp_processor_id())) {
798 ++ irq_msi_update_msg(irqd, cfg);
799 ++ return ret;
800 ++ }
801 ++
802 ++ /*
803 ++ * Redirect the interrupt to the new vector on the current CPU
804 ++ * first. This might cause a spurious interrupt on this vector if
805 ++ * the device raises an interrupt right between this update and the
806 ++ * update to the final destination CPU.
807 ++ *
808 ++ * If the vector is in use then the installed device handler will
809 ++ * denote it as spurious which is no harm as this is a rare event
810 ++ * and interrupt handlers have to cope with spurious interrupts
811 ++ * anyway. If the vector is unused, then it is marked so it won't
812 ++ * trigger the 'No irq handler for vector' warning in do_IRQ().
813 ++ *
814 ++ * This requires to hold vector lock to prevent concurrent updates to
815 ++ * the affected vector.
816 ++ */
817 ++ lock_vector_lock();
818 ++
819 ++ /*
820 ++ * Mark the new target vector on the local CPU if it is currently
821 ++ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in
822 ++ * the CPU hotplug path for a similar purpose. This cannot be
823 ++ * undone here as the current CPU has interrupts disabled and
824 ++ * cannot handle the interrupt before the whole set_affinity()
825 ++ * section is done. In the CPU unplug case, the current CPU is
826 ++ * about to vanish and will not handle any interrupts anymore. The
827 ++ * vector is cleaned up when the CPU comes online again.
828 ++ */
829 ++ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
830 ++ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
831 ++
832 ++ /* Redirect it to the new vector on the local CPU temporarily */
833 ++ old_cfg.vector = cfg->vector;
834 ++ irq_msi_update_msg(irqd, &old_cfg);
835 ++
836 ++ /* Now transition it to the target CPU */
837 ++ irq_msi_update_msg(irqd, cfg);
838 ++
839 ++ /*
840 ++ * All interrupts after this point are now targeted at the new
841 ++ * vector/CPU.
842 ++ *
843 ++ * Drop vector lock before testing whether the temporary assignment
844 ++ * to the local CPU was hit by an interrupt raised in the device,
845 ++ * because the retrigger function acquires vector lock again.
846 ++ */
847 ++ unlock_vector_lock();
848 ++
849 ++ /*
850 ++ * Check whether the transition raced with a device interrupt and
851 ++ * is pending in the local APICs IRR. It is safe to do this outside
852 ++ * of vector lock as the irq_desc::lock of this interrupt is still
853 ++ * held and interrupts are disabled: The check is not accessing the
854 ++ * underlying vector store. It's just checking the local APIC's
855 ++ * IRR.
856 ++ */
857 ++ if (lapic_vector_set_in_irr(cfg->vector))
858 ++ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
859 ++
860 ++ return ret;
861 ++}
862 ++
863 + /*
864 + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
865 + * which implement the MSI or MSI-X Capability Structure.
866 +@@ -61,6 +180,7 @@ static struct irq_chip pci_msi_controller = {
867 + .irq_ack = irq_chip_ack_parent,
868 + .irq_retrigger = irq_chip_retrigger_hierarchy,
869 + .irq_compose_msi_msg = irq_msi_compose_msg,
870 ++ .irq_set_affinity = msi_set_affinity,
871 + .flags = IRQCHIP_SKIP_SET_WAKE,
872 + };
873 +
874 +@@ -149,6 +269,8 @@ void __init arch_init_msi_domain(struct irq_domain *parent)
875 + }
876 + if (!msi_default_domain)
877 + pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
878 ++ else
879 ++ msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
880 + }
881 +
882 + #ifdef CONFIG_IRQ_REMAP
883 +diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
884 +index 3e20d322bc98..032509adf9de 100644
885 +--- a/arch/x86/kernel/cpu/tsx.c
886 ++++ b/arch/x86/kernel/cpu/tsx.c
887 +@@ -115,11 +115,12 @@ void __init tsx_init(void)
888 + tsx_disable();
889 +
890 + /*
891 +- * tsx_disable() will change the state of the
892 +- * RTM CPUID bit. Clear it here since it is now
893 +- * expected to be not set.
894 ++ * tsx_disable() will change the state of the RTM and HLE CPUID
895 ++ * bits. Clear them here since they are now expected to be not
896 ++ * set.
897 + */
898 + setup_clear_cpu_cap(X86_FEATURE_RTM);
899 ++ setup_clear_cpu_cap(X86_FEATURE_HLE);
900 + } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
901 +
902 + /*
903 +@@ -131,10 +132,10 @@ void __init tsx_init(void)
904 + tsx_enable();
905 +
906 + /*
907 +- * tsx_enable() will change the state of the
908 +- * RTM CPUID bit. Force it here since it is now
909 +- * expected to be set.
910 ++ * tsx_enable() will change the state of the RTM and HLE CPUID
911 ++ * bits. Force them here since they are now expected to be set.
912 + */
913 + setup_force_cpu_cap(X86_FEATURE_RTM);
914 ++ setup_force_cpu_cap(X86_FEATURE_HLE);
915 + }
916 + }
917 +diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
918 +index e699f4d2a450..c91431bc476e 100644
919 +--- a/arch/x86/kvm/emulate.c
920 ++++ b/arch/x86/kvm/emulate.c
921 +@@ -5164,16 +5164,28 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
922 + ctxt->ad_bytes = def_ad_bytes ^ 6;
923 + break;
924 + case 0x26: /* ES override */
925 ++ has_seg_override = true;
926 ++ ctxt->seg_override = VCPU_SREG_ES;
927 ++ break;
928 + case 0x2e: /* CS override */
929 ++ has_seg_override = true;
930 ++ ctxt->seg_override = VCPU_SREG_CS;
931 ++ break;
932 + case 0x36: /* SS override */
933 ++ has_seg_override = true;
934 ++ ctxt->seg_override = VCPU_SREG_SS;
935 ++ break;
936 + case 0x3e: /* DS override */
937 + has_seg_override = true;
938 +- ctxt->seg_override = (ctxt->b >> 3) & 3;
939 ++ ctxt->seg_override = VCPU_SREG_DS;
940 + break;
941 + case 0x64: /* FS override */
942 ++ has_seg_override = true;
943 ++ ctxt->seg_override = VCPU_SREG_FS;
944 ++ break;
945 + case 0x65: /* GS override */
946 + has_seg_override = true;
947 +- ctxt->seg_override = ctxt->b & 7;
948 ++ ctxt->seg_override = VCPU_SREG_GS;
949 + break;
950 + case 0x40 ... 0x4f: /* REX */
951 + if (mode != X86EMUL_MODE_PROT64)
952 +@@ -5257,10 +5269,15 @@ done_prefixes:
953 + }
954 + break;
955 + case Escape:
956 +- if (ctxt->modrm > 0xbf)
957 +- opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
958 +- else
959 ++ if (ctxt->modrm > 0xbf) {
960 ++ size_t size = ARRAY_SIZE(opcode.u.esc->high);
961 ++ u32 index = array_index_nospec(
962 ++ ctxt->modrm - 0xc0, size);
963 ++
964 ++ opcode = opcode.u.esc->high[index];
965 ++ } else {
966 + opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
967 ++ }
968 + break;
969 + case InstrDual:
970 + if ((ctxt->modrm >> 6) == 3)
971 +diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
972 +index 5842c5f587fe..3fd6c4b2c2b7 100644
973 +--- a/arch/x86/kvm/hyperv.c
974 ++++ b/arch/x86/kvm/hyperv.c
975 +@@ -792,11 +792,12 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
976 + u32 index, u64 *pdata)
977 + {
978 + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
979 ++ size_t size = ARRAY_SIZE(hv->hv_crash_param);
980 +
981 +- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
982 ++ if (WARN_ON_ONCE(index >= size))
983 + return -EINVAL;
984 +
985 +- *pdata = hv->hv_crash_param[index];
986 ++ *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
987 + return 0;
988 + }
989 +
990 +@@ -835,11 +836,12 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
991 + u32 index, u64 data)
992 + {
993 + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
994 ++ size_t size = ARRAY_SIZE(hv->hv_crash_param);
995 +
996 +- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
997 ++ if (WARN_ON_ONCE(index >= size))
998 + return -EINVAL;
999 +
1000 +- hv->hv_crash_param[index] = data;
1001 ++ hv->hv_crash_param[array_index_nospec(index, size)] = data;
1002 + return 0;
1003 + }
1004 +
1005 +diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
1006 +index bdcd4139eca9..38a36a1cc87f 100644
1007 +--- a/arch/x86/kvm/i8259.c
1008 ++++ b/arch/x86/kvm/i8259.c
1009 +@@ -460,10 +460,14 @@ static int picdev_write(struct kvm_pic *s,
1010 + switch (addr) {
1011 + case 0x20:
1012 + case 0x21:
1013 ++ pic_lock(s);
1014 ++ pic_ioport_write(&s->pics[0], addr, data);
1015 ++ pic_unlock(s);
1016 ++ break;
1017 + case 0xa0:
1018 + case 0xa1:
1019 + pic_lock(s);
1020 +- pic_ioport_write(&s->pics[addr >> 7], addr, data);
1021 ++ pic_ioport_write(&s->pics[1], addr, data);
1022 + pic_unlock(s);
1023 + break;
1024 + case 0x4d0:
1025 +diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
1026 +index 4e822ad363f3..bac2ec9b4443 100644
1027 +--- a/arch/x86/kvm/ioapic.c
1028 ++++ b/arch/x86/kvm/ioapic.c
1029 +@@ -36,6 +36,7 @@
1030 + #include <linux/io.h>
1031 + #include <linux/slab.h>
1032 + #include <linux/export.h>
1033 ++#include <linux/nospec.h>
1034 + #include <asm/processor.h>
1035 + #include <asm/page.h>
1036 + #include <asm/current.h>
1037 +@@ -73,13 +74,14 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
1038 + default:
1039 + {
1040 + u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
1041 +- u64 redir_content;
1042 ++ u64 redir_content = ~0ULL;
1043 +
1044 +- if (redir_index < IOAPIC_NUM_PINS)
1045 +- redir_content =
1046 +- ioapic->redirtbl[redir_index].bits;
1047 +- else
1048 +- redir_content = ~0ULL;
1049 ++ if (redir_index < IOAPIC_NUM_PINS) {
1050 ++ u32 index = array_index_nospec(
1051 ++ redir_index, IOAPIC_NUM_PINS);
1052 ++
1053 ++ redir_content = ioapic->redirtbl[index].bits;
1054 ++ }
1055 +
1056 + result = (ioapic->ioregsel & 0x1) ?
1057 + (redir_content >> 32) & 0xffffffff :
1058 +@@ -297,6 +299,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
1059 + ioapic_debug("change redir index %x val %x\n", index, val);
1060 + if (index >= IOAPIC_NUM_PINS)
1061 + return;
1062 ++ index = array_index_nospec(index, IOAPIC_NUM_PINS);
1063 + e = &ioapic->redirtbl[index];
1064 + mask_before = e->fields.mask;
1065 + /* Preserve read-only fields */
1066 +diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
1067 +index 262e49301cae..05905961ecca 100644
1068 +--- a/arch/x86/kvm/lapic.c
1069 ++++ b/arch/x86/kvm/lapic.c
1070 +@@ -1862,15 +1862,20 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
1071 + case APIC_LVTTHMR:
1072 + case APIC_LVTPC:
1073 + case APIC_LVT1:
1074 +- case APIC_LVTERR:
1075 ++ case APIC_LVTERR: {
1076 + /* TODO: Check vector */
1077 ++ size_t size;
1078 ++ u32 index;
1079 ++
1080 + if (!kvm_apic_sw_enabled(apic))
1081 + val |= APIC_LVT_MASKED;
1082 +-
1083 +- val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
1084 ++ size = ARRAY_SIZE(apic_lvt_mask);
1085 ++ index = array_index_nospec(
1086 ++ (reg - APIC_LVTT) >> 4, size);
1087 ++ val &= apic_lvt_mask[index];
1088 + kvm_lapic_set_reg(apic, reg, val);
1089 +-
1090 + break;
1091 ++ }
1092 +
1093 + case APIC_LVTT:
1094 + if (!kvm_apic_sw_enabled(apic))
1095 +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
1096 +index eddf91a0e363..62f1e4663bc3 100644
1097 +--- a/arch/x86/kvm/mmu.c
1098 ++++ b/arch/x86/kvm/mmu.c
1099 +@@ -1184,12 +1184,12 @@ static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
1100 + return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
1101 + }
1102 +
1103 +-static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
1104 ++static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn)
1105 + {
1106 + unsigned long page_size;
1107 + int i, ret = 0;
1108 +
1109 +- page_size = kvm_host_page_size(kvm, gfn);
1110 ++ page_size = kvm_host_page_size(vcpu, gfn);
1111 +
1112 + for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1113 + if (page_size >= KVM_HPAGE_SIZE(i))
1114 +@@ -1239,7 +1239,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
1115 + if (unlikely(*force_pt_level))
1116 + return PT_PAGE_TABLE_LEVEL;
1117 +
1118 +- host_level = host_mapping_level(vcpu->kvm, large_gfn);
1119 ++ host_level = host_mapping_level(vcpu, large_gfn);
1120 +
1121 + if (host_level == PT_PAGE_TABLE_LEVEL)
1122 + return host_level;
1123 +@@ -3390,7 +3390,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
1124 + * - true: let the vcpu to access on the same address again.
1125 + * - false: let the real page fault path to fix it.
1126 + */
1127 +-static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
1128 ++static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level,
1129 + u32 error_code)
1130 + {
1131 + struct kvm_shadow_walk_iterator iterator;
1132 +@@ -3410,7 +3410,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
1133 + do {
1134 + u64 new_spte;
1135 +
1136 +- for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
1137 ++ for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
1138 + if (!is_shadow_present_pte(spte) ||
1139 + iterator.level < level)
1140 + break;
1141 +@@ -3488,7 +3488,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
1142 +
1143 + } while (true);
1144 +
1145 +- trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
1146 ++ trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
1147 + spte, fault_handled);
1148 + walk_shadow_page_lockless_end(vcpu);
1149 +
1150 +@@ -3496,10 +3496,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
1151 + }
1152 +
1153 + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
1154 +- gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
1155 ++ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
1156 ++ bool *writable);
1157 + static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
1158 +
1159 +-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
1160 ++static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
1161 + gfn_t gfn, bool prefault)
1162 + {
1163 + int r;
1164 +@@ -3525,16 +3526,16 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
1165 + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
1166 + }
1167 +
1168 +- if (fast_page_fault(vcpu, v, level, error_code))
1169 ++ if (fast_page_fault(vcpu, gpa, level, error_code))
1170 + return RET_PF_RETRY;
1171 +
1172 + mmu_seq = vcpu->kvm->mmu_notifier_seq;
1173 + smp_rmb();
1174 +
1175 +- if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
1176 ++ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
1177 + return RET_PF_RETRY;
1178 +
1179 +- if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
1180 ++ if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r))
1181 + return r;
1182 +
1183 + r = RET_PF_RETRY;
1184 +@@ -3545,7 +3546,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
1185 + goto out_unlock;
1186 + if (likely(!force_pt_level))
1187 + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
1188 +- r = __direct_map(vcpu, v, write, map_writable, level, pfn,
1189 ++ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
1190 + prefault, false);
1191 + out_unlock:
1192 + spin_unlock(&vcpu->kvm->mmu_lock);
1193 +@@ -3838,7 +3839,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1194 + }
1195 + EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
1196 +
1197 +-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
1198 ++static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
1199 + u32 access, struct x86_exception *exception)
1200 + {
1201 + if (exception)
1202 +@@ -3846,7 +3847,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
1203 + return vaddr;
1204 + }
1205 +
1206 +-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
1207 ++static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
1208 + u32 access,
1209 + struct x86_exception *exception)
1210 + {
1211 +@@ -4006,13 +4007,14 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
1212 + walk_shadow_page_lockless_end(vcpu);
1213 + }
1214 +
1215 +-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1216 ++static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
1217 + u32 error_code, bool prefault)
1218 + {
1219 +- gfn_t gfn = gva >> PAGE_SHIFT;
1220 ++ gfn_t gfn = gpa >> PAGE_SHIFT;
1221 + int r;
1222 +
1223 +- pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
1224 ++ /* Note, paging is disabled, ergo gva == gpa. */
1225 ++ pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
1226 +
1227 + if (page_fault_handle_page_track(vcpu, error_code, gfn))
1228 + return RET_PF_EMULATE;
1229 +@@ -4024,11 +4026,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1230 + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1231 +
1232 +
1233 +- return nonpaging_map(vcpu, gva & PAGE_MASK,
1234 ++ return nonpaging_map(vcpu, gpa & PAGE_MASK,
1235 + error_code, gfn, prefault);
1236 + }
1237 +
1238 +-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
1239 ++static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
1240 ++ gfn_t gfn)
1241 + {
1242 + struct kvm_arch_async_pf arch;
1243 +
1244 +@@ -4037,7 +4040,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
1245 + arch.direct_map = vcpu->arch.mmu.direct_map;
1246 + arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
1247 +
1248 +- return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
1249 ++ return kvm_setup_async_pf(vcpu, cr2_or_gpa,
1250 ++ kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
1251 + }
1252 +
1253 + bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
1254 +@@ -4054,7 +4058,8 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
1255 + }
1256 +
1257 + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
1258 +- gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
1259 ++ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
1260 ++ bool *writable)
1261 + {
1262 + struct kvm_memory_slot *slot;
1263 + bool async;
1264 +@@ -4074,12 +4079,12 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
1265 + return false; /* *pfn has correct page already */
1266 +
1267 + if (!prefault && kvm_can_do_async_pf(vcpu)) {
1268 +- trace_kvm_try_async_get_page(gva, gfn);
1269 ++ trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
1270 + if (kvm_find_async_pf_gfn(vcpu, gfn)) {
1271 +- trace_kvm_async_pf_doublefault(gva, gfn);
1272 ++ trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
1273 + kvm_make_request(KVM_REQ_APF_HALT, vcpu);
1274 + return true;
1275 +- } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
1276 ++ } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
1277 + return true;
1278 + }
1279 +
1280 +@@ -4092,6 +4097,12 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
1281 + {
1282 + int r = 1;
1283 +
1284 ++#ifndef CONFIG_X86_64
1285 ++ /* A 64-bit CR2 should be impossible on 32-bit KVM. */
1286 ++ if (WARN_ON_ONCE(fault_address >> 32))
1287 ++ return -EFAULT;
1288 ++#endif
1289 ++
1290 + vcpu->arch.l1tf_flush_l1d = true;
1291 + switch (vcpu->arch.apf.host_apf_reason) {
1292 + default:
1293 +@@ -4129,7 +4140,7 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
1294 + return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
1295 + }
1296 +
1297 +-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
1298 ++static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
1299 + bool prefault)
1300 + {
1301 + kvm_pfn_t pfn;
1302 +@@ -5307,7 +5318,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
1303 + return 0;
1304 + }
1305 +
1306 +-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
1307 ++int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
1308 + void *insn, int insn_len)
1309 + {
1310 + int r, emulation_type = 0;
1311 +@@ -5317,19 +5328,20 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
1312 + /* With shadow page tables, fault_address contains a GVA or nGPA. */
1313 + if (vcpu->arch.mmu.direct_map) {
1314 + vcpu->arch.gpa_available = true;
1315 +- vcpu->arch.gpa_val = cr2;
1316 ++ vcpu->arch.gpa_val = cr2_or_gpa;
1317 + }
1318 +
1319 + r = RET_PF_INVALID;
1320 + if (unlikely(error_code & PFERR_RSVD_MASK)) {
1321 +- r = handle_mmio_page_fault(vcpu, cr2, direct);
1322 ++ r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
1323 + if (r == RET_PF_EMULATE)
1324 + goto emulate;
1325 + }
1326 +
1327 + if (r == RET_PF_INVALID) {
1328 +- r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
1329 +- false);
1330 ++ r = vcpu->arch.mmu.page_fault(vcpu, cr2_or_gpa,
1331 ++ lower_32_bits(error_code),
1332 ++ false);
1333 + WARN_ON(r == RET_PF_INVALID);
1334 + }
1335 +
1336 +@@ -5347,7 +5359,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
1337 + */
1338 + if (vcpu->arch.mmu.direct_map &&
1339 + (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
1340 +- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
1341 ++ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
1342 + return 1;
1343 + }
1344 +
1345 +@@ -5362,7 +5374,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
1346 + * explicitly shadowing L1's page tables, i.e. unprotecting something
1347 + * for L1 isn't going to magically fix whatever issue cause L2 to fail.
1348 + */
1349 +- if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
1350 ++ if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
1351 + emulation_type = EMULTYPE_ALLOW_RETRY;
1352 + emulate:
1353 + /*
1354 +@@ -5375,7 +5387,7 @@ emulate:
1355 + if (unlikely(insn && !insn_len))
1356 + return 1;
1357 +
1358 +- er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
1359 ++ er = x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len);
1360 +
1361 + switch (er) {
1362 + case EMULATE_DONE:
1363 +diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
1364 +index 918b0d5bf272..cb41b036eb26 100644
1365 +--- a/arch/x86/kvm/mmutrace.h
1366 ++++ b/arch/x86/kvm/mmutrace.h
1367 +@@ -249,13 +249,13 @@ TRACE_EVENT(
1368 +
1369 + TRACE_EVENT(
1370 + fast_page_fault,
1371 +- TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
1372 ++ TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
1373 + u64 *sptep, u64 old_spte, bool retry),
1374 +- TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
1375 ++ TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry),
1376 +
1377 + TP_STRUCT__entry(
1378 + __field(int, vcpu_id)
1379 +- __field(gva_t, gva)
1380 ++ __field(gpa_t, cr2_or_gpa)
1381 + __field(u32, error_code)
1382 + __field(u64 *, sptep)
1383 + __field(u64, old_spte)
1384 +@@ -265,7 +265,7 @@ TRACE_EVENT(
1385 +
1386 + TP_fast_assign(
1387 + __entry->vcpu_id = vcpu->vcpu_id;
1388 +- __entry->gva = gva;
1389 ++ __entry->cr2_or_gpa = cr2_or_gpa;
1390 + __entry->error_code = error_code;
1391 + __entry->sptep = sptep;
1392 + __entry->old_spte = old_spte;
1393 +@@ -273,9 +273,9 @@ TRACE_EVENT(
1394 + __entry->retry = retry;
1395 + ),
1396 +
1397 +- TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
1398 ++ TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
1399 + " new %llx spurious %d fixed %d", __entry->vcpu_id,
1400 +- __entry->gva, __print_flags(__entry->error_code, "|",
1401 ++ __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
1402 + kvm_mmu_trace_pferr_flags), __entry->sptep,
1403 + __entry->old_spte, __entry->new_spte,
1404 + __spte_satisfied(old_spte), __spte_satisfied(new_spte)
1405 +diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
1406 +index 9f72cc427158..fabce87697e5 100644
1407 +--- a/arch/x86/kvm/mtrr.c
1408 ++++ b/arch/x86/kvm/mtrr.c
1409 +@@ -194,11 +194,15 @@ static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
1410 + break;
1411 + case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
1412 + *seg = 1;
1413 +- *unit = msr - MSR_MTRRfix16K_80000;
1414 ++ *unit = array_index_nospec(
1415 ++ msr - MSR_MTRRfix16K_80000,
1416 ++ MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
1417 + break;
1418 + case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
1419 + *seg = 2;
1420 +- *unit = msr - MSR_MTRRfix4K_C0000;
1421 ++ *unit = array_index_nospec(
1422 ++ msr - MSR_MTRRfix4K_C0000,
1423 ++ MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
1424 + break;
1425 + default:
1426 + return false;
1427 +diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
1428 +index adf42dc8d38b..100ae4fabf17 100644
1429 +--- a/arch/x86/kvm/paging_tmpl.h
1430 ++++ b/arch/x86/kvm/paging_tmpl.h
1431 +@@ -273,11 +273,11 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
1432 + }
1433 +
1434 + /*
1435 +- * Fetch a guest pte for a guest virtual address
1436 ++ * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
1437 + */
1438 + static int FNAME(walk_addr_generic)(struct guest_walker *walker,
1439 + struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
1440 +- gva_t addr, u32 access)
1441 ++ gpa_t addr, u32 access)
1442 + {
1443 + int ret;
1444 + pt_element_t pte;
1445 +@@ -478,7 +478,7 @@ error:
1446 + }
1447 +
1448 + static int FNAME(walk_addr)(struct guest_walker *walker,
1449 +- struct kvm_vcpu *vcpu, gva_t addr, u32 access)
1450 ++ struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
1451 + {
1452 + return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
1453 + access);
1454 +@@ -593,7 +593,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
1455 + * If the guest tries to write a write-protected page, we need to
1456 + * emulate this operation, return 1 to indicate this case.
1457 + */
1458 +-static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
1459 ++static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
1460 + struct guest_walker *gw,
1461 + int write_fault, int hlevel,
1462 + kvm_pfn_t pfn, bool map_writable, bool prefault,
1463 +@@ -747,7 +747,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
1464 + * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
1465 + * a negative value on error.
1466 + */
1467 +-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
1468 ++static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
1469 + bool prefault)
1470 + {
1471 + int write_fault = error_code & PFERR_WRITE_MASK;
1472 +@@ -926,18 +926,19 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
1473 + spin_unlock(&vcpu->kvm->mmu_lock);
1474 + }
1475 +
1476 +-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
1477 ++/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
1478 ++static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
1479 + struct x86_exception *exception)
1480 + {
1481 + struct guest_walker walker;
1482 + gpa_t gpa = UNMAPPED_GVA;
1483 + int r;
1484 +
1485 +- r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
1486 ++ r = FNAME(walk_addr)(&walker, vcpu, addr, access);
1487 +
1488 + if (r) {
1489 + gpa = gfn_to_gpa(walker.gfn);
1490 +- gpa |= vaddr & ~PAGE_MASK;
1491 ++ gpa |= addr & ~PAGE_MASK;
1492 + } else if (exception)
1493 + *exception = walker.fault;
1494 +
1495 +@@ -945,7 +946,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
1496 + }
1497 +
1498 + #if PTTYPE != PTTYPE_EPT
1499 +-static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
1500 ++/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
1501 ++static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
1502 + u32 access,
1503 + struct x86_exception *exception)
1504 + {
1505 +@@ -953,6 +955,11 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
1506 + gpa_t gpa = UNMAPPED_GVA;
1507 + int r;
1508 +
1509 ++#ifndef CONFIG_X86_64
1510 ++ /* A 64-bit GVA should be impossible on 32-bit KVM. */
1511 ++ WARN_ON_ONCE(vaddr >> 32);
1512 ++#endif
1513 ++
1514 + r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
1515 +
1516 + if (r) {
1517 +diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
1518 +index 22dff661145a..7b4828e50ab3 100644
1519 +--- a/arch/x86/kvm/pmu.h
1520 ++++ b/arch/x86/kvm/pmu.h
1521 +@@ -2,6 +2,8 @@
1522 + #ifndef __KVM_X86_PMU_H
1523 + #define __KVM_X86_PMU_H
1524 +
1525 ++#include <linux/nospec.h>
1526 ++
1527 + #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
1528 + #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
1529 + #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
1530 +@@ -86,8 +88,12 @@ static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
1531 + static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
1532 + u32 base)
1533 + {
1534 +- if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
1535 +- return &pmu->gp_counters[msr - base];
1536 ++ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
1537 ++ u32 index = array_index_nospec(msr - base,
1538 ++ pmu->nr_arch_gp_counters);
1539 ++
1540 ++ return &pmu->gp_counters[index];
1541 ++ }
1542 +
1543 + return NULL;
1544 + }
1545 +@@ -97,8 +103,12 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
1546 + {
1547 + int base = MSR_CORE_PERF_FIXED_CTR0;
1548 +
1549 +- if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
1550 +- return &pmu->fixed_counters[msr - base];
1551 ++ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
1552 ++ u32 index = array_index_nospec(msr - base,
1553 ++ pmu->nr_arch_fixed_counters);
1554 ++
1555 ++ return &pmu->fixed_counters[index];
1556 ++ }
1557 +
1558 + return NULL;
1559 + }
1560 +diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
1561 +index c3f103e2b08e..2ab8c20c8bfa 100644
1562 +--- a/arch/x86/kvm/pmu_intel.c
1563 ++++ b/arch/x86/kvm/pmu_intel.c
1564 +@@ -87,10 +87,14 @@ static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
1565 +
1566 + static unsigned intel_find_fixed_event(int idx)
1567 + {
1568 +- if (idx >= ARRAY_SIZE(fixed_pmc_events))
1569 ++ u32 event;
1570 ++ size_t size = ARRAY_SIZE(fixed_pmc_events);
1571 ++
1572 ++ if (idx >= size)
1573 + return PERF_COUNT_HW_MAX;
1574 +
1575 +- return intel_arch_events[fixed_pmc_events[idx]].event_type;
1576 ++ event = fixed_pmc_events[array_index_nospec(idx, size)];
1577 ++ return intel_arch_events[event].event_type;
1578 + }
1579 +
1580 + /* check if a PMC is enabled by comparing it with global_ctrl bits. */
1581 +@@ -131,16 +135,20 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
1582 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
1583 + bool fixed = idx & (1u << 30);
1584 + struct kvm_pmc *counters;
1585 ++ unsigned int num_counters;
1586 +
1587 + idx &= ~(3u << 30);
1588 +- if (!fixed && idx >= pmu->nr_arch_gp_counters)
1589 +- return NULL;
1590 +- if (fixed && idx >= pmu->nr_arch_fixed_counters)
1591 ++ if (fixed) {
1592 ++ counters = pmu->fixed_counters;
1593 ++ num_counters = pmu->nr_arch_fixed_counters;
1594 ++ } else {
1595 ++ counters = pmu->gp_counters;
1596 ++ num_counters = pmu->nr_arch_gp_counters;
1597 ++ }
1598 ++ if (idx >= num_counters)
1599 + return NULL;
1600 +- counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
1601 + *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
1602 +-
1603 +- return &counters[idx];
1604 ++ return &counters[array_index_nospec(idx, num_counters)];
1605 + }
1606 +
1607 + static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
1608 +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
1609 +index fa2abed1a14d..2660c01eadae 100644
1610 +--- a/arch/x86/kvm/vmx.c
1611 ++++ b/arch/x86/kvm/vmx.c
1612 +@@ -8793,8 +8793,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
1613 + /* _system ok, nested_vmx_check_permission has verified cpl=0 */
1614 + if (kvm_write_guest_virt_system(vcpu, gva, &field_value,
1615 + (is_long_mode(vcpu) ? 8 : 4),
1616 +- &e))
1617 ++ &e)) {
1618 + kvm_inject_page_fault(vcpu, &e);
1619 ++ return 1;
1620 ++ }
1621 + }
1622 +
1623 + nested_vmx_succeed(vcpu);
1624 +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
1625 +new file mode 100644
1626 +index 000000000000..3791ce8d269e
1627 +--- /dev/null
1628 ++++ b/arch/x86/kvm/vmx/vmx.c
1629 +@@ -0,0 +1,8033 @@
1630 ++// SPDX-License-Identifier: GPL-2.0-only
1631 ++/*
1632 ++ * Kernel-based Virtual Machine driver for Linux
1633 ++ *
1634 ++ * This module enables machines with Intel VT-x extensions to run virtual
1635 ++ * machines without emulation or binary translation.
1636 ++ *
1637 ++ * Copyright (C) 2006 Qumranet, Inc.
1638 ++ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
1639 ++ *
1640 ++ * Authors:
1641 ++ * Avi Kivity <avi@××××××××.com>
1642 ++ * Yaniv Kamay <yaniv@××××××××.com>
1643 ++ */
1644 ++
1645 ++#include <linux/frame.h>
1646 ++#include <linux/highmem.h>
1647 ++#include <linux/hrtimer.h>
1648 ++#include <linux/kernel.h>
1649 ++#include <linux/kvm_host.h>
1650 ++#include <linux/module.h>
1651 ++#include <linux/moduleparam.h>
1652 ++#include <linux/mod_devicetable.h>
1653 ++#include <linux/mm.h>
1654 ++#include <linux/sched.h>
1655 ++#include <linux/sched/smt.h>
1656 ++#include <linux/slab.h>
1657 ++#include <linux/tboot.h>
1658 ++#include <linux/trace_events.h>
1659 ++
1660 ++#include <asm/apic.h>
1661 ++#include <asm/asm.h>
1662 ++#include <asm/cpu.h>
1663 ++#include <asm/debugreg.h>
1664 ++#include <asm/desc.h>
1665 ++#include <asm/fpu/internal.h>
1666 ++#include <asm/io.h>
1667 ++#include <asm/irq_remapping.h>
1668 ++#include <asm/kexec.h>
1669 ++#include <asm/perf_event.h>
1670 ++#include <asm/mce.h>
1671 ++#include <asm/mmu_context.h>
1672 ++#include <asm/mshyperv.h>
1673 ++#include <asm/spec-ctrl.h>
1674 ++#include <asm/virtext.h>
1675 ++#include <asm/vmx.h>
1676 ++
1677 ++#include "capabilities.h"
1678 ++#include "cpuid.h"
1679 ++#include "evmcs.h"
1680 ++#include "irq.h"
1681 ++#include "kvm_cache_regs.h"
1682 ++#include "lapic.h"
1683 ++#include "mmu.h"
1684 ++#include "nested.h"
1685 ++#include "ops.h"
1686 ++#include "pmu.h"
1687 ++#include "trace.h"
1688 ++#include "vmcs.h"
1689 ++#include "vmcs12.h"
1690 ++#include "vmx.h"
1691 ++#include "x86.h"
1692 ++
1693 ++MODULE_AUTHOR("Qumranet");
1694 ++MODULE_LICENSE("GPL");
1695 ++
1696 ++static const struct x86_cpu_id vmx_cpu_id[] = {
1697 ++ X86_FEATURE_MATCH(X86_FEATURE_VMX),
1698 ++ {}
1699 ++};
1700 ++MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
1701 ++
1702 ++bool __read_mostly enable_vpid = 1;
1703 ++module_param_named(vpid, enable_vpid, bool, 0444);
1704 ++
1705 ++static bool __read_mostly enable_vnmi = 1;
1706 ++module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
1707 ++
1708 ++bool __read_mostly flexpriority_enabled = 1;
1709 ++module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
1710 ++
1711 ++bool __read_mostly enable_ept = 1;
1712 ++module_param_named(ept, enable_ept, bool, S_IRUGO);
1713 ++
1714 ++bool __read_mostly enable_unrestricted_guest = 1;
1715 ++module_param_named(unrestricted_guest,
1716 ++ enable_unrestricted_guest, bool, S_IRUGO);
1717 ++
1718 ++bool __read_mostly enable_ept_ad_bits = 1;
1719 ++module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
1720 ++
1721 ++static bool __read_mostly emulate_invalid_guest_state = true;
1722 ++module_param(emulate_invalid_guest_state, bool, S_IRUGO);
1723 ++
1724 ++static bool __read_mostly fasteoi = 1;
1725 ++module_param(fasteoi, bool, S_IRUGO);
1726 ++
1727 ++static bool __read_mostly enable_apicv = 1;
1728 ++module_param(enable_apicv, bool, S_IRUGO);
1729 ++
1730 ++/*
1731 ++ * If nested=1, nested virtualization is supported, i.e., guests may use
1732 ++ * VMX and be a hypervisor for its own guests. If nested=0, guests may not
1733 ++ * use VMX instructions.
1734 ++ */
1735 ++static bool __read_mostly nested = 1;
1736 ++module_param(nested, bool, S_IRUGO);
1737 ++
1738 ++bool __read_mostly enable_pml = 1;
1739 ++module_param_named(pml, enable_pml, bool, S_IRUGO);
1740 ++
1741 ++static bool __read_mostly dump_invalid_vmcs = 0;
1742 ++module_param(dump_invalid_vmcs, bool, 0644);
1743 ++
1744 ++#define MSR_BITMAP_MODE_X2APIC 1
1745 ++#define MSR_BITMAP_MODE_X2APIC_APICV 2
1746 ++
1747 ++#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
1748 ++
1749 ++/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
1750 ++static int __read_mostly cpu_preemption_timer_multi;
1751 ++static bool __read_mostly enable_preemption_timer = 1;
1752 ++#ifdef CONFIG_X86_64
1753 ++module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
1754 ++#endif
1755 ++
1756 ++#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
1757 ++#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
1758 ++#define KVM_VM_CR0_ALWAYS_ON \
1759 ++ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
1760 ++ X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
1761 ++#define KVM_CR4_GUEST_OWNED_BITS \
1762 ++ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
1763 ++ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
1764 ++
1765 ++#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
1766 ++#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
1767 ++#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
1768 ++
1769 ++#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
1770 ++
1771 ++#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
1772 ++ RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
1773 ++ RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
1774 ++ RTIT_STATUS_BYTECNT))
1775 ++
1776 ++#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
1777 ++ (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
1778 ++
1779 ++/*
1780 ++ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
1781 ++ * ple_gap: upper bound on the amount of time between two successive
1782 ++ * executions of PAUSE in a loop. Also indicates if PLE is enabled.
1783 ++ * According to test, this time is usually smaller than 128 cycles.
1784 ++ * ple_window: upper bound on the amount of time a guest is allowed to execute
1785 ++ * in a PAUSE loop. Tests indicate that most spinlocks are held for
1786 ++ * less than 2^12 cycles
1787 ++ * Time is measured based on a counter that runs at the same rate as the TSC,
1788 ++ * refer SDM volume 3b section 21.6.13 & 22.1.3.
1789 ++ */
1790 ++static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
1791 ++module_param(ple_gap, uint, 0444);
1792 ++
1793 ++static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
1794 ++module_param(ple_window, uint, 0444);
1795 ++
1796 ++/* Default doubles per-vcpu window every exit. */
1797 ++static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
1798 ++module_param(ple_window_grow, uint, 0444);
1799 ++
1800 ++/* Default resets per-vcpu window every exit to ple_window. */
1801 ++static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
1802 ++module_param(ple_window_shrink, uint, 0444);
1803 ++
1804 ++/* Default is to compute the maximum so we can never overflow. */
1805 ++static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
1806 ++module_param(ple_window_max, uint, 0444);
1807 ++
1808 ++/* Default is SYSTEM mode, 1 for host-guest mode */
1809 ++int __read_mostly pt_mode = PT_MODE_SYSTEM;
1810 ++module_param(pt_mode, int, S_IRUGO);
1811 ++
1812 ++static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
1813 ++static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
1814 ++static DEFINE_MUTEX(vmx_l1d_flush_mutex);
1815 ++
1816 ++/* Storage for pre module init parameter parsing */
1817 ++static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
1818 ++
1819 ++static const struct {
1820 ++ const char *option;
1821 ++ bool for_parse;
1822 ++} vmentry_l1d_param[] = {
1823 ++ [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
1824 ++ [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
1825 ++ [VMENTER_L1D_FLUSH_COND] = {"cond", true},
1826 ++ [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
1827 ++ [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
1828 ++ [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
1829 ++};
1830 ++
1831 ++#define L1D_CACHE_ORDER 4
1832 ++static void *vmx_l1d_flush_pages;
1833 ++
1834 ++static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
1835 ++{
1836 ++ struct page *page;
1837 ++ unsigned int i;
1838 ++
1839 ++ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
1840 ++ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
1841 ++ return 0;
1842 ++ }
1843 ++
1844 ++ if (!enable_ept) {
1845 ++ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
1846 ++ return 0;
1847 ++ }
1848 ++
1849 ++ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
1850 ++ u64 msr;
1851 ++
1852 ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
1853 ++ if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
1854 ++ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
1855 ++ return 0;
1856 ++ }
1857 ++ }
1858 ++
1859 ++ /* If set to auto use the default l1tf mitigation method */
1860 ++ if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
1861 ++ switch (l1tf_mitigation) {
1862 ++ case L1TF_MITIGATION_OFF:
1863 ++ l1tf = VMENTER_L1D_FLUSH_NEVER;
1864 ++ break;
1865 ++ case L1TF_MITIGATION_FLUSH_NOWARN:
1866 ++ case L1TF_MITIGATION_FLUSH:
1867 ++ case L1TF_MITIGATION_FLUSH_NOSMT:
1868 ++ l1tf = VMENTER_L1D_FLUSH_COND;
1869 ++ break;
1870 ++ case L1TF_MITIGATION_FULL:
1871 ++ case L1TF_MITIGATION_FULL_FORCE:
1872 ++ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
1873 ++ break;
1874 ++ }
1875 ++ } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
1876 ++ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
1877 ++ }
1878 ++
1879 ++ if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
1880 ++ !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
1881 ++ /*
1882 ++ * This allocation for vmx_l1d_flush_pages is not tied to a VM
1883 ++ * lifetime and so should not be charged to a memcg.
1884 ++ */
1885 ++ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
1886 ++ if (!page)
1887 ++ return -ENOMEM;
1888 ++ vmx_l1d_flush_pages = page_address(page);
1889 ++
1890 ++ /*
1891 ++ * Initialize each page with a different pattern in
1892 ++ * order to protect against KSM in the nested
1893 ++ * virtualization case.
1894 ++ */
1895 ++ for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
1896 ++ memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
1897 ++ PAGE_SIZE);
1898 ++ }
1899 ++ }
1900 ++
1901 ++ l1tf_vmx_mitigation = l1tf;
1902 ++
1903 ++ if (l1tf != VMENTER_L1D_FLUSH_NEVER)
1904 ++ static_branch_enable(&vmx_l1d_should_flush);
1905 ++ else
1906 ++ static_branch_disable(&vmx_l1d_should_flush);
1907 ++
1908 ++ if (l1tf == VMENTER_L1D_FLUSH_COND)
1909 ++ static_branch_enable(&vmx_l1d_flush_cond);
1910 ++ else
1911 ++ static_branch_disable(&vmx_l1d_flush_cond);
1912 ++ return 0;
1913 ++}
1914 ++
1915 ++static int vmentry_l1d_flush_parse(const char *s)
1916 ++{
1917 ++ unsigned int i;
1918 ++
1919 ++ if (s) {
1920 ++ for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
1921 ++ if (vmentry_l1d_param[i].for_parse &&
1922 ++ sysfs_streq(s, vmentry_l1d_param[i].option))
1923 ++ return i;
1924 ++ }
1925 ++ }
1926 ++ return -EINVAL;
1927 ++}
1928 ++
1929 ++static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
1930 ++{
1931 ++ int l1tf, ret;
1932 ++
1933 ++ l1tf = vmentry_l1d_flush_parse(s);
1934 ++ if (l1tf < 0)
1935 ++ return l1tf;
1936 ++
1937 ++ if (!boot_cpu_has(X86_BUG_L1TF))
1938 ++ return 0;
1939 ++
1940 ++ /*
1941 ++ * Has vmx_init() run already? If not then this is the pre init
1942 ++ * parameter parsing. In that case just store the value and let
1943 ++ * vmx_init() do the proper setup after enable_ept has been
1944 ++ * established.
1945 ++ */
1946 ++ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
1947 ++ vmentry_l1d_flush_param = l1tf;
1948 ++ return 0;
1949 ++ }
1950 ++
1951 ++ mutex_lock(&vmx_l1d_flush_mutex);
1952 ++ ret = vmx_setup_l1d_flush(l1tf);
1953 ++ mutex_unlock(&vmx_l1d_flush_mutex);
1954 ++ return ret;
1955 ++}
1956 ++
1957 ++static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
1958 ++{
1959 ++ if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
1960 ++ return sprintf(s, "???\n");
1961 ++
1962 ++ return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
1963 ++}
1964 ++
1965 ++static const struct kernel_param_ops vmentry_l1d_flush_ops = {
1966 ++ .set = vmentry_l1d_flush_set,
1967 ++ .get = vmentry_l1d_flush_get,
1968 ++};
1969 ++module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
1970 ++
1971 ++static bool guest_state_valid(struct kvm_vcpu *vcpu);
1972 ++static u32 vmx_segment_access_rights(struct kvm_segment *var);
1973 ++static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
1974 ++ u32 msr, int type);
1975 ++
1976 ++void vmx_vmexit(void);
1977 ++
1978 ++#define vmx_insn_failed(fmt...) \
1979 ++do { \
1980 ++ WARN_ONCE(1, fmt); \
1981 ++ pr_warn_ratelimited(fmt); \
1982 ++} while (0)
1983 ++
1984 ++asmlinkage void vmread_error(unsigned long field, bool fault)
1985 ++{
1986 ++ if (fault)
1987 ++ kvm_spurious_fault();
1988 ++ else
1989 ++ vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
1990 ++}
1991 ++
1992 ++noinline void vmwrite_error(unsigned long field, unsigned long value)
1993 ++{
1994 ++ vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
1995 ++ field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1996 ++}
1997 ++
1998 ++noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
1999 ++{
2000 ++ vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
2001 ++}
2002 ++
2003 ++noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
2004 ++{
2005 ++ vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
2006 ++}
2007 ++
2008 ++noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
2009 ++{
2010 ++ vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
2011 ++ ext, vpid, gva);
2012 ++}
2013 ++
2014 ++noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
2015 ++{
2016 ++ vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
2017 ++ ext, eptp, gpa);
2018 ++}
2019 ++
2020 ++static DEFINE_PER_CPU(struct vmcs *, vmxarea);
2021 ++DEFINE_PER_CPU(struct vmcs *, current_vmcs);
2022 ++/*
2023 ++ * We maintain a per-CPU linked-list of VMCSs loaded on that CPU. This is needed
2024 ++ * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
2025 ++ */
2026 ++static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
2027 ++
2028 ++/*
2029 ++ * We maintain a per-CPU linked-list of vCPUs, so in wakeup_handler() we
2030 ++ * can find which vCPU should be woken up.
2031 ++ */
2032 ++static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
2033 ++static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
2034 ++
2035 ++static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
2036 ++static DEFINE_SPINLOCK(vmx_vpid_lock);
2037 ++
2038 ++struct vmcs_config vmcs_config;
2039 ++struct vmx_capability vmx_capability;
2040 ++
2041 ++#define VMX_SEGMENT_FIELD(seg) \
2042 ++ [VCPU_SREG_##seg] = { \
2043 ++ .selector = GUEST_##seg##_SELECTOR, \
2044 ++ .base = GUEST_##seg##_BASE, \
2045 ++ .limit = GUEST_##seg##_LIMIT, \
2046 ++ .ar_bytes = GUEST_##seg##_AR_BYTES, \
2047 ++ }
2048 ++
2049 ++static const struct kvm_vmx_segment_field {
2050 ++ unsigned selector;
2051 ++ unsigned base;
2052 ++ unsigned limit;
2053 ++ unsigned ar_bytes;
2054 ++} kvm_vmx_segment_fields[] = {
2055 ++ VMX_SEGMENT_FIELD(CS),
2056 ++ VMX_SEGMENT_FIELD(DS),
2057 ++ VMX_SEGMENT_FIELD(ES),
2058 ++ VMX_SEGMENT_FIELD(FS),
2059 ++ VMX_SEGMENT_FIELD(GS),
2060 ++ VMX_SEGMENT_FIELD(SS),
2061 ++ VMX_SEGMENT_FIELD(TR),
2062 ++ VMX_SEGMENT_FIELD(LDTR),
2063 ++};
2064 ++
2065 ++u64 host_efer;
2066 ++static unsigned long host_idt_base;
2067 ++
2068 ++/*
2069 ++ * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
2070 ++ * will emulate SYSCALL in legacy mode if the vendor string in guest
2071 ++ * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
2072 ++ * support this emulation, IA32_STAR must always be included in
2073 ++ * vmx_msr_index[], even in i386 builds.
2074 ++ */
2075 ++const u32 vmx_msr_index[] = {
2076 ++#ifdef CONFIG_X86_64
2077 ++ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
2078 ++#endif
2079 ++ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
2080 ++ MSR_IA32_TSX_CTRL,
2081 ++};
2082 ++
2083 ++#if IS_ENABLED(CONFIG_HYPERV)
2084 ++static bool __read_mostly enlightened_vmcs = true;
2085 ++module_param(enlightened_vmcs, bool, 0444);
2086 ++
2087 ++/* check_ept_pointer() should be under protection of ept_pointer_lock. */
2088 ++static void check_ept_pointer_match(struct kvm *kvm)
2089 ++{
2090 ++ struct kvm_vcpu *vcpu;
2091 ++ u64 tmp_eptp = INVALID_PAGE;
2092 ++ int i;
2093 ++
2094 ++ kvm_for_each_vcpu(i, vcpu, kvm) {
2095 ++ if (!VALID_PAGE(tmp_eptp)) {
2096 ++ tmp_eptp = to_vmx(vcpu)->ept_pointer;
2097 ++ } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
2098 ++ to_kvm_vmx(kvm)->ept_pointers_match
2099 ++ = EPT_POINTERS_MISMATCH;
2100 ++ return;
2101 ++ }
2102 ++ }
2103 ++
2104 ++ to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
2105 ++}
2106 ++
2107 ++static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
2108 ++ void *data)
2109 ++{
2110 ++ struct kvm_tlb_range *range = data;
2111 ++
2112 ++ return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
2113 ++ range->pages);
2114 ++}
2115 ++
2116 ++static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
2117 ++ struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
2118 ++{
2119 ++ u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
2120 ++
2121 ++ /*
2122 ++ * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
2123 ++ * of the base of EPT PML4 table, strip off EPT configuration
2124 ++ * information.
2125 ++ */
2126 ++ if (range)
2127 ++ return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
2128 ++ kvm_fill_hv_flush_list_func, (void *)range);
2129 ++ else
2130 ++ return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
2131 ++}
2132 ++
2133 ++static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
2134 ++ struct kvm_tlb_range *range)
2135 ++{
2136 ++ struct kvm_vcpu *vcpu;
2137 ++ int ret = 0, i;
2138 ++
2139 ++ spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
2140 ++
2141 ++ if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
2142 ++ check_ept_pointer_match(kvm);
2143 ++
2144 ++ if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
2145 ++ kvm_for_each_vcpu(i, vcpu, kvm) {
2146 ++ /* If ept_pointer is invalid pointer, bypass flush request. */
2147 ++ if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
2148 ++ ret |= __hv_remote_flush_tlb_with_range(
2149 ++ kvm, vcpu, range);
2150 ++ }
2151 ++ } else {
2152 ++ ret = __hv_remote_flush_tlb_with_range(kvm,
2153 ++ kvm_get_vcpu(kvm, 0), range);
2154 ++ }
2155 ++
2156 ++ spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
2157 ++ return ret;
2158 ++}
2159 ++static int hv_remote_flush_tlb(struct kvm *kvm)
2160 ++{
2161 ++ return hv_remote_flush_tlb_with_range(kvm, NULL);
2162 ++}
2163 ++
2164 ++static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
2165 ++{
2166 ++ struct hv_enlightened_vmcs *evmcs;
2167 ++ struct hv_partition_assist_pg **p_hv_pa_pg =
2168 ++ &vcpu->kvm->arch.hyperv.hv_pa_pg;
2169 ++ /*
2170 ++ * Synthetic VM-Exit is not enabled in current code and so all
2171 ++ * evmcs in a single VM share the same assist page.
2172 ++ */
2173 ++ if (!*p_hv_pa_pg)
2174 ++ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
2175 ++
2176 ++ if (!*p_hv_pa_pg)
2177 ++ return -ENOMEM;
2178 ++
2179 ++ evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
2180 ++
2181 ++ evmcs->partition_assist_page =
2182 ++ __pa(*p_hv_pa_pg);
2183 ++ evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
2184 ++ evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
2185 ++
2186 ++ return 0;
2187 ++}
2188 ++
2189 ++#endif /* IS_ENABLED(CONFIG_HYPERV) */
2190 ++
2191 ++/*
2192 ++ * Comment's format: document - errata name - stepping - processor name.
2193 ++ * Refer from
2194 ++ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
2195 ++ */
2196 ++static u32 vmx_preemption_cpu_tfms[] = {
2197 ++/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
2198 ++0x000206E6,
2199 ++/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
2200 ++/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
2201 ++/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
2202 ++0x00020652,
2203 ++/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
2204 ++0x00020655,
2205 ++/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
2206 ++/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
2207 ++/*
2208 ++ * 320767.pdf - AAP86 - B1 -
2209 ++ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
2210 ++ */
2211 ++0x000106E5,
2212 ++/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
2213 ++0x000106A0,
2214 ++/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
2215 ++0x000106A1,
2216 ++/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
2217 ++0x000106A4,
2218 ++ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
2219 ++ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
2220 ++ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
2221 ++0x000106A5,
2222 ++ /* Xeon E3-1220 V2 */
2223 ++0x000306A8,
2224 ++};
2225 ++
2226 ++static inline bool cpu_has_broken_vmx_preemption_timer(void)
2227 ++{
2228 ++ u32 eax = cpuid_eax(0x00000001), i;
2229 ++
2230 ++ /* Clear the reserved bits */
2231 ++ eax &= ~(0x3U << 14 | 0xfU << 28);
2232 ++ for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
2233 ++ if (eax == vmx_preemption_cpu_tfms[i])
2234 ++ return true;
2235 ++
2236 ++ return false;
2237 ++}
2238 ++
2239 ++static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
2240 ++{
2241 ++ return flexpriority_enabled && lapic_in_kernel(vcpu);
2242 ++}
2243 ++
2244 ++static inline bool report_flexpriority(void)
2245 ++{
2246 ++ return flexpriority_enabled;
2247 ++}
2248 ++
2249 ++static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
2250 ++{
2251 ++ int i;
2252 ++
2253 ++ for (i = 0; i < vmx->nmsrs; ++i)
2254 ++ if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
2255 ++ return i;
2256 ++ return -1;
2257 ++}
2258 ++
2259 ++struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
2260 ++{
2261 ++ int i;
2262 ++
2263 ++ i = __find_msr_index(vmx, msr);
2264 ++ if (i >= 0)
2265 ++ return &vmx->guest_msrs[i];
2266 ++ return NULL;
2267 ++}
2268 ++
2269 ++static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
2270 ++{
2271 ++ int ret = 0;
2272 ++
2273 ++ u64 old_msr_data = msr->data;
2274 ++ msr->data = data;
2275 ++ if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
2276 ++ preempt_disable();
2277 ++ ret = kvm_set_shared_msr(msr->index, msr->data,
2278 ++ msr->mask);
2279 ++ preempt_enable();
2280 ++ if (ret)
2281 ++ msr->data = old_msr_data;
2282 ++ }
2283 ++ return ret;
2284 ++}
2285 ++
2286 ++void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
2287 ++{
2288 ++ vmcs_clear(loaded_vmcs->vmcs);
2289 ++ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
2290 ++ vmcs_clear(loaded_vmcs->shadow_vmcs);
2291 ++ loaded_vmcs->cpu = -1;
2292 ++ loaded_vmcs->launched = 0;
2293 ++}
2294 ++
2295 ++#ifdef CONFIG_KEXEC_CORE
2296 ++/*
2297 ++ * This bitmap is used to indicate whether the vmclear
2298 ++ * operation is enabled on all cpus. All disabled by
2299 ++ * default.
2300 ++ */
2301 ++static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
2302 ++
2303 ++static inline void crash_enable_local_vmclear(int cpu)
2304 ++{
2305 ++ cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
2306 ++}
2307 ++
2308 ++static inline void crash_disable_local_vmclear(int cpu)
2309 ++{
2310 ++ cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
2311 ++}
2312 ++
2313 ++static inline int crash_local_vmclear_enabled(int cpu)
2314 ++{
2315 ++ return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
2316 ++}
2317 ++
2318 ++static void crash_vmclear_local_loaded_vmcss(void)
2319 ++{
2320 ++ int cpu = raw_smp_processor_id();
2321 ++ struct loaded_vmcs *v;
2322 ++
2323 ++ if (!crash_local_vmclear_enabled(cpu))
2324 ++ return;
2325 ++
2326 ++ list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
2327 ++ loaded_vmcss_on_cpu_link)
2328 ++ vmcs_clear(v->vmcs);
2329 ++}
2330 ++#else
2331 ++static inline void crash_enable_local_vmclear(int cpu) { }
2332 ++static inline void crash_disable_local_vmclear(int cpu) { }
2333 ++#endif /* CONFIG_KEXEC_CORE */
2334 ++
2335 ++static void __loaded_vmcs_clear(void *arg)
2336 ++{
2337 ++ struct loaded_vmcs *loaded_vmcs = arg;
2338 ++ int cpu = raw_smp_processor_id();
2339 ++
2340 ++ if (loaded_vmcs->cpu != cpu)
2341 ++ return; /* vcpu migration can race with cpu offline */
2342 ++ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
2343 ++ per_cpu(current_vmcs, cpu) = NULL;
2344 ++ crash_disable_local_vmclear(cpu);
2345 ++ list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
2346 ++
2347 ++ /*
2348 ++ * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
2349 ++ * is before setting loaded_vmcs->vcpu to -1 which is done in
2350 ++ * loaded_vmcs_init. Otherwise, another cpu can see vcpu = -1 first
2351 ++ * then adds the vmcs into percpu list before it is deleted.
2352 ++ */
2353 ++ smp_wmb();
2354 ++
2355 ++ loaded_vmcs_init(loaded_vmcs);
2356 ++ crash_enable_local_vmclear(cpu);
2357 ++}
2358 ++
2359 ++void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
2360 ++{
2361 ++ int cpu = loaded_vmcs->cpu;
2362 ++
2363 ++ if (cpu != -1)
2364 ++ smp_call_function_single(cpu,
2365 ++ __loaded_vmcs_clear, loaded_vmcs, 1);
2366 ++}
2367 ++
2368 ++static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
2369 ++ unsigned field)
2370 ++{
2371 ++ bool ret;
2372 ++ u32 mask = 1 << (seg * SEG_FIELD_NR + field);
2373 ++
2374 ++ if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
2375 ++ kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
2376 ++ vmx->segment_cache.bitmask = 0;
2377 ++ }
2378 ++ ret = vmx->segment_cache.bitmask & mask;
2379 ++ vmx->segment_cache.bitmask |= mask;
2380 ++ return ret;
2381 ++}
2382 ++
2383 ++static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
2384 ++{
2385 ++ u16 *p = &vmx->segment_cache.seg[seg].selector;
2386 ++
2387 ++ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
2388 ++ *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
2389 ++ return *p;
2390 ++}
2391 ++
2392 ++static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
2393 ++{
2394 ++ ulong *p = &vmx->segment_cache.seg[seg].base;
2395 ++
2396 ++ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
2397 ++ *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
2398 ++ return *p;
2399 ++}
2400 ++
2401 ++static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
2402 ++{
2403 ++ u32 *p = &vmx->segment_cache.seg[seg].limit;
2404 ++
2405 ++ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
2406 ++ *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
2407 ++ return *p;
2408 ++}
2409 ++
2410 ++static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
2411 ++{
2412 ++ u32 *p = &vmx->segment_cache.seg[seg].ar;
2413 ++
2414 ++ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
2415 ++ *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
2416 ++ return *p;
2417 ++}
2418 ++
2419 ++void update_exception_bitmap(struct kvm_vcpu *vcpu)
2420 ++{
2421 ++ u32 eb;
2422 ++
2423 ++ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
2424 ++ (1u << DB_VECTOR) | (1u << AC_VECTOR);
2425 ++ /*
2426 ++ * Guest access to VMware backdoor ports could legitimately
2427 ++ * trigger #GP because of TSS I/O permission bitmap.
2428 ++ * We intercept those #GP and allow access to them anyway
2429 ++ * as VMware does.
2430 ++ */
2431 ++ if (enable_vmware_backdoor)
2432 ++ eb |= (1u << GP_VECTOR);
2433 ++ if ((vcpu->guest_debug &
2434 ++ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
2435 ++ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
2436 ++ eb |= 1u << BP_VECTOR;
2437 ++ if (to_vmx(vcpu)->rmode.vm86_active)
2438 ++ eb = ~0;
2439 ++ if (enable_ept)
2440 ++ eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
2441 ++
2442 ++ /* When we are running a nested L2 guest and L1 specified for it a
2443 ++ * certain exception bitmap, we must trap the same exceptions and pass
2444 ++ * them to L1. When running L2, we will only handle the exceptions
2445 ++ * specified above if L1 did not want them.
2446 ++ */
2447 ++ if (is_guest_mode(vcpu))
2448 ++ eb |= get_vmcs12(vcpu)->exception_bitmap;
2449 ++
2450 ++ vmcs_write32(EXCEPTION_BITMAP, eb);
2451 ++}
2452 ++
2453 ++/*
2454 ++ * Check if MSR is intercepted for currently loaded MSR bitmap.
2455 ++ */
2456 ++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
2457 ++{
2458 ++ unsigned long *msr_bitmap;
2459 ++ int f = sizeof(unsigned long);
2460 ++
2461 ++ if (!cpu_has_vmx_msr_bitmap())
2462 ++ return true;
2463 ++
2464 ++ msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
2465 ++
2466 ++ if (msr <= 0x1fff) {
2467 ++ return !!test_bit(msr, msr_bitmap + 0x800 / f);
2468 ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2469 ++ msr &= 0x1fff;
2470 ++ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2471 ++ }
2472 ++
2473 ++ return true;
2474 ++}
2475 ++
2476 ++static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2477 ++ unsigned long entry, unsigned long exit)
2478 ++{
2479 ++ vm_entry_controls_clearbit(vmx, entry);
2480 ++ vm_exit_controls_clearbit(vmx, exit);
2481 ++}
2482 ++
2483 ++int vmx_find_msr_index(struct vmx_msrs *m, u32 msr)
2484 ++{
2485 ++ unsigned int i;
2486 ++
2487 ++ for (i = 0; i < m->nr; ++i) {
2488 ++ if (m->val[i].index == msr)
2489 ++ return i;
2490 ++ }
2491 ++ return -ENOENT;
2492 ++}
2493 ++
2494 ++static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
2495 ++{
2496 ++ int i;
2497 ++ struct msr_autoload *m = &vmx->msr_autoload;
2498 ++
2499 ++ switch (msr) {
2500 ++ case MSR_EFER:
2501 ++ if (cpu_has_load_ia32_efer()) {
2502 ++ clear_atomic_switch_msr_special(vmx,
2503 ++ VM_ENTRY_LOAD_IA32_EFER,
2504 ++ VM_EXIT_LOAD_IA32_EFER);
2505 ++ return;
2506 ++ }
2507 ++ break;
2508 ++ case MSR_CORE_PERF_GLOBAL_CTRL:
2509 ++ if (cpu_has_load_perf_global_ctrl()) {
2510 ++ clear_atomic_switch_msr_special(vmx,
2511 ++ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2512 ++ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
2513 ++ return;
2514 ++ }
2515 ++ break;
2516 ++ }
2517 ++ i = vmx_find_msr_index(&m->guest, msr);
2518 ++ if (i < 0)
2519 ++ goto skip_guest;
2520 ++ --m->guest.nr;
2521 ++ m->guest.val[i] = m->guest.val[m->guest.nr];
2522 ++ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
2523 ++
2524 ++skip_guest:
2525 ++ i = vmx_find_msr_index(&m->host, msr);
2526 ++ if (i < 0)
2527 ++ return;
2528 ++
2529 ++ --m->host.nr;
2530 ++ m->host.val[i] = m->host.val[m->host.nr];
2531 ++ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
2532 ++}
2533 ++
2534 ++static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2535 ++ unsigned long entry, unsigned long exit,
2536 ++ unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
2537 ++ u64 guest_val, u64 host_val)
2538 ++{
2539 ++ vmcs_write64(guest_val_vmcs, guest_val);
2540 ++ if (host_val_vmcs != HOST_IA32_EFER)
2541 ++ vmcs_write64(host_val_vmcs, host_val);
2542 ++ vm_entry_controls_setbit(vmx, entry);
2543 ++ vm_exit_controls_setbit(vmx, exit);
2544 ++}
2545 ++
2546 ++static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
2547 ++ u64 guest_val, u64 host_val, bool entry_only)
2548 ++{
2549 ++ int i, j = 0;
2550 ++ struct msr_autoload *m = &vmx->msr_autoload;
2551 ++
2552 ++ switch (msr) {
2553 ++ case MSR_EFER:
2554 ++ if (cpu_has_load_ia32_efer()) {
2555 ++ add_atomic_switch_msr_special(vmx,
2556 ++ VM_ENTRY_LOAD_IA32_EFER,
2557 ++ VM_EXIT_LOAD_IA32_EFER,
2558 ++ GUEST_IA32_EFER,
2559 ++ HOST_IA32_EFER,
2560 ++ guest_val, host_val);
2561 ++ return;
2562 ++ }
2563 ++ break;
2564 ++ case MSR_CORE_PERF_GLOBAL_CTRL:
2565 ++ if (cpu_has_load_perf_global_ctrl()) {
2566 ++ add_atomic_switch_msr_special(vmx,
2567 ++ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2568 ++ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
2569 ++ GUEST_IA32_PERF_GLOBAL_CTRL,
2570 ++ HOST_IA32_PERF_GLOBAL_CTRL,
2571 ++ guest_val, host_val);
2572 ++ return;
2573 ++ }
2574 ++ break;
2575 ++ case MSR_IA32_PEBS_ENABLE:
2576 ++ /* PEBS needs a quiescent period after being disabled (to write
2577 ++ * a record). Disabling PEBS through VMX MSR swapping doesn't
2578 ++ * provide that period, so a CPU could write host's record into
2579 ++ * guest's memory.
2580 ++ */
2581 ++ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
2582 ++ }
2583 ++
2584 ++ i = vmx_find_msr_index(&m->guest, msr);
2585 ++ if (!entry_only)
2586 ++ j = vmx_find_msr_index(&m->host, msr);
2587 ++
2588 ++ if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) ||
2589 ++ (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) {
2590 ++ printk_once(KERN_WARNING "Not enough msr switch entries. "
2591 ++ "Can't add msr %x\n", msr);
2592 ++ return;
2593 ++ }
2594 ++ if (i < 0) {
2595 ++ i = m->guest.nr++;
2596 ++ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
2597 ++ }
2598 ++ m->guest.val[i].index = msr;
2599 ++ m->guest.val[i].value = guest_val;
2600 ++
2601 ++ if (entry_only)
2602 ++ return;
2603 ++
2604 ++ if (j < 0) {
2605 ++ j = m->host.nr++;
2606 ++ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
2607 ++ }
2608 ++ m->host.val[j].index = msr;
2609 ++ m->host.val[j].value = host_val;
2610 ++}
2611 ++
2612 ++static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2613 ++{
2614 ++ u64 guest_efer = vmx->vcpu.arch.efer;
2615 ++ u64 ignore_bits = 0;
2616 ++
2617 ++ /* Shadow paging assumes NX to be available. */
2618 ++ if (!enable_ept)
2619 ++ guest_efer |= EFER_NX;
2620 ++
2621 ++ /*
2622 ++ * LMA and LME handled by hardware; SCE meaningless outside long mode.
2623 ++ */
2624 ++ ignore_bits |= EFER_SCE;
2625 ++#ifdef CONFIG_X86_64
2626 ++ ignore_bits |= EFER_LMA | EFER_LME;
2627 ++ /* SCE is meaningful only in long mode on Intel */
2628 ++ if (guest_efer & EFER_LMA)
2629 ++ ignore_bits &= ~(u64)EFER_SCE;
2630 ++#endif
2631 ++
2632 ++ /*
2633 ++ * On EPT, we can't emulate NX, so we must switch EFER atomically.
2634 ++ * On CPUs that support "load IA32_EFER", always switch EFER
2635 ++ * atomically, since it's faster than switching it manually.
2636 ++ */
2637 ++ if (cpu_has_load_ia32_efer() ||
2638 ++ (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
2639 ++ if (!(guest_efer & EFER_LMA))
2640 ++ guest_efer &= ~EFER_LME;
2641 ++ if (guest_efer != host_efer)
2642 ++ add_atomic_switch_msr(vmx, MSR_EFER,
2643 ++ guest_efer, host_efer, false);
2644 ++ else
2645 ++ clear_atomic_switch_msr(vmx, MSR_EFER);
2646 ++ return false;
2647 ++ } else {
2648 ++ clear_atomic_switch_msr(vmx, MSR_EFER);
2649 ++
2650 ++ guest_efer &= ~ignore_bits;
2651 ++ guest_efer |= host_efer & ignore_bits;
2652 ++
2653 ++ vmx->guest_msrs[efer_offset].data = guest_efer;
2654 ++ vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
2655 ++
2656 ++ return true;
2657 ++ }
2658 ++}
2659 ++
2660 ++#ifdef CONFIG_X86_32
2661 ++/*
2662 ++ * On 32-bit kernels, VM exits still load the FS and GS bases from the
2663 ++ * VMCS rather than the segment table. KVM uses this helper to figure
2664 ++ * out the current bases to poke them into the VMCS before entry.
2665 ++ */
2666 ++static unsigned long segment_base(u16 selector)
2667 ++{
2668 ++ struct desc_struct *table;
2669 ++ unsigned long v;
2670 ++
2671 ++ if (!(selector & ~SEGMENT_RPL_MASK))
2672 ++ return 0;
2673 ++
2674 ++ table = get_current_gdt_ro();
2675 ++
2676 ++ if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2677 ++ u16 ldt_selector = kvm_read_ldt();
2678 ++
2679 ++ if (!(ldt_selector & ~SEGMENT_RPL_MASK))
2680 ++ return 0;
2681 ++
2682 ++ table = (struct desc_struct *)segment_base(ldt_selector);
2683 ++ }
2684 ++ v = get_desc_base(&table[selector >> 3]);
2685 ++ return v;
2686 ++}
2687 ++#endif
2688 ++
2689 ++static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
2690 ++{
2691 ++ u32 i;
2692 ++
2693 ++ wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
2694 ++ wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
2695 ++ wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
2696 ++ wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
2697 ++ for (i = 0; i < addr_range; i++) {
2698 ++ wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
2699 ++ wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
2700 ++ }
2701 ++}
2702 ++
2703 ++static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
2704 ++{
2705 ++ u32 i;
2706 ++
2707 ++ rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
2708 ++ rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
2709 ++ rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
2710 ++ rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
2711 ++ for (i = 0; i < addr_range; i++) {
2712 ++ rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
2713 ++ rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
2714 ++ }
2715 ++}
2716 ++
2717 ++static void pt_guest_enter(struct vcpu_vmx *vmx)
2718 ++{
2719 ++ if (pt_mode == PT_MODE_SYSTEM)
2720 ++ return;
2721 ++
2722 ++ /*
2723 ++ * GUEST_IA32_RTIT_CTL is already set in the VMCS.
2724 ++ * Save host state before VM entry.
2725 ++ */
2726 ++ rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
2727 ++ if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
2728 ++ wrmsrl(MSR_IA32_RTIT_CTL, 0);
2729 ++ pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
2730 ++ pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
2731 ++ }
2732 ++}
2733 ++
2734 ++static void pt_guest_exit(struct vcpu_vmx *vmx)
2735 ++{
2736 ++ if (pt_mode == PT_MODE_SYSTEM)
2737 ++ return;
2738 ++
2739 ++ if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
2740 ++ pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
2741 ++ pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
2742 ++ }
2743 ++
2744 ++ /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
2745 ++ wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
2746 ++}
2747 ++
2748 ++void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
2749 ++ unsigned long fs_base, unsigned long gs_base)
2750 ++{
2751 ++ if (unlikely(fs_sel != host->fs_sel)) {
2752 ++ if (!(fs_sel & 7))
2753 ++ vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2754 ++ else
2755 ++ vmcs_write16(HOST_FS_SELECTOR, 0);
2756 ++ host->fs_sel = fs_sel;
2757 ++ }
2758 ++ if (unlikely(gs_sel != host->gs_sel)) {
2759 ++ if (!(gs_sel & 7))
2760 ++ vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2761 ++ else
2762 ++ vmcs_write16(HOST_GS_SELECTOR, 0);
2763 ++ host->gs_sel = gs_sel;
2764 ++ }
2765 ++ if (unlikely(fs_base != host->fs_base)) {
2766 ++ vmcs_writel(HOST_FS_BASE, fs_base);
2767 ++ host->fs_base = fs_base;
2768 ++ }
2769 ++ if (unlikely(gs_base != host->gs_base)) {
2770 ++ vmcs_writel(HOST_GS_BASE, gs_base);
2771 ++ host->gs_base = gs_base;
2772 ++ }
2773 ++}
2774 ++
2775 ++void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
2776 ++{
2777 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
2778 ++ struct vmcs_host_state *host_state;
2779 ++#ifdef CONFIG_X86_64
2780 ++ int cpu = raw_smp_processor_id();
2781 ++#endif
2782 ++ unsigned long fs_base, gs_base;
2783 ++ u16 fs_sel, gs_sel;
2784 ++ int i;
2785 ++
2786 ++ vmx->req_immediate_exit = false;
2787 ++
2788 ++ /*
2789 ++ * Note that guest MSRs to be saved/restored can also be changed
2790 ++ * when guest state is loaded. This happens when guest transitions
2791 ++ * to/from long-mode by setting MSR_EFER.LMA.
2792 ++ */
2793 ++ if (!vmx->guest_msrs_ready) {
2794 ++ vmx->guest_msrs_ready = true;
2795 ++ for (i = 0; i < vmx->save_nmsrs; ++i)
2796 ++ kvm_set_shared_msr(vmx->guest_msrs[i].index,
2797 ++ vmx->guest_msrs[i].data,
2798 ++ vmx->guest_msrs[i].mask);
2799 ++
2800 ++ }
2801 ++ if (vmx->guest_state_loaded)
2802 ++ return;
2803 ++
2804 ++ host_state = &vmx->loaded_vmcs->host_state;
2805 ++
2806 ++ /*
2807 ++ * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
2808 ++ * allow segment selectors with cpl > 0 or ti == 1.
2809 ++ */
2810 ++ host_state->ldt_sel = kvm_read_ldt();
2811 ++
2812 ++#ifdef CONFIG_X86_64
2813 ++ savesegment(ds, host_state->ds_sel);
2814 ++ savesegment(es, host_state->es_sel);
2815 ++
2816 ++ gs_base = cpu_kernelmode_gs_base(cpu);
2817 ++ if (likely(is_64bit_mm(current->mm))) {
2818 ++ save_fsgs_for_kvm();
2819 ++ fs_sel = current->thread.fsindex;
2820 ++ gs_sel = current->thread.gsindex;
2821 ++ fs_base = current->thread.fsbase;
2822 ++ vmx->msr_host_kernel_gs_base = current->thread.gsbase;
2823 ++ } else {
2824 ++ savesegment(fs, fs_sel);
2825 ++ savesegment(gs, gs_sel);
2826 ++ fs_base = read_msr(MSR_FS_BASE);
2827 ++ vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
2828 ++ }
2829 ++
2830 ++ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2831 ++#else
2832 ++ savesegment(fs, fs_sel);
2833 ++ savesegment(gs, gs_sel);
2834 ++ fs_base = segment_base(fs_sel);
2835 ++ gs_base = segment_base(gs_sel);
2836 ++#endif
2837 ++
2838 ++ vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
2839 ++ vmx->guest_state_loaded = true;
2840 ++}
2841 ++
2842 ++static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
2843 ++{
2844 ++ struct vmcs_host_state *host_state;
2845 ++
2846 ++ if (!vmx->guest_state_loaded)
2847 ++ return;
2848 ++
2849 ++ host_state = &vmx->loaded_vmcs->host_state;
2850 ++
2851 ++ ++vmx->vcpu.stat.host_state_reload;
2852 ++
2853 ++#ifdef CONFIG_X86_64
2854 ++ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2855 ++#endif
2856 ++ if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
2857 ++ kvm_load_ldt(host_state->ldt_sel);
2858 ++#ifdef CONFIG_X86_64
2859 ++ load_gs_index(host_state->gs_sel);
2860 ++#else
2861 ++ loadsegment(gs, host_state->gs_sel);
2862 ++#endif
2863 ++ }
2864 ++ if (host_state->fs_sel & 7)
2865 ++ loadsegment(fs, host_state->fs_sel);
2866 ++#ifdef CONFIG_X86_64
2867 ++ if (unlikely(host_state->ds_sel | host_state->es_sel)) {
2868 ++ loadsegment(ds, host_state->ds_sel);
2869 ++ loadsegment(es, host_state->es_sel);
2870 ++ }
2871 ++#endif
2872 ++ invalidate_tss_limit();
2873 ++#ifdef CONFIG_X86_64
2874 ++ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2875 ++#endif
2876 ++ load_fixmap_gdt(raw_smp_processor_id());
2877 ++ vmx->guest_state_loaded = false;
2878 ++ vmx->guest_msrs_ready = false;
2879 ++}
2880 ++
2881 ++#ifdef CONFIG_X86_64
2882 ++static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
2883 ++{
2884 ++ preempt_disable();
2885 ++ if (vmx->guest_state_loaded)
2886 ++ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2887 ++ preempt_enable();
2888 ++ return vmx->msr_guest_kernel_gs_base;
2889 ++}
2890 ++
2891 ++static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
2892 ++{
2893 ++ preempt_disable();
2894 ++ if (vmx->guest_state_loaded)
2895 ++ wrmsrl(MSR_KERNEL_GS_BASE, data);
2896 ++ preempt_enable();
2897 ++ vmx->msr_guest_kernel_gs_base = data;
2898 ++}
2899 ++#endif
2900 ++
2901 ++static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
2902 ++{
2903 ++ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2904 ++ struct pi_desc old, new;
2905 ++ unsigned int dest;
2906 ++
2907 ++ /*
2908 ++ * In case of hot-plug or hot-unplug, we may have to undo
2909 ++ * vmx_vcpu_pi_put even if there is no assigned device. And we
2910 ++ * always keep PI.NDST up to date for simplicity: it makes the
2911 ++ * code easier, and CPU migration is not a fast path.
2912 ++ */
2913 ++ if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
2914 ++ return;
2915 ++
2916 ++ /*
2917 ++ * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
2918 ++ * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
2919 ++ * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
2920 ++ * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
2921 ++ * correctly.
2922 ++ */
2923 ++ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
2924 ++ pi_clear_sn(pi_desc);
2925 ++ goto after_clear_sn;
2926 ++ }
2927 ++
2928 ++ /* The full case. */
2929 ++ do {
2930 ++ old.control = new.control = pi_desc->control;
2931 ++
2932 ++ dest = cpu_physical_id(cpu);
2933 ++
2934 ++ if (x2apic_enabled())
2935 ++ new.ndst = dest;
2936 ++ else
2937 ++ new.ndst = (dest << 8) & 0xFF00;
2938 ++
2939 ++ new.sn = 0;
2940 ++ } while (cmpxchg64(&pi_desc->control, old.control,
2941 ++ new.control) != old.control);
2942 ++
2943 ++after_clear_sn:
2944 ++
2945 ++ /*
2946 ++ * Clear SN before reading the bitmap. The VT-d firmware
2947 ++ * writes the bitmap and reads SN atomically (5.2.3 in the
2948 ++ * spec), so it doesn't really have a memory barrier that
2949 ++ * pairs with this, but we cannot do that and we need one.
2950 ++ */
2951 ++ smp_mb__after_atomic();
2952 ++
2953 ++ if (!pi_is_pir_empty(pi_desc))
2954 ++ pi_set_on(pi_desc);
2955 ++}
2956 ++
2957 ++void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
2958 ++{
2959 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
2960 ++ bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
2961 ++
2962 ++ if (!already_loaded) {
2963 ++ loaded_vmcs_clear(vmx->loaded_vmcs);
2964 ++ local_irq_disable();
2965 ++ crash_disable_local_vmclear(cpu);
2966 ++
2967 ++ /*
2968 ++ * Read loaded_vmcs->cpu should be before fetching
2969 ++ * loaded_vmcs->loaded_vmcss_on_cpu_link.
2970 ++ * See the comments in __loaded_vmcs_clear().
2971 ++ */
2972 ++ smp_rmb();
2973 ++
2974 ++ list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
2975 ++ &per_cpu(loaded_vmcss_on_cpu, cpu));
2976 ++ crash_enable_local_vmclear(cpu);
2977 ++ local_irq_enable();
2978 ++ }
2979 ++
2980 ++ if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
2981 ++ per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
2982 ++ vmcs_load(vmx->loaded_vmcs->vmcs);
2983 ++ indirect_branch_prediction_barrier();
2984 ++ }
2985 ++
2986 ++ if (!already_loaded) {
2987 ++ void *gdt = get_current_gdt_ro();
2988 ++ unsigned long sysenter_esp;
2989 ++
2990 ++ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2991 ++
2992 ++ /*
2993 ++ * Linux uses per-cpu TSS and GDT, so set these when switching
2994 ++ * processors. See 22.2.4.
2995 ++ */
2996 ++ vmcs_writel(HOST_TR_BASE,
2997 ++ (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
2998 ++ vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
2999 ++
3000 ++ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
3001 ++ vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
3002 ++
3003 ++ vmx->loaded_vmcs->cpu = cpu;
3004 ++ }
3005 ++
3006 ++ /* Setup TSC multiplier */
3007 ++ if (kvm_has_tsc_control &&
3008 ++ vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
3009 ++ decache_tsc_multiplier(vmx);
3010 ++}
3011 ++
3012 ++/*
3013 ++ * Switches to specified vcpu, until a matching vcpu_put(), but assumes
3014 ++ * vcpu mutex is already taken.
3015 ++ */
3016 ++void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
3017 ++{
3018 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
3019 ++
3020 ++ vmx_vcpu_load_vmcs(vcpu, cpu);
3021 ++
3022 ++ vmx_vcpu_pi_load(vcpu, cpu);
3023 ++
3024 ++ vmx->host_pkru = read_pkru();
3025 ++ vmx->host_debugctlmsr = get_debugctlmsr();
3026 ++}
3027 ++
3028 ++static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
3029 ++{
3030 ++ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
3031 ++
3032 ++ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
3033 ++ !irq_remapping_cap(IRQ_POSTING_CAP) ||
3034 ++ !kvm_vcpu_apicv_active(vcpu))
3035 ++ return;
3036 ++
3037 ++ /* Set SN when the vCPU is preempted */
3038 ++ if (vcpu->preempted)
3039 ++ pi_set_sn(pi_desc);
3040 ++}
3041 ++
3042 ++static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
3043 ++{
3044 ++ vmx_vcpu_pi_put(vcpu);
3045 ++
3046 ++ vmx_prepare_switch_to_host(to_vmx(vcpu));
3047 ++}
3048 ++
3049 ++static bool emulation_required(struct kvm_vcpu *vcpu)
3050 ++{
3051 ++ return emulate_invalid_guest_state && !guest_state_valid(vcpu);
3052 ++}
3053 ++
3054 ++static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
3055 ++
3056 ++unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
3057 ++{
3058 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
3059 ++ unsigned long rflags, save_rflags;
3060 ++
3061 ++ if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
3062 ++ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
3063 ++ rflags = vmcs_readl(GUEST_RFLAGS);
3064 ++ if (vmx->rmode.vm86_active) {
3065 ++ rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3066 ++ save_rflags = vmx->rmode.save_rflags;
3067 ++ rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3068 ++ }
3069 ++ vmx->rflags = rflags;
3070 ++ }
3071 ++ return vmx->rflags;
3072 ++}
3073 ++
3074 ++void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3075 ++{
3076 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
3077 ++ unsigned long old_rflags;
3078 ++
3079 ++ if (enable_unrestricted_guest) {
3080 ++ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
3081 ++ vmx->rflags = rflags;
3082 ++ vmcs_writel(GUEST_RFLAGS, rflags);
3083 ++ return;
3084 ++ }
3085 ++
3086 ++ old_rflags = vmx_get_rflags(vcpu);
3087 ++ vmx->rflags = rflags;
3088 ++ if (vmx->rmode.vm86_active) {
3089 ++ vmx->rmode.save_rflags = rflags;
3090 ++ rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3091 ++ }
3092 ++ vmcs_writel(GUEST_RFLAGS, rflags);
3093 ++
3094 ++ if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
3095 ++ vmx->emulation_required = emulation_required(vcpu);
3096 ++}
3097 ++
3098 ++u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
3099 ++{
3100 ++ u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3101 ++ int ret = 0;
3102 ++
3103 ++ if (interruptibility & GUEST_INTR_STATE_STI)
3104 ++ ret |= KVM_X86_SHADOW_INT_STI;
3105 ++ if (interruptibility & GUEST_INTR_STATE_MOV_SS)
3106 ++ ret |= KVM_X86_SHADOW_INT_MOV_SS;
3107 ++
3108 ++ return ret;
3109 ++}
3110 ++
3111 ++void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
3112 ++{
3113 ++ u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3114 ++ u32 interruptibility = interruptibility_old;
3115 ++
3116 ++ interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
3117 ++
3118 ++ if (mask & KVM_X86_SHADOW_INT_MOV_SS)
3119 ++ interruptibility |= GUEST_INTR_STATE_MOV_SS;
3120 ++ else if (mask & KVM_X86_SHADOW_INT_STI)
3121 ++ interruptibility |= GUEST_INTR_STATE_STI;
3122 ++
3123 ++ if ((interruptibility != interruptibility_old))
3124 ++ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
3125 ++}
3126 ++
3127 ++static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
3128 ++{
3129 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
3130 ++ unsigned long value;
3131 ++
3132 ++ /*
3133 ++ * Any MSR write that attempts to change bits marked reserved will
3134 ++ * cause a #GP fault.
3135 ++ */
3136 ++ if (data & vmx->pt_desc.ctl_bitmask)
3137 ++ return 1;
3138 ++
3139 ++ /*
3140 ++ * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
3141 ++ * result in a #GP unless the same write also clears TraceEn.
3142 ++ */
3143 ++ if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
3144 ++ ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
3145 ++ return 1;
3146 ++
3147 ++ /*
3148 ++ * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
3149 ++ * and FabricEn would cause #GP, if
3150 ++ * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
3151 ++ */
3152 ++ if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
3153 ++ !(data & RTIT_CTL_FABRIC_EN) &&
3154 ++ !intel_pt_validate_cap(vmx->pt_desc.caps,
3155 ++ PT_CAP_single_range_output))
3156 ++ return 1;
3157 ++
3158 ++ /*
3159 ++ * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
3160 ++ * utilizes encodings marked reserved will cause a #GP fault.
3161 ++ */
3162 ++ value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
3163 ++ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
3164 ++ !test_bit((data & RTIT_CTL_MTC_RANGE) >>
3165 ++ RTIT_CTL_MTC_RANGE_OFFSET, &value))
3166 ++ return 1;
3167 ++ value = intel_pt_validate_cap(vmx->pt_desc.caps,
3168 ++ PT_CAP_cycle_thresholds);
3169 ++ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
3170 ++ !test_bit((data & RTIT_CTL_CYC_THRESH) >>
3171 ++ RTIT_CTL_CYC_THRESH_OFFSET, &value))
3172 ++ return 1;
3173 ++ value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
3174 ++ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
3175 ++ !test_bit((data & RTIT_CTL_PSB_FREQ) >>
3176 ++ RTIT_CTL_PSB_FREQ_OFFSET, &value))
3177 ++ return 1;
3178 ++
3179 ++ /*
3180 ++ * If ADDRx_CFG is reserved or the encoding is >2, it will
3181 ++ * cause a #GP fault.
3182 ++ */
3183 ++ value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
3184 ++ if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
3185 ++ return 1;
3186 ++ value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
3187 ++ if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
3188 ++ return 1;
3189 ++ value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
3190 ++ if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
3191 ++ return 1;
3192 ++ value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
3193 ++ if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
3194 ++ return 1;
3195 ++
3196 ++ return 0;
3197 ++}
3198 ++
3199 ++static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
3200 ++{
3201 ++ unsigned long rip;
3202 ++
3203 ++ /*
3204 ++ * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
3205 ++ * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
3206 ++ * set when EPT misconfig occurs. In practice, real hardware updates
3207 ++ * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
3208 ++ * (namely Hyper-V) don't set it due to it being undefined behavior,
3209 ++ * i.e. we end up advancing IP with some random value.
3210 ++ */
3211 ++ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
3212 ++ to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
3213 ++ rip = kvm_rip_read(vcpu);
3214 ++ rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3215 ++ kvm_rip_write(vcpu, rip);
3216 ++ } else {
3217 ++ if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
3218 ++ return 0;
3219 ++ }
3220 ++
3221 ++ /* skipping an emulated instruction also counts */
3222 ++ vmx_set_interrupt_shadow(vcpu, 0);
3223 ++
3224 ++ return 1;
3225 ++}
3226 ++
3227 ++static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
3228 ++{
3229 ++ /*
3230 ++ * Ensure that we clear the HLT state in the VMCS. We don't need to
3231 ++ * explicitly skip the instruction because if the HLT state is set,
3232 ++ * then the instruction is already executing and RIP has already been
3233 ++ * advanced.
3234 ++ */
3235 ++ if (kvm_hlt_in_guest(vcpu->kvm) &&
3236 ++ vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
3237 ++ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
3238 ++}
3239 ++
3240 ++static void vmx_queue_exception(struct kvm_vcpu *vcpu)
3241 ++{
3242 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
3243 ++ unsigned nr = vcpu->arch.exception.nr;
3244 ++ bool has_error_code = vcpu->arch.exception.has_error_code;
3245 ++ u32 error_code = vcpu->arch.exception.error_code;
3246 ++ u32 intr_info = nr | INTR_INFO_VALID_MASK;
3247 ++
3248 ++ kvm_deliver_exception_payload(vcpu);
3249 ++
3250 ++ if (has_error_code) {
3251 ++ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
3252 ++ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3253 ++ }
3254 ++
3255 ++ if (vmx->rmode.vm86_active) {
3256 ++ int inc_eip = 0;
3257 ++ if (kvm_exception_is_soft(nr))
3258 ++ inc_eip = vcpu->arch.event_exit_inst_len;
3259 ++ kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
3260 ++ return;
3261 ++ }
3262 ++
3263 ++ WARN_ON_ONCE(vmx->emulation_required);
3264 ++
3265 ++ if (kvm_exception_is_soft(nr)) {
3266 ++ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
3267 ++ vmx->vcpu.arch.event_exit_inst_len);
3268 ++ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3269 ++ } else
3270 ++ intr_info |= INTR_TYPE_HARD_EXCEPTION;
3271 ++
3272 ++ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
3273 ++
3274 ++ vmx_clear_hlt(vcpu);
3275 ++}
3276 ++
3277 ++static bool vmx_rdtscp_supported(void)
3278 ++{
3279 ++ return cpu_has_vmx_rdtscp();
3280 ++}
3281 ++
3282 ++static bool vmx_invpcid_supported(void)
3283 ++{
3284 ++ return cpu_has_vmx_invpcid();
3285 ++}
3286 ++
3287 ++/*
3288 ++ * Swap MSR entry in host/guest MSR entry array.
3289 ++ */
3290 ++static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
3291 ++{
3292 ++ struct shared_msr_entry tmp;
3293 ++
3294 ++ tmp = vmx->guest_msrs[to];
3295 ++ vmx->guest_msrs[to] = vmx->guest_msrs[from];
3296 ++ vmx->guest_msrs[from] = tmp;
3297 ++}
3298 ++
3299 ++/*
3300 ++ * Set up the vmcs to automatically save and restore system
3301 ++ * msrs. Don't touch the 64-bit msrs if the guest is in legacy
3302 ++ * mode, as fiddling with msrs is very expensive.
3303 ++ */
3304 ++static void setup_msrs(struct vcpu_vmx *vmx)
3305 ++{
3306 ++ int save_nmsrs, index;
3307 ++
3308 ++ save_nmsrs = 0;
3309 ++#ifdef CONFIG_X86_64
3310 ++ /*
3311 ++ * The SYSCALL MSRs are only needed on long mode guests, and only
3312 ++ * when EFER.SCE is set.
3313 ++ */
3314 ++ if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
3315 ++ index = __find_msr_index(vmx, MSR_STAR);
3316 ++ if (index >= 0)
3317 ++ move_msr_up(vmx, index, save_nmsrs++);
3318 ++ index = __find_msr_index(vmx, MSR_LSTAR);
3319 ++ if (index >= 0)
3320 ++ move_msr_up(vmx, index, save_nmsrs++);
3321 ++ index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
3322 ++ if (index >= 0)
3323 ++ move_msr_up(vmx, index, save_nmsrs++);
3324 ++ }
3325 ++#endif
3326 ++ index = __find_msr_index(vmx, MSR_EFER);
3327 ++ if (index >= 0 && update_transition_efer(vmx, index))
3328 ++ move_msr_up(vmx, index, save_nmsrs++);
3329 ++ index = __find_msr_index(vmx, MSR_TSC_AUX);
3330 ++ if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
3331 ++ move_msr_up(vmx, index, save_nmsrs++);
3332 ++ index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
3333 ++ if (index >= 0)
3334 ++ move_msr_up(vmx, index, save_nmsrs++);
3335 ++
3336 ++ vmx->save_nmsrs = save_nmsrs;
3337 ++ vmx->guest_msrs_ready = false;
3338 ++
3339 ++ if (cpu_has_vmx_msr_bitmap())
3340 ++ vmx_update_msr_bitmap(&vmx->vcpu);
3341 ++}
3342 ++
3343 ++static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
3344 ++{
3345 ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3346 ++
3347 ++ if (is_guest_mode(vcpu) &&
3348 ++ (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
3349 ++ return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
3350 ++
3351 ++ return vcpu->arch.tsc_offset;
3352 ++}
3353 ++
3354 ++static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
3355 ++{
3356 ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3357 ++ u64 g_tsc_offset = 0;
3358 ++
3359 ++ /*
3360 ++ * We're here if L1 chose not to trap WRMSR to TSC. According
3361 ++ * to the spec, this should set L1's TSC; The offset that L1
3362 ++ * set for L2 remains unchanged, and still needs to be added
3363 ++ * to the newly set TSC to get L2's TSC.
3364 ++ */
3365 ++ if (is_guest_mode(vcpu) &&
3366 ++ (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
3367 ++ g_tsc_offset = vmcs12->tsc_offset;
3368 ++
3369 ++ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
3370 ++ vcpu->arch.tsc_offset - g_tsc_offset,
3371 ++ offset);
3372 ++ vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
3373 ++ return offset + g_tsc_offset;
3374 ++}
3375 ++
3376 ++/*
3377 ++ * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
3378 ++ * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
3379 ++ * all guests if the "nested" module option is off, and can also be disabled
3380 ++ * for a single guest by disabling its VMX cpuid bit.
3381 ++ */
3382 ++bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
3383 ++{
3384 ++ return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
3385 ++}
3386 ++
3387 ++static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
3388 ++ uint64_t val)
3389 ++{
3390 ++ uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
3391 ++
3392 ++ return !(val & ~valid_bits);
3393 ++}
3394 ++
3395 ++static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
3396 ++{
3397 ++ switch (msr->index) {
3398 ++ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3399 ++ if (!nested)
3400 ++ return 1;
3401 ++ return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
3402 ++ default:
3403 ++ return 1;
3404 ++ }
3405 ++}
3406 ++
3407 ++/*
3408 ++ * Reads an msr value (of 'msr_index') into 'pdata'.
3409 ++ * Returns 0 on success, non-0 otherwise.
3410 ++ * Assumes vcpu_load() was already called.
3411 ++ */
3412 ++static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3413 ++{
3414 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
3415 ++ struct shared_msr_entry *msr;
3416 ++ u32 index;
3417 ++
3418 ++ switch (msr_info->index) {
3419 ++#ifdef CONFIG_X86_64
3420 ++ case MSR_FS_BASE:
3421 ++ msr_info->data = vmcs_readl(GUEST_FS_BASE);
3422 ++ break;
3423 ++ case MSR_GS_BASE:
3424 ++ msr_info->data = vmcs_readl(GUEST_GS_BASE);
3425 ++ break;
3426 ++ case MSR_KERNEL_GS_BASE:
3427 ++ msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
3428 ++ break;
3429 ++#endif
3430 ++ case MSR_EFER:
3431 ++ return kvm_get_msr_common(vcpu, msr_info);
3432 ++ case MSR_IA32_TSX_CTRL:
3433 ++ if (!msr_info->host_initiated &&
3434 ++ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
3435 ++ return 1;
3436 ++ goto find_shared_msr;
3437 ++ case MSR_IA32_UMWAIT_CONTROL:
3438 ++ if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
3439 ++ return 1;
3440 ++
3441 ++ msr_info->data = vmx->msr_ia32_umwait_control;
3442 ++ break;
3443 ++ case MSR_IA32_SPEC_CTRL:
3444 ++ if (!msr_info->host_initiated &&
3445 ++ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
3446 ++ return 1;
3447 ++
3448 ++ msr_info->data = to_vmx(vcpu)->spec_ctrl;
3449 ++ break;
3450 ++ case MSR_IA32_SYSENTER_CS:
3451 ++ msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
3452 ++ break;
3453 ++ case MSR_IA32_SYSENTER_EIP:
3454 ++ msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
3455 ++ break;
3456 ++ case MSR_IA32_SYSENTER_ESP:
3457 ++ msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
3458 ++ break;
3459 ++ case MSR_IA32_BNDCFGS:
3460 ++ if (!kvm_mpx_supported() ||
3461 ++ (!msr_info->host_initiated &&
3462 ++ !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
3463 ++ return 1;
3464 ++ msr_info->data = vmcs_read64(GUEST_BNDCFGS);
3465 ++ break;
3466 ++ case MSR_IA32_MCG_EXT_CTL:
3467 ++ if (!msr_info->host_initiated &&
3468 ++ !(vmx->msr_ia32_feature_control &
3469 ++ FEATURE_CONTROL_LMCE))
3470 ++ return 1;
3471 ++ msr_info->data = vcpu->arch.mcg_ext_ctl;
3472 ++ break;
3473 ++ case MSR_IA32_FEATURE_CONTROL:
3474 ++ msr_info->data = vmx->msr_ia32_feature_control;
3475 ++ break;
3476 ++ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3477 ++ if (!nested_vmx_allowed(vcpu))
3478 ++ return 1;
3479 ++ return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
3480 ++ &msr_info->data);
3481 ++ case MSR_IA32_RTIT_CTL:
3482 ++ if (pt_mode != PT_MODE_HOST_GUEST)
3483 ++ return 1;
3484 ++ msr_info->data = vmx->pt_desc.guest.ctl;
3485 ++ break;
3486 ++ case MSR_IA32_RTIT_STATUS:
3487 ++ if (pt_mode != PT_MODE_HOST_GUEST)
3488 ++ return 1;
3489 ++ msr_info->data = vmx->pt_desc.guest.status;
3490 ++ break;
3491 ++ case MSR_IA32_RTIT_CR3_MATCH:
3492 ++ if ((pt_mode != PT_MODE_HOST_GUEST) ||
3493 ++ !intel_pt_validate_cap(vmx->pt_desc.caps,
3494 ++ PT_CAP_cr3_filtering))
3495 ++ return 1;
3496 ++ msr_info->data = vmx->pt_desc.guest.cr3_match;
3497 ++ break;
3498 ++ case MSR_IA32_RTIT_OUTPUT_BASE:
3499 ++ if ((pt_mode != PT_MODE_HOST_GUEST) ||
3500 ++ (!intel_pt_validate_cap(vmx->pt_desc.caps,
3501 ++ PT_CAP_topa_output) &&
3502 ++ !intel_pt_validate_cap(vmx->pt_desc.caps,
3503 ++ PT_CAP_single_range_output)))
3504 ++ return 1;
3505 ++ msr_info->data = vmx->pt_desc.guest.output_base;
3506 ++ break;
3507 ++ case MSR_IA32_RTIT_OUTPUT_MASK:
3508 ++ if ((pt_mode != PT_MODE_HOST_GUEST) ||
3509 ++ (!intel_pt_validate_cap(vmx->pt_desc.caps,
3510 ++ PT_CAP_topa_output) &&
3511 ++ !intel_pt_validate_cap(vmx->pt_desc.caps,
3512 ++ PT_CAP_single_range_output)))
3513 ++ return 1;
3514 ++ msr_info->data = vmx->pt_desc.guest.output_mask;
3515 ++ break;
3516 ++ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
3517 ++ index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
3518 ++ if ((pt_mode != PT_MODE_HOST_GUEST) ||
3519 ++ (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
3520 ++ PT_CAP_num_address_ranges)))
3521 ++ return 1;
3522 ++ if (is_noncanonical_address(data, vcpu))
3523 ++ return 1;
3524 ++ if (index % 2)
3525 ++ msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
3526 ++ else
3527 ++ msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
3528 ++ break;
3529 ++ case MSR_TSC_AUX:
3530 ++ if (!msr_info->host_initiated &&
3531 ++ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
3532 ++ return 1;
3533 ++ goto find_shared_msr;
3534 ++ default:
3535 ++ find_shared_msr:
3536 ++ msr = find_msr_entry(vmx, msr_info->index);
3537 ++ if (msr) {
3538 ++ msr_info->data = msr->data;
3539 ++ break;
3540 ++ }
3541 ++ return kvm_get_msr_common(vcpu, msr_info);
3542 ++ }
3543 ++
3544 ++ return 0;
3545 ++}
3546 ++
3547 ++/*
3548 ++ * Writes msr value into the appropriate "register".
3549 ++ * Returns 0 on success, non-0 otherwise.
3550 ++ * Assumes vcpu_load() was already called.
3551 ++ */
3552 ++static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3553 ++{
3554 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
3555 ++ struct shared_msr_entry *msr;
3556 ++ int ret = 0;
3557 ++ u32 msr_index = msr_info->index;
3558 ++ u64 data = msr_info->data;
3559 ++ u32 index;
3560 ++
3561 ++ switch (msr_index) {
3562 ++ case MSR_EFER:
3563 ++ ret = kvm_set_msr_common(vcpu, msr_info);
3564 ++ break;
3565 ++#ifdef CONFIG_X86_64
3566 ++ case MSR_FS_BASE:
3567 ++ vmx_segment_cache_clear(vmx);
3568 ++ vmcs_writel(GUEST_FS_BASE, data);
3569 ++ break;
3570 ++ case MSR_GS_BASE:
3571 ++ vmx_segment_cache_clear(vmx);
3572 ++ vmcs_writel(GUEST_GS_BASE, data);
3573 ++ break;
3574 ++ case MSR_KERNEL_GS_BASE:
3575 ++ vmx_write_guest_kernel_gs_base(vmx, data);
3576 ++ break;
3577 ++#endif
3578 ++ case MSR_IA32_SYSENTER_CS:
3579 ++ if (is_guest_mode(vcpu))
3580 ++ get_vmcs12(vcpu)->guest_sysenter_cs = data;
3581 ++ vmcs_write32(GUEST_SYSENTER_CS, data);
3582 ++ break;
3583 ++ case MSR_IA32_SYSENTER_EIP:
3584 ++ if (is_guest_mode(vcpu))
3585 ++ get_vmcs12(vcpu)->guest_sysenter_eip = data;
3586 ++ vmcs_writel(GUEST_SYSENTER_EIP, data);
3587 ++ break;
3588 ++ case MSR_IA32_SYSENTER_ESP:
3589 ++ if (is_guest_mode(vcpu))
3590 ++ get_vmcs12(vcpu)->guest_sysenter_esp = data;
3591 ++ vmcs_writel(GUEST_SYSENTER_ESP, data);
3592 ++ break;
3593 ++ case MSR_IA32_DEBUGCTLMSR:
3594 ++ if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
3595 ++ VM_EXIT_SAVE_DEBUG_CONTROLS)
3596 ++ get_vmcs12(vcpu)->guest_ia32_debugctl = data;
3597 ++
3598 ++ ret = kvm_set_msr_common(vcpu, msr_info);
3599 ++ break;
3600 ++
3601 ++ case MSR_IA32_BNDCFGS:
3602 ++ if (!kvm_mpx_supported() ||
3603 ++ (!msr_info->host_initiated &&
3604 ++ !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
3605 ++ return 1;
3606 ++ if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
3607 ++ (data & MSR_IA32_BNDCFGS_RSVD))
3608 ++ return 1;
3609 ++ vmcs_write64(GUEST_BNDCFGS, data);
3610 ++ break;
3611 ++ case MSR_IA32_UMWAIT_CONTROL:
3612 ++ if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
3613 ++ return 1;
3614 ++
3615 ++ /* Reserved bit 1 and the upper 32 bits [63:32] must be zero */
3616 ++ if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
3617 ++ return 1;
3618 ++
3619 ++ vmx->msr_ia32_umwait_control = data;
3620 ++ break;
3621 ++ case MSR_IA32_SPEC_CTRL:
3622 ++ if (!msr_info->host_initiated &&
3623 ++ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
3624 ++ return 1;
3625 ++
3626 ++ /* The STIBP bit doesn't fault even if it's not advertised */
3627 ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
3628 ++ return 1;
3629 ++
3630 ++ vmx->spec_ctrl = data;
3631 ++
3632 ++ if (!data)
3633 ++ break;
3634 ++
3635 ++ /*
3636 ++ * For non-nested:
3637 ++ * When it's written (to non-zero) for the first time, pass
3638 ++ * it through.
3639 ++ *
3640 ++ * For nested:
3641 ++ * The handling of the MSR bitmap for L2 guests is done in
3642 ++ * nested_vmx_prepare_msr_bitmap. We should not touch the
3643 ++ * vmcs02.msr_bitmap here since it gets completely overwritten
3644 ++ * in the merging. We update the vmcs01 here for L1 as well
3645 ++ * since it will end up touching the MSR anyway now.
3646 ++ */
3647 ++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
3648 ++ MSR_IA32_SPEC_CTRL,
3649 ++ MSR_TYPE_RW);
3650 ++ break;
3651 ++ case MSR_IA32_TSX_CTRL:
3652 ++ if (!msr_info->host_initiated &&
3653 ++ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
3654 ++ return 1;
3655 ++ if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
3656 ++ return 1;
3657 ++ goto find_shared_msr;
3658 ++ case MSR_IA32_PRED_CMD:
3659 ++ if (!msr_info->host_initiated &&
3660 ++ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
3661 ++ return 1;
3662 ++
3663 ++ if (data & ~PRED_CMD_IBPB)
3664 ++ return 1;
3665 ++
3666 ++ if (!data)
3667 ++ break;
3668 ++
3669 ++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
3670 ++
3671 ++ /*
3672 ++ * For non-nested:
3673 ++ * When it's written (to non-zero) for the first time, pass
3674 ++ * it through.
3675 ++ *
3676 ++ * For nested:
3677 ++ * The handling of the MSR bitmap for L2 guests is done in
3678 ++ * nested_vmx_prepare_msr_bitmap. We should not touch the
3679 ++ * vmcs02.msr_bitmap here since it gets completely overwritten
3680 ++ * in the merging.
3681 ++ */
3682 ++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
3683 ++ MSR_TYPE_W);
3684 ++ break;
3685 ++ case MSR_IA32_CR_PAT:
3686 ++ if (!kvm_pat_valid(data))
3687 ++ return 1;
3688 ++
3689 ++ if (is_guest_mode(vcpu) &&
3690 ++ get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
3691 ++ get_vmcs12(vcpu)->guest_ia32_pat = data;
3692 ++
3693 ++ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
3694 ++ vmcs_write64(GUEST_IA32_PAT, data);
3695 ++ vcpu->arch.pat = data;
3696 ++ break;
3697 ++ }
3698 ++ ret = kvm_set_msr_common(vcpu, msr_info);
3699 ++ break;
3700 ++ case MSR_IA32_TSC_ADJUST:
3701 ++ ret = kvm_set_msr_common(vcpu, msr_info);
3702 ++ break;
3703 ++ case MSR_IA32_MCG_EXT_CTL:
3704 ++ if ((!msr_info->host_initiated &&
3705 ++ !(to_vmx(vcpu)->msr_ia32_feature_control &
3706 ++ FEATURE_CONTROL_LMCE)) ||
3707 ++ (data & ~MCG_EXT_CTL_LMCE_EN))
3708 ++ return 1;
3709 ++ vcpu->arch.mcg_ext_ctl = data;
3710 ++ break;
3711 ++ case MSR_IA32_FEATURE_CONTROL:
3712 ++ if (!vmx_feature_control_msr_valid(vcpu, data) ||
3713 ++ (to_vmx(vcpu)->msr_ia32_feature_control &
3714 ++ FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
3715 ++ return 1;
3716 ++ vmx->msr_ia32_feature_control = data;
3717 ++ if (msr_info->host_initiated && data == 0)
3718 ++ vmx_leave_nested(vcpu);
3719 ++ break;
3720 ++ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3721 ++ if (!msr_info->host_initiated)
3722 ++ return 1; /* they are read-only */
3723 ++ if (!nested_vmx_allowed(vcpu))
3724 ++ return 1;
3725 ++ return vmx_set_vmx_msr(vcpu, msr_index, data);
3726 ++ case MSR_IA32_RTIT_CTL:
3727 ++ if ((pt_mode != PT_MODE_HOST_GUEST) ||
3728 ++ vmx_rtit_ctl_check(vcpu, data) ||
3729 ++ vmx->nested.vmxon)
3730 ++ return 1;
3731 ++ vmcs_write64(GUEST_IA32_RTIT_CTL, data);
3732 ++ vmx->pt_desc.guest.ctl = data;
3733 ++ pt_update_intercept_for_msr(vmx);
3734 ++ break;
3735 ++ case MSR_IA32_RTIT_STATUS:
3736 ++ if ((pt_mode != PT_MODE_HOST_GUEST) ||
3737 ++ (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
3738 ++ (data & MSR_IA32_RTIT_STATUS_MASK))
3739 ++ return 1;
3740 ++ vmx->pt_desc.guest.status = data;
3741 ++ break;
3742 ++ case MSR_IA32_RTIT_CR3_MATCH:
3743 ++ if ((pt_mode != PT_MODE_HOST_GUEST) ||
3744 ++ (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
3745 ++ !intel_pt_validate_cap(vmx->pt_desc.caps,
3746 ++ PT_CAP_cr3_filtering))
3747 ++ return 1;
3748 ++ vmx->pt_desc.guest.cr3_match = data;
3749 ++ break;
3750 ++ case MSR_IA32_RTIT_OUTPUT_BASE:
3751 ++ if ((pt_mode != PT_MODE_HOST_GUEST) ||
3752 ++ (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
3753 ++ (!intel_pt_validate_cap(vmx->pt_desc.caps,
3754 ++ PT_CAP_topa_output) &&
3755 ++ !intel_pt_validate_cap(vmx->pt_desc.caps,
3756 ++ PT_CAP_single_range_output)) ||
3757 ++ (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
3758 ++ return 1;
3759 ++ vmx->pt_desc.guest.output_base = data;
3760 ++ break;
3761 ++ case MSR_IA32_RTIT_OUTPUT_MASK:
3762 ++ if ((pt_mode != PT_MODE_HOST_GUEST) ||
3763 ++ (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
3764 ++ (!intel_pt_validate_cap(vmx->pt_desc.caps,
3765 ++ PT_CAP_topa_output) &&
3766 ++ !intel_pt_validate_cap(vmx->pt_desc.caps,
3767 ++ PT_CAP_single_range_output)))
3768 ++ return 1;
3769 ++ vmx->pt_desc.guest.output_mask = data;
3770 ++ break;
3771 ++ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
3772 ++ index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
3773 ++ if ((pt_mode != PT_MODE_HOST_GUEST) ||
3774 ++ (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
3775 ++ (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
3776 ++ PT_CAP_num_address_ranges)))
3777 ++ return 1;
3778 ++ if (is_noncanonical_address(data, vcpu))
3779 ++ return 1;
3780 ++ if (index % 2)
3781 ++ vmx->pt_desc.guest.addr_b[index / 2] = data;
3782 ++ else
3783 ++ vmx->pt_desc.guest.addr_a[index / 2] = data;
3784 ++ break;
3785 ++ case MSR_TSC_AUX:
3786 ++ if (!msr_info->host_initiated &&
3787 ++ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
3788 ++ return 1;
3789 ++ /* Check reserved bit, higher 32 bits should be zero */
3790 ++ if ((data >> 32) != 0)
3791 ++ return 1;
3792 ++ goto find_shared_msr;
3793 ++
3794 ++ default:
3795 ++ find_shared_msr:
3796 ++ msr = find_msr_entry(vmx, msr_index);
3797 ++ if (msr)
3798 ++ ret = vmx_set_guest_msr(vmx, msr, data);
3799 ++ else
3800 ++ ret = kvm_set_msr_common(vcpu, msr_info);
3801 ++ }
3802 ++
3803 ++ return ret;
3804 ++}
3805 ++
3806 ++static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
3807 ++{
3808 ++ kvm_register_mark_available(vcpu, reg);
3809 ++
3810 ++ switch (reg) {
3811 ++ case VCPU_REGS_RSP:
3812 ++ vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
3813 ++ break;
3814 ++ case VCPU_REGS_RIP:
3815 ++ vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
3816 ++ break;
3817 ++ case VCPU_EXREG_PDPTR:
3818 ++ if (enable_ept)
3819 ++ ept_save_pdptrs(vcpu);
3820 ++ break;
3821 ++ case VCPU_EXREG_CR3:
3822 ++ if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
3823 ++ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3824 ++ break;
3825 ++ default:
3826 ++ WARN_ON_ONCE(1);
3827 ++ break;
3828 ++ }
3829 ++}
3830 ++
3831 ++static __init int cpu_has_kvm_support(void)
3832 ++{
3833 ++ return cpu_has_vmx();
3834 ++}
3835 ++
3836 ++static __init int vmx_disabled_by_bios(void)
3837 ++{
3838 ++ u64 msr;
3839 ++
3840 ++ rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
3841 ++ if (msr & FEATURE_CONTROL_LOCKED) {
3842 ++ /* launched w/ TXT and VMX disabled */
3843 ++ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
3844 ++ && tboot_enabled())
3845 ++ return 1;
3846 ++ /* launched w/o TXT and VMX only enabled w/ TXT */
3847 ++ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
3848 ++ && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
3849 ++ && !tboot_enabled()) {
3850 ++ printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
3851 ++ "activate TXT before enabling KVM\n");
3852 ++ return 1;
3853 ++ }
3854 ++ /* launched w/o TXT and VMX disabled */
3855 ++ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
3856 ++ && !tboot_enabled())
3857 ++ return 1;
3858 ++ }
3859 ++
3860 ++ return 0;
3861 ++}
3862 ++
3863 ++static void kvm_cpu_vmxon(u64 addr)
3864 ++{
3865 ++ cr4_set_bits(X86_CR4_VMXE);
3866 ++ intel_pt_handle_vmx(1);
3867 ++
3868 ++ asm volatile ("vmxon %0" : : "m"(addr));
3869 ++}
3870 ++
3871 ++static int hardware_enable(void)
3872 ++{
3873 ++ int cpu = raw_smp_processor_id();
3874 ++ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
3875 ++ u64 old, test_bits;
3876 ++
3877 ++ if (cr4_read_shadow() & X86_CR4_VMXE)
3878 ++ return -EBUSY;
3879 ++
3880 ++ /*
3881 ++ * This can happen if we hot-added a CPU but failed to allocate
3882 ++ * VP assist page for it.
3883 ++ */
3884 ++ if (static_branch_unlikely(&enable_evmcs) &&
3885 ++ !hv_get_vp_assist_page(cpu))
3886 ++ return -EFAULT;
3887 ++
3888 ++ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
3889 ++ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
3890 ++ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
3891 ++
3892 ++ /*
3893 ++ * Now we can enable the vmclear operation in kdump
3894 ++ * since the loaded_vmcss_on_cpu list on this cpu
3895 ++ * has been initialized.
3896 ++ *
3897 ++ * Though the cpu is not in VMX operation now, there
3898 ++ * is no problem enabling the vmclear operation,
3899 ++ * because the loaded_vmcss_on_cpu list is empty!
3900 ++ */
3901 ++ crash_enable_local_vmclear(cpu);
3902 ++
3903 ++ rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
3904 ++
3905 ++ test_bits = FEATURE_CONTROL_LOCKED;
3906 ++ test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
3907 ++ if (tboot_enabled())
3908 ++ test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
3909 ++
3910 ++ if ((old & test_bits) != test_bits) {
3911 ++ /* enable and lock */
3912 ++ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
3913 ++ }
3914 ++ kvm_cpu_vmxon(phys_addr);
3915 ++ if (enable_ept)
3916 ++ ept_sync_global();
3917 ++
3918 ++ return 0;
3919 ++}
3920 ++
3921 ++static void vmclear_local_loaded_vmcss(void)
3922 ++{
3923 ++ int cpu = raw_smp_processor_id();
3924 ++ struct loaded_vmcs *v, *n;
3925 ++
3926 ++ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
3927 ++ loaded_vmcss_on_cpu_link)
3928 ++ __loaded_vmcs_clear(v);
3929 ++}
3930 ++
3931 ++
3932 ++/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
3933 ++ * tricks.
3934 ++ */
3935 ++static void kvm_cpu_vmxoff(void)
3936 ++{
3937 ++ asm volatile (__ex("vmxoff"));
3938 ++
3939 ++ intel_pt_handle_vmx(0);
3940 ++ cr4_clear_bits(X86_CR4_VMXE);
3941 ++}
3942 ++
3943 ++static void hardware_disable(void)
3944 ++{
3945 ++ vmclear_local_loaded_vmcss();
3946 ++ kvm_cpu_vmxoff();
3947 ++}
3948 ++
3949 ++static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
3950 ++ u32 msr, u32 *result)
3951 ++{
3952 ++ u32 vmx_msr_low, vmx_msr_high;
3953 ++ u32 ctl = ctl_min | ctl_opt;
3954 ++
3955 ++ rdmsr(msr, vmx_msr_low, vmx_msr_high);
3956 ++
3957 ++ ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
3958 ++ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
3959 ++
3960 ++ /* Ensure minimum (required) set of control bits are supported. */
3961 ++ if (ctl_min & ~ctl)
3962 ++ return -EIO;
3963 ++
3964 ++ *result = ctl;
3965 ++ return 0;
3966 ++}
3967 ++
3968 ++static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
3969 ++ struct vmx_capability *vmx_cap)
3970 ++{
3971 ++ u32 vmx_msr_low, vmx_msr_high;
3972 ++ u32 min, opt, min2, opt2;
3973 ++ u32 _pin_based_exec_control = 0;
3974 ++ u32 _cpu_based_exec_control = 0;
3975 ++ u32 _cpu_based_2nd_exec_control = 0;
3976 ++ u32 _vmexit_control = 0;
3977 ++ u32 _vmentry_control = 0;
3978 ++
3979 ++ memset(vmcs_conf, 0, sizeof(*vmcs_conf));
3980 ++ min = CPU_BASED_HLT_EXITING |
3981 ++#ifdef CONFIG_X86_64
3982 ++ CPU_BASED_CR8_LOAD_EXITING |
3983 ++ CPU_BASED_CR8_STORE_EXITING |
3984 ++#endif
3985 ++ CPU_BASED_CR3_LOAD_EXITING |
3986 ++ CPU_BASED_CR3_STORE_EXITING |
3987 ++ CPU_BASED_UNCOND_IO_EXITING |
3988 ++ CPU_BASED_MOV_DR_EXITING |
3989 ++ CPU_BASED_USE_TSC_OFFSETTING |
3990 ++ CPU_BASED_MWAIT_EXITING |
3991 ++ CPU_BASED_MONITOR_EXITING |
3992 ++ CPU_BASED_INVLPG_EXITING |
3993 ++ CPU_BASED_RDPMC_EXITING;
3994 ++
3995 ++ opt = CPU_BASED_TPR_SHADOW |
3996 ++ CPU_BASED_USE_MSR_BITMAPS |
3997 ++ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
3998 ++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
3999 ++ &_cpu_based_exec_control) < 0)
4000 ++ return -EIO;
4001 ++#ifdef CONFIG_X86_64
4002 ++ if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
4003 ++ _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
4004 ++ ~CPU_BASED_CR8_STORE_EXITING;
4005 ++#endif
4006 ++ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
4007 ++ min2 = 0;
4008 ++ opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
4009 ++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
4010 ++ SECONDARY_EXEC_WBINVD_EXITING |
4011 ++ SECONDARY_EXEC_ENABLE_VPID |
4012 ++ SECONDARY_EXEC_ENABLE_EPT |
4013 ++ SECONDARY_EXEC_UNRESTRICTED_GUEST |
4014 ++ SECONDARY_EXEC_PAUSE_LOOP_EXITING |
4015 ++ SECONDARY_EXEC_DESC |
4016 ++ SECONDARY_EXEC_RDTSCP |
4017 ++ SECONDARY_EXEC_ENABLE_INVPCID |
4018 ++ SECONDARY_EXEC_APIC_REGISTER_VIRT |
4019 ++ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
4020 ++ SECONDARY_EXEC_SHADOW_VMCS |
4021 ++ SECONDARY_EXEC_XSAVES |
4022 ++ SECONDARY_EXEC_RDSEED_EXITING |
4023 ++ SECONDARY_EXEC_RDRAND_EXITING |
4024 ++ SECONDARY_EXEC_ENABLE_PML |
4025 ++ SECONDARY_EXEC_TSC_SCALING |
4026 ++ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
4027 ++ SECONDARY_EXEC_PT_USE_GPA |
4028 ++ SECONDARY_EXEC_PT_CONCEAL_VMX |
4029 ++ SECONDARY_EXEC_ENABLE_VMFUNC |
4030 ++ SECONDARY_EXEC_ENCLS_EXITING;
4031 ++ if (adjust_vmx_controls(min2, opt2,
4032 ++ MSR_IA32_VMX_PROCBASED_CTLS2,
4033 ++ &_cpu_based_2nd_exec_control) < 0)
4034 ++ return -EIO;
4035 ++ }
4036 ++#ifndef CONFIG_X86_64
4037 ++ if (!(_cpu_based_2nd_exec_control &
4038 ++ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
4039 ++ _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
4040 ++#endif
4041 ++
4042 ++ if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
4043 ++ _cpu_based_2nd_exec_control &= ~(
4044 ++ SECONDARY_EXEC_APIC_REGISTER_VIRT |
4045 ++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
4046 ++ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4047 ++
4048 ++ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
4049 ++ &vmx_cap->ept, &vmx_cap->vpid);
4050 ++
4051 ++ if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
4052 ++ /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
4053 ++ enabled */
4054 ++ _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4055 ++ CPU_BASED_CR3_STORE_EXITING |
4056 ++ CPU_BASED_INVLPG_EXITING);
4057 ++ } else if (vmx_cap->ept) {
4058 ++ vmx_cap->ept = 0;
4059 ++ pr_warn_once("EPT CAP should not exist if not support "
4060 ++ "1-setting enable EPT VM-execution control\n");
4061 ++ }
4062 ++ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
4063 ++ vmx_cap->vpid) {
4064 ++ vmx_cap->vpid = 0;
4065 ++ pr_warn_once("VPID CAP should not exist if not support "
4066 ++ "1-setting enable VPID VM-execution control\n");
4067 ++ }
4068 ++
4069 ++ min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
4070 ++#ifdef CONFIG_X86_64
4071 ++ min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
4072 ++#endif
4073 ++ opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
4074 ++ VM_EXIT_LOAD_IA32_PAT |
4075 ++ VM_EXIT_LOAD_IA32_EFER |
4076 ++ VM_EXIT_CLEAR_BNDCFGS |
4077 ++ VM_EXIT_PT_CONCEAL_PIP |
4078 ++ VM_EXIT_CLEAR_IA32_RTIT_CTL;
4079 ++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
4080 ++ &_vmexit_control) < 0)
4081 ++ return -EIO;
4082 ++
4083 ++ min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
4084 ++ opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
4085 ++ PIN_BASED_VMX_PREEMPTION_TIMER;
4086 ++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
4087 ++ &_pin_based_exec_control) < 0)
4088 ++ return -EIO;
4089 ++
4090 ++ if (cpu_has_broken_vmx_preemption_timer())
4091 ++ _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4092 ++ if (!(_cpu_based_2nd_exec_control &
4093 ++ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
4094 ++ _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
4095 ++
4096 ++ min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
4097 ++ opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
4098 ++ VM_ENTRY_LOAD_IA32_PAT |
4099 ++ VM_ENTRY_LOAD_IA32_EFER |
4100 ++ VM_ENTRY_LOAD_BNDCFGS |
4101 ++ VM_ENTRY_PT_CONCEAL_PIP |
4102 ++ VM_ENTRY_LOAD_IA32_RTIT_CTL;
4103 ++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
4104 ++ &_vmentry_control) < 0)
4105 ++ return -EIO;
4106 ++
4107 ++ /*
4108 ++ * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
4109 ++ * can't be used due to an errata where VM Exit may incorrectly clear
4110 ++ * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
4111 ++ * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
4112 ++ */
4113 ++ if (boot_cpu_data.x86 == 0x6) {
4114 ++ switch (boot_cpu_data.x86_model) {
4115 ++ case 26: /* AAK155 */
4116 ++ case 30: /* AAP115 */
4117 ++ case 37: /* AAT100 */
4118 ++ case 44: /* BC86,AAY89,BD102 */
4119 ++ case 46: /* BA97 */
4120 ++ _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4121 ++ _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4122 ++ pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
4123 ++ "does not work properly. Using workaround\n");
4124 ++ break;
4125 ++ default:
4126 ++ break;
4127 ++ }
4128 ++ }
4129 ++
4130 ++
4131 ++ rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
4132 ++
4133 ++ /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
4134 ++ if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
4135 ++ return -EIO;
4136 ++
4137 ++#ifdef CONFIG_X86_64
4138 ++ /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
4139 ++ if (vmx_msr_high & (1u<<16))
4140 ++ return -EIO;
4141 ++#endif
4142 ++
4143 ++ /* Require Write-Back (WB) memory type for VMCS accesses. */
4144 ++ if (((vmx_msr_high >> 18) & 15) != 6)
4145 ++ return -EIO;
4146 ++
4147 ++ vmcs_conf->size = vmx_msr_high & 0x1fff;
4148 ++ vmcs_conf->order = get_order(vmcs_conf->size);
4149 ++ vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
4150 ++
4151 ++ vmcs_conf->revision_id = vmx_msr_low;
4152 ++
4153 ++ vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
4154 ++ vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
4155 ++ vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
4156 ++ vmcs_conf->vmexit_ctrl = _vmexit_control;
4157 ++ vmcs_conf->vmentry_ctrl = _vmentry_control;
4158 ++
4159 ++ if (static_branch_unlikely(&enable_evmcs))
4160 ++ evmcs_sanitize_exec_ctrls(vmcs_conf);
4161 ++
4162 ++ return 0;
4163 ++}
4164 ++
4165 ++struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
4166 ++{
4167 ++ int node = cpu_to_node(cpu);
4168 ++ struct page *pages;
4169 ++ struct vmcs *vmcs;
4170 ++
4171 ++ pages = __alloc_pages_node(node, flags, vmcs_config.order);
4172 ++ if (!pages)
4173 ++ return NULL;
4174 ++ vmcs = page_address(pages);
4175 ++ memset(vmcs, 0, vmcs_config.size);
4176 ++
4177 ++ /* KVM supports Enlightened VMCS v1 only */
4178 ++ if (static_branch_unlikely(&enable_evmcs))
4179 ++ vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
4180 ++ else
4181 ++ vmcs->hdr.revision_id = vmcs_config.revision_id;
4182 ++
4183 ++ if (shadow)
4184 ++ vmcs->hdr.shadow_vmcs = 1;
4185 ++ return vmcs;
4186 ++}
4187 ++
4188 ++void free_vmcs(struct vmcs *vmcs)
4189 ++{
4190 ++ free_pages((unsigned long)vmcs, vmcs_config.order);
4191 ++}
4192 ++
4193 ++/*
4194 ++ * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
4195 ++ */
4196 ++void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
4197 ++{
4198 ++ if (!loaded_vmcs->vmcs)
4199 ++ return;
4200 ++ loaded_vmcs_clear(loaded_vmcs);
4201 ++ free_vmcs(loaded_vmcs->vmcs);
4202 ++ loaded_vmcs->vmcs = NULL;
4203 ++ if (loaded_vmcs->msr_bitmap)
4204 ++ free_page((unsigned long)loaded_vmcs->msr_bitmap);
4205 ++ WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
4206 ++}
4207 ++
4208 ++int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
4209 ++{
4210 ++ loaded_vmcs->vmcs = alloc_vmcs(false);
4211 ++ if (!loaded_vmcs->vmcs)
4212 ++ return -ENOMEM;
4213 ++
4214 ++ loaded_vmcs->shadow_vmcs = NULL;
4215 ++ loaded_vmcs->hv_timer_soft_disabled = false;
4216 ++ loaded_vmcs_init(loaded_vmcs);
4217 ++
4218 ++ if (cpu_has_vmx_msr_bitmap()) {
4219 ++ loaded_vmcs->msr_bitmap = (unsigned long *)
4220 ++ __get_free_page(GFP_KERNEL_ACCOUNT);
4221 ++ if (!loaded_vmcs->msr_bitmap)
4222 ++ goto out_vmcs;
4223 ++ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
4224 ++
4225 ++ if (IS_ENABLED(CONFIG_HYPERV) &&
4226 ++ static_branch_unlikely(&enable_evmcs) &&
4227 ++ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
4228 ++ struct hv_enlightened_vmcs *evmcs =
4229 ++ (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
4230 ++
4231 ++ evmcs->hv_enlightenments_control.msr_bitmap = 1;
4232 ++ }
4233 ++ }
4234 ++
4235 ++ memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
4236 ++ memset(&loaded_vmcs->controls_shadow, 0,
4237 ++ sizeof(struct vmcs_controls_shadow));
4238 ++
4239 ++ return 0;
4240 ++
4241 ++out_vmcs:
4242 ++ free_loaded_vmcs(loaded_vmcs);
4243 ++ return -ENOMEM;
4244 ++}
4245 ++
4246 ++static void free_kvm_area(void)
4247 ++{
4248 ++ int cpu;
4249 ++
4250 ++ for_each_possible_cpu(cpu) {
4251 ++ free_vmcs(per_cpu(vmxarea, cpu));
4252 ++ per_cpu(vmxarea, cpu) = NULL;
4253 ++ }
4254 ++}
4255 ++
4256 ++static __init int alloc_kvm_area(void)
4257 ++{
4258 ++ int cpu;
4259 ++
4260 ++ for_each_possible_cpu(cpu) {
4261 ++ struct vmcs *vmcs;
4262 ++
4263 ++ vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
4264 ++ if (!vmcs) {
4265 ++ free_kvm_area();
4266 ++ return -ENOMEM;
4267 ++ }
4268 ++
4269 ++ /*
4270 ++ * When eVMCS is enabled, alloc_vmcs_cpu() sets
4271 ++ * vmcs->revision_id to KVM_EVMCS_VERSION instead of
4272 ++ * revision_id reported by MSR_IA32_VMX_BASIC.
4273 ++ *
4274 ++ * However, even though not explicitly documented by
4275 ++ * TLFS, VMXArea passed as VMXON argument should
4276 ++ * still be marked with revision_id reported by
4277 ++ * physical CPU.
4278 ++ */
4279 ++ if (static_branch_unlikely(&enable_evmcs))
4280 ++ vmcs->hdr.revision_id = vmcs_config.revision_id;
4281 ++
4282 ++ per_cpu(vmxarea, cpu) = vmcs;
4283 ++ }
4284 ++ return 0;
4285 ++}
4286 ++
4287 ++static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
4288 ++ struct kvm_segment *save)
4289 ++{
4290 ++ if (!emulate_invalid_guest_state) {
4291 ++ /*
4292 ++ * CS and SS RPL should be equal during guest entry according
4293 ++ * to VMX spec, but in reality it is not always so. Since vcpu
4294 ++ * is in the middle of the transition from real mode to
4295 ++ * protected mode it is safe to assume that RPL 0 is a good
4296 ++ * default value.
4297 ++ */
4298 ++ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
4299 ++ save->selector &= ~SEGMENT_RPL_MASK;
4300 ++ save->dpl = save->selector & SEGMENT_RPL_MASK;
4301 ++ save->s = 1;
4302 ++ }
4303 ++ vmx_set_segment(vcpu, save, seg);
4304 ++}
4305 ++
4306 ++static void enter_pmode(struct kvm_vcpu *vcpu)
4307 ++{
4308 ++ unsigned long flags;
4309 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
4310 ++
4311 ++ /*
4312 ++ * Update the real mode segment cache. It may not be up to date if a segment
4313 ++ * register was written while the vcpu was in guest mode.
4314 ++ */
4315 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
4316 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
4317 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
4318 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
4319 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
4320 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
4321 ++
4322 ++ vmx->rmode.vm86_active = 0;
4323 ++
4324 ++ vmx_segment_cache_clear(vmx);
4325 ++
4326 ++ vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
4327 ++
4328 ++ flags = vmcs_readl(GUEST_RFLAGS);
4329 ++ flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
4330 ++ flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
4331 ++ vmcs_writel(GUEST_RFLAGS, flags);
4332 ++
4333 ++ vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
4334 ++ (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
4335 ++
4336 ++ update_exception_bitmap(vcpu);
4337 ++
4338 ++ fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
4339 ++ fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
4340 ++ fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
4341 ++ fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
4342 ++ fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
4343 ++ fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
4344 ++}
4345 ++
4346 ++static void fix_rmode_seg(int seg, struct kvm_segment *save)
4347 ++{
4348 ++ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4349 ++ struct kvm_segment var = *save;
4350 ++
4351 ++ var.dpl = 0x3;
4352 ++ if (seg == VCPU_SREG_CS)
4353 ++ var.type = 0x3;
4354 ++
4355 ++ if (!emulate_invalid_guest_state) {
4356 ++ var.selector = var.base >> 4;
4357 ++ var.base = var.base & 0xffff0;
4358 ++ var.limit = 0xffff;
4359 ++ var.g = 0;
4360 ++ var.db = 0;
4361 ++ var.present = 1;
4362 ++ var.s = 1;
4363 ++ var.l = 0;
4364 ++ var.unusable = 0;
4365 ++ var.type = 0x3;
4366 ++ var.avl = 0;
4367 ++ if (save->base & 0xf)
4368 ++ printk_once(KERN_WARNING "kvm: segment base is not "
4369 ++ "paragraph aligned when entering "
4370 ++ "protected mode (seg=%d)", seg);
4371 ++ }
4372 ++
4373 ++ vmcs_write16(sf->selector, var.selector);
4374 ++ vmcs_writel(sf->base, var.base);
4375 ++ vmcs_write32(sf->limit, var.limit);
4376 ++ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
4377 ++}
4378 ++
4379 ++static void enter_rmode(struct kvm_vcpu *vcpu)
4380 ++{
4381 ++ unsigned long flags;
4382 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
4383 ++ struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
4384 ++
4385 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
4386 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
4387 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
4388 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
4389 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
4390 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
4391 ++ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
4392 ++
4393 ++ vmx->rmode.vm86_active = 1;
4394 ++
4395 ++ /*
4396 ++ * Very old userspace does not call KVM_SET_TSS_ADDR before entering
4397 ++ * vcpu. Warn the user that an update is overdue.
4398 ++ */
4399 ++ if (!kvm_vmx->tss_addr)
4400 ++ printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
4401 ++ "called before entering vcpu\n");
4402 ++
4403 ++ vmx_segment_cache_clear(vmx);
4404 ++
4405 ++ vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
4406 ++ vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
4407 ++ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4408 ++
4409 ++ flags = vmcs_readl(GUEST_RFLAGS);
4410 ++ vmx->rmode.save_rflags = flags;
4411 ++
4412 ++ flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
4413 ++
4414 ++ vmcs_writel(GUEST_RFLAGS, flags);
4415 ++ vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
4416 ++ update_exception_bitmap(vcpu);
4417 ++
4418 ++ fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
4419 ++ fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
4420 ++ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
4421 ++ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
4422 ++ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
4423 ++ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
4424 ++
4425 ++ kvm_mmu_reset_context(vcpu);
4426 ++}
4427 ++
4428 ++void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
4429 ++{
4430 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
4431 ++ struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
4432 ++
4433 ++ if (!msr)
4434 ++ return;
4435 ++
4436 ++ vcpu->arch.efer = efer;
4437 ++ if (efer & EFER_LMA) {
4438 ++ vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
4439 ++ msr->data = efer;
4440 ++ } else {
4441 ++ vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
4442 ++
4443 ++ msr->data = efer & ~EFER_LME;
4444 ++ }
4445 ++ setup_msrs(vmx);
4446 ++}
4447 ++
4448 ++#ifdef CONFIG_X86_64
4449 ++
4450 ++static void enter_lmode(struct kvm_vcpu *vcpu)
4451 ++{
4452 ++ u32 guest_tr_ar;
4453 ++
4454 ++ vmx_segment_cache_clear(to_vmx(vcpu));
4455 ++
4456 ++ guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
4457 ++ if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
4458 ++ pr_debug_ratelimited("%s: tss fixup for long mode. \n",
4459 ++ __func__);
4460 ++ vmcs_write32(GUEST_TR_AR_BYTES,
4461 ++ (guest_tr_ar & ~VMX_AR_TYPE_MASK)
4462 ++ | VMX_AR_TYPE_BUSY_64_TSS);
4463 ++ }
4464 ++ vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
4465 ++}
4466 ++
4467 ++static void exit_lmode(struct kvm_vcpu *vcpu)
4468 ++{
4469 ++ vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
4470 ++ vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
4471 ++}
4472 ++
4473 ++#endif
4474 ++
4475 ++static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
4476 ++{
4477 ++ int vpid = to_vmx(vcpu)->vpid;
4478 ++
4479 ++ if (!vpid_sync_vcpu_addr(vpid, addr))
4480 ++ vpid_sync_context(vpid);
4481 ++
4482 ++ /*
4483 ++ * If VPIDs are not supported or enabled, then the above is a no-op.
4484 ++ * But we don't really need a TLB flush in that case anyway, because
4485 ++ * each VM entry/exit includes an implicit flush when VPID is 0.
4486 ++ */
4487 ++}
4488 ++
4489 ++static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
4490 ++{
4491 ++ ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
4492 ++
4493 ++ vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
4494 ++ vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
4495 ++}
4496 ++
4497 ++static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
4498 ++{
4499 ++ ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
4500 ++
4501 ++ vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
4502 ++ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
4503 ++}
4504 ++
4505 ++static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
4506 ++{
4507 ++ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
4508 ++
4509 ++ if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
4510 ++ return;
4511 ++
4512 ++ if (is_pae_paging(vcpu)) {
4513 ++ vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
4514 ++ vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
4515 ++ vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
4516 ++ vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
4517 ++ }
4518 ++}
4519 ++
4520 ++void ept_save_pdptrs(struct kvm_vcpu *vcpu)
4521 ++{
4522 ++ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
4523 ++
4524 ++ if (is_pae_paging(vcpu)) {
4525 ++ mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
4526 ++ mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
4527 ++ mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
4528 ++ mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
4529 ++ }
4530 ++
4531 ++ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
4532 ++}
4533 ++
4534 ++static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
4535 ++ unsigned long cr0,
4536 ++ struct kvm_vcpu *vcpu)
4537 ++{
4538 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
4539 ++
4540 ++ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
4541 ++ vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
4542 ++ if (!(cr0 & X86_CR0_PG)) {
4543 ++ /* From paging/starting to nonpaging */
4544 ++ exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
4545 ++ CPU_BASED_CR3_STORE_EXITING);
4546 ++ vcpu->arch.cr0 = cr0;
4547 ++ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
4548 ++ } else if (!is_paging(vcpu)) {
4549 ++ /* From nonpaging to paging */
4550 ++ exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
4551 ++ CPU_BASED_CR3_STORE_EXITING);
4552 ++ vcpu->arch.cr0 = cr0;
4553 ++ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
4554 ++ }
4555 ++
4556 ++ if (!(cr0 & X86_CR0_WP))
4557 ++ *hw_cr0 &= ~X86_CR0_WP;
4558 ++}
4559 ++
4560 ++void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
4561 ++{
4562 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
4563 ++ unsigned long hw_cr0;
4564 ++
4565 ++ hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
4566 ++ if (enable_unrestricted_guest)
4567 ++ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
4568 ++ else {
4569 ++ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
4570 ++
4571 ++ if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
4572 ++ enter_pmode(vcpu);
4573 ++
4574 ++ if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
4575 ++ enter_rmode(vcpu);
4576 ++ }
4577 ++
4578 ++#ifdef CONFIG_X86_64
4579 ++ if (vcpu->arch.efer & EFER_LME) {
4580 ++ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
4581 ++ enter_lmode(vcpu);
4582 ++ if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
4583 ++ exit_lmode(vcpu);
4584 ++ }
4585 ++#endif
4586 ++
4587 ++ if (enable_ept && !enable_unrestricted_guest)
4588 ++ ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
4589 ++
4590 ++ vmcs_writel(CR0_READ_SHADOW, cr0);
4591 ++ vmcs_writel(GUEST_CR0, hw_cr0);
4592 ++ vcpu->arch.cr0 = cr0;
4593 ++
4594 ++ /* depends on vcpu->arch.cr0 to be set to a new value */
4595 ++ vmx->emulation_required = emulation_required(vcpu);
4596 ++}
4597 ++
4598 ++static int get_ept_level(struct kvm_vcpu *vcpu)
4599 ++{
4600 ++ if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
4601 ++ return 5;
4602 ++ return 4;
4603 ++}
4604 ++
4605 ++u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
4606 ++{
4607 ++ u64 eptp = VMX_EPTP_MT_WB;
4608 ++
4609 ++ eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
4610 ++
4611 ++ if (enable_ept_ad_bits &&
4612 ++ (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
4613 ++ eptp |= VMX_EPTP_AD_ENABLE_BIT;
4614 ++ eptp |= (root_hpa & PAGE_MASK);
4615 ++
4616 ++ return eptp;
4617 ++}
4618 ++
4619 ++void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
4620 ++{
4621 ++ struct kvm *kvm = vcpu->kvm;
4622 ++ bool update_guest_cr3 = true;
4623 ++ unsigned long guest_cr3;
4624 ++ u64 eptp;
4625 ++
4626 ++ guest_cr3 = cr3;
4627 ++ if (enable_ept) {
4628 ++ eptp = construct_eptp(vcpu, cr3);
4629 ++ vmcs_write64(EPT_POINTER, eptp);
4630 ++
4631 ++ if (kvm_x86_ops->tlb_remote_flush) {
4632 ++ spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
4633 ++ to_vmx(vcpu)->ept_pointer = eptp;
4634 ++ to_kvm_vmx(kvm)->ept_pointers_match
4635 ++ = EPT_POINTERS_CHECK;
4636 ++ spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
4637 ++ }
4638 ++
4639 ++ /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
4640 ++ if (is_guest_mode(vcpu))
4641 ++ update_guest_cr3 = false;
4642 ++ else if (!enable_unrestricted_guest && !is_paging(vcpu))
4643 ++ guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
4644 ++ else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
4645 ++ guest_cr3 = vcpu->arch.cr3;
4646 ++ else /* vmcs01.GUEST_CR3 is already up-to-date. */
4647 ++ update_guest_cr3 = false;
4648 ++ ept_load_pdptrs(vcpu);
4649 ++ }
4650 ++
4651 ++ if (update_guest_cr3)
4652 ++ vmcs_writel(GUEST_CR3, guest_cr3);
4653 ++}
4654 ++
4655 ++int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
4656 ++{
4657 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
4658 ++ /*
4659 ++ * Pass through host's Machine Check Enable value to hw_cr4, which
4660 ++ * is in force while we are in guest mode. Do not let guests control
4661 ++ * this bit, even if host CR4.MCE == 0.
4662 ++ */
4663 ++ unsigned long hw_cr4;
4664 ++
4665 ++ hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
4666 ++ if (enable_unrestricted_guest)
4667 ++ hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
4668 ++ else if (vmx->rmode.vm86_active)
4669 ++ hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
4670 ++ else
4671 ++ hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
4672 ++
4673 ++ if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
4674 ++ if (cr4 & X86_CR4_UMIP) {
4675 ++ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
4676 ++ hw_cr4 &= ~X86_CR4_UMIP;
4677 ++ } else if (!is_guest_mode(vcpu) ||
4678 ++ !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
4679 ++ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
4680 ++ }
4681 ++ }
4682 ++
4683 ++ if (cr4 & X86_CR4_VMXE) {
4684 ++ /*
4685 ++ * To use VMXON (and later other VMX instructions), a guest
4686 ++ * must first be able to turn on cr4.VMXE (see handle_vmon()).
4687 ++ * So basically the check on whether to allow nested VMX
4688 ++ * is here. We operate under the default treatment of SMM,
4689 ++ * so VMX cannot be enabled under SMM.
4690 ++ */
4691 ++ if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
4692 ++ return 1;
4693 ++ }
4694 ++
4695 ++ if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
4696 ++ return 1;
4697 ++
4698 ++ vcpu->arch.cr4 = cr4;
4699 ++
4700 ++ if (!enable_unrestricted_guest) {
4701 ++ if (enable_ept) {
4702 ++ if (!is_paging(vcpu)) {
4703 ++ hw_cr4 &= ~X86_CR4_PAE;
4704 ++ hw_cr4 |= X86_CR4_PSE;
4705 ++ } else if (!(cr4 & X86_CR4_PAE)) {
4706 ++ hw_cr4 &= ~X86_CR4_PAE;
4707 ++ }
4708 ++ }
4709 ++
4710 ++ /*
4711 ++ * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
4712 ++ * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
4713 ++ * to be manually disabled when guest switches to non-paging
4714 ++ * mode.
4715 ++ *
4716 ++ * If !enable_unrestricted_guest, the CPU is always running
4717 ++ * with CR0.PG=1 and CR4 needs to be modified.
4718 ++ * If enable_unrestricted_guest, the CPU automatically
4719 ++ * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
4720 ++ */
4721 ++ if (!is_paging(vcpu))
4722 ++ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
4723 ++ }
4724 ++
4725 ++ vmcs_writel(CR4_READ_SHADOW, cr4);
4726 ++ vmcs_writel(GUEST_CR4, hw_cr4);
4727 ++ return 0;
4728 ++}
4729 ++
4730 ++void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
4731 ++{
4732 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
4733 ++ u32 ar;
4734 ++
4735 ++ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
4736 ++ *var = vmx->rmode.segs[seg];
4737 ++ if (seg == VCPU_SREG_TR
4738 ++ || var->selector == vmx_read_guest_seg_selector(vmx, seg))
4739 ++ return;
4740 ++ var->base = vmx_read_guest_seg_base(vmx, seg);
4741 ++ var->selector = vmx_read_guest_seg_selector(vmx, seg);
4742 ++ return;
4743 ++ }
4744 ++ var->base = vmx_read_guest_seg_base(vmx, seg);
4745 ++ var->limit = vmx_read_guest_seg_limit(vmx, seg);
4746 ++ var->selector = vmx_read_guest_seg_selector(vmx, seg);
4747 ++ ar = vmx_read_guest_seg_ar(vmx, seg);
4748 ++ var->unusable = (ar >> 16) & 1;
4749 ++ var->type = ar & 15;
4750 ++ var->s = (ar >> 4) & 1;
4751 ++ var->dpl = (ar >> 5) & 3;
4752 ++ /*
4753 ++ * Some userspaces do not preserve the unusable property. Since a usable
4754 ++ * segment has to be present according to the VMX spec, we can use the
4755 ++ * present property to work around the userspace bug by making an unusable
4756 ++ * segment always non-present. vmx_segment_access_rights() already marks
4757 ++ * a non-present segment as unusable.
4758 ++ */
4759 ++ var->present = !var->unusable;
4760 ++ var->avl = (ar >> 12) & 1;
4761 ++ var->l = (ar >> 13) & 1;
4762 ++ var->db = (ar >> 14) & 1;
4763 ++ var->g = (ar >> 15) & 1;
4764 ++}
4765 ++
4766 ++static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
4767 ++{
4768 ++ struct kvm_segment s;
4769 ++
4770 ++ if (to_vmx(vcpu)->rmode.vm86_active) {
4771 ++ vmx_get_segment(vcpu, &s, seg);
4772 ++ return s.base;
4773 ++ }
4774 ++ return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
4775 ++}
4776 ++
4777 ++int vmx_get_cpl(struct kvm_vcpu *vcpu)
4778 ++{
4779 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
4780 ++
4781 ++ if (unlikely(vmx->rmode.vm86_active))
4782 ++ return 0;
4783 ++ else {
4784 ++ int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
4785 ++ return VMX_AR_DPL(ar);
4786 ++ }
4787 ++}
4788 ++
4789 ++static u32 vmx_segment_access_rights(struct kvm_segment *var)
4790 ++{
4791 ++ u32 ar;
4792 ++
4793 ++ if (var->unusable || !var->present)
4794 ++ ar = 1 << 16;
4795 ++ else {
4796 ++ ar = var->type & 15;
4797 ++ ar |= (var->s & 1) << 4;
4798 ++ ar |= (var->dpl & 3) << 5;
4799 ++ ar |= (var->present & 1) << 7;
4800 ++ ar |= (var->avl & 1) << 12;
4801 ++ ar |= (var->l & 1) << 13;
4802 ++ ar |= (var->db & 1) << 14;
4803 ++ ar |= (var->g & 1) << 15;
4804 ++ }
4805 ++
4806 ++ return ar;
4807 ++}
4808 ++
4809 ++void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
4810 ++{
4811 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
4812 ++ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4813 ++
4814 ++ vmx_segment_cache_clear(vmx);
4815 ++
4816 ++ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
4817 ++ vmx->rmode.segs[seg] = *var;
4818 ++ if (seg == VCPU_SREG_TR)
4819 ++ vmcs_write16(sf->selector, var->selector);
4820 ++ else if (var->s)
4821 ++ fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
4822 ++ goto out;
4823 ++ }
4824 ++
4825 ++ vmcs_writel(sf->base, var->base);
4826 ++ vmcs_write32(sf->limit, var->limit);
4827 ++ vmcs_write16(sf->selector, var->selector);
4828 ++
4829 ++ /*
4830 ++ * Fix the "Accessed" bit in AR field of segment registers for older
4831 ++ * qemu binaries.
4832 ++ * IA32 arch specifies that at the time of processor reset the
4833 ++ * "Accessed" bit in the AR field of segment registers is 1. And qemu
4834 ++ * is setting it to 0 in the userland code. This causes invalid guest
4835 ++ * state vmexit when "unrestricted guest" mode is turned on.
4836 ++ * Fix for this setup issue in cpu_reset is being pushed in the qemu
4837 ++ * tree. Newer qemu binaries with that qemu fix would not need this
4838 ++ * kvm hack.
4839 ++ */
4840 ++ if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
4841 ++ var->type |= 0x1; /* Accessed */
4842 ++
4843 ++ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
4844 ++
4845 ++out:
4846 ++ vmx->emulation_required = emulation_required(vcpu);
4847 ++}
4848 ++
4849 ++static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4850 ++{
4851 ++ u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
4852 ++
4853 ++ *db = (ar >> 14) & 1;
4854 ++ *l = (ar >> 13) & 1;
4855 ++}
4856 ++
4857 ++static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4858 ++{
4859 ++ dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
4860 ++ dt->address = vmcs_readl(GUEST_IDTR_BASE);
4861 ++}
4862 ++
4863 ++static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4864 ++{
4865 ++ vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
4866 ++ vmcs_writel(GUEST_IDTR_BASE, dt->address);
4867 ++}
4868 ++
4869 ++static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4870 ++{
4871 ++ dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
4872 ++ dt->address = vmcs_readl(GUEST_GDTR_BASE);
4873 ++}
4874 ++
4875 ++static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4876 ++{
4877 ++ vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
4878 ++ vmcs_writel(GUEST_GDTR_BASE, dt->address);
4879 ++}
4880 ++
4881 ++static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
4882 ++{
4883 ++ struct kvm_segment var;
4884 ++ u32 ar;
4885 ++
4886 ++ vmx_get_segment(vcpu, &var, seg);
4887 ++ var.dpl = 0x3;
4888 ++ if (seg == VCPU_SREG_CS)
4889 ++ var.type = 0x3;
4890 ++ ar = vmx_segment_access_rights(&var);
4891 ++
4892 ++ if (var.base != (var.selector << 4))
4893 ++ return false;
4894 ++ if (var.limit != 0xffff)
4895 ++ return false;
4896 ++ if (ar != 0xf3)
4897 ++ return false;
4898 ++
4899 ++ return true;
4900 ++}
4901 ++
4902 ++static bool code_segment_valid(struct kvm_vcpu *vcpu)
4903 ++{
4904 ++ struct kvm_segment cs;
4905 ++ unsigned int cs_rpl;
4906 ++
4907 ++ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4908 ++ cs_rpl = cs.selector & SEGMENT_RPL_MASK;
4909 ++
4910 ++ if (cs.unusable)
4911 ++ return false;
4912 ++ if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
4913 ++ return false;
4914 ++ if (!cs.s)
4915 ++ return false;
4916 ++ if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
4917 ++ if (cs.dpl > cs_rpl)
4918 ++ return false;
4919 ++ } else {
4920 ++ if (cs.dpl != cs_rpl)
4921 ++ return false;
4922 ++ }
4923 ++ if (!cs.present)
4924 ++ return false;
4925 ++
4926 ++ /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
4927 ++ return true;
4928 ++}
4929 ++
4930 ++static bool stack_segment_valid(struct kvm_vcpu *vcpu)
4931 ++{
4932 ++ struct kvm_segment ss;
4933 ++ unsigned int ss_rpl;
4934 ++
4935 ++ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
4936 ++ ss_rpl = ss.selector & SEGMENT_RPL_MASK;
4937 ++
4938 ++ if (ss.unusable)
4939 ++ return true;
4940 ++ if (ss.type != 3 && ss.type != 7)
4941 ++ return false;
4942 ++ if (!ss.s)
4943 ++ return false;
4944 ++ if (ss.dpl != ss_rpl) /* DPL != RPL */
4945 ++ return false;
4946 ++ if (!ss.present)
4947 ++ return false;
4948 ++
4949 ++ return true;
4950 ++}
4951 ++
4952 ++static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
4953 ++{
4954 ++ struct kvm_segment var;
4955 ++ unsigned int rpl;
4956 ++
4957 ++ vmx_get_segment(vcpu, &var, seg);
4958 ++ rpl = var.selector & SEGMENT_RPL_MASK;
4959 ++
4960 ++ if (var.unusable)
4961 ++ return true;
4962 ++ if (!var.s)
4963 ++ return false;
4964 ++ if (!var.present)
4965 ++ return false;
4966 ++ if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
4967 ++ if (var.dpl < rpl) /* DPL < RPL */
4968 ++ return false;
4969 ++ }
4970 ++
4971 ++ /* TODO: Add other members to kvm_segment_field to allow checking for other access
4972 ++ * rights flags
4973 ++ */
4974 ++ return true;
4975 ++}
4976 ++
4977 ++static bool tr_valid(struct kvm_vcpu *vcpu)
4978 ++{
4979 ++ struct kvm_segment tr;
4980 ++
4981 ++ vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
4982 ++
4983 ++ if (tr.unusable)
4984 ++ return false;
4985 ++ if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
4986 ++ return false;
4987 ++ if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
4988 ++ return false;
4989 ++ if (!tr.present)
4990 ++ return false;
4991 ++
4992 ++ return true;
4993 ++}
4994 ++
4995 ++static bool ldtr_valid(struct kvm_vcpu *vcpu)
4996 ++{
4997 ++ struct kvm_segment ldtr;
4998 ++
4999 ++ vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
5000 ++
5001 ++ if (ldtr.unusable)
5002 ++ return true;
5003 ++ if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
5004 ++ return false;
5005 ++ if (ldtr.type != 2)
5006 ++ return false;
5007 ++ if (!ldtr.present)
5008 ++ return false;
5009 ++
5010 ++ return true;
5011 ++}
5012 ++
5013 ++static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
5014 ++{
5015 ++ struct kvm_segment cs, ss;
5016 ++
5017 ++ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
5018 ++ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
5019 ++
5020 ++ return ((cs.selector & SEGMENT_RPL_MASK) ==
5021 ++ (ss.selector & SEGMENT_RPL_MASK));
5022 ++}
5023 ++
5024 ++/*
5025 ++ * Check if guest state is valid. Returns true if valid, false if
5026 ++ * not.
5027 ++ * We assume that registers are always usable
5028 ++ */
5029 ++static bool guest_state_valid(struct kvm_vcpu *vcpu)
5030 ++{
5031 ++ if (enable_unrestricted_guest)
5032 ++ return true;
5033 ++
5034 ++ /* real mode guest state checks */
5035 ++ if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
5036 ++ if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
5037 ++ return false;
5038 ++ if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
5039 ++ return false;
5040 ++ if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
5041 ++ return false;
5042 ++ if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
5043 ++ return false;
5044 ++ if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
5045 ++ return false;
5046 ++ if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
5047 ++ return false;
5048 ++ } else {
5049 ++ /* protected mode guest state checks */
5050 ++ if (!cs_ss_rpl_check(vcpu))
5051 ++ return false;
5052 ++ if (!code_segment_valid(vcpu))
5053 ++ return false;
5054 ++ if (!stack_segment_valid(vcpu))
5055 ++ return false;
5056 ++ if (!data_segment_valid(vcpu, VCPU_SREG_DS))
5057 ++ return false;
5058 ++ if (!data_segment_valid(vcpu, VCPU_SREG_ES))
5059 ++ return false;
5060 ++ if (!data_segment_valid(vcpu, VCPU_SREG_FS))
5061 ++ return false;
5062 ++ if (!data_segment_valid(vcpu, VCPU_SREG_GS))
5063 ++ return false;
5064 ++ if (!tr_valid(vcpu))
5065 ++ return false;
5066 ++ if (!ldtr_valid(vcpu))
5067 ++ return false;
5068 ++ }
5069 ++ /* TODO:
5070 ++ * - Add checks on RIP
5071 ++ * - Add checks on RFLAGS
5072 ++ */
5073 ++
5074 ++ return true;
5075 ++}
5076 ++
5077 ++static int init_rmode_tss(struct kvm *kvm)
5078 ++{
5079 ++ gfn_t fn;
5080 ++ u16 data = 0;
5081 ++ int idx, r;
5082 ++
5083 ++ idx = srcu_read_lock(&kvm->srcu);
5084 ++ fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
5085 ++ r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
5086 ++ if (r < 0)
5087 ++ goto out;
5088 ++ data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
5089 ++ r = kvm_write_guest_page(kvm, fn++, &data,
5090 ++ TSS_IOPB_BASE_OFFSET, sizeof(u16));
5091 ++ if (r < 0)
5092 ++ goto out;
5093 ++ r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
5094 ++ if (r < 0)
5095 ++ goto out;
5096 ++ r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
5097 ++ if (r < 0)
5098 ++ goto out;
5099 ++ data = ~0;
5100 ++ r = kvm_write_guest_page(kvm, fn, &data,
5101 ++ RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
5102 ++ sizeof(u8));
5103 ++out:
5104 ++ srcu_read_unlock(&kvm->srcu, idx);
5105 ++ return r;
5106 ++}
5107 ++
5108 ++static int init_rmode_identity_map(struct kvm *kvm)
5109 ++{
5110 ++ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
5111 ++ int i, idx, r = 0;
5112 ++ kvm_pfn_t identity_map_pfn;
5113 ++ u32 tmp;
5114 ++
5115 ++ /* Protect kvm_vmx->ept_identity_pagetable_done. */
5116 ++ mutex_lock(&kvm->slots_lock);
5117 ++
5118 ++ if (likely(kvm_vmx->ept_identity_pagetable_done))
5119 ++ goto out2;
5120 ++
5121 ++ if (!kvm_vmx->ept_identity_map_addr)
5122 ++ kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
5123 ++ identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
5124 ++
5125 ++ r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
5126 ++ kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
5127 ++ if (r < 0)
5128 ++ goto out2;
5129 ++
5130 ++ idx = srcu_read_lock(&kvm->srcu);
5131 ++ r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
5132 ++ if (r < 0)
5133 ++ goto out;
5134 ++ /* Set up identity-mapping pagetable for EPT in real mode */
5135 ++ for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
5136 ++ tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
5137 ++ _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
5138 ++ r = kvm_write_guest_page(kvm, identity_map_pfn,
5139 ++ &tmp, i * sizeof(tmp), sizeof(tmp));
5140 ++ if (r < 0)
5141 ++ goto out;
5142 ++ }
5143 ++ kvm_vmx->ept_identity_pagetable_done = true;
5144 ++
5145 ++out:
5146 ++ srcu_read_unlock(&kvm->srcu, idx);
5147 ++
5148 ++out2:
5149 ++ mutex_unlock(&kvm->slots_lock);
5150 ++ return r;
5151 ++}
5152 ++
5153 ++static void seg_setup(int seg)
5154 ++{
5155 ++ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
5156 ++ unsigned int ar;
5157 ++
5158 ++ vmcs_write16(sf->selector, 0);
5159 ++ vmcs_writel(sf->base, 0);
5160 ++ vmcs_write32(sf->limit, 0xffff);
5161 ++ ar = 0x93;
5162 ++ if (seg == VCPU_SREG_CS)
5163 ++ ar |= 0x08; /* code segment */
5164 ++
5165 ++ vmcs_write32(sf->ar_bytes, ar);
5166 ++}
5167 ++
5168 ++static int alloc_apic_access_page(struct kvm *kvm)
5169 ++{
5170 ++ struct page *page;
5171 ++ int r = 0;
5172 ++
5173 ++ mutex_lock(&kvm->slots_lock);
5174 ++ if (kvm->arch.apic_access_page_done)
5175 ++ goto out;
5176 ++ r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
5177 ++ APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
5178 ++ if (r)
5179 ++ goto out;
5180 ++
5181 ++ page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
5182 ++ if (is_error_page(page)) {
5183 ++ r = -EFAULT;
5184 ++ goto out;
5185 ++ }
5186 ++
5187 ++ /*
5188 ++ * Do not pin the page in memory, so that memory hot-unplug
5189 ++ * is able to migrate it.
5190 ++ */
5191 ++ put_page(page);
5192 ++ kvm->arch.apic_access_page_done = true;
5193 ++out:
5194 ++ mutex_unlock(&kvm->slots_lock);
5195 ++ return r;
5196 ++}
5197 ++
5198 ++int allocate_vpid(void)
5199 ++{
5200 ++ int vpid;
5201 ++
5202 ++ if (!enable_vpid)
5203 ++ return 0;
5204 ++ spin_lock(&vmx_vpid_lock);
5205 ++ vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
5206 ++ if (vpid < VMX_NR_VPIDS)
5207 ++ __set_bit(vpid, vmx_vpid_bitmap);
5208 ++ else
5209 ++ vpid = 0;
5210 ++ spin_unlock(&vmx_vpid_lock);
5211 ++ return vpid;
5212 ++}
5213 ++
5214 ++void free_vpid(int vpid)
5215 ++{
5216 ++ if (!enable_vpid || vpid == 0)
5217 ++ return;
5218 ++ spin_lock(&vmx_vpid_lock);
5219 ++ __clear_bit(vpid, vmx_vpid_bitmap);
5220 ++ spin_unlock(&vmx_vpid_lock);
5221 ++}
5222 ++
5223 ++static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
5224 ++ u32 msr, int type)
5225 ++{
5226 ++ int f = sizeof(unsigned long);
5227 ++
5228 ++ if (!cpu_has_vmx_msr_bitmap())
5229 ++ return;
5230 ++
5231 ++ if (static_branch_unlikely(&enable_evmcs))
5232 ++ evmcs_touch_msr_bitmap();
5233 ++
5234 ++ /*
5235 ++ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
5236 ++ * have the write-low and read-high bitmap offsets the wrong way round.
5237 ++ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
5238 ++ */
5239 ++ if (msr <= 0x1fff) {
5240 ++ if (type & MSR_TYPE_R)
5241 ++ /* read-low */
5242 ++ __clear_bit(msr, msr_bitmap + 0x000 / f);
5243 ++
5244 ++ if (type & MSR_TYPE_W)
5245 ++ /* write-low */
5246 ++ __clear_bit(msr, msr_bitmap + 0x800 / f);
5247 ++
5248 ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
5249 ++ msr &= 0x1fff;
5250 ++ if (type & MSR_TYPE_R)
5251 ++ /* read-high */
5252 ++ __clear_bit(msr, msr_bitmap + 0x400 / f);
5253 ++
5254 ++ if (type & MSR_TYPE_W)
5255 ++ /* write-high */
5256 ++ __clear_bit(msr, msr_bitmap + 0xc00 / f);
5257 ++
5258 ++ }
5259 ++}
5260 ++
5261 ++static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
5262 ++ u32 msr, int type)
5263 ++{
5264 ++ int f = sizeof(unsigned long);
5265 ++
5266 ++ if (!cpu_has_vmx_msr_bitmap())
5267 ++ return;
5268 ++
5269 ++ if (static_branch_unlikely(&enable_evmcs))
5270 ++ evmcs_touch_msr_bitmap();
5271 ++
5272 ++ /*
5273 ++ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
5274 ++ * have the write-low and read-high bitmap offsets the wrong way round.
5275 ++ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
5276 ++ */
5277 ++ if (msr <= 0x1fff) {
5278 ++ if (type & MSR_TYPE_R)
5279 ++ /* read-low */
5280 ++ __set_bit(msr, msr_bitmap + 0x000 / f);
5281 ++
5282 ++ if (type & MSR_TYPE_W)
5283 ++ /* write-low */
5284 ++ __set_bit(msr, msr_bitmap + 0x800 / f);
5285 ++
5286 ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
5287 ++ msr &= 0x1fff;
5288 ++ if (type & MSR_TYPE_R)
5289 ++ /* read-high */
5290 ++ __set_bit(msr, msr_bitmap + 0x400 / f);
5291 ++
5292 ++ if (type & MSR_TYPE_W)
5293 ++ /* write-high */
5294 ++ __set_bit(msr, msr_bitmap + 0xc00 / f);
5295 ++
5296 ++ }
5297 ++}
5298 ++
5299 ++static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
5300 ++ u32 msr, int type, bool value)
5301 ++{
5302 ++ if (value)
5303 ++ vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
5304 ++ else
5305 ++ vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
5306 ++}
5307 ++
5308 ++static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
5309 ++{
5310 ++ u8 mode = 0;
5311 ++
5312 ++ if (cpu_has_secondary_exec_ctrls() &&
5313 ++ (secondary_exec_controls_get(to_vmx(vcpu)) &
5314 ++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
5315 ++ mode |= MSR_BITMAP_MODE_X2APIC;
5316 ++ if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
5317 ++ mode |= MSR_BITMAP_MODE_X2APIC_APICV;
5318 ++ }
5319 ++
5320 ++ return mode;
5321 ++}
5322 ++
5323 ++static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
5324 ++ u8 mode)
5325 ++{
5326 ++ int msr;
5327 ++
5328 ++ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
5329 ++ unsigned word = msr / BITS_PER_LONG;
5330 ++ msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
5331 ++ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
5332 ++ }
5333 ++
5334 ++ if (mode & MSR_BITMAP_MODE_X2APIC) {
5335 ++ /*
5336 ++ * TPR reads and writes can be virtualized even if virtual interrupt
5337 ++ * delivery is not in use.
5338 ++ */
5339 ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
5340 ++ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
5341 ++ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
5342 ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
5343 ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
5344 ++ }
5345 ++ }
5346 ++}
5347 ++
5348 ++void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
5349 ++{
5350 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
5351 ++ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
5352 ++ u8 mode = vmx_msr_bitmap_mode(vcpu);
5353 ++ u8 changed = mode ^ vmx->msr_bitmap_mode;
5354 ++
5355 ++ if (!changed)
5356 ++ return;
5357 ++
5358 ++ if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
5359 ++ vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
5360 ++
5361 ++ vmx->msr_bitmap_mode = mode;
5362 ++}
5363 ++
5364 ++void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
5365 ++{
5366 ++ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
5367 ++ bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
5368 ++ u32 i;
5369 ++
5370 ++ vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
5371 ++ MSR_TYPE_RW, flag);
5372 ++ vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
5373 ++ MSR_TYPE_RW, flag);
5374 ++ vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
5375 ++ MSR_TYPE_RW, flag);
5376 ++ vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
5377 ++ MSR_TYPE_RW, flag);
5378 ++ for (i = 0; i < vmx->pt_desc.addr_range; i++) {
5379 ++ vmx_set_intercept_for_msr(msr_bitmap,
5380 ++ MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
5381 ++ vmx_set_intercept_for_msr(msr_bitmap,
5382 ++ MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
5383 ++ }
5384 ++}
5385 ++
5386 ++static bool vmx_get_enable_apicv(struct kvm *kvm)
5387 ++{
5388 ++ return enable_apicv;
5389 ++}
5390 ++
5391 ++static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
5392 ++{
5393 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
5394 ++ void *vapic_page;
5395 ++ u32 vppr;
5396 ++ int rvi;
5397 ++
5398 ++ if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
5399 ++ !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
5400 ++ WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
5401 ++ return false;
5402 ++
5403 ++ rvi = vmx_get_rvi();
5404 ++
5405 ++ vapic_page = vmx->nested.virtual_apic_map.hva;
5406 ++ vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
5407 ++
5408 ++ return ((rvi & 0xf0) > (vppr & 0xf0));
5409 ++}
5410 ++
5411 ++static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
5412 ++ bool nested)
5413 ++{
5414 ++#ifdef CONFIG_SMP
5415 ++ int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
5416 ++
5417 ++ if (vcpu->mode == IN_GUEST_MODE) {
5418 ++ /*
5419 ++ * The vector of interrupt to be delivered to vcpu had
5420 ++ * been set in PIR before this function.
5421 ++ *
5422 ++ * Following cases will be reached in this block, and
5423 ++ * we always send a notification event in all cases as
5424 ++ * explained below.
5425 ++ *
5426 ++ * Case 1: vcpu keeps in non-root mode. Sending a
5427 ++ * notification event posts the interrupt to vcpu.
5428 ++ *
5429 ++ * Case 2: vcpu exits to root mode and is still
5430 ++ * runnable. PIR will be synced to vIRR before the
5431 ++ * next vcpu entry. Sending a notification event in
5432 ++ * this case has no effect, as vcpu is not in root
5433 ++ * mode.
5434 ++ *
5435 ++ * Case 3: vcpu exits to root mode and is blocked.
5436 ++ * vcpu_block() has already synced PIR to vIRR and
5437 ++ * never blocks vcpu if vIRR is not cleared. Therefore,
5438 ++ * a blocked vcpu here does not wait for any requested
5439 ++ * interrupts in PIR, and sending a notification event
5440 ++ * which has no effect is safe here.
5441 ++ */
5442 ++
5443 ++ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
5444 ++ return true;
5445 ++ }
5446 ++#endif
5447 ++ return false;
5448 ++}
5449 ++
5450 ++static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
5451 ++ int vector)
5452 ++{
5453 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
5454 ++
5455 ++ if (is_guest_mode(vcpu) &&
5456 ++ vector == vmx->nested.posted_intr_nv) {
5457 ++ /*
5458 ++ * If a posted intr is not recognized by hardware,
5459 ++ * we will accomplish it in the next vmentry.
5460 ++ */
5461 ++ vmx->nested.pi_pending = true;
5462 ++ kvm_make_request(KVM_REQ_EVENT, vcpu);
5463 ++ /* the PIR and ON have been set by L1. */
5464 ++ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
5465 ++ kvm_vcpu_kick(vcpu);
5466 ++ return 0;
5467 ++ }
5468 ++ return -1;
5469 ++}
5470 ++/*
5471 ++ * Send interrupt to vcpu via posted interrupt way.
5472 ++ * 1. If target vcpu is running(non-root mode), send posted interrupt
5473 ++ * notification to vcpu and hardware will sync PIR to vIRR atomically.
5474 ++ * 2. If target vcpu isn't running(root mode), kick it to pick up the
5475 ++ * interrupt from PIR in next vmentry.
5476 ++ */
5477 ++static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
5478 ++{
5479 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
5480 ++ int r;
5481 ++
5482 ++ r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
5483 ++ if (!r)
5484 ++ return;
5485 ++
5486 ++ if (pi_test_and_set_pir(vector, &vmx->pi_desc))
5487 ++ return;
5488 ++
5489 ++ /* If a previous notification has sent the IPI, nothing to do. */
5490 ++ if (pi_test_and_set_on(&vmx->pi_desc))
5491 ++ return;
5492 ++
5493 ++ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
5494 ++ kvm_vcpu_kick(vcpu);
5495 ++}
5496 ++
5497 ++/*
5498 ++ * Set up the vmcs's constant host-state fields, i.e., host-state fields that
5499 ++ * will not change in the lifetime of the guest.
5500 ++ * Note that host-state that does change is set elsewhere. E.g., host-state
5501 ++ * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
5502 ++ */
5503 ++void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
5504 ++{
5505 ++ u32 low32, high32;
5506 ++ unsigned long tmpl;
5507 ++ unsigned long cr0, cr3, cr4;
5508 ++
5509 ++ cr0 = read_cr0();
5510 ++ WARN_ON(cr0 & X86_CR0_TS);
5511 ++ vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
5512 ++
5513 ++ /*
5514 ++ * Save the most likely value for this task's CR3 in the VMCS.
5515 ++ * We can't use __get_current_cr3_fast() because we're not atomic.
5516 ++ */
5517 ++ cr3 = __read_cr3();
5518 ++ vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
5519 ++ vmx->loaded_vmcs->host_state.cr3 = cr3;
5520 ++
5521 ++ /* Save the most likely value for this task's CR4 in the VMCS. */
5522 ++ cr4 = cr4_read_shadow();
5523 ++ vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
5524 ++ vmx->loaded_vmcs->host_state.cr4 = cr4;
5525 ++
5526 ++ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
5527 ++#ifdef CONFIG_X86_64
5528 ++ /*
5529 ++ * Load null selectors, so we can avoid reloading them in
5530 ++ * vmx_prepare_switch_to_host(), in case userspace uses
5531 ++ * the null selectors too (the expected case).
5532 ++ */
5533 ++ vmcs_write16(HOST_DS_SELECTOR, 0);
5534 ++ vmcs_write16(HOST_ES_SELECTOR, 0);
5535 ++#else
5536 ++ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
5537 ++ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
5538 ++#endif
5539 ++ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
5540 ++ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
5541 ++
5542 ++ vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */
5543 ++
5544 ++ vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
5545 ++
5546 ++ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
5547 ++ vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
5548 ++ rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
5549 ++ vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
5550 ++
5551 ++ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
5552 ++ rdmsr(MSR_IA32_CR_PAT, low32, high32);
5553 ++ vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
5554 ++ }
5555 ++
5556 ++ if (cpu_has_load_ia32_efer())
5557 ++ vmcs_write64(HOST_IA32_EFER, host_efer);
5558 ++}
5559 ++
5560 ++void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
5561 ++{
5562 ++ vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
5563 ++ if (enable_ept)
5564 ++ vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
5565 ++ if (is_guest_mode(&vmx->vcpu))
5566 ++ vmx->vcpu.arch.cr4_guest_owned_bits &=
5567 ++ ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
5568 ++ vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
5569 ++}
5570 ++
5571 ++u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
5572 ++{
5573 ++ u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
5574 ++
5575 ++ if (!kvm_vcpu_apicv_active(&vmx->vcpu))
5576 ++ pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
5577 ++
5578 ++ if (!enable_vnmi)
5579 ++ pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
5580 ++
5581 ++ if (!enable_preemption_timer)
5582 ++ pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
5583 ++
5584 ++ return pin_based_exec_ctrl;
5585 ++}
5586 ++
5587 ++static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
5588 ++{
5589 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
5590 ++
5591 ++ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
5592 ++ if (cpu_has_secondary_exec_ctrls()) {
5593 ++ if (kvm_vcpu_apicv_active(vcpu))
5594 ++ secondary_exec_controls_setbit(vmx,
5595 ++ SECONDARY_EXEC_APIC_REGISTER_VIRT |
5596 ++ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
5597 ++ else
5598 ++ secondary_exec_controls_clearbit(vmx,
5599 ++ SECONDARY_EXEC_APIC_REGISTER_VIRT |
5600 ++ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
5601 ++ }
5602 ++
5603 ++ if (cpu_has_vmx_msr_bitmap())
5604 ++ vmx_update_msr_bitmap(vcpu);
5605 ++}
5606 ++
5607 ++u32 vmx_exec_control(struct vcpu_vmx *vmx)
5608 ++{
5609 ++ u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
5610 ++
5611 ++ if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
5612 ++ exec_control &= ~CPU_BASED_MOV_DR_EXITING;
5613 ++
5614 ++ if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
5615 ++ exec_control &= ~CPU_BASED_TPR_SHADOW;
5616 ++#ifdef CONFIG_X86_64
5617 ++ exec_control |= CPU_BASED_CR8_STORE_EXITING |
5618 ++ CPU_BASED_CR8_LOAD_EXITING;
5619 ++#endif
5620 ++ }
5621 ++ if (!enable_ept)
5622 ++ exec_control |= CPU_BASED_CR3_STORE_EXITING |
5623 ++ CPU_BASED_CR3_LOAD_EXITING |
5624 ++ CPU_BASED_INVLPG_EXITING;
5625 ++ if (kvm_mwait_in_guest(vmx->vcpu.kvm))
5626 ++ exec_control &= ~(CPU_BASED_MWAIT_EXITING |
5627 ++ CPU_BASED_MONITOR_EXITING);
5628 ++ if (kvm_hlt_in_guest(vmx->vcpu.kvm))
5629 ++ exec_control &= ~CPU_BASED_HLT_EXITING;
5630 ++ return exec_control;
5631 ++}
5632 ++
5633 ++
5634 ++static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
5635 ++{
5636 ++ struct kvm_vcpu *vcpu = &vmx->vcpu;
5637 ++
5638 ++ u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
5639 ++
5640 ++ if (pt_mode == PT_MODE_SYSTEM)
5641 ++ exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
5642 ++ if (!cpu_need_virtualize_apic_accesses(vcpu))
5643 ++ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
5644 ++ if (vmx->vpid == 0)
5645 ++ exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
5646 ++ if (!enable_ept) {
5647 ++ exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
5648 ++ enable_unrestricted_guest = 0;
5649 ++ }
5650 ++ if (!enable_unrestricted_guest)
5651 ++ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
5652 ++ if (kvm_pause_in_guest(vmx->vcpu.kvm))
5653 ++ exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
5654 ++ if (!kvm_vcpu_apicv_active(vcpu))
5655 ++ exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
5656 ++ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
5657 ++ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
5658 ++
5659 ++ /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
5660 ++ * in vmx_set_cr4. */
5661 ++ exec_control &= ~SECONDARY_EXEC_DESC;
5662 ++
5663 ++ /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
5664 ++ (handle_vmptrld).
5665 ++ We can NOT enable shadow_vmcs here because we don't have yet
5666 ++ a current VMCS12
5667 ++ */
5668 ++ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
5669 ++
5670 ++ if (!enable_pml)
5671 ++ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
5672 ++
5673 ++ if (vmx_xsaves_supported()) {
5674 ++ /* Exposing XSAVES only when XSAVE is exposed */
5675 ++ bool xsaves_enabled =
5676 ++ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
5677 ++ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
5678 ++
5679 ++ vcpu->arch.xsaves_enabled = xsaves_enabled;
5680 ++
5681 ++ if (!xsaves_enabled)
5682 ++ exec_control &= ~SECONDARY_EXEC_XSAVES;
5683 ++
5684 ++ if (nested) {
5685 ++ if (xsaves_enabled)
5686 ++ vmx->nested.msrs.secondary_ctls_high |=
5687 ++ SECONDARY_EXEC_XSAVES;
5688 ++ else
5689 ++ vmx->nested.msrs.secondary_ctls_high &=
5690 ++ ~SECONDARY_EXEC_XSAVES;
5691 ++ }
5692 ++ }
5693 ++
5694 ++ if (vmx_rdtscp_supported()) {
5695 ++ bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
5696 ++ if (!rdtscp_enabled)
5697 ++ exec_control &= ~SECONDARY_EXEC_RDTSCP;
5698 ++
5699 ++ if (nested) {
5700 ++ if (rdtscp_enabled)
5701 ++ vmx->nested.msrs.secondary_ctls_high |=
5702 ++ SECONDARY_EXEC_RDTSCP;
5703 ++ else
5704 ++ vmx->nested.msrs.secondary_ctls_high &=
5705 ++ ~SECONDARY_EXEC_RDTSCP;
5706 ++ }
5707 ++ }
5708 ++
5709 ++ if (vmx_invpcid_supported()) {
5710 ++ /* Exposing INVPCID only when PCID is exposed */
5711 ++ bool invpcid_enabled =
5712 ++ guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
5713 ++ guest_cpuid_has(vcpu, X86_FEATURE_PCID);
5714 ++
5715 ++ if (!invpcid_enabled) {
5716 ++ exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
5717 ++ guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
5718 ++ }
5719 ++
5720 ++ if (nested) {
5721 ++ if (invpcid_enabled)
5722 ++ vmx->nested.msrs.secondary_ctls_high |=
5723 ++ SECONDARY_EXEC_ENABLE_INVPCID;
5724 ++ else
5725 ++ vmx->nested.msrs.secondary_ctls_high &=
5726 ++ ~SECONDARY_EXEC_ENABLE_INVPCID;
5727 ++ }
5728 ++ }
5729 ++
5730 ++ if (vmx_rdrand_supported()) {
5731 ++ bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
5732 ++ if (rdrand_enabled)
5733 ++ exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
5734 ++
5735 ++ if (nested) {
5736 ++ if (rdrand_enabled)
5737 ++ vmx->nested.msrs.secondary_ctls_high |=
5738 ++ SECONDARY_EXEC_RDRAND_EXITING;
5739 ++ else
5740 ++ vmx->nested.msrs.secondary_ctls_high &=
5741 ++ ~SECONDARY_EXEC_RDRAND_EXITING;
5742 ++ }
5743 ++ }
5744 ++
5745 ++ if (vmx_rdseed_supported()) {
5746 ++ bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
5747 ++ if (rdseed_enabled)
5748 ++ exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
5749 ++
5750 ++ if (nested) {
5751 ++ if (rdseed_enabled)
5752 ++ vmx->nested.msrs.secondary_ctls_high |=
5753 ++ SECONDARY_EXEC_RDSEED_EXITING;
5754 ++ else
5755 ++ vmx->nested.msrs.secondary_ctls_high &=
5756 ++ ~SECONDARY_EXEC_RDSEED_EXITING;
5757 ++ }
5758 ++ }
5759 ++
5760 ++ if (vmx_waitpkg_supported()) {
5761 ++ bool waitpkg_enabled =
5762 ++ guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG);
5763 ++
5764 ++ if (!waitpkg_enabled)
5765 ++ exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
5766 ++
5767 ++ if (nested) {
5768 ++ if (waitpkg_enabled)
5769 ++ vmx->nested.msrs.secondary_ctls_high |=
5770 ++ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
5771 ++ else
5772 ++ vmx->nested.msrs.secondary_ctls_high &=
5773 ++ ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
5774 ++ }
5775 ++ }
5776 ++
5777 ++ vmx->secondary_exec_control = exec_control;
5778 ++}
5779 ++
5780 ++static void ept_set_mmio_spte_mask(void)
5781 ++{
5782 ++ /*
5783 ++ * EPT Misconfigurations can be generated if the value of bits 2:0
5784 ++ * of an EPT paging-structure entry is 110b (write/execute).
5785 ++ */
5786 ++ kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
5787 ++ VMX_EPT_MISCONFIG_WX_VALUE, 0);
5788 ++}
5789 ++
5790 ++#define VMX_XSS_EXIT_BITMAP 0
5791 ++
5792 ++/*
5793 ++ * Noting that the initialization of Guest-state Area of VMCS is in
5794 ++ * vmx_vcpu_reset().
5795 ++ */
5796 ++static void init_vmcs(struct vcpu_vmx *vmx)
5797 ++{
5798 ++ if (nested)
5799 ++ nested_vmx_set_vmcs_shadowing_bitmap();
5800 ++
5801 ++ if (cpu_has_vmx_msr_bitmap())
5802 ++ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
5803 ++
5804 ++ vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
5805 ++
5806 ++ /* Control */
5807 ++ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
5808 ++
5809 ++ exec_controls_set(vmx, vmx_exec_control(vmx));
5810 ++
5811 ++ if (cpu_has_secondary_exec_ctrls()) {
5812 ++ vmx_compute_secondary_exec_control(vmx);
5813 ++ secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
5814 ++ }
5815 ++
5816 ++ if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
5817 ++ vmcs_write64(EOI_EXIT_BITMAP0, 0);
5818 ++ vmcs_write64(EOI_EXIT_BITMAP1, 0);
5819 ++ vmcs_write64(EOI_EXIT_BITMAP2, 0);
5820 ++ vmcs_write64(EOI_EXIT_BITMAP3, 0);
5821 ++
5822 ++ vmcs_write16(GUEST_INTR_STATUS, 0);
5823 ++
5824 ++ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
5825 ++ vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
5826 ++ }
5827 ++
5828 ++ if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
5829 ++ vmcs_write32(PLE_GAP, ple_gap);
5830 ++ vmx->ple_window = ple_window;
5831 ++ vmx->ple_window_dirty = true;
5832 ++ }
5833 ++
5834 ++ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
5835 ++ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
5836 ++ vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
5837 ++
5838 ++ vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
5839 ++ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
5840 ++ vmx_set_constant_host_state(vmx);
5841 ++ vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
5842 ++ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
5843 ++
5844 ++ if (cpu_has_vmx_vmfunc())
5845 ++ vmcs_write64(VM_FUNCTION_CONTROL, 0);
5846 ++
5847 ++ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
5848 ++ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
5849 ++ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
5850 ++ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
5851 ++ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
5852 ++
5853 ++ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
5854 ++ vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
5855 ++
5856 ++ vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
5857 ++
5858 ++ /* 22.2.1, 20.8.1 */
5859 ++ vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
5860 ++
5861 ++ vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
5862 ++ vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
5863 ++
5864 ++ set_cr4_guest_host_mask(vmx);
5865 ++
5866 ++ if (vmx->vpid != 0)
5867 ++ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
5868 ++
5869 ++ if (vmx_xsaves_supported())
5870 ++ vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
5871 ++
5872 ++ if (enable_pml) {
5873 ++ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
5874 ++ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
5875 ++ }
5876 ++
5877 ++ if (cpu_has_vmx_encls_vmexit())
5878 ++ vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
5879 ++
5880 ++ if (pt_mode == PT_MODE_HOST_GUEST) {
5881 ++ memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
5882 ++ /* Bit[6~0] are forced to 1, writes are ignored. */
5883 ++ vmx->pt_desc.guest.output_mask = 0x7F;
5884 ++ vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
5885 ++ }
5886 ++}
5887 ++
5888 ++static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
5889 ++{
5890 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
5891 ++ struct msr_data apic_base_msr;
5892 ++ u64 cr0;
5893 ++
5894 ++ vmx->rmode.vm86_active = 0;
5895 ++ vmx->spec_ctrl = 0;
5896 ++
5897 ++ vmx->msr_ia32_umwait_control = 0;
5898 ++
5899 ++ vcpu->arch.microcode_version = 0x100000000ULL;
5900 ++ vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
5901 ++ vmx->hv_deadline_tsc = -1;
5902 ++ kvm_set_cr8(vcpu, 0);
5903 ++
5904 ++ if (!init_event) {
5905 ++ apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
5906 ++ MSR_IA32_APICBASE_ENABLE;
5907 ++ if (kvm_vcpu_is_reset_bsp(vcpu))
5908 ++ apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
5909 ++ apic_base_msr.host_initiated = true;
5910 ++ kvm_set_apic_base(vcpu, &apic_base_msr);
5911 ++ }
5912 ++
5913 ++ vmx_segment_cache_clear(vmx);
5914 ++
5915 ++ seg_setup(VCPU_SREG_CS);
5916 ++ vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
5917 ++ vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
5918 ++
5919 ++ seg_setup(VCPU_SREG_DS);
5920 ++ seg_setup(VCPU_SREG_ES);
5921 ++ seg_setup(VCPU_SREG_FS);
5922 ++ seg_setup(VCPU_SREG_GS);
5923 ++ seg_setup(VCPU_SREG_SS);
5924 ++
5925 ++ vmcs_write16(GUEST_TR_SELECTOR, 0);
5926 ++ vmcs_writel(GUEST_TR_BASE, 0);
5927 ++ vmcs_write32(GUEST_TR_LIMIT, 0xffff);
5928 ++ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
5929 ++
5930 ++ vmcs_write16(GUEST_LDTR_SELECTOR, 0);
5931 ++ vmcs_writel(GUEST_LDTR_BASE, 0);
5932 ++ vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
5933 ++ vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
5934 ++
5935 ++ if (!init_event) {
5936 ++ vmcs_write32(GUEST_SYSENTER_CS, 0);
5937 ++ vmcs_writel(GUEST_SYSENTER_ESP, 0);
5938 ++ vmcs_writel(GUEST_SYSENTER_EIP, 0);
5939 ++ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
5940 ++ }
5941 ++
5942 ++ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
5943 ++ kvm_rip_write(vcpu, 0xfff0);
5944 ++
5945 ++ vmcs_writel(GUEST_GDTR_BASE, 0);
5946 ++ vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
5947 ++
5948 ++ vmcs_writel(GUEST_IDTR_BASE, 0);
5949 ++ vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
5950 ++
5951 ++ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
5952 ++ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
5953 ++ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
5954 ++ if (kvm_mpx_supported())
5955 ++ vmcs_write64(GUEST_BNDCFGS, 0);
5956 ++
5957 ++ setup_msrs(vmx);
5958 ++
5959 ++ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
5960 ++
5961 ++ if (cpu_has_vmx_tpr_shadow() && !init_event) {
5962 ++ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
5963 ++ if (cpu_need_tpr_shadow(vcpu))
5964 ++ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
5965 ++ __pa(vcpu->arch.apic->regs));
5966 ++ vmcs_write32(TPR_THRESHOLD, 0);
5967 ++ }
5968 ++
5969 ++ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
5970 ++
5971 ++ cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
5972 ++ vmx->vcpu.arch.cr0 = cr0;
5973 ++ vmx_set_cr0(vcpu, cr0); /* enter rmode */
5974 ++ vmx_set_cr4(vcpu, 0);
5975 ++ vmx_set_efer(vcpu, 0);
5976 ++
5977 ++ update_exception_bitmap(vcpu);
5978 ++
5979 ++ vpid_sync_context(vmx->vpid);
5980 ++ if (init_event)
5981 ++ vmx_clear_hlt(vcpu);
5982 ++}
5983 ++
5984 ++static void enable_irq_window(struct kvm_vcpu *vcpu)
5985 ++{
5986 ++ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5987 ++}
5988 ++
5989 ++static void enable_nmi_window(struct kvm_vcpu *vcpu)
5990 ++{
5991 ++ if (!enable_vnmi ||
5992 ++ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
5993 ++ enable_irq_window(vcpu);
5994 ++ return;
5995 ++ }
5996 ++
5997 ++ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5998 ++}
5999 ++
6000 ++static void vmx_inject_irq(struct kvm_vcpu *vcpu)
6001 ++{
6002 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
6003 ++ uint32_t intr;
6004 ++ int irq = vcpu->arch.interrupt.nr;
6005 ++
6006 ++ trace_kvm_inj_virq(irq);
6007 ++
6008 ++ ++vcpu->stat.irq_injections;
6009 ++ if (vmx->rmode.vm86_active) {
6010 ++ int inc_eip = 0;
6011 ++ if (vcpu->arch.interrupt.soft)
6012 ++ inc_eip = vcpu->arch.event_exit_inst_len;
6013 ++ kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
6014 ++ return;
6015 ++ }
6016 ++ intr = irq | INTR_INFO_VALID_MASK;
6017 ++ if (vcpu->arch.interrupt.soft) {
6018 ++ intr |= INTR_TYPE_SOFT_INTR;
6019 ++ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
6020 ++ vmx->vcpu.arch.event_exit_inst_len);
6021 ++ } else
6022 ++ intr |= INTR_TYPE_EXT_INTR;
6023 ++ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
6024 ++
6025 ++ vmx_clear_hlt(vcpu);
6026 ++}
6027 ++
6028 ++static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
6029 ++{
6030 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
6031 ++
6032 ++ if (!enable_vnmi) {
6033 ++ /*
6034 ++ * Tracking the NMI-blocked state in software is built upon
6035 ++ * finding the next open IRQ window. This, in turn, depends on
6036 ++ * well-behaving guests: They have to keep IRQs disabled at
6037 ++ * least as long as the NMI handler runs. Otherwise we may
6038 ++ * cause NMI nesting, maybe breaking the guest. But as this is
6039 ++ * highly unlikely, we can live with the residual risk.
6040 ++ */
6041 ++ vmx->loaded_vmcs->soft_vnmi_blocked = 1;
6042 ++ vmx->loaded_vmcs->vnmi_blocked_time = 0;
6043 ++ }
6044 ++
6045 ++ ++vcpu->stat.nmi_injections;
6046 ++ vmx->loaded_vmcs->nmi_known_unmasked = false;
6047 ++
6048 ++ if (vmx->rmode.vm86_active) {
6049 ++ kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
6050 ++ return;
6051 ++ }
6052 ++
6053 ++ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
6054 ++ INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
6055 ++
6056 ++ vmx_clear_hlt(vcpu);
6057 ++}
6058 ++
6059 ++bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
6060 ++{
6061 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
6062 ++ bool masked;
6063 ++
6064 ++ if (!enable_vnmi)
6065 ++ return vmx->loaded_vmcs->soft_vnmi_blocked;
6066 ++ if (vmx->loaded_vmcs->nmi_known_unmasked)
6067 ++ return false;
6068 ++ masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
6069 ++ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
6070 ++ return masked;
6071 ++}
6072 ++
6073 ++void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
6074 ++{
6075 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
6076 ++
6077 ++ if (!enable_vnmi) {
6078 ++ if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
6079 ++ vmx->loaded_vmcs->soft_vnmi_blocked = masked;
6080 ++ vmx->loaded_vmcs->vnmi_blocked_time = 0;
6081 ++ }
6082 ++ } else {
6083 ++ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
6084 ++ if (masked)
6085 ++ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6086 ++ GUEST_INTR_STATE_NMI);
6087 ++ else
6088 ++ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
6089 ++ GUEST_INTR_STATE_NMI);
6090 ++ }
6091 ++}
6092 ++
6093 ++static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
6094 ++{
6095 ++ if (to_vmx(vcpu)->nested.nested_run_pending)
6096 ++ return 0;
6097 ++
6098 ++ if (!enable_vnmi &&
6099 ++ to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
6100 ++ return 0;
6101 ++
6102 ++ return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
6103 ++ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
6104 ++ | GUEST_INTR_STATE_NMI));
6105 ++}
6106 ++
6107 ++static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
6108 ++{
6109 ++ return (!to_vmx(vcpu)->nested.nested_run_pending &&
6110 ++ vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
6111 ++ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
6112 ++ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
6113 ++}
6114 ++
6115 ++static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
6116 ++{
6117 ++ int ret;
6118 ++
6119 ++ if (enable_unrestricted_guest)
6120 ++ return 0;
6121 ++
6122 ++ ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
6123 ++ PAGE_SIZE * 3);
6124 ++ if (ret)
6125 ++ return ret;
6126 ++ to_kvm_vmx(kvm)->tss_addr = addr;
6127 ++ return init_rmode_tss(kvm);
6128 ++}
6129 ++
6130 ++static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
6131 ++{
6132 ++ to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
6133 ++ return 0;
6134 ++}
6135 ++
6136 ++static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
6137 ++{
6138 ++ switch (vec) {
6139 ++ case BP_VECTOR:
6140 ++ /*
6141 ++ * Update instruction length as we may reinject the exception
6142 ++ * from user space while in guest debugging mode.
6143 ++ */
6144 ++ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
6145 ++ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6146 ++ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
6147 ++ return false;
6148 ++ /* fall through */
6149 ++ case DB_VECTOR:
6150 ++ if (vcpu->guest_debug &
6151 ++ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
6152 ++ return false;
6153 ++ /* fall through */
6154 ++ case DE_VECTOR:
6155 ++ case OF_VECTOR:
6156 ++ case BR_VECTOR:
6157 ++ case UD_VECTOR:
6158 ++ case DF_VECTOR:
6159 ++ case SS_VECTOR:
6160 ++ case GP_VECTOR:
6161 ++ case MF_VECTOR:
6162 ++ return true;
6163 ++ break;
6164 ++ }
6165 ++ return false;
6166 ++}
6167 ++
6168 ++static int handle_rmode_exception(struct kvm_vcpu *vcpu,
6169 ++ int vec, u32 err_code)
6170 ++{
6171 ++ /*
6172 ++ * Instruction with address size override prefix opcode 0x67
6173 ++ * Cause the #SS fault with 0 error code in VM86 mode.
6174 ++ */
6175 ++ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
6176 ++ if (kvm_emulate_instruction(vcpu, 0)) {
6177 ++ if (vcpu->arch.halt_request) {
6178 ++ vcpu->arch.halt_request = 0;
6179 ++ return kvm_vcpu_halt(vcpu);
6180 ++ }
6181 ++ return 1;
6182 ++ }
6183 ++ return 0;
6184 ++ }
6185 ++
6186 ++ /*
6187 ++ * Forward all other exceptions that are valid in real mode.
6188 ++ * FIXME: Breaks guest debugging in real mode, needs to be fixed with
6189 ++ * the required debugging infrastructure rework.
6190 ++ */
6191 ++ kvm_queue_exception(vcpu, vec);
6192 ++ return 1;
6193 ++}
6194 ++
6195 ++/*
6196 ++ * Trigger machine check on the host. We assume all the MSRs are already set up
6197 ++ * by the CPU and that we still run on the same CPU as the MCE occurred on.
6198 ++ * We pass a fake environment to the machine check handler because we want
6199 ++ * the guest to be always treated like user space, no matter what context
6200 ++ * it used internally.
6201 ++ */
6202 ++static void kvm_machine_check(void)
6203 ++{
6204 ++#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
6205 ++ struct pt_regs regs = {
6206 ++ .cs = 3, /* Fake ring 3 no matter what the guest ran on */
6207 ++ .flags = X86_EFLAGS_IF,
6208 ++ };
6209 ++
6210 ++ do_machine_check(&regs, 0);
6211 ++#endif
6212 ++}
6213 ++
6214 ++static int handle_machine_check(struct kvm_vcpu *vcpu)
6215 ++{
6216 ++ /* handled by vmx_vcpu_run() */
6217 ++ return 1;
6218 ++}
6219 ++
6220 ++static int handle_exception_nmi(struct kvm_vcpu *vcpu)
6221 ++{
6222 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
6223 ++ struct kvm_run *kvm_run = vcpu->run;
6224 ++ u32 intr_info, ex_no, error_code;
6225 ++ unsigned long cr2, rip, dr6;
6226 ++ u32 vect_info;
6227 ++
6228 ++ vect_info = vmx->idt_vectoring_info;
6229 ++ intr_info = vmx->exit_intr_info;
6230 ++
6231 ++ if (is_machine_check(intr_info) || is_nmi(intr_info))
6232 ++ return 1; /* handled by handle_exception_nmi_irqoff() */
6233 ++
6234 ++ if (is_invalid_opcode(intr_info))
6235 ++ return handle_ud(vcpu);
6236 ++
6237 ++ error_code = 0;
6238 ++ if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
6239 ++ error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6240 ++
6241 ++ if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
6242 ++ WARN_ON_ONCE(!enable_vmware_backdoor);
6243 ++
6244 ++ /*
6245 ++ * VMware backdoor emulation on #GP interception only handles
6246 ++ * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
6247 ++ * error code on #GP.
6248 ++ */
6249 ++ if (error_code) {
6250 ++ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
6251 ++ return 1;
6252 ++ }
6253 ++ return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
6254 ++ }
6255 ++
6256 ++ /*
6257 ++ * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
6258 ++ * MMIO, it is better to report an internal error.
6259 ++ * See the comments in vmx_handle_exit.
6260 ++ */
6261 ++ if ((vect_info & VECTORING_INFO_VALID_MASK) &&
6262 ++ !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
6263 ++ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6264 ++ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
6265 ++ vcpu->run->internal.ndata = 3;
6266 ++ vcpu->run->internal.data[0] = vect_info;
6267 ++ vcpu->run->internal.data[1] = intr_info;
6268 ++ vcpu->run->internal.data[2] = error_code;
6269 ++ return 0;
6270 ++ }
6271 ++
6272 ++ if (is_page_fault(intr_info)) {
6273 ++ cr2 = vmcs_readl(EXIT_QUALIFICATION);
6274 ++ /* EPT won't cause page fault directly */
6275 ++ WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
6276 ++ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
6277 ++ }
6278 ++
6279 ++ ex_no = intr_info & INTR_INFO_VECTOR_MASK;
6280 ++
6281 ++ if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
6282 ++ return handle_rmode_exception(vcpu, ex_no, error_code);
6283 ++
6284 ++ switch (ex_no) {
6285 ++ case AC_VECTOR:
6286 ++ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
6287 ++ return 1;
6288 ++ case DB_VECTOR:
6289 ++ dr6 = vmcs_readl(EXIT_QUALIFICATION);
6290 ++ if (!(vcpu->guest_debug &
6291 ++ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
6292 ++ vcpu->arch.dr6 &= ~DR_TRAP_BITS;
6293 ++ vcpu->arch.dr6 |= dr6 | DR6_RTM;
6294 ++ if (is_icebp(intr_info))
6295 ++ WARN_ON(!skip_emulated_instruction(vcpu));
6296 ++
6297 ++ kvm_queue_exception(vcpu, DB_VECTOR);
6298 ++ return 1;
6299 ++ }
6300 ++ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
6301 ++ kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
6302 ++ /* fall through */
6303 ++ case BP_VECTOR:
6304 ++ /*
6305 ++ * Update instruction length as we may reinject #BP from
6306 ++ * user space while in guest debugging mode. Reading it for
6307 ++ * #DB as well causes no harm, it is not used in that case.
6308 ++ */
6309 ++ vmx->vcpu.arch.event_exit_inst_len =
6310 ++ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6311 ++ kvm_run->exit_reason = KVM_EXIT_DEBUG;
6312 ++ rip = kvm_rip_read(vcpu);
6313 ++ kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
6314 ++ kvm_run->debug.arch.exception = ex_no;
6315 ++ break;
6316 ++ default:
6317 ++ kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
6318 ++ kvm_run->ex.exception = ex_no;
6319 ++ kvm_run->ex.error_code = error_code;
6320 ++ break;
6321 ++ }
6322 ++ return 0;
6323 ++}
6324 ++
6325 ++static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
6326 ++{
6327 ++ ++vcpu->stat.irq_exits;
6328 ++ return 1;
6329 ++}
6330 ++
6331 ++static int handle_triple_fault(struct kvm_vcpu *vcpu)
6332 ++{
6333 ++ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
6334 ++ vcpu->mmio_needed = 0;
6335 ++ return 0;
6336 ++}
6337 ++
6338 ++static int handle_io(struct kvm_vcpu *vcpu)
6339 ++{
6340 ++ unsigned long exit_qualification;
6341 ++ int size, in, string;
6342 ++ unsigned port;
6343 ++
6344 ++ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6345 ++ string = (exit_qualification & 16) != 0;
6346 ++
6347 ++ ++vcpu->stat.io_exits;
6348 ++
6349 ++ if (string)
6350 ++ return kvm_emulate_instruction(vcpu, 0);
6351 ++
6352 ++ port = exit_qualification >> 16;
6353 ++ size = (exit_qualification & 7) + 1;
6354 ++ in = (exit_qualification & 8) != 0;
6355 ++
6356 ++ return kvm_fast_pio(vcpu, size, port, in);
6357 ++}
6358 ++
6359 ++static void
6360 ++vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
6361 ++{
6362 ++ /*
6363 ++ * Patch in the VMCALL instruction:
6364 ++ */
6365 ++ hypercall[0] = 0x0f;
6366 ++ hypercall[1] = 0x01;
6367 ++ hypercall[2] = 0xc1;
6368 ++}
6369 ++
6370 ++/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
6371 ++static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
6372 ++{
6373 ++ if (is_guest_mode(vcpu)) {
6374 ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6375 ++ unsigned long orig_val = val;
6376 ++
6377 ++ /*
6378 ++ * We get here when L2 changed cr0 in a way that did not change
6379 ++ * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
6380 ++ * but did change L0 shadowed bits. So we first calculate the
6381 ++ * effective cr0 value that L1 would like to write into the
6382 ++ * hardware. It consists of the L2-owned bits from the new
6383 ++ * value combined with the L1-owned bits from L1's guest_cr0.
6384 ++ */
6385 ++ val = (val & ~vmcs12->cr0_guest_host_mask) |
6386 ++ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
6387 ++
6388 ++ if (!nested_guest_cr0_valid(vcpu, val))
6389 ++ return 1;
6390 ++
6391 ++ if (kvm_set_cr0(vcpu, val))
6392 ++ return 1;
6393 ++ vmcs_writel(CR0_READ_SHADOW, orig_val);
6394 ++ return 0;
6395 ++ } else {
6396 ++ if (to_vmx(vcpu)->nested.vmxon &&
6397 ++ !nested_host_cr0_valid(vcpu, val))
6398 ++ return 1;
6399 ++
6400 ++ return kvm_set_cr0(vcpu, val);
6401 ++ }
6402 ++}
6403 ++
6404 ++static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
6405 ++{
6406 ++ if (is_guest_mode(vcpu)) {
6407 ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6408 ++ unsigned long orig_val = val;
6409 ++
6410 ++ /* analogously to handle_set_cr0 */
6411 ++ val = (val & ~vmcs12->cr4_guest_host_mask) |
6412 ++ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
6413 ++ if (kvm_set_cr4(vcpu, val))
6414 ++ return 1;
6415 ++ vmcs_writel(CR4_READ_SHADOW, orig_val);
6416 ++ return 0;
6417 ++ } else
6418 ++ return kvm_set_cr4(vcpu, val);
6419 ++}
6420 ++
6421 ++static int handle_desc(struct kvm_vcpu *vcpu)
6422 ++{
6423 ++ WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
6424 ++ return kvm_emulate_instruction(vcpu, 0);
6425 ++}
6426 ++
6427 ++static int handle_cr(struct kvm_vcpu *vcpu)
6428 ++{
6429 ++ unsigned long exit_qualification, val;
6430 ++ int cr;
6431 ++ int reg;
6432 ++ int err;
6433 ++ int ret;
6434 ++
6435 ++ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6436 ++ cr = exit_qualification & 15;
6437 ++ reg = (exit_qualification >> 8) & 15;
6438 ++ switch ((exit_qualification >> 4) & 3) {
6439 ++ case 0: /* mov to cr */
6440 ++ val = kvm_register_readl(vcpu, reg);
6441 ++ trace_kvm_cr_write(cr, val);
6442 ++ switch (cr) {
6443 ++ case 0:
6444 ++ err = handle_set_cr0(vcpu, val);
6445 ++ return kvm_complete_insn_gp(vcpu, err);
6446 ++ case 3:
6447 ++ WARN_ON_ONCE(enable_unrestricted_guest);
6448 ++ err = kvm_set_cr3(vcpu, val);
6449 ++ return kvm_complete_insn_gp(vcpu, err);
6450 ++ case 4:
6451 ++ err = handle_set_cr4(vcpu, val);
6452 ++ return kvm_complete_insn_gp(vcpu, err);
6453 ++ case 8: {
6454 ++ u8 cr8_prev = kvm_get_cr8(vcpu);
6455 ++ u8 cr8 = (u8)val;
6456 ++ err = kvm_set_cr8(vcpu, cr8);
6457 ++ ret = kvm_complete_insn_gp(vcpu, err);
6458 ++ if (lapic_in_kernel(vcpu))
6459 ++ return ret;
6460 ++ if (cr8_prev <= cr8)
6461 ++ return ret;
6462 ++ /*
6463 ++ * TODO: we might be squashing a
6464 ++ * KVM_GUESTDBG_SINGLESTEP-triggered
6465 ++ * KVM_EXIT_DEBUG here.
6466 ++ */
6467 ++ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
6468 ++ return 0;
6469 ++ }
6470 ++ }
6471 ++ break;
6472 ++ case 2: /* clts */
6473 ++ WARN_ONCE(1, "Guest should always own CR0.TS");
6474 ++ vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
6475 ++ trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
6476 ++ return kvm_skip_emulated_instruction(vcpu);
6477 ++ case 1: /*mov from cr*/
6478 ++ switch (cr) {
6479 ++ case 3:
6480 ++ WARN_ON_ONCE(enable_unrestricted_guest);
6481 ++ val = kvm_read_cr3(vcpu);
6482 ++ kvm_register_write(vcpu, reg, val);
6483 ++ trace_kvm_cr_read(cr, val);
6484 ++ return kvm_skip_emulated_instruction(vcpu);
6485 ++ case 8:
6486 ++ val = kvm_get_cr8(vcpu);
6487 ++ kvm_register_write(vcpu, reg, val);
6488 ++ trace_kvm_cr_read(cr, val);
6489 ++ return kvm_skip_emulated_instruction(vcpu);
6490 ++ }
6491 ++ break;
6492 ++ case 3: /* lmsw */
6493 ++ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
6494 ++ trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
6495 ++ kvm_lmsw(vcpu, val);
6496 ++
6497 ++ return kvm_skip_emulated_instruction(vcpu);
6498 ++ default:
6499 ++ break;
6500 ++ }
6501 ++ vcpu->run->exit_reason = 0;
6502 ++ vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
6503 ++ (int)(exit_qualification >> 4) & 3, cr);
6504 ++ return 0;
6505 ++}
6506 ++
6507 ++static int handle_dr(struct kvm_vcpu *vcpu)
6508 ++{
6509 ++ unsigned long exit_qualification;
6510 ++ int dr, dr7, reg;
6511 ++
6512 ++ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6513 ++ dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
6514 ++
6515 ++ /* First, if DR does not exist, trigger UD */
6516 ++ if (!kvm_require_dr(vcpu, dr))
6517 ++ return 1;
6518 ++
6519 ++ /* Do not handle if the CPL > 0, will trigger GP on re-entry */
6520 ++ if (!kvm_require_cpl(vcpu, 0))
6521 ++ return 1;
6522 ++ dr7 = vmcs_readl(GUEST_DR7);
6523 ++ if (dr7 & DR7_GD) {
6524 ++ /*
6525 ++ * As the vm-exit takes precedence over the debug trap, we
6526 ++ * need to emulate the latter, either for the host or the
6527 ++ * guest debugging itself.
6528 ++ */
6529 ++ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
6530 ++ vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
6531 ++ vcpu->run->debug.arch.dr7 = dr7;
6532 ++ vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
6533 ++ vcpu->run->debug.arch.exception = DB_VECTOR;
6534 ++ vcpu->run->exit_reason = KVM_EXIT_DEBUG;
6535 ++ return 0;
6536 ++ } else {
6537 ++ vcpu->arch.dr6 &= ~DR_TRAP_BITS;
6538 ++ vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
6539 ++ kvm_queue_exception(vcpu, DB_VECTOR);
6540 ++ return 1;
6541 ++ }
6542 ++ }
6543 ++
6544 ++ if (vcpu->guest_debug == 0) {
6545 ++ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
6546 ++
6547 ++ /*
6548 ++ * No more DR vmexits; force a reload of the debug registers
6549 ++ * and reenter on this instruction. The next vmexit will
6550 ++ * retrieve the full state of the debug registers.
6551 ++ */
6552 ++ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
6553 ++ return 1;
6554 ++ }
6555 ++
6556 ++ reg = DEBUG_REG_ACCESS_REG(exit_qualification);
6557 ++ if (exit_qualification & TYPE_MOV_FROM_DR) {
6558 ++ unsigned long val;
6559 ++
6560 ++ if (kvm_get_dr(vcpu, dr, &val))
6561 ++ return 1;
6562 ++ kvm_register_write(vcpu, reg, val);
6563 ++ } else
6564 ++ if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
6565 ++ return 1;
6566 ++
6567 ++ return kvm_skip_emulated_instruction(vcpu);
6568 ++}
6569 ++
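Similarly, handle_dr() pulls the DR number, direction and GPR out of the exit qualification. A small self-contained sketch of that layout (names and the sample value are illustrative, not KVM's):

#include <stdint.h>
#include <stdio.h>

/* Debug-register-access exit qualification: bits 2:0 = DR number,
 * bit 4 = direction (1 = MOV from DR, 0 = MOV to DR), bits 11:8 = GPR. */
static void decode_dr_exit_qual(uint64_t qual)
{
	unsigned int dr      = qual & 0x7;
	int          from_dr = (qual >> 4) & 1;
	unsigned int gpr     = (qual >> 8) & 0xf;

	printf("%s dr%u, gpr=%u\n", from_dr ? "read of" : "write to", dr, gpr);
}

int main(void)
{
	decode_dr_exit_qual(0x0000000000000216ULL); /* MOV %dr6 -> GPR 2 (rdx) */
	return 0;
}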
6570 ++static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
6571 ++{
6572 ++ return vcpu->arch.dr6;
6573 ++}
6574 ++
6575 ++static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
6576 ++{
6577 ++}
6578 ++
6579 ++static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
6580 ++{
6581 ++ get_debugreg(vcpu->arch.db[0], 0);
6582 ++ get_debugreg(vcpu->arch.db[1], 1);
6583 ++ get_debugreg(vcpu->arch.db[2], 2);
6584 ++ get_debugreg(vcpu->arch.db[3], 3);
6585 ++ get_debugreg(vcpu->arch.dr6, 6);
6586 ++ vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
6587 ++
6588 ++ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
6589 ++ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
6590 ++}
6591 ++
6592 ++static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
6593 ++{
6594 ++ vmcs_writel(GUEST_DR7, val);
6595 ++}
6596 ++
6597 ++static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
6598 ++{
6599 ++ kvm_apic_update_ppr(vcpu);
6600 ++ return 1;
6601 ++}
6602 ++
6603 ++static int handle_interrupt_window(struct kvm_vcpu *vcpu)
6604 ++{
6605 ++ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
6606 ++
6607 ++ kvm_make_request(KVM_REQ_EVENT, vcpu);
6608 ++
6609 ++ ++vcpu->stat.irq_window_exits;
6610 ++ return 1;
6611 ++}
6612 ++
6613 ++static int handle_vmcall(struct kvm_vcpu *vcpu)
6614 ++{
6615 ++ return kvm_emulate_hypercall(vcpu);
6616 ++}
6617 ++
6618 ++static int handle_invd(struct kvm_vcpu *vcpu)
6619 ++{
6620 ++ return kvm_emulate_instruction(vcpu, 0);
6621 ++}
6622 ++
6623 ++static int handle_invlpg(struct kvm_vcpu *vcpu)
6624 ++{
6625 ++ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6626 ++
6627 ++ kvm_mmu_invlpg(vcpu, exit_qualification);
6628 ++ return kvm_skip_emulated_instruction(vcpu);
6629 ++}
6630 ++
6631 ++static int handle_rdpmc(struct kvm_vcpu *vcpu)
6632 ++{
6633 ++ int err;
6634 ++
6635 ++ err = kvm_rdpmc(vcpu);
6636 ++ return kvm_complete_insn_gp(vcpu, err);
6637 ++}
6638 ++
6639 ++static int handle_wbinvd(struct kvm_vcpu *vcpu)
6640 ++{
6641 ++ return kvm_emulate_wbinvd(vcpu);
6642 ++}
6643 ++
6644 ++static int handle_xsetbv(struct kvm_vcpu *vcpu)
6645 ++{
6646 ++ u64 new_bv = kvm_read_edx_eax(vcpu);
6647 ++ u32 index = kvm_rcx_read(vcpu);
6648 ++
6649 ++ if (kvm_set_xcr(vcpu, index, new_bv) == 0)
6650 ++ return kvm_skip_emulated_instruction(vcpu);
6651 ++ return 1;
6652 ++}
6653 ++
6654 ++static int handle_apic_access(struct kvm_vcpu *vcpu)
6655 ++{
6656 ++ if (likely(fasteoi)) {
6657 ++ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6658 ++ int access_type, offset;
6659 ++
6660 ++ access_type = exit_qualification & APIC_ACCESS_TYPE;
6661 ++ offset = exit_qualification & APIC_ACCESS_OFFSET;
6662 ++ /*
6663 ++ * A sane guest uses MOV to write EOI, and the written value
6664 ++ * is ignored. So take a short-circuit here and avoid
6665 ++ * heavy instruction emulation.
6666 ++ */
6667 ++ if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
6668 ++ (offset == APIC_EOI)) {
6669 ++ kvm_lapic_set_eoi(vcpu);
6670 ++ return kvm_skip_emulated_instruction(vcpu);
6671 ++ }
6672 ++ }
6673 ++ return kvm_emulate_instruction(vcpu, 0);
6674 ++}
6675 ++
6676 ++static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
6677 ++{
6678 ++ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6679 ++ int vector = exit_qualification & 0xff;
6680 ++
6681 ++ /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
6682 ++ kvm_apic_set_eoi_accelerated(vcpu, vector);
6683 ++ return 1;
6684 ++}
6685 ++
6686 ++static int handle_apic_write(struct kvm_vcpu *vcpu)
6687 ++{
6688 ++ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6689 ++ u32 offset = exit_qualification & 0xfff;
6690 ++
6691 ++ /* APIC-write VM exit is trap-like and thus no need to adjust IP */
6692 ++ kvm_apic_write_nodecode(vcpu, offset);
6693 ++ return 1;
6694 ++}
6695 ++
6696 ++static int handle_task_switch(struct kvm_vcpu *vcpu)
6697 ++{
6698 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
6699 ++ unsigned long exit_qualification;
6700 ++ bool has_error_code = false;
6701 ++ u32 error_code = 0;
6702 ++ u16 tss_selector;
6703 ++ int reason, type, idt_v, idt_index;
6704 ++
6705 ++ idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
6706 ++ idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
6707 ++ type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
6708 ++
6709 ++ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6710 ++
6711 ++ reason = (u32)exit_qualification >> 30;
6712 ++ if (reason == TASK_SWITCH_GATE && idt_v) {
6713 ++ switch (type) {
6714 ++ case INTR_TYPE_NMI_INTR:
6715 ++ vcpu->arch.nmi_injected = false;
6716 ++ vmx_set_nmi_mask(vcpu, true);
6717 ++ break;
6718 ++ case INTR_TYPE_EXT_INTR:
6719 ++ case INTR_TYPE_SOFT_INTR:
6720 ++ kvm_clear_interrupt_queue(vcpu);
6721 ++ break;
6722 ++ case INTR_TYPE_HARD_EXCEPTION:
6723 ++ if (vmx->idt_vectoring_info &
6724 ++ VECTORING_INFO_DELIVER_CODE_MASK) {
6725 ++ has_error_code = true;
6726 ++ error_code =
6727 ++ vmcs_read32(IDT_VECTORING_ERROR_CODE);
6728 ++ }
6729 ++ /* fall through */
6730 ++ case INTR_TYPE_SOFT_EXCEPTION:
6731 ++ kvm_clear_exception_queue(vcpu);
6732 ++ break;
6733 ++ default:
6734 ++ break;
6735 ++ }
6736 ++ }
6737 ++ tss_selector = exit_qualification;
6738 ++
6739 ++ if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
6740 ++ type != INTR_TYPE_EXT_INTR &&
6741 ++ type != INTR_TYPE_NMI_INTR))
6742 ++ WARN_ON(!skip_emulated_instruction(vcpu));
6743 ++
6744 ++ /*
6745 ++ * TODO: What about debug traps on tss switch?
6746 ++ * Are we supposed to inject them and update dr6?
6747 ++ */
6748 ++ return kvm_task_switch(vcpu, tss_selector,
6749 ++ type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
6750 ++ reason, has_error_code, error_code);
6751 ++}
6752 ++
6753 ++static int handle_ept_violation(struct kvm_vcpu *vcpu)
6754 ++{
6755 ++ unsigned long exit_qualification;
6756 ++ gpa_t gpa;
6757 ++ u64 error_code;
6758 ++
6759 ++ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6760 ++
6761 ++ /*
6762 ++ * If the EPT violation happened while executing IRET from an NMI,
6763 ++ * the "blocked by NMI" bit has to be set before the next VM entry.
6764 ++ * There are errata that may cause this bit to not be set:
6765 ++ * AAK134, BY25.
6766 ++ */
6767 ++ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
6768 ++ enable_vnmi &&
6769 ++ (exit_qualification & INTR_INFO_UNBLOCK_NMI))
6770 ++ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
6771 ++
6772 ++ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6773 ++ trace_kvm_page_fault(gpa, exit_qualification);
6774 ++
6775 ++ /* Is it a read fault? */
6776 ++ error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
6777 ++ ? PFERR_USER_MASK : 0;
6778 ++ /* Is it a write fault? */
6779 ++ error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
6780 ++ ? PFERR_WRITE_MASK : 0;
6781 ++ /* Is it a fetch fault? */
6782 ++ error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
6783 ++ ? PFERR_FETCH_MASK : 0;
6784 ++ /* ept page table entry is present? */
6785 ++ error_code |= (exit_qualification &
6786 ++ (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
6787 ++ EPT_VIOLATION_EXECUTABLE))
6788 ++ ? PFERR_PRESENT_MASK : 0;
6789 ++
6790 ++ error_code |= (exit_qualification & 0x100) != 0 ?
6791 ++ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
6792 ++
6793 ++ vcpu->arch.exit_qualification = exit_qualification;
6794 ++ return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
6795 ++}
6796 ++
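To make the bit shuffling in handle_ept_violation() concrete, here is a standalone sketch of the same mapping with the masks written out numerically (the exit-qualification bits follow the Intel SDM and the error-code bits are the architectural page-fault ones; the KVM-internal PFERR_GUEST_* bits are left out):

#include <stdint.h>
#include <stdio.h>

/* Exit-qualification bits for an EPT violation. */
#define EPTV_ACC_READ    (1u << 0)
#define EPTV_ACC_WRITE   (1u << 1)
#define EPTV_ACC_INSTR   (1u << 2)
#define EPTV_READABLE    (1u << 3)
#define EPTV_WRITABLE    (1u << 4)
#define EPTV_EXECUTABLE  (1u << 5)

/* Architectural page-fault error-code bits. */
#define PF_PRESENT  (1u << 0)
#define PF_WRITE    (1u << 1)
#define PF_USER     (1u << 2)
#define PF_FETCH    (1u << 4)

static uint32_t ept_qual_to_pf_error(uint64_t qual)
{
	uint32_t ec = 0;

	if (qual & EPTV_ACC_READ)
		ec |= PF_USER;     /* the code above folds a read into PFERR_USER_MASK */
	if (qual & EPTV_ACC_WRITE)
		ec |= PF_WRITE;
	if (qual & EPTV_ACC_INSTR)
		ec |= PF_FETCH;
	if (qual & (EPTV_READABLE | EPTV_WRITABLE | EPTV_EXECUTABLE))
		ec |= PF_PRESENT;  /* the EPT entry was present in some form */

	return ec;
}

int main(void)
{
	/* Write to a page mapped read-only in EPT: access=write (bit 1),
	 * page readable (bit 3) => WRITE | PRESENT = 0x3. */
	printf("error_code = 0x%x\n", (unsigned)ept_qual_to_pf_error(0x0a));
	return 0;
}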
6797 ++static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
6798 ++{
6799 ++ gpa_t gpa;
6800 ++
6801 ++ /*
6802 ++ * A nested guest cannot optimize MMIO vmexits, because we have an
6803 ++ * nGPA here instead of the required GPA.
6804 ++ */
6805 ++ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6806 ++ if (!is_guest_mode(vcpu) &&
6807 ++ !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
6808 ++ trace_kvm_fast_mmio(gpa);
6809 ++ return kvm_skip_emulated_instruction(vcpu);
6810 ++ }
6811 ++
6812 ++ return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
6813 ++}
6814 ++
6815 ++static int handle_nmi_window(struct kvm_vcpu *vcpu)
6816 ++{
6817 ++ WARN_ON_ONCE(!enable_vnmi);
6818 ++ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
6819 ++ ++vcpu->stat.nmi_window_exits;
6820 ++ kvm_make_request(KVM_REQ_EVENT, vcpu);
6821 ++
6822 ++ return 1;
6823 ++}
6824 ++
6825 ++static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
6826 ++{
6827 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
6828 ++ bool intr_window_requested;
6829 ++ unsigned count = 130;
6830 ++
6831 ++ /*
6832 ++ * We should never reach the point where we are emulating L2
6833 ++ * due to invalid guest state as that means we incorrectly
6834 ++ * allowed a nested VMEntry with an invalid vmcs12.
6835 ++ */
6836 ++ WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
6837 ++
6838 ++ intr_window_requested = exec_controls_get(vmx) &
6839 ++ CPU_BASED_INTR_WINDOW_EXITING;
6840 ++
6841 ++ while (vmx->emulation_required && count-- != 0) {
6842 ++ if (intr_window_requested && vmx_interrupt_allowed(vcpu))
6843 ++ return handle_interrupt_window(&vmx->vcpu);
6844 ++
6845 ++ if (kvm_test_request(KVM_REQ_EVENT, vcpu))
6846 ++ return 1;
6847 ++
6848 ++ if (!kvm_emulate_instruction(vcpu, 0))
6849 ++ return 0;
6850 ++
6851 ++ if (vmx->emulation_required && !vmx->rmode.vm86_active &&
6852 ++ vcpu->arch.exception.pending) {
6853 ++ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6854 ++ vcpu->run->internal.suberror =
6855 ++ KVM_INTERNAL_ERROR_EMULATION;
6856 ++ vcpu->run->internal.ndata = 0;
6857 ++ return 0;
6858 ++ }
6859 ++
6860 ++ if (vcpu->arch.halt_request) {
6861 ++ vcpu->arch.halt_request = 0;
6862 ++ return kvm_vcpu_halt(vcpu);
6863 ++ }
6864 ++
6865 ++ /*
6866 ++ * Note: return 1 and not 0; vcpu_run() is responsible for
6867 ++ * morphing the pending signal into the proper return code.
6868 ++ */
6869 ++ if (signal_pending(current))
6870 ++ return 1;
6871 ++
6872 ++ if (need_resched())
6873 ++ schedule();
6874 ++ }
6875 ++
6876 ++ return 1;
6877 ++}
6878 ++
6879 ++static void grow_ple_window(struct kvm_vcpu *vcpu)
6880 ++{
6881 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
6882 ++ unsigned int old = vmx->ple_window;
6883 ++
6884 ++ vmx->ple_window = __grow_ple_window(old, ple_window,
6885 ++ ple_window_grow,
6886 ++ ple_window_max);
6887 ++
6888 ++ if (vmx->ple_window != old) {
6889 ++ vmx->ple_window_dirty = true;
6890 ++ trace_kvm_ple_window_update(vcpu->vcpu_id,
6891 ++ vmx->ple_window, old);
6892 ++ }
6893 ++}
6894 ++
6895 ++static void shrink_ple_window(struct kvm_vcpu *vcpu)
6896 ++{
6897 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
6898 ++ unsigned int old = vmx->ple_window;
6899 ++
6900 ++ vmx->ple_window = __shrink_ple_window(old, ple_window,
6901 ++ ple_window_shrink,
6902 ++ ple_window);
6903 ++
6904 ++ if (vmx->ple_window != old) {
6905 ++ vmx->ple_window_dirty = true;
6906 ++ trace_kvm_ple_window_update(vcpu->vcpu_id,
6907 ++ vmx->ple_window, old);
6908 ++ }
6909 ++}
6910 ++
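The grow/shrink helpers above scale the PLE window between a base value and a cap. A minimal standalone model of that behaviour, assuming the usual semantics of __grow_ple_window()/__shrink_ple_window() (multiply or divide by the modifier, then clamp); the helper names and the parameter values in main() are illustrative, not KVM's tuned defaults:

#include <stdio.h>

/* Rough model of the PLE window adjustment: grow multiplies by the
 * modifier and clamps to a maximum, shrink divides by the modifier and
 * clamps to the base.  The exact clamping rules live in the kernel's
 * __grow_ple_window()/__shrink_ple_window(). */
static unsigned int grow_window(unsigned int val, unsigned int modifier,
				unsigned int max)
{
	unsigned long long grown = (unsigned long long)val * modifier;

	return grown > max ? max : (unsigned int)grown;
}

static unsigned int shrink_window(unsigned int val, unsigned int modifier,
				  unsigned int base)
{
	unsigned int shrunk = val / modifier;

	return shrunk < base ? base : shrunk;
}

int main(void)
{
	unsigned int w = 4096;                    /* example base window */

	w = grow_window(w, 2, 1u << 30);          /* PAUSE loop detected */
	printf("grown to %u\n", w);               /* 8192 */
	w = shrink_window(w, 2, 4096);            /* contention went away */
	printf("shrunk to %u\n", w);              /* 4096 */
	return 0;
}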
6911 ++/*
6912 ++ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
6913 ++ */
6914 ++static void wakeup_handler(void)
6915 ++{
6916 ++ struct kvm_vcpu *vcpu;
6917 ++ int cpu = smp_processor_id();
6918 ++
6919 ++ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6920 ++ list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
6921 ++ blocked_vcpu_list) {
6922 ++ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
6923 ++
6924 ++ if (pi_test_on(pi_desc) == 1)
6925 ++ kvm_vcpu_kick(vcpu);
6926 ++ }
6927 ++ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6928 ++}
6929 ++
6930 ++static void vmx_enable_tdp(void)
6931 ++{
6932 ++ kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
6933 ++ enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
6934 ++ enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
6935 ++ 0ull, VMX_EPT_EXECUTABLE_MASK,
6936 ++ cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
6937 ++ VMX_EPT_RWX_MASK, 0ull);
6938 ++
6939 ++ ept_set_mmio_spte_mask();
6940 ++ kvm_enable_tdp();
6941 ++}
6942 ++
6943 ++/*
6944 ++ * Indicate a vcpu busy-waiting on a spinlock. We do not enable PAUSE
6945 ++ * exiting, so we only get here on CPUs with PAUSE-loop exiting.
6946 ++ */
6947 ++static int handle_pause(struct kvm_vcpu *vcpu)
6948 ++{
6949 ++ if (!kvm_pause_in_guest(vcpu->kvm))
6950 ++ grow_ple_window(vcpu);
6951 ++
6952 ++ /*
6953 ++ * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
6954 ++ * VM-execution control is ignored if CPL > 0. OTOH, KVM
6955 ++ * never sets PAUSE_EXITING and only sets PLE if supported,
6956 ++ * so the vcpu must be at CPL 0 if it gets a PAUSE exit.
6957 ++ */
6958 ++ kvm_vcpu_on_spin(vcpu, true);
6959 ++ return kvm_skip_emulated_instruction(vcpu);
6960 ++}
6961 ++
6962 ++static int handle_nop(struct kvm_vcpu *vcpu)
6963 ++{
6964 ++ return kvm_skip_emulated_instruction(vcpu);
6965 ++}
6966 ++
6967 ++static int handle_mwait(struct kvm_vcpu *vcpu)
6968 ++{
6969 ++ printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
6970 ++ return handle_nop(vcpu);
6971 ++}
6972 ++
6973 ++static int handle_invalid_op(struct kvm_vcpu *vcpu)
6974 ++{
6975 ++ kvm_queue_exception(vcpu, UD_VECTOR);
6976 ++ return 1;
6977 ++}
6978 ++
6979 ++static int handle_monitor_trap(struct kvm_vcpu *vcpu)
6980 ++{
6981 ++ return 1;
6982 ++}
6983 ++
6984 ++static int handle_monitor(struct kvm_vcpu *vcpu)
6985 ++{
6986 ++ printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
6987 ++ return handle_nop(vcpu);
6988 ++}
6989 ++
6990 ++static int handle_invpcid(struct kvm_vcpu *vcpu)
6991 ++{
6992 ++ u32 vmx_instruction_info;
6993 ++ unsigned long type;
6994 ++ bool pcid_enabled;
6995 ++ gva_t gva;
6996 ++ struct x86_exception e;
6997 ++ unsigned i;
6998 ++ unsigned long roots_to_free = 0;
6999 ++ struct {
7000 ++ u64 pcid;
7001 ++ u64 gla;
7002 ++ } operand;
7003 ++
7004 ++ if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
7005 ++ kvm_queue_exception(vcpu, UD_VECTOR);
7006 ++ return 1;
7007 ++ }
7008 ++
7009 ++ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7010 ++ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
7011 ++
7012 ++ if (type > 3) {
7013 ++ kvm_inject_gp(vcpu, 0);
7014 ++ return 1;
7015 ++ }
7016 ++
7017 ++ /* According to the Intel instruction reference, the memory operand
7018 ++ * is read even if it isn't needed (e.g., for type==all)
7019 ++ */
7020 ++ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
7021 ++ vmx_instruction_info, false,
7022 ++ sizeof(operand), &gva))
7023 ++ return 1;
7024 ++
7025 ++ if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
7026 ++ kvm_inject_page_fault(vcpu, &e);
7027 ++ return 1;
7028 ++ }
7029 ++
7030 ++ if (operand.pcid >> 12 != 0) {
7031 ++ kvm_inject_gp(vcpu, 0);
7032 ++ return 1;
7033 ++ }
7034 ++
7035 ++ pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
7036 ++
7037 ++ switch (type) {
7038 ++ case INVPCID_TYPE_INDIV_ADDR:
7039 ++ if ((!pcid_enabled && (operand.pcid != 0)) ||
7040 ++ is_noncanonical_address(operand.gla, vcpu)) {
7041 ++ kvm_inject_gp(vcpu, 0);
7042 ++ return 1;
7043 ++ }
7044 ++ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
7045 ++ return kvm_skip_emulated_instruction(vcpu);
7046 ++
7047 ++ case INVPCID_TYPE_SINGLE_CTXT:
7048 ++ if (!pcid_enabled && (operand.pcid != 0)) {
7049 ++ kvm_inject_gp(vcpu, 0);
7050 ++ return 1;
7051 ++ }
7052 ++
7053 ++ if (kvm_get_active_pcid(vcpu) == operand.pcid) {
7054 ++ kvm_mmu_sync_roots(vcpu);
7055 ++ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
7056 ++ }
7057 ++
7058 ++ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
7059 ++ if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
7060 ++ == operand.pcid)
7061 ++ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
7062 ++
7063 ++ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
7064 ++ /*
7065 ++ * If neither the current cr3 nor any of the prev_roots use the
7066 ++ * given PCID, then nothing needs to be done here because a
7067 ++ * resync will happen anyway before switching to any other CR3.
7068 ++ */
7069 ++
7070 ++ return kvm_skip_emulated_instruction(vcpu);
7071 ++
7072 ++ case INVPCID_TYPE_ALL_NON_GLOBAL:
7073 ++ /*
7074 ++ * Currently, KVM doesn't mark global entries in the shadow
7075 ++ * page tables, so a non-global flush just degenerates to a
7076 ++ * global flush. If needed, we could optimize this later by
7077 ++ * keeping track of global entries in shadow page tables.
7078 ++ */
7079 ++
7080 ++ /* fall-through */
7081 ++ case INVPCID_TYPE_ALL_INCL_GLOBAL:
7082 ++ kvm_mmu_unload(vcpu);
7083 ++ return kvm_skip_emulated_instruction(vcpu);
7084 ++
7085 ++ default:
7086 ++ BUG(); /* We have already checked above that type <= 3 */
7087 ++ }
7088 ++}
7089 ++
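handle_invpcid() reads a 16-byte descriptor from guest memory and an invalidation type from the instruction-info field. For reference, a self-contained sketch of that descriptor layout and the four architectural INVPCID types (values per the Intel SDM; the struct/enum names are illustrative, and the validation mirrors the checks above):

#include <stdint.h>
#include <stdio.h>

/* 16-byte INVPCID memory operand, as read by handle_invpcid(). */
struct invpcid_desc {
	uint64_t pcid;   /* only bits 11:0 may be non-zero */
	uint64_t gla;    /* linear address, used by type 0 only */
};

enum invpcid_type {
	INVPCID_INDIV_ADDR      = 0,  /* one (PCID, linear address) mapping */
	INVPCID_SINGLE_CTXT     = 1,  /* everything tagged with the PCID    */
	INVPCID_ALL_INCL_GLOBAL = 2,  /* all mappings, including global     */
	INVPCID_ALL_NON_GLOBAL  = 3,  /* all mappings except global         */
};

static int invpcid_operand_valid(enum invpcid_type type,
				 const struct invpcid_desc *desc)
{
	if (type > INVPCID_ALL_NON_GLOBAL)
		return 0;                      /* handle_invpcid(): #GP        */
	if (desc->pcid >> 12)
		return 0;                      /* reserved PCID bits set: #GP  */
	return 1;
}

int main(void)
{
	struct invpcid_desc d = { .pcid = 5, .gla = 0x7f0000001000ULL };

	printf("type 0 valid: %d\n", invpcid_operand_valid(INVPCID_INDIV_ADDR, &d));
	d.pcid = 1u << 12;   /* reserved bit -> rejected */
	printf("bad pcid valid: %d\n", invpcid_operand_valid(INVPCID_SINGLE_CTXT, &d));
	return 0;
}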
7090 ++static int handle_pml_full(struct kvm_vcpu *vcpu)
7091 ++{
7092 ++ unsigned long exit_qualification;
7093 ++
7094 ++ trace_kvm_pml_full(vcpu->vcpu_id);
7095 ++
7096 ++ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7097 ++
7098 ++ /*
7099 ++ * If the PML buffer became full while executing IRET from an NMI,
7100 ++ * the "blocked by NMI" bit has to be set before the next VM entry.
7101 ++ */
7102 ++ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
7103 ++ enable_vnmi &&
7104 ++ (exit_qualification & INTR_INFO_UNBLOCK_NMI))
7105 ++ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7106 ++ GUEST_INTR_STATE_NMI);
7107 ++
7108 ++ /*
7109 ++ * The PML buffer was already flushed at the beginning of the VMEXIT.
7110 ++ * Nothing to do here, and there's no userspace involvement needed for PML.
7111 ++ */
7112 ++ return 1;
7113 ++}
7114 ++
7115 ++static int handle_preemption_timer(struct kvm_vcpu *vcpu)
7116 ++{
7117 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
7118 ++
7119 ++ if (!vmx->req_immediate_exit &&
7120 ++ !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
7121 ++ kvm_lapic_expired_hv_timer(vcpu);
7122 ++
7123 ++ return 1;
7124 ++}
7125 ++
7126 ++/*
7127 ++ * When nested=0, all VMX instruction VM Exits filter here. The handlers
7128 ++ * are overwritten by nested_vmx_setup() when nested=1.
7129 ++ */
7130 ++static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
7131 ++{
7132 ++ kvm_queue_exception(vcpu, UD_VECTOR);
7133 ++ return 1;
7134 ++}
7135 ++
7136 ++static int handle_encls(struct kvm_vcpu *vcpu)
7137 ++{
7138 ++ /*
7139 ++ * SGX virtualization is not yet supported. There is no software
7140 ++ * enable bit for SGX, so we have to trap ENCLS and inject a #UD
7141 ++ * to prevent the guest from executing ENCLS.
7142 ++ */
7143 ++ kvm_queue_exception(vcpu, UD_VECTOR);
7144 ++ return 1;
7145 ++}
7146 ++
7147 ++/*
7148 ++ * The exit handlers return 1 if the exit was handled fully and guest execution
7149 ++ * may resume. Otherwise they set the kvm_run parameter to indicate what needs
7150 ++ * to be done to userspace and return 0.
7151 ++ */
7152 ++static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
7153 ++ [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
7154 ++ [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
7155 ++ [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
7156 ++ [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
7157 ++ [EXIT_REASON_IO_INSTRUCTION] = handle_io,
7158 ++ [EXIT_REASON_CR_ACCESS] = handle_cr,
7159 ++ [EXIT_REASON_DR_ACCESS] = handle_dr,
7160 ++ [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
7161 ++ [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
7162 ++ [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
7163 ++ [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
7164 ++ [EXIT_REASON_HLT] = kvm_emulate_halt,
7165 ++ [EXIT_REASON_INVD] = handle_invd,
7166 ++ [EXIT_REASON_INVLPG] = handle_invlpg,
7167 ++ [EXIT_REASON_RDPMC] = handle_rdpmc,
7168 ++ [EXIT_REASON_VMCALL] = handle_vmcall,
7169 ++ [EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
7170 ++ [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
7171 ++ [EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
7172 ++ [EXIT_REASON_VMPTRST] = handle_vmx_instruction,
7173 ++ [EXIT_REASON_VMREAD] = handle_vmx_instruction,
7174 ++ [EXIT_REASON_VMRESUME] = handle_vmx_instruction,
7175 ++ [EXIT_REASON_VMWRITE] = handle_vmx_instruction,
7176 ++ [EXIT_REASON_VMOFF] = handle_vmx_instruction,
7177 ++ [EXIT_REASON_VMON] = handle_vmx_instruction,
7178 ++ [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
7179 ++ [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
7180 ++ [EXIT_REASON_APIC_WRITE] = handle_apic_write,
7181 ++ [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
7182 ++ [EXIT_REASON_WBINVD] = handle_wbinvd,
7183 ++ [EXIT_REASON_XSETBV] = handle_xsetbv,
7184 ++ [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
7185 ++ [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
7186 ++ [EXIT_REASON_GDTR_IDTR] = handle_desc,
7187 ++ [EXIT_REASON_LDTR_TR] = handle_desc,
7188 ++ [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
7189 ++ [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
7190 ++ [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
7191 ++ [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
7192 ++ [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
7193 ++ [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
7194 ++ [EXIT_REASON_INVEPT] = handle_vmx_instruction,
7195 ++ [EXIT_REASON_INVVPID] = handle_vmx_instruction,
7196 ++ [EXIT_REASON_RDRAND] = handle_invalid_op,
7197 ++ [EXIT_REASON_RDSEED] = handle_invalid_op,
7198 ++ [EXIT_REASON_PML_FULL] = handle_pml_full,
7199 ++ [EXIT_REASON_INVPCID] = handle_invpcid,
7200 ++ [EXIT_REASON_VMFUNC] = handle_vmx_instruction,
7201 ++ [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
7202 ++ [EXIT_REASON_ENCLS] = handle_encls,
7203 ++};
7204 ++
7205 ++static const int kvm_vmx_max_exit_handlers =
7206 ++ ARRAY_SIZE(kvm_vmx_exit_handlers);
7207 ++
7208 ++static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
7209 ++{
7210 ++ *info1 = vmcs_readl(EXIT_QUALIFICATION);
7211 ++ *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
7212 ++}
7213 ++
7214 ++static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
7215 ++{
7216 ++ if (vmx->pml_pg) {
7217 ++ __free_page(vmx->pml_pg);
7218 ++ vmx->pml_pg = NULL;
7219 ++ }
7220 ++}
7221 ++
7222 ++static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
7223 ++{
7224 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
7225 ++ u64 *pml_buf;
7226 ++ u16 pml_idx;
7227 ++
7228 ++ pml_idx = vmcs_read16(GUEST_PML_INDEX);
7229 ++
7230 ++ /* Do nothing if PML buffer is empty */
7231 ++ if (pml_idx == (PML_ENTITY_NUM - 1))
7232 ++ return;
7233 ++
7234 ++ /* PML index always points to next available PML buffer entity */
7235 ++ if (pml_idx >= PML_ENTITY_NUM)
7236 ++ pml_idx = 0;
7237 ++ else
7238 ++ pml_idx++;
7239 ++
7240 ++ pml_buf = page_address(vmx->pml_pg);
7241 ++ for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
7242 ++ u64 gpa;
7243 ++
7244 ++ gpa = pml_buf[pml_idx];
7245 ++ WARN_ON(gpa & (PAGE_SIZE - 1));
7246 ++ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
7247 ++ }
7248 ++
7249 ++ /* reset PML index */
7250 ++ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7251 ++}
7252 ++
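The index handling in vmx_flush_pml_buffer() is easier to follow with a worked example: the CPU fills the 512-entry buffer from the top down, so GUEST_PML_INDEX names the next free slot. A small standalone sketch of the same walk (the buffer contents here are fabricated):

#include <stdint.h>
#include <stdio.h>

#define PML_ENTITY_NUM 512   /* 512 x 8-byte GPAs = one 4 KiB page */

/* Walk the valid tail of a PML buffer like vmx_flush_pml_buffer() does:
 * pml_idx == 511 means "empty", pml_idx >= 512 means "wrapped/full", and
 * otherwise entries pml_idx+1 .. 511 are valid. */
static void walk_pml(const uint64_t *pml_buf, uint16_t pml_idx)
{
	if (pml_idx == (PML_ENTITY_NUM - 1))
		return;                     /* nothing logged yet */

	if (pml_idx >= PML_ENTITY_NUM)
		pml_idx = 0;                /* buffer is completely full */
	else
		pml_idx++;

	for (; pml_idx < PML_ENTITY_NUM; pml_idx++)
		printf("dirty gpa 0x%llx\n",
		       (unsigned long long)pml_buf[pml_idx]);
}

int main(void)
{
	uint64_t buf[PML_ENTITY_NUM] = { 0 };

	/* Pretend the CPU logged two GPAs: slots 511 and 510 are used,
	 * so the index read from the VMCS would be 509. */
	buf[511] = 0x1000;
	buf[510] = 0x5000;
	walk_pml(buf, 509);
	return 0;
}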
7253 ++/*
7254 ++ * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
7255 ++ * Called before reporting dirty_bitmap to userspace.
7256 ++ */
7257 ++static void kvm_flush_pml_buffers(struct kvm *kvm)
7258 ++{
7259 ++ int i;
7260 ++ struct kvm_vcpu *vcpu;
7261 ++ /*
7262 ++ * We only need to kick each vcpu out of guest mode here, as the PML
7263 ++ * buffer is flushed at the beginning of every VMEXIT, so only vcpus
7264 ++ * currently running in guest mode can have unflushed GPAs in their
7265 ++ * PML buffer.
7266 ++ */
7267 ++ kvm_for_each_vcpu(i, vcpu, kvm)
7268 ++ kvm_vcpu_kick(vcpu);
7269 ++}
7270 ++
7271 ++static void vmx_dump_sel(char *name, uint32_t sel)
7272 ++{
7273 ++ pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
7274 ++ name, vmcs_read16(sel),
7275 ++ vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
7276 ++ vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
7277 ++ vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
7278 ++}
7279 ++
7280 ++static void vmx_dump_dtsel(char *name, uint32_t limit)
7281 ++{
7282 ++ pr_err("%s limit=0x%08x, base=0x%016lx\n",
7283 ++ name, vmcs_read32(limit),
7284 ++ vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
7285 ++}
7286 ++
7287 ++void dump_vmcs(void)
7288 ++{
7289 ++ u32 vmentry_ctl, vmexit_ctl;
7290 ++ u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
7291 ++ unsigned long cr4;
7292 ++ u64 efer;
7293 ++ int i, n;
7294 ++
7295 ++ if (!dump_invalid_vmcs) {
7296 ++ pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
7297 ++ return;
7298 ++ }
7299 ++
7300 ++ vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
7301 ++ vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
7302 ++ cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
7303 ++ pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
7304 ++ cr4 = vmcs_readl(GUEST_CR4);
7305 ++ efer = vmcs_read64(GUEST_IA32_EFER);
7306 ++ secondary_exec_control = 0;
7307 ++ if (cpu_has_secondary_exec_ctrls())
7308 ++ secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7309 ++
7310 ++ pr_err("*** Guest State ***\n");
7311 ++ pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
7312 ++ vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
7313 ++ vmcs_readl(CR0_GUEST_HOST_MASK));
7314 ++ pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
7315 ++ cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
7316 ++ pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
7317 ++ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
7318 ++ (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
7319 ++ {
7320 ++ pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
7321 ++ vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
7322 ++ pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
7323 ++ vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
7324 ++ }
7325 ++ pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
7326 ++ vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
7327 ++ pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
7328 ++ vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
7329 ++ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
7330 ++ vmcs_readl(GUEST_SYSENTER_ESP),
7331 ++ vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
7332 ++ vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
7333 ++ vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
7334 ++ vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
7335 ++ vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
7336 ++ vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
7337 ++ vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
7338 ++ vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
7339 ++ vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
7340 ++ vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
7341 ++ vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
7342 ++ if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
7343 ++ (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
7344 ++ pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
7345 ++ efer, vmcs_read64(GUEST_IA32_PAT));
7346 ++ pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
7347 ++ vmcs_read64(GUEST_IA32_DEBUGCTL),
7348 ++ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
7349 ++ if (cpu_has_load_perf_global_ctrl() &&
7350 ++ vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
7351 ++ pr_err("PerfGlobCtl = 0x%016llx\n",
7352 ++ vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
7353 ++ if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
7354 ++ pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
7355 ++ pr_err("Interruptibility = %08x ActivityState = %08x\n",
7356 ++ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
7357 ++ vmcs_read32(GUEST_ACTIVITY_STATE));
7358 ++ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
7359 ++ pr_err("InterruptStatus = %04x\n",
7360 ++ vmcs_read16(GUEST_INTR_STATUS));
7361 ++
7362 ++ pr_err("*** Host State ***\n");
7363 ++ pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
7364 ++ vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
7365 ++ pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
7366 ++ vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
7367 ++ vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
7368 ++ vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
7369 ++ vmcs_read16(HOST_TR_SELECTOR));
7370 ++ pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
7371 ++ vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
7372 ++ vmcs_readl(HOST_TR_BASE));
7373 ++ pr_err("GDTBase=%016lx IDTBase=%016lx\n",
7374 ++ vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
7375 ++ pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
7376 ++ vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
7377 ++ vmcs_readl(HOST_CR4));
7378 ++ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
7379 ++ vmcs_readl(HOST_IA32_SYSENTER_ESP),
7380 ++ vmcs_read32(HOST_IA32_SYSENTER_CS),
7381 ++ vmcs_readl(HOST_IA32_SYSENTER_EIP));
7382 ++ if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
7383 ++ pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
7384 ++ vmcs_read64(HOST_IA32_EFER),
7385 ++ vmcs_read64(HOST_IA32_PAT));
7386 ++ if (cpu_has_load_perf_global_ctrl() &&
7387 ++ vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
7388 ++ pr_err("PerfGlobCtl = 0x%016llx\n",
7389 ++ vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
7390 ++
7391 ++ pr_err("*** Control State ***\n");
7392 ++ pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
7393 ++ pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
7394 ++ pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
7395 ++ pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
7396 ++ vmcs_read32(EXCEPTION_BITMAP),
7397 ++ vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
7398 ++ vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
7399 ++ pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
7400 ++ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7401 ++ vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
7402 ++ vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
7403 ++ pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
7404 ++ vmcs_read32(VM_EXIT_INTR_INFO),
7405 ++ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
7406 ++ vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
7407 ++ pr_err(" reason=%08x qualification=%016lx\n",
7408 ++ vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
7409 ++ pr_err("IDTVectoring: info=%08x errcode=%08x\n",
7410 ++ vmcs_read32(IDT_VECTORING_INFO_FIELD),
7411 ++ vmcs_read32(IDT_VECTORING_ERROR_CODE));
7412 ++ pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
7413 ++ if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
7414 ++ pr_err("TSC Multiplier = 0x%016llx\n",
7415 ++ vmcs_read64(TSC_MULTIPLIER));
7416 ++ if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
7417 ++ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
7418 ++ u16 status = vmcs_read16(GUEST_INTR_STATUS);
7419 ++ pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
7420 ++ }
7421 ++ pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
7422 ++ if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
7423 ++ pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
7424 ++ pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
7425 ++ }
7426 ++ if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
7427 ++ pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
7428 ++ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
7429 ++ pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
7430 ++ n = vmcs_read32(CR3_TARGET_COUNT);
7431 ++ for (i = 0; i + 1 < n; i += 4)
7432 ++ pr_err("CR3 target%u=%016lx target%u=%016lx\n",
7433 ++ i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
7434 ++ i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
7435 ++ if (i < n)
7436 ++ pr_err("CR3 target%u=%016lx\n",
7437 ++ i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
7438 ++ if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
7439 ++ pr_err("PLE Gap=%08x Window=%08x\n",
7440 ++ vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
7441 ++ if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
7442 ++ pr_err("Virtual processor ID = 0x%04x\n",
7443 ++ vmcs_read16(VIRTUAL_PROCESSOR_ID));
7444 ++}
7445 ++
7446 ++/*
7447 ++ * The guest has exited. See if we can fix it or if we need userspace
7448 ++ * assistance.
7449 ++ */
7450 ++static int vmx_handle_exit(struct kvm_vcpu *vcpu,
7451 ++ enum exit_fastpath_completion exit_fastpath)
7452 ++{
7453 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
7454 ++ u32 exit_reason = vmx->exit_reason;
7455 ++ u32 vectoring_info = vmx->idt_vectoring_info;
7456 ++
7457 ++ trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
7458 ++
7459 ++ /*
7460 ++ * Flush the PML buffer of logged GPAs; this keeps dirty_bitmap up to
7461 ++ * date. Another benefit is that kvm_vm_ioctl_get_dirty_log only needs
7462 ++ * to kick all vcpus out of guest mode before querying dirty_bitmap,
7463 ++ * because a vcpu that is already in root mode must have had its PML
7464 ++ * buffer flushed.
7465 ++ */
7466 ++ if (enable_pml)
7467 ++ vmx_flush_pml_buffer(vcpu);
7468 ++
7469 ++ /* If guest state is invalid, start emulating */
7470 ++ if (vmx->emulation_required)
7471 ++ return handle_invalid_guest_state(vcpu);
7472 ++
7473 ++ if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
7474 ++ return nested_vmx_reflect_vmexit(vcpu, exit_reason);
7475 ++
7476 ++ if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
7477 ++ dump_vmcs();
7478 ++ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
7479 ++ vcpu->run->fail_entry.hardware_entry_failure_reason
7480 ++ = exit_reason;
7481 ++ return 0;
7482 ++ }
7483 ++
7484 ++ if (unlikely(vmx->fail)) {
7485 ++ dump_vmcs();
7486 ++ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
7487 ++ vcpu->run->fail_entry.hardware_entry_failure_reason
7488 ++ = vmcs_read32(VM_INSTRUCTION_ERROR);
7489 ++ return 0;
7490 ++ }
7491 ++
7492 ++ /*
7493 ++ * Note:
7494 ++ * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by a
7495 ++ * delivery event, since that indicates the guest is accessing MMIO.
7496 ++ * The vm-exit could be triggered again after returning to the guest,
7497 ++ * which would cause an infinite loop.
7498 ++ */
7499 ++ if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
7500 ++ (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
7501 ++ exit_reason != EXIT_REASON_EPT_VIOLATION &&
7502 ++ exit_reason != EXIT_REASON_PML_FULL &&
7503 ++ exit_reason != EXIT_REASON_TASK_SWITCH)) {
7504 ++ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7505 ++ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
7506 ++ vcpu->run->internal.ndata = 3;
7507 ++ vcpu->run->internal.data[0] = vectoring_info;
7508 ++ vcpu->run->internal.data[1] = exit_reason;
7509 ++ vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
7510 ++ if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
7511 ++ vcpu->run->internal.ndata++;
7512 ++ vcpu->run->internal.data[3] =
7513 ++ vmcs_read64(GUEST_PHYSICAL_ADDRESS);
7514 ++ }
7515 ++ return 0;
7516 ++ }
7517 ++
7518 ++ if (unlikely(!enable_vnmi &&
7519 ++ vmx->loaded_vmcs->soft_vnmi_blocked)) {
7520 ++ if (vmx_interrupt_allowed(vcpu)) {
7521 ++ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
7522 ++ } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
7523 ++ vcpu->arch.nmi_pending) {
7524 ++ /*
7525 ++ * This CPU doesn't help us find the end of an
7526 ++ * NMI-blocked window if the guest runs with IRQs
7527 ++ * disabled. So we pull the trigger after 1 s of
7528 ++ * futile waiting, but inform the user about this.
7529 ++ */
7530 ++ printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
7531 ++ "state on VCPU %d after 1 s timeout\n",
7532 ++ __func__, vcpu->vcpu_id);
7533 ++ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
7534 ++ }
7535 ++ }
7536 ++
7537 ++ if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
7538 ++ kvm_skip_emulated_instruction(vcpu);
7539 ++ return 1;
7540 ++ } else if (exit_reason < kvm_vmx_max_exit_handlers
7541 ++ && kvm_vmx_exit_handlers[exit_reason]) {
7542 ++#ifdef CONFIG_RETPOLINE
7543 ++ if (exit_reason == EXIT_REASON_MSR_WRITE)
7544 ++ return kvm_emulate_wrmsr(vcpu);
7545 ++ else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
7546 ++ return handle_preemption_timer(vcpu);
7547 ++ else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
7548 ++ return handle_interrupt_window(vcpu);
7549 ++ else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
7550 ++ return handle_external_interrupt(vcpu);
7551 ++ else if (exit_reason == EXIT_REASON_HLT)
7552 ++ return kvm_emulate_halt(vcpu);
7553 ++ else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
7554 ++ return handle_ept_misconfig(vcpu);
7555 ++#endif
7556 ++ return kvm_vmx_exit_handlers[exit_reason](vcpu);
7557 ++ } else {
7558 ++ vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
7559 ++ exit_reason);
7560 ++ dump_vmcs();
7561 ++ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7562 ++ vcpu->run->internal.suberror =
7563 ++ KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
7564 ++ vcpu->run->internal.ndata = 1;
7565 ++ vcpu->run->internal.data[0] = exit_reason;
7566 ++ return 0;
7567 ++ }
7568 ++}
7569 ++
7570 ++/*
7571 ++ * Software-based L1D cache flush, used when the microcode providing
7572 ++ * the cache control MSR is not loaded.
7573 ++ *
7574 ++ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
7575 ++ * flushing it requires reading 64 KiB because the replacement algorithm
7576 ++ * is not exactly LRU. This could be sized at runtime via topology
7577 ++ * information, but as all relevant affected CPUs have a 32 KiB L1D cache
7578 ++ * there is no point in doing so.
7579 ++ */
7580 ++static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
7581 ++{
7582 ++ int size = PAGE_SIZE << L1D_CACHE_ORDER;
7583 ++
7584 ++ /*
7585 ++ * This code is only executed when the flush mode is 'cond' or
7586 ++ * 'always'.
7587 ++ */
7588 ++ if (static_branch_likely(&vmx_l1d_flush_cond)) {
7589 ++ bool flush_l1d;
7590 ++
7591 ++ /*
7592 ++ * Clear the per-vcpu flush bit, it gets set again
7593 ++ * either from vcpu_run() or from one of the unsafe
7594 ++ * VMEXIT handlers.
7595 ++ */
7596 ++ flush_l1d = vcpu->arch.l1tf_flush_l1d;
7597 ++ vcpu->arch.l1tf_flush_l1d = false;
7598 ++
7599 ++ /*
7600 ++ * Clear the per-cpu flush bit, it gets set again from
7601 ++ * the interrupt handlers.
7602 ++ */
7603 ++ flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
7604 ++ kvm_clear_cpu_l1tf_flush_l1d();
7605 ++
7606 ++ if (!flush_l1d)
7607 ++ return;
7608 ++ }
7609 ++
7610 ++ vcpu->stat.l1d_flush++;
7611 ++
7612 ++ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
7613 ++ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
7614 ++ return;
7615 ++ }
7616 ++
7617 ++ asm volatile(
7618 ++ /* First ensure the pages are in the TLB */
7619 ++ "xorl %%eax, %%eax\n"
7620 ++ ".Lpopulate_tlb:\n\t"
7621 ++ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
7622 ++ "addl $4096, %%eax\n\t"
7623 ++ "cmpl %%eax, %[size]\n\t"
7624 ++ "jne .Lpopulate_tlb\n\t"
7625 ++ "xorl %%eax, %%eax\n\t"
7626 ++ "cpuid\n\t"
7627 ++ /* Now fill the cache */
7628 ++ "xorl %%eax, %%eax\n"
7629 ++ ".Lfill_cache:\n"
7630 ++ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
7631 ++ "addl $64, %%eax\n\t"
7632 ++ "cmpl %%eax, %[size]\n\t"
7633 ++ "jne .Lfill_cache\n\t"
7634 ++ "lfence\n"
7635 ++ :: [flush_pages] "r" (vmx_l1d_flush_pages),
7636 ++ [size] "r" (size)
7637 ++ : "eax", "ebx", "ecx", "edx");
7638 ++}
7639 ++
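The sizing in vmx_l1d_flush() works out as follows, assuming L1D_CACHE_ORDER is 4 as defined elsewhere in this file: size = 4096 << 4 = 64 KiB, so the first loop touches 16 pages to populate the TLB and the second loop touches 1024 cache lines of 64 bytes to displace the 32 KiB L1D. A trivial standalone check of that arithmetic:

#include <stdio.h>

#define PAGE_SIZE        4096
#define L1D_CACHE_ORDER  4   /* assumed: matches the definition used by vmx_l1d_flush() */

int main(void)
{
	int size = PAGE_SIZE << L1D_CACHE_ORDER;

	printf("flush buffer      : %d KiB\n", size / 1024);   /* 64   */
	printf("TLB-populate iters: %d\n", size / PAGE_SIZE);  /* 16   */
	printf("cache-fill iters  : %d\n", size / 64);         /* 1024 */
	return 0;
}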
7640 ++static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
7641 ++{
7642 ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7643 ++ int tpr_threshold;
7644 ++
7645 ++ if (is_guest_mode(vcpu) &&
7646 ++ nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
7647 ++ return;
7648 ++
7649 ++ tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
7650 ++ if (is_guest_mode(vcpu))
7651 ++ to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
7652 ++ else
7653 ++ vmcs_write32(TPR_THRESHOLD, tpr_threshold);
7654 ++}
7655 ++
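update_cr8_intercept() picks the threshold so that the guest only traps when lowering its TPR would unblock a pending interrupt. A small standalone illustration of that rule (the priority values below are made up):

#include <stdio.h>

/* Same rule as update_cr8_intercept(): if nothing is pending (irr == -1)
 * or the pending vector already outranks the TPR (tpr < irr), no trap is
 * needed, so the threshold is 0; otherwise trap once TPR drops below irr. */
static int tpr_threshold(int tpr, int irr)
{
	return (irr == -1 || tpr < irr) ? 0 : irr;
}

int main(void)
{
	printf("%d\n", tpr_threshold(9, -1));  /* nothing pending     -> 0 */
	printf("%d\n", tpr_threshold(4, 9));   /* irq deliverable now -> 0 */
	printf("%d\n", tpr_threshold(9, 5));   /* blocked by TPR      -> 5 */
	return 0;
}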
7656 ++void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
7657 ++{
7658 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
7659 ++ u32 sec_exec_control;
7660 ++
7661 ++ if (!lapic_in_kernel(vcpu))
7662 ++ return;
7663 ++
7664 ++ if (!flexpriority_enabled &&
7665 ++ !cpu_has_vmx_virtualize_x2apic_mode())
7666 ++ return;
7667 ++
7668 ++ /* Postpone execution until vmcs01 is the current VMCS. */
7669 ++ if (is_guest_mode(vcpu)) {
7670 ++ vmx->nested.change_vmcs01_virtual_apic_mode = true;
7671 ++ return;
7672 ++ }
7673 ++
7674 ++ sec_exec_control = secondary_exec_controls_get(vmx);
7675 ++ sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7676 ++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
7677 ++
7678 ++ switch (kvm_get_apic_mode(vcpu)) {
7679 ++ case LAPIC_MODE_INVALID:
7680 ++ WARN_ONCE(true, "Invalid local APIC state");
7681 ++ case LAPIC_MODE_DISABLED:
7682 ++ break;
7683 ++ case LAPIC_MODE_XAPIC:
7684 ++ if (flexpriority_enabled) {
7685 ++ sec_exec_control |=
7686 ++ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
7687 ++ vmx_flush_tlb(vcpu, true);
7688 ++ }
7689 ++ break;
7690 ++ case LAPIC_MODE_X2APIC:
7691 ++ if (cpu_has_vmx_virtualize_x2apic_mode())
7692 ++ sec_exec_control |=
7693 ++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
7694 ++ break;
7695 ++ }
7696 ++ secondary_exec_controls_set(vmx, sec_exec_control);
7697 ++
7698 ++ vmx_update_msr_bitmap(vcpu);
7699 ++}
7700 ++
7701 ++static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
7702 ++{
7703 ++ if (!is_guest_mode(vcpu)) {
7704 ++ vmcs_write64(APIC_ACCESS_ADDR, hpa);
7705 ++ vmx_flush_tlb(vcpu, true);
7706 ++ }
7707 ++}
7708 ++
7709 ++static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
7710 ++{
7711 ++ u16 status;
7712 ++ u8 old;
7713 ++
7714 ++ if (max_isr == -1)
7715 ++ max_isr = 0;
7716 ++
7717 ++ status = vmcs_read16(GUEST_INTR_STATUS);
7718 ++ old = status >> 8;
7719 ++ if (max_isr != old) {
7720 ++ status &= 0xff;
7721 ++ status |= max_isr << 8;
7722 ++ vmcs_write16(GUEST_INTR_STATUS, status);
7723 ++ }
7724 ++}
7725 ++
7726 ++static void vmx_set_rvi(int vector)
7727 ++{
7728 ++ u16 status;
7729 ++ u8 old;
7730 ++
7731 ++ if (vector == -1)
7732 ++ vector = 0;
7733 ++
7734 ++ status = vmcs_read16(GUEST_INTR_STATUS);
7735 ++ old = (u8)status & 0xff;
7736 ++ if ((u8)vector != old) {
7737 ++ status &= ~0xff;
7738 ++ status |= (u8)vector;
7739 ++ vmcs_write16(GUEST_INTR_STATUS, status);
7740 ++ }
7741 ++}
7742 ++
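vmx_set_rvi() and vmx_hwapic_isr_update() both poke the 16-bit guest interrupt status field, which packs RVI in the low byte and SVI in the high byte. A tiny standalone sketch of that packing (helper names are illustrative):

#include <stdint.h>
#include <stdio.h>

/* GUEST_INTR_STATUS layout: bits 7:0 = RVI (requesting vector),
 * bits 15:8 = SVI (in-service vector). */
static uint16_t set_rvi(uint16_t status, uint8_t vector)
{
	return (status & ~0xffu) | vector;
}

static uint16_t set_svi(uint16_t status, uint8_t max_isr)
{
	return (status & 0xffu) | ((uint16_t)max_isr << 8);
}

int main(void)
{
	uint16_t status = 0;

	status = set_rvi(status, 0x31);   /* vector 0x31 pending    */
	status = set_svi(status, 0x20);   /* vector 0x20 in service */
	printf("GUEST_INTR_STATUS = 0x%04x\n", (unsigned)status);  /* 0x2031 */
	return 0;
}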
7743 ++static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
7744 ++{
7745 ++ /*
7746 ++ * When running L2, updating RVI is only relevant when
7747 ++ * vmcs12 virtual-interrupt-delivery is enabled.
7748 ++ * However, it can be enabled only when L1 also
7749 ++ * intercepts external interrupts, and in that case
7750 ++ * we should not update vmcs02's RVI but instead intercept
7751 ++ * the interrupt. Therefore, do nothing when running L2.
7752 ++ */
7753 ++ if (!is_guest_mode(vcpu))
7754 ++ vmx_set_rvi(max_irr);
7755 ++}
7756 ++
7757 ++static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
7758 ++{
7759 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
7760 ++ int max_irr;
7761 ++ bool max_irr_updated;
7762 ++
7763 ++ WARN_ON(!vcpu->arch.apicv_active);
7764 ++ if (pi_test_on(&vmx->pi_desc)) {
7765 ++ pi_clear_on(&vmx->pi_desc);
7766 ++ /*
7767 ++ * IOMMU can write to PID.ON, so the barrier matters even on UP.
7768 ++ * But on x86 this is just a compiler barrier anyway.
7769 ++ */
7770 ++ smp_mb__after_atomic();
7771 ++ max_irr_updated =
7772 ++ kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
7773 ++
7774 ++ /*
7775 ++ * If we are running L2 and L1 has a new pending interrupt
7776 ++ * which can be injected, we should re-evaluate
7777 ++ * what should be done with this new L1 interrupt.
7778 ++ * If L1 intercepts external-interrupts, we should
7779 ++ * exit from L2 to L1. Otherwise, interrupt should be
7780 ++ * delivered directly to L2.
7781 ++ */
7782 ++ if (is_guest_mode(vcpu) && max_irr_updated) {
7783 ++ if (nested_exit_on_intr(vcpu))
7784 ++ kvm_vcpu_exiting_guest_mode(vcpu);
7785 ++ else
7786 ++ kvm_make_request(KVM_REQ_EVENT, vcpu);
7787 ++ }
7788 ++ } else {
7789 ++ max_irr = kvm_lapic_find_highest_irr(vcpu);
7790 ++ }
7791 ++ vmx_hwapic_irr_update(vcpu, max_irr);
7792 ++ return max_irr;
7793 ++}
7794 ++
7795 ++static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
7796 ++{
7797 ++ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
7798 ++
7799 ++ return pi_test_on(pi_desc) ||
7800 ++ (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
7801 ++}
7802 ++
7803 ++static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
7804 ++{
7805 ++ if (!kvm_vcpu_apicv_active(vcpu))
7806 ++ return;
7807 ++
7808 ++ vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
7809 ++ vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
7810 ++ vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
7811 ++ vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
7812 ++}
7813 ++
7814 ++static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
7815 ++{
7816 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
7817 ++
7818 ++ pi_clear_on(&vmx->pi_desc);
7819 ++ memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
7820 ++}
7821 ++
7822 ++static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
7823 ++{
7824 ++ vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7825 ++
7826 ++ /* if exit due to PF check for async PF */
7827 ++ if (is_page_fault(vmx->exit_intr_info))
7828 ++ vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
7829 ++
7830 ++ /* Handle machine checks before interrupts are enabled */
7831 ++ if (is_machine_check(vmx->exit_intr_info))
7832 ++ kvm_machine_check();
7833 ++
7834 ++ /* We need to handle NMIs before interrupts are enabled */
7835 ++ if (is_nmi(vmx->exit_intr_info)) {
7836 ++ kvm_before_interrupt(&vmx->vcpu);
7837 ++ asm("int $2");
7838 ++ kvm_after_interrupt(&vmx->vcpu);
7839 ++ }
7840 ++}
7841 ++
7842 ++static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
7843 ++{
7844 ++ unsigned int vector;
7845 ++ unsigned long entry;
7846 ++#ifdef CONFIG_X86_64
7847 ++ unsigned long tmp;
7848 ++#endif
7849 ++ gate_desc *desc;
7850 ++ u32 intr_info;
7851 ++
7852 ++ intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7853 ++ if (WARN_ONCE(!is_external_intr(intr_info),
7854 ++ "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
7855 ++ return;
7856 ++
7857 ++ vector = intr_info & INTR_INFO_VECTOR_MASK;
7858 ++ desc = (gate_desc *)host_idt_base + vector;
7859 ++ entry = gate_offset(desc);
7860 ++
7861 ++ kvm_before_interrupt(vcpu);
7862 ++
7863 ++ asm volatile(
7864 ++#ifdef CONFIG_X86_64
7865 ++ "mov %%" _ASM_SP ", %[sp]\n\t"
7866 ++ "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
7867 ++ "push $%c[ss]\n\t"
7868 ++ "push %[sp]\n\t"
7869 ++#endif
7870 ++ "pushf\n\t"
7871 ++ __ASM_SIZE(push) " $%c[cs]\n\t"
7872 ++ CALL_NOSPEC
7873 ++ :
7874 ++#ifdef CONFIG_X86_64
7875 ++ [sp]"=&r"(tmp),
7876 ++#endif
7877 ++ ASM_CALL_CONSTRAINT
7878 ++ :
7879 ++ THUNK_TARGET(entry),
7880 ++ [ss]"i"(__KERNEL_DS),
7881 ++ [cs]"i"(__KERNEL_CS)
7882 ++ );
7883 ++
7884 ++ kvm_after_interrupt(vcpu);
7885 ++}
7886 ++STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
7887 ++
7888 ++static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
7889 ++ enum exit_fastpath_completion *exit_fastpath)
7890 ++{
7891 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
7892 ++
7893 ++ if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
7894 ++ handle_external_interrupt_irqoff(vcpu);
7895 ++ else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
7896 ++ handle_exception_nmi_irqoff(vmx);
7897 ++ else if (!is_guest_mode(vcpu) &&
7898 ++ vmx->exit_reason == EXIT_REASON_MSR_WRITE)
7899 ++ *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
7900 ++}
7901 ++
7902 ++static bool vmx_has_emulated_msr(int index)
7903 ++{
7904 ++ switch (index) {
7905 ++ case MSR_IA32_SMBASE:
7906 ++ /*
7907 ++ * We cannot do SMM unless we can run the guest in big
7908 ++ * real mode.
7909 ++ */
7910 ++ return enable_unrestricted_guest || emulate_invalid_guest_state;
7911 ++ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
7912 ++ return nested;
7913 ++ case MSR_AMD64_VIRT_SPEC_CTRL:
7914 ++ /* This is AMD only. */
7915 ++ return false;
7916 ++ default:
7917 ++ return true;
7918 ++ }
7919 ++}
7920 ++
7921 ++static bool vmx_pt_supported(void)
7922 ++{
7923 ++ return pt_mode == PT_MODE_HOST_GUEST;
7924 ++}
7925 ++
7926 ++static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
7927 ++{
7928 ++ u32 exit_intr_info;
7929 ++ bool unblock_nmi;
7930 ++ u8 vector;
7931 ++ bool idtv_info_valid;
7932 ++
7933 ++ idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7934 ++
7935 ++ if (enable_vnmi) {
7936 ++ if (vmx->loaded_vmcs->nmi_known_unmasked)
7937 ++ return;
7938 ++ /*
7939 ++ * Can't use vmx->exit_intr_info since we're not sure what
7940 ++ * the exit reason is.
7941 ++ */
7942 ++ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7943 ++ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
7944 ++ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
7945 ++ /*
7946 ++ * SDM 3: 27.7.1.2 (September 2008)
7947 ++ * Re-set bit "block by NMI" before VM entry if vmexit caused by
7948 ++ * a guest IRET fault.
7949 ++ * SDM 3: 23.2.2 (September 2008)
7950 ++ * Bit 12 is undefined in any of the following cases:
7951 ++ * If the VM exit sets the valid bit in the IDT-vectoring
7952 ++ * information field.
7953 ++ * If the VM exit is due to a double fault.
7954 ++ */
7955 ++ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
7956 ++ vector != DF_VECTOR && !idtv_info_valid)
7957 ++ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7958 ++ GUEST_INTR_STATE_NMI);
7959 ++ else
7960 ++ vmx->loaded_vmcs->nmi_known_unmasked =
7961 ++ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
7962 ++ & GUEST_INTR_STATE_NMI);
7963 ++ } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7964 ++ vmx->loaded_vmcs->vnmi_blocked_time +=
7965 ++ ktime_to_ns(ktime_sub(ktime_get(),
7966 ++ vmx->loaded_vmcs->entry_time));
7967 ++}
7968 ++
7969 ++static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7970 ++ u32 idt_vectoring_info,
7971 ++ int instr_len_field,
7972 ++ int error_code_field)
7973 ++{
7974 ++ u8 vector;
7975 ++ int type;
7976 ++ bool idtv_info_valid;
7977 ++
7978 ++ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7979 ++
7980 ++ vcpu->arch.nmi_injected = false;
7981 ++ kvm_clear_exception_queue(vcpu);
7982 ++ kvm_clear_interrupt_queue(vcpu);
7983 ++
7984 ++ if (!idtv_info_valid)
7985 ++ return;
7986 ++
7987 ++ kvm_make_request(KVM_REQ_EVENT, vcpu);
7988 ++
7989 ++ vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7990 ++ type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
7991 ++
7992 ++ switch (type) {
7993 ++ case INTR_TYPE_NMI_INTR:
7994 ++ vcpu->arch.nmi_injected = true;
7995 ++ /*
7996 ++ * SDM 3: 27.7.1.2 (September 2008)
7997 ++ * Clear bit "block by NMI" before VM entry if a NMI
7998 ++ * delivery faulted.
7999 ++ */
8000 ++ vmx_set_nmi_mask(vcpu, false);
8001 ++ break;
8002 ++ case INTR_TYPE_SOFT_EXCEPTION:
8003 ++ vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
8004 ++ /* fall through */
8005 ++ case INTR_TYPE_HARD_EXCEPTION:
8006 ++ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
8007 ++ u32 err = vmcs_read32(error_code_field);
8008 ++ kvm_requeue_exception_e(vcpu, vector, err);
8009 ++ } else
8010 ++ kvm_requeue_exception(vcpu, vector);
8011 ++ break;
8012 ++ case INTR_TYPE_SOFT_INTR:
8013 ++ vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
8014 ++ /* fall through */
8015 ++ case INTR_TYPE_EXT_INTR:
8016 ++ kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
8017 ++ break;
8018 ++ default:
8019 ++ break;
8020 ++ }
8021 ++}
8022 ++
8023 ++static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
8024 ++{
8025 ++ __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
8026 ++ VM_EXIT_INSTRUCTION_LEN,
8027 ++ IDT_VECTORING_ERROR_CODE);
8028 ++}
8029 ++
8030 ++static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
8031 ++{
8032 ++ __vmx_complete_interrupts(vcpu,
8033 ++ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
8034 ++ VM_ENTRY_INSTRUCTION_LEN,
8035 ++ VM_ENTRY_EXCEPTION_ERROR_CODE);
8036 ++
8037 ++ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
8038 ++}
8039 ++
8040 ++static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
8041 ++{
8042 ++ int i, nr_msrs;
8043 ++ struct perf_guest_switch_msr *msrs;
8044 ++
8045 ++ msrs = perf_guest_get_msrs(&nr_msrs);
8046 ++
8047 ++ if (!msrs)
8048 ++ return;
8049 ++
8050 ++ for (i = 0; i < nr_msrs; i++)
8051 ++ if (msrs[i].host == msrs[i].guest)
8052 ++ clear_atomic_switch_msr(vmx, msrs[i].msr);
8053 ++ else
8054 ++ add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
8055 ++ msrs[i].host, false);
8056 ++}
8057 ++
8058 ++static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx)
8059 ++{
8060 ++ u32 host_umwait_control;
8061 ++
8062 ++ if (!vmx_has_waitpkg(vmx))
8063 ++ return;
8064 ++
8065 ++ host_umwait_control = get_umwait_control_msr();
8066 ++
8067 ++ if (vmx->msr_ia32_umwait_control != host_umwait_control)
8068 ++ add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL,
8069 ++ vmx->msr_ia32_umwait_control,
8070 ++ host_umwait_control, false);
8071 ++ else
8072 ++ clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL);
8073 ++}
8074 ++
8075 ++static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
8076 ++{
8077 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
8078 ++ u64 tscl;
8079 ++ u32 delta_tsc;
8080 ++
8081 ++ if (vmx->req_immediate_exit) {
8082 ++ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
8083 ++ vmx->loaded_vmcs->hv_timer_soft_disabled = false;
8084 ++ } else if (vmx->hv_deadline_tsc != -1) {
8085 ++ tscl = rdtsc();
8086 ++ if (vmx->hv_deadline_tsc > tscl)
8087 ++ /* set_hv_timer ensures the delta fits in 32-bits */
8088 ++ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
8089 ++ cpu_preemption_timer_multi);
8090 ++ else
8091 ++ delta_tsc = 0;
8092 ++
8093 ++ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
8094 ++ vmx->loaded_vmcs->hv_timer_soft_disabled = false;
8095 ++ } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
8096 ++ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
8097 ++ vmx->loaded_vmcs->hv_timer_soft_disabled = true;
8098 ++ }
8099 ++}
8100 ++
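vmx_update_hv_timer() converts a TSC deadline into preemption-timer ticks by shifting right by cpu_preemption_timer_multi, because the hardware timer counts at the TSC rate divided by that power of two. A short standalone example of the conversion (the shift value of 5 is only an illustration; the real one comes from IA32_VMX_MISC):

#include <stdint.h>
#include <stdio.h>

/* deadline and now are TSC values; multi is the power-of-two divisor the
 * CPU applies to the TSC for the VMX preemption timer. */
static uint32_t preemption_timer_value(uint64_t deadline, uint64_t now,
				       unsigned int multi)
{
	if (deadline <= now)
		return 0;                 /* already expired -> exit as soon as possible */
	return (uint32_t)((deadline - now) >> multi);
}

int main(void)
{
	/* Deadline 3,200,000 TSC ticks in the future, timer ticks at TSC/32. */
	printf("%u\n", preemption_timer_value(13200000, 10000000, 5)); /* 100000 */
	return 0;
}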
8101 ++void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
8102 ++{
8103 ++ if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
8104 ++ vmx->loaded_vmcs->host_state.rsp = host_rsp;
8105 ++ vmcs_writel(HOST_RSP, host_rsp);
8106 ++ }
8107 ++}
8108 ++
8109 ++bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
8110 ++
8111 ++static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
8112 ++{
8113 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
8114 ++ unsigned long cr3, cr4;
8115 ++
8116 ++ /* Record the guest's net vcpu time for enforced NMI injections. */
8117 ++ if (unlikely(!enable_vnmi &&
8118 ++ vmx->loaded_vmcs->soft_vnmi_blocked))
8119 ++ vmx->loaded_vmcs->entry_time = ktime_get();
8120 ++
8121 ++ /* Don't enter VMX if guest state is invalid; let the exit handler
8122 ++ start emulation until we arrive back at a valid state. */
8123 ++ if (vmx->emulation_required)
8124 ++ return;
8125 ++
8126 ++ if (vmx->ple_window_dirty) {
8127 ++ vmx->ple_window_dirty = false;
8128 ++ vmcs_write32(PLE_WINDOW, vmx->ple_window);
8129 ++ }
8130 ++
8131 ++ if (vmx->nested.need_vmcs12_to_shadow_sync)
8132 ++ nested_sync_vmcs12_to_shadow(vcpu);
8133 ++
8134 ++ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
8135 ++ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
8136 ++ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
8137 ++ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
8138 ++
8139 ++ cr3 = __get_current_cr3_fast();
8140 ++ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
8141 ++ vmcs_writel(HOST_CR3, cr3);
8142 ++ vmx->loaded_vmcs->host_state.cr3 = cr3;
8143 ++ }
8144 ++
8145 ++ cr4 = cr4_read_shadow();
8146 ++ if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
8147 ++ vmcs_writel(HOST_CR4, cr4);
8148 ++ vmx->loaded_vmcs->host_state.cr4 = cr4;
8149 ++ }
8150 ++
8151 ++ /* When single-stepping over STI and MOV SS, we must clear the
8152 ++ * corresponding interruptibility bits in the guest state. Otherwise
8153 ++ * vmentry fails as it then expects bit 14 (BS) in pending debug
8154 ++ * exceptions being set, but that's not correct for the guest debugging
8155 ++ * case. */
8156 ++ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
8157 ++ vmx_set_interrupt_shadow(vcpu, 0);
8158 ++
8159 ++ kvm_load_guest_xsave_state(vcpu);
8160 ++
8161 ++ if (static_cpu_has(X86_FEATURE_PKU) &&
8162 ++ kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
8163 ++ vcpu->arch.pkru != vmx->host_pkru)
8164 ++ __write_pkru(vcpu->arch.pkru);
8165 ++
8166 ++ pt_guest_enter(vmx);
8167 ++
8168 ++ atomic_switch_perf_msrs(vmx);
8169 ++ atomic_switch_umwait_control_msr(vmx);
8170 ++
8171 ++ if (enable_preemption_timer)
8172 ++ vmx_update_hv_timer(vcpu);
8173 ++
8174 ++ if (lapic_in_kernel(vcpu) &&
8175 ++ vcpu->arch.apic->lapic_timer.timer_advance_ns)
8176 ++ kvm_wait_lapic_expire(vcpu);
8177 ++
8178 ++ /*
8179 ++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
8180 ++ * it's non-zero. Since vmentry is serialising on affected CPUs, there
8181 ++ * is no need to worry about the conditional branch over the wrmsr
8182 ++ * being speculatively taken.
8183 ++ */
8184 ++ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
8185 ++
8186 ++ /* L1D Flush includes CPU buffer clear to mitigate MDS */
8187 ++ if (static_branch_unlikely(&vmx_l1d_should_flush))
8188 ++ vmx_l1d_flush(vcpu);
8189 ++ else if (static_branch_unlikely(&mds_user_clear))
8190 ++ mds_clear_cpu_buffers();
8191 ++
8192 ++ if (vcpu->arch.cr2 != read_cr2())
8193 ++ write_cr2(vcpu->arch.cr2);
8194 ++
8195 ++ vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
8196 ++ vmx->loaded_vmcs->launched);
8197 ++
8198 ++ vcpu->arch.cr2 = read_cr2();
8199 ++
8200 ++ /*
8201 ++ * We do not use IBRS in the kernel. If this vCPU has used the
8202 ++ * SPEC_CTRL MSR it may have left it on; save the value and
8203 ++ * turn it off. This is much more efficient than blindly adding
8204 ++ * it to the atomic save/restore list. Especially as the former
8205 ++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
8206 ++ *
8207 ++ * For non-nested case:
8208 ++ * If the L01 MSR bitmap does not intercept the MSR, then we need to
8209 ++ * save it.
8210 ++ *
8211 ++ * For nested case:
8212 ++ * If the L02 MSR bitmap does not intercept the MSR, then we need to
8213 ++ * save it.
8214 ++ */
8215 ++ if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
8216 ++ vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
8217 ++
8218 ++ x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
8219 ++
8220 ++ /* All fields are clean at this point */
8221 ++ if (static_branch_unlikely(&enable_evmcs))
8222 ++ current_evmcs->hv_clean_fields |=
8223 ++ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
8224 ++
8225 ++ if (static_branch_unlikely(&enable_evmcs))
8226 ++ current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
8227 ++
8228 ++ /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
8229 ++ if (vmx->host_debugctlmsr)
8230 ++ update_debugctlmsr(vmx->host_debugctlmsr);
8231 ++
8232 ++#ifndef CONFIG_X86_64
8233 ++ /*
8234 ++ * The sysexit path does not restore ds/es, so we must set them to
8235 ++ * a reasonable value ourselves.
8236 ++ *
8237 ++ * We can't defer this to vmx_prepare_switch_to_host() since that
8238 ++ * function may be executed in interrupt context, which saves and
8239 ++ * restores segments around it, nullifying its effect.
8240 ++ */
8241 ++ loadsegment(ds, __USER_DS);
8242 ++ loadsegment(es, __USER_DS);
8243 ++#endif
8244 ++
8245 ++ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
8246 ++ | (1 << VCPU_EXREG_RFLAGS)
8247 ++ | (1 << VCPU_EXREG_PDPTR)
8248 ++ | (1 << VCPU_EXREG_SEGMENTS)
8249 ++ | (1 << VCPU_EXREG_CR3));
8250 ++ vcpu->arch.regs_dirty = 0;
8251 ++
8252 ++ pt_guest_exit(vmx);
8253 ++
8254 ++ /*
8255 ++ * eager fpu is enabled if PKEY is supported and CR4 is switched
8256 ++ * back on host, so it is safe to read guest PKRU from current
8257 ++ * XSAVE.
8258 ++ */
8259 ++ if (static_cpu_has(X86_FEATURE_PKU) &&
8260 ++ kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
8261 ++ vcpu->arch.pkru = rdpkru();
8262 ++ if (vcpu->arch.pkru != vmx->host_pkru)
8263 ++ __write_pkru(vmx->host_pkru);
8264 ++ }
8265 ++
8266 ++ kvm_load_host_xsave_state(vcpu);
8267 ++
8268 ++ vmx->nested.nested_run_pending = 0;
8269 ++ vmx->idt_vectoring_info = 0;
8270 ++
8271 ++ vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
8272 ++ if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
8273 ++ kvm_machine_check();
8274 ++
8275 ++ if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
8276 ++ return;
8277 ++
8278 ++ vmx->loaded_vmcs->launched = 1;
8279 ++ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
8280 ++
8281 ++ vmx_recover_nmi_blocking(vmx);
8282 ++ vmx_complete_interrupts(vmx);
8283 ++}
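/*
 * Standalone sketch (illustrative only, not kernel code) of the
 * "write the VMCS field only when the host value changed" pattern used for
 * HOST_RSP, HOST_CR3 and HOST_CR4 in vmx_vcpu_run() above; VMWRITE is
 * comparatively expensive, so the last written value is cached per loaded
 * VMCS. vmwrite_field() is an assumed stand-in for the real accessor.
 */
#include <stdint.h>
#include <stdbool.h>

struct cached_field {
        uint64_t last;
        bool valid;
};

void write_if_changed(struct cached_field *c, uint64_t val,
                      void (*vmwrite_field)(uint64_t))
{
        if (c->valid && c->last == val)
                return;                 /* unchanged since a previous VM entry */
        vmwrite_field(val);
        c->last = val;
        c->valid = true;
}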
8284 ++
8285 ++static struct kvm *vmx_vm_alloc(void)
8286 ++{
8287 ++ struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
8288 ++ GFP_KERNEL_ACCOUNT | __GFP_ZERO,
8289 ++ PAGE_KERNEL);
8290 ++ return &kvm_vmx->kvm;
8291 ++}
8292 ++
8293 ++static void vmx_vm_free(struct kvm *kvm)
8294 ++{
8295 ++ kfree(kvm->arch.hyperv.hv_pa_pg);
8296 ++ vfree(to_kvm_vmx(kvm));
8297 ++}
8298 ++
8299 ++static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
8300 ++{
8301 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
8302 ++
8303 ++ if (enable_pml)
8304 ++ vmx_destroy_pml_buffer(vmx);
8305 ++ free_vpid(vmx->vpid);
8306 ++ nested_vmx_free_vcpu(vcpu);
8307 ++ free_loaded_vmcs(vmx->loaded_vmcs);
8308 ++ kvm_vcpu_uninit(vcpu);
8309 ++ kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
8310 ++ kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
8311 ++ kmem_cache_free(kvm_vcpu_cache, vmx);
8312 ++}
8313 ++
8314 ++static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8315 ++{
8316 ++ int err;
8317 ++ struct vcpu_vmx *vmx;
8318 ++ unsigned long *msr_bitmap;
8319 ++ int i, cpu;
8320 ++
8321 ++ BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0,
8322 ++ "struct kvm_vcpu must be at offset 0 for arch usercopy region");
8323 ++
8324 ++ vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
8325 ++ if (!vmx)
8326 ++ return ERR_PTR(-ENOMEM);
8327 ++
8328 ++ vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
8329 ++ GFP_KERNEL_ACCOUNT);
8330 ++ if (!vmx->vcpu.arch.user_fpu) {
8331 ++ printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
8332 ++ err = -ENOMEM;
8333 ++ goto free_partial_vcpu;
8334 ++ }
8335 ++
8336 ++ vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
8337 ++ GFP_KERNEL_ACCOUNT);
8338 ++ if (!vmx->vcpu.arch.guest_fpu) {
8339 ++ printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
8340 ++ err = -ENOMEM;
8341 ++ goto free_user_fpu;
8342 ++ }
8343 ++
8344 ++ vmx->vpid = allocate_vpid();
8345 ++
8346 ++ err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
8347 ++ if (err)
8348 ++ goto free_vcpu;
8349 ++
8350 ++ err = -ENOMEM;
8351 ++
8352 ++ /*
8353 ++ * If PML is turned on, failure on enabling PML just results in failure
8354 ++ * of creating the vcpu, therefore we can simplify PML logic (by
8355 ++ * avoiding dealing with cases, such as enabling PML partially on vcpus
8356 ++ * for the guest), etc.
8357 ++ */
8358 ++ if (enable_pml) {
8359 ++ vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
8360 ++ if (!vmx->pml_pg)
8361 ++ goto uninit_vcpu;
8362 ++ }
8363 ++
8364 ++ BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS);
8365 ++
8366 ++ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
8367 ++ u32 index = vmx_msr_index[i];
8368 ++ u32 data_low, data_high;
8369 ++ int j = vmx->nmsrs;
8370 ++
8371 ++ if (rdmsr_safe(index, &data_low, &data_high) < 0)
8372 ++ continue;
8373 ++ if (wrmsr_safe(index, data_low, data_high) < 0)
8374 ++ continue;
8375 ++
8376 ++ vmx->guest_msrs[j].index = i;
8377 ++ vmx->guest_msrs[j].data = 0;
8378 ++ switch (index) {
8379 ++ case MSR_IA32_TSX_CTRL:
8380 ++ /*
8381 ++ * No need to pass TSX_CTRL_CPUID_CLEAR through, so
8382 ++ * let's avoid changing CPUID bits under the host
8383 ++ * kernel's feet.
8384 ++ */
8385 ++ vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
8386 ++ break;
8387 ++ default:
8388 ++ vmx->guest_msrs[j].mask = -1ull;
8389 ++ break;
8390 ++ }
8391 ++ ++vmx->nmsrs;
8392 ++ }
8393 ++
8394 ++ err = alloc_loaded_vmcs(&vmx->vmcs01);
8395 ++ if (err < 0)
8396 ++ goto free_pml;
8397 ++
8398 ++ msr_bitmap = vmx->vmcs01.msr_bitmap;
8399 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
8400 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
8401 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
8402 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
8403 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
8404 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
8405 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
8406 ++ if (kvm_cstate_in_guest(kvm)) {
8407 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
8408 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
8409 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
8410 ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
8411 ++ }
8412 ++ vmx->msr_bitmap_mode = 0;
8413 ++
8414 ++ vmx->loaded_vmcs = &vmx->vmcs01;
8415 ++ cpu = get_cpu();
8416 ++ vmx_vcpu_load(&vmx->vcpu, cpu);
8417 ++ vmx->vcpu.cpu = cpu;
8418 ++ init_vmcs(vmx);
8419 ++ vmx_vcpu_put(&vmx->vcpu);
8420 ++ put_cpu();
8421 ++ if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
8422 ++ err = alloc_apic_access_page(kvm);
8423 ++ if (err)
8424 ++ goto free_vmcs;
8425 ++ }
8426 ++
8427 ++ if (enable_ept && !enable_unrestricted_guest) {
8428 ++ err = init_rmode_identity_map(kvm);
8429 ++ if (err)
8430 ++ goto free_vmcs;
8431 ++ }
8432 ++
8433 ++ if (nested)
8434 ++ nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
8435 ++ vmx_capability.ept,
8436 ++ kvm_vcpu_apicv_active(&vmx->vcpu));
8437 ++ else
8438 ++ memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
8439 ++
8440 ++ vmx->nested.posted_intr_nv = -1;
8441 ++ vmx->nested.current_vmptr = -1ull;
8442 ++
8443 ++ vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
8444 ++
8445 ++ /*
8446 ++ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
8447 ++ * or POSTED_INTR_WAKEUP_VECTOR.
8448 ++ */
8449 ++ vmx->pi_desc.nv = POSTED_INTR_VECTOR;
8450 ++ vmx->pi_desc.sn = 1;
8451 ++
8452 ++ vmx->ept_pointer = INVALID_PAGE;
8453 ++
8454 ++ return &vmx->vcpu;
8455 ++
8456 ++free_vmcs:
8457 ++ free_loaded_vmcs(vmx->loaded_vmcs);
8458 ++free_pml:
8459 ++ vmx_destroy_pml_buffer(vmx);
8460 ++uninit_vcpu:
8461 ++ kvm_vcpu_uninit(&vmx->vcpu);
8462 ++free_vcpu:
8463 ++ free_vpid(vmx->vpid);
8464 ++ kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
8465 ++free_user_fpu:
8466 ++ kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
8467 ++free_partial_vcpu:
8468 ++ kmem_cache_free(kvm_vcpu_cache, vmx);
8469 ++ return ERR_PTR(err);
8470 ++}
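/*
 * Standalone sketch (illustrative only) of the goto-based error unwinding
 * used by the vCPU creation path above: each successful allocation gains a
 * matching cleanup label, so a later failure releases resources in reverse
 * order and returns a single error. The three malloc() calls are
 * placeholders for the FPU state, VPID and VMCS allocations.
 */
#include <stdlib.h>

struct demo_ctx {
        void *a, *b, *c;
};

struct demo_ctx *demo_ctx_create(void)
{
        struct demo_ctx *ctx = calloc(1, sizeof(*ctx));

        if (!ctx)
                return NULL;
        ctx->a = malloc(64);
        if (!ctx->a)
                goto free_ctx;
        ctx->b = malloc(64);
        if (!ctx->b)
                goto free_a;
        ctx->c = malloc(64);
        if (!ctx->c)
                goto free_b;
        return ctx;

free_b:
        free(ctx->b);
free_a:
        free(ctx->a);
free_ctx:
        free(ctx);
        return NULL;
}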
8471 ++
8472 ++#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
8473 ++#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
8474 ++
8475 ++static int vmx_vm_init(struct kvm *kvm)
8476 ++{
8477 ++ spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
8478 ++
8479 ++ if (!ple_gap)
8480 ++ kvm->arch.pause_in_guest = true;
8481 ++
8482 ++ if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
8483 ++ switch (l1tf_mitigation) {
8484 ++ case L1TF_MITIGATION_OFF:
8485 ++ case L1TF_MITIGATION_FLUSH_NOWARN:
8486 ++ /* 'I explicitly don't care' is set */
8487 ++ break;
8488 ++ case L1TF_MITIGATION_FLUSH:
8489 ++ case L1TF_MITIGATION_FLUSH_NOSMT:
8490 ++ case L1TF_MITIGATION_FULL:
8491 ++ /*
8492 ++ * Warn upon starting the first VM in a potentially
8493 ++ * insecure environment.
8494 ++ */
8495 ++ if (sched_smt_active())
8496 ++ pr_warn_once(L1TF_MSG_SMT);
8497 ++ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
8498 ++ pr_warn_once(L1TF_MSG_L1D);
8499 ++ break;
8500 ++ case L1TF_MITIGATION_FULL_FORCE:
8501 ++ /* Flush is enforced */
8502 ++ break;
8503 ++ }
8504 ++ }
8505 ++ return 0;
8506 ++}
8507 ++
8508 ++static int __init vmx_check_processor_compat(void)
8509 ++{
8510 ++ struct vmcs_config vmcs_conf;
8511 ++ struct vmx_capability vmx_cap;
8512 ++
8513 ++ if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
8514 ++ return -EIO;
8515 ++ if (nested)
8516 ++ nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
8517 ++ enable_apicv);
8518 ++ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
8519 ++ printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
8520 ++ smp_processor_id());
8521 ++ return -EIO;
8522 ++ }
8523 ++ return 0;
8524 ++}
8525 ++
8526 ++static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
8527 ++{
8528 ++ u8 cache;
8529 ++ u64 ipat = 0;
8530 ++
8531 ++ /* For VT-d and EPT combination
8532 ++ * 1. MMIO: always map as UC
8533 ++ * 2. EPT with VT-d:
8534 ++ * a. VT-d without snooping control feature: can't guarantee the
8535 ++ * result, try to trust guest.
8536 ++ * b. VT-d with snooping control feature: snooping control feature of
8537 ++ * VT-d engine can guarantee the cache correctness. Just set it
8538 ++ * to WB to keep consistent with host. So the same as item 3.
8539 ++ * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
8540 ++ * consistent with host MTRR
8541 ++ */
8542 ++ if (is_mmio) {
8543 ++ cache = MTRR_TYPE_UNCACHABLE;
8544 ++ goto exit;
8545 ++ }
8546 ++
8547 ++ if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
8548 ++ ipat = VMX_EPT_IPAT_BIT;
8549 ++ cache = MTRR_TYPE_WRBACK;
8550 ++ goto exit;
8551 ++ }
8552 ++
8553 ++ if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
8554 ++ ipat = VMX_EPT_IPAT_BIT;
8555 ++ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
8556 ++ cache = MTRR_TYPE_WRBACK;
8557 ++ else
8558 ++ cache = MTRR_TYPE_UNCACHABLE;
8559 ++ goto exit;
8560 ++ }
8561 ++
8562 ++ cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
8563 ++
8564 ++exit:
8565 ++ return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
8566 ++}
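/*
 * Standalone sketch (illustrative only) of the EPT attribute encoding
 * returned by vmx_get_mt_mask() above: the memory type occupies bits 5:3 of
 * an EPT leaf entry and "ignore PAT" is bit 6, mirroring
 * VMX_EPT_MT_EPTE_SHIFT and VMX_EPT_IPAT_BIT. The third branch is
 * simplified; the real code consults the guest MTRRs there.
 */
#include <stdint.h>
#include <stdbool.h>

#define EPT_MT_SHIFT    3
#define EPT_IPAT        (1ULL << 6)
#define MT_UC           0ULL    /* uncacheable */
#define MT_WB           6ULL    /* write-back  */

uint64_t ept_mem_attr(bool is_mmio, bool noncoherent_dma)
{
        if (is_mmio)
                return MT_UC << EPT_MT_SHIFT;              /* UC, guest PAT still honoured */
        if (!noncoherent_dma)
                return (MT_WB << EPT_MT_SHIFT) | EPT_IPAT; /* WB, ignore guest PAT */
        return MT_WB << EPT_MT_SHIFT;   /* simplified stand-in for the MTRR lookup */
}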
8567 ++
8568 ++static int vmx_get_lpage_level(void)
8569 ++{
8570 ++ if (enable_ept && !cpu_has_vmx_ept_1g_page())
8571 ++ return PT_DIRECTORY_LEVEL;
8572 ++ else
8573 ++ /* For shadow and EPT supported 1GB page */
8574 ++ return PT_PDPE_LEVEL;
8575 ++}
8576 ++
8577 ++static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
8578 ++{
8579 ++ /*
8580 ++ * These bits in the secondary execution controls field
8581 ++ * are dynamic, the others are mostly based on the hypervisor
8582 ++ * architecture and the guest's CPUID. Do not touch the
8583 ++ * dynamic bits.
8584 ++ */
8585 ++ u32 mask =
8586 ++ SECONDARY_EXEC_SHADOW_VMCS |
8587 ++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
8588 ++ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
8589 ++ SECONDARY_EXEC_DESC;
8590 ++
8591 ++ u32 new_ctl = vmx->secondary_exec_control;
8592 ++ u32 cur_ctl = secondary_exec_controls_get(vmx);
8593 ++
8594 ++ secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
8595 ++}
8596 ++
8597 ++/*
8598 ++ * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
8599 ++ * (indicating "allowed-1") if they are supported in the guest's CPUID.
8600 ++ */
8601 ++static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
8602 ++{
8603 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
8604 ++ struct kvm_cpuid_entry2 *entry;
8605 ++
8606 ++ vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
8607 ++ vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
8608 ++
8609 ++#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
8610 ++ if (entry && (entry->_reg & (_cpuid_mask))) \
8611 ++ vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
8612 ++} while (0)
8613 ++
8614 ++ entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
8615 ++ cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
8616 ++ cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
8617 ++ cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
8618 ++ cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
8619 ++ cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
8620 ++ cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
8621 ++ cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
8622 ++ cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
8623 ++ cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
8624 ++ cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
8625 ++ cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
8626 ++ cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
8627 ++ cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
8628 ++ cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
8629 ++
8630 ++ entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
8631 ++ cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
8632 ++ cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
8633 ++ cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
8634 ++ cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
8635 ++ cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
8636 ++ cr4_fixed1_update(X86_CR4_LA57, ecx, bit(X86_FEATURE_LA57));
8637 ++
8638 ++#undef cr4_fixed1_update
8639 ++}
8640 ++
8641 ++static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
8642 ++{
8643 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
8644 ++
8645 ++ if (kvm_mpx_supported()) {
8646 ++ bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
8647 ++
8648 ++ if (mpx_enabled) {
8649 ++ vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
8650 ++ vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
8651 ++ } else {
8652 ++ vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
8653 ++ vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
8654 ++ }
8655 ++ }
8656 ++}
8657 ++
8658 ++static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
8659 ++{
8660 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
8661 ++ struct kvm_cpuid_entry2 *best = NULL;
8662 ++ int i;
8663 ++
8664 ++ for (i = 0; i < PT_CPUID_LEAVES; i++) {
8665 ++ best = kvm_find_cpuid_entry(vcpu, 0x14, i);
8666 ++ if (!best)
8667 ++ return;
8668 ++ vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
8669 ++ vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
8670 ++ vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
8671 ++ vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
8672 ++ }
8673 ++
8674 ++ /* Get the number of configurable Address Ranges for filtering */
8675 ++ vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
8676 ++ PT_CAP_num_address_ranges);
8677 ++
8678 ++ /* Initialize and clear the no dependency bits */
8679 ++ vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
8680 ++ RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
8681 ++
8682 ++ /*
8683 ++ * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
8684 ++ * will inject an #GP
8685 ++ */
8686 ++ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
8687 ++ vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
8688 ++
8689 ++ /*
8690 ++ * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
8691 ++ * PSBFreq can be set
8692 ++ */
8693 ++ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
8694 ++ vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
8695 ++ RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
8696 ++
8697 ++ /*
8698 ++ * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
8699 ++ * MTCFreq can be set
8700 ++ */
8701 ++ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
8702 ++ vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
8703 ++ RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
8704 ++
8705 ++ /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
8706 ++ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
8707 ++ vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
8708 ++ RTIT_CTL_PTW_EN);
8709 ++
8710 ++ /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
8711 ++ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
8712 ++ vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
8713 ++
8714 ++ /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
8715 ++ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
8716 ++ vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
8717 ++
8718 ++ /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
8719 ++ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
8720 ++ vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
8721 ++
8722 ++ /* unmask address range configure area */
8723 ++ for (i = 0; i < vmx->pt_desc.addr_range; i++)
8724 ++ vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
8725 ++}
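/*
 * Standalone sketch (illustrative only) of the final loop in
 * update_intel_pt_cfg() above: each supported Intel PT address range owns a
 * 4-bit ADDRn_CFG field starting at bit 32 + 4*n of RTIT_CTL, so one nibble
 * is removed from the reserved-bit mask per range.
 */
#include <stdint.h>

uint64_t rtit_ctl_reserved_mask(uint64_t mask, unsigned int num_addr_ranges)
{
        unsigned int i;

        for (i = 0; i < num_addr_ranges; i++)
                mask &= ~(0xfULL << (32 + i * 4));
        return mask;
}

/* Example: with 2 ranges, bits 39:32 become writable and drop out of the mask. */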
8726 ++
8727 ++static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
8728 ++{
8729 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
8730 ++
8731 ++ /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
8732 ++ vcpu->arch.xsaves_enabled = false;
8733 ++
8734 ++ if (cpu_has_secondary_exec_ctrls()) {
8735 ++ vmx_compute_secondary_exec_control(vmx);
8736 ++ vmcs_set_secondary_exec_control(vmx);
8737 ++ }
8738 ++
8739 ++ if (nested_vmx_allowed(vcpu))
8740 ++ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
8741 ++ FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX |
8742 ++ FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
8743 ++ else
8744 ++ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
8745 ++ ~(FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX |
8746 ++ FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX);
8747 ++
8748 ++ if (nested_vmx_allowed(vcpu)) {
8749 ++ nested_vmx_cr_fixed1_bits_update(vcpu);
8750 ++ nested_vmx_entry_exit_ctls_update(vcpu);
8751 ++ }
8752 ++
8753 ++ if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
8754 ++ guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
8755 ++ update_intel_pt_cfg(vcpu);
8756 ++
8757 ++ if (boot_cpu_has(X86_FEATURE_RTM)) {
8758 ++ struct shared_msr_entry *msr;
8759 ++ msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
8760 ++ if (msr) {
8761 ++ bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
8762 ++ vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
8763 ++ }
8764 ++ }
8765 ++}
8766 ++
8767 ++static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
8768 ++{
8769 ++ if (func == 1 && nested)
8770 ++ entry->ecx |= bit(X86_FEATURE_VMX);
8771 ++}
8772 ++
8773 ++static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
8774 ++{
8775 ++ to_vmx(vcpu)->req_immediate_exit = true;
8776 ++}
8777 ++
8778 ++static int vmx_check_intercept(struct kvm_vcpu *vcpu,
8779 ++ struct x86_instruction_info *info,
8780 ++ enum x86_intercept_stage stage)
8781 ++{
8782 ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8783 ++ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
8784 ++
8785 ++ /*
8786 ++ * RDPID causes #UD if disabled through secondary execution controls.
8787 ++ * Because it is marked as EmulateOnUD, we need to intercept it here.
8788 ++ */
8789 ++ if (info->intercept == x86_intercept_rdtscp &&
8790 ++ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
8791 ++ ctxt->exception.vector = UD_VECTOR;
8792 ++ ctxt->exception.error_code_valid = false;
8793 ++ return X86EMUL_PROPAGATE_FAULT;
8794 ++ }
8795 ++
8796 ++ /* TODO: check more intercepts... */
8797 ++ return X86EMUL_CONTINUE;
8798 ++}
8799 ++
8800 ++#ifdef CONFIG_X86_64
8801 ++/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
8802 ++static inline int u64_shl_div_u64(u64 a, unsigned int shift,
8803 ++ u64 divisor, u64 *result)
8804 ++{
8805 ++ u64 low = a << shift, high = a >> (64 - shift);
8806 ++
8807 ++ /* To avoid the overflow on divq */
8808 ++ if (high >= divisor)
8809 ++ return 1;
8810 ++
8811 ++ /* Low holds the result, high holds the remainder, which is discarded */
8812 ++ asm("divq %2\n\t" : "=a" (low), "=d" (high) :
8813 ++ "rm" (divisor), "0" (low), "1" (high));
8814 ++ *result = low;
8815 ++
8816 ++ return 0;
8817 ++}
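/*
 * Portable standalone sketch (illustrative only) of u64_shl_div_u64() above,
 * using the compiler's 128-bit integer extension instead of inline divq.
 * It returns 1 on overflow, matching the "high >= divisor" guard before the
 * hardware division.
 */
#include <stdint.h>

int shl_div_u64_sketch(uint64_t a, unsigned int shift, uint64_t divisor,
                       uint64_t *result)
{
        unsigned __int128 q;

        if (!divisor)
                return 1;
        q = ((unsigned __int128)a << shift) / divisor;
        if (q >> 64)
                return 1;       /* quotient does not fit in 64 bits */
        *result = (uint64_t)q;
        return 0;
}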
8818 ++
8819 ++static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
8820 ++ bool *expired)
8821 ++{
8822 ++ struct vcpu_vmx *vmx;
8823 ++ u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
8824 ++ struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
8825 ++
8826 ++ if (kvm_mwait_in_guest(vcpu->kvm) ||
8827 ++ kvm_can_post_timer_interrupt(vcpu))
8828 ++ return -EOPNOTSUPP;
8829 ++
8830 ++ vmx = to_vmx(vcpu);
8831 ++ tscl = rdtsc();
8832 ++ guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
8833 ++ delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
8834 ++ lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
8835 ++ ktimer->timer_advance_ns);
8836 ++
8837 ++ if (delta_tsc > lapic_timer_advance_cycles)
8838 ++ delta_tsc -= lapic_timer_advance_cycles;
8839 ++ else
8840 ++ delta_tsc = 0;
8841 ++
8842 ++ /* Convert to host delta tsc if tsc scaling is enabled */
8843 ++ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
8844 ++ delta_tsc && u64_shl_div_u64(delta_tsc,
8845 ++ kvm_tsc_scaling_ratio_frac_bits,
8846 ++ vcpu->arch.tsc_scaling_ratio, &delta_tsc))
8847 ++ return -ERANGE;
8848 ++
8849 ++ /*
8850 ++ * If the delta tsc can't fit in 32 bits after the multi shift,
8851 ++ * we can't use the preemption timer.
8852 ++ * It's possible that it fits on later vmentries, but checking
8853 ++ * on every vmentry is costly so we just use an hrtimer.
8854 ++ */
8855 ++ if (delta_tsc >> (cpu_preemption_timer_multi + 32))
8856 ++ return -ERANGE;
8857 ++
8858 ++ vmx->hv_deadline_tsc = tscl + delta_tsc;
8859 ++ *expired = !delta_tsc;
8860 ++ return 0;
8861 ++}
8862 ++
8863 ++static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8864 ++{
8865 ++ to_vmx(vcpu)->hv_deadline_tsc = -1;
8866 ++}
8867 ++#endif
8868 ++
8869 ++static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
8870 ++{
8871 ++ if (!kvm_pause_in_guest(vcpu->kvm))
8872 ++ shrink_ple_window(vcpu);
8873 ++}
8874 ++
8875 ++static void vmx_slot_enable_log_dirty(struct kvm *kvm,
8876 ++ struct kvm_memory_slot *slot)
8877 ++{
8878 ++ kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
8879 ++ kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
8880 ++}
8881 ++
8882 ++static void vmx_slot_disable_log_dirty(struct kvm *kvm,
8883 ++ struct kvm_memory_slot *slot)
8884 ++{
8885 ++ kvm_mmu_slot_set_dirty(kvm, slot);
8886 ++}
8887 ++
8888 ++static void vmx_flush_log_dirty(struct kvm *kvm)
8889 ++{
8890 ++ kvm_flush_pml_buffers(kvm);
8891 ++}
8892 ++
8893 ++static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
8894 ++{
8895 ++ struct vmcs12 *vmcs12;
8896 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
8897 ++ gpa_t gpa, dst;
8898 ++
8899 ++ if (is_guest_mode(vcpu)) {
8900 ++ WARN_ON_ONCE(vmx->nested.pml_full);
8901 ++
8902 ++ /*
8903 ++ * Check if PML is enabled for the nested guest.
8904 ++ * Whether eptp bit 6 is set is already checked
8905 ++ * as part of A/D emulation.
8906 ++ */
8907 ++ vmcs12 = get_vmcs12(vcpu);
8908 ++ if (!nested_cpu_has_pml(vmcs12))
8909 ++ return 0;
8910 ++
8911 ++ if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
8912 ++ vmx->nested.pml_full = true;
8913 ++ return 1;
8914 ++ }
8915 ++
8916 ++ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
8917 ++ dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
8918 ++
8919 ++ if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
8920 ++ offset_in_page(dst), sizeof(gpa)))
8921 ++ return 0;
8922 ++
8923 ++ vmcs12->guest_pml_index--;
8924 ++ }
8925 ++
8926 ++ return 0;
8927 ++}
8928 ++
8929 ++static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
8930 ++ struct kvm_memory_slot *memslot,
8931 ++ gfn_t offset, unsigned long mask)
8932 ++{
8933 ++ kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
8934 ++}
8935 ++
8936 ++static void __pi_post_block(struct kvm_vcpu *vcpu)
8937 ++{
8938 ++ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
8939 ++ struct pi_desc old, new;
8940 ++ unsigned int dest;
8941 ++
8942 ++ do {
8943 ++ old.control = new.control = pi_desc->control;
8944 ++ WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
8945 ++ "Wakeup handler not enabled while the VCPU is blocked\n");
8946 ++
8947 ++ dest = cpu_physical_id(vcpu->cpu);
8948 ++
8949 ++ if (x2apic_enabled())
8950 ++ new.ndst = dest;
8951 ++ else
8952 ++ new.ndst = (dest << 8) & 0xFF00;
8953 ++
8954 ++ /* set 'NV' to 'notification vector' */
8955 ++ new.nv = POSTED_INTR_VECTOR;
8956 ++ } while (cmpxchg64(&pi_desc->control, old.control,
8957 ++ new.control) != old.control);
8958 ++
8959 ++ if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
8960 ++ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
8961 ++ list_del(&vcpu->blocked_vcpu_list);
8962 ++ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
8963 ++ vcpu->pre_pcpu = -1;
8964 ++ }
8965 ++}
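/*
 * Standalone sketch (illustrative only, using C11 atomics) of the cmpxchg
 * retry loop applied to the posted-interrupt descriptor above: the control
 * word is rebuilt from a snapshot and only installed if nothing (e.g. an
 * interrupt being posted) changed it in the meantime; otherwise the loop
 * retries with the freshly observed value. The field layout here is an
 * assumption for the sketch.
 */
#include <stdatomic.h>
#include <stdint.h>

void set_notification_vector(_Atomic uint64_t *control, uint64_t nv_mask,
                             uint64_t nv_new)
{
        uint64_t old = atomic_load(control);
        uint64_t val;

        do {
                val = (old & ~nv_mask) | nv_new;
                /* on failure, 'old' is reloaded with the current value */
        } while (!atomic_compare_exchange_weak(control, &old, val));
}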
8966 ++
8967 ++/*
8968 ++ * This routine does the following things for vCPU which is going
8969 ++ * to be blocked if VT-d PI is enabled.
8970 ++ * - Store the vCPU to the wakeup list, so when interrupts happen
8971 ++ * we can find the right vCPU to wake up.
8972 ++ * - Change the Posted-interrupt descriptor as below:
8973 ++ * 'NDST' <-- vcpu->pre_pcpu
8974 ++ * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
8975 ++ * - If 'ON' is set during this process, which means at least one
8976 ++ * interrupt is posted for this vCPU, we cannot block it, in
8977 ++ * this case, return 1, otherwise, return 0.
8978 ++ *
8979 ++ */
8980 ++static int pi_pre_block(struct kvm_vcpu *vcpu)
8981 ++{
8982 ++ unsigned int dest;
8983 ++ struct pi_desc old, new;
8984 ++ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
8985 ++
8986 ++ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
8987 ++ !irq_remapping_cap(IRQ_POSTING_CAP) ||
8988 ++ !kvm_vcpu_apicv_active(vcpu))
8989 ++ return 0;
8990 ++
8991 ++ WARN_ON(irqs_disabled());
8992 ++ local_irq_disable();
8993 ++ if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
8994 ++ vcpu->pre_pcpu = vcpu->cpu;
8995 ++ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
8996 ++ list_add_tail(&vcpu->blocked_vcpu_list,
8997 ++ &per_cpu(blocked_vcpu_on_cpu,
8998 ++ vcpu->pre_pcpu));
8999 ++ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
9000 ++ }
9001 ++
9002 ++ do {
9003 ++ old.control = new.control = pi_desc->control;
9004 ++
9005 ++ WARN((pi_desc->sn == 1),
9006 ++ "Warning: SN field of posted-interrupts "
9007 ++ "is set before blocking\n");
9008 ++
9009 ++ /*
9010 ++ * Since vCPU can be preempted during this process,
9011 ++ * vcpu->cpu could be different from pre_pcpu, so we
9012 ++ * need to set pre_pcpu as the destination of wakeup
9013 ++ * notification event, then we can find the right vCPU
9014 ++ * to wakeup in wakeup handler if interrupts happen
9015 ++ * when the vCPU is in blocked state.
9016 ++ */
9017 ++ dest = cpu_physical_id(vcpu->pre_pcpu);
9018 ++
9019 ++ if (x2apic_enabled())
9020 ++ new.ndst = dest;
9021 ++ else
9022 ++ new.ndst = (dest << 8) & 0xFF00;
9023 ++
9024 ++ /* set 'NV' to 'wakeup vector' */
9025 ++ new.nv = POSTED_INTR_WAKEUP_VECTOR;
9026 ++ } while (cmpxchg64(&pi_desc->control, old.control,
9027 ++ new.control) != old.control);
9028 ++
9029 ++ /* We should not block the vCPU if an interrupt is posted for it. */
9030 ++ if (pi_test_on(pi_desc) == 1)
9031 ++ __pi_post_block(vcpu);
9032 ++
9033 ++ local_irq_enable();
9034 ++ return (vcpu->pre_pcpu == -1);
9035 ++}
9036 ++
9037 ++static int vmx_pre_block(struct kvm_vcpu *vcpu)
9038 ++{
9039 ++ if (pi_pre_block(vcpu))
9040 ++ return 1;
9041 ++
9042 ++ if (kvm_lapic_hv_timer_in_use(vcpu))
9043 ++ kvm_lapic_switch_to_sw_timer(vcpu);
9044 ++
9045 ++ return 0;
9046 ++}
9047 ++
9048 ++static void pi_post_block(struct kvm_vcpu *vcpu)
9049 ++{
9050 ++ if (vcpu->pre_pcpu == -1)
9051 ++ return;
9052 ++
9053 ++ WARN_ON(irqs_disabled());
9054 ++ local_irq_disable();
9055 ++ __pi_post_block(vcpu);
9056 ++ local_irq_enable();
9057 ++}
9058 ++
9059 ++static void vmx_post_block(struct kvm_vcpu *vcpu)
9060 ++{
9061 ++ if (kvm_x86_ops->set_hv_timer)
9062 ++ kvm_lapic_switch_to_hv_timer(vcpu);
9063 ++
9064 ++ pi_post_block(vcpu);
9065 ++}
9066 ++
9067 ++/*
9068 ++ * vmx_update_pi_irte - set IRTE for Posted-Interrupts
9069 ++ *
9070 ++ * @kvm: kvm
9071 ++ * @host_irq: host irq of the interrupt
9072 ++ * @guest_irq: gsi of the interrupt
9073 ++ * @set: set or unset PI
9074 ++ * returns 0 on success, < 0 on failure
9075 ++ */
9076 ++static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
9077 ++ uint32_t guest_irq, bool set)
9078 ++{
9079 ++ struct kvm_kernel_irq_routing_entry *e;
9080 ++ struct kvm_irq_routing_table *irq_rt;
9081 ++ struct kvm_lapic_irq irq;
9082 ++ struct kvm_vcpu *vcpu;
9083 ++ struct vcpu_data vcpu_info;
9084 ++ int idx, ret = 0;
9085 ++
9086 ++ if (!kvm_arch_has_assigned_device(kvm) ||
9087 ++ !irq_remapping_cap(IRQ_POSTING_CAP) ||
9088 ++ !kvm_vcpu_apicv_active(kvm->vcpus[0]))
9089 ++ return 0;
9090 ++
9091 ++ idx = srcu_read_lock(&kvm->irq_srcu);
9092 ++ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
9093 ++ if (guest_irq >= irq_rt->nr_rt_entries ||
9094 ++ hlist_empty(&irq_rt->map[guest_irq])) {
9095 ++ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
9096 ++ guest_irq, irq_rt->nr_rt_entries);
9097 ++ goto out;
9098 ++ }
9099 ++
9100 ++ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
9101 ++ if (e->type != KVM_IRQ_ROUTING_MSI)
9102 ++ continue;
9103 ++ /*
9104 ++ * VT-d PI cannot support posting multicast/broadcast
9105 ++ * interrupts to a vCPU, we still use interrupt remapping
9106 ++ * for these kinds of interrupts.
9107 ++ *
9108 ++ * For lowest-priority interrupts, we only support
9109 ++ * those with single CPU as the destination, e.g. user
9110 ++ * configures the interrupts via /proc/irq or uses
9111 ++ * irqbalance to make the interrupts single-CPU.
9112 ++ *
9113 ++ * We will support full lowest-priority interrupt later.
9114 ++ *
9115 ++ * In addition, we can only inject generic interrupts using
9116 ++ * the PI mechanism, refuse to route others through it.
9117 ++ */
9118 ++
9119 ++ kvm_set_msi_irq(kvm, e, &irq);
9120 ++ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
9121 ++ !kvm_irq_is_postable(&irq)) {
9122 ++ /*
9123 ++ * Make sure the IRTE is in remapped mode if
9124 ++ * we don't handle it in posted mode.
9125 ++ */
9126 ++ ret = irq_set_vcpu_affinity(host_irq, NULL);
9127 ++ if (ret < 0) {
9128 ++ printk(KERN_INFO
9129 ++ "failed to back to remapped mode, irq: %u\n",
9130 ++ host_irq);
9131 ++ goto out;
9132 ++ }
9133 ++
9134 ++ continue;
9135 ++ }
9136 ++
9137 ++ vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
9138 ++ vcpu_info.vector = irq.vector;
9139 ++
9140 ++ trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
9141 ++ vcpu_info.vector, vcpu_info.pi_desc_addr, set);
9142 ++
9143 ++ if (set)
9144 ++ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
9145 ++ else
9146 ++ ret = irq_set_vcpu_affinity(host_irq, NULL);
9147 ++
9148 ++ if (ret < 0) {
9149 ++ printk(KERN_INFO "%s: failed to update PI IRTE\n",
9150 ++ __func__);
9151 ++ goto out;
9152 ++ }
9153 ++ }
9154 ++
9155 ++ ret = 0;
9156 ++out:
9157 ++ srcu_read_unlock(&kvm->irq_srcu, idx);
9158 ++ return ret;
9159 ++}
9160 ++
9161 ++static void vmx_setup_mce(struct kvm_vcpu *vcpu)
9162 ++{
9163 ++ if (vcpu->arch.mcg_cap & MCG_LMCE_P)
9164 ++ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
9165 ++ FEATURE_CONTROL_LMCE;
9166 ++ else
9167 ++ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
9168 ++ ~FEATURE_CONTROL_LMCE;
9169 ++}
9170 ++
9171 ++static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
9172 ++{
9173 ++ /* we need a nested vmexit to enter SMM, postpone if run is pending */
9174 ++ if (to_vmx(vcpu)->nested.nested_run_pending)
9175 ++ return 0;
9176 ++ return 1;
9177 ++}
9178 ++
9179 ++static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
9180 ++{
9181 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
9182 ++
9183 ++ vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
9184 ++ if (vmx->nested.smm.guest_mode)
9185 ++ nested_vmx_vmexit(vcpu, -1, 0, 0);
9186 ++
9187 ++ vmx->nested.smm.vmxon = vmx->nested.vmxon;
9188 ++ vmx->nested.vmxon = false;
9189 ++ vmx_clear_hlt(vcpu);
9190 ++ return 0;
9191 ++}
9192 ++
9193 ++static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
9194 ++{
9195 ++ struct vcpu_vmx *vmx = to_vmx(vcpu);
9196 ++ int ret;
9197 ++
9198 ++ if (vmx->nested.smm.vmxon) {
9199 ++ vmx->nested.vmxon = true;
9200 ++ vmx->nested.smm.vmxon = false;
9201 ++ }
9202 ++
9203 ++ if (vmx->nested.smm.guest_mode) {
9204 ++ ret = nested_vmx_enter_non_root_mode(vcpu, false);
9205 ++ if (ret)
9206 ++ return ret;
9207 ++
9208 ++ vmx->nested.smm.guest_mode = false;
9209 ++ }
9210 ++ return 0;
9211 ++}
9212 ++
9213 ++static int enable_smi_window(struct kvm_vcpu *vcpu)
9214 ++{
9215 ++ return 0;
9216 ++}
9217 ++
9218 ++static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
9219 ++{
9220 ++ return false;
9221 ++}
9222 ++
9223 ++static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
9224 ++{
9225 ++ return to_vmx(vcpu)->nested.vmxon;
9226 ++}
9227 ++
9228 ++static __init int hardware_setup(void)
9229 ++{
9230 ++ unsigned long host_bndcfgs;
9231 ++ struct desc_ptr dt;
9232 ++ int r, i;
9233 ++
9234 ++ rdmsrl_safe(MSR_EFER, &host_efer);
9235 ++
9236 ++ store_idt(&dt);
9237 ++ host_idt_base = dt.address;
9238 ++
9239 ++ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
9240 ++ kvm_define_shared_msr(i, vmx_msr_index[i]);
9241 ++
9242 ++ if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
9243 ++ return -EIO;
9244 ++
9245 ++ if (boot_cpu_has(X86_FEATURE_NX))
9246 ++ kvm_enable_efer_bits(EFER_NX);
9247 ++
9248 ++ if (boot_cpu_has(X86_FEATURE_MPX)) {
9249 ++ rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
9250 ++ WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
9251 ++ }
9252 ++
9253 ++ if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
9254 ++ !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
9255 ++ enable_vpid = 0;
9256 ++
9257 ++ if (!cpu_has_vmx_ept() ||
9258 ++ !cpu_has_vmx_ept_4levels() ||
9259 ++ !cpu_has_vmx_ept_mt_wb() ||
9260 ++ !cpu_has_vmx_invept_global())
9261 ++ enable_ept = 0;
9262 ++
9263 ++ if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
9264 ++ enable_ept_ad_bits = 0;
9265 ++
9266 ++ if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
9267 ++ enable_unrestricted_guest = 0;
9268 ++
9269 ++ if (!cpu_has_vmx_flexpriority())
9270 ++ flexpriority_enabled = 0;
9271 ++
9272 ++ if (!cpu_has_virtual_nmis())
9273 ++ enable_vnmi = 0;
9274 ++
9275 ++ /*
9276 ++ * set_apic_access_page_addr() is used to reload apic access
9277 ++ * page upon invalidation. No need to do anything if not
9278 ++ * using the APIC_ACCESS_ADDR VMCS field.
9279 ++ */
9280 ++ if (!flexpriority_enabled)
9281 ++ kvm_x86_ops->set_apic_access_page_addr = NULL;
9282 ++
9283 ++ if (!cpu_has_vmx_tpr_shadow())
9284 ++ kvm_x86_ops->update_cr8_intercept = NULL;
9285 ++
9286 ++ if (enable_ept && !cpu_has_vmx_ept_2m_page())
9287 ++ kvm_disable_largepages();
9288 ++
9289 ++#if IS_ENABLED(CONFIG_HYPERV)
9290 ++ if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
9291 ++ && enable_ept) {
9292 ++ kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb;
9293 ++ kvm_x86_ops->tlb_remote_flush_with_range =
9294 ++ hv_remote_flush_tlb_with_range;
9295 ++ }
9296 ++#endif
9297 ++
9298 ++ if (!cpu_has_vmx_ple()) {
9299 ++ ple_gap = 0;
9300 ++ ple_window = 0;
9301 ++ ple_window_grow = 0;
9302 ++ ple_window_max = 0;
9303 ++ ple_window_shrink = 0;
9304 ++ }
9305 ++
9306 ++ if (!cpu_has_vmx_apicv()) {
9307 ++ enable_apicv = 0;
9308 ++ kvm_x86_ops->sync_pir_to_irr = NULL;
9309 ++ }
9310 ++
9311 ++ if (cpu_has_vmx_tsc_scaling()) {
9312 ++ kvm_has_tsc_control = true;
9313 ++ kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
9314 ++ kvm_tsc_scaling_ratio_frac_bits = 48;
9315 ++ }
9316 ++
9317 ++ set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
9318 ++
9319 ++ if (enable_ept)
9320 ++ vmx_enable_tdp();
9321 ++ else
9322 ++ kvm_disable_tdp();
9323 ++
9324 ++ /*
9325 ++ * Only enable PML when hardware supports PML feature, and both EPT
9326 ++ * and EPT A/D bit features are enabled -- PML depends on them to work.
9327 ++ */
9328 ++ if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
9329 ++ enable_pml = 0;
9330 ++
9331 ++ if (!enable_pml) {
9332 ++ kvm_x86_ops->slot_enable_log_dirty = NULL;
9333 ++ kvm_x86_ops->slot_disable_log_dirty = NULL;
9334 ++ kvm_x86_ops->flush_log_dirty = NULL;
9335 ++ kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
9336 ++ }
9337 ++
9338 ++ if (!cpu_has_vmx_preemption_timer())
9339 ++ enable_preemption_timer = false;
9340 ++
9341 ++ if (enable_preemption_timer) {
9342 ++ u64 use_timer_freq = 5000ULL * 1000 * 1000;
9343 ++ u64 vmx_msr;
9344 ++
9345 ++ rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
9346 ++ cpu_preemption_timer_multi =
9347 ++ vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
9348 ++
9349 ++ if (tsc_khz)
9350 ++ use_timer_freq = (u64)tsc_khz * 1000;
9351 ++ use_timer_freq >>= cpu_preemption_timer_multi;
9352 ++
9353 ++ /*
9354 ++ * KVM "disables" the preemption timer by setting it to its max
9355 ++ * value. Don't use the timer if it might cause spurious exits
9356 ++ * at a rate faster than 0.1 Hz (of uninterrupted guest time).
9357 ++ */
9358 ++ if (use_timer_freq > 0xffffffffu / 10)
9359 ++ enable_preemption_timer = false;
9360 ++ }
9361 ++
9362 ++ if (!enable_preemption_timer) {
9363 ++ kvm_x86_ops->set_hv_timer = NULL;
9364 ++ kvm_x86_ops->cancel_hv_timer = NULL;
9365 ++ kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
9366 ++ }
9367 ++
9368 ++ kvm_set_posted_intr_wakeup_handler(wakeup_handler);
9369 ++
9370 ++ kvm_mce_cap_supported |= MCG_LMCE_P;
9371 ++
9372 ++ if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
9373 ++ return -EINVAL;
9374 ++ if (!enable_ept || !cpu_has_vmx_intel_pt())
9375 ++ pt_mode = PT_MODE_SYSTEM;
9376 ++
9377 ++ if (nested) {
9378 ++ nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
9379 ++ vmx_capability.ept, enable_apicv);
9380 ++
9381 ++ r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
9382 ++ if (r)
9383 ++ return r;
9384 ++ }
9385 ++
9386 ++ r = alloc_kvm_area();
9387 ++ if (r)
9388 ++ nested_vmx_hardware_unsetup();
9389 ++ return r;
9390 ++}
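/*
 * Standalone sketch (illustrative only) of the preemption-timer sanity check
 * in hardware_setup() above: the timer ticks at TSC frequency >> rate_shift,
 * so a fully loaded 32-bit counter expires after 0xffffffff / timer_freq
 * seconds. The timer is only kept if that is at least ten seconds, i.e. the
 * "disabled" setting cannot cause spurious exits faster than 0.1 Hz.
 */
#include <stdint.h>
#include <stdbool.h>

bool preemption_timer_usable(uint64_t tsc_khz, unsigned int rate_shift)
{
        uint64_t timer_freq = (tsc_khz * 1000) >> rate_shift;  /* ticks per second */

        return timer_freq <= 0xffffffffULL / 10;
}

/* Example: a 3 GHz TSC with rate_shift 5 gives ~93.75 MHz, well under the ~429 MHz limit. */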
9391 ++
9392 ++static __exit void hardware_unsetup(void)
9393 ++{
9394 ++ if (nested)
9395 ++ nested_vmx_hardware_unsetup();
9396 ++
9397 ++ free_kvm_area();
9398 ++}
9399 ++
9400 ++static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
9401 ++ .cpu_has_kvm_support = cpu_has_kvm_support,
9402 ++ .disabled_by_bios = vmx_disabled_by_bios,
9403 ++ .hardware_setup = hardware_setup,
9404 ++ .hardware_unsetup = hardware_unsetup,
9405 ++ .check_processor_compatibility = vmx_check_processor_compat,
9406 ++ .hardware_enable = hardware_enable,
9407 ++ .hardware_disable = hardware_disable,
9408 ++ .cpu_has_accelerated_tpr = report_flexpriority,
9409 ++ .has_emulated_msr = vmx_has_emulated_msr,
9410 ++
9411 ++ .vm_init = vmx_vm_init,
9412 ++ .vm_alloc = vmx_vm_alloc,
9413 ++ .vm_free = vmx_vm_free,
9414 ++
9415 ++ .vcpu_create = vmx_create_vcpu,
9416 ++ .vcpu_free = vmx_free_vcpu,
9417 ++ .vcpu_reset = vmx_vcpu_reset,
9418 ++
9419 ++ .prepare_guest_switch = vmx_prepare_switch_to_guest,
9420 ++ .vcpu_load = vmx_vcpu_load,
9421 ++ .vcpu_put = vmx_vcpu_put,
9422 ++
9423 ++ .update_bp_intercept = update_exception_bitmap,
9424 ++ .get_msr_feature = vmx_get_msr_feature,
9425 ++ .get_msr = vmx_get_msr,
9426 ++ .set_msr = vmx_set_msr,
9427 ++ .get_segment_base = vmx_get_segment_base,
9428 ++ .get_segment = vmx_get_segment,
9429 ++ .set_segment = vmx_set_segment,
9430 ++ .get_cpl = vmx_get_cpl,
9431 ++ .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
9432 ++ .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
9433 ++ .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
9434 ++ .set_cr0 = vmx_set_cr0,
9435 ++ .set_cr3 = vmx_set_cr3,
9436 ++ .set_cr4 = vmx_set_cr4,
9437 ++ .set_efer = vmx_set_efer,
9438 ++ .get_idt = vmx_get_idt,
9439 ++ .set_idt = vmx_set_idt,
9440 ++ .get_gdt = vmx_get_gdt,
9441 ++ .set_gdt = vmx_set_gdt,
9442 ++ .get_dr6 = vmx_get_dr6,
9443 ++ .set_dr6 = vmx_set_dr6,
9444 ++ .set_dr7 = vmx_set_dr7,
9445 ++ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
9446 ++ .cache_reg = vmx_cache_reg,
9447 ++ .get_rflags = vmx_get_rflags,
9448 ++ .set_rflags = vmx_set_rflags,
9449 ++
9450 ++ .tlb_flush = vmx_flush_tlb,
9451 ++ .tlb_flush_gva = vmx_flush_tlb_gva,
9452 ++
9453 ++ .run = vmx_vcpu_run,
9454 ++ .handle_exit = vmx_handle_exit,
9455 ++ .skip_emulated_instruction = skip_emulated_instruction,
9456 ++ .set_interrupt_shadow = vmx_set_interrupt_shadow,
9457 ++ .get_interrupt_shadow = vmx_get_interrupt_shadow,
9458 ++ .patch_hypercall = vmx_patch_hypercall,
9459 ++ .set_irq = vmx_inject_irq,
9460 ++ .set_nmi = vmx_inject_nmi,
9461 ++ .queue_exception = vmx_queue_exception,
9462 ++ .cancel_injection = vmx_cancel_injection,
9463 ++ .interrupt_allowed = vmx_interrupt_allowed,
9464 ++ .nmi_allowed = vmx_nmi_allowed,
9465 ++ .get_nmi_mask = vmx_get_nmi_mask,
9466 ++ .set_nmi_mask = vmx_set_nmi_mask,
9467 ++ .enable_nmi_window = enable_nmi_window,
9468 ++ .enable_irq_window = enable_irq_window,
9469 ++ .update_cr8_intercept = update_cr8_intercept,
9470 ++ .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
9471 ++ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
9472 ++ .get_enable_apicv = vmx_get_enable_apicv,
9473 ++ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
9474 ++ .load_eoi_exitmap = vmx_load_eoi_exitmap,
9475 ++ .apicv_post_state_restore = vmx_apicv_post_state_restore,
9476 ++ .hwapic_irr_update = vmx_hwapic_irr_update,
9477 ++ .hwapic_isr_update = vmx_hwapic_isr_update,
9478 ++ .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
9479 ++ .sync_pir_to_irr = vmx_sync_pir_to_irr,
9480 ++ .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
9481 ++ .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
9482 ++
9483 ++ .set_tss_addr = vmx_set_tss_addr,
9484 ++ .set_identity_map_addr = vmx_set_identity_map_addr,
9485 ++ .get_tdp_level = get_ept_level,
9486 ++ .get_mt_mask = vmx_get_mt_mask,
9487 ++
9488 ++ .get_exit_info = vmx_get_exit_info,
9489 ++
9490 ++ .get_lpage_level = vmx_get_lpage_level,
9491 ++
9492 ++ .cpuid_update = vmx_cpuid_update,
9493 ++
9494 ++ .rdtscp_supported = vmx_rdtscp_supported,
9495 ++ .invpcid_supported = vmx_invpcid_supported,
9496 ++
9497 ++ .set_supported_cpuid = vmx_set_supported_cpuid,
9498 ++
9499 ++ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
9500 ++
9501 ++ .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
9502 ++ .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
9503 ++
9504 ++ .set_tdp_cr3 = vmx_set_cr3,
9505 ++
9506 ++ .check_intercept = vmx_check_intercept,
9507 ++ .handle_exit_irqoff = vmx_handle_exit_irqoff,
9508 ++ .mpx_supported = vmx_mpx_supported,
9509 ++ .xsaves_supported = vmx_xsaves_supported,
9510 ++ .umip_emulated = vmx_umip_emulated,
9511 ++ .pt_supported = vmx_pt_supported,
9512 ++
9513 ++ .request_immediate_exit = vmx_request_immediate_exit,
9514 ++
9515 ++ .sched_in = vmx_sched_in,
9516 ++
9517 ++ .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
9518 ++ .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
9519 ++ .flush_log_dirty = vmx_flush_log_dirty,
9520 ++ .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
9521 ++ .write_log_dirty = vmx_write_pml_buffer,
9522 ++
9523 ++ .pre_block = vmx_pre_block,
9524 ++ .post_block = vmx_post_block,
9525 ++
9526 ++ .pmu_ops = &intel_pmu_ops,
9527 ++
9528 ++ .update_pi_irte = vmx_update_pi_irte,
9529 ++
9530 ++#ifdef CONFIG_X86_64
9531 ++ .set_hv_timer = vmx_set_hv_timer,
9532 ++ .cancel_hv_timer = vmx_cancel_hv_timer,
9533 ++#endif
9534 ++
9535 ++ .setup_mce = vmx_setup_mce,
9536 ++
9537 ++ .smi_allowed = vmx_smi_allowed,
9538 ++ .pre_enter_smm = vmx_pre_enter_smm,
9539 ++ .pre_leave_smm = vmx_pre_leave_smm,
9540 ++ .enable_smi_window = enable_smi_window,
9541 ++
9542 ++ .check_nested_events = NULL,
9543 ++ .get_nested_state = NULL,
9544 ++ .set_nested_state = NULL,
9545 ++ .get_vmcs12_pages = NULL,
9546 ++ .nested_enable_evmcs = NULL,
9547 ++ .nested_get_evmcs_version = NULL,
9548 ++ .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
9549 ++ .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
9550 ++};
9551 ++
9552 ++static void vmx_cleanup_l1d_flush(void)
9553 ++{
9554 ++ if (vmx_l1d_flush_pages) {
9555 ++ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
9556 ++ vmx_l1d_flush_pages = NULL;
9557 ++ }
9558 ++ /* Restore state so sysfs ignores VMX */
9559 ++ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
9560 ++}
9561 ++
9562 ++static void vmx_exit(void)
9563 ++{
9564 ++#ifdef CONFIG_KEXEC_CORE
9565 ++ RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
9566 ++ synchronize_rcu();
9567 ++#endif
9568 ++
9569 ++ kvm_exit();
9570 ++
9571 ++#if IS_ENABLED(CONFIG_HYPERV)
9572 ++ if (static_branch_unlikely(&enable_evmcs)) {
9573 ++ int cpu;
9574 ++ struct hv_vp_assist_page *vp_ap;
9575 ++ /*
9576 ++ * Reset everything to support using non-enlightened VMCS
9577 ++ * access later (e.g. when we reload the module with
9578 ++ * enlightened_vmcs=0)
9579 ++ */
9580 ++ for_each_online_cpu(cpu) {
9581 ++ vp_ap = hv_get_vp_assist_page(cpu);
9582 ++
9583 ++ if (!vp_ap)
9584 ++ continue;
9585 ++
9586 ++ vp_ap->nested_control.features.directhypercall = 0;
9587 ++ vp_ap->current_nested_vmcs = 0;
9588 ++ vp_ap->enlighten_vmentry = 0;
9589 ++ }
9590 ++
9591 ++ static_branch_disable(&enable_evmcs);
9592 ++ }
9593 ++#endif
9594 ++ vmx_cleanup_l1d_flush();
9595 ++}
9596 ++module_exit(vmx_exit);
9597 ++
9598 ++static int __init vmx_init(void)
9599 ++{
9600 ++ int r;
9601 ++
9602 ++#if IS_ENABLED(CONFIG_HYPERV)
9603 ++ /*
9604 ++ * Enlightened VMCS usage should be recommended and the host needs
9605 ++ * to support eVMCS v1 or above. We can also disable eVMCS support
9606 ++ * with a module parameter.
9607 ++ */
9608 ++ if (enlightened_vmcs &&
9609 ++ ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
9610 ++ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
9611 ++ KVM_EVMCS_VERSION) {
9612 ++ int cpu;
9613 ++
9614 ++ /* Check that we have assist pages on all online CPUs */
9615 ++ for_each_online_cpu(cpu) {
9616 ++ if (!hv_get_vp_assist_page(cpu)) {
9617 ++ enlightened_vmcs = false;
9618 ++ break;
9619 ++ }
9620 ++ }
9621 ++
9622 ++ if (enlightened_vmcs) {
9623 ++ pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
9624 ++ static_branch_enable(&enable_evmcs);
9625 ++ }
9626 ++
9627 ++ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
9628 ++ vmx_x86_ops.enable_direct_tlbflush
9629 ++ = hv_enable_direct_tlbflush;
9630 ++
9631 ++ } else {
9632 ++ enlightened_vmcs = false;
9633 ++ }
9634 ++#endif
9635 ++
9636 ++ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
9637 ++ __alignof__(struct vcpu_vmx), THIS_MODULE);
9638 ++ if (r)
9639 ++ return r;
9640 ++
9641 ++ /*
9642 ++ * Must be called after kvm_init() so enable_ept is properly set
9643 ++ * up. Hand the parameter mitigation value in which was stored in
9644 ++ * the pre module init parser. If no parameter was given, it will
9645 ++ * contain 'auto' which will be turned into the default 'cond'
9646 ++ * mitigation mode.
9647 ++ */
9648 ++ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
9649 ++ if (r) {
9650 ++ vmx_exit();
9651 ++ return r;
9652 ++ }
9653 ++
9654 ++#ifdef CONFIG_KEXEC_CORE
9655 ++ rcu_assign_pointer(crash_vmclear_loaded_vmcss,
9656 ++ crash_vmclear_local_loaded_vmcss);
9657 ++#endif
9658 ++ vmx_check_vmcs12_offsets();
9659 ++
9660 ++ return 0;
9661 ++}
9662 ++module_init(vmx_init);
9663 +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
9664 +index 353f63f3b262..ade694f94a49 100644
9665 +--- a/arch/x86/kvm/x86.c
9666 ++++ b/arch/x86/kvm/x86.c
9667 +@@ -92,6 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
9668 + static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
9669 + #endif
9670 +
9671 ++static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
9672 ++
9673 + #define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
9674 + #define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
9675 +
9676 +@@ -793,9 +795,38 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
9677 + }
9678 + EXPORT_SYMBOL_GPL(kvm_set_xcr);
9679 +
9680 ++static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
9681 ++{
9682 ++ u64 reserved_bits = CR4_RESERVED_BITS;
9683 ++
9684 ++ if (!cpu_has(c, X86_FEATURE_XSAVE))
9685 ++ reserved_bits |= X86_CR4_OSXSAVE;
9686 ++
9687 ++ if (!cpu_has(c, X86_FEATURE_SMEP))
9688 ++ reserved_bits |= X86_CR4_SMEP;
9689 ++
9690 ++ if (!cpu_has(c, X86_FEATURE_SMAP))
9691 ++ reserved_bits |= X86_CR4_SMAP;
9692 ++
9693 ++ if (!cpu_has(c, X86_FEATURE_FSGSBASE))
9694 ++ reserved_bits |= X86_CR4_FSGSBASE;
9695 ++
9696 ++ if (!cpu_has(c, X86_FEATURE_PKU))
9697 ++ reserved_bits |= X86_CR4_PKE;
9698 ++
9699 ++ if (!cpu_has(c, X86_FEATURE_LA57) &&
9700 ++ !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
9701 ++ reserved_bits |= X86_CR4_LA57;
9702 ++
9703 ++ if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
9704 ++ reserved_bits |= X86_CR4_UMIP;
9705 ++
9706 ++ return reserved_bits;
9707 ++}
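/*
 * Standalone sketch (illustrative only) of the reserved-bit computation in
 * kvm_host_cr4_reserved_bits() above: every CR4 bit whose backing host CPU
 * feature is missing joins a mask, and kvm_valid_cr4() then rejects any
 * requested CR4 value that intersects it. Bit positions (OSXSAVE=18,
 * SMEP=20, SMAP=21) are architectural; the feature booleans are placeholders.
 */
#include <stdint.h>
#include <stdbool.h>

#define CR4_OSXSAVE     (1ULL << 18)
#define CR4_SMEP        (1ULL << 20)
#define CR4_SMAP        (1ULL << 21)

uint64_t cr4_reserved_mask(bool has_xsave, bool has_smep, bool has_smap)
{
        uint64_t reserved = 0;

        if (!has_xsave)
                reserved |= CR4_OSXSAVE;
        if (!has_smep)
                reserved |= CR4_SMEP;
        if (!has_smap)
                reserved |= CR4_SMAP;
        return reserved;
}

bool cr4_value_valid(uint64_t cr4, uint64_t reserved)
{
        return (cr4 & reserved) == 0;
}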
9708 ++
9709 + static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
9710 + {
9711 +- if (cr4 & CR4_RESERVED_BITS)
9712 ++ if (cr4 & cr4_reserved_bits)
9713 + return -EINVAL;
9714 +
9715 + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
9716 +@@ -961,9 +992,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
9717 +
9718 + static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
9719 + {
9720 ++ size_t size = ARRAY_SIZE(vcpu->arch.db);
9721 ++
9722 + switch (dr) {
9723 + case 0 ... 3:
9724 +- vcpu->arch.db[dr] = val;
9725 ++ vcpu->arch.db[array_index_nospec(dr, size)] = val;
9726 + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
9727 + vcpu->arch.eff_db[dr] = val;
9728 + break;
9729 +@@ -1000,9 +1033,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr);
9730 +
9731 + int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
9732 + {
9733 ++ size_t size = ARRAY_SIZE(vcpu->arch.db);
9734 ++
9735 + switch (dr) {
9736 + case 0 ... 3:
9737 +- *val = vcpu->arch.db[dr];
9738 ++ *val = vcpu->arch.db[array_index_nospec(dr, size)];
9739 + break;
9740 + case 4:
9741 + /* fall through */
9742 +@@ -2269,7 +2304,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
9743 + default:
9744 + if (msr >= MSR_IA32_MC0_CTL &&
9745 + msr < MSR_IA32_MCx_CTL(bank_num)) {
9746 +- u32 offset = msr - MSR_IA32_MC0_CTL;
9747 ++ u32 offset = array_index_nospec(
9748 ++ msr - MSR_IA32_MC0_CTL,
9749 ++ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
9750 ++
9751 + /* only 0 or all 1s can be written to IA32_MCi_CTL
9752 + * some Linux kernels though clear bit 10 in bank 4 to
9753 + * workaround a BIOS/GART TBL issue on AMD K8s, ignore
9754 +@@ -2681,7 +2719,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
9755 + default:
9756 + if (msr >= MSR_IA32_MC0_CTL &&
9757 + msr < MSR_IA32_MCx_CTL(bank_num)) {
9758 +- u32 offset = msr - MSR_IA32_MC0_CTL;
9759 ++ u32 offset = array_index_nospec(
9760 ++ msr - MSR_IA32_MC0_CTL,
9761 ++ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
9762 ++
9763 + data = vcpu->arch.mce_banks[offset];
9764 + break;
9765 + }
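/*
 * Standalone sketch (illustrative only) of the speculation-safe indexing
 * introduced above: an index derived from a guest-controlled MSR number is
 * clamped with a branchless mask so that even a mispredicted bounds check
 * cannot steer a speculative out-of-bounds load. This mirrors the intent of
 * array_index_nospec(), not its exact implementation, and assumes the
 * compiler performs an arithmetic right shift on signed values.
 */
#include <stddef.h>
#include <stdint.h>

size_t index_nospec_sketch(size_t index, size_t size)
{
        /* all-ones when index < size, zero otherwise */
        size_t mask = (size_t)((int64_t)(index - size) >> 63);

        return index & mask;
}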
9766 +@@ -3234,6 +3275,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
9767 + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
9768 + return;
9769 +
9770 ++ if (vcpu->arch.st.steal.preempted)
9771 ++ return;
9772 ++
9773 + vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
9774 +
9775 + kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
9776 +@@ -5977,11 +6021,11 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
9777 + return r;
9778 + }
9779 +
9780 +-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
9781 ++static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
9782 + bool write_fault_to_shadow_pgtable,
9783 + int emulation_type)
9784 + {
9785 +- gpa_t gpa = cr2;
9786 ++ gpa_t gpa = cr2_or_gpa;
9787 + kvm_pfn_t pfn;
9788 +
9789 + if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
9790 +@@ -5995,7 +6039,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
9791 + * Write permission should be allowed since only
9792 + * write access need to be emulated.
9793 + */
9794 +- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
9795 ++ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
9796 +
9797 + /*
9798 + * If the mapping is invalid in guest, let cpu retry
9799 +@@ -6052,10 +6096,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
9800 + }
9801 +
9802 + static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
9803 +- unsigned long cr2, int emulation_type)
9804 ++ gpa_t cr2_or_gpa, int emulation_type)
9805 + {
9806 + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
9807 +- unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
9808 ++ unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
9809 +
9810 + last_retry_eip = vcpu->arch.last_retry_eip;
9811 + last_retry_addr = vcpu->arch.last_retry_addr;
9812 +@@ -6084,14 +6128,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
9813 + if (x86_page_table_writing_insn(ctxt))
9814 + return false;
9815 +
9816 +- if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
9817 ++ if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
9818 + return false;
9819 +
9820 + vcpu->arch.last_retry_eip = ctxt->eip;
9821 +- vcpu->arch.last_retry_addr = cr2;
9822 ++ vcpu->arch.last_retry_addr = cr2_or_gpa;
9823 +
9824 + if (!vcpu->arch.mmu.direct_map)
9825 +- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
9826 ++ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
9827 +
9828 + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
9829 +
9830 +@@ -6252,11 +6296,8 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
9831 + return false;
9832 + }
9833 +
9834 +-int x86_emulate_instruction(struct kvm_vcpu *vcpu,
9835 +- unsigned long cr2,
9836 +- int emulation_type,
9837 +- void *insn,
9838 +- int insn_len)
9839 ++int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
9840 ++ int emulation_type, void *insn, int insn_len)
9841 + {
9842 + int r;
9843 + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
9844 +@@ -6299,7 +6340,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
9845 + if (r != EMULATION_OK) {
9846 + if (emulation_type & EMULTYPE_TRAP_UD)
9847 + return EMULATE_FAIL;
9848 +- if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
9849 ++ if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
9850 + emulation_type))
9851 + return EMULATE_DONE;
9852 + if (ctxt->have_exception) {
9853 +@@ -6329,7 +6370,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
9854 + return EMULATE_DONE;
9855 + }
9856 +
9857 +- if (retry_instruction(ctxt, cr2, emulation_type))
9858 ++ if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
9859 + return EMULATE_DONE;
9860 +
9861 + /* this is needed for vmware backdoor interface to work since it
9862 +@@ -6341,7 +6382,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
9863 +
9864 + restart:
9865 + /* Save the faulting GPA (cr2) in the address field */
9866 +- ctxt->exception.address = cr2;
9867 ++ ctxt->exception.address = cr2_or_gpa;
9868 +
9869 + r = x86_emulate_insn(ctxt);
9870 +
9871 +@@ -6349,7 +6390,7 @@ restart:
9872 + return EMULATE_DONE;
9873 +
9874 + if (r == EMULATION_FAILED) {
9875 +- if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
9876 ++ if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
9877 + emulation_type))
9878 + return EMULATE_DONE;
9879 +
9880 +@@ -6753,7 +6794,7 @@ static void kvm_set_mmio_spte_mask(void)
9881 + * If reserved bit is not supported, clear the present bit to disable
9882 + * mmio page fault.
9883 + */
9884 +- if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52)
9885 ++ if (maxphyaddr == 52)
9886 + mask &= ~1ull;
9887 +
9888 + kvm_mmu_set_mmio_spte_mask(mask, mask);
9889 +@@ -8225,6 +8266,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
9890 + struct kvm_mp_state *mp_state)
9891 + {
9892 + vcpu_load(vcpu);
9893 ++ if (kvm_mpx_supported())
9894 ++ kvm_load_guest_fpu(vcpu);
9895 +
9896 + kvm_apic_accept_events(vcpu);
9897 + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
9898 +@@ -8233,6 +8276,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
9899 + else
9900 + mp_state->mp_state = vcpu->arch.mp_state;
9901 +
9902 ++ if (kvm_mpx_supported())
9903 ++ kvm_put_guest_fpu(vcpu);
9904 + vcpu_put(vcpu);
9905 + return 0;
9906 + }
9907 +@@ -8654,7 +8699,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
9908 + kvm_mmu_unload(vcpu);
9909 + vcpu_put(vcpu);
9910 +
9911 +- kvm_x86_ops->vcpu_free(vcpu);
9912 ++ kvm_arch_vcpu_free(vcpu);
9913 + }
9914 +
9915 + void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
9916 +@@ -8847,6 +8892,8 @@ int kvm_arch_hardware_setup(void)
9917 + if (r != 0)
9918 + return r;
9919 +
9920 ++ cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
9921 ++
9922 + if (kvm_has_tsc_control) {
9923 + /*
9924 + * Make sure the user can only configure tsc_khz values that
9925 +@@ -9505,7 +9552,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
9926 + work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
9927 + return;
9928 +
9929 +- vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
9930 ++ vcpu->arch.mmu.page_fault(vcpu, work->cr2_or_gpa, 0, true);
9931 + }
9932 +
9933 + static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
9934 +@@ -9588,7 +9635,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
9935 + {
9936 + struct x86_exception fault;
9937 +
9938 +- trace_kvm_async_pf_not_present(work->arch.token, work->gva);
9939 ++ trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
9940 + kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
9941 +
9942 + if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
9943 +@@ -9616,7 +9663,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
9944 + work->arch.token = ~0; /* broadcast wakeup */
9945 + else
9946 + kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
9947 +- trace_kvm_async_pf_ready(work->arch.token, work->gva);
9948 ++ trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
9949 +
9950 + if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
9951 + !apf_get_user(vcpu, &val)) {
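[The debug-register and MCE-bank hunks above clamp an untrusted, guest-controlled index with array_index_nospec() before the dependent array access, closing a Spectre-v1 style speculation window. A minimal, self-contained sketch of the same pattern — hypothetical table and accessor, not taken from the patch:

    #include <linux/kernel.h>   /* ARRAY_SIZE() */
    #include <linux/nospec.h>   /* array_index_nospec() */
    #include <linux/types.h>

    static u64 example_table[4];

    /* Bounds-check, then clamp the index again for the speculative path. */
    static u64 example_read(unsigned int idx)
    {
            size_t size = ARRAY_SIZE(example_table);

            if (idx >= size)
                    return 0;
            /*
             * array_index_nospec() evaluates to idx when idx < size and to 0
             * otherwise, without a branch the CPU can mispredict, so the load
             * below cannot be steered out of bounds under speculation.
             */
            return example_table[array_index_nospec(idx, size)];
    }

The x86.c hunks apply exactly this shape to vcpu->arch.db[] and to the MCE bank offset.]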
9952 +diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
9953 +index 608e5f8c5d0a..422331b257d3 100644
9954 +--- a/arch/x86/kvm/x86.h
9955 ++++ b/arch/x86/kvm/x86.h
9956 +@@ -284,7 +284,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
9957 + bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
9958 + int page_num);
9959 + bool kvm_vector_hashing_enabled(void);
9960 +-int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
9961 ++int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
9962 + int emulation_type, void *insn, int insn_len);
9963 +
9964 + #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
9965 +diff --git a/crypto/algapi.c b/crypto/algapi.c
9966 +index c0755cf4f53f..346557a3fc0b 100644
9967 +--- a/crypto/algapi.c
9968 ++++ b/crypto/algapi.c
9969 +@@ -649,11 +649,9 @@ EXPORT_SYMBOL_GPL(crypto_grab_spawn);
9970 +
9971 + void crypto_drop_spawn(struct crypto_spawn *spawn)
9972 + {
9973 +- if (!spawn->alg)
9974 +- return;
9975 +-
9976 + down_write(&crypto_alg_sem);
9977 +- list_del(&spawn->list);
9978 ++ if (spawn->alg)
9979 ++ list_del(&spawn->list);
9980 + up_write(&crypto_alg_sem);
9981 + }
9982 + EXPORT_SYMBOL_GPL(crypto_drop_spawn);
9983 +@@ -661,22 +659,16 @@ EXPORT_SYMBOL_GPL(crypto_drop_spawn);
9984 + static struct crypto_alg *crypto_spawn_alg(struct crypto_spawn *spawn)
9985 + {
9986 + struct crypto_alg *alg;
9987 +- struct crypto_alg *alg2;
9988 +
9989 + down_read(&crypto_alg_sem);
9990 + alg = spawn->alg;
9991 +- alg2 = alg;
9992 +- if (alg2)
9993 +- alg2 = crypto_mod_get(alg2);
9994 +- up_read(&crypto_alg_sem);
9995 +-
9996 +- if (!alg2) {
9997 +- if (alg)
9998 +- crypto_shoot_alg(alg);
9999 +- return ERR_PTR(-EAGAIN);
10000 ++ if (alg && !crypto_mod_get(alg)) {
10001 ++ alg->cra_flags |= CRYPTO_ALG_DYING;
10002 ++ alg = NULL;
10003 + }
10004 ++ up_read(&crypto_alg_sem);
10005 +
10006 +- return alg;
10007 ++ return alg ?: ERR_PTR(-EAGAIN);
10008 + }
10009 +
10010 + struct crypto_tfm *crypto_spawn_tfm(struct crypto_spawn *spawn, u32 type,
10011 +diff --git a/crypto/api.c b/crypto/api.c
10012 +index 7aca9f86c5f3..1909195b2c70 100644
10013 +--- a/crypto/api.c
10014 ++++ b/crypto/api.c
10015 +@@ -349,13 +349,12 @@ static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask)
10016 + return len;
10017 + }
10018 +
10019 +-void crypto_shoot_alg(struct crypto_alg *alg)
10020 ++static void crypto_shoot_alg(struct crypto_alg *alg)
10021 + {
10022 + down_write(&crypto_alg_sem);
10023 + alg->cra_flags |= CRYPTO_ALG_DYING;
10024 + up_write(&crypto_alg_sem);
10025 + }
10026 +-EXPORT_SYMBOL_GPL(crypto_shoot_alg);
10027 +
10028 + struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
10029 + u32 mask)
10030 +diff --git a/crypto/internal.h b/crypto/internal.h
10031 +index 9a3f39939fba..f8d6efaffef9 100644
10032 +--- a/crypto/internal.h
10033 ++++ b/crypto/internal.h
10034 +@@ -79,7 +79,6 @@ void crypto_alg_tested(const char *name, int err);
10035 + void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list,
10036 + struct crypto_alg *nalg);
10037 + void crypto_remove_final(struct list_head *list);
10038 +-void crypto_shoot_alg(struct crypto_alg *alg);
10039 + struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
10040 + u32 mask);
10041 + void *crypto_create_tfm(struct crypto_alg *alg,
10042 +diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
10043 +index 1348541da463..85082574c515 100644
10044 +--- a/crypto/pcrypt.c
10045 ++++ b/crypto/pcrypt.c
10046 +@@ -130,7 +130,6 @@ static void pcrypt_aead_done(struct crypto_async_request *areq, int err)
10047 + struct padata_priv *padata = pcrypt_request_padata(preq);
10048 +
10049 + padata->info = err;
10050 +- req->base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
10051 +
10052 + padata_do_serial(padata);
10053 + }
10054 +diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
10055 +index cb97b6105f52..674a0e92b798 100644
10056 +--- a/drivers/acpi/battery.c
10057 ++++ b/drivers/acpi/battery.c
10058 +@@ -51,6 +51,8 @@
10059 + #define PREFIX "ACPI: "
10060 +
10061 + #define ACPI_BATTERY_VALUE_UNKNOWN 0xFFFFFFFF
10062 ++#define ACPI_BATTERY_CAPACITY_VALID(capacity) \
10063 ++ ((capacity) != 0 && (capacity) != ACPI_BATTERY_VALUE_UNKNOWN)
10064 +
10065 + #define ACPI_BATTERY_DEVICE_NAME "Battery"
10066 +
10067 +@@ -205,7 +207,8 @@ static int acpi_battery_is_charged(struct acpi_battery *battery)
10068 +
10069 + static bool acpi_battery_is_degraded(struct acpi_battery *battery)
10070 + {
10071 +- return battery->full_charge_capacity && battery->design_capacity &&
10072 ++ return ACPI_BATTERY_CAPACITY_VALID(battery->full_charge_capacity) &&
10073 ++ ACPI_BATTERY_CAPACITY_VALID(battery->design_capacity) &&
10074 + battery->full_charge_capacity < battery->design_capacity;
10075 + }
10076 +
10077 +@@ -227,7 +230,7 @@ static int acpi_battery_get_property(struct power_supply *psy,
10078 + enum power_supply_property psp,
10079 + union power_supply_propval *val)
10080 + {
10081 +- int ret = 0;
10082 ++ int full_capacity = ACPI_BATTERY_VALUE_UNKNOWN, ret = 0;
10083 + struct acpi_battery *battery = to_acpi_battery(psy);
10084 +
10085 + if (acpi_battery_present(battery)) {
10086 +@@ -276,14 +279,14 @@ static int acpi_battery_get_property(struct power_supply *psy,
10087 + break;
10088 + case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN:
10089 + case POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN:
10090 +- if (battery->design_capacity == ACPI_BATTERY_VALUE_UNKNOWN)
10091 ++ if (!ACPI_BATTERY_CAPACITY_VALID(battery->design_capacity))
10092 + ret = -ENODEV;
10093 + else
10094 + val->intval = battery->design_capacity * 1000;
10095 + break;
10096 + case POWER_SUPPLY_PROP_CHARGE_FULL:
10097 + case POWER_SUPPLY_PROP_ENERGY_FULL:
10098 +- if (battery->full_charge_capacity == ACPI_BATTERY_VALUE_UNKNOWN)
10099 ++ if (!ACPI_BATTERY_CAPACITY_VALID(battery->full_charge_capacity))
10100 + ret = -ENODEV;
10101 + else
10102 + val->intval = battery->full_charge_capacity * 1000;
10103 +@@ -296,11 +299,17 @@ static int acpi_battery_get_property(struct power_supply *psy,
10104 + val->intval = battery->capacity_now * 1000;
10105 + break;
10106 + case POWER_SUPPLY_PROP_CAPACITY:
10107 +- if (battery->capacity_now && battery->full_charge_capacity)
10108 +- val->intval = battery->capacity_now * 100/
10109 +- battery->full_charge_capacity;
10110 ++ if (ACPI_BATTERY_CAPACITY_VALID(battery->full_charge_capacity))
10111 ++ full_capacity = battery->full_charge_capacity;
10112 ++ else if (ACPI_BATTERY_CAPACITY_VALID(battery->design_capacity))
10113 ++ full_capacity = battery->design_capacity;
10114 ++
10115 ++ if (battery->capacity_now == ACPI_BATTERY_VALUE_UNKNOWN ||
10116 ++ full_capacity == ACPI_BATTERY_VALUE_UNKNOWN)
10117 ++ ret = -ENODEV;
10118 + else
10119 +- val->intval = 0;
10120 ++ val->intval = battery->capacity_now * 100/
10121 ++ full_capacity;
10122 + break;
10123 + case POWER_SUPPLY_PROP_CAPACITY_LEVEL:
10124 + if (battery->state & ACPI_BATTERY_STATE_CRITICAL)
10125 +@@ -346,6 +355,20 @@ static enum power_supply_property charge_battery_props[] = {
10126 + POWER_SUPPLY_PROP_SERIAL_NUMBER,
10127 + };
10128 +
10129 ++static enum power_supply_property charge_battery_full_cap_broken_props[] = {
10130 ++ POWER_SUPPLY_PROP_STATUS,
10131 ++ POWER_SUPPLY_PROP_PRESENT,
10132 ++ POWER_SUPPLY_PROP_TECHNOLOGY,
10133 ++ POWER_SUPPLY_PROP_CYCLE_COUNT,
10134 ++ POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN,
10135 ++ POWER_SUPPLY_PROP_VOLTAGE_NOW,
10136 ++ POWER_SUPPLY_PROP_CURRENT_NOW,
10137 ++ POWER_SUPPLY_PROP_CHARGE_NOW,
10138 ++ POWER_SUPPLY_PROP_MODEL_NAME,
10139 ++ POWER_SUPPLY_PROP_MANUFACTURER,
10140 ++ POWER_SUPPLY_PROP_SERIAL_NUMBER,
10141 ++};
10142 ++
10143 + static enum power_supply_property energy_battery_props[] = {
10144 + POWER_SUPPLY_PROP_STATUS,
10145 + POWER_SUPPLY_PROP_PRESENT,
10146 +@@ -807,20 +830,34 @@ static void __exit battery_hook_exit(void)
10147 + static int sysfs_add_battery(struct acpi_battery *battery)
10148 + {
10149 + struct power_supply_config psy_cfg = { .drv_data = battery, };
10150 ++ bool full_cap_broken = false;
10151 ++
10152 ++ if (!ACPI_BATTERY_CAPACITY_VALID(battery->full_charge_capacity) &&
10153 ++ !ACPI_BATTERY_CAPACITY_VALID(battery->design_capacity))
10154 ++ full_cap_broken = true;
10155 +
10156 + if (battery->power_unit == ACPI_BATTERY_POWER_UNIT_MA) {
10157 +- battery->bat_desc.properties = charge_battery_props;
10158 +- battery->bat_desc.num_properties =
10159 +- ARRAY_SIZE(charge_battery_props);
10160 +- } else if (battery->full_charge_capacity == 0) {
10161 +- battery->bat_desc.properties =
10162 +- energy_battery_full_cap_broken_props;
10163 +- battery->bat_desc.num_properties =
10164 +- ARRAY_SIZE(energy_battery_full_cap_broken_props);
10165 ++ if (full_cap_broken) {
10166 ++ battery->bat_desc.properties =
10167 ++ charge_battery_full_cap_broken_props;
10168 ++ battery->bat_desc.num_properties =
10169 ++ ARRAY_SIZE(charge_battery_full_cap_broken_props);
10170 ++ } else {
10171 ++ battery->bat_desc.properties = charge_battery_props;
10172 ++ battery->bat_desc.num_properties =
10173 ++ ARRAY_SIZE(charge_battery_props);
10174 ++ }
10175 + } else {
10176 +- battery->bat_desc.properties = energy_battery_props;
10177 +- battery->bat_desc.num_properties =
10178 +- ARRAY_SIZE(energy_battery_props);
10179 ++ if (full_cap_broken) {
10180 ++ battery->bat_desc.properties =
10181 ++ energy_battery_full_cap_broken_props;
10182 ++ battery->bat_desc.num_properties =
10183 ++ ARRAY_SIZE(energy_battery_full_cap_broken_props);
10184 ++ } else {
10185 ++ battery->bat_desc.properties = energy_battery_props;
10186 ++ battery->bat_desc.num_properties =
10187 ++ ARRAY_SIZE(energy_battery_props);
10188 ++ }
10189 + }
10190 +
10191 + battery->bat_desc.name = acpi_device_bid(battery->device);
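[The POWER_SUPPLY_PROP_CAPACITY hunk above now prefers full_charge_capacity, falls back to design_capacity, and reports -ENODEV when neither reference is usable. The arithmetic reduces to a small helper; a sketch using the same conventions, with illustrative names rather than the driver's:

    #include <linux/errno.h>
    #include <linux/types.h>

    #define EX_VALUE_UNKNOWN        0xFFFFFFFF
    #define EX_CAPACITY_VALID(c)    ((c) != 0 && (c) != EX_VALUE_UNKNOWN)

    /* Return a 0..100 percentage, or -ENODEV if no sane reference exists. */
    static int example_capacity_percent(u32 now, u32 full, u32 design)
    {
            u32 ref = EX_VALUE_UNKNOWN;

            if (EX_CAPACITY_VALID(full))
                    ref = full;             /* prefer the measured full charge */
            else if (EX_CAPACITY_VALID(design))
                    ref = design;           /* otherwise fall back to design capacity */

            if (now == EX_VALUE_UNKNOWN || ref == EX_VALUE_UNKNOWN)
                    return -ENODEV;

            return now * 100 / ref;
    }
]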
10192 +diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
10193 +index 43587ac680e4..214c4e2e8ade 100644
10194 +--- a/drivers/acpi/video_detect.c
10195 ++++ b/drivers/acpi/video_detect.c
10196 +@@ -328,6 +328,11 @@ static const struct dmi_system_id video_detect_dmi_table[] = {
10197 + DMI_MATCH(DMI_PRODUCT_NAME, "Precision 7510"),
10198 + },
10199 + },
10200 ++
10201 ++ /*
10202 ++ * Desktops which falsely report a backlight and which our heuristics
10203 ++ * for this do not catch.
10204 ++ */
10205 + {
10206 + .callback = video_detect_force_none,
10207 + .ident = "Dell OptiPlex 9020M",
10208 +@@ -336,6 +341,14 @@ static const struct dmi_system_id video_detect_dmi_table[] = {
10209 + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 9020M"),
10210 + },
10211 + },
10212 ++ {
10213 ++ .callback = video_detect_force_none,
10214 ++ .ident = "MSI MS-7721",
10215 ++ .matches = {
10216 ++ DMI_MATCH(DMI_SYS_VENDOR, "MSI"),
10217 ++ DMI_MATCH(DMI_PRODUCT_NAME, "MS-7721"),
10218 ++ },
10219 ++ },
10220 + { },
10221 + };
10222 +
10223 +diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
10224 +index 4abd7c6531d9..3b382a7e07b2 100644
10225 +--- a/drivers/base/power/main.c
10226 ++++ b/drivers/base/power/main.c
10227 +@@ -265,10 +265,38 @@ static void dpm_wait_for_suppliers(struct device *dev, bool async)
10228 + device_links_read_unlock(idx);
10229 + }
10230 +
10231 +-static void dpm_wait_for_superior(struct device *dev, bool async)
10232 ++static bool dpm_wait_for_superior(struct device *dev, bool async)
10233 + {
10234 +- dpm_wait(dev->parent, async);
10235 ++ struct device *parent;
10236 ++
10237 ++ /*
10238 ++ * If the device is resumed asynchronously and the parent's callback
10239 ++ * deletes both the device and the parent itself, the parent object may
10240 ++ * be freed while this function is running, so avoid that by reference
10241 ++ * counting the parent once more unless the device has been deleted
10242 ++ * already (in which case return right away).
10243 ++ */
10244 ++ mutex_lock(&dpm_list_mtx);
10245 ++
10246 ++ if (!device_pm_initialized(dev)) {
10247 ++ mutex_unlock(&dpm_list_mtx);
10248 ++ return false;
10249 ++ }
10250 ++
10251 ++ parent = get_device(dev->parent);
10252 ++
10253 ++ mutex_unlock(&dpm_list_mtx);
10254 ++
10255 ++ dpm_wait(parent, async);
10256 ++ put_device(parent);
10257 ++
10258 + dpm_wait_for_suppliers(dev, async);
10259 ++
10260 ++ /*
10261 ++ * If the parent's callback has deleted the device, attempting to resume
10262 ++ * it would be invalid, so avoid doing that then.
10263 ++ */
10264 ++ return device_pm_initialized(dev);
10265 + }
10266 +
10267 + static void dpm_wait_for_consumers(struct device *dev, bool async)
10268 +@@ -628,7 +656,8 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn
10269 + if (!dev->power.is_noirq_suspended)
10270 + goto Out;
10271 +
10272 +- dpm_wait_for_superior(dev, async);
10273 ++ if (!dpm_wait_for_superior(dev, async))
10274 ++ goto Out;
10275 +
10276 + skip_resume = dev_pm_may_skip_resume(dev);
10277 +
10278 +@@ -829,7 +858,8 @@ static int device_resume_early(struct device *dev, pm_message_t state, bool asyn
10279 + if (!dev->power.is_late_suspended)
10280 + goto Out;
10281 +
10282 +- dpm_wait_for_superior(dev, async);
10283 ++ if (!dpm_wait_for_superior(dev, async))
10284 ++ goto Out;
10285 +
10286 + callback = dpm_subsys_resume_early_cb(dev, state, &info);
10287 +
10288 +@@ -949,7 +979,9 @@ static int device_resume(struct device *dev, pm_message_t state, bool async)
10289 + goto Complete;
10290 + }
10291 +
10292 +- dpm_wait_for_superior(dev, async);
10293 ++ if (!dpm_wait_for_superior(dev, async))
10294 ++ goto Complete;
10295 ++
10296 + dpm_watchdog_set(&wd, dev);
10297 + device_lock(dev);
10298 +
10299 +diff --git a/drivers/clk/tegra/clk-tegra-periph.c b/drivers/clk/tegra/clk-tegra-periph.c
10300 +index 38c4eb28c8bf..b137c5d34eec 100644
10301 +--- a/drivers/clk/tegra/clk-tegra-periph.c
10302 ++++ b/drivers/clk/tegra/clk-tegra-periph.c
10303 +@@ -799,7 +799,11 @@ static struct tegra_periph_init_data gate_clks[] = {
10304 + GATE("ahbdma", "hclk", 33, 0, tegra_clk_ahbdma, 0),
10305 + GATE("apbdma", "pclk", 34, 0, tegra_clk_apbdma, 0),
10306 + GATE("kbc", "clk_32k", 36, TEGRA_PERIPH_ON_APB | TEGRA_PERIPH_NO_RESET, tegra_clk_kbc, 0),
10307 +- GATE("fuse", "clk_m", 39, TEGRA_PERIPH_ON_APB, tegra_clk_fuse, 0),
10308 ++ /*
10309 ++ * Critical for RAM re-repair operation, which must occur on resume
10310 ++ * from LP1 system suspend and as part of CCPLEX cluster switching.
10311 ++ */
10312 ++ GATE("fuse", "clk_m", 39, TEGRA_PERIPH_ON_APB, tegra_clk_fuse, CLK_IS_CRITICAL),
10313 + GATE("fuse_burn", "clk_m", 39, TEGRA_PERIPH_ON_APB, tegra_clk_fuse_burn, 0),
10314 + GATE("kfuse", "clk_m", 40, TEGRA_PERIPH_ON_APB, tegra_clk_kfuse, 0),
10315 + GATE("apbif", "clk_m", 107, TEGRA_PERIPH_ON_APB, tegra_clk_apbif, 0),
10316 +diff --git a/drivers/crypto/atmel-aes.c b/drivers/crypto/atmel-aes.c
10317 +index 53a78035381d..92060864e356 100644
10318 +--- a/drivers/crypto/atmel-aes.c
10319 ++++ b/drivers/crypto/atmel-aes.c
10320 +@@ -91,7 +91,6 @@
10321 + struct atmel_aes_caps {
10322 + bool has_dualbuff;
10323 + bool has_cfb64;
10324 +- bool has_ctr32;
10325 + bool has_gcm;
10326 + bool has_xts;
10327 + bool has_authenc;
10328 +@@ -1016,8 +1015,9 @@ static int atmel_aes_ctr_transfer(struct atmel_aes_dev *dd)
10329 + struct atmel_aes_ctr_ctx *ctx = atmel_aes_ctr_ctx_cast(dd->ctx);
10330 + struct ablkcipher_request *req = ablkcipher_request_cast(dd->areq);
10331 + struct scatterlist *src, *dst;
10332 +- u32 ctr, blocks;
10333 + size_t datalen;
10334 ++ u32 ctr;
10335 ++ u16 blocks, start, end;
10336 + bool use_dma, fragmented = false;
10337 +
10338 + /* Check for transfer completion. */
10339 +@@ -1029,27 +1029,17 @@ static int atmel_aes_ctr_transfer(struct atmel_aes_dev *dd)
10340 + datalen = req->nbytes - ctx->offset;
10341 + blocks = DIV_ROUND_UP(datalen, AES_BLOCK_SIZE);
10342 + ctr = be32_to_cpu(ctx->iv[3]);
10343 +- if (dd->caps.has_ctr32) {
10344 +- /* Check 32bit counter overflow. */
10345 +- u32 start = ctr;
10346 +- u32 end = start + blocks - 1;
10347 +-
10348 +- if (end < start) {
10349 +- ctr |= 0xffffffff;
10350 +- datalen = AES_BLOCK_SIZE * -start;
10351 +- fragmented = true;
10352 +- }
10353 +- } else {
10354 +- /* Check 16bit counter overflow. */
10355 +- u16 start = ctr & 0xffff;
10356 +- u16 end = start + (u16)blocks - 1;
10357 +-
10358 +- if (blocks >> 16 || end < start) {
10359 +- ctr |= 0xffff;
10360 +- datalen = AES_BLOCK_SIZE * (0x10000-start);
10361 +- fragmented = true;
10362 +- }
10363 ++
10364 ++ /* Check 16bit counter overflow. */
10365 ++ start = ctr & 0xffff;
10366 ++ end = start + blocks - 1;
10367 ++
10368 ++ if (blocks >> 16 || end < start) {
10369 ++ ctr |= 0xffff;
10370 ++ datalen = AES_BLOCK_SIZE * (0x10000 - start);
10371 ++ fragmented = true;
10372 + }
10373 ++
10374 + use_dma = (datalen >= ATMEL_AES_DMA_THRESHOLD);
10375 +
10376 + /* Jump to offset. */
10377 +@@ -2553,7 +2543,6 @@ static void atmel_aes_get_cap(struct atmel_aes_dev *dd)
10378 + {
10379 + dd->caps.has_dualbuff = 0;
10380 + dd->caps.has_cfb64 = 0;
10381 +- dd->caps.has_ctr32 = 0;
10382 + dd->caps.has_gcm = 0;
10383 + dd->caps.has_xts = 0;
10384 + dd->caps.has_authenc = 0;
10385 +@@ -2564,7 +2553,6 @@ static void atmel_aes_get_cap(struct atmel_aes_dev *dd)
10386 + case 0x500:
10387 + dd->caps.has_dualbuff = 1;
10388 + dd->caps.has_cfb64 = 1;
10389 +- dd->caps.has_ctr32 = 1;
10390 + dd->caps.has_gcm = 1;
10391 + dd->caps.has_xts = 1;
10392 + dd->caps.has_authenc = 1;
10393 +@@ -2573,7 +2561,6 @@ static void atmel_aes_get_cap(struct atmel_aes_dev *dd)
10394 + case 0x200:
10395 + dd->caps.has_dualbuff = 1;
10396 + dd->caps.has_cfb64 = 1;
10397 +- dd->caps.has_ctr32 = 1;
10398 + dd->caps.has_gcm = 1;
10399 + dd->caps.max_burst_size = 4;
10400 + break;
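[The atmel-aes CTR hunk above drops the has_ctr32 capability and always splits a transfer at the 16-bit counter wrap: with start = ctr & 0xffff and `blocks` AES blocks pending, only (0x10000 - start) blocks fit before the IV must be reloaded. A standalone sketch of that bookkeeping, with illustrative names rather than the driver's:

    #include <linux/types.h>

    #define EX_AES_BLOCK_SIZE 16

    /*
     * Return how many bytes can be processed before the low 16 bits of the
     * counter wrap, and flag whether the request must be fragmented.
     * Caller guarantees blocks >= 1.
     */
    static size_t ctr16_chunk_bytes(u32 ctr, u32 blocks, bool *fragmented)
    {
            u16 start = ctr & 0xffff;
            u16 end = start + blocks - 1;   /* deliberately wraps modulo 2^16 */

            if (blocks >> 16 || end < start) {
                    *fragmented = true;
                    return (size_t)EX_AES_BLOCK_SIZE * (0x10000 - start);
            }

            *fragmented = false;
            return (size_t)EX_AES_BLOCK_SIZE * blocks;
    }

For example, with start == 0xfffe and four blocks queued, end wraps to 0x0001, so only two blocks (32 bytes) are handled before the transfer is fragmented and resumed with a reloaded IV.]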
10401 +diff --git a/drivers/crypto/ccp/ccp-dev-v3.c b/drivers/crypto/ccp/ccp-dev-v3.c
10402 +index 240bebbcb8ac..ae0cc0a4dc5c 100644
10403 +--- a/drivers/crypto/ccp/ccp-dev-v3.c
10404 ++++ b/drivers/crypto/ccp/ccp-dev-v3.c
10405 +@@ -590,6 +590,7 @@ const struct ccp_vdata ccpv3_platform = {
10406 + .setup = NULL,
10407 + .perform = &ccp3_actions,
10408 + .offset = 0,
10409 ++ .rsamax = CCP_RSA_MAX_WIDTH,
10410 + };
10411 +
10412 + const struct ccp_vdata ccpv3 = {
10413 +diff --git a/drivers/crypto/ccree/cc_driver.h b/drivers/crypto/ccree/cc_driver.h
10414 +index be7f9bd5c559..d41193932207 100644
10415 +--- a/drivers/crypto/ccree/cc_driver.h
10416 ++++ b/drivers/crypto/ccree/cc_driver.h
10417 +@@ -131,6 +131,7 @@ struct cc_drvdata {
10418 + u32 axim_mon_offset;
10419 + u32 sig_offset;
10420 + u32 ver_offset;
10421 ++ bool pm_on;
10422 + };
10423 +
10424 + struct cc_crypto_alg {
10425 +diff --git a/drivers/crypto/ccree/cc_pm.c b/drivers/crypto/ccree/cc_pm.c
10426 +index 638082dff183..2df2c2ca8aae 100644
10427 +--- a/drivers/crypto/ccree/cc_pm.c
10428 ++++ b/drivers/crypto/ccree/cc_pm.c
10429 +@@ -23,14 +23,8 @@ const struct dev_pm_ops ccree_pm = {
10430 + int cc_pm_suspend(struct device *dev)
10431 + {
10432 + struct cc_drvdata *drvdata = dev_get_drvdata(dev);
10433 +- int rc;
10434 +
10435 + dev_dbg(dev, "set HOST_POWER_DOWN_EN\n");
10436 +- rc = cc_suspend_req_queue(drvdata);
10437 +- if (rc) {
10438 +- dev_err(dev, "cc_suspend_req_queue (%x)\n", rc);
10439 +- return rc;
10440 +- }
10441 + fini_cc_regs(drvdata);
10442 + cc_iowrite(drvdata, CC_REG(HOST_POWER_DOWN_EN), POWER_DOWN_ENABLE);
10443 + cc_clk_off(drvdata);
10444 +@@ -59,13 +53,6 @@ int cc_pm_resume(struct device *dev)
10445 + /* check if tee fips error occurred during power down */
10446 + cc_tee_handle_fips_error(drvdata);
10447 +
10448 +- rc = cc_resume_req_queue(drvdata);
10449 +- if (rc) {
10450 +- dev_err(dev, "cc_resume_req_queue (%x)\n", rc);
10451 +- return rc;
10452 +- }
10453 +-
10454 +- /* must be after the queue resuming as it uses the HW queue*/
10455 + cc_init_hash_sram(drvdata);
10456 +
10457 + cc_init_iv_sram(drvdata);
10458 +@@ -77,12 +64,10 @@ int cc_pm_get(struct device *dev)
10459 + int rc = 0;
10460 + struct cc_drvdata *drvdata = dev_get_drvdata(dev);
10461 +
10462 +- if (cc_req_queue_suspended(drvdata))
10463 ++ if (drvdata->pm_on)
10464 + rc = pm_runtime_get_sync(dev);
10465 +- else
10466 +- pm_runtime_get_noresume(dev);
10467 +
10468 +- return rc;
10469 ++ return (rc == 1 ? 0 : rc);
10470 + }
10471 +
10472 + int cc_pm_put_suspend(struct device *dev)
10473 +@@ -90,14 +75,11 @@ int cc_pm_put_suspend(struct device *dev)
10474 + int rc = 0;
10475 + struct cc_drvdata *drvdata = dev_get_drvdata(dev);
10476 +
10477 +- if (!cc_req_queue_suspended(drvdata)) {
10478 ++ if (drvdata->pm_on) {
10479 + pm_runtime_mark_last_busy(dev);
10480 + rc = pm_runtime_put_autosuspend(dev);
10481 +- } else {
10482 +- /* Something wrong happens*/
10483 +- dev_err(dev, "request to suspend already suspended queue");
10484 +- rc = -EBUSY;
10485 + }
10486 ++
10487 + return rc;
10488 + }
10489 +
10490 +@@ -108,7 +90,7 @@ int cc_pm_init(struct cc_drvdata *drvdata)
10491 + /* must be before the enabling to avoid resdundent suspending */
10492 + pm_runtime_set_autosuspend_delay(dev, CC_SUSPEND_TIMEOUT);
10493 + pm_runtime_use_autosuspend(dev);
10494 +- /* activate the PM module */
10495 ++ /* set us as active - note we won't do PM ops until cc_pm_go()! */
10496 + return pm_runtime_set_active(dev);
10497 + }
10498 +
10499 +@@ -116,9 +98,11 @@ int cc_pm_init(struct cc_drvdata *drvdata)
10500 + void cc_pm_go(struct cc_drvdata *drvdata)
10501 + {
10502 + pm_runtime_enable(drvdata_to_dev(drvdata));
10503 ++ drvdata->pm_on = true;
10504 + }
10505 +
10506 + void cc_pm_fini(struct cc_drvdata *drvdata)
10507 + {
10508 + pm_runtime_disable(drvdata_to_dev(drvdata));
10509 ++ drvdata->pm_on = false;
10510 + }
10511 +diff --git a/drivers/crypto/ccree/cc_request_mgr.c b/drivers/crypto/ccree/cc_request_mgr.c
10512 +index 83a8aaae61c7..1d88abc6d230 100644
10513 +--- a/drivers/crypto/ccree/cc_request_mgr.c
10514 ++++ b/drivers/crypto/ccree/cc_request_mgr.c
10515 +@@ -41,7 +41,6 @@ struct cc_req_mgr_handle {
10516 + #else
10517 + struct tasklet_struct comptask;
10518 + #endif
10519 +- bool is_runtime_suspended;
10520 + };
10521 +
10522 + struct cc_bl_item {
10523 +@@ -403,6 +402,7 @@ static void cc_proc_backlog(struct cc_drvdata *drvdata)
10524 + spin_lock(&mgr->bl_lock);
10525 + list_del(&bli->list);
10526 + --mgr->bl_len;
10527 ++ kfree(bli);
10528 + }
10529 +
10530 + spin_unlock(&mgr->bl_lock);
10531 +@@ -660,52 +660,3 @@ static void comp_handler(unsigned long devarg)
10532 +
10533 + cc_proc_backlog(drvdata);
10534 + }
10535 +-
10536 +-/*
10537 +- * resume the queue configuration - no need to take the lock as this happens
10538 +- * inside the spin lock protection
10539 +- */
10540 +-#if defined(CONFIG_PM)
10541 +-int cc_resume_req_queue(struct cc_drvdata *drvdata)
10542 +-{
10543 +- struct cc_req_mgr_handle *request_mgr_handle =
10544 +- drvdata->request_mgr_handle;
10545 +-
10546 +- spin_lock_bh(&request_mgr_handle->hw_lock);
10547 +- request_mgr_handle->is_runtime_suspended = false;
10548 +- spin_unlock_bh(&request_mgr_handle->hw_lock);
10549 +-
10550 +- return 0;
10551 +-}
10552 +-
10553 +-/*
10554 +- * suspend the queue configuration. Since it is used for the runtime suspend
10555 +- * only verify that the queue can be suspended.
10556 +- */
10557 +-int cc_suspend_req_queue(struct cc_drvdata *drvdata)
10558 +-{
10559 +- struct cc_req_mgr_handle *request_mgr_handle =
10560 +- drvdata->request_mgr_handle;
10561 +-
10562 +- /* lock the send_request */
10563 +- spin_lock_bh(&request_mgr_handle->hw_lock);
10564 +- if (request_mgr_handle->req_queue_head !=
10565 +- request_mgr_handle->req_queue_tail) {
10566 +- spin_unlock_bh(&request_mgr_handle->hw_lock);
10567 +- return -EBUSY;
10568 +- }
10569 +- request_mgr_handle->is_runtime_suspended = true;
10570 +- spin_unlock_bh(&request_mgr_handle->hw_lock);
10571 +-
10572 +- return 0;
10573 +-}
10574 +-
10575 +-bool cc_req_queue_suspended(struct cc_drvdata *drvdata)
10576 +-{
10577 +- struct cc_req_mgr_handle *request_mgr_handle =
10578 +- drvdata->request_mgr_handle;
10579 +-
10580 +- return request_mgr_handle->is_runtime_suspended;
10581 +-}
10582 +-
10583 +-#endif
10584 +diff --git a/drivers/crypto/ccree/cc_request_mgr.h b/drivers/crypto/ccree/cc_request_mgr.h
10585 +index 573cb97af085..ae96abce25c9 100644
10586 +--- a/drivers/crypto/ccree/cc_request_mgr.h
10587 ++++ b/drivers/crypto/ccree/cc_request_mgr.h
10588 +@@ -40,12 +40,4 @@ void complete_request(struct cc_drvdata *drvdata);
10589 +
10590 + void cc_req_mgr_fini(struct cc_drvdata *drvdata);
10591 +
10592 +-#if defined(CONFIG_PM)
10593 +-int cc_resume_req_queue(struct cc_drvdata *drvdata);
10594 +-
10595 +-int cc_suspend_req_queue(struct cc_drvdata *drvdata);
10596 +-
10597 +-bool cc_req_queue_suspended(struct cc_drvdata *drvdata);
10598 +-#endif
10599 +-
10600 + #endif /*__REQUEST_MGR_H__*/
10601 +diff --git a/drivers/crypto/geode-aes.c b/drivers/crypto/geode-aes.c
10602 +index d670f7000cbb..0bd99c0decf5 100644
10603 +--- a/drivers/crypto/geode-aes.c
10604 ++++ b/drivers/crypto/geode-aes.c
10605 +@@ -14,7 +14,7 @@
10606 + #include <linux/spinlock.h>
10607 + #include <crypto/algapi.h>
10608 + #include <crypto/aes.h>
10609 +-#include <crypto/skcipher.h>
10610 ++#include <crypto/internal/skcipher.h>
10611 +
10612 + #include <linux/io.h>
10613 + #include <linux/delay.h>
10614 +@@ -28,12 +28,12 @@ static spinlock_t lock;
10615 +
10616 + /* Write a 128 bit field (either a writable key or IV) */
10617 + static inline void
10618 +-_writefield(u32 offset, void *value)
10619 ++_writefield(u32 offset, const void *value)
10620 + {
10621 + int i;
10622 +
10623 + for (i = 0; i < 4; i++)
10624 +- iowrite32(((u32 *) value)[i], _iobase + offset + (i * 4));
10625 ++ iowrite32(((const u32 *) value)[i], _iobase + offset + (i * 4));
10626 + }
10627 +
10628 + /* Read a 128 bit field (either a writable key or IV) */
10629 +@@ -47,12 +47,12 @@ _readfield(u32 offset, void *value)
10630 + }
10631 +
10632 + static int
10633 +-do_crypt(void *src, void *dst, int len, u32 flags)
10634 ++do_crypt(const void *src, void *dst, u32 len, u32 flags)
10635 + {
10636 + u32 status;
10637 + u32 counter = AES_OP_TIMEOUT;
10638 +
10639 +- iowrite32(virt_to_phys(src), _iobase + AES_SOURCEA_REG);
10640 ++ iowrite32(virt_to_phys((void *)src), _iobase + AES_SOURCEA_REG);
10641 + iowrite32(virt_to_phys(dst), _iobase + AES_DSTA_REG);
10642 + iowrite32(len, _iobase + AES_LENA_REG);
10643 +
10644 +@@ -69,16 +69,14 @@ do_crypt(void *src, void *dst, int len, u32 flags)
10645 + return counter ? 0 : 1;
10646 + }
10647 +
10648 +-static unsigned int
10649 +-geode_aes_crypt(struct geode_aes_op *op)
10650 ++static void
10651 ++geode_aes_crypt(const struct geode_aes_tfm_ctx *tctx, const void *src,
10652 ++ void *dst, u32 len, u8 *iv, int mode, int dir)
10653 + {
10654 + u32 flags = 0;
10655 + unsigned long iflags;
10656 + int ret;
10657 +
10658 +- if (op->len == 0)
10659 +- return 0;
10660 +-
10661 + /* If the source and destination is the same, then
10662 + * we need to turn on the coherent flags, otherwise
10663 + * we don't need to worry
10664 +@@ -86,32 +84,28 @@ geode_aes_crypt(struct geode_aes_op *op)
10665 +
10666 + flags |= (AES_CTRL_DCA | AES_CTRL_SCA);
10667 +
10668 +- if (op->dir == AES_DIR_ENCRYPT)
10669 ++ if (dir == AES_DIR_ENCRYPT)
10670 + flags |= AES_CTRL_ENCRYPT;
10671 +
10672 + /* Start the critical section */
10673 +
10674 + spin_lock_irqsave(&lock, iflags);
10675 +
10676 +- if (op->mode == AES_MODE_CBC) {
10677 ++ if (mode == AES_MODE_CBC) {
10678 + flags |= AES_CTRL_CBC;
10679 +- _writefield(AES_WRITEIV0_REG, op->iv);
10680 ++ _writefield(AES_WRITEIV0_REG, iv);
10681 + }
10682 +
10683 +- if (!(op->flags & AES_FLAGS_HIDDENKEY)) {
10684 +- flags |= AES_CTRL_WRKEY;
10685 +- _writefield(AES_WRITEKEY0_REG, op->key);
10686 +- }
10687 ++ flags |= AES_CTRL_WRKEY;
10688 ++ _writefield(AES_WRITEKEY0_REG, tctx->key);
10689 +
10690 +- ret = do_crypt(op->src, op->dst, op->len, flags);
10691 ++ ret = do_crypt(src, dst, len, flags);
10692 + BUG_ON(ret);
10693 +
10694 +- if (op->mode == AES_MODE_CBC)
10695 +- _readfield(AES_WRITEIV0_REG, op->iv);
10696 ++ if (mode == AES_MODE_CBC)
10697 ++ _readfield(AES_WRITEIV0_REG, iv);
10698 +
10699 + spin_unlock_irqrestore(&lock, iflags);
10700 +-
10701 +- return op->len;
10702 + }
10703 +
10704 + /* CRYPTO-API Functions */
10705 +@@ -119,13 +113,13 @@ geode_aes_crypt(struct geode_aes_op *op)
10706 + static int geode_setkey_cip(struct crypto_tfm *tfm, const u8 *key,
10707 + unsigned int len)
10708 + {
10709 +- struct geode_aes_op *op = crypto_tfm_ctx(tfm);
10710 ++ struct geode_aes_tfm_ctx *tctx = crypto_tfm_ctx(tfm);
10711 + unsigned int ret;
10712 +
10713 +- op->keylen = len;
10714 ++ tctx->keylen = len;
10715 +
10716 + if (len == AES_KEYSIZE_128) {
10717 +- memcpy(op->key, key, len);
10718 ++ memcpy(tctx->key, key, len);
10719 + return 0;
10720 + }
10721 +
10722 +@@ -138,132 +132,93 @@ static int geode_setkey_cip(struct crypto_tfm *tfm, const u8 *key,
10723 + /*
10724 + * The requested key size is not supported by HW, do a fallback
10725 + */
10726 +- op->fallback.cip->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
10727 +- op->fallback.cip->base.crt_flags |= (tfm->crt_flags & CRYPTO_TFM_REQ_MASK);
10728 ++ tctx->fallback.cip->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
10729 ++ tctx->fallback.cip->base.crt_flags |=
10730 ++ (tfm->crt_flags & CRYPTO_TFM_REQ_MASK);
10731 +
10732 +- ret = crypto_cipher_setkey(op->fallback.cip, key, len);
10733 ++ ret = crypto_cipher_setkey(tctx->fallback.cip, key, len);
10734 + if (ret) {
10735 + tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
10736 +- tfm->crt_flags |= (op->fallback.cip->base.crt_flags & CRYPTO_TFM_RES_MASK);
10737 ++ tfm->crt_flags |= (tctx->fallback.cip->base.crt_flags &
10738 ++ CRYPTO_TFM_RES_MASK);
10739 + }
10740 + return ret;
10741 + }
10742 +
10743 +-static int geode_setkey_blk(struct crypto_tfm *tfm, const u8 *key,
10744 +- unsigned int len)
10745 ++static int geode_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
10746 ++ unsigned int len)
10747 + {
10748 +- struct geode_aes_op *op = crypto_tfm_ctx(tfm);
10749 ++ struct geode_aes_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
10750 + unsigned int ret;
10751 +
10752 +- op->keylen = len;
10753 ++ tctx->keylen = len;
10754 +
10755 + if (len == AES_KEYSIZE_128) {
10756 +- memcpy(op->key, key, len);
10757 ++ memcpy(tctx->key, key, len);
10758 + return 0;
10759 + }
10760 +
10761 + if (len != AES_KEYSIZE_192 && len != AES_KEYSIZE_256) {
10762 + /* not supported at all */
10763 +- tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
10764 ++ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
10765 + return -EINVAL;
10766 + }
10767 +
10768 + /*
10769 + * The requested key size is not supported by HW, do a fallback
10770 + */
10771 +- crypto_skcipher_clear_flags(op->fallback.blk, CRYPTO_TFM_REQ_MASK);
10772 +- crypto_skcipher_set_flags(op->fallback.blk,
10773 +- tfm->crt_flags & CRYPTO_TFM_REQ_MASK);
10774 +-
10775 +- ret = crypto_skcipher_setkey(op->fallback.blk, key, len);
10776 +- if (ret) {
10777 +- tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
10778 +- tfm->crt_flags |= crypto_skcipher_get_flags(op->fallback.blk) &
10779 +- CRYPTO_TFM_RES_MASK;
10780 +- }
10781 ++ crypto_skcipher_clear_flags(tctx->fallback.skcipher,
10782 ++ CRYPTO_TFM_REQ_MASK);
10783 ++ crypto_skcipher_set_flags(tctx->fallback.skcipher,
10784 ++ crypto_skcipher_get_flags(tfm) &
10785 ++ CRYPTO_TFM_REQ_MASK);
10786 ++ ret = crypto_skcipher_setkey(tctx->fallback.skcipher, key, len);
10787 ++ crypto_skcipher_set_flags(tfm,
10788 ++ crypto_skcipher_get_flags(tctx->fallback.skcipher) &
10789 ++ CRYPTO_TFM_RES_MASK);
10790 + return ret;
10791 + }
10792 +
10793 +-static int fallback_blk_dec(struct blkcipher_desc *desc,
10794 +- struct scatterlist *dst, struct scatterlist *src,
10795 +- unsigned int nbytes)
10796 +-{
10797 +- struct geode_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
10798 +- SKCIPHER_REQUEST_ON_STACK(req, op->fallback.blk);
10799 +-
10800 +- skcipher_request_set_tfm(req, op->fallback.blk);
10801 +- skcipher_request_set_callback(req, 0, NULL, NULL);
10802 +- skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
10803 +-
10804 +- return crypto_skcipher_decrypt(req);
10805 +-}
10806 +-
10807 +-static int fallback_blk_enc(struct blkcipher_desc *desc,
10808 +- struct scatterlist *dst, struct scatterlist *src,
10809 +- unsigned int nbytes)
10810 +-{
10811 +- struct geode_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
10812 +- SKCIPHER_REQUEST_ON_STACK(req, op->fallback.blk);
10813 +-
10814 +- skcipher_request_set_tfm(req, op->fallback.blk);
10815 +- skcipher_request_set_callback(req, 0, NULL, NULL);
10816 +- skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
10817 +-
10818 +- return crypto_skcipher_encrypt(req);
10819 +-}
10820 +-
10821 + static void
10822 + geode_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
10823 + {
10824 +- struct geode_aes_op *op = crypto_tfm_ctx(tfm);
10825 ++ const struct geode_aes_tfm_ctx *tctx = crypto_tfm_ctx(tfm);
10826 +
10827 +- if (unlikely(op->keylen != AES_KEYSIZE_128)) {
10828 +- crypto_cipher_encrypt_one(op->fallback.cip, out, in);
10829 ++ if (unlikely(tctx->keylen != AES_KEYSIZE_128)) {
10830 ++ crypto_cipher_encrypt_one(tctx->fallback.cip, out, in);
10831 + return;
10832 + }
10833 +
10834 +- op->src = (void *) in;
10835 +- op->dst = (void *) out;
10836 +- op->mode = AES_MODE_ECB;
10837 +- op->flags = 0;
10838 +- op->len = AES_BLOCK_SIZE;
10839 +- op->dir = AES_DIR_ENCRYPT;
10840 +-
10841 +- geode_aes_crypt(op);
10842 ++ geode_aes_crypt(tctx, in, out, AES_BLOCK_SIZE, NULL,
10843 ++ AES_MODE_ECB, AES_DIR_ENCRYPT);
10844 + }
10845 +
10846 +
10847 + static void
10848 + geode_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
10849 + {
10850 +- struct geode_aes_op *op = crypto_tfm_ctx(tfm);
10851 ++ const struct geode_aes_tfm_ctx *tctx = crypto_tfm_ctx(tfm);
10852 +
10853 +- if (unlikely(op->keylen != AES_KEYSIZE_128)) {
10854 +- crypto_cipher_decrypt_one(op->fallback.cip, out, in);
10855 ++ if (unlikely(tctx->keylen != AES_KEYSIZE_128)) {
10856 ++ crypto_cipher_decrypt_one(tctx->fallback.cip, out, in);
10857 + return;
10858 + }
10859 +
10860 +- op->src = (void *) in;
10861 +- op->dst = (void *) out;
10862 +- op->mode = AES_MODE_ECB;
10863 +- op->flags = 0;
10864 +- op->len = AES_BLOCK_SIZE;
10865 +- op->dir = AES_DIR_DECRYPT;
10866 +-
10867 +- geode_aes_crypt(op);
10868 ++ geode_aes_crypt(tctx, in, out, AES_BLOCK_SIZE, NULL,
10869 ++ AES_MODE_ECB, AES_DIR_DECRYPT);
10870 + }
10871 +
10872 + static int fallback_init_cip(struct crypto_tfm *tfm)
10873 + {
10874 + const char *name = crypto_tfm_alg_name(tfm);
10875 +- struct geode_aes_op *op = crypto_tfm_ctx(tfm);
10876 ++ struct geode_aes_tfm_ctx *tctx = crypto_tfm_ctx(tfm);
10877 +
10878 +- op->fallback.cip = crypto_alloc_cipher(name, 0,
10879 +- CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
10880 ++ tctx->fallback.cip = crypto_alloc_cipher(name, 0,
10881 ++ CRYPTO_ALG_NEED_FALLBACK);
10882 +
10883 +- if (IS_ERR(op->fallback.cip)) {
10884 ++ if (IS_ERR(tctx->fallback.cip)) {
10885 + printk(KERN_ERR "Error allocating fallback algo %s\n", name);
10886 +- return PTR_ERR(op->fallback.cip);
10887 ++ return PTR_ERR(tctx->fallback.cip);
10888 + }
10889 +
10890 + return 0;
10891 +@@ -271,10 +226,9 @@ static int fallback_init_cip(struct crypto_tfm *tfm)
10892 +
10893 + static void fallback_exit_cip(struct crypto_tfm *tfm)
10894 + {
10895 +- struct geode_aes_op *op = crypto_tfm_ctx(tfm);
10896 ++ struct geode_aes_tfm_ctx *tctx = crypto_tfm_ctx(tfm);
10897 +
10898 +- crypto_free_cipher(op->fallback.cip);
10899 +- op->fallback.cip = NULL;
10900 ++ crypto_free_cipher(tctx->fallback.cip);
10901 + }
10902 +
10903 + static struct crypto_alg geode_alg = {
10904 +@@ -287,7 +241,7 @@ static struct crypto_alg geode_alg = {
10905 + .cra_init = fallback_init_cip,
10906 + .cra_exit = fallback_exit_cip,
10907 + .cra_blocksize = AES_BLOCK_SIZE,
10908 +- .cra_ctxsize = sizeof(struct geode_aes_op),
10909 ++ .cra_ctxsize = sizeof(struct geode_aes_tfm_ctx),
10910 + .cra_module = THIS_MODULE,
10911 + .cra_u = {
10912 + .cipher = {
10913 +@@ -300,222 +254,126 @@ static struct crypto_alg geode_alg = {
10914 + }
10915 + };
10916 +
10917 +-static int
10918 +-geode_cbc_decrypt(struct blkcipher_desc *desc,
10919 +- struct scatterlist *dst, struct scatterlist *src,
10920 +- unsigned int nbytes)
10921 ++static int geode_init_skcipher(struct crypto_skcipher *tfm)
10922 + {
10923 +- struct geode_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
10924 +- struct blkcipher_walk walk;
10925 +- int err, ret;
10926 +-
10927 +- if (nbytes % AES_BLOCK_SIZE)
10928 +- return -EINVAL;
10929 +-
10930 +- if (unlikely(op->keylen != AES_KEYSIZE_128))
10931 +- return fallback_blk_dec(desc, dst, src, nbytes);
10932 ++ const char *name = crypto_tfm_alg_name(&tfm->base);
10933 ++ struct geode_aes_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
10934 +
10935 +- blkcipher_walk_init(&walk, dst, src, nbytes);
10936 +- err = blkcipher_walk_virt(desc, &walk);
10937 +- op->iv = walk.iv;
10938 +-
10939 +- while ((nbytes = walk.nbytes)) {
10940 +- op->src = walk.src.virt.addr,
10941 +- op->dst = walk.dst.virt.addr;
10942 +- op->mode = AES_MODE_CBC;
10943 +- op->len = nbytes - (nbytes % AES_BLOCK_SIZE);
10944 +- op->dir = AES_DIR_DECRYPT;
10945 +-
10946 +- ret = geode_aes_crypt(op);
10947 +-
10948 +- nbytes -= ret;
10949 +- err = blkcipher_walk_done(desc, &walk, nbytes);
10950 ++ tctx->fallback.skcipher =
10951 ++ crypto_alloc_skcipher(name, 0, CRYPTO_ALG_NEED_FALLBACK |
10952 ++ CRYPTO_ALG_ASYNC);
10953 ++ if (IS_ERR(tctx->fallback.skcipher)) {
10954 ++ printk(KERN_ERR "Error allocating fallback algo %s\n", name);
10955 ++ return PTR_ERR(tctx->fallback.skcipher);
10956 + }
10957 +
10958 +- return err;
10959 ++ crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
10960 ++ crypto_skcipher_reqsize(tctx->fallback.skcipher));
10961 ++ return 0;
10962 + }
10963 +
10964 +-static int
10965 +-geode_cbc_encrypt(struct blkcipher_desc *desc,
10966 +- struct scatterlist *dst, struct scatterlist *src,
10967 +- unsigned int nbytes)
10968 ++static void geode_exit_skcipher(struct crypto_skcipher *tfm)
10969 + {
10970 +- struct geode_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
10971 +- struct blkcipher_walk walk;
10972 +- int err, ret;
10973 ++ struct geode_aes_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
10974 +
10975 +- if (nbytes % AES_BLOCK_SIZE)
10976 +- return -EINVAL;
10977 +-
10978 +- if (unlikely(op->keylen != AES_KEYSIZE_128))
10979 +- return fallback_blk_enc(desc, dst, src, nbytes);
10980 ++ crypto_free_skcipher(tctx->fallback.skcipher);
10981 ++}
10982 +
10983 +- blkcipher_walk_init(&walk, dst, src, nbytes);
10984 +- err = blkcipher_walk_virt(desc, &walk);
10985 +- op->iv = walk.iv;
10986 ++static int geode_skcipher_crypt(struct skcipher_request *req, int mode, int dir)
10987 ++{
10988 ++ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
10989 ++ const struct geode_aes_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
10990 ++ struct skcipher_walk walk;
10991 ++ unsigned int nbytes;
10992 ++ int err;
10993 ++
10994 ++ if (unlikely(tctx->keylen != AES_KEYSIZE_128)) {
10995 ++ struct skcipher_request *subreq = skcipher_request_ctx(req);
10996 ++
10997 ++ *subreq = *req;
10998 ++ skcipher_request_set_tfm(subreq, tctx->fallback.skcipher);
10999 ++ if (dir == AES_DIR_DECRYPT)
11000 ++ return crypto_skcipher_decrypt(subreq);
11001 ++ else
11002 ++ return crypto_skcipher_encrypt(subreq);
11003 ++ }
11004 +
11005 +- while ((nbytes = walk.nbytes)) {
11006 +- op->src = walk.src.virt.addr,
11007 +- op->dst = walk.dst.virt.addr;
11008 +- op->mode = AES_MODE_CBC;
11009 +- op->len = nbytes - (nbytes % AES_BLOCK_SIZE);
11010 +- op->dir = AES_DIR_ENCRYPT;
11011 ++ err = skcipher_walk_virt(&walk, req, false);
11012 +
11013 +- ret = geode_aes_crypt(op);
11014 +- nbytes -= ret;
11015 +- err = blkcipher_walk_done(desc, &walk, nbytes);
11016 ++ while ((nbytes = walk.nbytes) != 0) {
11017 ++ geode_aes_crypt(tctx, walk.src.virt.addr, walk.dst.virt.addr,
11018 ++ round_down(nbytes, AES_BLOCK_SIZE),
11019 ++ walk.iv, mode, dir);
11020 ++ err = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE);
11021 + }
11022 +
11023 + return err;
11024 + }
11025 +
11026 +-static int fallback_init_blk(struct crypto_tfm *tfm)
11027 ++static int geode_cbc_encrypt(struct skcipher_request *req)
11028 + {
11029 +- const char *name = crypto_tfm_alg_name(tfm);
11030 +- struct geode_aes_op *op = crypto_tfm_ctx(tfm);
11031 +-
11032 +- op->fallback.blk = crypto_alloc_skcipher(name, 0,
11033 +- CRYPTO_ALG_ASYNC |
11034 +- CRYPTO_ALG_NEED_FALLBACK);
11035 +-
11036 +- if (IS_ERR(op->fallback.blk)) {
11037 +- printk(KERN_ERR "Error allocating fallback algo %s\n", name);
11038 +- return PTR_ERR(op->fallback.blk);
11039 +- }
11040 +-
11041 +- return 0;
11042 ++ return geode_skcipher_crypt(req, AES_MODE_CBC, AES_DIR_ENCRYPT);
11043 + }
11044 +
11045 +-static void fallback_exit_blk(struct crypto_tfm *tfm)
11046 ++static int geode_cbc_decrypt(struct skcipher_request *req)
11047 + {
11048 +- struct geode_aes_op *op = crypto_tfm_ctx(tfm);
11049 +-
11050 +- crypto_free_skcipher(op->fallback.blk);
11051 +- op->fallback.blk = NULL;
11052 ++ return geode_skcipher_crypt(req, AES_MODE_CBC, AES_DIR_DECRYPT);
11053 + }
11054 +
11055 +-static struct crypto_alg geode_cbc_alg = {
11056 +- .cra_name = "cbc(aes)",
11057 +- .cra_driver_name = "cbc-aes-geode",
11058 +- .cra_priority = 400,
11059 +- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
11060 +- CRYPTO_ALG_KERN_DRIVER_ONLY |
11061 +- CRYPTO_ALG_NEED_FALLBACK,
11062 +- .cra_init = fallback_init_blk,
11063 +- .cra_exit = fallback_exit_blk,
11064 +- .cra_blocksize = AES_BLOCK_SIZE,
11065 +- .cra_ctxsize = sizeof(struct geode_aes_op),
11066 +- .cra_alignmask = 15,
11067 +- .cra_type = &crypto_blkcipher_type,
11068 +- .cra_module = THIS_MODULE,
11069 +- .cra_u = {
11070 +- .blkcipher = {
11071 +- .min_keysize = AES_MIN_KEY_SIZE,
11072 +- .max_keysize = AES_MAX_KEY_SIZE,
11073 +- .setkey = geode_setkey_blk,
11074 +- .encrypt = geode_cbc_encrypt,
11075 +- .decrypt = geode_cbc_decrypt,
11076 +- .ivsize = AES_BLOCK_SIZE,
11077 +- }
11078 +- }
11079 +-};
11080 +-
11081 +-static int
11082 +-geode_ecb_decrypt(struct blkcipher_desc *desc,
11083 +- struct scatterlist *dst, struct scatterlist *src,
11084 +- unsigned int nbytes)
11085 ++static int geode_ecb_encrypt(struct skcipher_request *req)
11086 + {
11087 +- struct geode_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
11088 +- struct blkcipher_walk walk;
11089 +- int err, ret;
11090 +-
11091 +- if (nbytes % AES_BLOCK_SIZE)
11092 +- return -EINVAL;
11093 +-
11094 +- if (unlikely(op->keylen != AES_KEYSIZE_128))
11095 +- return fallback_blk_dec(desc, dst, src, nbytes);
11096 +-
11097 +- blkcipher_walk_init(&walk, dst, src, nbytes);
11098 +- err = blkcipher_walk_virt(desc, &walk);
11099 +-
11100 +- while ((nbytes = walk.nbytes)) {
11101 +- op->src = walk.src.virt.addr,
11102 +- op->dst = walk.dst.virt.addr;
11103 +- op->mode = AES_MODE_ECB;
11104 +- op->len = nbytes - (nbytes % AES_BLOCK_SIZE);
11105 +- op->dir = AES_DIR_DECRYPT;
11106 +-
11107 +- ret = geode_aes_crypt(op);
11108 +- nbytes -= ret;
11109 +- err = blkcipher_walk_done(desc, &walk, nbytes);
11110 +- }
11111 +-
11112 +- return err;
11113 ++ return geode_skcipher_crypt(req, AES_MODE_ECB, AES_DIR_ENCRYPT);
11114 + }
11115 +
11116 +-static int
11117 +-geode_ecb_encrypt(struct blkcipher_desc *desc,
11118 +- struct scatterlist *dst, struct scatterlist *src,
11119 +- unsigned int nbytes)
11120 ++static int geode_ecb_decrypt(struct skcipher_request *req)
11121 + {
11122 +- struct geode_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
11123 +- struct blkcipher_walk walk;
11124 +- int err, ret;
11125 +-
11126 +- if (nbytes % AES_BLOCK_SIZE)
11127 +- return -EINVAL;
11128 +-
11129 +- if (unlikely(op->keylen != AES_KEYSIZE_128))
11130 +- return fallback_blk_enc(desc, dst, src, nbytes);
11131 +-
11132 +- blkcipher_walk_init(&walk, dst, src, nbytes);
11133 +- err = blkcipher_walk_virt(desc, &walk);
11134 +-
11135 +- while ((nbytes = walk.nbytes)) {
11136 +- op->src = walk.src.virt.addr,
11137 +- op->dst = walk.dst.virt.addr;
11138 +- op->mode = AES_MODE_ECB;
11139 +- op->len = nbytes - (nbytes % AES_BLOCK_SIZE);
11140 +- op->dir = AES_DIR_ENCRYPT;
11141 +-
11142 +- ret = geode_aes_crypt(op);
11143 +- nbytes -= ret;
11144 +- ret = blkcipher_walk_done(desc, &walk, nbytes);
11145 +- }
11146 +-
11147 +- return err;
11148 ++ return geode_skcipher_crypt(req, AES_MODE_ECB, AES_DIR_DECRYPT);
11149 + }
11150 +
11151 +-static struct crypto_alg geode_ecb_alg = {
11152 +- .cra_name = "ecb(aes)",
11153 +- .cra_driver_name = "ecb-aes-geode",
11154 +- .cra_priority = 400,
11155 +- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
11156 +- CRYPTO_ALG_KERN_DRIVER_ONLY |
11157 +- CRYPTO_ALG_NEED_FALLBACK,
11158 +- .cra_init = fallback_init_blk,
11159 +- .cra_exit = fallback_exit_blk,
11160 +- .cra_blocksize = AES_BLOCK_SIZE,
11161 +- .cra_ctxsize = sizeof(struct geode_aes_op),
11162 +- .cra_alignmask = 15,
11163 +- .cra_type = &crypto_blkcipher_type,
11164 +- .cra_module = THIS_MODULE,
11165 +- .cra_u = {
11166 +- .blkcipher = {
11167 +- .min_keysize = AES_MIN_KEY_SIZE,
11168 +- .max_keysize = AES_MAX_KEY_SIZE,
11169 +- .setkey = geode_setkey_blk,
11170 +- .encrypt = geode_ecb_encrypt,
11171 +- .decrypt = geode_ecb_decrypt,
11172 +- }
11173 +- }
11174 ++static struct skcipher_alg geode_skcipher_algs[] = {
11175 ++ {
11176 ++ .base.cra_name = "cbc(aes)",
11177 ++ .base.cra_driver_name = "cbc-aes-geode",
11178 ++ .base.cra_priority = 400,
11179 ++ .base.cra_flags = CRYPTO_ALG_KERN_DRIVER_ONLY |
11180 ++ CRYPTO_ALG_NEED_FALLBACK,
11181 ++ .base.cra_blocksize = AES_BLOCK_SIZE,
11182 ++ .base.cra_ctxsize = sizeof(struct geode_aes_tfm_ctx),
11183 ++ .base.cra_alignmask = 15,
11184 ++ .base.cra_module = THIS_MODULE,
11185 ++ .init = geode_init_skcipher,
11186 ++ .exit = geode_exit_skcipher,
11187 ++ .setkey = geode_setkey_skcipher,
11188 ++ .encrypt = geode_cbc_encrypt,
11189 ++ .decrypt = geode_cbc_decrypt,
11190 ++ .min_keysize = AES_MIN_KEY_SIZE,
11191 ++ .max_keysize = AES_MAX_KEY_SIZE,
11192 ++ .ivsize = AES_BLOCK_SIZE,
11193 ++ }, {
11194 ++ .base.cra_name = "ecb(aes)",
11195 ++ .base.cra_driver_name = "ecb-aes-geode",
11196 ++ .base.cra_priority = 400,
11197 ++ .base.cra_flags = CRYPTO_ALG_KERN_DRIVER_ONLY |
11198 ++ CRYPTO_ALG_NEED_FALLBACK,
11199 ++ .base.cra_blocksize = AES_BLOCK_SIZE,
11200 ++ .base.cra_ctxsize = sizeof(struct geode_aes_tfm_ctx),
11201 ++ .base.cra_alignmask = 15,
11202 ++ .base.cra_module = THIS_MODULE,
11203 ++ .init = geode_init_skcipher,
11204 ++ .exit = geode_exit_skcipher,
11205 ++ .setkey = geode_setkey_skcipher,
11206 ++ .encrypt = geode_ecb_encrypt,
11207 ++ .decrypt = geode_ecb_decrypt,
11208 ++ .min_keysize = AES_MIN_KEY_SIZE,
11209 ++ .max_keysize = AES_MAX_KEY_SIZE,
11210 ++ },
11211 + };
11212 +
11213 + static void geode_aes_remove(struct pci_dev *dev)
11214 + {
11215 + crypto_unregister_alg(&geode_alg);
11216 +- crypto_unregister_alg(&geode_ecb_alg);
11217 +- crypto_unregister_alg(&geode_cbc_alg);
11218 ++ crypto_unregister_skciphers(geode_skcipher_algs,
11219 ++ ARRAY_SIZE(geode_skcipher_algs));
11220 +
11221 + pci_iounmap(dev, _iobase);
11222 + _iobase = NULL;
11223 +@@ -553,20 +411,14 @@ static int geode_aes_probe(struct pci_dev *dev, const struct pci_device_id *id)
11224 + if (ret)
11225 + goto eiomap;
11226 +
11227 +- ret = crypto_register_alg(&geode_ecb_alg);
11228 ++ ret = crypto_register_skciphers(geode_skcipher_algs,
11229 ++ ARRAY_SIZE(geode_skcipher_algs));
11230 + if (ret)
11231 + goto ealg;
11232 +
11233 +- ret = crypto_register_alg(&geode_cbc_alg);
11234 +- if (ret)
11235 +- goto eecb;
11236 +-
11237 + dev_notice(&dev->dev, "GEODE AES engine enabled.\n");
11238 + return 0;
11239 +
11240 +- eecb:
11241 +- crypto_unregister_alg(&geode_ecb_alg);
11242 +-
11243 + ealg:
11244 + crypto_unregister_alg(&geode_alg);
11245 +
11246 +diff --git a/drivers/crypto/geode-aes.h b/drivers/crypto/geode-aes.h
11247 +index c5763a041bb8..157443dc6d8a 100644
11248 +--- a/drivers/crypto/geode-aes.h
11249 ++++ b/drivers/crypto/geode-aes.h
11250 +@@ -50,21 +50,10 @@
11251 +
11252 + #define AES_OP_TIMEOUT 0x50000
11253 +
11254 +-struct geode_aes_op {
11255 +-
11256 +- void *src;
11257 +- void *dst;
11258 +-
11259 +- u32 mode;
11260 +- u32 dir;
11261 +- u32 flags;
11262 +- int len;
11263 +-
11264 ++struct geode_aes_tfm_ctx {
11265 + u8 key[AES_KEYSIZE_128];
11266 +- u8 *iv;
11267 +-
11268 + union {
11269 +- struct crypto_skcipher *blk;
11270 ++ struct crypto_skcipher *skcipher;
11271 + struct crypto_cipher *cip;
11272 + } fallback;
11273 + u32 keylen;
11274 +diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c
11275 +index 321d5e2ac833..e2491754c468 100644
11276 +--- a/drivers/crypto/picoxcell_crypto.c
11277 ++++ b/drivers/crypto/picoxcell_crypto.c
11278 +@@ -1616,6 +1616,11 @@ static const struct of_device_id spacc_of_id_table[] = {
11279 + MODULE_DEVICE_TABLE(of, spacc_of_id_table);
11280 + #endif /* CONFIG_OF */
11281 +
11282 ++static void spacc_tasklet_kill(void *data)
11283 ++{
11284 ++ tasklet_kill(data);
11285 ++}
11286 ++
11287 + static int spacc_probe(struct platform_device *pdev)
11288 + {
11289 + int i, err, ret;
11290 +@@ -1659,6 +1664,14 @@ static int spacc_probe(struct platform_device *pdev)
11291 + return -ENXIO;
11292 + }
11293 +
11294 ++ tasklet_init(&engine->complete, spacc_spacc_complete,
11295 ++ (unsigned long)engine);
11296 ++
11297 ++ ret = devm_add_action(&pdev->dev, spacc_tasklet_kill,
11298 ++ &engine->complete);
11299 ++ if (ret)
11300 ++ return ret;
11301 ++
11302 + if (devm_request_irq(&pdev->dev, irq->start, spacc_spacc_irq, 0,
11303 + engine->name, engine)) {
11304 + dev_err(engine->dev, "failed to request IRQ\n");
11305 +@@ -1716,8 +1729,6 @@ static int spacc_probe(struct platform_device *pdev)
11306 + INIT_LIST_HEAD(&engine->completed);
11307 + INIT_LIST_HEAD(&engine->in_progress);
11308 + engine->in_flight = 0;
11309 +- tasklet_init(&engine->complete, spacc_spacc_complete,
11310 +- (unsigned long)engine);
11311 +
11312 + platform_set_drvdata(pdev, engine);
11313 +
11314 +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
11315 +index 8403b6a9a77b..d8c53ddc23b4 100644
11316 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
11317 ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
11318 +@@ -248,7 +248,8 @@ bool dm_helpers_dp_mst_write_payload_allocation_table(
11319 + drm_dp_mst_reset_vcpi_slots(mst_mgr, mst_port);
11320 + }
11321 +
11322 +- ret = drm_dp_update_payload_part1(mst_mgr);
11323 ++ /* It's OK for this to fail */
11324 ++ drm_dp_update_payload_part1(mst_mgr);
11325 +
11326 + /* mst_mgr->payloads are VC payload notify MST branch using DPCD or
11327 + * AUX message. The sequence is slot 1-63 allocated sequence for each
11328 +@@ -257,9 +258,6 @@ bool dm_helpers_dp_mst_write_payload_allocation_table(
11329 +
11330 + get_payload_table(aconnector, proposed_table);
11331 +
11332 +- if (ret)
11333 +- return false;
11334 +-
11335 + return true;
11336 + }
11337 +
11338 +@@ -310,7 +308,6 @@ bool dm_helpers_dp_mst_send_payload_allocation(
11339 + struct amdgpu_dm_connector *aconnector;
11340 + struct drm_dp_mst_topology_mgr *mst_mgr;
11341 + struct drm_dp_mst_port *mst_port;
11342 +- int ret;
11343 +
11344 + aconnector = stream->sink->priv;
11345 +
11346 +@@ -324,10 +321,8 @@ bool dm_helpers_dp_mst_send_payload_allocation(
11347 + if (!mst_mgr->mst_state)
11348 + return false;
11349 +
11350 +- ret = drm_dp_update_payload_part2(mst_mgr);
11351 +-
11352 +- if (ret)
11353 +- return false;
11354 ++ /* It's OK for this to fail */
11355 ++ drm_dp_update_payload_part2(mst_mgr);
11356 +
11357 + if (!enable)
11358 + drm_dp_mst_deallocate_vcpi(mst_mgr, mst_port);
11359 +diff --git a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
11360 +index d73281095fac..976109c20d49 100644
11361 +--- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
11362 ++++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
11363 +@@ -79,7 +79,11 @@ static void atmel_hlcdc_crtc_mode_set_nofb(struct drm_crtc *c)
11364 + struct videomode vm;
11365 + unsigned long prate;
11366 + unsigned int cfg;
11367 +- int div;
11368 ++ int div, ret;
11369 ++
11370 ++ ret = clk_prepare_enable(crtc->dc->hlcdc->sys_clk);
11371 ++ if (ret)
11372 ++ return;
11373 +
11374 + vm.vfront_porch = adj->crtc_vsync_start - adj->crtc_vdisplay;
11375 + vm.vback_porch = adj->crtc_vtotal - adj->crtc_vsync_end;
11376 +@@ -138,6 +142,8 @@ static void atmel_hlcdc_crtc_mode_set_nofb(struct drm_crtc *c)
11377 + ATMEL_HLCDC_VSPSU | ATMEL_HLCDC_VSPHO |
11378 + ATMEL_HLCDC_GUARDTIME_MASK | ATMEL_HLCDC_MODE_MASK,
11379 + cfg);
11380 ++
11381 ++ clk_disable_unprepare(crtc->dc->hlcdc->sys_clk);
11382 + }
11383 +
11384 + static enum drm_mode_status
11385 +diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c
11386 +index 58fe3945494c..bf4eed5f6a7e 100644
11387 +--- a/drivers/gpu/drm/drm_dp_mst_topology.c
11388 ++++ b/drivers/gpu/drm/drm_dp_mst_topology.c
11389 +@@ -2125,6 +2125,7 @@ static bool drm_dp_get_vc_payload_bw(int dp_link_bw,
11390 + int drm_dp_mst_topology_mgr_set_mst(struct drm_dp_mst_topology_mgr *mgr, bool mst_state)
11391 + {
11392 + int ret = 0;
11393 ++ int i = 0;
11394 + struct drm_dp_mst_branch *mstb = NULL;
11395 +
11396 + mutex_lock(&mgr->lock);
11397 +@@ -2185,10 +2186,21 @@ int drm_dp_mst_topology_mgr_set_mst(struct drm_dp_mst_topology_mgr *mgr, bool ms
11398 + /* this can fail if the device is gone */
11399 + drm_dp_dpcd_writeb(mgr->aux, DP_MSTM_CTRL, 0);
11400 + ret = 0;
11401 ++ mutex_lock(&mgr->payload_lock);
11402 + memset(mgr->payloads, 0, mgr->max_payloads * sizeof(struct drm_dp_payload));
11403 + mgr->payload_mask = 0;
11404 + set_bit(0, &mgr->payload_mask);
11405 ++ for (i = 0; i < mgr->max_payloads; i++) {
11406 ++ struct drm_dp_vcpi *vcpi = mgr->proposed_vcpis[i];
11407 ++
11408 ++ if (vcpi) {
11409 ++ vcpi->vcpi = 0;
11410 ++ vcpi->num_slots = 0;
11411 ++ }
11412 ++ mgr->proposed_vcpis[i] = NULL;
11413 ++ }
11414 + mgr->vcpi_mask = 0;
11415 ++ mutex_unlock(&mgr->payload_lock);
11416 + }
11417 +
11418 + out_unlock:
11419 +diff --git a/drivers/gpu/drm/drm_rect.c b/drivers/gpu/drm/drm_rect.c
11420 +index 8c057829b804..0f5a0c64c4c4 100644
11421 +--- a/drivers/gpu/drm/drm_rect.c
11422 ++++ b/drivers/gpu/drm/drm_rect.c
11423 +@@ -52,7 +52,12 @@ EXPORT_SYMBOL(drm_rect_intersect);
11424 +
11425 + static u32 clip_scaled(u32 src, u32 dst, u32 clip)
11426 + {
11427 +- u64 tmp = mul_u32_u32(src, dst - clip);
11428 ++ u64 tmp;
11429 ++
11430 ++ if (dst == 0)
11431 ++ return 0;
11432 ++
11433 ++ tmp = mul_u32_u32(src, dst - clip);
11434 +
11435 + /*
11436 + * Round toward 1.0 when clipping so that we don't accidentally
11437 +diff --git a/drivers/gpu/drm/msm/disp/mdp4/mdp4_dsi_encoder.c b/drivers/gpu/drm/msm/disp/mdp4/mdp4_dsi_encoder.c
11438 +index 6a1ebdace391..6253717d2e0c 100644
11439 +--- a/drivers/gpu/drm/msm/disp/mdp4/mdp4_dsi_encoder.c
11440 ++++ b/drivers/gpu/drm/msm/disp/mdp4/mdp4_dsi_encoder.c
11441 +@@ -139,7 +139,7 @@ static void mdp4_dsi_encoder_enable(struct drm_encoder *encoder)
11442 + if (mdp4_dsi_encoder->enabled)
11443 + return;
11444 +
11445 +- mdp4_crtc_set_config(encoder->crtc,
11446 ++ mdp4_crtc_set_config(encoder->crtc,
11447 + MDP4_DMA_CONFIG_PACK_ALIGN_MSB |
11448 + MDP4_DMA_CONFIG_DEFLKR_EN |
11449 + MDP4_DMA_CONFIG_DITHER_EN |
11450 +diff --git a/drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c b/drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c
11451 +index 97a0573cc514..79eb11cd185d 100644
11452 +--- a/drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c
11453 ++++ b/drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c
11454 +@@ -357,8 +357,7 @@ static void sun6i_dsi_inst_init(struct sun6i_dsi *dsi,
11455 + static u16 sun6i_dsi_get_video_start_delay(struct sun6i_dsi *dsi,
11456 + struct drm_display_mode *mode)
11457 + {
11458 +- u16 start = clamp(mode->vtotal - mode->vdisplay - 10, 8, 100);
11459 +- u16 delay = mode->vtotal - (mode->vsync_end - mode->vdisplay) + start;
11460 ++ u16 delay = mode->vtotal - (mode->vsync_end - mode->vdisplay) + 1;
11461 +
11462 + if (delay > mode->vtotal)
11463 + delay = delay % mode->vtotal;
11464 +diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
11465 +index d2a735ac9ba1..9ca0706a9d40 100644
11466 +--- a/drivers/hv/hv_balloon.c
11467 ++++ b/drivers/hv/hv_balloon.c
11468 +@@ -1215,10 +1215,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
11469 + unsigned int i = 0;
11470 + struct page *pg;
11471 +
11472 +- if (num_pages < alloc_unit)
11473 +- return 0;
11474 +-
11475 +- for (i = 0; (i * alloc_unit) < num_pages; i++) {
11476 ++ for (i = 0; i < num_pages / alloc_unit; i++) {
11477 + if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
11478 + PAGE_SIZE)
11479 + return i * alloc_unit;
11480 +@@ -1252,7 +1249,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
11481 +
11482 + }
11483 +
11484 +- return num_pages;
11485 ++ return i * alloc_unit;
11486 + }
11487 +
11488 + static void balloon_up(struct work_struct *dummy)
11489 +@@ -1267,9 +1264,6 @@ static void balloon_up(struct work_struct *dummy)
11490 + long avail_pages;
11491 + unsigned long floor;
11492 +
11493 +- /* The host balloons pages in 2M granularity. */
11494 +- WARN_ON_ONCE(num_pages % PAGES_IN_2M != 0);
11495 +-
11496 + /*
11497 + * We will attempt 2M allocations. However, if we fail to
11498 + * allocate 2M chunks, we will go back to 4k allocations.
11499 +@@ -1279,14 +1273,13 @@ static void balloon_up(struct work_struct *dummy)
11500 + avail_pages = si_mem_available();
11501 + floor = compute_balloon_floor();
11502 +
11503 +- /* Refuse to balloon below the floor, keep the 2M granularity. */
11504 ++ /* Refuse to balloon below the floor. */
11505 + if (avail_pages < num_pages || avail_pages - num_pages < floor) {
11506 + pr_warn("Balloon request will be partially fulfilled. %s\n",
11507 + avail_pages < num_pages ? "Not enough memory." :
11508 + "Balloon floor reached.");
11509 +
11510 + num_pages = avail_pages > floor ? (avail_pages - floor) : 0;
11511 +- num_pages -= num_pages % PAGES_IN_2M;
11512 + }
11513 +
11514 + while (!done) {
11515 +diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
11516 +index 6ec748eccff7..4bda1242df87 100644
11517 +--- a/drivers/infiniband/core/umem_odp.c
11518 ++++ b/drivers/infiniband/core/umem_odp.c
11519 +@@ -689,7 +689,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
11520 +
11521 + while (bcnt > 0) {
11522 + const size_t gup_num_pages = min_t(size_t,
11523 +- (bcnt + BIT(page_shift) - 1) >> page_shift,
11524 ++ ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
11525 + PAGE_SIZE / sizeof(struct page *));
11526 +
11527 + down_read(&owning_mm->mmap_sem);
11528 +diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c
11529 +index 4950df3f71b6..5c73c0a790fa 100644
11530 +--- a/drivers/infiniband/hw/mlx5/gsi.c
11531 ++++ b/drivers/infiniband/hw/mlx5/gsi.c
11532 +@@ -507,8 +507,7 @@ int mlx5_ib_gsi_post_send(struct ib_qp *qp, const struct ib_send_wr *wr,
11533 + ret = ib_post_send(tx_qp, &cur_wr.wr, bad_wr);
11534 + if (ret) {
11535 + /* Undo the effect of adding the outstanding wr */
11536 +- gsi->outstanding_pi = (gsi->outstanding_pi - 1) %
11537 +- gsi->cap.max_send_wr;
11538 ++ gsi->outstanding_pi--;
11539 + goto err;
11540 + }
11541 + spin_unlock_irqrestore(&gsi->lock, flags);
11542 +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
11543 +index 4677b18ac281..2a2f189dd37c 100644
11544 +--- a/drivers/md/bcache/bcache.h
11545 ++++ b/drivers/md/bcache/bcache.h
11546 +@@ -329,6 +329,9 @@ struct cached_dev {
11547 + */
11548 + atomic_t has_dirty;
11549 +
11550 ++#define BCH_CACHE_READA_ALL 0
11551 ++#define BCH_CACHE_READA_META_ONLY 1
11552 ++ unsigned int cache_readahead_policy;
11553 + struct bch_ratelimit writeback_rate;
11554 + struct delayed_work writeback_rate_update;
11555 +
11556 +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
11557 +index 4ca3e3d3f9c7..c1e487d1261c 100644
11558 +--- a/drivers/md/bcache/request.c
11559 ++++ b/drivers/md/bcache/request.c
11560 +@@ -391,13 +391,20 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
11561 + goto skip;
11562 +
11563 + /*
11564 +- * Flag for bypass if the IO is for read-ahead or background,
11565 +- * unless the read-ahead request is for metadata
11566 ++ * If the bio is for read-ahead or background IO, whether to bypass
11567 ++ * it depends on the following situations:
11568 ++ * - If the IO is for metadata, always cache it, no bypass
11569 ++ * - If the IO is not metadata, check dc->cache_readahead_policy:
11570 ++ * BCH_CACHE_READA_ALL: cache it, do not bypass
11571 ++ * BCH_CACHE_READA_META_ONLY: do not cache it, bypass it
11572 ++ * That is, read-ahead requests for metadata always get cached
11573 + * (eg, for gfs2 or xfs).
11574 + */
11575 +- if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
11576 +- !(bio->bi_opf & (REQ_META|REQ_PRIO)))
11577 +- goto skip;
11578 ++ if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) {
11579 ++ if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
11580 ++ (dc->cache_readahead_policy != BCH_CACHE_READA_ALL))
11581 ++ goto skip;
11582 ++ }
11583 +
11584 + if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
11585 + bio_sectors(bio) & (c->sb.block_size - 1)) {
11586 +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
11587 +index 3e8d1f1b562f..591d9c8107dd 100644
11588 +--- a/drivers/md/bcache/sysfs.c
11589 ++++ b/drivers/md/bcache/sysfs.c
11590 +@@ -25,6 +25,12 @@ static const char * const bch_cache_modes[] = {
11591 + NULL
11592 + };
11593 +
11594 ++static const char * const bch_reada_cache_policies[] = {
11595 ++ "all",
11596 ++ "meta-only",
11597 ++ NULL
11598 ++};
11599 ++
11600 + /* Default is -1; we skip past it for stop_when_cache_set_failed */
11601 + static const char * const bch_stop_on_failure_modes[] = {
11602 + "auto",
11603 +@@ -94,6 +100,7 @@ rw_attribute(congested_write_threshold_us);
11604 + rw_attribute(sequential_cutoff);
11605 + rw_attribute(data_csum);
11606 + rw_attribute(cache_mode);
11607 ++rw_attribute(readahead_cache_policy);
11608 + rw_attribute(stop_when_cache_set_failed);
11609 + rw_attribute(writeback_metadata);
11610 + rw_attribute(writeback_running);
11611 +@@ -160,6 +167,11 @@ SHOW(__bch_cached_dev)
11612 + bch_cache_modes,
11613 + BDEV_CACHE_MODE(&dc->sb));
11614 +
11615 ++ if (attr == &sysfs_readahead_cache_policy)
11616 ++ return bch_snprint_string_list(buf, PAGE_SIZE,
11617 ++ bch_reada_cache_policies,
11618 ++ dc->cache_readahead_policy);
11619 ++
11620 + if (attr == &sysfs_stop_when_cache_set_failed)
11621 + return bch_snprint_string_list(buf, PAGE_SIZE,
11622 + bch_stop_on_failure_modes,
11623 +@@ -324,6 +336,15 @@ STORE(__cached_dev)
11624 + }
11625 + }
11626 +
11627 ++ if (attr == &sysfs_readahead_cache_policy) {
11628 ++ v = __sysfs_match_string(bch_reada_cache_policies, -1, buf);
11629 ++ if (v < 0)
11630 ++ return v;
11631 ++
11632 ++ if ((unsigned int) v != dc->cache_readahead_policy)
11633 ++ dc->cache_readahead_policy = v;
11634 ++ }
11635 ++
11636 + if (attr == &sysfs_stop_when_cache_set_failed) {
11637 + v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
11638 + if (v < 0)
11639 +@@ -417,6 +438,7 @@ static struct attribute *bch_cached_dev_files[] = {
11640 + &sysfs_data_csum,
11641 + #endif
11642 + &sysfs_cache_mode,
11643 ++ &sysfs_readahead_cache_policy,
11644 + &sysfs_stop_when_cache_set_failed,
11645 + &sysfs_writeback_metadata,
11646 + &sysfs_writeback_running,
11647 +diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
11648 +index 34f5de13a93d..750f8b34e693 100644
11649 +--- a/drivers/md/dm-crypt.c
11650 ++++ b/drivers/md/dm-crypt.c
11651 +@@ -482,8 +482,14 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
11652 + static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
11653 + const char *opts)
11654 + {
11655 +- unsigned bs = crypto_skcipher_blocksize(any_tfm(cc));
11656 +- int log = ilog2(bs);
11657 ++ unsigned bs;
11658 ++ int log;
11659 ++
11660 ++ if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags))
11661 ++ bs = crypto_aead_blocksize(any_tfm_aead(cc));
11662 ++ else
11663 ++ bs = crypto_skcipher_blocksize(any_tfm(cc));
11664 ++ log = ilog2(bs);
11665 +
11666 + /* we need to calculate how far we must shift the sector count
11667 + * to get the cipher block count, we use this shift in _gen */
11668 +diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
11669 +index 503c4265ecbe..820c2e07dadf 100644
11670 +--- a/drivers/md/dm-writecache.c
11671 ++++ b/drivers/md/dm-writecache.c
11672 +@@ -447,7 +447,13 @@ static void writecache_notify_io(unsigned long error, void *context)
11673 + complete(&endio->c);
11674 + }
11675 +
11676 +-static void ssd_commit_flushed(struct dm_writecache *wc)
11677 ++static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
11678 ++{
11679 ++ wait_event(wc->bio_in_progress_wait[direction],
11680 ++ !atomic_read(&wc->bio_in_progress[direction]));
11681 ++}
11682 ++
11683 ++static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
11684 + {
11685 + struct dm_io_region region;
11686 + struct dm_io_request req;
11687 +@@ -493,17 +499,20 @@ static void ssd_commit_flushed(struct dm_writecache *wc)
11688 + writecache_notify_io(0, &endio);
11689 + wait_for_completion_io(&endio.c);
11690 +
11691 ++ if (wait_for_ios)
11692 ++ writecache_wait_for_ios(wc, WRITE);
11693 ++
11694 + writecache_disk_flush(wc, wc->ssd_dev);
11695 +
11696 + memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
11697 + }
11698 +
11699 +-static void writecache_commit_flushed(struct dm_writecache *wc)
11700 ++static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
11701 + {
11702 + if (WC_MODE_PMEM(wc))
11703 + wmb();
11704 + else
11705 +- ssd_commit_flushed(wc);
11706 ++ ssd_commit_flushed(wc, wait_for_ios);
11707 + }
11708 +
11709 + static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
11710 +@@ -527,12 +536,6 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
11711 + writecache_error(wc, r, "error flushing metadata: %d", r);
11712 + }
11713 +
11714 +-static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
11715 +-{
11716 +- wait_event(wc->bio_in_progress_wait[direction],
11717 +- !atomic_read(&wc->bio_in_progress[direction]));
11718 +-}
11719 +-
11720 + #define WFE_RETURN_FOLLOWING 1
11721 + #define WFE_LOWEST_SEQ 2
11722 +
11723 +@@ -730,14 +733,12 @@ static void writecache_flush(struct dm_writecache *wc)
11724 + e = e2;
11725 + cond_resched();
11726 + }
11727 +- writecache_commit_flushed(wc);
11728 +-
11729 +- writecache_wait_for_ios(wc, WRITE);
11730 ++ writecache_commit_flushed(wc, true);
11731 +
11732 + wc->seq_count++;
11733 + pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
11734 + writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
11735 +- writecache_commit_flushed(wc);
11736 ++ writecache_commit_flushed(wc, false);
11737 +
11738 + wc->overwrote_committed = false;
11739 +
11740 +@@ -761,7 +762,7 @@ static void writecache_flush(struct dm_writecache *wc)
11741 + }
11742 +
11743 + if (need_flush_after_free)
11744 +- writecache_commit_flushed(wc);
11745 ++ writecache_commit_flushed(wc, false);
11746 + }
11747 +
11748 + static void writecache_flush_work(struct work_struct *work)
11749 +@@ -814,7 +815,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
11750 + }
11751 +
11752 + if (discarded_something)
11753 +- writecache_commit_flushed(wc);
11754 ++ writecache_commit_flushed(wc, false);
11755 + }
11756 +
11757 + static bool writecache_wait_for_writeback(struct dm_writecache *wc)
11758 +@@ -963,7 +964,7 @@ erase_this:
11759 +
11760 + if (need_flush) {
11761 + writecache_flush_all_metadata(wc);
11762 +- writecache_commit_flushed(wc);
11763 ++ writecache_commit_flushed(wc, false);
11764 + }
11765 +
11766 + wc_unlock(wc);
11767 +@@ -1347,7 +1348,7 @@ static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *
11768 + wc->writeback_size--;
11769 + n_walked++;
11770 + if (unlikely(n_walked >= ENDIO_LATENCY)) {
11771 +- writecache_commit_flushed(wc);
11772 ++ writecache_commit_flushed(wc, false);
11773 + wc_unlock(wc);
11774 + wc_lock(wc);
11775 + n_walked = 0;
11776 +@@ -1428,7 +1429,7 @@ pop_from_list:
11777 + writecache_wait_for_ios(wc, READ);
11778 + }
11779 +
11780 +- writecache_commit_flushed(wc);
11781 ++ writecache_commit_flushed(wc, false);
11782 +
11783 + wc_unlock(wc);
11784 + }
11785 +@@ -1759,10 +1760,10 @@ static int init_memory(struct dm_writecache *wc)
11786 + write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
11787 +
11788 + writecache_flush_all_metadata(wc);
11789 +- writecache_commit_flushed(wc);
11790 ++ writecache_commit_flushed(wc, false);
11791 + pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
11792 + writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
11793 +- writecache_commit_flushed(wc);
11794 ++ writecache_commit_flushed(wc, false);
11795 +
11796 + return 0;
11797 + }
11798 +diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
11799 +index c2c17149d968..086a870087cf 100644
11800 +--- a/drivers/md/dm-zoned-metadata.c
11801 ++++ b/drivers/md/dm-zoned-metadata.c
11802 +@@ -132,6 +132,7 @@ struct dmz_metadata {
11803 +
11804 + sector_t zone_bitmap_size;
11805 + unsigned int zone_nr_bitmap_blocks;
11806 ++ unsigned int zone_bits_per_mblk;
11807 +
11808 + unsigned int nr_bitmap_blocks;
11809 + unsigned int nr_map_blocks;
11810 +@@ -1165,7 +1166,10 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
11811 +
11812 + /* Init */
11813 + zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
11814 +- zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT;
11815 ++ zmd->zone_nr_bitmap_blocks =
11816 ++ max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
11817 ++ zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks,
11818 ++ DMZ_BLOCK_SIZE_BITS);
11819 +
11820 + /* Allocate zone array */
11821 + zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
11822 +@@ -1982,7 +1986,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
11823 + dmz_release_mblock(zmd, to_mblk);
11824 + dmz_release_mblock(zmd, from_mblk);
11825 +
11826 +- chunk_block += DMZ_BLOCK_SIZE_BITS;
11827 ++ chunk_block += zmd->zone_bits_per_mblk;
11828 + }
11829 +
11830 + to_zone->weight = from_zone->weight;
11831 +@@ -2043,7 +2047,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
11832 +
11833 + /* Set bits */
11834 + bit = chunk_block & DMZ_BLOCK_MASK_BITS;
11835 +- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
11836 ++ nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
11837 +
11838 + count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
11839 + if (count) {
11840 +@@ -2122,7 +2126,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
11841 +
11842 + /* Clear bits */
11843 + bit = chunk_block & DMZ_BLOCK_MASK_BITS;
11844 +- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
11845 ++ nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
11846 +
11847 + count = dmz_clear_bits((unsigned long *)mblk->data,
11848 + bit, nr_bits);
11849 +@@ -2182,6 +2186,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
11850 + {
11851 + struct dmz_mblock *mblk;
11852 + unsigned int bit, set_bit, nr_bits;
11853 ++ unsigned int zone_bits = zmd->zone_bits_per_mblk;
11854 + unsigned long *bitmap;
11855 + int n = 0;
11856 +
11857 +@@ -2196,15 +2201,15 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
11858 + /* Get offset */
11859 + bitmap = (unsigned long *) mblk->data;
11860 + bit = chunk_block & DMZ_BLOCK_MASK_BITS;
11861 +- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
11862 ++ nr_bits = min(nr_blocks, zone_bits - bit);
11863 + if (set)
11864 +- set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
11865 ++ set_bit = find_next_bit(bitmap, zone_bits, bit);
11866 + else
11867 +- set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
11868 ++ set_bit = find_next_zero_bit(bitmap, zone_bits, bit);
11869 + dmz_release_mblock(zmd, mblk);
11870 +
11871 + n += set_bit - bit;
11872 +- if (set_bit < DMZ_BLOCK_SIZE_BITS)
11873 ++ if (set_bit < zone_bits)
11874 + break;
11875 +
11876 + nr_blocks -= nr_bits;
11877 +@@ -2307,7 +2312,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
11878 + /* Count bits in this block */
11879 + bitmap = mblk->data;
11880 + bit = chunk_block & DMZ_BLOCK_MASK_BITS;
11881 +- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
11882 ++ nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
11883 + n += dmz_count_bits(bitmap, bit, nr_bits);
11884 +
11885 + dmz_release_mblock(zmd, mblk);
11886 +diff --git a/drivers/md/dm.c b/drivers/md/dm.c
11887 +index c9860e3b04dd..3965f3cf8ea1 100644
11888 +--- a/drivers/md/dm.c
11889 ++++ b/drivers/md/dm.c
11890 +@@ -1819,6 +1819,7 @@ static void dm_init_normal_md_queue(struct mapped_device *md)
11891 + /*
11892 + * Initialize aspects of queue that aren't relevant for blk-mq
11893 + */
11894 ++ md->queue->backing_dev_info->congested_data = md;
11895 + md->queue->backing_dev_info->congested_fn = dm_any_congested;
11896 + }
11897 +
11898 +@@ -1913,7 +1914,12 @@ static struct mapped_device *alloc_dev(int minor)
11899 + if (!md->queue)
11900 + goto bad;
11901 + md->queue->queuedata = md;
11902 +- md->queue->backing_dev_info->congested_data = md;
11903 ++ /*
11904 ++ * default to bio-based required ->make_request_fn until DM
11905 ++ * table is loaded and md->type established. If request-based
11906 ++ * table is loaded: blk-mq will override accordingly.
11907 ++ */
11908 ++ blk_queue_make_request(md->queue, dm_make_request);
11909 +
11910 + md->disk = alloc_disk_node(1, md->numa_node_id);
11911 + if (!md->disk)
11912 +@@ -2242,7 +2248,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
11913 + case DM_TYPE_BIO_BASED:
11914 + case DM_TYPE_DAX_BIO_BASED:
11915 + dm_init_normal_md_queue(md);
11916 +- blk_queue_make_request(md->queue, dm_make_request);
11917 + break;
11918 + case DM_TYPE_NVME_BIO_BASED:
11919 + dm_init_normal_md_queue(md);
11920 +diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
11921 +index 0a3b8ae4a29c..17aef55ed708 100644
11922 +--- a/drivers/md/persistent-data/dm-space-map-common.c
11923 ++++ b/drivers/md/persistent-data/dm-space-map-common.c
11924 +@@ -382,6 +382,33 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
11925 + return -ENOSPC;
11926 + }
11927 +
11928 ++int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
11929 ++ dm_block_t begin, dm_block_t end, dm_block_t *b)
11930 ++{
11931 ++ int r;
11932 ++ uint32_t count;
11933 ++
11934 ++ do {
11935 ++ r = sm_ll_find_free_block(new_ll, begin, new_ll->nr_blocks, b);
11936 ++ if (r)
11937 ++ break;
11938 ++
11939 ++ /* double check this block wasn't used in the old transaction */
11940 ++ if (*b >= old_ll->nr_blocks)
11941 ++ count = 0;
11942 ++ else {
11943 ++ r = sm_ll_lookup(old_ll, *b, &count);
11944 ++ if (r)
11945 ++ break;
11946 ++
11947 ++ if (count)
11948 ++ begin = *b + 1;
11949 ++ }
11950 ++ } while (count);
11951 ++
11952 ++ return r;
11953 ++}
11954 ++
11955 + static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
11956 + int (*mutator)(void *context, uint32_t old, uint32_t *new),
11957 + void *context, enum allocation_event *ev)
11958 +diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
11959 +index b3078d5eda0c..8de63ce39bdd 100644
11960 +--- a/drivers/md/persistent-data/dm-space-map-common.h
11961 ++++ b/drivers/md/persistent-data/dm-space-map-common.h
11962 +@@ -109,6 +109,8 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
11963 + int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
11964 + int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
11965 + dm_block_t end, dm_block_t *result);
11966 ++int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
11967 ++ dm_block_t begin, dm_block_t end, dm_block_t *result);
11968 + int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
11969 + int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
11970 + int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
11971 +diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
11972 +index 32adf6b4a9c7..bf4c5e2ccb6f 100644
11973 +--- a/drivers/md/persistent-data/dm-space-map-disk.c
11974 ++++ b/drivers/md/persistent-data/dm-space-map-disk.c
11975 +@@ -167,8 +167,10 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
11976 + enum allocation_event ev;
11977 + struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
11978 +
11979 +- /* FIXME: we should loop round a couple of times */
11980 +- r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
11981 ++ /*
11982 ++ * Any block we allocate has to be free in both the old and current ll.
11983 ++ */
11984 ++ r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b);
11985 + if (r)
11986 + return r;
11987 +
11988 +diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
11989 +index 25328582cc48..9e3c64ec2026 100644
11990 +--- a/drivers/md/persistent-data/dm-space-map-metadata.c
11991 ++++ b/drivers/md/persistent-data/dm-space-map-metadata.c
11992 +@@ -448,7 +448,10 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
11993 + enum allocation_event ev;
11994 + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
11995 +
11996 +- r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
11997 ++ /*
11998 ++ * Any block we allocate has to be free in both the old and current ll.
11999 ++ */
12000 ++ r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b);
12001 + if (r)
12002 + return r;
12003 +
12004 +diff --git a/drivers/media/rc/iguanair.c b/drivers/media/rc/iguanair.c
12005 +index 6f3030b2054d..1df9522c30fa 100644
12006 +--- a/drivers/media/rc/iguanair.c
12007 ++++ b/drivers/media/rc/iguanair.c
12008 +@@ -424,7 +424,7 @@ static int iguanair_probe(struct usb_interface *intf,
12009 + int ret, pipein, pipeout;
12010 + struct usb_host_interface *idesc;
12011 +
12012 +- idesc = intf->altsetting;
12013 ++ idesc = intf->cur_altsetting;
12014 + if (idesc->desc.bNumEndpoints < 2)
12015 + return -ENODEV;
12016 +
12017 +diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c
12018 +index 0f218afdadaa..c30affbd43a9 100644
12019 +--- a/drivers/media/rc/rc-main.c
12020 ++++ b/drivers/media/rc/rc-main.c
12021 +@@ -1874,23 +1874,28 @@ int rc_register_device(struct rc_dev *dev)
12022 +
12023 + dev->registered = true;
12024 +
12025 +- if (dev->driver_type != RC_DRIVER_IR_RAW_TX) {
12026 +- rc = rc_setup_rx_device(dev);
12027 +- if (rc)
12028 +- goto out_dev;
12029 +- }
12030 +-
12031 +- /* Ensure that the lirc kfifo is setup before we start the thread */
12032 ++ /*
12033 ++ * once the input device is registered in rc_setup_rx_device,
12034 ++ * userspace can open the input device and rc_open() will be called
12035 ++ * as a result. This results in driver code being allowed to submit
12036 ++ * keycodes with rc_keydown, so lirc must be registered first.
12037 ++ */
12038 + if (dev->allowed_protocols != RC_PROTO_BIT_CEC) {
12039 + rc = ir_lirc_register(dev);
12040 + if (rc < 0)
12041 +- goto out_rx;
12042 ++ goto out_dev;
12043 ++ }
12044 ++
12045 ++ if (dev->driver_type != RC_DRIVER_IR_RAW_TX) {
12046 ++ rc = rc_setup_rx_device(dev);
12047 ++ if (rc)
12048 ++ goto out_lirc;
12049 + }
12050 +
12051 + if (dev->driver_type == RC_DRIVER_IR_RAW) {
12052 + rc = ir_raw_event_register(dev);
12053 + if (rc < 0)
12054 +- goto out_lirc;
12055 ++ goto out_rx;
12056 + }
12057 +
12058 + dev_dbg(&dev->dev, "Registered rc%u (driver: %s)\n", dev->minor,
12059 +@@ -1898,11 +1903,11 @@ int rc_register_device(struct rc_dev *dev)
12060 +
12061 + return 0;
12062 +
12063 ++out_rx:
12064 ++ rc_free_rx_device(dev);
12065 + out_lirc:
12066 + if (dev->allowed_protocols != RC_PROTO_BIT_CEC)
12067 + ir_lirc_unregister(dev);
12068 +-out_rx:
12069 +- rc_free_rx_device(dev);
12070 + out_dev:
12071 + device_del(&dev->dev);
12072 + out_rx_free:
12073 +diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c
12074 +index 063e229ead5e..38c73cdbef70 100644
12075 +--- a/drivers/media/usb/uvc/uvc_driver.c
12076 ++++ b/drivers/media/usb/uvc/uvc_driver.c
12077 +@@ -1482,6 +1482,11 @@ static int uvc_scan_chain_forward(struct uvc_video_chain *chain,
12078 + break;
12079 + if (forward == prev)
12080 + continue;
12081 ++ if (forward->chain.next || forward->chain.prev) {
12082 ++ uvc_trace(UVC_TRACE_DESCR, "Found reference to "
12083 ++ "entity %d already in chain.\n", forward->id);
12084 ++ return -EINVAL;
12085 ++ }
12086 +
12087 + switch (UVC_ENTITY_TYPE(forward)) {
12088 + case UVC_VC_EXTENSION_UNIT:
12089 +@@ -1563,6 +1568,13 @@ static int uvc_scan_chain_backward(struct uvc_video_chain *chain,
12090 + return -1;
12091 + }
12092 +
12093 ++ if (term->chain.next || term->chain.prev) {
12094 ++ uvc_trace(UVC_TRACE_DESCR, "Found reference to "
12095 ++ "entity %d already in chain.\n",
12096 ++ term->id);
12097 ++ return -EINVAL;
12098 ++ }
12099 ++
12100 + if (uvc_trace_param & UVC_TRACE_PROBE)
12101 + printk(KERN_CONT " %d", term->id);
12102 +
12103 +diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
12104 +index 6481212fda77..3efe4e0a80a4 100644
12105 +--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
12106 ++++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
12107 +@@ -1171,36 +1171,38 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
12108 + u32 aux_space;
12109 + int compatible_arg = 1;
12110 + long err = 0;
12111 ++ unsigned int ncmd;
12112 +
12113 + /*
12114 + * 1. When struct size is different, converts the command.
12115 + */
12116 + switch (cmd) {
12117 +- case VIDIOC_G_FMT32: cmd = VIDIOC_G_FMT; break;
12118 +- case VIDIOC_S_FMT32: cmd = VIDIOC_S_FMT; break;
12119 +- case VIDIOC_QUERYBUF32: cmd = VIDIOC_QUERYBUF; break;
12120 +- case VIDIOC_G_FBUF32: cmd = VIDIOC_G_FBUF; break;
12121 +- case VIDIOC_S_FBUF32: cmd = VIDIOC_S_FBUF; break;
12122 +- case VIDIOC_QBUF32: cmd = VIDIOC_QBUF; break;
12123 +- case VIDIOC_DQBUF32: cmd = VIDIOC_DQBUF; break;
12124 +- case VIDIOC_ENUMSTD32: cmd = VIDIOC_ENUMSTD; break;
12125 +- case VIDIOC_ENUMINPUT32: cmd = VIDIOC_ENUMINPUT; break;
12126 +- case VIDIOC_TRY_FMT32: cmd = VIDIOC_TRY_FMT; break;
12127 +- case VIDIOC_G_EXT_CTRLS32: cmd = VIDIOC_G_EXT_CTRLS; break;
12128 +- case VIDIOC_S_EXT_CTRLS32: cmd = VIDIOC_S_EXT_CTRLS; break;
12129 +- case VIDIOC_TRY_EXT_CTRLS32: cmd = VIDIOC_TRY_EXT_CTRLS; break;
12130 +- case VIDIOC_DQEVENT32: cmd = VIDIOC_DQEVENT; break;
12131 +- case VIDIOC_OVERLAY32: cmd = VIDIOC_OVERLAY; break;
12132 +- case VIDIOC_STREAMON32: cmd = VIDIOC_STREAMON; break;
12133 +- case VIDIOC_STREAMOFF32: cmd = VIDIOC_STREAMOFF; break;
12134 +- case VIDIOC_G_INPUT32: cmd = VIDIOC_G_INPUT; break;
12135 +- case VIDIOC_S_INPUT32: cmd = VIDIOC_S_INPUT; break;
12136 +- case VIDIOC_G_OUTPUT32: cmd = VIDIOC_G_OUTPUT; break;
12137 +- case VIDIOC_S_OUTPUT32: cmd = VIDIOC_S_OUTPUT; break;
12138 +- case VIDIOC_CREATE_BUFS32: cmd = VIDIOC_CREATE_BUFS; break;
12139 +- case VIDIOC_PREPARE_BUF32: cmd = VIDIOC_PREPARE_BUF; break;
12140 +- case VIDIOC_G_EDID32: cmd = VIDIOC_G_EDID; break;
12141 +- case VIDIOC_S_EDID32: cmd = VIDIOC_S_EDID; break;
12142 ++ case VIDIOC_G_FMT32: ncmd = VIDIOC_G_FMT; break;
12143 ++ case VIDIOC_S_FMT32: ncmd = VIDIOC_S_FMT; break;
12144 ++ case VIDIOC_QUERYBUF32: ncmd = VIDIOC_QUERYBUF; break;
12145 ++ case VIDIOC_G_FBUF32: ncmd = VIDIOC_G_FBUF; break;
12146 ++ case VIDIOC_S_FBUF32: ncmd = VIDIOC_S_FBUF; break;
12147 ++ case VIDIOC_QBUF32: ncmd = VIDIOC_QBUF; break;
12148 ++ case VIDIOC_DQBUF32: ncmd = VIDIOC_DQBUF; break;
12149 ++ case VIDIOC_ENUMSTD32: ncmd = VIDIOC_ENUMSTD; break;
12150 ++ case VIDIOC_ENUMINPUT32: ncmd = VIDIOC_ENUMINPUT; break;
12151 ++ case VIDIOC_TRY_FMT32: ncmd = VIDIOC_TRY_FMT; break;
12152 ++ case VIDIOC_G_EXT_CTRLS32: ncmd = VIDIOC_G_EXT_CTRLS; break;
12153 ++ case VIDIOC_S_EXT_CTRLS32: ncmd = VIDIOC_S_EXT_CTRLS; break;
12154 ++ case VIDIOC_TRY_EXT_CTRLS32: ncmd = VIDIOC_TRY_EXT_CTRLS; break;
12155 ++ case VIDIOC_DQEVENT32: ncmd = VIDIOC_DQEVENT; break;
12156 ++ case VIDIOC_OVERLAY32: ncmd = VIDIOC_OVERLAY; break;
12157 ++ case VIDIOC_STREAMON32: ncmd = VIDIOC_STREAMON; break;
12158 ++ case VIDIOC_STREAMOFF32: ncmd = VIDIOC_STREAMOFF; break;
12159 ++ case VIDIOC_G_INPUT32: ncmd = VIDIOC_G_INPUT; break;
12160 ++ case VIDIOC_S_INPUT32: ncmd = VIDIOC_S_INPUT; break;
12161 ++ case VIDIOC_G_OUTPUT32: ncmd = VIDIOC_G_OUTPUT; break;
12162 ++ case VIDIOC_S_OUTPUT32: ncmd = VIDIOC_S_OUTPUT; break;
12163 ++ case VIDIOC_CREATE_BUFS32: ncmd = VIDIOC_CREATE_BUFS; break;
12164 ++ case VIDIOC_PREPARE_BUF32: ncmd = VIDIOC_PREPARE_BUF; break;
12165 ++ case VIDIOC_G_EDID32: ncmd = VIDIOC_G_EDID; break;
12166 ++ case VIDIOC_S_EDID32: ncmd = VIDIOC_S_EDID; break;
12167 ++ default: ncmd = cmd; break;
12168 + }
12169 +
12170 + /*
12171 +@@ -1209,11 +1211,11 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
12172 + * argument into it.
12173 + */
12174 + switch (cmd) {
12175 +- case VIDIOC_OVERLAY:
12176 +- case VIDIOC_STREAMON:
12177 +- case VIDIOC_STREAMOFF:
12178 +- case VIDIOC_S_INPUT:
12179 +- case VIDIOC_S_OUTPUT:
12180 ++ case VIDIOC_OVERLAY32:
12181 ++ case VIDIOC_STREAMON32:
12182 ++ case VIDIOC_STREAMOFF32:
12183 ++ case VIDIOC_S_INPUT32:
12184 ++ case VIDIOC_S_OUTPUT32:
12185 + err = alloc_userspace(sizeof(unsigned int), 0, &new_p64);
12186 + if (!err && assign_in_user((unsigned int __user *)new_p64,
12187 + (compat_uint_t __user *)p32))
12188 +@@ -1221,23 +1223,23 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
12189 + compatible_arg = 0;
12190 + break;
12191 +
12192 +- case VIDIOC_G_INPUT:
12193 +- case VIDIOC_G_OUTPUT:
12194 ++ case VIDIOC_G_INPUT32:
12195 ++ case VIDIOC_G_OUTPUT32:
12196 + err = alloc_userspace(sizeof(unsigned int), 0, &new_p64);
12197 + compatible_arg = 0;
12198 + break;
12199 +
12200 +- case VIDIOC_G_EDID:
12201 +- case VIDIOC_S_EDID:
12202 ++ case VIDIOC_G_EDID32:
12203 ++ case VIDIOC_S_EDID32:
12204 + err = alloc_userspace(sizeof(struct v4l2_edid), 0, &new_p64);
12205 + if (!err)
12206 + err = get_v4l2_edid32(new_p64, p32);
12207 + compatible_arg = 0;
12208 + break;
12209 +
12210 +- case VIDIOC_G_FMT:
12211 +- case VIDIOC_S_FMT:
12212 +- case VIDIOC_TRY_FMT:
12213 ++ case VIDIOC_G_FMT32:
12214 ++ case VIDIOC_S_FMT32:
12215 ++ case VIDIOC_TRY_FMT32:
12216 + err = bufsize_v4l2_format(p32, &aux_space);
12217 + if (!err)
12218 + err = alloc_userspace(sizeof(struct v4l2_format),
12219 +@@ -1250,7 +1252,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
12220 + compatible_arg = 0;
12221 + break;
12222 +
12223 +- case VIDIOC_CREATE_BUFS:
12224 ++ case VIDIOC_CREATE_BUFS32:
12225 + err = bufsize_v4l2_create(p32, &aux_space);
12226 + if (!err)
12227 + err = alloc_userspace(sizeof(struct v4l2_create_buffers),
12228 +@@ -1263,10 +1265,10 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
12229 + compatible_arg = 0;
12230 + break;
12231 +
12232 +- case VIDIOC_PREPARE_BUF:
12233 +- case VIDIOC_QUERYBUF:
12234 +- case VIDIOC_QBUF:
12235 +- case VIDIOC_DQBUF:
12236 ++ case VIDIOC_PREPARE_BUF32:
12237 ++ case VIDIOC_QUERYBUF32:
12238 ++ case VIDIOC_QBUF32:
12239 ++ case VIDIOC_DQBUF32:
12240 + err = bufsize_v4l2_buffer(p32, &aux_space);
12241 + if (!err)
12242 + err = alloc_userspace(sizeof(struct v4l2_buffer),
12243 +@@ -1279,7 +1281,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
12244 + compatible_arg = 0;
12245 + break;
12246 +
12247 +- case VIDIOC_S_FBUF:
12248 ++ case VIDIOC_S_FBUF32:
12249 + err = alloc_userspace(sizeof(struct v4l2_framebuffer), 0,
12250 + &new_p64);
12251 + if (!err)
12252 +@@ -1287,13 +1289,13 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
12253 + compatible_arg = 0;
12254 + break;
12255 +
12256 +- case VIDIOC_G_FBUF:
12257 ++ case VIDIOC_G_FBUF32:
12258 + err = alloc_userspace(sizeof(struct v4l2_framebuffer), 0,
12259 + &new_p64);
12260 + compatible_arg = 0;
12261 + break;
12262 +
12263 +- case VIDIOC_ENUMSTD:
12264 ++ case VIDIOC_ENUMSTD32:
12265 + err = alloc_userspace(sizeof(struct v4l2_standard), 0,
12266 + &new_p64);
12267 + if (!err)
12268 +@@ -1301,16 +1303,16 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
12269 + compatible_arg = 0;
12270 + break;
12271 +
12272 +- case VIDIOC_ENUMINPUT:
12273 ++ case VIDIOC_ENUMINPUT32:
12274 + err = alloc_userspace(sizeof(struct v4l2_input), 0, &new_p64);
12275 + if (!err)
12276 + err = get_v4l2_input32(new_p64, p32);
12277 + compatible_arg = 0;
12278 + break;
12279 +
12280 +- case VIDIOC_G_EXT_CTRLS:
12281 +- case VIDIOC_S_EXT_CTRLS:
12282 +- case VIDIOC_TRY_EXT_CTRLS:
12283 ++ case VIDIOC_G_EXT_CTRLS32:
12284 ++ case VIDIOC_S_EXT_CTRLS32:
12285 ++ case VIDIOC_TRY_EXT_CTRLS32:
12286 + err = bufsize_v4l2_ext_controls(p32, &aux_space);
12287 + if (!err)
12288 + err = alloc_userspace(sizeof(struct v4l2_ext_controls),
12289 +@@ -1322,7 +1324,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
12290 + }
12291 + compatible_arg = 0;
12292 + break;
12293 +- case VIDIOC_DQEVENT:
12294 ++ case VIDIOC_DQEVENT32:
12295 + err = alloc_userspace(sizeof(struct v4l2_event), 0, &new_p64);
12296 + compatible_arg = 0;
12297 + break;
12298 +@@ -1340,9 +1342,9 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
12299 + * Otherwise, it will pass the newly allocated @new_p64 argument.
12300 + */
12301 + if (compatible_arg)
12302 +- err = native_ioctl(file, cmd, (unsigned long)p32);
12303 ++ err = native_ioctl(file, ncmd, (unsigned long)p32);
12304 + else
12305 +- err = native_ioctl(file, cmd, (unsigned long)new_p64);
12306 ++ err = native_ioctl(file, ncmd, (unsigned long)new_p64);
12307 +
12308 + if (err == -ENOTTY)
12309 + return err;
12310 +@@ -1358,13 +1360,13 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
12311 + * the blocks to maximum allowed value.
12312 + */
12313 + switch (cmd) {
12314 +- case VIDIOC_G_EXT_CTRLS:
12315 +- case VIDIOC_S_EXT_CTRLS:
12316 +- case VIDIOC_TRY_EXT_CTRLS:
12317 ++ case VIDIOC_G_EXT_CTRLS32:
12318 ++ case VIDIOC_S_EXT_CTRLS32:
12319 ++ case VIDIOC_TRY_EXT_CTRLS32:
12320 + if (put_v4l2_ext_controls32(file, new_p64, p32))
12321 + err = -EFAULT;
12322 + break;
12323 +- case VIDIOC_S_EDID:
12324 ++ case VIDIOC_S_EDID32:
12325 + if (put_v4l2_edid32(new_p64, p32))
12326 + err = -EFAULT;
12327 + break;
12328 +@@ -1377,49 +1379,49 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
12329 + * the original 32 bits structure.
12330 + */
12331 + switch (cmd) {
12332 +- case VIDIOC_S_INPUT:
12333 +- case VIDIOC_S_OUTPUT:
12334 +- case VIDIOC_G_INPUT:
12335 +- case VIDIOC_G_OUTPUT:
12336 ++ case VIDIOC_S_INPUT32:
12337 ++ case VIDIOC_S_OUTPUT32:
12338 ++ case VIDIOC_G_INPUT32:
12339 ++ case VIDIOC_G_OUTPUT32:
12340 + if (assign_in_user((compat_uint_t __user *)p32,
12341 + ((unsigned int __user *)new_p64)))
12342 + err = -EFAULT;
12343 + break;
12344 +
12345 +- case VIDIOC_G_FBUF:
12346 ++ case VIDIOC_G_FBUF32:
12347 + err = put_v4l2_framebuffer32(new_p64, p32);
12348 + break;
12349 +
12350 +- case VIDIOC_DQEVENT:
12351 ++ case VIDIOC_DQEVENT32:
12352 + err = put_v4l2_event32(new_p64, p32);
12353 + break;
12354 +
12355 +- case VIDIOC_G_EDID:
12356 ++ case VIDIOC_G_EDID32:
12357 + err = put_v4l2_edid32(new_p64, p32);
12358 + break;
12359 +
12360 +- case VIDIOC_G_FMT:
12361 +- case VIDIOC_S_FMT:
12362 +- case VIDIOC_TRY_FMT:
12363 ++ case VIDIOC_G_FMT32:
12364 ++ case VIDIOC_S_FMT32:
12365 ++ case VIDIOC_TRY_FMT32:
12366 + err = put_v4l2_format32(new_p64, p32);
12367 + break;
12368 +
12369 +- case VIDIOC_CREATE_BUFS:
12370 ++ case VIDIOC_CREATE_BUFS32:
12371 + err = put_v4l2_create32(new_p64, p32);
12372 + break;
12373 +
12374 +- case VIDIOC_PREPARE_BUF:
12375 +- case VIDIOC_QUERYBUF:
12376 +- case VIDIOC_QBUF:
12377 +- case VIDIOC_DQBUF:
12378 ++ case VIDIOC_PREPARE_BUF32:
12379 ++ case VIDIOC_QUERYBUF32:
12380 ++ case VIDIOC_QBUF32:
12381 ++ case VIDIOC_DQBUF32:
12382 + err = put_v4l2_buffer32(new_p64, p32);
12383 + break;
12384 +
12385 +- case VIDIOC_ENUMSTD:
12386 ++ case VIDIOC_ENUMSTD32:
12387 + err = put_v4l2_standard32(new_p64, p32);
12388 + break;
12389 +
12390 +- case VIDIOC_ENUMINPUT:
12391 ++ case VIDIOC_ENUMINPUT32:
12392 + err = put_v4l2_input32(new_p64, p32);
12393 + break;
12394 + }
12395 +diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
12396 +index 08929c087e27..4c396e0defa4 100644
12397 +--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
12398 ++++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
12399 +@@ -352,8 +352,11 @@ int videobuf_dma_free(struct videobuf_dmabuf *dma)
12400 + BUG_ON(dma->sglen);
12401 +
12402 + if (dma->pages) {
12403 +- for (i = 0; i < dma->nr_pages; i++)
12404 ++ for (i = 0; i < dma->nr_pages; i++) {
12405 ++ if (dma->direction == DMA_FROM_DEVICE)
12406 ++ set_page_dirty_lock(dma->pages[i]);
12407 + put_page(dma->pages[i]);
12408 ++ }
12409 + kfree(dma->pages);
12410 + dma->pages = NULL;
12411 + }
12412 +diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c
12413 +index f8e0fa97bb31..aa65931142ba 100644
12414 +--- a/drivers/mfd/axp20x.c
12415 ++++ b/drivers/mfd/axp20x.c
12416 +@@ -128,7 +128,7 @@ static const struct regmap_range axp288_writeable_ranges[] = {
12417 + static const struct regmap_range axp288_volatile_ranges[] = {
12418 + regmap_reg_range(AXP20X_PWR_INPUT_STATUS, AXP288_POWER_REASON),
12419 + regmap_reg_range(AXP288_BC_GLOBAL, AXP288_BC_GLOBAL),
12420 +- regmap_reg_range(AXP288_BC_DET_STAT, AXP288_BC_DET_STAT),
12421 ++ regmap_reg_range(AXP288_BC_DET_STAT, AXP20X_VBUS_IPSOUT_MGMT),
12422 + regmap_reg_range(AXP20X_CHRG_BAK_CTRL, AXP20X_CHRG_BAK_CTRL),
12423 + regmap_reg_range(AXP20X_IRQ1_EN, AXP20X_IPSOUT_V_HIGH_L),
12424 + regmap_reg_range(AXP20X_TIMER_CTRL, AXP20X_TIMER_CTRL),
12425 +diff --git a/drivers/mfd/da9062-core.c b/drivers/mfd/da9062-core.c
12426 +index 9f6105906c09..83c624f6033c 100644
12427 +--- a/drivers/mfd/da9062-core.c
12428 ++++ b/drivers/mfd/da9062-core.c
12429 +@@ -257,7 +257,7 @@ static const struct mfd_cell da9062_devs[] = {
12430 + .name = "da9062-watchdog",
12431 + .num_resources = ARRAY_SIZE(da9062_wdt_resources),
12432 + .resources = da9062_wdt_resources,
12433 +- .of_compatible = "dlg,da9062-wdt",
12434 ++ .of_compatible = "dlg,da9062-watchdog",
12435 + },
12436 + {
12437 + .name = "da9062-thermal",
12438 +diff --git a/drivers/mfd/dln2.c b/drivers/mfd/dln2.c
12439 +index 90e789943466..1476465ce803 100644
12440 +--- a/drivers/mfd/dln2.c
12441 ++++ b/drivers/mfd/dln2.c
12442 +@@ -725,6 +725,8 @@ static int dln2_probe(struct usb_interface *interface,
12443 + const struct usb_device_id *usb_id)
12444 + {
12445 + struct usb_host_interface *hostif = interface->cur_altsetting;
12446 ++ struct usb_endpoint_descriptor *epin;
12447 ++ struct usb_endpoint_descriptor *epout;
12448 + struct device *dev = &interface->dev;
12449 + struct dln2_dev *dln2;
12450 + int ret;
12451 +@@ -734,12 +736,19 @@ static int dln2_probe(struct usb_interface *interface,
12452 + hostif->desc.bNumEndpoints < 2)
12453 + return -ENODEV;
12454 +
12455 ++ epin = &hostif->endpoint[0].desc;
12456 ++ epout = &hostif->endpoint[1].desc;
12457 ++ if (!usb_endpoint_is_bulk_out(epout))
12458 ++ return -ENODEV;
12459 ++ if (!usb_endpoint_is_bulk_in(epin))
12460 ++ return -ENODEV;
12461 ++
12462 + dln2 = kzalloc(sizeof(*dln2), GFP_KERNEL);
12463 + if (!dln2)
12464 + return -ENOMEM;
12465 +
12466 +- dln2->ep_out = hostif->endpoint[0].desc.bEndpointAddress;
12467 +- dln2->ep_in = hostif->endpoint[1].desc.bEndpointAddress;
12468 ++ dln2->ep_out = epout->bEndpointAddress;
12469 ++ dln2->ep_in = epin->bEndpointAddress;
12470 + dln2->usb_dev = usb_get_dev(interface_to_usbdev(interface));
12471 + dln2->interface = interface;
12472 + usb_set_intfdata(interface, dln2);
12473 +diff --git a/drivers/mfd/rn5t618.c b/drivers/mfd/rn5t618.c
12474 +index f4037d42a60f..dd4251f105e0 100644
12475 +--- a/drivers/mfd/rn5t618.c
12476 ++++ b/drivers/mfd/rn5t618.c
12477 +@@ -32,6 +32,7 @@ static bool rn5t618_volatile_reg(struct device *dev, unsigned int reg)
12478 + case RN5T618_WATCHDOGCNT:
12479 + case RN5T618_DCIRQ:
12480 + case RN5T618_ILIMDATAH ... RN5T618_AIN0DATAL:
12481 ++ case RN5T618_ADCCNT3:
12482 + case RN5T618_IR_ADC1 ... RN5T618_IR_ADC3:
12483 + case RN5T618_IR_GPR:
12484 + case RN5T618_IR_GPF:
12485 +diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
12486 +index ea254d00541f..24795454d106 100644
12487 +--- a/drivers/mmc/host/mmc_spi.c
12488 ++++ b/drivers/mmc/host/mmc_spi.c
12489 +@@ -1154,17 +1154,22 @@ static void mmc_spi_initsequence(struct mmc_spi_host *host)
12490 + * SPI protocol. Another is that when chipselect is released while
12491 + * the card returns BUSY status, the clock must issue several cycles
12492 + * with chipselect high before the card will stop driving its output.
12493 ++ *
12494 ++ * SPI_CS_HIGH means "asserted" here. In some cases like when using
12495 ++ * GPIOs for chip select, SPI_CS_HIGH is set but this will be logically
12496 ++ * inverted by gpiolib, so if we want to be sure it is driven high
12497 ++ * we should toggle the default with an XOR, as we do here.
12498 + */
12499 +- host->spi->mode |= SPI_CS_HIGH;
12500 ++ host->spi->mode ^= SPI_CS_HIGH;
12501 + if (spi_setup(host->spi) != 0) {
12502 + /* Just warn; most cards work without it. */
12503 + dev_warn(&host->spi->dev,
12504 + "can't change chip-select polarity\n");
12505 +- host->spi->mode &= ~SPI_CS_HIGH;
12506 ++ host->spi->mode ^= SPI_CS_HIGH;
12507 + } else {
12508 + mmc_spi_readbytes(host, 18);
12509 +
12510 +- host->spi->mode &= ~SPI_CS_HIGH;
12511 ++ host->spi->mode ^= SPI_CS_HIGH;
12512 + if (spi_setup(host->spi) != 0) {
12513 + /* Wot, we can't get the same setup we had before? */
12514 + dev_err(&host->spi->dev,
12515 +diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c
12516 +index 1ebcf0eb781e..04e88d4796fa 100644
12517 +--- a/drivers/mmc/host/sdhci-of-at91.c
12518 ++++ b/drivers/mmc/host/sdhci-of-at91.c
12519 +@@ -332,19 +332,22 @@ static int sdhci_at91_probe(struct platform_device *pdev)
12520 + priv->mainck = devm_clk_get(&pdev->dev, "baseclk");
12521 + if (IS_ERR(priv->mainck)) {
12522 + dev_err(&pdev->dev, "failed to get baseclk\n");
12523 +- return PTR_ERR(priv->mainck);
12524 ++ ret = PTR_ERR(priv->mainck);
12525 ++ goto sdhci_pltfm_free;
12526 + }
12527 +
12528 + priv->hclock = devm_clk_get(&pdev->dev, "hclock");
12529 + if (IS_ERR(priv->hclock)) {
12530 + dev_err(&pdev->dev, "failed to get hclock\n");
12531 +- return PTR_ERR(priv->hclock);
12532 ++ ret = PTR_ERR(priv->hclock);
12533 ++ goto sdhci_pltfm_free;
12534 + }
12535 +
12536 + priv->gck = devm_clk_get(&pdev->dev, "multclk");
12537 + if (IS_ERR(priv->gck)) {
12538 + dev_err(&pdev->dev, "failed to get multclk\n");
12539 +- return PTR_ERR(priv->gck);
12540 ++ ret = PTR_ERR(priv->gck);
12541 ++ goto sdhci_pltfm_free;
12542 + }
12543 +
12544 + ret = sdhci_at91_set_clks_presets(&pdev->dev);
12545 +diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c
12546 +index 462526a10537..8e292992f84c 100644
12547 +--- a/drivers/mtd/ubi/fastmap.c
12548 ++++ b/drivers/mtd/ubi/fastmap.c
12549 +@@ -73,7 +73,7 @@ static int self_check_seen(struct ubi_device *ubi, unsigned long *seen)
12550 + return 0;
12551 +
12552 + for (pnum = 0; pnum < ubi->peb_count; pnum++) {
12553 +- if (test_bit(pnum, seen) && ubi->lookuptbl[pnum]) {
12554 ++ if (!test_bit(pnum, seen) && ubi->lookuptbl[pnum]) {
12555 + ubi_err(ubi, "self-check failed for PEB %d, fastmap didn't see it", pnum);
12556 + ret = -EINVAL;
12557 + }
12558 +@@ -1146,7 +1146,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
12559 + struct rb_node *tmp_rb;
12560 + int ret, i, j, free_peb_count, used_peb_count, vol_count;
12561 + int scrub_peb_count, erase_peb_count;
12562 +- unsigned long *seen_pebs = NULL;
12563 ++ unsigned long *seen_pebs;
12564 +
12565 + fm_raw = ubi->fm_buf;
12566 + memset(ubi->fm_buf, 0, ubi->fm_size);
12567 +@@ -1160,7 +1160,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
12568 + dvbuf = new_fm_vbuf(ubi, UBI_FM_DATA_VOLUME_ID);
12569 + if (!dvbuf) {
12570 + ret = -ENOMEM;
12571 +- goto out_kfree;
12572 ++ goto out_free_avbuf;
12573 + }
12574 +
12575 + avhdr = ubi_get_vid_hdr(avbuf);
12576 +@@ -1169,7 +1169,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
12577 + seen_pebs = init_seen(ubi);
12578 + if (IS_ERR(seen_pebs)) {
12579 + ret = PTR_ERR(seen_pebs);
12580 +- goto out_kfree;
12581 ++ goto out_free_dvbuf;
12582 + }
12583 +
12584 + spin_lock(&ubi->volumes_lock);
12585 +@@ -1337,7 +1337,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
12586 + ret = ubi_io_write_vid_hdr(ubi, new_fm->e[0]->pnum, avbuf);
12587 + if (ret) {
12588 + ubi_err(ubi, "unable to write vid_hdr to fastmap SB!");
12589 +- goto out_kfree;
12590 ++ goto out_free_seen;
12591 + }
12592 +
12593 + for (i = 0; i < new_fm->used_blocks; i++) {
12594 +@@ -1359,7 +1359,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
12595 + if (ret) {
12596 + ubi_err(ubi, "unable to write vid_hdr to PEB %i!",
12597 + new_fm->e[i]->pnum);
12598 +- goto out_kfree;
12599 ++ goto out_free_seen;
12600 + }
12601 + }
12602 +
12603 +@@ -1369,7 +1369,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
12604 + if (ret) {
12605 + ubi_err(ubi, "unable to write fastmap to PEB %i!",
12606 + new_fm->e[i]->pnum);
12607 +- goto out_kfree;
12608 ++ goto out_free_seen;
12609 + }
12610 + }
12611 +
12612 +@@ -1379,10 +1379,13 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
12613 + ret = self_check_seen(ubi, seen_pebs);
12614 + dbg_bld("fastmap written!");
12615 +
12616 +-out_kfree:
12617 +- ubi_free_vid_buf(avbuf);
12618 +- ubi_free_vid_buf(dvbuf);
12619 ++out_free_seen:
12620 + free_seen(seen_pebs);
12621 ++out_free_dvbuf:
12622 ++ ubi_free_vid_buf(dvbuf);
12623 ++out_free_avbuf:
12624 ++ ubi_free_vid_buf(avbuf);
12625 ++
12626 + out:
12627 + return ret;
12628 + }
12629 +diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
12630 +index e82108c917a6..334e3f22d4f1 100644
12631 +--- a/drivers/net/bonding/bond_alb.c
12632 ++++ b/drivers/net/bonding/bond_alb.c
12633 +@@ -1399,26 +1399,31 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
12634 + bool do_tx_balance = true;
12635 + u32 hash_index = 0;
12636 + const u8 *hash_start = NULL;
12637 +- struct ipv6hdr *ip6hdr;
12638 +
12639 + skb_reset_mac_header(skb);
12640 + eth_data = eth_hdr(skb);
12641 +
12642 + switch (ntohs(skb->protocol)) {
12643 + case ETH_P_IP: {
12644 +- const struct iphdr *iph = ip_hdr(skb);
12645 ++ const struct iphdr *iph;
12646 +
12647 + if (is_broadcast_ether_addr(eth_data->h_dest) ||
12648 +- iph->daddr == ip_bcast ||
12649 +- iph->protocol == IPPROTO_IGMP) {
12650 ++ !pskb_network_may_pull(skb, sizeof(*iph))) {
12651 ++ do_tx_balance = false;
12652 ++ break;
12653 ++ }
12654 ++ iph = ip_hdr(skb);
12655 ++ if (iph->daddr == ip_bcast || iph->protocol == IPPROTO_IGMP) {
12656 + do_tx_balance = false;
12657 + break;
12658 + }
12659 + hash_start = (char *)&(iph->daddr);
12660 + hash_size = sizeof(iph->daddr);
12661 +- }
12662 + break;
12663 +- case ETH_P_IPV6:
12664 ++ }
12665 ++ case ETH_P_IPV6: {
12666 ++ const struct ipv6hdr *ip6hdr;
12667 ++
12668 + /* IPv6 doesn't really use broadcast mac address, but leave
12669 + * that here just in case.
12670 + */
12671 +@@ -1435,7 +1440,11 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
12672 + break;
12673 + }
12674 +
12675 +- /* Additianally, DAD probes should not be tx-balanced as that
12676 ++ if (!pskb_network_may_pull(skb, sizeof(*ip6hdr))) {
12677 ++ do_tx_balance = false;
12678 ++ break;
12679 ++ }
12680 ++ /* Additionally, DAD probes should not be tx-balanced as that
12681 + * will lead to false positives for duplicate addresses and
12682 + * prevent address configuration from working.
12683 + */
12684 +@@ -1445,17 +1454,26 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
12685 + break;
12686 + }
12687 +
12688 +- hash_start = (char *)&(ipv6_hdr(skb)->daddr);
12689 +- hash_size = sizeof(ipv6_hdr(skb)->daddr);
12690 ++ hash_start = (char *)&ip6hdr->daddr;
12691 ++ hash_size = sizeof(ip6hdr->daddr);
12692 + break;
12693 +- case ETH_P_IPX:
12694 +- if (ipx_hdr(skb)->ipx_checksum != IPX_NO_CHECKSUM) {
12695 ++ }
12696 ++ case ETH_P_IPX: {
12697 ++ const struct ipxhdr *ipxhdr;
12698 ++
12699 ++ if (!pskb_network_may_pull(skb, sizeof(*ipxhdr))) {
12700 ++ do_tx_balance = false;
12701 ++ break;
12702 ++ }
12703 ++ ipxhdr = (struct ipxhdr *)skb_network_header(skb);
12704 ++
12705 ++ if (ipxhdr->ipx_checksum != IPX_NO_CHECKSUM) {
12706 + /* something is wrong with this packet */
12707 + do_tx_balance = false;
12708 + break;
12709 + }
12710 +
12711 +- if (ipx_hdr(skb)->ipx_type != IPX_TYPE_NCP) {
12712 ++ if (ipxhdr->ipx_type != IPX_TYPE_NCP) {
12713 + /* The only protocol worth balancing in
12714 + * this family since it has an "ARP" like
12715 + * mechanism
12716 +@@ -1464,9 +1482,11 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
12717 + break;
12718 + }
12719 +
12720 ++ eth_data = eth_hdr(skb);
12721 + hash_start = (char *)eth_data->h_dest;
12722 + hash_size = ETH_ALEN;
12723 + break;
12724 ++ }
12725 + case ETH_P_ARP:
12726 + do_tx_balance = false;
12727 + if (bond_info->rlb_enabled)
12728 +diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
12729 +index 9f21e710fc38..51436e7eae10 100644
12730 +--- a/drivers/net/dsa/b53/b53_common.c
12731 ++++ b/drivers/net/dsa/b53/b53_common.c
12732 +@@ -655,7 +655,7 @@ int b53_configure_vlan(struct dsa_switch *ds)
12733 + b53_do_vlan_op(dev, VTA_CMD_CLEAR);
12734 + }
12735 +
12736 +- b53_enable_vlan(dev, false, dev->vlan_filtering_enabled);
12737 ++ b53_enable_vlan(dev, dev->vlan_enabled, dev->vlan_filtering_enabled);
12738 +
12739 + b53_for_each_port(dev, i)
12740 + b53_write16(dev, B53_VLAN_PAGE,
12741 +diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
12742 +index c93609007670..f181a28cb452 100644
12743 +--- a/drivers/net/dsa/bcm_sf2.c
12744 ++++ b/drivers/net/dsa/bcm_sf2.c
12745 +@@ -72,7 +72,9 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port)
12746 +
12747 + /* Force link status for IMP port */
12748 + reg = core_readl(priv, offset);
12749 +- reg |= (MII_SW_OR | LINK_STS | GMII_SPEED_UP_2G);
12750 ++ reg |= (MII_SW_OR | LINK_STS);
12751 ++ if (priv->type == BCM7278_DEVICE_ID)
12752 ++ reg |= GMII_SPEED_UP_2G;
12753 + core_writel(priv, reg, offset);
12754 +
12755 + /* Enable Broadcast, Multicast, Unicast forwarding to IMP port */
12756 +diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
12757 +index 49aa3b5ea57c..6f8649376ff0 100644
12758 +--- a/drivers/net/ethernet/broadcom/bcmsysport.c
12759 ++++ b/drivers/net/ethernet/broadcom/bcmsysport.c
12760 +@@ -2716,6 +2716,9 @@ static int __maybe_unused bcm_sysport_resume(struct device *d)
12761 +
12762 + umac_reset(priv);
12763 +
12764 ++ /* Disable the UniMAC RX/TX */
12765 ++ umac_enable_set(priv, CMD_RX_EN | CMD_TX_EN, 0);
12766 ++
12767 + /* We may have been suspended and never received a WOL event that
12768 + * would turn off MPD detection, take care of that now
12769 + */
12770 +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
12771 +index 5cf85a89016e..c19d0eabeb52 100644
12772 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
12773 ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
12774 +@@ -5861,7 +5861,7 @@ static void bnxt_setup_msix(struct bnxt *bp)
12775 + int tcs, i;
12776 +
12777 + tcs = netdev_get_num_tc(dev);
12778 +- if (tcs > 1) {
12779 ++ if (tcs) {
12780 + int i, off, count;
12781 +
12782 + for (i = 0; i < tcs; i++) {
12783 +diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
12784 +index c2eb18854794..d1ff317f3b18 100644
12785 +--- a/drivers/net/ethernet/cadence/macb_main.c
12786 ++++ b/drivers/net/ethernet/cadence/macb_main.c
12787 +@@ -66,7 +66,11 @@
12788 + /* Max length of transmit frame must be a multiple of 8 bytes */
12789 + #define MACB_TX_LEN_ALIGN 8
12790 + #define MACB_MAX_TX_LEN ((unsigned int)((1 << MACB_TX_FRMLEN_SIZE) - 1) & ~((unsigned int)(MACB_TX_LEN_ALIGN - 1)))
12791 +-#define GEM_MAX_TX_LEN ((unsigned int)((1 << GEM_TX_FRMLEN_SIZE) - 1) & ~((unsigned int)(MACB_TX_LEN_ALIGN - 1)))
12792 ++/* Limit maximum TX length as per Cadence TSO errata. This is to avoid a
12793 ++ * false amba_error in TX path from the DMA assuming there is not enough
12794 ++ * space in the SRAM (16KB) even when there is.
12795 ++ */
12796 ++#define GEM_MAX_TX_LEN (unsigned int)(0x3FC0)
12797 +
12798 + #define GEM_MTU_MIN_SIZE ETH_MIN_MTU
12799 + #define MACB_NETIF_LSO NETIF_F_TSO
12800 +@@ -1654,16 +1658,14 @@ static netdev_features_t macb_features_check(struct sk_buff *skb,
12801 +
12802 + /* Validate LSO compatibility */
12803 +
12804 +- /* there is only one buffer */
12805 +- if (!skb_is_nonlinear(skb))
12806 ++ /* there is only one buffer or protocol is not UDP */
12807 ++ if (!skb_is_nonlinear(skb) || (ip_hdr(skb)->protocol != IPPROTO_UDP))
12808 + return features;
12809 +
12810 + /* length of header */
12811 + hdrlen = skb_transport_offset(skb);
12812 +- if (ip_hdr(skb)->protocol == IPPROTO_TCP)
12813 +- hdrlen += tcp_hdrlen(skb);
12814 +
12815 +- /* For LSO:
12816 ++ /* For UFO only:
12817 + * When software supplies two or more payload buffers all payload buffers
12818 + * apart from the last must be a multiple of 8 bytes in size.
12819 + */
12820 +diff --git a/drivers/net/ethernet/dec/tulip/dmfe.c b/drivers/net/ethernet/dec/tulip/dmfe.c
12821 +index 17ef7a28873d..0defd5b1212a 100644
12822 +--- a/drivers/net/ethernet/dec/tulip/dmfe.c
12823 ++++ b/drivers/net/ethernet/dec/tulip/dmfe.c
12824 +@@ -2222,15 +2222,16 @@ static int __init dmfe_init_module(void)
12825 + if (cr6set)
12826 + dmfe_cr6_user_set = cr6set;
12827 +
12828 +- switch(mode) {
12829 +- case DMFE_10MHF:
12830 ++ switch (mode) {
12831 ++ case DMFE_10MHF:
12832 + case DMFE_100MHF:
12833 + case DMFE_10MFD:
12834 + case DMFE_100MFD:
12835 + case DMFE_1M_HPNA:
12836 + dmfe_media_mode = mode;
12837 + break;
12838 +- default:dmfe_media_mode = DMFE_AUTO;
12839 ++ default:
12840 ++ dmfe_media_mode = DMFE_AUTO;
12841 + break;
12842 + }
12843 +
12844 +diff --git a/drivers/net/ethernet/dec/tulip/uli526x.c b/drivers/net/ethernet/dec/tulip/uli526x.c
12845 +index 488a744084c9..f4751a8de629 100644
12846 +--- a/drivers/net/ethernet/dec/tulip/uli526x.c
12847 ++++ b/drivers/net/ethernet/dec/tulip/uli526x.c
12848 +@@ -1817,8 +1817,8 @@ static int __init uli526x_init_module(void)
12849 + if (cr6set)
12850 + uli526x_cr6_user_set = cr6set;
12851 +
12852 +- switch (mode) {
12853 +- case ULI526X_10MHF:
12854 ++ switch (mode) {
12855 ++ case ULI526X_10MHF:
12856 + case ULI526X_100MHF:
12857 + case ULI526X_10MFD:
12858 + case ULI526X_100MFD:
12859 +diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
12860 +index 4313bbb2396f..51885e6dec50 100644
12861 +--- a/drivers/net/ethernet/marvell/mvneta.c
12862 ++++ b/drivers/net/ethernet/marvell/mvneta.c
12863 +@@ -385,6 +385,8 @@ struct mvneta_pcpu_stats {
12864 + struct u64_stats_sync syncp;
12865 + u64 rx_packets;
12866 + u64 rx_bytes;
12867 ++ u64 rx_dropped;
12868 ++ u64 rx_errors;
12869 + u64 tx_packets;
12870 + u64 tx_bytes;
12871 + };
12872 +@@ -701,6 +703,8 @@ mvneta_get_stats64(struct net_device *dev,
12873 + struct mvneta_pcpu_stats *cpu_stats;
12874 + u64 rx_packets;
12875 + u64 rx_bytes;
12876 ++ u64 rx_dropped;
12877 ++ u64 rx_errors;
12878 + u64 tx_packets;
12879 + u64 tx_bytes;
12880 +
12881 +@@ -709,19 +713,20 @@ mvneta_get_stats64(struct net_device *dev,
12882 + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
12883 + rx_packets = cpu_stats->rx_packets;
12884 + rx_bytes = cpu_stats->rx_bytes;
12885 ++ rx_dropped = cpu_stats->rx_dropped;
12886 ++ rx_errors = cpu_stats->rx_errors;
12887 + tx_packets = cpu_stats->tx_packets;
12888 + tx_bytes = cpu_stats->tx_bytes;
12889 + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
12890 +
12891 + stats->rx_packets += rx_packets;
12892 + stats->rx_bytes += rx_bytes;
12893 ++ stats->rx_dropped += rx_dropped;
12894 ++ stats->rx_errors += rx_errors;
12895 + stats->tx_packets += tx_packets;
12896 + stats->tx_bytes += tx_bytes;
12897 + }
12898 +
12899 +- stats->rx_errors = dev->stats.rx_errors;
12900 +- stats->rx_dropped = dev->stats.rx_dropped;
12901 +-
12902 + stats->tx_dropped = dev->stats.tx_dropped;
12903 + }
12904 +
12905 +@@ -1698,8 +1703,14 @@ static u32 mvneta_txq_desc_csum(int l3_offs, int l3_proto,
12906 + static void mvneta_rx_error(struct mvneta_port *pp,
12907 + struct mvneta_rx_desc *rx_desc)
12908 + {
12909 ++ struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats);
12910 + u32 status = rx_desc->status;
12911 +
12912 ++ /* update per-cpu counter */
12913 ++ u64_stats_update_begin(&stats->syncp);
12914 ++ stats->rx_errors++;
12915 ++ u64_stats_update_end(&stats->syncp);
12916 ++
12917 + switch (status & MVNETA_RXD_ERR_CODE_MASK) {
12918 + case MVNETA_RXD_ERR_CRC:
12919 + netdev_err(pp->dev, "bad rx status %08x (crc error), size=%d\n",
12920 +@@ -1960,7 +1971,6 @@ static int mvneta_rx_swbm(struct napi_struct *napi,
12921 + /* Check errors only for FIRST descriptor */
12922 + if (rx_status & MVNETA_RXD_ERR_SUMMARY) {
12923 + mvneta_rx_error(pp, rx_desc);
12924 +- dev->stats.rx_errors++;
12925 + /* leave the descriptor untouched */
12926 + continue;
12927 + }
12928 +@@ -1971,11 +1981,17 @@ static int mvneta_rx_swbm(struct napi_struct *napi,
12929 + skb_size = max(rx_copybreak, rx_header_size);
12930 + rxq->skb = netdev_alloc_skb_ip_align(dev, skb_size);
12931 + if (unlikely(!rxq->skb)) {
12932 ++ struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats);
12933 ++
12934 + netdev_err(dev,
12935 + "Can't allocate skb on queue %d\n",
12936 + rxq->id);
12937 +- dev->stats.rx_dropped++;
12938 ++
12939 + rxq->skb_alloc_err++;
12940 ++
12941 ++ u64_stats_update_begin(&stats->syncp);
12942 ++ stats->rx_dropped++;
12943 ++ u64_stats_update_end(&stats->syncp);
12944 + continue;
12945 + }
12946 + copy_size = min(skb_size, rx_bytes);
12947 +@@ -2135,7 +2151,6 @@ err_drop_frame_ret_pool:
12948 + mvneta_bm_pool_put_bp(pp->bm_priv, bm_pool,
12949 + rx_desc->buf_phys_addr);
12950 + err_drop_frame:
12951 +- dev->stats.rx_errors++;
12952 + mvneta_rx_error(pp, rx_desc);
12953 + /* leave the descriptor untouched */
12954 + continue;
12955 +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
12956 +index b5a8769a5bfd..715ccafc92cd 100644
12957 +--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
12958 ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
12959 +@@ -848,6 +848,7 @@ void mlx5_fpga_ipsec_delete_sa_ctx(void *context)
12960 + mutex_lock(&fpga_xfrm->lock);
12961 + if (!--fpga_xfrm->num_rules) {
12962 + mlx5_fpga_ipsec_release_sa_ctx(fpga_xfrm->sa_ctx);
12963 ++ kfree(fpga_xfrm->sa_ctx);
12964 + fpga_xfrm->sa_ctx = NULL;
12965 + }
12966 + mutex_unlock(&fpga_xfrm->lock);
12967 +@@ -1472,7 +1473,7 @@ int mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm,
12968 + if (!memcmp(&xfrm->attrs, attrs, sizeof(xfrm->attrs)))
12969 + return 0;
12970 +
12971 +- if (!mlx5_fpga_esp_validate_xfrm_attrs(mdev, attrs)) {
12972 ++ if (mlx5_fpga_esp_validate_xfrm_attrs(mdev, attrs)) {
12973 + mlx5_core_warn(mdev, "Tried to create an esp with unsupported attrs\n");
12974 + return -EOPNOTSUPP;
12975 + }
12976 +diff --git a/drivers/net/ethernet/smsc/smc911x.c b/drivers/net/ethernet/smsc/smc911x.c
12977 +index 8355dfbb8ec3..f97b35430c84 100644
12978 +--- a/drivers/net/ethernet/smsc/smc911x.c
12979 ++++ b/drivers/net/ethernet/smsc/smc911x.c
12980 +@@ -947,7 +947,7 @@ static void smc911x_phy_configure(struct work_struct *work)
12981 + if (lp->ctl_rspeed != 100)
12982 + my_ad_caps &= ~(ADVERTISE_100BASE4|ADVERTISE_100FULL|ADVERTISE_100HALF);
12983 +
12984 +- if (!lp->ctl_rfduplx)
12985 ++ if (!lp->ctl_rfduplx)
12986 + my_ad_caps &= ~(ADVERTISE_100FULL|ADVERTISE_10FULL);
12987 +
12988 + /* Update our Auto-Neg Advertisement Register */
12989 +diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
12990 +index f069adfc2b35..9c7b1d8e8220 100644
12991 +--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
12992 ++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
12993 +@@ -4513,6 +4513,7 @@ int stmmac_suspend(struct device *dev)
12994 + {
12995 + struct net_device *ndev = dev_get_drvdata(dev);
12996 + struct stmmac_priv *priv = netdev_priv(ndev);
12997 ++ u32 chan;
12998 +
12999 + if (!ndev || !netif_running(ndev))
13000 + return 0;
13001 +@@ -4527,6 +4528,9 @@ int stmmac_suspend(struct device *dev)
13002 +
13003 + stmmac_disable_all_queues(priv);
13004 +
13005 ++ for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++)
13006 ++ del_timer_sync(&priv->tx_queue[chan].txtimer);
13007 ++
13008 + /* Stop TX/RX DMA */
13009 + stmmac_stop_all_dma(priv);
13010 +
13011 +diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
13012 +index ee086441dcbe..eab9984f73a8 100644
13013 +--- a/drivers/net/gtp.c
13014 ++++ b/drivers/net/gtp.c
13015 +@@ -772,12 +772,12 @@ static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize)
13016 + int i;
13017 +
13018 + gtp->addr_hash = kmalloc_array(hsize, sizeof(struct hlist_head),
13019 +- GFP_KERNEL);
13020 ++ GFP_KERNEL | __GFP_NOWARN);
13021 + if (gtp->addr_hash == NULL)
13022 + return -ENOMEM;
13023 +
13024 + gtp->tid_hash = kmalloc_array(hsize, sizeof(struct hlist_head),
13025 +- GFP_KERNEL);
13026 ++ GFP_KERNEL | __GFP_NOWARN);
13027 + if (gtp->tid_hash == NULL)
13028 + goto err1;
13029 +
13030 +diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c
13031 +index bdc4d23627c5..bf03db40d4f0 100644
13032 +--- a/drivers/net/ppp/ppp_async.c
13033 ++++ b/drivers/net/ppp/ppp_async.c
13034 +@@ -878,15 +878,15 @@ ppp_async_input(struct asyncppp *ap, const unsigned char *buf,
13035 + skb = dev_alloc_skb(ap->mru + PPP_HDRLEN + 2);
13036 + if (!skb)
13037 + goto nomem;
13038 +- ap->rpkt = skb;
13039 +- }
13040 +- if (skb->len == 0) {
13041 +- /* Try to get the payload 4-byte aligned.
13042 +- * This should match the
13043 +- * PPP_ALLSTATIONS/PPP_UI/compressed tests in
13044 +- * process_input_packet, but we do not have
13045 +- * enough chars here to test buf[1] and buf[2].
13046 +- */
13047 ++ ap->rpkt = skb;
13048 ++ }
13049 ++ if (skb->len == 0) {
13050 ++ /* Try to get the payload 4-byte aligned.
13051 ++ * This should match the
13052 ++ * PPP_ALLSTATIONS/PPP_UI/compressed tests in
13053 ++ * process_input_packet, but we do not have
13054 ++ * enough chars here to test buf[1] and buf[2].
13055 ++ */
13056 + if (buf[0] != PPP_ALLSTATIONS)
13057 + skb_reserve(skb, 2 + (buf[0] & 1));
13058 + }
13059 +diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
13060 +index 6a213fe760ff..41254f04ab15 100644
13061 +--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
13062 ++++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
13063 +@@ -441,6 +441,7 @@ fail:
13064 + usb_free_urb(req->urb);
13065 + list_del(q->next);
13066 + }
13067 ++ kfree(reqs);
13068 + return NULL;
13069 +
13070 + }
13071 +diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c b/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c
13072 +index f2579c94ffdb..3270faafe0bc 100644
13073 +--- a/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c
13074 ++++ b/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c
13075 +@@ -286,7 +286,7 @@ iwl_parse_nvm_sections(struct iwl_mvm *mvm)
13076 + int regulatory_type;
13077 +
13078 + /* Checking for required sections */
13079 +- if (mvm->trans->cfg->nvm_type != IWL_NVM_EXT) {
13080 ++ if (mvm->trans->cfg->nvm_type == IWL_NVM) {
13081 + if (!mvm->nvm_sections[NVM_SECTION_TYPE_SW].data ||
13082 + !mvm->nvm_sections[mvm->cfg->nvm_hw_section_num].data) {
13083 + IWL_ERR(mvm, "Can't parse empty OTP/NVM sections\n");
13084 +diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
13085 +index 69057701641e..373ace38edab 100644
13086 +--- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
13087 ++++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
13088 +@@ -3045,6 +3045,10 @@ static int iwl_mvm_send_sta_igtk(struct iwl_mvm *mvm,
13089 + igtk_cmd.sta_id = cpu_to_le32(sta_id);
13090 +
13091 + if (remove_key) {
13092 ++ /* This is a valid situation for IGTK */
13093 ++ if (sta_id == IWL_MVM_INVALID_STA)
13094 ++ return 0;
13095 ++
13096 + igtk_cmd.ctrl_flags |= cpu_to_le32(STA_KEY_NOT_VALID);
13097 + } else {
13098 + struct ieee80211_key_seq seq;
13099 +@@ -3352,9 +3356,9 @@ int iwl_mvm_remove_sta_key(struct iwl_mvm *mvm,
13100 + IWL_DEBUG_WEP(mvm, "mvm remove dynamic key: idx=%d sta=%d\n",
13101 + keyconf->keyidx, sta_id);
13102 +
13103 +- if (mvm_sta && (keyconf->cipher == WLAN_CIPHER_SUITE_AES_CMAC ||
13104 +- keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_128 ||
13105 +- keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_256))
13106 ++ if (keyconf->cipher == WLAN_CIPHER_SUITE_AES_CMAC ||
13107 ++ keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_128 ||
13108 ++ keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_256)
13109 + return iwl_mvm_send_sta_igtk(mvm, keyconf, sta_id, true);
13110 +
13111 + if (!__test_and_clear_bit(keyconf->hw_key_idx, mvm->fw_key_table)) {
13112 +diff --git a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
13113 +index 6dd835f1efc2..fbfa0b15d0c8 100644
13114 +--- a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
13115 ++++ b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
13116 +@@ -232,6 +232,7 @@ static int mwifiex_process_country_ie(struct mwifiex_private *priv,
13117 +
13118 + if (country_ie_len >
13119 + (IEEE80211_COUNTRY_STRING_LEN + MWIFIEX_MAX_TRIPLET_802_11D)) {
13120 ++ rcu_read_unlock();
13121 + mwifiex_dbg(priv->adapter, ERROR,
13122 + "11D: country_ie_len overflow!, deauth AP\n");
13123 + return -EINVAL;
13124 +diff --git a/drivers/nfc/pn544/pn544.c b/drivers/nfc/pn544/pn544.c
13125 +index 70e898e38b16..f30bdf95610f 100644
13126 +--- a/drivers/nfc/pn544/pn544.c
13127 ++++ b/drivers/nfc/pn544/pn544.c
13128 +@@ -704,7 +704,7 @@ static int pn544_hci_check_presence(struct nfc_hci_dev *hdev,
13129 + target->nfcid1_len != 10)
13130 + return -EOPNOTSUPP;
13131 +
13132 +- return nfc_hci_send_cmd(hdev, NFC_HCI_RF_READER_A_GATE,
13133 ++ return nfc_hci_send_cmd(hdev, NFC_HCI_RF_READER_A_GATE,
13134 + PN544_RF_READER_CMD_ACTIVATE_NEXT,
13135 + target->nfcid1, target->nfcid1_len, NULL);
13136 + } else if (target->supported_protocols & (NFC_PROTO_JEWEL_MASK |
13137 +diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
13138 +index ad3fcad4d75b..5e1315900562 100644
13139 +--- a/drivers/of/Kconfig
13140 ++++ b/drivers/of/Kconfig
13141 +@@ -103,4 +103,8 @@ config OF_OVERLAY
13142 + config OF_NUMA
13143 + bool
13144 +
13145 ++config OF_DMA_DEFAULT_COHERENT
13146 ++ # arches should select this if DMA is coherent by default for OF devices
13147 ++ bool
13148 ++
13149 + endif # OF
13150 +diff --git a/drivers/of/address.c b/drivers/of/address.c
13151 +index 7ddbf0a1ab86..c42aebba35ab 100644
13152 +--- a/drivers/of/address.c
13153 ++++ b/drivers/of/address.c
13154 +@@ -970,12 +970,16 @@ EXPORT_SYMBOL_GPL(of_dma_get_range);
13155 + * @np: device node
13156 + *
13157 + * It returns true if "dma-coherent" property was found
13158 +- * for this device in DT.
13159 ++ * for this device in the DT, or if DMA is coherent by
13160 ++ * default for OF devices on the current platform.
13161 + */
13162 + bool of_dma_is_coherent(struct device_node *np)
13163 + {
13164 + struct device_node *node = of_node_get(np);
13165 +
13166 ++ if (IS_ENABLED(CONFIG_OF_DMA_DEFAULT_COHERENT))
13167 ++ return true;
13168 ++
13169 + while (node) {
13170 + if (of_property_read_bool(node, "dma-coherent")) {
13171 + of_node_put(node);
13172 +diff --git a/drivers/pci/controller/dwc/pci-keystone-dw.c b/drivers/pci/controller/dwc/pci-keystone-dw.c
13173 +index 0682213328e9..15c612e853af 100644
13174 +--- a/drivers/pci/controller/dwc/pci-keystone-dw.c
13175 ++++ b/drivers/pci/controller/dwc/pci-keystone-dw.c
13176 +@@ -425,7 +425,7 @@ void ks_dw_pcie_initiate_link_train(struct keystone_pcie *ks_pcie)
13177 + /* Disable Link training */
13178 + val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
13179 + val &= ~LTSSM_EN_VAL;
13180 +- ks_dw_app_writel(ks_pcie, CMD_STATUS, LTSSM_EN_VAL | val);
13181 ++ ks_dw_app_writel(ks_pcie, CMD_STATUS, val);
13182 +
13183 + /* Initiate Link Training */
13184 + val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
13185 +diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c
13186 +index 58e487352853..6f86583605a4 100644
13187 +--- a/drivers/pci/controller/pci-tegra.c
13188 ++++ b/drivers/pci/controller/pci-tegra.c
13189 +@@ -2398,7 +2398,7 @@ static int tegra_pcie_probe(struct platform_device *pdev)
13190 +
13191 + pm_runtime_enable(pcie->dev);
13192 + err = pm_runtime_get_sync(pcie->dev);
13193 +- if (err) {
13194 ++ if (err < 0) {
13195 + dev_err(dev, "fail to enable pcie controller: %d\n", err);
13196 + goto teardown_msi;
13197 + }
13198 +diff --git a/drivers/phy/qualcomm/phy-qcom-apq8064-sata.c b/drivers/phy/qualcomm/phy-qcom-apq8064-sata.c
13199 +index 69ce2afac015..c6925e3e878b 100644
13200 +--- a/drivers/phy/qualcomm/phy-qcom-apq8064-sata.c
13201 ++++ b/drivers/phy/qualcomm/phy-qcom-apq8064-sata.c
13202 +@@ -88,7 +88,7 @@ static int read_poll_timeout(void __iomem *addr, u32 mask)
13203 + if (readl_relaxed(addr) & mask)
13204 + return 0;
13205 +
13206 +- usleep_range(DELAY_INTERVAL_US, DELAY_INTERVAL_US + 50);
13207 ++ usleep_range(DELAY_INTERVAL_US, DELAY_INTERVAL_US + 50);
13208 + } while (!time_after(jiffies, timeout));
13209 +
13210 + return (readl_relaxed(addr) & mask) ? 0 : -ETIMEDOUT;
13211 +diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c
13212 +index 75c8fef7a482..54f131bec192 100644
13213 +--- a/drivers/platform/x86/intel_scu_ipc.c
13214 ++++ b/drivers/platform/x86/intel_scu_ipc.c
13215 +@@ -69,26 +69,22 @@
13216 + struct intel_scu_ipc_pdata_t {
13217 + u32 i2c_base;
13218 + u32 i2c_len;
13219 +- u8 irq_mode;
13220 + };
13221 +
13222 + static const struct intel_scu_ipc_pdata_t intel_scu_ipc_lincroft_pdata = {
13223 + .i2c_base = 0xff12b000,
13224 + .i2c_len = 0x10,
13225 +- .irq_mode = 0,
13226 + };
13227 +
13228 + /* Penwell and Cloverview */
13229 + static const struct intel_scu_ipc_pdata_t intel_scu_ipc_penwell_pdata = {
13230 + .i2c_base = 0xff12b000,
13231 + .i2c_len = 0x10,
13232 +- .irq_mode = 1,
13233 + };
13234 +
13235 + static const struct intel_scu_ipc_pdata_t intel_scu_ipc_tangier_pdata = {
13236 + .i2c_base = 0xff00d000,
13237 + .i2c_len = 0x10,
13238 +- .irq_mode = 0,
13239 + };
13240 +
13241 + struct intel_scu_ipc_dev {
13242 +@@ -101,6 +97,9 @@ struct intel_scu_ipc_dev {
13243 +
13244 + static struct intel_scu_ipc_dev ipcdev; /* Only one for now */
13245 +
13246 ++#define IPC_STATUS 0x04
13247 ++#define IPC_STATUS_IRQ BIT(2)
13248 ++
13249 + /*
13250 + * IPC Read Buffer (Read Only):
13251 + * 16 byte buffer for receiving data from SCU, if IPC command
13252 +@@ -122,11 +121,8 @@ static DEFINE_MUTEX(ipclock); /* lock used to prevent multiple call to SCU */
13253 + */
13254 + static inline void ipc_command(struct intel_scu_ipc_dev *scu, u32 cmd)
13255 + {
13256 +- if (scu->irq_mode) {
13257 +- reinit_completion(&scu->cmd_complete);
13258 +- writel(cmd | IPC_IOC, scu->ipc_base);
13259 +- }
13260 +- writel(cmd, scu->ipc_base);
13261 ++ reinit_completion(&scu->cmd_complete);
13262 ++ writel(cmd | IPC_IOC, scu->ipc_base);
13263 + }
13264 +
13265 + /*
13266 +@@ -612,9 +608,10 @@ EXPORT_SYMBOL(intel_scu_ipc_i2c_cntrl);
13267 + static irqreturn_t ioc(int irq, void *dev_id)
13268 + {
13269 + struct intel_scu_ipc_dev *scu = dev_id;
13270 ++ int status = ipc_read_status(scu);
13271 +
13272 +- if (scu->irq_mode)
13273 +- complete(&scu->cmd_complete);
13274 ++ writel(status | IPC_STATUS_IRQ, scu->ipc_base + IPC_STATUS);
13275 ++ complete(&scu->cmd_complete);
13276 +
13277 + return IRQ_HANDLED;
13278 + }
13279 +@@ -640,8 +637,6 @@ static int ipc_probe(struct pci_dev *pdev, const struct pci_device_id *id)
13280 + if (!pdata)
13281 + return -ENODEV;
13282 +
13283 +- scu->irq_mode = pdata->irq_mode;
13284 +-
13285 + err = pcim_enable_device(pdev);
13286 + if (err)
13287 + return err;
13288 +diff --git a/drivers/power/supply/ltc2941-battery-gauge.c b/drivers/power/supply/ltc2941-battery-gauge.c
13289 +index 4f129bb4c972..ff5febea1a21 100644
13290 +--- a/drivers/power/supply/ltc2941-battery-gauge.c
13291 ++++ b/drivers/power/supply/ltc2941-battery-gauge.c
13292 +@@ -448,7 +448,7 @@ static int ltc294x_i2c_remove(struct i2c_client *client)
13293 + {
13294 + struct ltc294x_info *info = i2c_get_clientdata(client);
13295 +
13296 +- cancel_delayed_work(&info->work);
13297 ++ cancel_delayed_work_sync(&info->work);
13298 + power_supply_unregister(info->supply);
13299 + return 0;
13300 + }
13301 +diff --git a/drivers/scsi/csiostor/csio_scsi.c b/drivers/scsi/csiostor/csio_scsi.c
13302 +index e09c7f360dbd..0cb585759de6 100644
13303 +--- a/drivers/scsi/csiostor/csio_scsi.c
13304 ++++ b/drivers/scsi/csiostor/csio_scsi.c
13305 +@@ -1383,7 +1383,7 @@ csio_device_reset(struct device *dev,
13306 + return -EINVAL;
13307 +
13308 + /* Delete NPIV lnodes */
13309 +- csio_lnodes_exit(hw, 1);
13310 ++ csio_lnodes_exit(hw, 1);
13311 +
13312 + /* Block upper IOs */
13313 + csio_lnodes_block_request(hw);
13314 +diff --git a/drivers/scsi/qla2xxx/qla_dbg.c b/drivers/scsi/qla2xxx/qla_dbg.c
13315 +index c7533fa7f46e..36871760a5d3 100644
13316 +--- a/drivers/scsi/qla2xxx/qla_dbg.c
13317 ++++ b/drivers/scsi/qla2xxx/qla_dbg.c
13318 +@@ -2520,12 +2520,6 @@ qla83xx_fw_dump_failed:
13319 + /* Driver Debug Functions. */
13320 + /****************************************************************************/
13321 +
13322 +-static inline int
13323 +-ql_mask_match(uint32_t level)
13324 +-{
13325 +- return (level & ql2xextended_error_logging) == level;
13326 +-}
13327 +-
13328 + /*
13329 + * This function is for formatting and logging debug information.
13330 + * It is to be used when vha is available. It formats the message
13331 +diff --git a/drivers/scsi/qla2xxx/qla_dbg.h b/drivers/scsi/qla2xxx/qla_dbg.h
13332 +index 8877aa97d829..ceca6dd34db1 100644
13333 +--- a/drivers/scsi/qla2xxx/qla_dbg.h
13334 ++++ b/drivers/scsi/qla2xxx/qla_dbg.h
13335 +@@ -374,3 +374,9 @@ extern int qla24xx_dump_ram(struct qla_hw_data *, uint32_t, uint32_t *,
13336 + extern void qla24xx_pause_risc(struct device_reg_24xx __iomem *,
13337 + struct qla_hw_data *);
13338 + extern int qla24xx_soft_reset(struct qla_hw_data *);
13339 ++
13340 ++static inline int
13341 ++ql_mask_match(uint level)
13342 ++{
13343 ++ return (level & ql2xextended_error_logging) == level;
13344 ++}
13345 +diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
13346 +index 01ded6c6ad38..f9b3151f4b10 100644
13347 +--- a/drivers/scsi/qla2xxx/qla_isr.c
13348 ++++ b/drivers/scsi/qla2xxx/qla_isr.c
13349 +@@ -1876,6 +1876,18 @@ static void qla24xx_nvme_iocb_entry(scsi_qla_host_t *vha, struct req_que *req,
13350 + inbuf = (uint32_t *)&sts->nvme_ersp_data;
13351 + outbuf = (uint32_t *)fd->rspaddr;
13352 + iocb->u.nvme.rsp_pyld_len = le16_to_cpu(sts->nvme_rsp_pyld_len);
13353 ++ if (unlikely(iocb->u.nvme.rsp_pyld_len >
13354 ++ sizeof(struct nvme_fc_ersp_iu))) {
13355 ++ if (ql_mask_match(ql_dbg_io)) {
13356 ++ WARN_ONCE(1, "Unexpected response payload length %u.\n",
13357 ++ iocb->u.nvme.rsp_pyld_len);
13358 ++ ql_log(ql_log_warn, fcport->vha, 0x5100,
13359 ++ "Unexpected response payload length %u.\n",
13360 ++ iocb->u.nvme.rsp_pyld_len);
13361 ++ }
13362 ++ iocb->u.nvme.rsp_pyld_len =
13363 ++ sizeof(struct nvme_fc_ersp_iu);
13364 ++ }
13365 + iter = iocb->u.nvme.rsp_pyld_len >> 2;
13366 + for (; iter; iter--)
13367 + *outbuf++ = swab32(*inbuf++);
13368 +diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c
13369 +index abef3b29fa10..bef9faea5eee 100644
13370 +--- a/drivers/scsi/qla2xxx/qla_mbx.c
13371 ++++ b/drivers/scsi/qla2xxx/qla_mbx.c
13372 +@@ -5994,9 +5994,8 @@ qla2x00_dump_mctp_data(scsi_qla_host_t *vha, dma_addr_t req_dma, uint32_t addr,
13373 + mcp->mb[7] = LSW(MSD(req_dma));
13374 + mcp->mb[8] = MSW(addr);
13375 + /* Setting RAM ID to valid */
13376 +- mcp->mb[10] |= BIT_7;
13377 + /* For MCTP RAM ID is 0x40 */
13378 +- mcp->mb[10] |= 0x40;
13379 ++ mcp->mb[10] = BIT_7 | 0x40;
13380 +
13381 + mcp->out_mb |= MBX_10|MBX_8|MBX_7|MBX_6|MBX_5|MBX_4|MBX_3|MBX_2|MBX_1|
13382 + MBX_0;
13383 +diff --git a/drivers/scsi/qla2xxx/qla_nx.c b/drivers/scsi/qla2xxx/qla_nx.c
13384 +index de2bc78449e7..3007eecfa509 100644
13385 +--- a/drivers/scsi/qla2xxx/qla_nx.c
13386 ++++ b/drivers/scsi/qla2xxx/qla_nx.c
13387 +@@ -1605,8 +1605,7 @@ qla82xx_get_bootld_offset(struct qla_hw_data *ha)
13388 + return (u8 *)&ha->hablob->fw->data[offset];
13389 + }
13390 +
13391 +-static __le32
13392 +-qla82xx_get_fw_size(struct qla_hw_data *ha)
13393 ++static u32 qla82xx_get_fw_size(struct qla_hw_data *ha)
13394 + {
13395 + struct qla82xx_uri_data_desc *uri_desc = NULL;
13396 +
13397 +@@ -1617,7 +1616,7 @@ qla82xx_get_fw_size(struct qla_hw_data *ha)
13398 + return cpu_to_le32(uri_desc->size);
13399 + }
13400 +
13401 +- return cpu_to_le32(*(u32 *)&ha->hablob->fw->data[FW_SIZE_OFFSET]);
13402 ++ return get_unaligned_le32(&ha->hablob->fw->data[FW_SIZE_OFFSET]);
13403 + }
13404 +
13405 + static u8 *
13406 +@@ -1808,7 +1807,7 @@ qla82xx_fw_load_from_blob(struct qla_hw_data *ha)
13407 + }
13408 +
13409 + flashaddr = FLASH_ADDR_START;
13410 +- size = (__force u32)qla82xx_get_fw_size(ha) / 8;
13411 ++ size = qla82xx_get_fw_size(ha) / 8;
13412 + ptr64 = (u64 *)qla82xx_get_fw_offs(ha);
13413 +
13414 + for (i = 0; i < size; i++) {
13415 +diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
13416 +index f8acf101af3d..f59b8982b288 100644
13417 +--- a/drivers/scsi/qla4xxx/ql4_os.c
13418 ++++ b/drivers/scsi/qla4xxx/ql4_os.c
13419 +@@ -4146,7 +4146,7 @@ static void qla4xxx_mem_free(struct scsi_qla_host *ha)
13420 + dma_free_coherent(&ha->pdev->dev, ha->queues_len, ha->queues,
13421 + ha->queues_dma);
13422 +
13423 +- if (ha->fw_dump)
13424 ++ if (ha->fw_dump)
13425 + vfree(ha->fw_dump);
13426 +
13427 + ha->queues_len = 0;
13428 +diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
13429 +index 3601e770da16..af01be59a721 100644
13430 +--- a/drivers/scsi/ufs/ufshcd.c
13431 ++++ b/drivers/scsi/ufs/ufshcd.c
13432 +@@ -5044,6 +5044,7 @@ static int ufshcd_disable_auto_bkops(struct ufs_hba *hba)
13433 +
13434 + hba->auto_bkops_enabled = false;
13435 + trace_ufshcd_auto_bkops_state(dev_name(hba->dev), "Disabled");
13436 ++ hba->is_urgent_bkops_lvl_checked = false;
13437 + out:
13438 + return err;
13439 + }
13440 +@@ -5068,6 +5069,7 @@ static void ufshcd_force_reset_auto_bkops(struct ufs_hba *hba)
13441 + hba->ee_ctrl_mask &= ~MASK_EE_URGENT_BKOPS;
13442 + ufshcd_disable_auto_bkops(hba);
13443 + }
13444 ++ hba->is_urgent_bkops_lvl_checked = false;
13445 + }
13446 +
13447 + static inline int ufshcd_get_bkops_status(struct ufs_hba *hba, u32 *status)
13448 +@@ -5114,6 +5116,7 @@ static int ufshcd_bkops_ctrl(struct ufs_hba *hba,
13449 + err = ufshcd_enable_auto_bkops(hba);
13450 + else
13451 + err = ufshcd_disable_auto_bkops(hba);
13452 ++ hba->urgent_bkops_lvl = curr_status;
13453 + out:
13454 + return err;
13455 + }
13456 +diff --git a/drivers/usb/gadget/function/f_ecm.c b/drivers/usb/gadget/function/f_ecm.c
13457 +index 460d5d7c984f..7f5cf488b2b1 100644
13458 +--- a/drivers/usb/gadget/function/f_ecm.c
13459 ++++ b/drivers/usb/gadget/function/f_ecm.c
13460 +@@ -52,6 +52,7 @@ struct f_ecm {
13461 + struct usb_ep *notify;
13462 + struct usb_request *notify_req;
13463 + u8 notify_state;
13464 ++ atomic_t notify_count;
13465 + bool is_open;
13466 +
13467 + /* FIXME is_open needs some irq-ish locking
13468 +@@ -380,7 +381,7 @@ static void ecm_do_notify(struct f_ecm *ecm)
13469 + int status;
13470 +
13471 + /* notification already in flight? */
13472 +- if (!req)
13473 ++ if (atomic_read(&ecm->notify_count))
13474 + return;
13475 +
13476 + event = req->buf;
13477 +@@ -420,10 +421,10 @@ static void ecm_do_notify(struct f_ecm *ecm)
13478 + event->bmRequestType = 0xA1;
13479 + event->wIndex = cpu_to_le16(ecm->ctrl_id);
13480 +
13481 +- ecm->notify_req = NULL;
13482 ++ atomic_inc(&ecm->notify_count);
13483 + status = usb_ep_queue(ecm->notify, req, GFP_ATOMIC);
13484 + if (status < 0) {
13485 +- ecm->notify_req = req;
13486 ++ atomic_dec(&ecm->notify_count);
13487 + DBG(cdev, "notify --> %d\n", status);
13488 + }
13489 + }
13490 +@@ -448,17 +449,19 @@ static void ecm_notify_complete(struct usb_ep *ep, struct usb_request *req)
13491 + switch (req->status) {
13492 + case 0:
13493 + /* no fault */
13494 ++ atomic_dec(&ecm->notify_count);
13495 + break;
13496 + case -ECONNRESET:
13497 + case -ESHUTDOWN:
13498 ++ atomic_set(&ecm->notify_count, 0);
13499 + ecm->notify_state = ECM_NOTIFY_NONE;
13500 + break;
13501 + default:
13502 + DBG(cdev, "event %02x --> %d\n",
13503 + event->bNotificationType, req->status);
13504 ++ atomic_dec(&ecm->notify_count);
13505 + break;
13506 + }
13507 +- ecm->notify_req = req;
13508 + ecm_do_notify(ecm);
13509 + }
13510 +
13511 +@@ -907,6 +910,11 @@ static void ecm_unbind(struct usb_configuration *c, struct usb_function *f)
13512 +
13513 + usb_free_all_descriptors(f);
13514 +
13515 ++ if (atomic_read(&ecm->notify_count)) {
13516 ++ usb_ep_dequeue(ecm->notify, ecm->notify_req);
13517 ++ atomic_set(&ecm->notify_count, 0);
13518 ++ }
13519 ++
13520 + kfree(ecm->notify_req->buf);
13521 + usb_ep_free_request(ecm->notify, ecm->notify_req);
13522 + }
13523 +diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c
13524 +index 5780fba620ab..cfca4584ae13 100644
13525 +--- a/drivers/usb/gadget/function/f_ncm.c
13526 ++++ b/drivers/usb/gadget/function/f_ncm.c
13527 +@@ -54,6 +54,7 @@ struct f_ncm {
13528 + struct usb_ep *notify;
13529 + struct usb_request *notify_req;
13530 + u8 notify_state;
13531 ++ atomic_t notify_count;
13532 + bool is_open;
13533 +
13534 + const struct ndp_parser_opts *parser_opts;
13535 +@@ -547,7 +548,7 @@ static void ncm_do_notify(struct f_ncm *ncm)
13536 + int status;
13537 +
13538 + /* notification already in flight? */
13539 +- if (!req)
13540 ++ if (atomic_read(&ncm->notify_count))
13541 + return;
13542 +
13543 + event = req->buf;
13544 +@@ -587,7 +588,8 @@ static void ncm_do_notify(struct f_ncm *ncm)
13545 + event->bmRequestType = 0xA1;
13546 + event->wIndex = cpu_to_le16(ncm->ctrl_id);
13547 +
13548 +- ncm->notify_req = NULL;
13549 ++ atomic_inc(&ncm->notify_count);
13550 ++
13551 + /*
13552 + * In double buffering if there is a space in FIFO,
13553 + * completion callback can be called right after the call,
13554 +@@ -597,7 +599,7 @@ static void ncm_do_notify(struct f_ncm *ncm)
13555 + status = usb_ep_queue(ncm->notify, req, GFP_ATOMIC);
13556 + spin_lock(&ncm->lock);
13557 + if (status < 0) {
13558 +- ncm->notify_req = req;
13559 ++ atomic_dec(&ncm->notify_count);
13560 + DBG(cdev, "notify --> %d\n", status);
13561 + }
13562 + }
13563 +@@ -632,17 +634,19 @@ static void ncm_notify_complete(struct usb_ep *ep, struct usb_request *req)
13564 + case 0:
13565 + VDBG(cdev, "Notification %02x sent\n",
13566 + event->bNotificationType);
13567 ++ atomic_dec(&ncm->notify_count);
13568 + break;
13569 + case -ECONNRESET:
13570 + case -ESHUTDOWN:
13571 ++ atomic_set(&ncm->notify_count, 0);
13572 + ncm->notify_state = NCM_NOTIFY_NONE;
13573 + break;
13574 + default:
13575 + DBG(cdev, "event %02x --> %d\n",
13576 + event->bNotificationType, req->status);
13577 ++ atomic_dec(&ncm->notify_count);
13578 + break;
13579 + }
13580 +- ncm->notify_req = req;
13581 + ncm_do_notify(ncm);
13582 + spin_unlock(&ncm->lock);
13583 + }
13584 +@@ -1612,6 +1616,11 @@ static void ncm_unbind(struct usb_configuration *c, struct usb_function *f)
13585 + ncm_string_defs[0].id = 0;
13586 + usb_free_all_descriptors(f);
13587 +
13588 ++ if (atomic_read(&ncm->notify_count)) {
13589 ++ usb_ep_dequeue(ncm->notify, ncm->notify_req);
13590 ++ atomic_set(&ncm->notify_count, 0);
13591 ++ }
13592 ++
13593 + kfree(ncm->notify_req->buf);
13594 + usb_ep_free_request(ncm->notify, ncm->notify_req);
13595 + }
13596 +diff --git a/drivers/usb/gadget/legacy/cdc2.c b/drivers/usb/gadget/legacy/cdc2.c
13597 +index da1c37933ca1..8d7a556ece30 100644
13598 +--- a/drivers/usb/gadget/legacy/cdc2.c
13599 ++++ b/drivers/usb/gadget/legacy/cdc2.c
13600 +@@ -225,7 +225,7 @@ static struct usb_composite_driver cdc_driver = {
13601 + .name = "g_cdc",
13602 + .dev = &device_desc,
13603 + .strings = dev_strings,
13604 +- .max_speed = USB_SPEED_HIGH,
13605 ++ .max_speed = USB_SPEED_SUPER,
13606 + .bind = cdc_bind,
13607 + .unbind = cdc_unbind,
13608 + };
13609 +diff --git a/drivers/usb/gadget/legacy/g_ffs.c b/drivers/usb/gadget/legacy/g_ffs.c
13610 +index b640ed3fcf70..ae6d8f7092b8 100644
13611 +--- a/drivers/usb/gadget/legacy/g_ffs.c
13612 ++++ b/drivers/usb/gadget/legacy/g_ffs.c
13613 +@@ -149,7 +149,7 @@ static struct usb_composite_driver gfs_driver = {
13614 + .name = DRIVER_NAME,
13615 + .dev = &gfs_dev_desc,
13616 + .strings = gfs_dev_strings,
13617 +- .max_speed = USB_SPEED_HIGH,
13618 ++ .max_speed = USB_SPEED_SUPER,
13619 + .bind = gfs_bind,
13620 + .unbind = gfs_unbind,
13621 + };
13622 +diff --git a/drivers/usb/gadget/legacy/multi.c b/drivers/usb/gadget/legacy/multi.c
13623 +index 50515f9e1022..ec9749845660 100644
13624 +--- a/drivers/usb/gadget/legacy/multi.c
13625 ++++ b/drivers/usb/gadget/legacy/multi.c
13626 +@@ -482,7 +482,7 @@ static struct usb_composite_driver multi_driver = {
13627 + .name = "g_multi",
13628 + .dev = &device_desc,
13629 + .strings = dev_strings,
13630 +- .max_speed = USB_SPEED_HIGH,
13631 ++ .max_speed = USB_SPEED_SUPER,
13632 + .bind = multi_bind,
13633 + .unbind = multi_unbind,
13634 + .needs_serial = 1,
13635 +diff --git a/drivers/usb/gadget/legacy/ncm.c b/drivers/usb/gadget/legacy/ncm.c
13636 +index 8465f081e921..c61e71ba7045 100644
13637 +--- a/drivers/usb/gadget/legacy/ncm.c
13638 ++++ b/drivers/usb/gadget/legacy/ncm.c
13639 +@@ -197,7 +197,7 @@ static struct usb_composite_driver ncm_driver = {
13640 + .name = "g_ncm",
13641 + .dev = &device_desc,
13642 + .strings = dev_strings,
13643 +- .max_speed = USB_SPEED_HIGH,
13644 ++ .max_speed = USB_SPEED_SUPER,
13645 + .bind = gncm_bind,
13646 + .unbind = gncm_unbind,
13647 + };
13648 +diff --git a/drivers/usb/typec/tcpci.c b/drivers/usb/typec/tcpci.c
13649 +index c1f7073a56de..dfae41fe1331 100644
13650 +--- a/drivers/usb/typec/tcpci.c
13651 ++++ b/drivers/usb/typec/tcpci.c
13652 +@@ -581,6 +581,12 @@ static int tcpci_probe(struct i2c_client *client,
13653 + static int tcpci_remove(struct i2c_client *client)
13654 + {
13655 + struct tcpci_chip *chip = i2c_get_clientdata(client);
13656 ++ int err;
13657 ++
13658 ++ /* Disable chip interrupts before unregistering port */
13659 ++ err = tcpci_write16(chip->tcpci, TCPC_ALERT_MASK, 0);
13660 ++ if (err < 0)
13661 ++ return err;
13662 +
13663 + tcpci_unregister_port(chip->tcpci);
13664 +
13665 +diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c
13666 +index eb8fa25f8eb2..8b1f37ffb65a 100644
13667 +--- a/drivers/watchdog/watchdog_core.c
13668 ++++ b/drivers/watchdog/watchdog_core.c
13669 +@@ -138,6 +138,25 @@ int watchdog_init_timeout(struct watchdog_device *wdd,
13670 + }
13671 + EXPORT_SYMBOL_GPL(watchdog_init_timeout);
13672 +
13673 ++static int watchdog_reboot_notifier(struct notifier_block *nb,
13674 ++ unsigned long code, void *data)
13675 ++{
13676 ++ struct watchdog_device *wdd;
13677 ++
13678 ++ wdd = container_of(nb, struct watchdog_device, reboot_nb);
13679 ++ if (code == SYS_DOWN || code == SYS_HALT) {
13680 ++ if (watchdog_active(wdd)) {
13681 ++ int ret;
13682 ++
13683 ++ ret = wdd->ops->stop(wdd);
13684 ++ if (ret)
13685 ++ return NOTIFY_BAD;
13686 ++ }
13687 ++ }
13688 ++
13689 ++ return NOTIFY_DONE;
13690 ++}
13691 ++
13692 + static int watchdog_restart_notifier(struct notifier_block *nb,
13693 + unsigned long action, void *data)
13694 + {
13695 +@@ -226,6 +245,19 @@ static int __watchdog_register_device(struct watchdog_device *wdd)
13696 + }
13697 + }
13698 +
13699 ++ if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
13700 ++ wdd->reboot_nb.notifier_call = watchdog_reboot_notifier;
13701 ++
13702 ++ ret = register_reboot_notifier(&wdd->reboot_nb);
13703 ++ if (ret) {
13704 ++ pr_err("watchdog%d: Cannot register reboot notifier (%d)\n",
13705 ++ wdd->id, ret);
13706 ++ watchdog_dev_unregister(wdd);
13707 ++ ida_simple_remove(&watchdog_ida, id);
13708 ++ return ret;
13709 ++ }
13710 ++ }
13711 ++
13712 + if (wdd->ops->restart) {
13713 + wdd->restart_nb.notifier_call = watchdog_restart_notifier;
13714 +
13715 +@@ -271,6 +303,9 @@ static void __watchdog_unregister_device(struct watchdog_device *wdd)
13716 + if (wdd->ops->restart)
13717 + unregister_restart_handler(&wdd->restart_nb);
13718 +
13719 ++ if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status))
13720 ++ unregister_reboot_notifier(&wdd->reboot_nb);
13721 ++
13722 + watchdog_dev_unregister(wdd);
13723 + ida_simple_remove(&watchdog_ida, wdd->id);
13724 + }
13725 +diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
13726 +index 4b89333e8eb4..e64aa88e99da 100644
13727 +--- a/drivers/watchdog/watchdog_dev.c
13728 ++++ b/drivers/watchdog/watchdog_dev.c
13729 +@@ -42,7 +42,6 @@
13730 + #include <linux/miscdevice.h> /* For handling misc devices */
13731 + #include <linux/module.h> /* For module stuff/... */
13732 + #include <linux/mutex.h> /* For mutexes */
13733 +-#include <linux/reboot.h> /* For reboot notifier */
13734 + #include <linux/slab.h> /* For memory functions */
13735 + #include <linux/types.h> /* For standard types (like size_t) */
13736 + #include <linux/watchdog.h> /* For watchdog specific items */
13737 +@@ -1048,25 +1047,6 @@ static void watchdog_cdev_unregister(struct watchdog_device *wdd)
13738 + put_device(&wd_data->dev);
13739 + }
13740 +
13741 +-static int watchdog_reboot_notifier(struct notifier_block *nb,
13742 +- unsigned long code, void *data)
13743 +-{
13744 +- struct watchdog_device *wdd;
13745 +-
13746 +- wdd = container_of(nb, struct watchdog_device, reboot_nb);
13747 +- if (code == SYS_DOWN || code == SYS_HALT) {
13748 +- if (watchdog_active(wdd)) {
13749 +- int ret;
13750 +-
13751 +- ret = wdd->ops->stop(wdd);
13752 +- if (ret)
13753 +- return NOTIFY_BAD;
13754 +- }
13755 +- }
13756 +-
13757 +- return NOTIFY_DONE;
13758 +-}
13759 +-
13760 + /*
13761 + * watchdog_dev_register: register a watchdog device
13762 + * @wdd: watchdog device
13763 +@@ -1085,22 +1065,8 @@ int watchdog_dev_register(struct watchdog_device *wdd)
13764 + return ret;
13765 +
13766 + ret = watchdog_register_pretimeout(wdd);
13767 +- if (ret) {
13768 ++ if (ret)
13769 + watchdog_cdev_unregister(wdd);
13770 +- return ret;
13771 +- }
13772 +-
13773 +- if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
13774 +- wdd->reboot_nb.notifier_call = watchdog_reboot_notifier;
13775 +-
13776 +- ret = devm_register_reboot_notifier(&wdd->wd_data->dev,
13777 +- &wdd->reboot_nb);
13778 +- if (ret) {
13779 +- pr_err("watchdog%d: Cannot register reboot notifier (%d)\n",
13780 +- wdd->id, ret);
13781 +- watchdog_dev_unregister(wdd);
13782 +- }
13783 +- }
13784 +
13785 + return ret;
13786 + }
13787 +diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c
13788 +index 2acbfe104e46..3aab77916915 100644
13789 +--- a/drivers/xen/xen-balloon.c
13790 ++++ b/drivers/xen/xen-balloon.c
13791 +@@ -83,7 +83,7 @@ static void watch_target(struct xenbus_watch *watch,
13792 + "%llu", &static_max) == 1))
13793 + static_max >>= PAGE_SHIFT - 10;
13794 + else
13795 +- static_max = new_target;
13796 ++ static_max = balloon_stats.current_pages;
13797 +
13798 + target_diff = (xen_pv_domain() || xen_initial_domain()) ? 0
13799 + : static_max - balloon_stats.target_pages;
13800 +diff --git a/fs/aio.c b/fs/aio.c
13801 +index 911e23087dfb..b5fbf2061868 100644
13802 +--- a/fs/aio.c
13803 ++++ b/fs/aio.c
13804 +@@ -1600,6 +1600,14 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
13805 + return 0;
13806 + }
13807 +
13808 ++static void aio_poll_put_work(struct work_struct *work)
13809 ++{
13810 ++ struct poll_iocb *req = container_of(work, struct poll_iocb, work);
13811 ++ struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
13812 ++
13813 ++ iocb_put(iocb);
13814 ++}
13815 ++
13816 + static void aio_poll_complete_work(struct work_struct *work)
13817 + {
13818 + struct poll_iocb *req = container_of(work, struct poll_iocb, work);
13819 +@@ -1664,6 +1672,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
13820 + list_del_init(&req->wait.entry);
13821 +
13822 + if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
13823 ++ struct kioctx *ctx = iocb->ki_ctx;
13824 ++
13825 + /*
13826 + * Try to complete the iocb inline if we can. Use
13827 + * irqsave/irqrestore because not all filesystems (e.g. fuse)
13828 +@@ -1673,8 +1683,14 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
13829 + list_del(&iocb->ki_list);
13830 + iocb->ki_res.res = mangle_poll(mask);
13831 + req->done = true;
13832 +- spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
13833 +- iocb_put(iocb);
13834 ++ if (iocb->ki_eventfd && eventfd_signal_count()) {
13835 ++ iocb = NULL;
13836 ++ INIT_WORK(&req->work, aio_poll_put_work);
13837 ++ schedule_work(&req->work);
13838 ++ }
13839 ++ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
13840 ++ if (iocb)
13841 ++ iocb_put(iocb);
13842 + } else {
13843 + schedule_work(&req->work);
13844 + }
13845 +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
13846 +index 84ff398ae70b..c9943d70e2cb 100644
13847 +--- a/fs/btrfs/ctree.c
13848 ++++ b/fs/btrfs/ctree.c
13849 +@@ -337,12 +337,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
13850 + struct seq_list *elem)
13851 + {
13852 + write_lock(&fs_info->tree_mod_log_lock);
13853 +- spin_lock(&fs_info->tree_mod_seq_lock);
13854 + if (!elem->seq) {
13855 + elem->seq = btrfs_inc_tree_mod_seq(fs_info);
13856 + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
13857 + }
13858 +- spin_unlock(&fs_info->tree_mod_seq_lock);
13859 + write_unlock(&fs_info->tree_mod_log_lock);
13860 +
13861 + return elem->seq;
13862 +@@ -362,7 +360,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
13863 + if (!seq_putting)
13864 + return;
13865 +
13866 +- spin_lock(&fs_info->tree_mod_seq_lock);
13867 ++ write_lock(&fs_info->tree_mod_log_lock);
13868 + list_del(&elem->list);
13869 + elem->seq = 0;
13870 +
13871 +@@ -373,19 +371,17 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
13872 + * blocker with lower sequence number exists, we
13873 + * cannot remove anything from the log
13874 + */
13875 +- spin_unlock(&fs_info->tree_mod_seq_lock);
13876 ++ write_unlock(&fs_info->tree_mod_log_lock);
13877 + return;
13878 + }
13879 + min_seq = cur_elem->seq;
13880 + }
13881 + }
13882 +- spin_unlock(&fs_info->tree_mod_seq_lock);
13883 +
13884 + /*
13885 + * anything that's lower than the lowest existing (read: blocked)
13886 + * sequence number can be removed from the tree.
13887 + */
13888 +- write_lock(&fs_info->tree_mod_log_lock);
13889 + tm_root = &fs_info->tree_mod_log;
13890 + for (node = rb_first(tm_root); node; node = next) {
13891 + next = rb_next(node);
13892 +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
13893 +index d24ecbf938b6..15cb96ad15d8 100644
13894 +--- a/fs/btrfs/ctree.h
13895 ++++ b/fs/btrfs/ctree.h
13896 +@@ -893,14 +893,12 @@ struct btrfs_fs_info {
13897 + struct list_head delayed_iputs;
13898 + struct mutex cleaner_delayed_iput_mutex;
13899 +
13900 +- /* this protects tree_mod_seq_list */
13901 +- spinlock_t tree_mod_seq_lock;
13902 + atomic64_t tree_mod_seq;
13903 +- struct list_head tree_mod_seq_list;
13904 +
13905 +- /* this protects tree_mod_log */
13906 ++ /* this protects tree_mod_log and tree_mod_seq_list */
13907 + rwlock_t tree_mod_log_lock;
13908 + struct rb_root tree_mod_log;
13909 ++ struct list_head tree_mod_seq_list;
13910 +
13911 + atomic_t async_delalloc_pages;
13912 +
13913 +diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
13914 +index 7e5c81e80e15..09a12115b640 100644
13915 +--- a/fs/btrfs/delayed-ref.c
13916 ++++ b/fs/btrfs/delayed-ref.c
13917 +@@ -301,7 +301,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
13918 + if (head->is_data)
13919 + return;
13920 +
13921 +- spin_lock(&fs_info->tree_mod_seq_lock);
13922 ++ read_lock(&fs_info->tree_mod_log_lock);
13923 + if (!list_empty(&fs_info->tree_mod_seq_list)) {
13924 + struct seq_list *elem;
13925 +
13926 +@@ -309,7 +309,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
13927 + struct seq_list, list);
13928 + seq = elem->seq;
13929 + }
13930 +- spin_unlock(&fs_info->tree_mod_seq_lock);
13931 ++ read_unlock(&fs_info->tree_mod_log_lock);
13932 +
13933 + again:
13934 + for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
13935 +@@ -326,7 +326,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
13936 + struct seq_list *elem;
13937 + int ret = 0;
13938 +
13939 +- spin_lock(&fs_info->tree_mod_seq_lock);
13940 ++ read_lock(&fs_info->tree_mod_log_lock);
13941 + if (!list_empty(&fs_info->tree_mod_seq_list)) {
13942 + elem = list_first_entry(&fs_info->tree_mod_seq_list,
13943 + struct seq_list, list);
13944 +@@ -339,7 +339,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
13945 + }
13946 + }
13947 +
13948 +- spin_unlock(&fs_info->tree_mod_seq_lock);
13949 ++ read_unlock(&fs_info->tree_mod_log_lock);
13950 + return ret;
13951 + }
13952 +
13953 +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
13954 +index e12c37f457e0..9e467e8a8cb5 100644
13955 +--- a/fs/btrfs/disk-io.c
13956 ++++ b/fs/btrfs/disk-io.c
13957 +@@ -2031,7 +2031,7 @@ static void free_root_extent_buffers(struct btrfs_root *root)
13958 + }
13959 +
13960 + /* helper to cleanup tree roots */
13961 +-static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
13962 ++static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
13963 + {
13964 + free_root_extent_buffers(info->tree_root);
13965 +
13966 +@@ -2040,7 +2040,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
13967 + free_root_extent_buffers(info->csum_root);
13968 + free_root_extent_buffers(info->quota_root);
13969 + free_root_extent_buffers(info->uuid_root);
13970 +- if (chunk_root)
13971 ++ if (free_chunk_root)
13972 + free_root_extent_buffers(info->chunk_root);
13973 + free_root_extent_buffers(info->free_space_root);
13974 + }
13975 +@@ -2645,7 +2645,6 @@ int open_ctree(struct super_block *sb,
13976 + spin_lock_init(&fs_info->fs_roots_radix_lock);
13977 + spin_lock_init(&fs_info->delayed_iput_lock);
13978 + spin_lock_init(&fs_info->defrag_inodes_lock);
13979 +- spin_lock_init(&fs_info->tree_mod_seq_lock);
13980 + spin_lock_init(&fs_info->super_lock);
13981 + spin_lock_init(&fs_info->qgroup_op_lock);
13982 + spin_lock_init(&fs_info->buffer_lock);
13983 +@@ -3274,7 +3273,7 @@ fail_block_groups:
13984 + btrfs_put_block_group_cache(fs_info);
13985 +
13986 + fail_tree_roots:
13987 +- free_root_pointers(fs_info, 1);
13988 ++ free_root_pointers(fs_info, true);
13989 + invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
13990 +
13991 + fail_sb_buffer:
13992 +@@ -3302,7 +3301,7 @@ recovery_tree_root:
13993 + if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
13994 + goto fail_tree_roots;
13995 +
13996 +- free_root_pointers(fs_info, 0);
13997 ++ free_root_pointers(fs_info, false);
13998 +
13999 + /* don't use the log in recovery mode, it won't be valid */
14000 + btrfs_set_super_log_root(disk_super, 0);
14001 +@@ -3984,10 +3983,17 @@ void close_ctree(struct btrfs_fs_info *fs_info)
14002 + invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
14003 + btrfs_stop_all_workers(fs_info);
14004 +
14005 +- btrfs_free_block_groups(fs_info);
14006 +-
14007 + clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
14008 +- free_root_pointers(fs_info, 1);
14009 ++ free_root_pointers(fs_info, true);
14010 ++
14011 ++ /*
14012 ++ * We must free the block groups after dropping the fs_roots as we could
14013 ++ * have had an IO error and have left over tree log blocks that aren't
14014 ++ * cleaned up until the fs roots are freed. This makes the block group
14015 ++ * accounting appear to be wrong because there's pending reserved bytes,
14016 ++ * so make sure we do the block group cleanup afterwards.
14017 ++ */
14018 ++ btrfs_free_block_groups(fs_info);
14019 +
14020 + iput(fs_info->btree_inode);
14021 +
14022 +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
14023 +index fed44390c049..11efb4f5041c 100644
14024 +--- a/fs/btrfs/extent_io.c
14025 ++++ b/fs/btrfs/extent_io.c
14026 +@@ -4014,6 +4014,14 @@ retry:
14027 + */
14028 + scanned = 1;
14029 + index = 0;
14030 ++
14031 ++ /*
14032 ++ * If we're looping we could run into a page that is locked by a
14033 ++ * writer and that writer could be waiting on writeback for a
14034 ++ * page in our current bio, and thus deadlock, so flush the
14035 ++ * write bio here.
14036 ++ */
14037 ++ flush_write_bio(epd);
14038 + goto retry;
14039 + }
14040 +
14041 +diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
14042 +index db72b3b6209e..2eec1dd3803a 100644
14043 +--- a/fs/btrfs/tests/btrfs-tests.c
14044 ++++ b/fs/btrfs/tests/btrfs-tests.c
14045 +@@ -102,7 +102,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
14046 + spin_lock_init(&fs_info->qgroup_op_lock);
14047 + spin_lock_init(&fs_info->super_lock);
14048 + spin_lock_init(&fs_info->fs_roots_radix_lock);
14049 +- spin_lock_init(&fs_info->tree_mod_seq_lock);
14050 + mutex_init(&fs_info->qgroup_ioctl_lock);
14051 + mutex_init(&fs_info->qgroup_rescan_lock);
14052 + rwlock_init(&fs_info->tree_mod_log_lock);
14053 +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
14054 +index 26317bca5649..4b1491e1b803 100644
14055 +--- a/fs/btrfs/transaction.c
14056 ++++ b/fs/btrfs/transaction.c
14057 +@@ -1936,6 +1936,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
14058 + struct btrfs_transaction *prev_trans = NULL;
14059 + int ret;
14060 +
14061 ++ /*
14062 ++ * Some places just start a transaction to commit it. We need to make
14063 ++ * sure that if this commit fails that the abort code actually marks the
14064 ++ * transaction as failed, so set trans->dirty to make the abort code do
14065 ++ * the right thing.
14066 ++ */
14067 ++ trans->dirty = true;
14068 ++
14069 + /* Stop the commit early if ->aborted is set */
14070 + if (unlikely(READ_ONCE(cur_trans->aborted))) {
14071 + ret = cur_trans->aborted;
14072 +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
14073 +index fe7165c9d875..d4c86c6cbe39 100644
14074 +--- a/fs/btrfs/tree-log.c
14075 ++++ b/fs/btrfs/tree-log.c
14076 +@@ -3892,7 +3892,7 @@ static int log_csums(struct btrfs_trans_handle *trans,
14077 + static noinline int copy_items(struct btrfs_trans_handle *trans,
14078 + struct btrfs_inode *inode,
14079 + struct btrfs_path *dst_path,
14080 +- struct btrfs_path *src_path, u64 *last_extent,
14081 ++ struct btrfs_path *src_path,
14082 + int start_slot, int nr, int inode_only,
14083 + u64 logged_isize)
14084 + {
14085 +@@ -3903,7 +3903,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
14086 + struct btrfs_file_extent_item *extent;
14087 + struct btrfs_inode_item *inode_item;
14088 + struct extent_buffer *src = src_path->nodes[0];
14089 +- struct btrfs_key first_key, last_key, key;
14090 + int ret;
14091 + struct btrfs_key *ins_keys;
14092 + u32 *ins_sizes;
14093 +@@ -3911,9 +3910,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
14094 + int i;
14095 + struct list_head ordered_sums;
14096 + int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
14097 +- bool has_extents = false;
14098 +- bool need_find_last_extent = true;
14099 +- bool done = false;
14100 +
14101 + INIT_LIST_HEAD(&ordered_sums);
14102 +
14103 +@@ -3922,8 +3918,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
14104 + if (!ins_data)
14105 + return -ENOMEM;
14106 +
14107 +- first_key.objectid = (u64)-1;
14108 +-
14109 + ins_sizes = (u32 *)ins_data;
14110 + ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
14111 +
14112 +@@ -3944,9 +3938,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
14113 +
14114 + src_offset = btrfs_item_ptr_offset(src, start_slot + i);
14115 +
14116 +- if (i == nr - 1)
14117 +- last_key = ins_keys[i];
14118 +-
14119 + if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
14120 + inode_item = btrfs_item_ptr(dst_path->nodes[0],
14121 + dst_path->slots[0],
14122 +@@ -3960,20 +3951,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
14123 + src_offset, ins_sizes[i]);
14124 + }
14125 +
14126 +- /*
14127 +- * We set need_find_last_extent here in case we know we were
14128 +- * processing other items and then walk into the first extent in
14129 +- * the inode. If we don't hit an extent then nothing changes,
14130 +- * we'll do the last search the next time around.
14131 +- */
14132 +- if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
14133 +- has_extents = true;
14134 +- if (first_key.objectid == (u64)-1)
14135 +- first_key = ins_keys[i];
14136 +- } else {
14137 +- need_find_last_extent = false;
14138 +- }
14139 +-
14140 + /* take a reference on file data extents so that truncates
14141 + * or deletes of this inode don't have to relog the inode
14142 + * again
14143 +@@ -4039,167 +4016,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
14144 + kfree(sums);
14145 + }
14146 +
14147 +- if (!has_extents)
14148 +- return ret;
14149 +-
14150 +- if (need_find_last_extent && *last_extent == first_key.offset) {
14151 +- /*
14152 +- * We don't have any leafs between our current one and the one
14153 +- * we processed before that can have file extent items for our
14154 +- * inode (and have a generation number smaller than our current
14155 +- * transaction id).
14156 +- */
14157 +- need_find_last_extent = false;
14158 +- }
14159 +-
14160 +- /*
14161 +- * Because we use btrfs_search_forward we could skip leaves that were
14162 +- * not modified and then assume *last_extent is valid when it really
14163 +- * isn't. So back up to the previous leaf and read the end of the last
14164 +- * extent before we go and fill in holes.
14165 +- */
14166 +- if (need_find_last_extent) {
14167 +- u64 len;
14168 +-
14169 +- ret = btrfs_prev_leaf(inode->root, src_path);
14170 +- if (ret < 0)
14171 +- return ret;
14172 +- if (ret)
14173 +- goto fill_holes;
14174 +- if (src_path->slots[0])
14175 +- src_path->slots[0]--;
14176 +- src = src_path->nodes[0];
14177 +- btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
14178 +- if (key.objectid != btrfs_ino(inode) ||
14179 +- key.type != BTRFS_EXTENT_DATA_KEY)
14180 +- goto fill_holes;
14181 +- extent = btrfs_item_ptr(src, src_path->slots[0],
14182 +- struct btrfs_file_extent_item);
14183 +- if (btrfs_file_extent_type(src, extent) ==
14184 +- BTRFS_FILE_EXTENT_INLINE) {
14185 +- len = btrfs_file_extent_ram_bytes(src, extent);
14186 +- *last_extent = ALIGN(key.offset + len,
14187 +- fs_info->sectorsize);
14188 +- } else {
14189 +- len = btrfs_file_extent_num_bytes(src, extent);
14190 +- *last_extent = key.offset + len;
14191 +- }
14192 +- }
14193 +-fill_holes:
14194 +- /* So we did prev_leaf, now we need to move to the next leaf, but a few
14195 +- * things could have happened
14196 +- *
14197 +- * 1) A merge could have happened, so we could currently be on a leaf
14198 +- * that holds what we were copying in the first place.
14199 +- * 2) A split could have happened, and now not all of the items we want
14200 +- * are on the same leaf.
14201 +- *
14202 +- * So we need to adjust how we search for holes, we need to drop the
14203 +- * path and re-search for the first extent key we found, and then walk
14204 +- * forward until we hit the last one we copied.
14205 +- */
14206 +- if (need_find_last_extent) {
14207 +- /* btrfs_prev_leaf could return 1 without releasing the path */
14208 +- btrfs_release_path(src_path);
14209 +- ret = btrfs_search_slot(NULL, inode->root, &first_key,
14210 +- src_path, 0, 0);
14211 +- if (ret < 0)
14212 +- return ret;
14213 +- ASSERT(ret == 0);
14214 +- src = src_path->nodes[0];
14215 +- i = src_path->slots[0];
14216 +- } else {
14217 +- i = start_slot;
14218 +- }
14219 +-
14220 +- /*
14221 +- * Ok so here we need to go through and fill in any holes we may have
14222 +- * to make sure that holes are punched for those areas in case they had
14223 +- * extents previously.
14224 +- */
14225 +- while (!done) {
14226 +- u64 offset, len;
14227 +- u64 extent_end;
14228 +-
14229 +- if (i >= btrfs_header_nritems(src_path->nodes[0])) {
14230 +- ret = btrfs_next_leaf(inode->root, src_path);
14231 +- if (ret < 0)
14232 +- return ret;
14233 +- ASSERT(ret == 0);
14234 +- src = src_path->nodes[0];
14235 +- i = 0;
14236 +- need_find_last_extent = true;
14237 +- }
14238 +-
14239 +- btrfs_item_key_to_cpu(src, &key, i);
14240 +- if (!btrfs_comp_cpu_keys(&key, &last_key))
14241 +- done = true;
14242 +- if (key.objectid != btrfs_ino(inode) ||
14243 +- key.type != BTRFS_EXTENT_DATA_KEY) {
14244 +- i++;
14245 +- continue;
14246 +- }
14247 +- extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
14248 +- if (btrfs_file_extent_type(src, extent) ==
14249 +- BTRFS_FILE_EXTENT_INLINE) {
14250 +- len = btrfs_file_extent_ram_bytes(src, extent);
14251 +- extent_end = ALIGN(key.offset + len,
14252 +- fs_info->sectorsize);
14253 +- } else {
14254 +- len = btrfs_file_extent_num_bytes(src, extent);
14255 +- extent_end = key.offset + len;
14256 +- }
14257 +- i++;
14258 +-
14259 +- if (*last_extent == key.offset) {
14260 +- *last_extent = extent_end;
14261 +- continue;
14262 +- }
14263 +- offset = *last_extent;
14264 +- len = key.offset - *last_extent;
14265 +- ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
14266 +- offset, 0, 0, len, 0, len, 0, 0, 0);
14267 +- if (ret)
14268 +- break;
14269 +- *last_extent = extent_end;
14270 +- }
14271 +-
14272 +- /*
14273 +- * Check if there is a hole between the last extent found in our leaf
14274 +- * and the first extent in the next leaf. If there is one, we need to
14275 +- * log an explicit hole so that at replay time we can punch the hole.
14276 +- */
14277 +- if (ret == 0 &&
14278 +- key.objectid == btrfs_ino(inode) &&
14279 +- key.type == BTRFS_EXTENT_DATA_KEY &&
14280 +- i == btrfs_header_nritems(src_path->nodes[0])) {
14281 +- ret = btrfs_next_leaf(inode->root, src_path);
14282 +- need_find_last_extent = true;
14283 +- if (ret > 0) {
14284 +- ret = 0;
14285 +- } else if (ret == 0) {
14286 +- btrfs_item_key_to_cpu(src_path->nodes[0], &key,
14287 +- src_path->slots[0]);
14288 +- if (key.objectid == btrfs_ino(inode) &&
14289 +- key.type == BTRFS_EXTENT_DATA_KEY &&
14290 +- *last_extent < key.offset) {
14291 +- const u64 len = key.offset - *last_extent;
14292 +-
14293 +- ret = btrfs_insert_file_extent(trans, log,
14294 +- btrfs_ino(inode),
14295 +- *last_extent, 0,
14296 +- 0, len, 0, len,
14297 +- 0, 0, 0);
14298 +- *last_extent += len;
14299 +- }
14300 +- }
14301 +- }
14302 +- /*
14303 +- * Need to let the callers know we dropped the path so they should
14304 +- * re-search.
14305 +- */
14306 +- if (!ret && need_find_last_extent)
14307 +- ret = 1;
14308 + return ret;
14309 + }
14310 +
14311 +@@ -4365,7 +4181,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
14312 + const u64 i_size = i_size_read(&inode->vfs_inode);
14313 + const u64 ino = btrfs_ino(inode);
14314 + struct btrfs_path *dst_path = NULL;
14315 +- u64 last_extent = (u64)-1;
14316 ++ bool dropped_extents = false;
14317 + int ins_nr = 0;
14318 + int start_slot;
14319 + int ret;
14320 +@@ -4387,8 +4203,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
14321 + if (slot >= btrfs_header_nritems(leaf)) {
14322 + if (ins_nr > 0) {
14323 + ret = copy_items(trans, inode, dst_path, path,
14324 +- &last_extent, start_slot,
14325 +- ins_nr, 1, 0);
14326 ++ start_slot, ins_nr, 1, 0);
14327 + if (ret < 0)
14328 + goto out;
14329 + ins_nr = 0;
14330 +@@ -4412,8 +4227,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
14331 + path->slots[0]++;
14332 + continue;
14333 + }
14334 +- if (last_extent == (u64)-1) {
14335 +- last_extent = key.offset;
14336 ++ if (!dropped_extents) {
14337 + /*
14338 + * Avoid logging extent items logged in past fsync calls
14339 + * and leading to duplicate keys in the log tree.
14340 +@@ -4427,6 +4241,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
14341 + } while (ret == -EAGAIN);
14342 + if (ret)
14343 + goto out;
14344 ++ dropped_extents = true;
14345 + }
14346 + if (ins_nr == 0)
14347 + start_slot = slot;
14348 +@@ -4441,7 +4256,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
14349 + }
14350 + }
14351 + if (ins_nr > 0) {
14352 +- ret = copy_items(trans, inode, dst_path, path, &last_extent,
14353 ++ ret = copy_items(trans, inode, dst_path, path,
14354 + start_slot, ins_nr, 1, 0);
14355 + if (ret > 0)
14356 + ret = 0;
14357 +@@ -4636,13 +4451,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
14358 +
14359 + if (slot >= nritems) {
14360 + if (ins_nr > 0) {
14361 +- u64 last_extent = 0;
14362 +-
14363 + ret = copy_items(trans, inode, dst_path, path,
14364 +- &last_extent, start_slot,
14365 +- ins_nr, 1, 0);
14366 +- /* can't be 1, extent items aren't processed */
14367 +- ASSERT(ret <= 0);
14368 ++ start_slot, ins_nr, 1, 0);
14369 + if (ret < 0)
14370 + return ret;
14371 + ins_nr = 0;
14372 +@@ -4666,13 +4476,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
14373 + cond_resched();
14374 + }
14375 + if (ins_nr > 0) {
14376 +- u64 last_extent = 0;
14377 +-
14378 + ret = copy_items(trans, inode, dst_path, path,
14379 +- &last_extent, start_slot,
14380 +- ins_nr, 1, 0);
14381 +- /* can't be 1, extent items aren't processed */
14382 +- ASSERT(ret <= 0);
14383 ++ start_slot, ins_nr, 1, 0);
14384 + if (ret < 0)
14385 + return ret;
14386 + }
14387 +@@ -4681,100 +4486,119 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
14388 + }
14389 +
14390 + /*
14391 +- * If the no holes feature is enabled we need to make sure any hole between the
14392 +- * last extent and the i_size of our inode is explicitly marked in the log. This
14393 +- * is to make sure that doing something like:
14394 +- *
14395 +- * 1) create file with 128Kb of data
14396 +- * 2) truncate file to 64Kb
14397 +- * 3) truncate file to 256Kb
14398 +- * 4) fsync file
14399 +- * 5) <crash/power failure>
14400 +- * 6) mount fs and trigger log replay
14401 +- *
14402 +- * Will give us a file with a size of 256Kb, the first 64Kb of data match what
14403 +- * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
14404 +- * file correspond to a hole. The presence of explicit holes in a log tree is
14405 +- * what guarantees that log replay will remove/adjust file extent items in the
14406 +- * fs/subvol tree.
14407 +- *
14408 +- * Here we do not need to care about holes between extents, that is already done
14409 +- * by copy_items(). We also only need to do this in the full sync path, where we
14410 +- * lookup for extents from the fs/subvol tree only. In the fast path case, we
14411 +- * lookup the list of modified extent maps and if any represents a hole, we
14412 +- * insert a corresponding extent representing a hole in the log tree.
14413 ++ * When using the NO_HOLES feature if we punched a hole that causes the
14414 ++ * deletion of entire leafs or all the extent items of the first leaf (the one
14415 ++ * that contains the inode item and references) we may end up not processing
14416 ++ * any extents, because there are no leafs with a generation matching the
14417 ++ * current transaction that have extent items for our inode. So we need to find
14418 ++ * if any holes exist and then log them. We also need to log holes after any
14419 ++ * truncate operation that changes the inode's size.
14420 + */
14421 +-static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
14422 +- struct btrfs_root *root,
14423 +- struct btrfs_inode *inode,
14424 +- struct btrfs_path *path)
14425 ++static int btrfs_log_holes(struct btrfs_trans_handle *trans,
14426 ++ struct btrfs_root *root,
14427 ++ struct btrfs_inode *inode,
14428 ++ struct btrfs_path *path)
14429 + {
14430 + struct btrfs_fs_info *fs_info = root->fs_info;
14431 +- int ret;
14432 + struct btrfs_key key;
14433 +- u64 hole_start;
14434 +- u64 hole_size;
14435 +- struct extent_buffer *leaf;
14436 +- struct btrfs_root *log = root->log_root;
14437 + const u64 ino = btrfs_ino(inode);
14438 + const u64 i_size = i_size_read(&inode->vfs_inode);
14439 ++ u64 prev_extent_end = 0;
14440 ++ int ret;
14441 +
14442 +- if (!btrfs_fs_incompat(fs_info, NO_HOLES))
14443 ++ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
14444 + return 0;
14445 +
14446 + key.objectid = ino;
14447 + key.type = BTRFS_EXTENT_DATA_KEY;
14448 +- key.offset = (u64)-1;
14449 ++ key.offset = 0;
14450 +
14451 + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
14452 +- ASSERT(ret != 0);
14453 + if (ret < 0)
14454 + return ret;
14455 +
14456 +- ASSERT(path->slots[0] > 0);
14457 +- path->slots[0]--;
14458 +- leaf = path->nodes[0];
14459 +- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
14460 +-
14461 +- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
14462 +- /* inode does not have any extents */
14463 +- hole_start = 0;
14464 +- hole_size = i_size;
14465 +- } else {
14466 ++ while (true) {
14467 + struct btrfs_file_extent_item *extent;
14468 ++ struct extent_buffer *leaf = path->nodes[0];
14469 + u64 len;
14470 +
14471 +- /*
14472 +- * If there's an extent beyond i_size, an explicit hole was
14473 +- * already inserted by copy_items().
14474 +- */
14475 +- if (key.offset >= i_size)
14476 +- return 0;
14477 ++ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
14478 ++ ret = btrfs_next_leaf(root, path);
14479 ++ if (ret < 0)
14480 ++ return ret;
14481 ++ if (ret > 0) {
14482 ++ ret = 0;
14483 ++ break;
14484 ++ }
14485 ++ leaf = path->nodes[0];
14486 ++ }
14487 ++
14488 ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
14489 ++ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
14490 ++ break;
14491 ++
14492 ++ /* We have a hole, log it. */
14493 ++ if (prev_extent_end < key.offset) {
14494 ++ const u64 hole_len = key.offset - prev_extent_end;
14495 ++
14496 ++ /*
14497 ++ * Release the path to avoid deadlocks with other code
14498 ++ * paths that search the root while holding locks on
14499 ++ * leafs from the log root.
14500 ++ */
14501 ++ btrfs_release_path(path);
14502 ++ ret = btrfs_insert_file_extent(trans, root->log_root,
14503 ++ ino, prev_extent_end, 0,
14504 ++ 0, hole_len, 0, hole_len,
14505 ++ 0, 0, 0);
14506 ++ if (ret < 0)
14507 ++ return ret;
14508 ++
14509 ++ /*
14510 ++ * Search for the same key again in the root. Since it's
14511 ++ * an extent item and we are holding the inode lock, the
14512 ++ * key must still exist. If it doesn't just emit warning
14513 ++ * and return an error to fall back to a transaction
14514 ++ * commit.
14515 ++ */
14516 ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
14517 ++ if (ret < 0)
14518 ++ return ret;
14519 ++ if (WARN_ON(ret > 0))
14520 ++ return -ENOENT;
14521 ++ leaf = path->nodes[0];
14522 ++ }
14523 +
14524 + extent = btrfs_item_ptr(leaf, path->slots[0],
14525 + struct btrfs_file_extent_item);
14526 +-
14527 + if (btrfs_file_extent_type(leaf, extent) ==
14528 +- BTRFS_FILE_EXTENT_INLINE)
14529 +- return 0;
14530 ++ BTRFS_FILE_EXTENT_INLINE) {
14531 ++ len = btrfs_file_extent_ram_bytes(leaf, extent);
14532 ++ prev_extent_end = ALIGN(key.offset + len,
14533 ++ fs_info->sectorsize);
14534 ++ } else {
14535 ++ len = btrfs_file_extent_num_bytes(leaf, extent);
14536 ++ prev_extent_end = key.offset + len;
14537 ++ }
14538 +
14539 +- len = btrfs_file_extent_num_bytes(leaf, extent);
14540 +- /* Last extent goes beyond i_size, no need to log a hole. */
14541 +- if (key.offset + len > i_size)
14542 +- return 0;
14543 +- hole_start = key.offset + len;
14544 +- hole_size = i_size - hole_start;
14545 ++ path->slots[0]++;
14546 ++ cond_resched();
14547 + }
14548 +- btrfs_release_path(path);
14549 +
14550 +- /* Last extent ends at i_size. */
14551 +- if (hole_size == 0)
14552 +- return 0;
14553 ++ if (prev_extent_end < i_size) {
14554 ++ u64 hole_len;
14555 +
14556 +- hole_size = ALIGN(hole_size, fs_info->sectorsize);
14557 +- ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
14558 +- hole_size, 0, hole_size, 0, 0, 0);
14559 +- return ret;
14560 ++ btrfs_release_path(path);
14561 ++ hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
14562 ++ ret = btrfs_insert_file_extent(trans, root->log_root,
14563 ++ ino, prev_extent_end, 0, 0,
14564 ++ hole_len, 0, hole_len,
14565 ++ 0, 0, 0);
14566 ++ if (ret < 0)
14567 ++ return ret;
14568 ++ }
14569 ++
14570 ++ return 0;
14571 + }
14572 +
14573 + /*
14574 +@@ -4934,7 +4758,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
14575 + struct btrfs_key min_key;
14576 + struct btrfs_key max_key;
14577 + struct btrfs_root *log = root->log_root;
14578 +- u64 last_extent = 0;
14579 + int err = 0;
14580 + int ret;
14581 + int nritems;
14582 +@@ -5108,7 +4931,7 @@ again:
14583 + ins_start_slot = path->slots[0];
14584 + }
14585 + ret = copy_items(trans, inode, dst_path, path,
14586 +- &last_extent, ins_start_slot,
14587 ++ ins_start_slot,
14588 + ins_nr, inode_only,
14589 + logged_isize);
14590 + if (ret < 0) {
14591 +@@ -5161,17 +4984,13 @@ again:
14592 + if (ins_nr == 0)
14593 + goto next_slot;
14594 + ret = copy_items(trans, inode, dst_path, path,
14595 +- &last_extent, ins_start_slot,
14596 ++ ins_start_slot,
14597 + ins_nr, inode_only, logged_isize);
14598 + if (ret < 0) {
14599 + err = ret;
14600 + goto out_unlock;
14601 + }
14602 + ins_nr = 0;
14603 +- if (ret) {
14604 +- btrfs_release_path(path);
14605 +- continue;
14606 +- }
14607 + goto next_slot;
14608 + }
14609 +
14610 +@@ -5184,18 +5003,13 @@ again:
14611 + goto next_slot;
14612 + }
14613 +
14614 +- ret = copy_items(trans, inode, dst_path, path, &last_extent,
14615 ++ ret = copy_items(trans, inode, dst_path, path,
14616 + ins_start_slot, ins_nr, inode_only,
14617 + logged_isize);
14618 + if (ret < 0) {
14619 + err = ret;
14620 + goto out_unlock;
14621 + }
14622 +- if (ret) {
14623 +- ins_nr = 0;
14624 +- btrfs_release_path(path);
14625 +- continue;
14626 +- }
14627 + ins_nr = 1;
14628 + ins_start_slot = path->slots[0];
14629 + next_slot:
14630 +@@ -5209,13 +5023,12 @@ next_slot:
14631 + }
14632 + if (ins_nr) {
14633 + ret = copy_items(trans, inode, dst_path, path,
14634 +- &last_extent, ins_start_slot,
14635 ++ ins_start_slot,
14636 + ins_nr, inode_only, logged_isize);
14637 + if (ret < 0) {
14638 + err = ret;
14639 + goto out_unlock;
14640 + }
14641 +- ret = 0;
14642 + ins_nr = 0;
14643 + }
14644 + btrfs_release_path(path);
14645 +@@ -5230,14 +5043,13 @@ next_key:
14646 + }
14647 + }
14648 + if (ins_nr) {
14649 +- ret = copy_items(trans, inode, dst_path, path, &last_extent,
14650 ++ ret = copy_items(trans, inode, dst_path, path,
14651 + ins_start_slot, ins_nr, inode_only,
14652 + logged_isize);
14653 + if (ret < 0) {
14654 + err = ret;
14655 + goto out_unlock;
14656 + }
14657 +- ret = 0;
14658 + ins_nr = 0;
14659 + }
14660 +
14661 +@@ -5250,7 +5062,7 @@ next_key:
14662 + if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
14663 + btrfs_release_path(path);
14664 + btrfs_release_path(dst_path);
14665 +- err = btrfs_log_trailing_hole(trans, root, inode, path);
14666 ++ err = btrfs_log_holes(trans, root, inode, path);
14667 + if (err)
14668 + goto out_unlock;
14669 + }
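For reference, the hole bookkeeping introduced by btrfs_log_holes() above comes down to a small amount of offset arithmetic. The standalone sketch below mirrors it under stated assumptions: a fixed 4096-byte sectorsize, a plain array standing in for the extent items walked through the btree path, and log_hole() standing in for btrfs_insert_file_extent(); none of these names are btrfs APIs.

#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE 4096ULL
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

/* stand-in for one BTRFS_EXTENT_DATA_KEY item of the inode */
struct ext { uint64_t off; uint64_t len; int is_inline; };

/* stand-in for btrfs_insert_file_extent() logging an explicit hole */
static void log_hole(uint64_t start, uint64_t len)
{
	printf("hole: start=%llu len=%llu\n",
	       (unsigned long long)start, (unsigned long long)len);
}

static void log_holes(const struct ext *e, int nr, uint64_t i_size)
{
	uint64_t prev_end = 0;

	for (int i = 0; i < nr; i++) {
		if (prev_end < e[i].off)	/* gap between two extents */
			log_hole(prev_end, e[i].off - prev_end);
		/* inline extents are rounded up to a full sector, as above */
		prev_end = e[i].is_inline ?
			ALIGN_UP(e[i].off + e[i].len, SECTORSIZE) :
			e[i].off + e[i].len;
	}
	if (prev_end < i_size)			/* trailing hole up to i_size */
		log_hole(prev_end, ALIGN_UP(i_size - prev_end, SECTORSIZE));
}

For a 1 MiB file whose only extent is 4 KiB at offset 0, this reports a single hole at offset 4096 of length 1044480, which corresponds to the trailing-hole branch at the end of the new function.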
14670 +diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
14671 +index 43f29621e51f..0d4e4d97e6cf 100644
14672 +--- a/fs/cifs/smb2pdu.c
14673 ++++ b/fs/cifs/smb2pdu.c
14674 +@@ -259,9 +259,14 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
14675 + }
14676 +
14677 + rc = cifs_negotiate_protocol(0, tcon->ses);
14678 +- if (!rc && tcon->ses->need_reconnect)
14679 ++ if (!rc && tcon->ses->need_reconnect) {
14680 + rc = cifs_setup_session(0, tcon->ses, nls_codepage);
14681 +-
14682 ++ if ((rc == -EACCES) && !tcon->retry) {
14683 ++ rc = -EHOSTDOWN;
14684 ++ mutex_unlock(&tcon->ses->session_mutex);
14685 ++ goto failed;
14686 ++ }
14687 ++ }
14688 + if (rc || !tcon->need_reconnect) {
14689 + mutex_unlock(&tcon->ses->session_mutex);
14690 + goto out;
14691 +@@ -306,6 +311,7 @@ out:
14692 + case SMB2_SET_INFO:
14693 + rc = -EAGAIN;
14694 + }
14695 ++failed:
14696 + unload_nls(nls_codepage);
14697 + return rc;
14698 + }
14699 +@@ -3130,8 +3136,8 @@ smb2_readv_callback(struct mid_q_entry *mid)
14700 + struct smb2_sync_hdr *shdr =
14701 + (struct smb2_sync_hdr *)rdata->iov[0].iov_base;
14702 + unsigned int credits_received = 0;
14703 +- struct smb_rqst rqst = { .rq_iov = rdata->iov,
14704 +- .rq_nvec = 2,
14705 ++ struct smb_rqst rqst = { .rq_iov = &rdata->iov[1],
14706 ++ .rq_nvec = 1,
14707 + .rq_pages = rdata->pages,
14708 + .rq_offset = rdata->page_offset,
14709 + .rq_npages = rdata->nr_pages,
14710 +diff --git a/fs/eventfd.c b/fs/eventfd.c
14711 +index 08d3bd602f73..ce1d1711fbba 100644
14712 +--- a/fs/eventfd.c
14713 ++++ b/fs/eventfd.c
14714 +@@ -22,6 +22,8 @@
14715 + #include <linux/proc_fs.h>
14716 + #include <linux/seq_file.h>
14717 +
14718 ++DEFINE_PER_CPU(int, eventfd_wake_count);
14719 ++
14720 + struct eventfd_ctx {
14721 + struct kref kref;
14722 + wait_queue_head_t wqh;
14723 +@@ -55,12 +57,25 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
14724 + {
14725 + unsigned long flags;
14726 +
14727 ++ /*
14728 ++ * Deadlock or stack overflow issues can happen if we recurse here
14729 ++ * through waitqueue wakeup handlers. If the caller users potentially
14730 ++ * nested waitqueues with custom wakeup handlers, then it should
14731 ++ * check eventfd_signal_count() before calling this function. If
14732 ++ * it returns true, the eventfd_signal() call should be deferred to a
14733 ++ * safe context.
14734 ++ */
14735 ++ if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
14736 ++ return 0;
14737 ++
14738 + spin_lock_irqsave(&ctx->wqh.lock, flags);
14739 ++ this_cpu_inc(eventfd_wake_count);
14740 + if (ULLONG_MAX - ctx->count < n)
14741 + n = ULLONG_MAX - ctx->count;
14742 + ctx->count += n;
14743 + if (waitqueue_active(&ctx->wqh))
14744 + wake_up_locked_poll(&ctx->wqh, EPOLLIN);
14745 ++ this_cpu_dec(eventfd_wake_count);
14746 + spin_unlock_irqrestore(&ctx->wqh.lock, flags);
14747 +
14748 + return n;
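The new comment above asks callers that might already be running inside another wakeup handler to test eventfd_signal_count() and defer the signal. A minimal sketch of that deferral, assuming a hypothetical my_ctx owner whose signal_work was set up with INIT_WORK(); only eventfd_signal() and eventfd_signal_count() come from the patched API.

#include <linux/eventfd.h>
#include <linux/workqueue.h>

struct my_ctx {
	struct eventfd_ctx *eventfd;
	struct work_struct signal_work;	/* INIT_WORK(..., my_signal_work) elsewhere */
};

static void my_signal_work(struct work_struct *work)
{
	struct my_ctx *ctx = container_of(work, struct my_ctx, signal_work);

	eventfd_signal(ctx->eventfd, 1);	/* safe: ordinary process context */
}

static void my_notify(struct my_ctx *ctx)
{
	if (eventfd_signal_count())
		schedule_work(&ctx->signal_work);	/* nested in a wakeup handler: defer */
	else
		eventfd_signal(ctx->eventfd, 1);
}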
14749 +diff --git a/fs/ext2/super.c b/fs/ext2/super.c
14750 +index 364e647d87c0..80a3038e0e46 100644
14751 +--- a/fs/ext2/super.c
14752 ++++ b/fs/ext2/super.c
14753 +@@ -1093,9 +1093,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
14754 +
14755 + if (EXT2_BLOCKS_PER_GROUP(sb) == 0)
14756 + goto cantfind_ext2;
14757 +- sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
14758 +- le32_to_cpu(es->s_first_data_block) - 1)
14759 +- / EXT2_BLOCKS_PER_GROUP(sb)) + 1;
14760 ++ sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
14761 ++ le32_to_cpu(es->s_first_data_block) - 1)
14762 ++ / EXT2_BLOCKS_PER_GROUP(sb)) + 1;
14763 + db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
14764 + EXT2_DESC_PER_BLOCK(sb);
14765 + sbi->s_group_desc = kmalloc_array (db_count,
14766 +diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
14767 +index db7590178dfc..9cc79b7b0df1 100644
14768 +--- a/fs/ext4/page-io.c
14769 ++++ b/fs/ext4/page-io.c
14770 +@@ -481,17 +481,26 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
14771 + nr_to_submit) {
14772 + gfp_t gfp_flags = GFP_NOFS;
14773 +
14774 ++ /*
14775 ++ * Since bounce page allocation uses a mempool, we can only use
14776 ++ * a waiting mask (i.e. request guaranteed allocation) on the
14777 ++ * first page of the bio. Otherwise it can deadlock.
14778 ++ */
14779 ++ if (io->io_bio)
14780 ++ gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
14781 + retry_encrypt:
14782 + data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
14783 + page->index, gfp_flags);
14784 + if (IS_ERR(data_page)) {
14785 + ret = PTR_ERR(data_page);
14786 +- if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
14787 +- if (io->io_bio) {
14788 ++ if (ret == -ENOMEM &&
14789 ++ (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
14790 ++ gfp_flags = GFP_NOFS;
14791 ++ if (io->io_bio)
14792 + ext4_io_submit(io);
14793 +- congestion_wait(BLK_RW_ASYNC, HZ/50);
14794 +- }
14795 +- gfp_flags |= __GFP_NOFAIL;
14796 ++ else
14797 ++ gfp_flags |= __GFP_NOFAIL;
14798 ++ congestion_wait(BLK_RW_ASYNC, HZ/50);
14799 + goto retry_encrypt;
14800 + }
14801 + data_page = NULL;
14802 +diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
14803 +index 7a9cc64f5ca3..da348cf4ff56 100644
14804 +--- a/fs/f2fs/super.c
14805 ++++ b/fs/f2fs/super.c
14806 +@@ -1148,9 +1148,11 @@ static int f2fs_statfs_project(struct super_block *sb,
14807 + return PTR_ERR(dquot);
14808 + spin_lock(&dquot->dq_dqb_lock);
14809 +
14810 +- limit = (dquot->dq_dqb.dqb_bsoftlimit ?
14811 +- dquot->dq_dqb.dqb_bsoftlimit :
14812 +- dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
14813 ++ limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
14814 ++ dquot->dq_dqb.dqb_bhardlimit);
14815 ++ if (limit)
14816 ++ limit >>= sb->s_blocksize_bits;
14817 ++
14818 + if (limit && buf->f_blocks > limit) {
14819 + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
14820 + buf->f_blocks = limit;
14821 +@@ -1159,9 +1161,9 @@ static int f2fs_statfs_project(struct super_block *sb,
14822 + (buf->f_blocks - curblock) : 0;
14823 + }
14824 +
14825 +- limit = dquot->dq_dqb.dqb_isoftlimit ?
14826 +- dquot->dq_dqb.dqb_isoftlimit :
14827 +- dquot->dq_dqb.dqb_ihardlimit;
14828 ++ limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
14829 ++ dquot->dq_dqb.dqb_ihardlimit);
14830 ++
14831 + if (limit && buf->f_files > limit) {
14832 + buf->f_files = limit;
14833 + buf->f_ffree =
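The limit selection above relies on min_not_zero(), where a zero quota field means "no limit". Illustrative values (not taken from a real quota setup) showing the difference from the old soft-else-hard expression:

	u64 soft = 4000, hard = 1000, limit;

	limit = min_not_zero(soft, hard);	/* 1000; the old expression took soft = 4000 */
	soft = 0;
	limit = min_not_zero(soft, hard);	/* 1000 */
	hard = 0;
	limit = min_not_zero(soft, hard);	/* 0: no cap, so the s_blocksize_bits shift is skipped */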
14834 +diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
14835 +index 08369c6cd127..143e7d518c5d 100644
14836 +--- a/fs/gfs2/file.c
14837 ++++ b/fs/gfs2/file.c
14838 +@@ -780,7 +780,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
14839 + struct file *file = iocb->ki_filp;
14840 + struct inode *inode = file_inode(file);
14841 + struct gfs2_inode *ip = GFS2_I(inode);
14842 +- ssize_t written = 0, ret;
14843 ++ ssize_t ret;
14844 +
14845 + ret = gfs2_rsqa_alloc(ip);
14846 + if (ret)
14847 +@@ -800,68 +800,58 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
14848 + inode_lock(inode);
14849 + ret = generic_write_checks(iocb, from);
14850 + if (ret <= 0)
14851 +- goto out;
14852 +-
14853 +- /* We can write back this queue in page reclaim */
14854 +- current->backing_dev_info = inode_to_bdi(inode);
14855 ++ goto out_unlock;
14856 +
14857 + ret = file_remove_privs(file);
14858 + if (ret)
14859 +- goto out2;
14860 ++ goto out_unlock;
14861 +
14862 + ret = file_update_time(file);
14863 + if (ret)
14864 +- goto out2;
14865 ++ goto out_unlock;
14866 +
14867 + if (iocb->ki_flags & IOCB_DIRECT) {
14868 + struct address_space *mapping = file->f_mapping;
14869 +- loff_t pos, endbyte;
14870 +- ssize_t buffered;
14871 ++ ssize_t buffered, ret2;
14872 +
14873 +- written = gfs2_file_direct_write(iocb, from);
14874 +- if (written < 0 || !iov_iter_count(from))
14875 +- goto out2;
14876 ++ ret = gfs2_file_direct_write(iocb, from);
14877 ++ if (ret < 0 || !iov_iter_count(from))
14878 ++ goto out_unlock;
14879 +
14880 +- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
14881 +- if (unlikely(ret < 0))
14882 +- goto out2;
14883 +- buffered = ret;
14884 ++ iocb->ki_flags |= IOCB_DSYNC;
14885 ++ current->backing_dev_info = inode_to_bdi(inode);
14886 ++ buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
14887 ++ current->backing_dev_info = NULL;
14888 ++ if (unlikely(buffered <= 0))
14889 ++ goto out_unlock;
14890 +
14891 + /*
14892 + * We need to ensure that the page cache pages are written to
14893 + * disk and invalidated to preserve the expected O_DIRECT
14894 +- * semantics.
14895 ++ * semantics. If the writeback or invalidate fails, only report
14896 ++ * the direct I/O range as we don't know if the buffered pages
14897 ++ * made it to disk.
14898 + */
14899 +- pos = iocb->ki_pos;
14900 +- endbyte = pos + buffered - 1;
14901 +- ret = filemap_write_and_wait_range(mapping, pos, endbyte);
14902 +- if (!ret) {
14903 +- iocb->ki_pos += buffered;
14904 +- written += buffered;
14905 +- invalidate_mapping_pages(mapping,
14906 +- pos >> PAGE_SHIFT,
14907 +- endbyte >> PAGE_SHIFT);
14908 +- } else {
14909 +- /*
14910 +- * We don't know how much we wrote, so just return
14911 +- * the number of bytes which were direct-written
14912 +- */
14913 +- }
14914 ++ iocb->ki_pos += buffered;
14915 ++ ret2 = generic_write_sync(iocb, buffered);
14916 ++ invalidate_mapping_pages(mapping,
14917 ++ (iocb->ki_pos - buffered) >> PAGE_SHIFT,
14918 ++ (iocb->ki_pos - 1) >> PAGE_SHIFT);
14919 ++ if (!ret || ret2 > 0)
14920 ++ ret += ret2;
14921 + } else {
14922 ++ current->backing_dev_info = inode_to_bdi(inode);
14923 + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
14924 +- if (likely(ret > 0))
14925 ++ current->backing_dev_info = NULL;
14926 ++ if (likely(ret > 0)) {
14927 + iocb->ki_pos += ret;
14928 ++ ret = generic_write_sync(iocb, ret);
14929 ++ }
14930 + }
14931 +
14932 +-out2:
14933 +- current->backing_dev_info = NULL;
14934 +-out:
14935 ++out_unlock:
14936 + inode_unlock(inode);
14937 +- if (likely(ret > 0)) {
14938 +- /* Handle various SYNC-type writes */
14939 +- ret = generic_write_sync(iocb, ret);
14940 +- }
14941 +- return written ? written : ret;
14942 ++ return ret;
14943 + }
14944 +
14945 + static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
14946 +diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
14947 +index df390a69c49a..1a2339f2cb49 100644
14948 +--- a/fs/jbd2/journal.c
14949 ++++ b/fs/jbd2/journal.c
14950 +@@ -1002,6 +1002,7 @@ static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
14951 +
14952 + static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
14953 + {
14954 ++ (*pos)++;
14955 + return NULL;
14956 + }
14957 +
14958 +diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
14959 +index b8d686087952..4ae726e70d87 100644
14960 +--- a/fs/nfs/dir.c
14961 ++++ b/fs/nfs/dir.c
14962 +@@ -162,6 +162,17 @@ typedef struct {
14963 + bool eof;
14964 + } nfs_readdir_descriptor_t;
14965 +
14966 ++static
14967 ++void nfs_readdir_init_array(struct page *page)
14968 ++{
14969 ++ struct nfs_cache_array *array;
14970 ++
14971 ++ array = kmap_atomic(page);
14972 ++ memset(array, 0, sizeof(struct nfs_cache_array));
14973 ++ array->eof_index = -1;
14974 ++ kunmap_atomic(array);
14975 ++}
14976 ++
14977 + /*
14978 + * we are freeing strings created by nfs_add_to_readdir_array()
14979 + */
14980 +@@ -174,6 +185,7 @@ void nfs_readdir_clear_array(struct page *page)
14981 + array = kmap_atomic(page);
14982 + for (i = 0; i < array->size; i++)
14983 + kfree(array->array[i].string.name);
14984 ++ array->size = 0;
14985 + kunmap_atomic(array);
14986 + }
14987 +
14988 +@@ -610,6 +622,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
14989 + int status = -ENOMEM;
14990 + unsigned int array_size = ARRAY_SIZE(pages);
14991 +
14992 ++ nfs_readdir_init_array(page);
14993 ++
14994 + entry.prev_cookie = 0;
14995 + entry.cookie = desc->last_cookie;
14996 + entry.eof = 0;
14997 +@@ -626,8 +640,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
14998 + }
14999 +
15000 + array = kmap(page);
15001 +- memset(array, 0, sizeof(struct nfs_cache_array));
15002 +- array->eof_index = -1;
15003 +
15004 + status = nfs_readdir_alloc_pages(pages, array_size);
15005 + if (status < 0)
15006 +@@ -681,6 +693,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
15007 + unlock_page(page);
15008 + return 0;
15009 + error:
15010 ++ nfs_readdir_clear_array(page);
15011 + unlock_page(page);
15012 + return ret;
15013 + }
15014 +@@ -688,8 +701,6 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
15015 + static
15016 + void cache_page_release(nfs_readdir_descriptor_t *desc)
15017 + {
15018 +- if (!desc->page->mapping)
15019 +- nfs_readdir_clear_array(desc->page);
15020 + put_page(desc->page);
15021 + desc->page = NULL;
15022 + }
15023 +@@ -703,19 +714,28 @@ struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
15024 +
15025 + /*
15026 + * Returns 0 if desc->dir_cookie was found on page desc->page_index
15027 ++ * and locks the page to prevent removal from the page cache.
15028 + */
15029 + static
15030 +-int find_cache_page(nfs_readdir_descriptor_t *desc)
15031 ++int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc)
15032 + {
15033 + int res;
15034 +
15035 + desc->page = get_cache_page(desc);
15036 + if (IS_ERR(desc->page))
15037 + return PTR_ERR(desc->page);
15038 +-
15039 +- res = nfs_readdir_search_array(desc);
15040 ++ res = lock_page_killable(desc->page);
15041 + if (res != 0)
15042 +- cache_page_release(desc);
15043 ++ goto error;
15044 ++ res = -EAGAIN;
15045 ++ if (desc->page->mapping != NULL) {
15046 ++ res = nfs_readdir_search_array(desc);
15047 ++ if (res == 0)
15048 ++ return 0;
15049 ++ }
15050 ++ unlock_page(desc->page);
15051 ++error:
15052 ++ cache_page_release(desc);
15053 + return res;
15054 + }
15055 +
15056 +@@ -730,7 +750,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
15057 + desc->last_cookie = 0;
15058 + }
15059 + do {
15060 +- res = find_cache_page(desc);
15061 ++ res = find_and_lock_cache_page(desc);
15062 + } while (res == -EAGAIN);
15063 + return res;
15064 + }
15065 +@@ -769,7 +789,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
15066 + desc->eof = true;
15067 +
15068 + kunmap(desc->page);
15069 +- cache_page_release(desc);
15070 + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
15071 + (unsigned long long)*desc->dir_cookie, res);
15072 + return res;
15073 +@@ -815,13 +834,13 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
15074 +
15075 + status = nfs_do_filldir(desc);
15076 +
15077 ++ out_release:
15078 ++ nfs_readdir_clear_array(desc->page);
15079 ++ cache_page_release(desc);
15080 + out:
15081 + dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
15082 + __func__, status);
15083 + return status;
15084 +- out_release:
15085 +- cache_page_release(desc);
15086 +- goto out;
15087 + }
15088 +
15089 + /* The file offset position represents the dirent entry number. A
15090 +@@ -886,6 +905,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
15091 + break;
15092 +
15093 + res = nfs_do_filldir(desc);
15094 ++ unlock_page(desc->page);
15095 ++ cache_page_release(desc);
15096 + if (res < 0)
15097 + break;
15098 + } while (!desc->eof);
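find_and_lock_cache_page() above follows a common page-cache pattern: take the page lock, then re-check page->mapping to detect that the page was truncated from the cache while it was unlocked. A compact sketch of just that check (the helper name is made up; the calls are the same ones used above):

#include <linux/pagemap.h>

static int lock_and_revalidate(struct page *page)
{
	int err = lock_page_killable(page);

	if (err)
		return err;		/* interrupted by a fatal signal */
	if (!page->mapping) {
		/* removed from the page cache while we slept */
		unlock_page(page);
		return -EAGAIN;		/* caller repeats the lookup */
	}
	return 0;			/* locked and still attached */
}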
15099 +diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
15100 +index 2b36aa037ce0..f4cf1c0793c6 100644
15101 +--- a/fs/nfsd/nfs4layouts.c
15102 ++++ b/fs/nfsd/nfs4layouts.c
15103 +@@ -676,7 +676,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
15104 +
15105 + /* Client gets 2 lease periods to return it */
15106 + cutoff = ktime_add_ns(task->tk_start,
15107 +- nn->nfsd4_lease * NSEC_PER_SEC * 2);
15108 ++ (u64)nn->nfsd4_lease * NSEC_PER_SEC * 2);
15109 +
15110 + if (ktime_before(now, cutoff)) {
15111 + rpc_delay(task, HZ/100); /* 10 mili-seconds */
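The (u64) cast above matters on 32-bit builds, where the lease value and NSEC_PER_SEC are both 32-bit quantities and the product wraps before ktime_add_ns() ever sees it. Illustrative numbers for a 90 second lease:

	/* 90 * 1,000,000,000 * 2 = 180,000,000,000, which does not fit in
	 * 32 bits (max 4,294,967,295). */
	u32 lease = 90;
	u64 wrong = lease * NSEC_PER_SEC * 2;		/* wraps on 32-bit before being widened */
	u64 right = (u64)lease * NSEC_PER_SEC * 2;	/* 180000000000 on every architecture */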
15112 +diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
15113 +index c8ce128e0054..ed73e86194fa 100644
15114 +--- a/fs/nfsd/nfs4state.c
15115 ++++ b/fs/nfsd/nfs4state.c
15116 +@@ -6075,7 +6075,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
15117 + }
15118 +
15119 + if (fl_flags & FL_SLEEP) {
15120 +- nbl->nbl_time = jiffies;
15121 ++ nbl->nbl_time = get_seconds();
15122 + spin_lock(&nn->blocked_locks_lock);
15123 + list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
15124 + list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru);
15125 +diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
15126 +index 0f07ad6dc1ef..594fc1a8b672 100644
15127 +--- a/fs/nfsd/state.h
15128 ++++ b/fs/nfsd/state.h
15129 +@@ -592,7 +592,7 @@ static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b)
15130 + struct nfsd4_blocked_lock {
15131 + struct list_head nbl_list;
15132 + struct list_head nbl_lru;
15133 +- unsigned long nbl_time;
15134 ++ time_t nbl_time;
15135 + struct file_lock nbl_lock;
15136 + struct knfsd_fh nbl_fh;
15137 + struct nfsd4_callback nbl_cb;
15138 +diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
15139 +index 4fe8db314950..80cededcd10d 100644
15140 +--- a/fs/nfsd/vfs.c
15141 ++++ b/fs/nfsd/vfs.c
15142 +@@ -1016,6 +1016,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
15143 + host_err = vfs_iter_write(file, &iter, &pos, flags);
15144 + if (host_err < 0)
15145 + goto out_nfserr;
15146 ++ *cnt = host_err;
15147 + nfsdstats.io_write += *cnt;
15148 + fsnotify_modify(file);
15149 +
15150 +diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
15151 +index a3e077fcfeb9..fbd70111a2f1 100644
15152 +--- a/fs/ocfs2/file.c
15153 ++++ b/fs/ocfs2/file.c
15154 +@@ -2109,17 +2109,15 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
15155 + static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
15156 + struct buffer_head **di_bh,
15157 + int meta_level,
15158 +- int overwrite_io,
15159 + int write_sem,
15160 + int wait)
15161 + {
15162 + int ret = 0;
15163 +
15164 + if (wait)
15165 +- ret = ocfs2_inode_lock(inode, NULL, meta_level);
15166 ++ ret = ocfs2_inode_lock(inode, di_bh, meta_level);
15167 + else
15168 +- ret = ocfs2_try_inode_lock(inode,
15169 +- overwrite_io ? NULL : di_bh, meta_level);
15170 ++ ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
15171 + if (ret < 0)
15172 + goto out;
15173 +
15174 +@@ -2144,6 +2142,7 @@ static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
15175 +
15176 + out_unlock:
15177 + brelse(*di_bh);
15178 ++ *di_bh = NULL;
15179 + ocfs2_inode_unlock(inode, meta_level);
15180 + out:
15181 + return ret;
15182 +@@ -2186,7 +2185,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
15183 + ret = ocfs2_inode_lock_for_extent_tree(inode,
15184 + &di_bh,
15185 + meta_level,
15186 +- overwrite_io,
15187 + write_sem,
15188 + wait);
15189 + if (ret < 0) {
15190 +@@ -2244,13 +2242,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
15191 + &di_bh,
15192 + meta_level,
15193 + write_sem);
15194 ++ meta_level = 1;
15195 ++ write_sem = 1;
15196 + ret = ocfs2_inode_lock_for_extent_tree(inode,
15197 + &di_bh,
15198 + meta_level,
15199 +- overwrite_io,
15200 +- 1,
15201 ++ write_sem,
15202 + wait);
15203 +- write_sem = 1;
15204 + if (ret < 0) {
15205 + if (ret != -EAGAIN)
15206 + mlog_errno(ret);
15207 +diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
15208 +index 0bd276e4ccbe..fa5ac5de807c 100644
15209 +--- a/fs/overlayfs/file.c
15210 ++++ b/fs/overlayfs/file.c
15211 +@@ -149,7 +149,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
15212 + struct inode *inode = file_inode(file);
15213 + struct fd real;
15214 + const struct cred *old_cred;
15215 +- ssize_t ret;
15216 ++ loff_t ret;
15217 +
15218 + /*
15219 + * The two special cases below do not need to involve real fs,
15220 +diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
15221 +index cc8303a806b4..11b7941c5dbc 100644
15222 +--- a/fs/overlayfs/readdir.c
15223 ++++ b/fs/overlayfs/readdir.c
15224 +@@ -507,7 +507,13 @@ get:
15225 + if (err)
15226 + goto fail;
15227 +
15228 +- WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev);
15229 ++ /*
15230 ++ * Directory inode is always on overlay st_dev.
15231 ++ * Non-dir with ovl_same_dev() could be on pseudo st_dev in case
15232 ++ * of xino bits overflow.
15233 ++ */
15234 ++ WARN_ON_ONCE(S_ISDIR(stat.mode) &&
15235 ++ dir->d_sb->s_dev != stat.dev);
15236 + ino = stat.ino;
15237 + } else if (xinobits && !OVL_TYPE_UPPER(type)) {
15238 + ino = ovl_remap_lower_ino(ino, xinobits,
15239 +diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
15240 +index 5767b373a8ff..d7c0aa0626cd 100644
15241 +--- a/fs/ubifs/dir.c
15242 ++++ b/fs/ubifs/dir.c
15243 +@@ -242,6 +242,8 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
15244 + if (nm.hash) {
15245 + ubifs_assert(c, fname_len(&nm) == 0);
15246 + ubifs_assert(c, fname_name(&nm) == NULL);
15247 ++ if (nm.hash & ~UBIFS_S_KEY_HASH_MASK)
15248 ++ goto done; /* ENOENT */
15249 + dent_key_init_hash(c, &key, dir->i_ino, nm.hash);
15250 + err = ubifs_tnc_lookup_dh(c, &key, dent, nm.minor_hash);
15251 + } else {
15252 +diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
15253 +index 1b78f2e09218..65b4f63349c7 100644
15254 +--- a/fs/ubifs/file.c
15255 ++++ b/fs/ubifs/file.c
15256 +@@ -798,7 +798,9 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
15257 +
15258 + if (page_offset > end_index)
15259 + break;
15260 +- page = find_or_create_page(mapping, page_offset, ra_gfp_mask);
15261 ++ page = pagecache_get_page(mapping, page_offset,
15262 ++ FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT,
15263 ++ ra_gfp_mask);
15264 + if (!page)
15265 + break;
15266 + if (!PageUptodate(page))
15267 +diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
15268 +index 0164bcc827f8..e666459f63c7 100644
15269 +--- a/fs/ubifs/ioctl.c
15270 ++++ b/fs/ubifs/ioctl.c
15271 +@@ -28,6 +28,11 @@
15272 + #include <linux/mount.h>
15273 + #include "ubifs.h"
15274 +
15275 ++/* Need to be kept consistent with checked flags in ioctl2ubifs() */
15276 ++#define UBIFS_SUPPORTED_IOCTL_FLAGS \
15277 ++ (FS_COMPR_FL | FS_SYNC_FL | FS_APPEND_FL | \
15278 ++ FS_IMMUTABLE_FL | FS_DIRSYNC_FL)
15279 ++
15280 + /**
15281 + * ubifs_set_inode_flags - set VFS inode flags.
15282 + * @inode: VFS inode to set flags for
15283 +@@ -127,7 +132,8 @@ static int setflags(struct inode *inode, int flags)
15284 + }
15285 + }
15286 +
15287 +- ui->flags = ioctl2ubifs(flags);
15288 ++ ui->flags &= ~ioctl2ubifs(UBIFS_SUPPORTED_IOCTL_FLAGS);
15289 ++ ui->flags |= ioctl2ubifs(flags);
15290 + ubifs_set_inode_flags(inode);
15291 + inode->i_ctime = current_time(inode);
15292 + release = ui->dirty;
15293 +@@ -169,6 +175,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
15294 + if (get_user(flags, (int __user *) arg))
15295 + return -EFAULT;
15296 +
15297 ++ if (flags & ~UBIFS_SUPPORTED_IOCTL_FLAGS)
15298 ++ return -EOPNOTSUPP;
15299 ++
15300 + if (!S_ISDIR(inode->i_mode))
15301 + flags &= ~FS_DIRSYNC_FL;
15302 +
15303 +diff --git a/fs/udf/super.c b/fs/udf/super.c
15304 +index 7af011dc9ae8..6fd0f14e9dd2 100644
15305 +--- a/fs/udf/super.c
15306 ++++ b/fs/udf/super.c
15307 +@@ -999,7 +999,6 @@ static int check_partition_desc(struct super_block *sb,
15308 + switch (le32_to_cpu(p->accessType)) {
15309 + case PD_ACCESS_TYPE_READ_ONLY:
15310 + case PD_ACCESS_TYPE_WRITE_ONCE:
15311 +- case PD_ACCESS_TYPE_REWRITABLE:
15312 + case PD_ACCESS_TYPE_NONE:
15313 + goto force_ro;
15314 + }
15315 +diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
15316 +index ffcc7724ca21..dc4fd8a6644d 100644
15317 +--- a/include/linux/eventfd.h
15318 ++++ b/include/linux/eventfd.h
15319 +@@ -12,6 +12,8 @@
15320 + #include <linux/fcntl.h>
15321 + #include <linux/wait.h>
15322 + #include <linux/err.h>
15323 ++#include <linux/percpu-defs.h>
15324 ++#include <linux/percpu.h>
15325 +
15326 + /*
15327 + * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
15328 +@@ -40,6 +42,13 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
15329 + int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
15330 + __u64 *cnt);
15331 +
15332 ++DECLARE_PER_CPU(int, eventfd_wake_count);
15333 ++
15334 ++static inline bool eventfd_signal_count(void)
15335 ++{
15336 ++ return this_cpu_read(eventfd_wake_count);
15337 ++}
15338 ++
15339 + #else /* CONFIG_EVENTFD */
15340 +
15341 + /*
15342 +@@ -68,6 +77,11 @@ static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
15343 + return -ENOSYS;
15344 + }
15345 +
15346 ++static inline bool eventfd_signal_count(void)
15347 ++{
15348 ++ return false;
15349 ++}
15350 ++
15351 + #endif
15352 +
15353 + #endif /* _LINUX_EVENTFD_H */
15354 +diff --git a/include/linux/irq.h b/include/linux/irq.h
15355 +index c9bffda04a45..6ecaf056ab63 100644
15356 +--- a/include/linux/irq.h
15357 ++++ b/include/linux/irq.h
15358 +@@ -208,6 +208,8 @@ struct irq_data {
15359 + * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target
15360 + * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set
15361 + * IRQD_CAN_RESERVE - Can use reservation mode
15362 ++ * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change
15363 ++ * required
15364 + */
15365 + enum {
15366 + IRQD_TRIGGER_MASK = 0xf,
15367 +@@ -230,6 +232,7 @@ enum {
15368 + IRQD_SINGLE_TARGET = (1 << 24),
15369 + IRQD_DEFAULT_TRIGGER_SET = (1 << 25),
15370 + IRQD_CAN_RESERVE = (1 << 26),
15371 ++ IRQD_MSI_NOMASK_QUIRK = (1 << 27),
15372 + };
15373 +
15374 + #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
15375 +@@ -389,6 +392,21 @@ static inline bool irqd_can_reserve(struct irq_data *d)
15376 + return __irqd_to_state(d) & IRQD_CAN_RESERVE;
15377 + }
15378 +
15379 ++static inline void irqd_set_msi_nomask_quirk(struct irq_data *d)
15380 ++{
15381 ++ __irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK;
15382 ++}
15383 ++
15384 ++static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d)
15385 ++{
15386 ++ __irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK;
15387 ++}
15388 ++
15389 ++static inline bool irqd_msi_nomask_quirk(struct irq_data *d)
15390 ++{
15391 ++ return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK;
15392 ++}
15393 ++
15394 + #undef __irqd_to_state
15395 +
15396 + static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
15397 +diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
15398 +index dccfa65aee96..8301f1df0682 100644
15399 +--- a/include/linux/irqdomain.h
15400 ++++ b/include/linux/irqdomain.h
15401 +@@ -202,6 +202,13 @@ enum {
15402 + /* Irq domain implements MSI remapping */
15403 + IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5),
15404 +
15405 ++ /*
15406 ++ * Quirk to handle MSI implementations which do not provide
15407 ++ * masking. Currently known to affect x86, but partially
15408 ++ * handled in core code.
15409 ++ */
15410 ++ IRQ_DOMAIN_MSI_NOMASK_QUIRK = (1 << 6),
15411 ++
15412 + /*
15413 + * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
15414 + * for implementation specific purposes and ignored by the
15415 +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
15416 +index 748016ae01e3..0f99ecc01bc7 100644
15417 +--- a/include/linux/kvm_host.h
15418 ++++ b/include/linux/kvm_host.h
15419 +@@ -186,7 +186,7 @@ struct kvm_async_pf {
15420 + struct list_head queue;
15421 + struct kvm_vcpu *vcpu;
15422 + struct mm_struct *mm;
15423 +- gva_t gva;
15424 ++ gpa_t cr2_or_gpa;
15425 + unsigned long addr;
15426 + struct kvm_arch_async_pf arch;
15427 + bool wakeup_all;
15428 +@@ -194,8 +194,8 @@ struct kvm_async_pf {
15429 +
15430 + void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
15431 + void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
15432 +-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
15433 +- struct kvm_arch_async_pf *arch);
15434 ++int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
15435 ++ unsigned long hva, struct kvm_arch_async_pf *arch);
15436 + int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
15437 + #endif
15438 +
15439 +@@ -704,7 +704,7 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
15440 + int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
15441 + struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
15442 + bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
15443 +-unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
15444 ++unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
15445 + void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
15446 +
15447 + struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
15448 +diff --git a/include/linux/memblock.h b/include/linux/memblock.h
15449 +index 516920549378..2acdd046df2d 100644
15450 +--- a/include/linux/memblock.h
15451 ++++ b/include/linux/memblock.h
15452 +@@ -265,21 +265,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
15453 + for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
15454 + nid, flags, p_start, p_end, p_nid)
15455 +
15456 +-/**
15457 +- * for_each_resv_unavail_range - iterate through reserved and unavailable memory
15458 +- * @i: u64 used as loop variable
15459 +- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
15460 +- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
15461 +- *
15462 +- * Walks over unavailable but reserved (reserved && !memory) areas of memblock.
15463 +- * Available as soon as memblock is initialized.
15464 +- * Note: because this memory does not belong to any physical node, flags and
15465 +- * nid arguments do not make sense and thus not exported as arguments.
15466 +- */
15467 +-#define for_each_resv_unavail_range(i, p_start, p_end) \
15468 +- for_each_mem_range(i, &memblock.reserved, &memblock.memory, \
15469 +- NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL)
15470 +-
15471 + static inline void memblock_set_region_flags(struct memblock_region *r,
15472 + enum memblock_flags flags)
15473 + {
15474 +diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
15475 +index 2d2096ba1cfe..90b8ce813fa6 100644
15476 +--- a/include/linux/percpu-defs.h
15477 ++++ b/include/linux/percpu-defs.h
15478 +@@ -176,8 +176,7 @@
15479 + * Declaration/definition used for per-CPU variables that should be accessed
15480 + * as decrypted when memory encryption is enabled in the guest.
15481 + */
15482 +-#if defined(CONFIG_VIRTUALIZATION) && defined(CONFIG_AMD_MEM_ENCRYPT)
15483 +-
15484 ++#ifdef CONFIG_AMD_MEM_ENCRYPT
15485 + #define DECLARE_PER_CPU_DECRYPTED(type, name) \
15486 + DECLARE_PER_CPU_SECTION(type, name, "..decrypted")
15487 +
15488 +diff --git a/include/media/v4l2-rect.h b/include/media/v4l2-rect.h
15489 +index 595c3ba05f23..59745e5feb4d 100644
15490 +--- a/include/media/v4l2-rect.h
15491 ++++ b/include/media/v4l2-rect.h
15492 +@@ -63,10 +63,10 @@ static inline void v4l2_rect_map_inside(struct v4l2_rect *r,
15493 + r->left = boundary->left;
15494 + if (r->top < boundary->top)
15495 + r->top = boundary->top;
15496 +- if (r->left + r->width > boundary->width)
15497 +- r->left = boundary->width - r->width;
15498 +- if (r->top + r->height > boundary->height)
15499 +- r->top = boundary->height - r->height;
15500 ++ if (r->left + r->width > boundary->left + boundary->width)
15501 ++ r->left = boundary->left + boundary->width - r->width;
15502 ++ if (r->top + r->height > boundary->top + boundary->height)
15503 ++ r->top = boundary->top + boundary->height - r->height;
15504 + }
15505 +
15506 + /**
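The clamping fix above only shows up when the boundary rectangle does not start at the origin. A worked example with illustrative values:

#include <media/v4l2-rect.h>

static void demo_map_inside(void)
{
	struct v4l2_rect boundary = { .left = 100, .top = 0, .width = 100, .height = 100 };
	struct v4l2_rect r        = { .left = 150, .top = 10, .width = 40,  .height = 20 };

	v4l2_rect_map_inside(&r, &boundary);
	/*
	 * r already lies inside [100, 200) x [0, 100).
	 * Old test: r.left + r.width = 190 > boundary.width = 100, so r.left was
	 *           "corrected" to 100 - 40 = 60, pushing r left of the boundary.
	 * New test: 190 <= boundary.left + boundary.width = 200, so r is untouched.
	 */
}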
15507 +diff --git a/include/net/ipx.h b/include/net/ipx.h
15508 +index baf090390998..9d1342807b59 100644
15509 +--- a/include/net/ipx.h
15510 ++++ b/include/net/ipx.h
15511 +@@ -47,11 +47,6 @@ struct ipxhdr {
15512 + /* From af_ipx.c */
15513 + extern int sysctl_ipx_pprop_broadcasting;
15514 +
15515 +-static __inline__ struct ipxhdr *ipx_hdr(struct sk_buff *skb)
15516 +-{
15517 +- return (struct ipxhdr *)skb_transport_header(skb);
15518 +-}
15519 +-
15520 + struct ipx_interface {
15521 + /* IPX address */
15522 + __be32 if_netnum;
15523 +diff --git a/ipc/msg.c b/ipc/msg.c
15524 +index 883642cf2b27..ac4de3f67261 100644
15525 +--- a/ipc/msg.c
15526 ++++ b/ipc/msg.c
15527 +@@ -377,7 +377,7 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)
15528 + * NOTE: no locks must be held, the rwsem is taken inside this function.
15529 + */
15530 + static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
15531 +- struct msqid64_ds *msqid64)
15532 ++ struct ipc64_perm *perm, int msg_qbytes)
15533 + {
15534 + struct kern_ipc_perm *ipcp;
15535 + struct msg_queue *msq;
15536 +@@ -387,7 +387,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
15537 + rcu_read_lock();
15538 +
15539 + ipcp = ipcctl_obtain_check(ns, &msg_ids(ns), msqid, cmd,
15540 +- &msqid64->msg_perm, msqid64->msg_qbytes);
15541 ++ perm, msg_qbytes);
15542 + if (IS_ERR(ipcp)) {
15543 + err = PTR_ERR(ipcp);
15544 + goto out_unlock1;
15545 +@@ -409,18 +409,18 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
15546 + {
15547 + DEFINE_WAKE_Q(wake_q);
15548 +
15549 +- if (msqid64->msg_qbytes > ns->msg_ctlmnb &&
15550 ++ if (msg_qbytes > ns->msg_ctlmnb &&
15551 + !capable(CAP_SYS_RESOURCE)) {
15552 + err = -EPERM;
15553 + goto out_unlock1;
15554 + }
15555 +
15556 + ipc_lock_object(&msq->q_perm);
15557 +- err = ipc_update_perm(&msqid64->msg_perm, ipcp);
15558 ++ err = ipc_update_perm(perm, ipcp);
15559 + if (err)
15560 + goto out_unlock0;
15561 +
15562 +- msq->q_qbytes = msqid64->msg_qbytes;
15563 ++ msq->q_qbytes = msg_qbytes;
15564 +
15565 + msq->q_ctime = ktime_get_real_seconds();
15566 + /*
15567 +@@ -603,9 +603,10 @@ long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf)
15568 + case IPC_SET:
15569 + if (copy_msqid_from_user(&msqid64, buf, version))
15570 + return -EFAULT;
15571 +- /* fallthru */
15572 ++ return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm,
15573 ++ msqid64.msg_qbytes);
15574 + case IPC_RMID:
15575 +- return msgctl_down(ns, msqid, cmd, &msqid64);
15576 ++ return msgctl_down(ns, msqid, cmd, NULL, 0);
15577 + default:
15578 + return -EINVAL;
15579 + }
15580 +@@ -724,9 +725,9 @@ long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr)
15581 + case IPC_SET:
15582 + if (copy_compat_msqid_from_user(&msqid64, uptr, version))
15583 + return -EFAULT;
15584 +- /* fallthru */
15585 ++ return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes);
15586 + case IPC_RMID:
15587 +- return msgctl_down(ns, msqid, cmd, &msqid64);
15588 ++ return msgctl_down(ns, msqid, cmd, NULL, 0);
15589 + default:
15590 + return -EINVAL;
15591 + }
15592 +diff --git a/kernel/events/core.c b/kernel/events/core.c
15593 +index 16af86ab24c4..8c70ee23fbe9 100644
15594 +--- a/kernel/events/core.c
15595 ++++ b/kernel/events/core.c
15596 +@@ -5709,7 +5709,15 @@ accounting:
15597 + */
15598 + user_lock_limit *= num_online_cpus();
15599 +
15600 +- user_locked = atomic_long_read(&user->locked_vm) + user_extra;
15601 ++ user_locked = atomic_long_read(&user->locked_vm);
15602 ++
15603 ++ /*
15604 ++ * sysctl_perf_event_mlock may have changed, so that
15605 ++ * user->locked_vm > user_lock_limit
15606 ++ */
15607 ++ if (user_locked > user_lock_limit)
15608 ++ user_locked = user_lock_limit;
15609 ++ user_locked += user_extra;
15610 +
15611 + if (user_locked > user_lock_limit)
15612 + extra = user_locked - user_lock_limit;
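A numeric sketch of what the clamp above changes (page counts are illustrative): assume perf_event_mlock_kb was lowered after earlier mappings, so user->locked_vm = 1024 pages while user_lock_limit is now 512, and the new buffer needs user_extra = 16 pages.

	/*
	 * Without the clamp: user_locked = 1024 + 16 = 1040
	 *                    extra       = 1040 - 512 = 528
	 * i.e. 528 pages are pushed into the separate pinned-memory check below,
	 * although only 16 new pages are actually being mapped.
	 *
	 * With the clamp:    user_locked = min(1024, 512) + 16 = 528
	 *                    extra       = 528 - 512 = 16
	 * so only the pages of this mapping are charged there.
	 */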
15613 +diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
15614 +index 6f636136cccc..b3f55dd581b0 100644
15615 +--- a/kernel/irq/debugfs.c
15616 ++++ b/kernel/irq/debugfs.c
15617 +@@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = {
15618 + BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
15619 + BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
15620 + BIT_MASK_DESCR(IRQD_CAN_RESERVE),
15621 ++ BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
15622 +
15623 + BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
15624 +
15625 +diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
15626 +index 6e8520a81dd8..e0eda2bd3975 100644
15627 +--- a/kernel/irq/irqdomain.c
15628 ++++ b/kernel/irq/irqdomain.c
15629 +@@ -1421,6 +1421,7 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
15630 + if (rv) {
15631 + /* Restore the original irq_data. */
15632 + *root_irq_data = *child_irq_data;
15633 ++ kfree(child_irq_data);
15634 + goto error;
15635 + }
15636 +
15637 +diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
15638 +index 4ca2fd46645d..dc1186ce3ecd 100644
15639 +--- a/kernel/irq/msi.c
15640 ++++ b/kernel/irq/msi.c
15641 +@@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
15642 + continue;
15643 +
15644 + irq_data = irq_domain_get_irq_data(domain, desc->irq);
15645 +- if (!can_reserve)
15646 ++ if (!can_reserve) {
15647 + irqd_clr_can_reserve(irq_data);
15648 ++ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
15649 ++ irqd_set_msi_nomask_quirk(irq_data);
15650 ++ }
15651 + ret = irq_domain_activate_irq(irq_data, can_reserve);
15652 + if (ret)
15653 + goto cleanup;
15654 +diff --git a/kernel/module.c b/kernel/module.c
15655 +index d3aaec62c142..70a75a7216ab 100644
15656 +--- a/kernel/module.c
15657 ++++ b/kernel/module.c
15658 +@@ -1729,6 +1729,8 @@ static int module_add_modinfo_attrs(struct module *mod)
15659 + error_out:
15660 + if (i > 0)
15661 + module_remove_modinfo_attrs(mod, --i);
15662 ++ else
15663 ++ kfree(mod->modinfo_attrs);
15664 + return error;
15665 + }
15666 +
15667 +diff --git a/kernel/padata.c b/kernel/padata.c
15668 +index 6c06b3039fae..11c5f9c8779e 100644
15669 +--- a/kernel/padata.c
15670 ++++ b/kernel/padata.c
15671 +@@ -35,6 +35,8 @@
15672 +
15673 + #define MAX_OBJ_NUM 1000
15674 +
15675 ++static void padata_free_pd(struct parallel_data *pd);
15676 ++
15677 + static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
15678 + {
15679 + int cpu, target_cpu;
15680 +@@ -334,6 +336,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
15681 + struct padata_serial_queue *squeue;
15682 + struct parallel_data *pd;
15683 + LIST_HEAD(local_list);
15684 ++ int cnt;
15685 +
15686 + local_bh_disable();
15687 + squeue = container_of(serial_work, struct padata_serial_queue, work);
15688 +@@ -343,6 +346,8 @@ static void padata_serial_worker(struct work_struct *serial_work)
15689 + list_replace_init(&squeue->serial.list, &local_list);
15690 + spin_unlock(&squeue->serial.lock);
15691 +
15692 ++ cnt = 0;
15693 ++
15694 + while (!list_empty(&local_list)) {
15695 + struct padata_priv *padata;
15696 +
15697 +@@ -352,9 +357,12 @@ static void padata_serial_worker(struct work_struct *serial_work)
15698 + list_del_init(&padata->list);
15699 +
15700 + padata->serial(padata);
15701 +- atomic_dec(&pd->refcnt);
15702 ++ cnt++;
15703 + }
15704 + local_bh_enable();
15705 ++
15706 ++ if (atomic_sub_and_test(cnt, &pd->refcnt))
15707 ++ padata_free_pd(pd);
15708 + }
15709 +
15710 + /**
15711 +@@ -501,8 +509,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
15712 + timer_setup(&pd->timer, padata_reorder_timer, 0);
15713 + atomic_set(&pd->seq_nr, -1);
15714 + atomic_set(&pd->reorder_objects, 0);
15715 +- atomic_set(&pd->refcnt, 0);
15716 +- pd->pinst = pinst;
15717 ++ atomic_set(&pd->refcnt, 1);
15718 + spin_lock_init(&pd->lock);
15719 +
15720 + return pd;
15721 +@@ -526,31 +533,6 @@ static void padata_free_pd(struct parallel_data *pd)
15722 + kfree(pd);
15723 + }
15724 +
15725 +-/* Flush all objects out of the padata queues. */
15726 +-static void padata_flush_queues(struct parallel_data *pd)
15727 +-{
15728 +- int cpu;
15729 +- struct padata_parallel_queue *pqueue;
15730 +- struct padata_serial_queue *squeue;
15731 +-
15732 +- for_each_cpu(cpu, pd->cpumask.pcpu) {
15733 +- pqueue = per_cpu_ptr(pd->pqueue, cpu);
15734 +- flush_work(&pqueue->work);
15735 +- }
15736 +-
15737 +- del_timer_sync(&pd->timer);
15738 +-
15739 +- if (atomic_read(&pd->reorder_objects))
15740 +- padata_reorder(pd);
15741 +-
15742 +- for_each_cpu(cpu, pd->cpumask.cbcpu) {
15743 +- squeue = per_cpu_ptr(pd->squeue, cpu);
15744 +- flush_work(&squeue->work);
15745 +- }
15746 +-
15747 +- BUG_ON(atomic_read(&pd->refcnt) != 0);
15748 +-}
15749 +-
15750 + static void __padata_start(struct padata_instance *pinst)
15751 + {
15752 + pinst->flags |= PADATA_INIT;
15753 +@@ -564,10 +546,6 @@ static void __padata_stop(struct padata_instance *pinst)
15754 + pinst->flags &= ~PADATA_INIT;
15755 +
15756 + synchronize_rcu();
15757 +-
15758 +- get_online_cpus();
15759 +- padata_flush_queues(pinst->pd);
15760 +- put_online_cpus();
15761 + }
15762 +
15763 + /* Replace the internal control structure with a new one. */
15764 +@@ -588,8 +566,8 @@ static void padata_replace(struct padata_instance *pinst,
15765 + if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
15766 + notification_mask |= PADATA_CPU_SERIAL;
15767 +
15768 +- padata_flush_queues(pd_old);
15769 +- padata_free_pd(pd_old);
15770 ++ if (atomic_dec_and_test(&pd_old->refcnt))
15771 ++ padata_free_pd(pd_old);
15772 +
15773 + if (notification_mask)
15774 + blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
15775 +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
15776 +index 845efadaf7ec..7a2fdc097c8c 100644
15777 +--- a/kernel/printk/printk.c
15778 ++++ b/kernel/printk/printk.c
15779 +@@ -2717,8 +2717,6 @@ void register_console(struct console *newcon)
15780 + * for us.
15781 + */
15782 + logbuf_lock_irqsave(flags);
15783 +- console_seq = syslog_seq;
15784 +- console_idx = syslog_idx;
15785 + /*
15786 + * We're about to replay the log buffer. Only do this to the
15787 + * just-registered console to avoid excessive message spam to
15788 +@@ -2730,6 +2728,8 @@ void register_console(struct console *newcon)
15789 + */
15790 + exclusive_console = newcon;
15791 + exclusive_console_stop_seq = console_seq;
15792 ++ console_seq = syslog_seq;
15793 ++ console_idx = syslog_idx;
15794 + logbuf_unlock_irqrestore(flags);
15795 + }
15796 + console_unlock();
15797 +diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
15798 +index a97c20ea9bce..5f6de49dc78e 100644
15799 +--- a/kernel/rcu/tree_plugin.h
15800 ++++ b/kernel/rcu/tree_plugin.h
15801 +@@ -267,7 +267,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
15802 + * blocked tasks.
15803 + */
15804 + if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
15805 +- rnp->gp_tasks = &t->rcu_node_entry;
15806 ++ WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
15807 + WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
15808 + }
15809 + if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
15810 +@@ -392,7 +392,7 @@ static void rcu_preempt_note_context_switch(bool preempt)
15811 + */
15812 + static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
15813 + {
15814 +- return rnp->gp_tasks != NULL;
15815 ++ return READ_ONCE(rnp->gp_tasks) != NULL;
15816 + }
15817 +
15818 + /*
15819 +@@ -557,7 +557,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
15820 + trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
15821 + rnp->gp_seq, t->pid);
15822 + if (&t->rcu_node_entry == rnp->gp_tasks)
15823 +- rnp->gp_tasks = np;
15824 ++ WRITE_ONCE(rnp->gp_tasks, np);
15825 + if (&t->rcu_node_entry == rnp->exp_tasks)
15826 + rnp->exp_tasks = np;
15827 + if (IS_ENABLED(CONFIG_RCU_BOOST)) {
15828 +@@ -716,7 +716,7 @@ rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp)
15829 + dump_blkd_tasks(rsp, rnp, 10);
15830 + if (rcu_preempt_has_tasks(rnp) &&
15831 + (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
15832 +- rnp->gp_tasks = rnp->blkd_tasks.next;
15833 ++ WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
15834 + t = container_of(rnp->gp_tasks, struct task_struct,
15835 + rcu_node_entry);
15836 + trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
15837 +@@ -883,7 +883,8 @@ dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck)
15838 + pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
15839 + __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
15840 + pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
15841 +- __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks);
15842 ++ __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks,
15843 ++ rnp->exp_tasks);
15844 + pr_info("%s: ->blkd_tasks", __func__);
15845 + i = 0;
15846 + list_for_each(lhp, &rnp->blkd_tasks) {
15847 +diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
15848 +index f4255a65c44b..9eece67f29f3 100644
15849 +--- a/kernel/time/alarmtimer.c
15850 ++++ b/kernel/time/alarmtimer.c
15851 +@@ -91,6 +91,7 @@ static int alarmtimer_rtc_add_device(struct device *dev,
15852 + unsigned long flags;
15853 + struct rtc_device *rtc = to_rtc_device(dev);
15854 + struct wakeup_source *__ws;
15855 ++ int ret = 0;
15856 +
15857 + if (rtcdev)
15858 + return -EBUSY;
15859 +@@ -105,8 +106,8 @@ static int alarmtimer_rtc_add_device(struct device *dev,
15860 + spin_lock_irqsave(&rtcdev_lock, flags);
15861 + if (!rtcdev) {
15862 + if (!try_module_get(rtc->owner)) {
15863 +- spin_unlock_irqrestore(&rtcdev_lock, flags);
15864 +- return -1;
15865 ++ ret = -1;
15866 ++ goto unlock;
15867 + }
15868 +
15869 + rtcdev = rtc;
15870 +@@ -115,11 +116,12 @@ static int alarmtimer_rtc_add_device(struct device *dev,
15871 + ws = __ws;
15872 + __ws = NULL;
15873 + }
15874 ++unlock:
15875 + spin_unlock_irqrestore(&rtcdev_lock, flags);
15876 +
15877 + wakeup_source_unregister(__ws);
15878 +
15879 +- return 0;
15880 ++ return ret;
15881 + }
15882 +
15883 + static inline void alarmtimer_rtc_timer_init(void)
15884 +diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
15885 +index 0e6e97a01942..f80bb104c41a 100644
15886 +--- a/kernel/time/clocksource.c
15887 ++++ b/kernel/time/clocksource.c
15888 +@@ -311,8 +311,15 @@ static void clocksource_watchdog(struct timer_list *unused)
15889 + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
15890 + if (next_cpu >= nr_cpu_ids)
15891 + next_cpu = cpumask_first(cpu_online_mask);
15892 +- watchdog_timer.expires += WATCHDOG_INTERVAL;
15893 +- add_timer_on(&watchdog_timer, next_cpu);
15894 ++
15895 ++ /*
15896 ++ * Arm timer if not already pending: could race with concurrent
15897 ++ * pair clocksource_stop_watchdog() clocksource_start_watchdog().
15898 ++ */
15899 ++ if (!timer_pending(&watchdog_timer)) {
15900 ++ watchdog_timer.expires += WATCHDOG_INTERVAL;
15901 ++ add_timer_on(&watchdog_timer, next_cpu);
15902 ++ }
15903 + out:
15904 + spin_unlock(&watchdog_lock);
15905 + }
15906 +diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
15907 +index 37a435bac161..53795237e975 100644
15908 +--- a/kernel/trace/ftrace.c
15909 ++++ b/kernel/trace/ftrace.c
15910 +@@ -5072,8 +5072,8 @@ static const struct file_operations ftrace_notrace_fops = {
15911 +
15912 + static DEFINE_MUTEX(graph_lock);
15913 +
15914 +-struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH;
15915 +-struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH;
15916 ++struct ftrace_hash __rcu *ftrace_graph_hash = EMPTY_HASH;
15917 ++struct ftrace_hash __rcu *ftrace_graph_notrace_hash = EMPTY_HASH;
15918 +
15919 + enum graph_filter_type {
15920 + GRAPH_FILTER_NOTRACE = 0,
15921 +@@ -5344,8 +5344,15 @@ ftrace_graph_release(struct inode *inode, struct file *file)
15922 +
15923 + mutex_unlock(&graph_lock);
15924 +
15925 +- /* Wait till all users are no longer using the old hash */
15926 +- synchronize_sched();
15927 ++ /*
15928 ++ * We need to do a hard force of sched synchronization.
15929 ++ * This is because we use preempt_disable() to do RCU, but
15930 ++ * the function tracers can be called where RCU is not watching
15931 ++ * (like before user_exit()). We can not rely on the RCU
15932 ++ * infrastructure to do the synchronization, thus we must do it
15933 ++ * ourselves.
15934 ++ */
15935 ++ schedule_on_each_cpu(ftrace_sync);
15936 +
15937 + free_ftrace_hash(old_hash);
15938 + }
15939 +diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
15940 +index d11d7bfc3fa5..ee0c6a313ed1 100644
15941 +--- a/kernel/trace/trace.h
15942 ++++ b/kernel/trace/trace.h
15943 +@@ -872,22 +872,31 @@ extern void __trace_graph_return(struct trace_array *tr,
15944 + unsigned long flags, int pc);
15945 +
15946 + #ifdef CONFIG_DYNAMIC_FTRACE
15947 +-extern struct ftrace_hash *ftrace_graph_hash;
15948 +-extern struct ftrace_hash *ftrace_graph_notrace_hash;
15949 ++extern struct ftrace_hash __rcu *ftrace_graph_hash;
15950 ++extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
15951 +
15952 + static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
15953 + {
15954 + unsigned long addr = trace->func;
15955 + int ret = 0;
15956 ++ struct ftrace_hash *hash;
15957 +
15958 + preempt_disable_notrace();
15959 +
15960 +- if (ftrace_hash_empty(ftrace_graph_hash)) {
15961 ++ /*
15962 ++ * Have to open code "rcu_dereference_sched()" because the
15963 ++ * function graph tracer can be called when RCU is not
15964 ++ * "watching".
15965 ++ * Protected with schedule_on_each_cpu(ftrace_sync)
15966 ++ */
15967 ++ hash = rcu_dereference_protected(ftrace_graph_hash, !preemptible());
15968 ++
15969 ++ if (ftrace_hash_empty(hash)) {
15970 + ret = 1;
15971 + goto out;
15972 + }
15973 +
15974 +- if (ftrace_lookup_ip(ftrace_graph_hash, addr)) {
15975 ++ if (ftrace_lookup_ip(hash, addr)) {
15976 +
15977 + /*
15978 + * This needs to be cleared on the return functions
15979 +@@ -923,10 +932,20 @@ static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
15980 + static inline int ftrace_graph_notrace_addr(unsigned long addr)
15981 + {
15982 + int ret = 0;
15983 ++ struct ftrace_hash *notrace_hash;
15984 +
15985 + preempt_disable_notrace();
15986 +
15987 +- if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr))
15988 ++ /*
15989 ++ * Have to open code "rcu_dereference_sched()" because the
15990 ++ * function graph tracer can be called when RCU is not
15991 ++ * "watching".
15992 ++ * Protected with schedule_on_each_cpu(ftrace_sync)
15993 ++ */
15994 ++ notrace_hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
15995 ++ !preemptible());
15996 ++
15997 ++ if (ftrace_lookup_ip(notrace_hash, addr))
15998 + ret = 1;
15999 +
16000 + preempt_enable_notrace();
16001 +diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
16002 +index e288168661e1..e304196d7c28 100644
16003 +--- a/kernel/trace/trace_sched_switch.c
16004 ++++ b/kernel/trace/trace_sched_switch.c
16005 +@@ -89,8 +89,10 @@ static void tracing_sched_unregister(void)
16006 +
16007 + static void tracing_start_sched_switch(int ops)
16008 + {
16009 +- bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
16010 ++ bool sched_register;
16011 ++
16012 + mutex_lock(&sched_register_mutex);
16013 ++ sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
16014 +
16015 + switch (ops) {
16016 + case RECORD_CMDLINE:
16017 +diff --git a/lib/test_kasan.c b/lib/test_kasan.c
16018 +index ec657105edbf..bd0e067c4895 100644
16019 +--- a/lib/test_kasan.c
16020 ++++ b/lib/test_kasan.c
16021 +@@ -157,6 +157,7 @@ static noinline void __init kmalloc_oob_krealloc_more(void)
16022 + if (!ptr1 || !ptr2) {
16023 + pr_err("Allocation failed\n");
16024 + kfree(ptr1);
16025 ++ kfree(ptr2);
16026 + return;
16027 + }
16028 +
16029 +diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
16030 +index abc10dcbc9d5..aae7ff485671 100644
16031 +--- a/mm/memory_hotplug.c
16032 ++++ b/mm/memory_hotplug.c
16033 +@@ -1902,8 +1902,6 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
16034 +
16035 + BUG_ON(check_hotplug_memory_range(start, size));
16036 +
16037 +- mem_hotplug_begin();
16038 +-
16039 + /*
16040 + * All memory blocks must be offlined before removing memory. Check
16041 + * whether all memory blocks in question are offline and trigger a BUG()
16042 +@@ -1919,9 +1917,14 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
16043 + memblock_free(start, size);
16044 + memblock_remove(start, size);
16045 +
16046 +- /* remove memory block devices before removing memory */
16047 ++ /*
16048 ++ * Memory block device removal under the device_hotplug_lock is
16049 ++ * a barrier against racing online attempts.
16050 ++ */
16051 + remove_memory_block_devices(start, size);
16052 +
16053 ++ mem_hotplug_begin();
16054 ++
16055 + arch_remove_memory(nid, start, size, NULL);
16056 + __release_memory_resource(start, size);
16057 +
16058 +diff --git a/mm/migrate.c b/mm/migrate.c
16059 +index 70f8ad4ade3f..a69b842f95da 100644
16060 +--- a/mm/migrate.c
16061 ++++ b/mm/migrate.c
16062 +@@ -1623,8 +1623,19 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
16063 + start = i;
16064 + } else if (node != current_node) {
16065 + err = do_move_pages_to_node(mm, &pagelist, current_node);
16066 +- if (err)
16067 ++ if (err) {
16068 ++ /*
16069 ++ * Positive err means the number of failed
16070 ++ * pages to migrate. Since we are going to
16071 ++ * abort and return the number of non-migrated
16072 ++ * pages, we need to include the rest of the
16073 ++ * nr_pages that have not been attempted as
16074 ++ * well.
16075 ++ */
16076 ++ if (err > 0)
16077 ++ err += nr_pages - i - 1;
16078 + goto out;
16079 ++ }
16080 + err = store_status(status, start, current_node, i - start);
16081 + if (err)
16082 + goto out;
16083 +@@ -1655,8 +1666,11 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
16084 + goto out_flush;
16085 +
16086 + err = do_move_pages_to_node(mm, &pagelist, current_node);
16087 +- if (err)
16088 ++ if (err) {
16089 ++ if (err > 0)
16090 ++ err += nr_pages - i - 1;
16091 + goto out;
16092 ++ }
16093 + if (i > start) {
16094 + err = store_status(status, start, current_node, i - start);
16095 + if (err)
16096 +@@ -1670,6 +1684,13 @@ out_flush:
16097 +
16098 + /* Make sure we do not overwrite the existing error */
16099 + err1 = do_move_pages_to_node(mm, &pagelist, current_node);
16100 ++ /*
16101 ++ * Don't have to report non-attempted pages here since:
16102 ++ * - If the above loop is done gracefully all pages have been
16103 ++ * attempted.
16104 ++ * - If the above loop is aborted it means a fatal error
16105 ++ * happened, so we should return that error.
16106 ++ */
16107 + if (!err1)
16108 + err1 = store_status(status, start, current_node, i - start);
16109 + if (err >= 0)
16110 +diff --git a/mm/page_alloc.c b/mm/page_alloc.c
16111 +index 74fb5c338e8f..e5c610d711f3 100644
16112 +--- a/mm/page_alloc.c
16113 ++++ b/mm/page_alloc.c
16114 +@@ -6456,45 +6456,75 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
16115 + }
16116 +
16117 + #if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
16118 ++
16119 ++/*
16120 ++ * Zero all valid struct pages in range [spfn, epfn), return number of struct
16121 ++ * pages zeroed
16122 ++ */
16123 ++static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
16124 ++{
16125 ++ unsigned long pfn;
16126 ++ u64 pgcnt = 0;
16127 ++
16128 ++ for (pfn = spfn; pfn < epfn; pfn++) {
16129 ++ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
16130 ++ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
16131 ++ + pageblock_nr_pages - 1;
16132 ++ continue;
16133 ++ }
16134 ++ mm_zero_struct_page(pfn_to_page(pfn));
16135 ++ pgcnt++;
16136 ++ }
16137 ++
16138 ++ return pgcnt;
16139 ++}
16140 ++
16141 + /*
16142 + * Only struct pages that are backed by physical memory are zeroed and
16143 + * initialized by going through __init_single_page(). But, there are some
16144 + * struct pages which are reserved in memblock allocator and their fields
16145 + * may be accessed (for example page_to_pfn() on some configuration accesses
16146 + * flags). We must explicitly zero those struct pages.
16147 ++ *
16148 ++ * This function also addresses a similar issue where struct pages are left
16149 ++ * uninitialized because the physical address range is not covered by
16150 ++ * memblock.memory or memblock.reserved. That could happen when memblock
16151 ++ * layout is manually configured via memmap=, or when the highest physical
16152 ++ * address (max_pfn) does not end on a section boundary.
16153 + */
16154 + void __init zero_resv_unavail(void)
16155 + {
16156 + phys_addr_t start, end;
16157 +- unsigned long pfn;
16158 + u64 i, pgcnt;
16159 ++ phys_addr_t next = 0;
16160 +
16161 + /*
16162 +- * Loop through ranges that are reserved, but do not have reported
16163 +- * physical memory backing.
16164 ++ * Loop through unavailable ranges not covered by memblock.memory.
16165 + */
16166 + pgcnt = 0;
16167 +- for_each_resv_unavail_range(i, &start, &end) {
16168 +- for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
16169 +- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
16170 +- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
16171 +- + pageblock_nr_pages - 1;
16172 +- continue;
16173 +- }
16174 +- mm_zero_struct_page(pfn_to_page(pfn));
16175 +- pgcnt++;
16176 +- }
16177 ++ for_each_mem_range(i, &memblock.memory, NULL,
16178 ++ NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
16179 ++ if (next < start)
16180 ++ pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
16181 ++ next = end;
16182 + }
16183 +
16184 ++ /*
16185 ++ * Early sections always have a fully populated memmap for the whole
16186 ++ * section - see pfn_valid(). If the last section has holes at the
16187 ++ * end and that section is marked "online", the memmap will be
16188 ++ * considered initialized. Make sure that memmap has a well defined
16189 ++ * state.
16190 ++ */
16191 ++ pgcnt += zero_pfn_range(PFN_DOWN(next),
16192 ++ round_up(max_pfn, PAGES_PER_SECTION));
16193 ++
16194 + /*
16195 + * Struct pages that do not have backing memory. This could be because
16196 + * firmware is using some of this memory, or for some other reasons.
16197 +- * Once memblock is changed so such behaviour is not allowed: i.e.
16198 +- * list of "reserved" memory must be a subset of list of "memory", then
16199 +- * this code can be removed.
16200 + */
16201 + if (pgcnt)
16202 +- pr_info("Reserved but unavailable: %lld pages", pgcnt);
16203 ++ pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
16204 + }
16205 + #endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
16206 +
16207 +diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
16208 +index 56080da4aa77..5fee6ec7c93d 100644
16209 +--- a/net/hsr/hsr_slave.c
16210 ++++ b/net/hsr/hsr_slave.c
16211 +@@ -32,6 +32,8 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
16212 +
16213 + rcu_read_lock(); /* hsr->node_db, hsr->ports */
16214 + port = hsr_port_get_rcu(skb->dev);
16215 ++ if (!port)
16216 ++ goto finish_pass;
16217 +
16218 + if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) {
16219 + /* Directly kill frames sent by ourselves */
16220 +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
16221 +index e80eb1788f80..34fda81c7db0 100644
16222 +--- a/net/ipv4/tcp.c
16223 ++++ b/net/ipv4/tcp.c
16224 +@@ -2588,10 +2588,12 @@ int tcp_disconnect(struct sock *sk, int flags)
16225 + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
16226 + tp->snd_cwnd_cnt = 0;
16227 + tp->window_clamp = 0;
16228 ++ tp->delivered = 0;
16229 + tp->delivered_ce = 0;
16230 + tcp_set_ca_state(sk, TCP_CA_Open);
16231 + tp->is_sack_reneg = 0;
16232 + tcp_clear_retrans(tp);
16233 ++ tp->total_retrans = 0;
16234 + inet_csk_delack_init(sk);
16235 + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
16236 + * issue in __tcp_select_window()
16237 +@@ -2603,10 +2605,14 @@ int tcp_disconnect(struct sock *sk, int flags)
16238 + sk->sk_rx_dst = NULL;
16239 + tcp_saved_syn_free(tp);
16240 + tp->compressed_ack = 0;
16241 ++ tp->segs_in = 0;
16242 ++ tp->segs_out = 0;
16243 + tp->bytes_sent = 0;
16244 + tp->bytes_acked = 0;
16245 + tp->bytes_received = 0;
16246 + tp->bytes_retrans = 0;
16247 ++ tp->data_segs_in = 0;
16248 ++ tp->data_segs_out = 0;
16249 + tp->dsack_dups = 0;
16250 + tp->reord_seen = 0;
16251 +
16252 +diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
16253 +index e4dec03a19fe..d0a295cd71ef 100644
16254 +--- a/net/l2tp/l2tp_core.c
16255 ++++ b/net/l2tp/l2tp_core.c
16256 +@@ -325,8 +325,13 @@ int l2tp_session_register(struct l2tp_session *session,
16257 +
16258 + spin_lock_bh(&pn->l2tp_session_hlist_lock);
16259 +
16260 ++ /* IP encap expects session IDs to be globally unique, while
16261 ++ * UDP encap doesn't.
16262 ++ */
16263 + hlist_for_each_entry(session_walk, g_head, global_hlist)
16264 +- if (session_walk->session_id == session->session_id) {
16265 ++ if (session_walk->session_id == session->session_id &&
16266 ++ (session_walk->tunnel->encap == L2TP_ENCAPTYPE_IP ||
16267 ++ tunnel->encap == L2TP_ENCAPTYPE_IP)) {
16268 + err = -EEXIST;
16269 + goto err_tlock_pnlock;
16270 + }
16271 +diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
16272 +index a74edb10cbfc..57f835d2442e 100644
16273 +--- a/net/rxrpc/af_rxrpc.c
16274 ++++ b/net/rxrpc/af_rxrpc.c
16275 +@@ -196,6 +196,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
16276 + service_in_use:
16277 + write_unlock(&local->services_lock);
16278 + rxrpc_unuse_local(local);
16279 ++ rxrpc_put_local(local);
16280 + ret = -EADDRINUSE;
16281 + error_unlock:
16282 + release_sock(&rx->sk);
16283 +@@ -906,6 +907,7 @@ static int rxrpc_release_sock(struct sock *sk)
16284 + rxrpc_purge_queue(&sk->sk_receive_queue);
16285 +
16286 + rxrpc_unuse_local(rx->local);
16287 ++ rxrpc_put_local(rx->local);
16288 + rx->local = NULL;
16289 + key_put(rx->key);
16290 + rx->key = NULL;
16291 +diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
16292 +index ccef6e40e002..9c4ee7513214 100644
16293 +--- a/net/rxrpc/ar-internal.h
16294 ++++ b/net/rxrpc/ar-internal.h
16295 +@@ -484,6 +484,7 @@ enum rxrpc_call_flag {
16296 + RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */
16297 + RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */
16298 + RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */
16299 ++ RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */
16300 + };
16301 +
16302 + /*
16303 +@@ -1006,6 +1007,16 @@ void rxrpc_unuse_local(struct rxrpc_local *);
16304 + void rxrpc_queue_local(struct rxrpc_local *);
16305 + void rxrpc_destroy_all_locals(struct rxrpc_net *);
16306 +
16307 ++static inline bool __rxrpc_unuse_local(struct rxrpc_local *local)
16308 ++{
16309 ++ return atomic_dec_return(&local->active_users) == 0;
16310 ++}
16311 ++
16312 ++static inline bool __rxrpc_use_local(struct rxrpc_local *local)
16313 ++{
16314 ++ return atomic_fetch_add_unless(&local->active_users, 1, 0) != 0;
16315 ++}
16316 ++
16317 + /*
16318 + * misc.c
16319 + */
16320 +diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
16321 +index 215f4d98baa0..17fdfce1625f 100644
16322 +--- a/net/rxrpc/call_object.c
16323 ++++ b/net/rxrpc/call_object.c
16324 +@@ -520,7 +520,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
16325 +
16326 + _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn);
16327 +
16328 +- if (conn)
16329 ++ if (conn && !test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
16330 + rxrpc_disconnect_call(call);
16331 +
16332 + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) {
16333 +@@ -654,6 +654,7 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
16334 + struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
16335 + struct rxrpc_net *rxnet = call->rxnet;
16336 +
16337 ++ rxrpc_put_connection(call->conn);
16338 + rxrpc_put_peer(call->peer);
16339 + kfree(call->rxtx_buffer);
16340 + kfree(call->rxtx_annotations);
16341 +@@ -677,7 +678,6 @@ void rxrpc_cleanup_call(struct rxrpc_call *call)
16342 +
16343 + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
16344 + ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags));
16345 +- ASSERTCMP(call->conn, ==, NULL);
16346 +
16347 + /* Clean up the Rx/Tx buffer */
16348 + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++)
16349 +diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
16350 +index 38d548532024..4ffc7b87fec0 100644
16351 +--- a/net/rxrpc/conn_client.c
16352 ++++ b/net/rxrpc/conn_client.c
16353 +@@ -786,6 +786,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
16354 + u32 cid;
16355 +
16356 + spin_lock(&conn->channel_lock);
16357 ++ set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
16358 +
16359 + cid = call->cid;
16360 + if (cid) {
16361 +@@ -793,7 +794,6 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
16362 + chan = &conn->channels[channel];
16363 + }
16364 + trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
16365 +- call->conn = NULL;
16366 +
16367 + /* Calls that have never actually been assigned a channel can simply be
16368 + * discarded. If the conn didn't get used either, it will follow
16369 +@@ -909,7 +909,6 @@ out:
16370 + spin_unlock(&rxnet->client_conn_cache_lock);
16371 + out_2:
16372 + spin_unlock(&conn->channel_lock);
16373 +- rxrpc_put_connection(conn);
16374 + _leave("");
16375 + return;
16376 +
16377 +diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
16378 +index b6fca8ebb117..126154a97a59 100644
16379 +--- a/net/rxrpc/conn_event.c
16380 ++++ b/net/rxrpc/conn_event.c
16381 +@@ -453,16 +453,12 @@ again:
16382 + /*
16383 + * connection-level event processor
16384 + */
16385 +-void rxrpc_process_connection(struct work_struct *work)
16386 ++static void rxrpc_do_process_connection(struct rxrpc_connection *conn)
16387 + {
16388 +- struct rxrpc_connection *conn =
16389 +- container_of(work, struct rxrpc_connection, processor);
16390 + struct sk_buff *skb;
16391 + u32 abort_code = RX_PROTOCOL_ERROR;
16392 + int ret;
16393 +
16394 +- rxrpc_see_connection(conn);
16395 +-
16396 + if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events))
16397 + rxrpc_secure_connection(conn);
16398 +
16399 +@@ -490,18 +486,33 @@ void rxrpc_process_connection(struct work_struct *work)
16400 + }
16401 + }
16402 +
16403 +-out:
16404 +- rxrpc_put_connection(conn);
16405 +- _leave("");
16406 + return;
16407 +
16408 + requeue_and_leave:
16409 + skb_queue_head(&conn->rx_queue, skb);
16410 +- goto out;
16411 ++ return;
16412 +
16413 + protocol_error:
16414 + if (rxrpc_abort_connection(conn, ret, abort_code) < 0)
16415 + goto requeue_and_leave;
16416 + rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
16417 +- goto out;
16418 ++ return;
16419 ++}
16420 ++
16421 ++void rxrpc_process_connection(struct work_struct *work)
16422 ++{
16423 ++ struct rxrpc_connection *conn =
16424 ++ container_of(work, struct rxrpc_connection, processor);
16425 ++
16426 ++ rxrpc_see_connection(conn);
16427 ++
16428 ++ if (__rxrpc_use_local(conn->params.local)) {
16429 ++ rxrpc_do_process_connection(conn);
16430 ++ rxrpc_unuse_local(conn->params.local);
16431 ++ }
16432 ++
16433 ++ rxrpc_put_connection(conn);
16434 ++ _leave("");
16435 ++ return;
16436 + }
16437 ++
16438 +diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
16439 +index f338efd2880a..c4c4450891e0 100644
16440 +--- a/net/rxrpc/conn_object.c
16441 ++++ b/net/rxrpc/conn_object.c
16442 +@@ -226,9 +226,8 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
16443 + __rxrpc_disconnect_call(conn, call);
16444 + spin_unlock(&conn->channel_lock);
16445 +
16446 +- call->conn = NULL;
16447 ++ set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
16448 + conn->idle_timestamp = jiffies;
16449 +- rxrpc_put_connection(conn);
16450 + }
16451 +
16452 + /*
16453 +diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
16454 +index 2f91ab909191..d9beb28fc32f 100644
16455 +--- a/net/rxrpc/input.c
16456 ++++ b/net/rxrpc/input.c
16457 +@@ -612,10 +612,8 @@ ack:
16458 + false, true,
16459 + rxrpc_propose_ack_input_data);
16460 +
16461 +- if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) {
16462 +- trace_rxrpc_notify_socket(call->debug_id, serial);
16463 +- rxrpc_notify_socket(call);
16464 +- }
16465 ++ trace_rxrpc_notify_socket(call->debug_id, serial);
16466 ++ rxrpc_notify_socket(call);
16467 +
16468 + unlock:
16469 + spin_unlock(&call->input_lock);
16470 +diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
16471 +index 04f0976841a4..4c0087a48e87 100644
16472 +--- a/net/rxrpc/local_object.c
16473 ++++ b/net/rxrpc/local_object.c
16474 +@@ -368,11 +368,14 @@ void rxrpc_queue_local(struct rxrpc_local *local)
16475 + void rxrpc_put_local(struct rxrpc_local *local)
16476 + {
16477 + const void *here = __builtin_return_address(0);
16478 ++ unsigned int debug_id;
16479 + int n;
16480 +
16481 + if (local) {
16482 ++ debug_id = local->debug_id;
16483 ++
16484 + n = atomic_dec_return(&local->usage);
16485 +- trace_rxrpc_local(local->debug_id, rxrpc_local_put, n, here);
16486 ++ trace_rxrpc_local(debug_id, rxrpc_local_put, n, here);
16487 +
16488 + if (n == 0)
16489 + call_rcu(&local->rcu, rxrpc_local_rcu);
16490 +@@ -384,14 +387,11 @@ void rxrpc_put_local(struct rxrpc_local *local)
16491 + */
16492 + struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local)
16493 + {
16494 +- unsigned int au;
16495 +-
16496 + local = rxrpc_get_local_maybe(local);
16497 + if (!local)
16498 + return NULL;
16499 +
16500 +- au = atomic_fetch_add_unless(&local->active_users, 1, 0);
16501 +- if (au == 0) {
16502 ++ if (!__rxrpc_use_local(local)) {
16503 + rxrpc_put_local(local);
16504 + return NULL;
16505 + }
16506 +@@ -405,14 +405,11 @@ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local)
16507 + */
16508 + void rxrpc_unuse_local(struct rxrpc_local *local)
16509 + {
16510 +- unsigned int au;
16511 +-
16512 + if (local) {
16513 +- au = atomic_dec_return(&local->active_users);
16514 +- if (au == 0)
16515 ++ if (__rxrpc_unuse_local(local)) {
16516 ++ rxrpc_get_local(local);
16517 + rxrpc_queue_local(local);
16518 +- else
16519 +- rxrpc_put_local(local);
16520 ++ }
16521 + }
16522 + }
16523 +
16524 +@@ -469,7 +466,7 @@ static void rxrpc_local_processor(struct work_struct *work)
16525 +
16526 + do {
16527 + again = false;
16528 +- if (atomic_read(&local->active_users) == 0) {
16529 ++ if (!__rxrpc_use_local(local)) {
16530 + rxrpc_local_destroyer(local);
16531 + break;
16532 + }
16533 +@@ -483,6 +480,8 @@ static void rxrpc_local_processor(struct work_struct *work)
16534 + rxrpc_process_local_events(local);
16535 + again = true;
16536 + }
16537 ++
16538 ++ __rxrpc_unuse_local(local);
16539 + } while (again);
16540 +
16541 + rxrpc_put_local(local);
16542 +diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
16543 +index 31e47cfb3e68..b0aa08e3796d 100644
16544 +--- a/net/rxrpc/output.c
16545 ++++ b/net/rxrpc/output.c
16546 +@@ -133,7 +133,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn,
16547 + int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
16548 + rxrpc_serial_t *_serial)
16549 + {
16550 +- struct rxrpc_connection *conn = NULL;
16551 ++ struct rxrpc_connection *conn;
16552 + struct rxrpc_ack_buffer *pkt;
16553 + struct msghdr msg;
16554 + struct kvec iov[2];
16555 +@@ -143,18 +143,14 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
16556 + int ret;
16557 + u8 reason;
16558 +
16559 +- spin_lock_bh(&call->lock);
16560 +- if (call->conn)
16561 +- conn = rxrpc_get_connection_maybe(call->conn);
16562 +- spin_unlock_bh(&call->lock);
16563 +- if (!conn)
16564 ++ if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
16565 + return -ECONNRESET;
16566 +
16567 + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
16568 +- if (!pkt) {
16569 +- rxrpc_put_connection(conn);
16570 ++ if (!pkt)
16571 + return -ENOMEM;
16572 +- }
16573 ++
16574 ++ conn = call->conn;
16575 +
16576 + msg.msg_name = &call->peer->srx.transport;
16577 + msg.msg_namelen = call->peer->srx.transport_len;
16578 +@@ -249,7 +245,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
16579 + }
16580 +
16581 + out:
16582 +- rxrpc_put_connection(conn);
16583 + kfree(pkt);
16584 + return ret;
16585 + }
16586 +@@ -259,7 +254,7 @@ out:
16587 + */
16588 + int rxrpc_send_abort_packet(struct rxrpc_call *call)
16589 + {
16590 +- struct rxrpc_connection *conn = NULL;
16591 ++ struct rxrpc_connection *conn;
16592 + struct rxrpc_abort_buffer pkt;
16593 + struct msghdr msg;
16594 + struct kvec iov[1];
16595 +@@ -276,13 +271,11 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
16596 + test_bit(RXRPC_CALL_TX_LAST, &call->flags))
16597 + return 0;
16598 +
16599 +- spin_lock_bh(&call->lock);
16600 +- if (call->conn)
16601 +- conn = rxrpc_get_connection_maybe(call->conn);
16602 +- spin_unlock_bh(&call->lock);
16603 +- if (!conn)
16604 ++ if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
16605 + return -ECONNRESET;
16606 +
16607 ++ conn = call->conn;
16608 ++
16609 + msg.msg_name = &call->peer->srx.transport;
16610 + msg.msg_namelen = call->peer->srx.transport_len;
16611 + msg.msg_control = NULL;
16612 +@@ -317,8 +310,6 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
16613 + trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr,
16614 + rxrpc_tx_point_call_abort);
16615 + rxrpc_tx_backoff(call, ret);
16616 +-
16617 +- rxrpc_put_connection(conn);
16618 + return ret;
16619 + }
16620 +
16621 +diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
16622 +index 42582a9ff81d..85bdc31d3dbf 100644
16623 +--- a/net/rxrpc/peer_event.c
16624 ++++ b/net/rxrpc/peer_event.c
16625 +@@ -357,27 +357,31 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet,
16626 + if (!rxrpc_get_peer_maybe(peer))
16627 + continue;
16628 +
16629 +- spin_unlock_bh(&rxnet->peer_hash_lock);
16630 +-
16631 +- keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME;
16632 +- slot = keepalive_at - base;
16633 +- _debug("%02x peer %u t=%d {%pISp}",
16634 +- cursor, peer->debug_id, slot, &peer->srx.transport);
16635 ++ if (__rxrpc_use_local(peer->local)) {
16636 ++ spin_unlock_bh(&rxnet->peer_hash_lock);
16637 ++
16638 ++ keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME;
16639 ++ slot = keepalive_at - base;
16640 ++ _debug("%02x peer %u t=%d {%pISp}",
16641 ++ cursor, peer->debug_id, slot, &peer->srx.transport);
16642 ++
16643 ++ if (keepalive_at <= base ||
16644 ++ keepalive_at > base + RXRPC_KEEPALIVE_TIME) {
16645 ++ rxrpc_send_keepalive(peer);
16646 ++ slot = RXRPC_KEEPALIVE_TIME;
16647 ++ }
16648 +
16649 +- if (keepalive_at <= base ||
16650 +- keepalive_at > base + RXRPC_KEEPALIVE_TIME) {
16651 +- rxrpc_send_keepalive(peer);
16652 +- slot = RXRPC_KEEPALIVE_TIME;
16653 ++ /* A transmission to this peer occurred since last we
16654 ++ * examined it so put it into the appropriate future
16655 ++ * bucket.
16656 ++ */
16657 ++ slot += cursor;
16658 ++ slot &= mask;
16659 ++ spin_lock_bh(&rxnet->peer_hash_lock);
16660 ++ list_add_tail(&peer->keepalive_link,
16661 ++ &rxnet->peer_keepalive[slot & mask]);
16662 ++ rxrpc_unuse_local(peer->local);
16663 + }
16664 +-
16665 +- /* A transmission to this peer occurred since last we examined
16666 +- * it so put it into the appropriate future bucket.
16667 +- */
16668 +- slot += cursor;
16669 +- slot &= mask;
16670 +- spin_lock_bh(&rxnet->peer_hash_lock);
16671 +- list_add_tail(&peer->keepalive_link,
16672 +- &rxnet->peer_keepalive[slot & mask]);
16673 + rxrpc_put_peer_locked(peer);
16674 + }
16675 +
16676 +diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
16677 +index 6d30a291bcd2..eb1dd2afc5a1 100644
16678 +--- a/net/sched/cls_rsvp.h
16679 ++++ b/net/sched/cls_rsvp.h
16680 +@@ -466,10 +466,8 @@ static u32 gen_tunnel(struct rsvp_head *data)
16681 +
16682 + static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
16683 + [TCA_RSVP_CLASSID] = { .type = NLA_U32 },
16684 +- [TCA_RSVP_DST] = { .type = NLA_BINARY,
16685 +- .len = RSVP_DST_LEN * sizeof(u32) },
16686 +- [TCA_RSVP_SRC] = { .type = NLA_BINARY,
16687 +- .len = RSVP_DST_LEN * sizeof(u32) },
16688 ++ [TCA_RSVP_DST] = { .len = RSVP_DST_LEN * sizeof(u32) },
16689 ++ [TCA_RSVP_SRC] = { .len = RSVP_DST_LEN * sizeof(u32) },
16690 + [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) },
16691 + };
16692 +
16693 +diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
16694 +index edf27365f91c..db4b5d9ffaf7 100644
16695 +--- a/net/sched/cls_tcindex.c
16696 ++++ b/net/sched/cls_tcindex.c
16697 +@@ -333,12 +333,31 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
16698 + cp->fall_through = p->fall_through;
16699 + cp->tp = tp;
16700 +
16701 ++ if (tb[TCA_TCINDEX_HASH])
16702 ++ cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
16703 ++
16704 ++ if (tb[TCA_TCINDEX_MASK])
16705 ++ cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
16706 ++
16707 ++ if (tb[TCA_TCINDEX_SHIFT])
16708 ++ cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
16709 ++
16710 ++ if (!cp->hash) {
16711 ++ /* Hash not specified, use perfect hash if the upper limit
16712 ++ * of the hashing index is below the threshold.
16713 ++ */
16714 ++ if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
16715 ++ cp->hash = (cp->mask >> cp->shift) + 1;
16716 ++ else
16717 ++ cp->hash = DEFAULT_HASH_SIZE;
16718 ++ }
16719 ++
16720 + if (p->perfect) {
16721 + int i;
16722 +
16723 + if (tcindex_alloc_perfect_hash(net, cp) < 0)
16724 + goto errout;
16725 +- for (i = 0; i < cp->hash; i++)
16726 ++ for (i = 0; i < min(cp->hash, p->hash); i++)
16727 + cp->perfect[i].res = p->perfect[i].res;
16728 + balloc = 1;
16729 + }
16730 +@@ -346,19 +365,10 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
16731 +
16732 + err = tcindex_filter_result_init(&new_filter_result);
16733 + if (err < 0)
16734 +- goto errout1;
16735 ++ goto errout_alloc;
16736 + if (old_r)
16737 + cr = r->res;
16738 +
16739 +- if (tb[TCA_TCINDEX_HASH])
16740 +- cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
16741 +-
16742 +- if (tb[TCA_TCINDEX_MASK])
16743 +- cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
16744 +-
16745 +- if (tb[TCA_TCINDEX_SHIFT])
16746 +- cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
16747 +-
16748 + err = -EBUSY;
16749 +
16750 + /* Hash already allocated, make sure that we still meet the
16751 +@@ -376,16 +386,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
16752 + if (tb[TCA_TCINDEX_FALL_THROUGH])
16753 + cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
16754 +
16755 +- if (!cp->hash) {
16756 +- /* Hash not specified, use perfect hash if the upper limit
16757 +- * of the hashing index is below the threshold.
16758 +- */
16759 +- if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
16760 +- cp->hash = (cp->mask >> cp->shift) + 1;
16761 +- else
16762 +- cp->hash = DEFAULT_HASH_SIZE;
16763 +- }
16764 +-
16765 + if (!cp->perfect && !cp->h)
16766 + cp->alloc_hash = cp->hash;
16767 +
16768 +@@ -484,7 +484,6 @@ errout_alloc:
16769 + tcindex_free_perfect_hash(cp);
16770 + else if (balloc == 2)
16771 + kfree(cp->h);
16772 +-errout1:
16773 + tcf_exts_destroy(&new_filter_result.exts);
16774 + errout:
16775 + kfree(cp);
16776 +diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
16777 +index 24c7a1e2bd34..68830e88b6e9 100644
16778 +--- a/net/sunrpc/auth_gss/svcauth_gss.c
16779 ++++ b/net/sunrpc/auth_gss/svcauth_gss.c
16780 +@@ -1224,6 +1224,7 @@ static int gss_proxy_save_rsc(struct cache_detail *cd,
16781 + dprintk("RPC: No creds found!\n");
16782 + goto out;
16783 + } else {
16784 ++ struct timespec64 boot;
16785 +
16786 + /* steal creds */
16787 + rsci.cred = ud->creds;
16788 +@@ -1244,6 +1245,9 @@ static int gss_proxy_save_rsc(struct cache_detail *cd,
16789 + &expiry, GFP_KERNEL);
16790 + if (status)
16791 + goto out;
16792 ++
16793 ++ getboottime64(&boot);
16794 ++ expiry -= boot.tv_sec;
16795 + }
16796 +
16797 + rsci.h.expiry_time = expiry;
16798 +diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
16799 +index 75d4b48601aa..85a6e8f5a75d 100644
16800 +--- a/samples/bpf/Makefile
16801 ++++ b/samples/bpf/Makefile
16802 +@@ -221,7 +221,7 @@ all:
16803 +
16804 + clean:
16805 + $(MAKE) -C ../../ M=$(CURDIR) clean
16806 +- @rm -f *~
16807 ++ @find $(CURDIR) -type f -name '*~' -delete
16808 +
16809 + $(LIBBPF): FORCE
16810 + # Fix up variables inherited from Kbuild that tools/ build system won't like
16811 +diff --git a/scripts/find-unused-docs.sh b/scripts/find-unused-docs.sh
16812 +index 3f46f8977dc4..ee6a50e33aba 100755
16813 +--- a/scripts/find-unused-docs.sh
16814 ++++ b/scripts/find-unused-docs.sh
16815 +@@ -54,7 +54,7 @@ for file in `find $1 -name '*.c'`; do
16816 + if [[ ${FILES_INCLUDED[$file]+_} ]]; then
16817 + continue;
16818 + fi
16819 +- str=$(scripts/kernel-doc -text -export "$file" 2>/dev/null)
16820 ++ str=$(scripts/kernel-doc -export "$file" 2>/dev/null)
16821 + if [[ -n "$str" ]]; then
16822 + echo "$file"
16823 + fi
16824 +diff --git a/sound/drivers/dummy.c b/sound/drivers/dummy.c
16825 +index 9af154db530a..b78cc8d86a8b 100644
16826 +--- a/sound/drivers/dummy.c
16827 ++++ b/sound/drivers/dummy.c
16828 +@@ -929,7 +929,7 @@ static void print_formats(struct snd_dummy *dummy,
16829 + {
16830 + int i;
16831 +
16832 +- for (i = 0; i < SNDRV_PCM_FORMAT_LAST; i++) {
16833 ++ for (i = 0; i <= SNDRV_PCM_FORMAT_LAST; i++) {
16834 + if (dummy->pcm_hw.formats & (1ULL << i))
16835 + snd_iprintf(buffer, " %s", snd_pcm_format_name(i));
16836 + }
16837 +diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
16838 +index d63fea5d1c92..3ee5b7b9b595 100644
16839 +--- a/sound/pci/hda/hda_intel.c
16840 ++++ b/sound/pci/hda/hda_intel.c
16841 +@@ -2324,6 +2324,8 @@ static struct snd_pci_quirk power_save_blacklist[] = {
16842 + /* https://bugzilla.redhat.com/show_bug.cgi?id=1581607 */
16843 + SND_PCI_QUIRK(0x1558, 0x3501, "Clevo W35xSS_370SS", 0),
16844 + /* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */
16845 ++ SND_PCI_QUIRK(0x1558, 0x6504, "Clevo W65_67SB", 0),
16846 ++ /* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */
16847 + SND_PCI_QUIRK(0x1028, 0x0497, "Dell Precision T3600", 0),
16848 + /* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */
16849 + /* Note the P55A-UD3 and Z87-D3HP share the subsys id for the HDA dev */
16850 +diff --git a/sound/usb/validate.c b/sound/usb/validate.c
16851 +index 389e8657434a..5a3c4f7882b0 100644
16852 +--- a/sound/usb/validate.c
16853 ++++ b/sound/usb/validate.c
16854 +@@ -110,7 +110,7 @@ static bool validate_processing_unit(const void *p,
16855 + default:
16856 + if (v->type == UAC1_EXTENSION_UNIT)
16857 + return true; /* OK */
16858 +- switch (d->wProcessType) {
16859 ++ switch (le16_to_cpu(d->wProcessType)) {
16860 + case UAC_PROCESS_UP_DOWNMIX:
16861 + case UAC_PROCESS_DOLBY_PROLOGIC:
16862 + if (d->bLength < len + 1) /* bNrModes */
16863 +@@ -125,7 +125,7 @@ static bool validate_processing_unit(const void *p,
16864 + case UAC_VERSION_2:
16865 + if (v->type == UAC2_EXTENSION_UNIT_V2)
16866 + return true; /* OK */
16867 +- switch (d->wProcessType) {
16868 ++ switch (le16_to_cpu(d->wProcessType)) {
16869 + case UAC2_PROCESS_UP_DOWNMIX:
16870 + case UAC2_PROCESS_DOLBY_PROLOCIC: /* SiC! */
16871 + if (d->bLength < len + 1) /* bNrModes */
16872 +@@ -142,7 +142,7 @@ static bool validate_processing_unit(const void *p,
16873 + len += 2; /* wClusterDescrID */
16874 + break;
16875 + }
16876 +- switch (d->wProcessType) {
16877 ++ switch (le16_to_cpu(d->wProcessType)) {
16878 + case UAC3_PROCESS_UP_DOWNMIX:
16879 + if (d->bLength < len + 1) /* bNrModes */
16880 + return false;
16881 +diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
16882 +index ba7ee74ee533..f6ca0a216f3c 100755
16883 +--- a/tools/kvm/kvm_stat/kvm_stat
16884 ++++ b/tools/kvm/kvm_stat/kvm_stat
16885 +@@ -271,6 +271,7 @@ class ArchX86(Arch):
16886 + def __init__(self, exit_reasons):
16887 + self.sc_perf_evt_open = 298
16888 + self.ioctl_numbers = IOCTL_NUMBERS
16889 ++ self.exit_reason_field = 'exit_reason'
16890 + self.exit_reasons = exit_reasons
16891 +
16892 + def debugfs_is_child(self, field):
16893 +@@ -290,6 +291,7 @@ class ArchPPC(Arch):
16894 + # numbers depend on the wordsize.
16895 + char_ptr_size = ctypes.sizeof(ctypes.c_char_p)
16896 + self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16
16897 ++ self.exit_reason_field = 'exit_nr'
16898 + self.exit_reasons = {}
16899 +
16900 + def debugfs_is_child(self, field):
16901 +@@ -301,6 +303,7 @@ class ArchA64(Arch):
16902 + def __init__(self):
16903 + self.sc_perf_evt_open = 241
16904 + self.ioctl_numbers = IOCTL_NUMBERS
16905 ++ self.exit_reason_field = 'esr_ec'
16906 + self.exit_reasons = AARCH64_EXIT_REASONS
16907 +
16908 + def debugfs_is_child(self, field):
16909 +@@ -312,6 +315,7 @@ class ArchS390(Arch):
16910 + def __init__(self):
16911 + self.sc_perf_evt_open = 331
16912 + self.ioctl_numbers = IOCTL_NUMBERS
16913 ++ self.exit_reason_field = None
16914 + self.exit_reasons = None
16915 +
16916 + def debugfs_is_child(self, field):
16917 +@@ -542,8 +546,8 @@ class TracepointProvider(Provider):
16918 + """
16919 + filters = {}
16920 + filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS)
16921 +- if ARCH.exit_reasons:
16922 +- filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
16923 ++ if ARCH.exit_reason_field and ARCH.exit_reasons:
16924 ++ filters['kvm_exit'] = (ARCH.exit_reason_field, ARCH.exit_reasons)
16925 + return filters
16926 +
16927 + def _get_available_fields(self):
16928 +diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
16929 +index 6880236974b8..d915548759a4 100644
16930 +--- a/virt/kvm/arm/aarch32.c
16931 ++++ b/virt/kvm/arm/aarch32.c
16932 +@@ -21,6 +21,7 @@
16933 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
16934 + */
16935 +
16936 ++#include <linux/bits.h>
16937 + #include <linux/kvm_host.h>
16938 + #include <asm/kvm_emulate.h>
16939 + #include <asm/kvm_hyp.h>
16940 +@@ -39,25 +40,115 @@ static const u8 return_offsets[8][2] = {
16941 + [7] = { 4, 4 }, /* FIQ, unused */
16942 + };
16943 +
16944 ++/*
16945 ++ * When an exception is taken, most CPSR fields are left unchanged in the
16946 ++ * handler. However, some are explicitly overridden (e.g. M[4:0]).
16947 ++ *
16948 ++ * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with
16949 ++ * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was
16950 ++ * obsoleted by the ARMv7 virtualization extensions and is RES0.
16951 ++ *
16952 ++ * For the SPSR layout seen from AArch32, see:
16953 ++ * - ARM DDI 0406C.d, page B1-1148
16954 ++ * - ARM DDI 0487E.a, page G8-6264
16955 ++ *
16956 ++ * For the SPSR_ELx layout for AArch32 seen from AArch64, see:
16957 ++ * - ARM DDI 0487E.a, page C5-426
16958 ++ *
16959 ++ * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from
16960 ++ * MSB to LSB.
16961 ++ */
16962 ++static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode)
16963 ++{
16964 ++ u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
16965 ++ unsigned long old, new;
16966 ++
16967 ++ old = *vcpu_cpsr(vcpu);
16968 ++ new = 0;
16969 ++
16970 ++ new |= (old & PSR_AA32_N_BIT);
16971 ++ new |= (old & PSR_AA32_Z_BIT);
16972 ++ new |= (old & PSR_AA32_C_BIT);
16973 ++ new |= (old & PSR_AA32_V_BIT);
16974 ++ new |= (old & PSR_AA32_Q_BIT);
16975 ++
16976 ++ // CPSR.IT[7:0] are set to zero upon any exception
16977 ++ // See ARM DDI 0487E.a, section G1.12.3
16978 ++ // See ARM DDI 0406C.d, section B1.8.3
16979 ++
16980 ++ new |= (old & PSR_AA32_DIT_BIT);
16981 ++
16982 ++ // CPSR.SSBS is set to SCTLR.DSSBS upon any exception
16983 ++ // See ARM DDI 0487E.a, page G8-6244
16984 ++ if (sctlr & BIT(31))
16985 ++ new |= PSR_AA32_SSBS_BIT;
16986 ++
16987 ++ // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0
16988 ++ // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented
16989 ++ // See ARM DDI 0487E.a, page G8-6246
16990 ++ new |= (old & PSR_AA32_PAN_BIT);
16991 ++ if (!(sctlr & BIT(23)))
16992 ++ new |= PSR_AA32_PAN_BIT;
16993 ++
16994 ++ // SS does not exist in AArch32, so ignore
16995 ++
16996 ++ // CPSR.IL is set to zero upon any exception
16997 ++ // See ARM DDI 0487E.a, page G1-5527
16998 ++
16999 ++ new |= (old & PSR_AA32_GE_MASK);
17000 ++
17001 ++ // CPSR.IT[7:0] are set to zero upon any exception
17002 ++ // See prior comment above
17003 ++
17004 ++ // CPSR.E is set to SCTLR.EE upon any exception
17005 ++ // See ARM DDI 0487E.a, page G8-6245
17006 ++ // See ARM DDI 0406C.d, page B4-1701
17007 ++ if (sctlr & BIT(25))
17008 ++ new |= PSR_AA32_E_BIT;
17009 ++
17010 ++ // CPSR.A is unchanged upon an exception to Undefined, Supervisor
17011 ++ // CPSR.A is set upon an exception to other modes
17012 ++ // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
17013 ++ // See ARM DDI 0406C.d, page B1-1182
17014 ++ new |= (old & PSR_AA32_A_BIT);
17015 ++ if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC)
17016 ++ new |= PSR_AA32_A_BIT;
17017 ++
17018 ++ // CPSR.I is set upon any exception
17019 ++ // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
17020 ++ // See ARM DDI 0406C.d, page B1-1182
17021 ++ new |= PSR_AA32_I_BIT;
17022 ++
17023 ++ // CPSR.F is set upon an exception to FIQ
17024 ++ // CPSR.F is unchanged upon an exception to other modes
17025 ++ // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
17026 ++ // See ARM DDI 0406C.d, page B1-1182
17027 ++ new |= (old & PSR_AA32_F_BIT);
17028 ++ if (mode == PSR_AA32_MODE_FIQ)
17029 ++ new |= PSR_AA32_F_BIT;
17030 ++
17031 ++ // CPSR.T is set to SCTLR.TE upon any exception
17032 ++ // See ARM DDI 0487E.a, page G8-5514
17033 ++ // See ARM DDI 0406C.d, page B1-1181
17034 ++ if (sctlr & BIT(30))
17035 ++ new |= PSR_AA32_T_BIT;
17036 ++
17037 ++ new |= mode;
17038 ++
17039 ++ return new;
17040 ++}
17041 ++
17042 + static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
17043 + {
17044 +- unsigned long cpsr;
17045 +- unsigned long new_spsr_value = *vcpu_cpsr(vcpu);
17046 +- bool is_thumb = (new_spsr_value & PSR_AA32_T_BIT);
17047 ++ unsigned long spsr = *vcpu_cpsr(vcpu);
17048 ++ bool is_thumb = (spsr & PSR_AA32_T_BIT);
17049 + u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
17050 + u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
17051 +
17052 +- cpsr = mode | PSR_AA32_I_BIT;
17053 +-
17054 +- if (sctlr & (1 << 30))
17055 +- cpsr |= PSR_AA32_T_BIT;
17056 +- if (sctlr & (1 << 25))
17057 +- cpsr |= PSR_AA32_E_BIT;
17058 +-
17059 +- *vcpu_cpsr(vcpu) = cpsr;
17060 ++ *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode);
17061 +
17062 + /* Note: These now point to the banked copies */
17063 +- vcpu_write_spsr(vcpu, new_spsr_value);
17064 ++ vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr));
17065 + *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
17066 +
17067 + /* Branch to exception vector */
17068 +@@ -95,7 +186,7 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt,
17069 + fsr = &vcpu_cp15(vcpu, c5_DFSR);
17070 + }
17071 +
17072 +- prepare_fault32(vcpu, PSR_AA32_MODE_ABT | PSR_AA32_A_BIT, vect_offset);
17073 ++ prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset);
17074 +
17075 + *far = addr;
17076 +
17077 +diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
17078 +index 3caee91bca08..878e0edb2e1b 100644
17079 +--- a/virt/kvm/arm/mmio.c
17080 ++++ b/virt/kvm/arm/mmio.c
17081 +@@ -117,6 +117,9 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
17082 + data = (data ^ mask) - mask;
17083 + }
17084 +
17085 ++ if (!vcpu->arch.mmio_decode.sixty_four)
17086 ++ data = data & 0xffffffff;
17087 ++
17088 + trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
17089 + &data);
17090 + data = vcpu_data_host_to_guest(vcpu, data, len);
17091 +@@ -137,6 +140,7 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
17092 + unsigned long rt;
17093 + int access_size;
17094 + bool sign_extend;
17095 ++ bool sixty_four;
17096 +
17097 + if (kvm_vcpu_dabt_iss1tw(vcpu)) {
17098 + /* page table accesses IO mem: tell guest to fix its TTBR */
17099 +@@ -150,11 +154,13 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
17100 +
17101 + *is_write = kvm_vcpu_dabt_iswrite(vcpu);
17102 + sign_extend = kvm_vcpu_dabt_issext(vcpu);
17103 ++ sixty_four = kvm_vcpu_dabt_issf(vcpu);
17104 + rt = kvm_vcpu_dabt_get_rd(vcpu);
17105 +
17106 + *len = access_size;
17107 + vcpu->arch.mmio_decode.sign_extend = sign_extend;
17108 + vcpu->arch.mmio_decode.rt = rt;
17109 ++ vcpu->arch.mmio_decode.sixty_four = sixty_four;
17110 +
17111 + return 0;
17112 + }
17113 +diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
17114 +index 23c2519c5b32..c9861c2315e8 100644
17115 +--- a/virt/kvm/async_pf.c
17116 ++++ b/virt/kvm/async_pf.c
17117 +@@ -76,7 +76,7 @@ static void async_pf_execute(struct work_struct *work)
17118 + struct mm_struct *mm = apf->mm;
17119 + struct kvm_vcpu *vcpu = apf->vcpu;
17120 + unsigned long addr = apf->addr;
17121 +- gva_t gva = apf->gva;
17122 ++ gpa_t cr2_or_gpa = apf->cr2_or_gpa;
17123 + int locked = 1;
17124 +
17125 + might_sleep();
17126 +@@ -104,7 +104,7 @@ static void async_pf_execute(struct work_struct *work)
17127 + * this point
17128 + */
17129 +
17130 +- trace_kvm_async_pf_completed(addr, gva);
17131 ++ trace_kvm_async_pf_completed(addr, cr2_or_gpa);
17132 +
17133 + if (swq_has_sleeper(&vcpu->wq))
17134 + swake_up_one(&vcpu->wq);
17135 +@@ -177,8 +177,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
17136 + }
17137 + }
17138 +
17139 +-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
17140 +- struct kvm_arch_async_pf *arch)
17141 ++int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
17142 ++ unsigned long hva, struct kvm_arch_async_pf *arch)
17143 + {
17144 + struct kvm_async_pf *work;
17145 +
17146 +@@ -197,7 +197,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
17147 +
17148 + work->wakeup_all = false;
17149 + work->vcpu = vcpu;
17150 +- work->gva = gva;
17151 ++ work->cr2_or_gpa = cr2_or_gpa;
17152 + work->addr = hva;
17153 + work->arch = *arch;
17154 + work->mm = current->mm;
17155 +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
17156 +index 9502b1a44232..beec19fcf8cd 100644
17157 +--- a/virt/kvm/kvm_main.c
17158 ++++ b/virt/kvm/kvm_main.c
17159 +@@ -1294,14 +1294,14 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
17160 + }
17161 + EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
17162 +
17163 +-unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
17164 ++unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
17165 + {
17166 + struct vm_area_struct *vma;
17167 + unsigned long addr, size;
17168 +
17169 + size = PAGE_SIZE;
17170 +
17171 +- addr = gfn_to_hva(kvm, gfn);
17172 ++ addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
17173 + if (kvm_is_error_hva(addr))
17174 + return PAGE_SIZE;
17175 +