Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.9 commit in: /
Date: Sat, 16 Nov 2019 10:55:03
Message-Id: 1573901685.e758eda21d87b283279b863edee5538d33067904.mpagano@gentoo
1 commit: e758eda21d87b283279b863edee5538d33067904
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Sat Nov 16 10:54:45 2019 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Sat Nov 16 10:54:45 2019 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=e758eda2
7
8 Linux patch 4.9.202
9
10 Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>
11
12 0000_README | 4 +
13 1201_linux-4.9.202.patch | 3319 ++++++++++++++++++++++++++++++++++++++++++++++
14 2 files changed, 3323 insertions(+)
15
16 diff --git a/0000_README b/0000_README
17 index 7d079d7..33f5858 100644
18 --- a/0000_README
19 +++ b/0000_README
20 @@ -847,6 +847,10 @@ Patch: 1200_linux-4.9.201.patch
21 From: http://www.kernel.org
22 Desc: Linux 4.9.201
23
24 +Patch: 1201_linux-4.9.202.patch
25 +From: http://www.kernel.org
26 +Desc: Linux 4.9.202
27 +
28 Patch: 1500_XATTR_USER_PREFIX.patch
29 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
30 Desc: Support for namespace user.pax.* on tmpfs.
31
32 diff --git a/1201_linux-4.9.202.patch b/1201_linux-4.9.202.patch
33 new file mode 100644
34 index 0000000..bbc562b
35 --- /dev/null
36 +++ b/1201_linux-4.9.202.patch
37 @@ -0,0 +1,3319 @@
38 +diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
39 +index cadb7a9a5218..b41046b5713b 100644
40 +--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
41 ++++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
42 +@@ -358,6 +358,8 @@ What: /sys/devices/system/cpu/vulnerabilities
43 + /sys/devices/system/cpu/vulnerabilities/spec_store_bypass
44 + /sys/devices/system/cpu/vulnerabilities/l1tf
45 + /sys/devices/system/cpu/vulnerabilities/mds
46 ++ /sys/devices/system/cpu/vulnerabilities/tsx_async_abort
47 ++ /sys/devices/system/cpu/vulnerabilities/itlb_multihit
48 + Date: January 2018
49 + Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
50 + Description: Information about CPU vulnerabilities
51 +diff --git a/Documentation/hw-vuln/index.rst b/Documentation/hw-vuln/index.rst
52 +index ffc064c1ec68..24f53c501366 100644
53 +--- a/Documentation/hw-vuln/index.rst
54 ++++ b/Documentation/hw-vuln/index.rst
55 +@@ -11,3 +11,5 @@ are configurable at compile, boot or run time.
56 +
57 + l1tf
58 + mds
59 ++ tsx_async_abort
60 ++ multihit.rst
61 +diff --git a/Documentation/hw-vuln/multihit.rst b/Documentation/hw-vuln/multihit.rst
62 +new file mode 100644
63 +index 000000000000..ba9988d8bce5
64 +--- /dev/null
65 ++++ b/Documentation/hw-vuln/multihit.rst
66 +@@ -0,0 +1,163 @@
67 ++iTLB multihit
68 ++=============
69 ++
70 ++iTLB multihit is an erratum where some processors may incur a machine check
71 ++error, possibly resulting in an unrecoverable CPU lockup, when an
72 ++instruction fetch hits multiple entries in the instruction TLB. This can
73 ++occur when the page size is changed along with either the physical address
74 ++or cache type. A malicious guest running on a virtualized system can
75 ++exploit this erratum to perform a denial of service attack.
76 ++
77 ++
78 ++Affected processors
79 ++-------------------
80 ++
81 ++Variations of this erratum are present on most Intel Core and Xeon processor
82 ++models. The erratum is not present on:
83 ++
84 ++ - non-Intel processors
85 ++
86 ++ - Some Atoms (Airmont, Bonnell, Goldmont, GoldmontPlus, Saltwell, Silvermont)
87 ++
88 ++ - Intel processors that have the PSCHANGE_MC_NO bit set in the
89 ++ IA32_ARCH_CAPABILITIES MSR.
90 ++
91 ++
92 ++Related CVEs
93 ++------------
94 ++
95 ++The following CVE entry is related to this issue:
96 ++
97 ++ ============== =================================================
98 ++ CVE-2018-12207 Machine Check Error Avoidance on Page Size Change
99 ++ ============== =================================================
100 ++
101 ++
102 ++Problem
103 ++-------
104 ++
105 ++Privileged software, including OS and virtual machine managers (VMM), are in
106 ++charge of memory management. A key component in memory management is the control
107 ++of the page tables. Modern processors use virtual memory, a technique that creates
108 ++the illusion of a very large memory for processors. This virtual space is split
109 ++into pages of a given size. Page tables translate virtual addresses to physical
110 ++addresses.
111 ++
112 ++To reduce latency when performing a virtual to physical address translation,
113 ++processors include a structure, called TLB, that caches recent translations.
114 ++There are separate TLBs for instruction (iTLB) and data (dTLB).
115 ++
116 ++Under this erratum, instructions are fetched from a linear address translated
117 ++using a 4 KB translation cached in the iTLB. Privileged software then modifies
118 ++the paging structure so that the same linear address is mapped with a large page
119 ++size (2 MB, 4 MB, 1 GB) and a different physical address or memory type. After
120 ++the paging-structure modification, but before the software invalidates any iTLB
121 ++entries for the linear address, a code fetch from the same linear address may
122 ++cause a machine-check error which can result in a system hang or shutdown.
123 ++
124 ++
125 ++Attack scenarios
126 ++----------------
127 ++
128 ++Attacks against the iTLB multihit erratum can be mounted from malicious
129 ++guests in a virtualized system.
130 ++
131 ++
132 ++iTLB multihit system information
133 ++--------------------------------
134 ++
135 ++The Linux kernel provides a sysfs interface to enumerate the current iTLB
136 ++multihit status of the system: whether the system is vulnerable and which
137 ++mitigations are active. The relevant sysfs file is:
138 ++
139 ++/sys/devices/system/cpu/vulnerabilities/itlb_multihit
140 ++
141 ++The possible values in this file are:
142 ++
143 ++.. list-table::
144 ++
145 ++ * - Not affected
146 ++ - The processor is not vulnerable.
147 ++ * - KVM: Mitigation: Split huge pages
148 ++ - Software changes mitigate this issue.
149 ++ * - KVM: Vulnerable
150 ++ - The processor is vulnerable, but no mitigation is enabled.
151 ++
152 ++
153 ++Enumeration of the erratum
154 ++--------------------------------
155 ++
156 ++A new bit (PSCHANGE_MC_NO) has been allocated in the IA32_ARCH_CAPABILITIES MSR
157 ++and will be set on CPUs which are mitigated against this issue.
158 ++
159 ++ ======================================= =========== ===============================
160 ++ IA32_ARCH_CAPABILITIES MSR Not present Possibly vulnerable, check model
161 ++ IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '0' Likely vulnerable, check model
162 ++ IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '1' Not vulnerable
163 ++ ======================================= =========== ===============================
164 ++
165 ++
166 ++Mitigation mechanism
167 ++-------------------------
168 ++
169 ++This erratum can be mitigated by restricting the use of large page sizes to
170 ++non-executable pages. This forces all iTLB entries to be 4K, and removes
171 ++the possibility of multiple hits.
172 ++
173 ++In order to mitigate the vulnerability, KVM initially marks all huge pages
174 ++as non-executable. If the guest attempts to execute in one of those pages,
175 ++the page is broken down into 4K pages, which are then marked executable.
176 ++
177 ++If EPT is disabled or not available on the host, KVM is in control of TLB
178 ++flushes and the problematic situation cannot happen. However, the shadow
179 ++EPT paging mechanism used by nested virtualization is vulnerable, because
180 ++the nested guest can trigger multiple iTLB hits by modifying its own
181 ++(non-nested) page tables. For simplicity, KVM will make large pages
182 ++non-executable in all shadow paging modes.
183 ++
184 ++Mitigation control on the kernel command line and KVM - module parameter
185 ++------------------------------------------------------------------------
186 ++
187 ++The KVM hypervisor mitigation mechanism for marking huge pages as
188 ++non-executable can be controlled with a module parameter "nx_huge_pages=".
189 ++The kernel command line allows controlling the iTLB multihit mitigations at
190 ++boot time with the option "kvm.nx_huge_pages=".
191 ++
192 ++The valid arguments for these options are:
193 ++
194 ++ ========== ================================================================
195 ++ force Mitigation is enabled. In this case, the mitigation implements
196 ++ non-executable huge pages in Linux kernel KVM module. All huge
197 ++ pages in the EPT are marked as non-executable.
198 ++ If a guest attempts to execute in one of those pages, the page is
199 ++ broken down into 4K pages, which are then marked executable.
200 ++
201 ++ off Mitigation is disabled.
202 ++
203 ++ auto Enable mitigation only if the platform is affected and the kernel
204 ++ was not booted with the "mitigations=off" command line parameter.
205 ++ This is the default option.
206 ++ ========== ================================================================
207 ++
208 ++
209 ++Mitigation selection guide
210 ++--------------------------
211 ++
212 ++1. No virtualization in use
213 ++^^^^^^^^^^^^^^^^^^^^^^^^^^^
214 ++
215 ++ The system is protected by the kernel unconditionally and no further
216 ++ action is required.
217 ++
218 ++2. Virtualization with trusted guests
219 ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
220 ++
221 ++ If the guest comes from a trusted source, you may assume that the guest will
222 ++ not attempt to maliciously exploit these errata and no further action is
223 ++ required.
224 ++
225 ++3. Virtualization with untrusted guests
226 ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
227 ++ If the guest comes from an untrusted source, the host kernel will need
228 ++ to apply the iTLB multihit mitigation via the kernel command line or the kvm
229 ++ module parameter.
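For readers who want to check the new status file from user space, here is a minimal sketch (not part of the patch) that reads the itlb_multihit entry documented above; the only assumptions are that sysfs is mounted at /sys and that the running kernel carries this patch.

  /* Minimal sketch: read and print the iTLB multihit status documented above. */
  #include <stdio.h>

  int main(void)
  {
      const char *path = "/sys/devices/system/cpu/vulnerabilities/itlb_multihit";
      char line[256];
      FILE *f = fopen(path, "r");

      if (!f) {
          perror(path);   /* file is absent on kernels without this patch */
          return 1;
      }
      if (fgets(line, sizeof(line), f))
          printf("itlb_multihit: %s", line);  /* e.g. "KVM: Mitigation: Split huge pages" */
      fclose(f);
      return 0;
  }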
230 +diff --git a/Documentation/hw-vuln/tsx_async_abort.rst b/Documentation/hw-vuln/tsx_async_abort.rst
231 +new file mode 100644
232 +index 000000000000..fddbd7579c53
233 +--- /dev/null
234 ++++ b/Documentation/hw-vuln/tsx_async_abort.rst
235 +@@ -0,0 +1,276 @@
236 ++.. SPDX-License-Identifier: GPL-2.0
237 ++
238 ++TAA - TSX Asynchronous Abort
239 ++======================================
240 ++
241 ++TAA is a hardware vulnerability that allows unprivileged speculative access to
242 ++data which is available in various CPU internal buffers by using asynchronous
243 ++aborts within an Intel TSX transactional region.
244 ++
245 ++Affected processors
246 ++-------------------
247 ++
248 ++This vulnerability only affects Intel processors that support Intel
249 ++Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8)
250 ++is 0 in the IA32_ARCH_CAPABILITIES MSR. On processors where the MDS_NO bit
251 ++(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations
252 ++also mitigate against TAA.
253 ++
254 ++Whether a processor is affected or not can be read out from the TAA
255 ++vulnerability file in sysfs. See :ref:`tsx_async_abort_sys_info`.
256 ++
257 ++Related CVEs
258 ++------------
259 ++
260 ++The following CVE entry is related to this TAA issue:
261 ++
262 ++ ============== ===== ===================================================
263 ++ CVE-2019-11135 TAA TSX Asynchronous Abort (TAA) condition on some
264 ++ microprocessors utilizing speculative execution may
265 ++ allow an authenticated user to potentially enable
266 ++ information disclosure via a side channel with
267 ++ local access.
268 ++ ============== ===== ===================================================
269 ++
270 ++Problem
271 ++-------
272 ++
273 ++When performing store, load or L1 refill operations, processors write
274 ++data into temporary microarchitectural structures (buffers). The data in
275 ++those buffers can be forwarded to load operations as an optimization.
276 ++
277 ++Intel TSX is an extension to the x86 instruction set architecture that adds
278 ++hardware transactional memory support to improve performance of multi-threaded
279 ++software. TSX lets the processor expose and exploit concurrency hidden in an
280 ++application by dynamically avoiding unnecessary synchronization.
281 ++
282 ++TSX supports atomic memory transactions that are either committed (success) or
283 ++aborted. During an abort, operations that happened within the transactional region
284 ++are rolled back. An asynchronous abort takes place, among other options, when a
285 ++different thread accesses a cache line that is also used within the transactional
286 ++region when that access might lead to a data race.
287 ++
288 ++Immediately after an uncompleted asynchronous abort, certain speculatively
289 ++executed loads may read data from those internal buffers and pass it to dependent
290 ++operations. This can be then used to infer the value via a cache side channel
291 ++attack.
292 ++
293 ++Because the buffers are potentially shared between Hyper-Threads, cross
294 ++Hyper-Thread attacks are possible.
295 ++
296 ++The victim of a malicious actor does not need to make use of TSX. Only the
297 ++attacker needs to begin a TSX transaction and raise an asynchronous abort
298 ++which in turn potentially leaks data stored in the buffers.
299 ++
300 ++More detailed technical information is available in the TAA specific x86
301 ++architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`.
302 ++
303 ++
304 ++Attack scenarios
305 ++----------------
306 ++
307 ++Attacks against the TAA vulnerability can be implemented from unprivileged
308 ++applications running on hosts or guests.
309 ++
310 ++As for MDS, the attacker has no control over the memory addresses that can
311 ++be leaked. Only the victim is responsible for bringing data to the CPU. As
312 ++a result, the malicious actor has to sample as much data as possible and
313 ++then postprocess it to try to infer any useful information from it.
314 ++
315 ++A potential attacker only has read access to the data. Also, there is no direct
316 ++privilege escalation by using this technique.
317 ++
318 ++
319 ++.. _tsx_async_abort_sys_info:
320 ++
321 ++TAA system information
322 ++-----------------------
323 ++
324 ++The Linux kernel provides a sysfs interface to enumerate the current TAA status
325 ++of mitigated systems. The relevant sysfs file is:
326 ++
327 ++/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
328 ++
329 ++The possible values in this file are:
330 ++
331 ++.. list-table::
332 ++
333 ++ * - 'Vulnerable'
334 ++ - The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied.
335 ++ * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
336 ++ - The system tries to clear the buffers but the microcode might not support the operation.
337 ++ * - 'Mitigation: Clear CPU buffers'
338 ++ - The microcode has been updated to clear the buffers. TSX is still enabled.
339 ++ * - 'Mitigation: TSX disabled'
340 ++ - TSX is disabled.
341 ++ * - 'Not affected'
342 ++ - The CPU is not affected by this issue.
343 ++
344 ++.. _ucode_needed:
345 ++
346 ++Best effort mitigation mode
347 ++^^^^^^^^^^^^^^^^^^^^^^^^^^^
348 ++
349 ++If the processor is vulnerable, but the availability of the microcode-based
350 ++mitigation mechanism is not advertised via CPUID, the kernel selects a best
351 ++effort mitigation mode. This mode invokes the mitigation instructions
352 ++without a guarantee that they clear the CPU buffers.
353 ++
354 ++This is done to address virtualization scenarios where the host has the
355 ++microcode update applied, but the hypervisor is not yet updated to expose the
356 ++CPUID to the guest. If the host has updated microcode the protection takes
357 ++effect; otherwise a few CPU cycles are wasted pointlessly.
358 ++
359 ++The state in the tsx_async_abort sysfs file reflects this situation
360 ++accordingly.
361 ++
362 ++
363 ++Mitigation mechanism
364 ++--------------------
365 ++
366 ++The kernel detects the affected CPUs and the presence of the microcode which is
367 ++required. If a CPU is affected and the microcode is available, then the kernel
368 ++enables the mitigation by default.
369 ++
370 ++
371 ++The mitigation can be controlled at boot time via a kernel command line option.
372 ++See :ref:`taa_mitigation_control_command_line`.
373 ++
374 ++.. _virt_mechanism:
375 ++
376 ++Virtualization mitigation
377 ++^^^^^^^^^^^^^^^^^^^^^^^^^
378 ++
379 ++Affected systems where the host has the TAA microcode and TAA is mitigated by
380 ++having disabled TSX previously are not vulnerable, regardless of the status
381 ++of the VMs.
382 ++
383 ++In all other cases, if the host either does not have the TAA microcode or
384 ++the kernel is not mitigated, the system might be vulnerable.
385 ++
386 ++
387 ++.. _taa_mitigation_control_command_line:
388 ++
389 ++Mitigation control on the kernel command line
390 ++---------------------------------------------
391 ++
392 ++The kernel command line allows controlling the TAA mitigations at boot time with
393 ++the option "tsx_async_abort=". The valid arguments for this option are:
394 ++
395 ++ ============ =============================================================
396 ++ off This option disables the TAA mitigation on affected platforms.
397 ++ If the system has TSX enabled (see next parameter) and the CPU
398 ++ is affected, the system is vulnerable.
399 ++
400 ++ full TAA mitigation is enabled. If TSX is enabled, on an affected
401 ++ system it will clear CPU buffers on ring transitions. On
402 ++ systems which are MDS-affected and deploy MDS mitigation,
403 ++ TAA is also mitigated. Specifying this option on those
404 ++ systems will have no effect.
405 ++
406 ++ full,nosmt The same as tsx_async_abort=full, with SMT disabled on
407 ++ vulnerable CPUs that have TSX enabled. This is the complete
408 ++ mitigation. When TSX is disabled, SMT is not disabled because the
409 ++ CPU is not vulnerable to cross-thread TAA attacks.
410 ++ ============ =============================================================
411 ++
412 ++Not specifying this option is equivalent to "tsx_async_abort=full".
413 ++
414 ++The kernel command line also allows controlling the TSX feature using the
415 ++parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used
416 ++to control the TSX feature and the enumeration of the TSX feature bits (RTM
417 ++and HLE) in CPUID.
418 ++
419 ++The valid options are:
420 ++
421 ++ ============ =============================================================
422 ++ off Disables TSX on the system.
423 ++
424 ++ Note that this option takes effect only on newer CPUs which are
425 ++ not vulnerable to MDS, i.e., have MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1
426 ++ and which get the new IA32_TSX_CTRL MSR through a microcode
427 ++ update. This new MSR allows for the reliable deactivation of
428 ++ the TSX functionality.
429 ++
430 ++ on Enables TSX.
431 ++
432 ++ Although there are mitigations for all known security
433 ++ vulnerabilities, TSX has been known to be an accelerator for
434 ++ several previous speculation-related CVEs, and so there may be
435 ++ unknown security risks associated with leaving it enabled.
436 ++
437 ++ auto Disables TSX if X86_BUG_TAA is present, otherwise enables TSX
438 ++ on the system.
439 ++ ============ =============================================================
440 ++
441 ++Not specifying this option is equivalent to "tsx=off".
442 ++
443 ++The following combinations of the "tsx_async_abort" and "tsx" options are possible.
444 ++For affected platforms tsx=auto is equivalent to tsx=off and the result will be:
445 ++
446 ++ ========= ========================== =========================================
447 ++ tsx=on tsx_async_abort=full The system will use VERW to clear CPU
448 ++ buffers. Cross-thread attacks are still
449 ++ possible on SMT machines.
450 ++ tsx=on tsx_async_abort=full,nosmt As above, cross-thread attacks on SMT
451 ++ mitigated.
452 ++ tsx=on tsx_async_abort=off The system is vulnerable.
453 ++ tsx=off tsx_async_abort=full TSX might be disabled if microcode
454 ++ provides a TSX control MSR. If so,
455 ++ system is not vulnerable.
456 ++ tsx=off tsx_async_abort=full,nosmt Ditto
457 ++ tsx=off tsx_async_abort=off ditto
458 ++ ========= ========================== =========================================
459 ++
460 ++
461 ++On unaffected platforms "tsx=on" and "tsx_async_abort=full" do not clear CPU
462 ++buffers. On platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0)
463 ++the "tsx" command line argument has no effect.
464 ++
465 ++For the affected platforms, the table below indicates the mitigation status for
466 ++the combinations of the CPUID bit MD_CLEAR and the IA32_ARCH_CAPABILITIES MSR
467 ++bits MDS_NO and TSX_CTRL_MSR.
468 ++
469 ++ ======= ========= ============= ========================================
470 ++ MDS_NO MD_CLEAR TSX_CTRL_MSR Status
471 ++ ======= ========= ============= ========================================
472 ++ 0 0 0 Vulnerable (needs microcode)
473 ++ 0 1 0 MDS and TAA mitigated via VERW
474 ++ 1 1 0 MDS fixed, TAA vulnerable if TSX enabled
475 ++ because MD_CLEAR has no meaning and
476 ++ VERW is not guaranteed to clear buffers
477 ++ 1 X 1 MDS fixed, TAA can be mitigated by
478 ++ VERW or TSX_CTRL_MSR
479 ++ ======= ========= ============= ========================================
480 ++
481 ++Mitigation selection guide
482 ++--------------------------
483 ++
484 ++1. Trusted userspace and guests
485 ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
486 ++
487 ++If all user space applications are from a trusted source and do not execute
488 ++untrusted code which is supplied externally, then the mitigation can be
489 ++disabled. The same applies to virtualized environments with trusted guests.
490 ++
491 ++
492 ++2. Untrusted userspace and guests
493 ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
494 ++
495 ++If there are untrusted applications or guests on the system, enabling TSX
496 ++might allow a malicious actor to leak data from the host or from other
497 ++processes running on the same physical core.
498 ++
499 ++If the microcode is available and TSX is disabled on the host, attacks
500 ++are prevented in a virtualized environment as well, even if the VMs do not
501 ++explicitly enable the mitigation.
502 ++
503 ++
504 ++.. _taa_default_mitigations:
505 ++
506 ++Default mitigations
507 ++-------------------
508 ++
509 ++The kernel's default action for vulnerable processors is:
510 ++
511 ++ - Deploy TSX disable mitigation (tsx_async_abort=full tsx=off).
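The enumeration bits described above can also be inspected from user space. The following sketch (not part of the patch) assumes the standard msr driver interface at /dev/cpu/0/msr and root privileges; the MSR index 0x10a is an assumption based on IA32_ARCH_CAPABILITIES, and the bit positions mirror the arch/x86/include/asm/msr-index.h hunk further down in this patch.

  /* Illustrative sketch: decode the IA32_ARCH_CAPABILITIES bits discussed above. */
  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <unistd.h>

  #define MSR_IA32_ARCH_CAPABILITIES 0x10a           /* assumed MSR index */
  #define ARCH_CAP_MDS_NO            (1ULL << 5)
  #define ARCH_CAP_PSCHANGE_MC_NO    (1ULL << 6)
  #define ARCH_CAP_TSX_CTRL_MSR      (1ULL << 7)
  #define ARCH_CAP_TAA_NO            (1ULL << 8)

  int main(void)
  {
      uint64_t cap = 0;
      int fd = open("/dev/cpu/0/msr", O_RDONLY);     /* needs "modprobe msr" and root */

      if (fd < 0 || pread(fd, &cap, sizeof(cap), MSR_IA32_ARCH_CAPABILITIES) != sizeof(cap)) {
          perror("read IA32_ARCH_CAPABILITIES");     /* driver missing or MSR not present */
          return 1;
      }
      printf("MDS_NO=%d PSCHANGE_MC_NO=%d TSX_CTRL=%d TAA_NO=%d\n",
             !!(cap & ARCH_CAP_MDS_NO), !!(cap & ARCH_CAP_PSCHANGE_MC_NO),
             !!(cap & ARCH_CAP_TSX_CTRL_MSR), !!(cap & ARCH_CAP_TAA_NO));
      close(fd);
      return 0;
  }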
512 +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
513 +index 61b73e42f488..c81a008d6512 100644
514 +--- a/Documentation/kernel-parameters.txt
515 ++++ b/Documentation/kernel-parameters.txt
516 +@@ -1975,6 +1975,25 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
517 + KVM MMU at runtime.
518 + Default is 0 (off)
519 +
520 ++ kvm.nx_huge_pages=
521 ++ [KVM] Controls the software workaround for the
522 ++ X86_BUG_ITLB_MULTIHIT bug.
523 ++ force : Always deploy workaround.
524 ++ off : Never deploy workaround.
525 ++ auto : Deploy workaround based on the presence of
526 ++ X86_BUG_ITLB_MULTIHIT.
527 ++
528 ++ Default is 'auto'.
529 ++
530 ++ If the software workaround is enabled for the host,
531 ++ guests do not need to enable it for nested guests.
532 ++
533 ++ kvm.nx_huge_pages_recovery_ratio=
534 ++ [KVM] Controls how many 4KiB pages are periodically zapped
535 ++ back to huge pages. 0 disables the recovery, otherwise if
536 ++ the value is N KVM will zap 1/Nth of the 4KiB pages every
537 ++ minute. The default is 60.
538 ++
539 + kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
540 + Default is 1 (enabled)
541 +
542 +@@ -2490,6 +2509,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
543 + spec_store_bypass_disable=off [X86]
544 + l1tf=off [X86]
545 + mds=off [X86]
546 ++ tsx_async_abort=off [X86]
547 ++ kvm.nx_huge_pages=off [X86]
548 ++
549 ++ Exceptions:
550 ++ This does not have any effect on
551 ++ kvm.nx_huge_pages when
552 ++ kvm.nx_huge_pages=force.
553 +
554 + auto (default)
555 + Mitigate all CPU vulnerabilities, but leave SMT
556 +@@ -2505,6 +2531,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
557 + be fully mitigated, even if it means losing SMT.
558 + Equivalent to: l1tf=flush,nosmt [X86]
559 + mds=full,nosmt [X86]
560 ++ tsx_async_abort=full,nosmt [X86]
561 +
562 + mminit_loglevel=
563 + [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
564 +@@ -4516,6 +4543,71 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
565 + platforms where RDTSC is slow and this accounting
566 + can add overhead.
567 +
568 ++ tsx= [X86] Control Transactional Synchronization
569 ++ Extensions (TSX) feature in Intel processors that
570 ++ support TSX control.
571 ++
572 ++ This parameter controls the TSX feature. The options are:
573 ++
574 ++ on - Enable TSX on the system. Although there are
575 ++ mitigations for all known security vulnerabilities,
576 ++ TSX has been known to be an accelerator for
577 ++ several previous speculation-related CVEs, and
578 ++ so there may be unknown security risks associated
579 ++ with leaving it enabled.
580 ++
581 ++ off - Disable TSX on the system. (Note that this
582 ++ option takes effect only on newer CPUs which are
583 ++ not vulnerable to MDS, i.e., have
584 ++ MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get
585 ++ the new IA32_TSX_CTRL MSR through a microcode
586 ++ update. This new MSR allows for the reliable
587 ++ deactivation of the TSX functionality.)
588 ++
589 ++ auto - Disable TSX if X86_BUG_TAA is present,
590 ++ otherwise enable TSX on the system.
591 ++
592 ++ Not specifying this option is equivalent to tsx=off.
593 ++
594 ++ See Documentation/hw-vuln/tsx_async_abort.rst
595 ++ for more details.
596 ++
597 ++ tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async
598 ++ Abort (TAA) vulnerability.
599 ++
600 ++ Similar to Micro-architectural Data Sampling (MDS)
601 ++ certain CPUs that support Transactional
602 ++ Synchronization Extensions (TSX) are vulnerable to an
603 ++ exploit against CPU internal buffers which can forward
604 ++ information to a disclosure gadget under certain
605 ++ conditions.
606 ++
607 ++ In vulnerable processors, the speculatively forwarded
608 ++ data can be used in a cache side channel attack, to
609 ++ access data to which the attacker does not have direct
610 ++ access.
611 ++
612 ++ This parameter controls the TAA mitigation. The
613 ++ options are:
614 ++
615 ++ full - Enable TAA mitigation on vulnerable CPUs
616 ++ if TSX is enabled.
617 ++
618 ++ full,nosmt - Enable TAA mitigation and disable SMT on
619 ++ vulnerable CPUs. If TSX is disabled, SMT
620 ++ is not disabled because the CPU is not
621 ++ vulnerable to cross-thread TAA attacks.
622 ++ off - Unconditionally disable TAA mitigation
623 ++
624 ++ Not specifying this option is equivalent to
625 ++ tsx_async_abort=full. On CPUs which are MDS affected
626 ++ and deploy MDS mitigation, TAA mitigation is not
627 ++ required and doesn't provide any additional
628 ++ mitigation.
629 ++
630 ++ For details see:
631 ++ Documentation/hw-vuln/tsx_async_abort.rst
632 ++
633 + turbografx.map[2|3]= [HW,JOY]
634 + TurboGraFX parallel port interface
635 + Format:
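To verify which of the boot options documented above actually reached the running kernel, a simple scan of /proc/cmdline is enough. This is an illustrative sketch only (not part of the patch); an option that is absent means the documented default applies.

  /* Quick sketch: report which of the documented boot options were passed. */
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
      static const char *opts[] = { "tsx=", "tsx_async_abort=", "kvm.nx_huge_pages=", "mitigations=" };
      char cmdline[4096] = "";
      FILE *f = fopen("/proc/cmdline", "r");

      if (!f || !fgets(cmdline, sizeof(cmdline), f)) {
          perror("/proc/cmdline");
          return 1;
      }
      fclose(f);
      for (unsigned i = 0; i < sizeof(opts) / sizeof(opts[0]); i++) {
          const char *hit = strstr(cmdline, opts[i]);
          printf("%-20s %s\n", opts[i], hit ? "present" : "not set (default applies)");
      }
      return 0;
  }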
636 +diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
637 +index e5dd9f4d6100..46ef3680c8ab 100644
638 +--- a/Documentation/virtual/kvm/locking.txt
639 ++++ b/Documentation/virtual/kvm/locking.txt
640 +@@ -13,8 +13,8 @@ The acquisition orders for mutexes are as follows:
641 + - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
642 + them together is quite rare.
643 +
644 +-For spinlocks, kvm_lock is taken outside kvm->mmu_lock. Everything
645 +-else is a leaf: no other lock is taken inside the critical sections.
646 ++Everything else is a leaf: no other lock is taken inside the critical
647 ++sections.
648 +
649 + 2: Exception
650 + ------------
651 +@@ -142,7 +142,7 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update().
652 + ------------
653 +
654 + Name: kvm_lock
655 +-Type: spinlock_t
656 ++Type: mutex
657 + Arch: any
658 + Protects: - vm_list
659 +
660 +diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
661 +index ef389dcf1b1d..0780d55c5aa8 100644
662 +--- a/Documentation/x86/index.rst
663 ++++ b/Documentation/x86/index.rst
664 +@@ -6,3 +6,4 @@ x86 architecture specifics
665 + :maxdepth: 1
666 +
667 + mds
668 ++ tsx_async_abort
669 +diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/x86/tsx_async_abort.rst
670 +new file mode 100644
671 +index 000000000000..4a4336a89372
672 +--- /dev/null
673 ++++ b/Documentation/x86/tsx_async_abort.rst
674 +@@ -0,0 +1,117 @@
675 ++.. SPDX-License-Identifier: GPL-2.0
676 ++
677 ++TSX Async Abort (TAA) mitigation
678 ++================================
679 ++
680 ++.. _tsx_async_abort:
681 ++
682 ++Overview
683 ++--------
684 ++
685 ++TSX Async Abort (TAA) is a side channel attack on internal buffers in some
686 ++Intel processors similar to Microarchitectural Data Sampling (MDS). In this
687 ++case certain loads may speculatively pass invalid data to dependent operations
688 ++when an asynchronous abort condition is pending in a Transactional
689 ++Synchronization Extensions (TSX) transaction. This includes loads with no
690 ++fault or assist condition. Such loads may speculatively expose stale data from
691 ++the same uarch data structures as in MDS, with the same scope of exposure, i.e.
692 ++same-thread and cross-thread. This issue affects all current processors that
693 ++support TSX.
694 ++
695 ++Mitigation strategy
696 ++-------------------
697 ++
698 ++a) TSX disable - one of the mitigations is to disable TSX. A new MSR,
699 ++IA32_TSX_CTRL, is available on current and future processors after a
700 ++microcode update and can be used to disable TSX. In addition, it
701 ++controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID.
702 ++
703 ++b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this
704 ++vulnerability. More details on this approach can be found in
705 ++:ref:`Documentation/hw-vuln/mds.rst <mds>`.
706 ++
707 ++Kernel internal mitigation modes
708 ++--------------------------------
709 ++
710 ++ ============= ============================================================
711 ++ off Mitigation is disabled. Either the CPU is not affected or
712 ++ tsx_async_abort=off is supplied on the kernel command line.
713 ++
714 ++ tsx disabled Mitigation is enabled. TSX feature is disabled by default at
715 ++ bootup on processors that support TSX control.
716 ++
717 ++ verw Mitigation is enabled. CPU is affected and MD_CLEAR is
718 ++ advertised in CPUID.
719 ++
720 ++ ucode needed Mitigation is enabled. CPU is affected and MD_CLEAR is not
721 ++ advertised in CPUID. That is mainly for virtualization
722 ++ scenarios where the host has the updated microcode but the
723 ++ hypervisor does not expose MD_CLEAR in CPUID. It's a best
724 ++ effort approach without guarantee.
725 ++ ============= ============================================================
726 ++
727 ++If the CPU is affected and the "tsx_async_abort" kernel command line parameter is
728 ++not provided, then the kernel selects an appropriate mitigation depending on the
729 ++status of the RTM and MD_CLEAR CPUID bits.
730 ++
731 ++The tables below indicate the impact of the tsx=on|off|auto cmdline options on the
732 ++state of TAA mitigation, VERW behavior and the TSX feature for various combinations
733 ++of MSR_IA32_ARCH_CAPABILITIES bits.
734 ++
735 ++1. "tsx=off"
736 ++
737 ++========= ========= ============ ============ ============== =================== ======================
738 ++MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=off
739 ++---------------------------------- -------------------------------------------------------------------------
740 ++TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation
741 ++ after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full
742 ++========= ========= ============ ============ ============== =================== ======================
743 ++ 0 0 0 HW default Yes Same as MDS Same as MDS
744 ++ 0 0 1 Invalid case Invalid case Invalid case Invalid case
745 ++ 0 1 0 HW default No Need ucode update Need ucode update
746 ++ 0 1 1 Disabled Yes TSX disabled TSX disabled
747 ++ 1 X 1 Disabled X None needed None needed
748 ++========= ========= ============ ============ ============== =================== ======================
749 ++
750 ++2. "tsx=on"
751 ++
752 ++========= ========= ============ ============ ============== =================== ======================
753 ++MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=on
754 ++---------------------------------- -------------------------------------------------------------------------
755 ++TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation
756 ++ after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full
757 ++========= ========= ============ ============ ============== =================== ======================
758 ++ 0 0 0 HW default Yes Same as MDS Same as MDS
759 ++ 0 0 1 Invalid case Invalid case Invalid case Invalid case
760 ++ 0 1 0 HW default No Need ucode update Need ucode update
761 ++ 0 1 1 Enabled Yes None Same as MDS
762 ++ 1 X 1 Enabled X None needed None needed
763 ++========= ========= ============ ============ ============== =================== ======================
764 ++
765 ++3. "tsx=auto"
766 ++
767 ++========= ========= ============ ============ ============== =================== ======================
768 ++MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=auto
769 ++---------------------------------- -------------------------------------------------------------------------
770 ++TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation
771 ++ after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full
772 ++========= ========= ============ ============ ============== =================== ======================
773 ++ 0 0 0 HW default Yes Same as MDS Same as MDS
774 ++ 0 0 1 Invalid case Invalid case Invalid case Invalid case
775 ++ 0 1 0 HW default No Need ucode update Need ucode update
776 ++ 0 1 1 Disabled Yes TSX disabled TSX disabled
777 ++ 1 X 1 Enabled X None needed None needed
778 ++========= ========= ============ ============ ============== =================== ======================
779 ++
780 ++In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that
781 ++indicates whether MSR_IA32_TSX_CTRL is supported.
782 ++
783 ++There are two control bits in IA32_TSX_CTRL MSR:
784 ++
785 ++ Bit 0: When set it disables the Restricted Transactional Memory (RTM)
786 ++ sub-feature of TSX (will force all transactions to abort on the
787 ++ XBEGIN instruction).
788 ++
789 ++ Bit 1: When set it disables the enumeration of the RTM and HLE feature
790 ++ (i.e. it will make CPUID(EAX=7).EBX{bit4} and
791 ++ CPUID(EAX=7).EBX{bit11} read as 0).
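As a small worked example of the two IA32_TSX_CTRL bits described above (not part of the patch): the tsx=off path sets both bits, which yields the value 0x3. The MSR index and bit names mirror the msr-index.h hunk earlier in this patch.

  /* Sketch: compose the IA32_TSX_CTRL value that fully disables TSX. */
  #include <stdint.h>
  #include <stdio.h>

  #define MSR_IA32_TSX_CTRL     0x122
  #define TSX_CTRL_RTM_DISABLE  (1ULL << 0)   /* force RTM transactions to abort */
  #define TSX_CTRL_CPUID_CLEAR  (1ULL << 1)   /* hide RTM/HLE from CPUID */

  int main(void)
  {
      uint64_t tsx_off = TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR;

      printf("wrmsr 0x%x <- 0x%llx disables TSX and its enumeration\n",
             MSR_IA32_TSX_CTRL, (unsigned long long)tsx_off);
      return 0;
  }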
792 +diff --git a/Makefile b/Makefile
793 +index 4741bbdfaa10..1e322e669301 100644
794 +--- a/Makefile
795 ++++ b/Makefile
796 +@@ -1,6 +1,6 @@
797 + VERSION = 4
798 + PATCHLEVEL = 9
799 +-SUBLEVEL = 201
800 ++SUBLEVEL = 202
801 + EXTRAVERSION =
802 + NAME = Roaring Lionus
803 +
804 +diff --git a/arch/mips/bcm63xx/reset.c b/arch/mips/bcm63xx/reset.c
805 +index d1fe51edf5e6..4d411da2497b 100644
806 +--- a/arch/mips/bcm63xx/reset.c
807 ++++ b/arch/mips/bcm63xx/reset.c
808 +@@ -119,7 +119,7 @@
809 + #define BCM6368_RESET_DSL 0
810 + #define BCM6368_RESET_SAR SOFTRESET_6368_SAR_MASK
811 + #define BCM6368_RESET_EPHY SOFTRESET_6368_EPHY_MASK
812 +-#define BCM6368_RESET_ENETSW 0
813 ++#define BCM6368_RESET_ENETSW SOFTRESET_6368_ENETSW_MASK
814 + #define BCM6368_RESET_PCM SOFTRESET_6368_PCM_MASK
815 + #define BCM6368_RESET_MPI SOFTRESET_6368_MPI_MASK
816 + #define BCM6368_RESET_PCIE 0
817 +diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
818 +index 3dc96b455e0c..37c254677ccd 100644
819 +--- a/arch/s390/kvm/kvm-s390.c
820 ++++ b/arch/s390/kvm/kvm-s390.c
821 +@@ -1422,13 +1422,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
822 + kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
823 + if (!kvm->arch.sca)
824 + goto out_err;
825 +- spin_lock(&kvm_lock);
826 ++ mutex_lock(&kvm_lock);
827 + sca_offset += 16;
828 + if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE)
829 + sca_offset = 0;
830 + kvm->arch.sca = (struct bsca_block *)
831 + ((char *) kvm->arch.sca + sca_offset);
832 +- spin_unlock(&kvm_lock);
833 ++ mutex_unlock(&kvm_lock);
834 +
835 + sprintf(debug_name, "kvm-%u", current->pid);
836 +
837 +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
838 +index e0055b4302d6..1067f7668c4e 100644
839 +--- a/arch/x86/Kconfig
840 ++++ b/arch/x86/Kconfig
841 +@@ -1755,6 +1755,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
842 +
843 + If unsure, say y.
844 +
845 ++choice
846 ++ prompt "TSX enable mode"
847 ++ depends on CPU_SUP_INTEL
848 ++ default X86_INTEL_TSX_MODE_OFF
849 ++ help
850 ++ Intel's TSX (Transactional Synchronization Extensions) feature
851 ++ allows optimizing locking protocols through lock elision, which
852 ++ can lead to a noticeable performance boost.
853 ++
854 ++ On the other hand it has been shown that TSX can be exploited
855 ++ to form side channel attacks (e.g. TAA) and chances are there
856 ++ will be more of those attacks discovered in the future.
857 ++
858 ++ Therefore TSX is not enabled by default (aka tsx=off). An admin
859 ++ might override this decision with the tsx=on command line parameter.
860 ++ Even with TSX enabled, the kernel will attempt to enable the best
861 ++ possible TAA mitigation setting depending on the microcode available
862 ++ for the particular machine.
863 ++
864 ++ This option allows setting the default tsx mode to tsx=on, =off
865 ++ or =auto. See Documentation/kernel-parameters.txt for more
866 ++ details.
867 ++
868 ++ Say off if not sure, auto if TSX is in use but should only be enabled on
869 ++ safe platforms, or on if TSX is in use and the security aspect of TSX is
870 ++ not relevant.
871 ++
872 ++config X86_INTEL_TSX_MODE_OFF
873 ++ bool "off"
874 ++ help
875 ++ TSX is disabled if possible - equals the tsx=off command line parameter.
876 ++
877 ++config X86_INTEL_TSX_MODE_ON
878 ++ bool "on"
879 ++ help
880 ++ TSX is always enabled on TSX capable HW - equals the tsx=on command
881 ++ line parameter.
882 ++
883 ++config X86_INTEL_TSX_MODE_AUTO
884 ++ bool "auto"
885 ++ help
886 ++ TSX is enabled on TSX capable HW that is believed to be safe against
887 ++ side channel attacks - equals the tsx=auto command line parameter.
888 ++endchoice
889 ++
890 + config EFI
891 + bool "EFI runtime service support"
892 + depends on ACPI
893 +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
894 +index 3a972da155d6..ccc4420f051b 100644
895 +--- a/arch/x86/include/asm/cpufeatures.h
896 ++++ b/arch/x86/include/asm/cpufeatures.h
897 +@@ -357,5 +357,7 @@
898 + #define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
899 + #define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */
900 + #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
901 ++#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
902 ++#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
903 +
904 + #endif /* _ASM_X86_CPUFEATURES_H */
905 +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
906 +index 222cb69e1219..d2c14a96ec28 100644
907 +--- a/arch/x86/include/asm/kvm_host.h
908 ++++ b/arch/x86/include/asm/kvm_host.h
909 +@@ -261,6 +261,7 @@ struct kvm_rmap_head {
910 + struct kvm_mmu_page {
911 + struct list_head link;
912 + struct hlist_node hash_link;
913 ++ struct list_head lpage_disallowed_link;
914 +
915 + /*
916 + * The following two entries are used to key the shadow page in the
917 +@@ -273,6 +274,7 @@ struct kvm_mmu_page {
918 + /* hold the gfn of each spte inside spt */
919 + gfn_t *gfns;
920 + bool unsync;
921 ++ bool lpage_disallowed; /* Can't be replaced by an equiv large page */
922 + int root_count; /* Currently serving as active root */
923 + unsigned int unsync_children;
924 + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
925 +@@ -724,6 +726,7 @@ struct kvm_arch {
926 + */
927 + struct list_head active_mmu_pages;
928 + struct list_head zapped_obsolete_pages;
929 ++ struct list_head lpage_disallowed_mmu_pages;
930 + struct kvm_page_track_notifier_node mmu_sp_tracker;
931 + struct kvm_page_track_notifier_head track_notifier_head;
932 +
933 +@@ -798,6 +801,8 @@ struct kvm_arch {
934 +
935 + bool x2apic_format;
936 + bool x2apic_broadcast_quirk_disabled;
937 ++
938 ++ struct task_struct *nx_lpage_recovery_thread;
939 + };
940 +
941 + struct kvm_vm_stat {
942 +@@ -811,6 +816,7 @@ struct kvm_vm_stat {
943 + ulong mmu_unsync;
944 + ulong remote_tlb_flush;
945 + ulong lpages;
946 ++ ulong nx_lpage_splits;
947 + };
948 +
949 + struct kvm_vcpu_stat {
950 +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
951 +index 86166868db8c..8d162e0f2881 100644
952 +--- a/arch/x86/include/asm/msr-index.h
953 ++++ b/arch/x86/include/asm/msr-index.h
954 +@@ -77,6 +77,18 @@
955 + * Microarchitectural Data
956 + * Sampling (MDS) vulnerabilities.
957 + */
958 ++#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /*
959 ++ * The processor is not susceptible to a
960 ++ * machine check error due to modifying the
961 ++ * code page size along with either the
962 ++ * physical address or cache type
963 ++ * without TLB invalidation.
964 ++ */
965 ++#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */
966 ++#define ARCH_CAP_TAA_NO BIT(8) /*
967 ++ * Not susceptible to
968 ++ * TSX Async Abort (TAA) vulnerabilities.
969 ++ */
970 +
971 + #define MSR_IA32_FLUSH_CMD 0x0000010b
972 + #define L1D_FLUSH BIT(0) /*
973 +@@ -87,6 +99,10 @@
974 + #define MSR_IA32_BBL_CR_CTL 0x00000119
975 + #define MSR_IA32_BBL_CR_CTL3 0x0000011e
976 +
977 ++#define MSR_IA32_TSX_CTRL 0x00000122
978 ++#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */
979 ++#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */
980 ++
981 + #define MSR_IA32_SYSENTER_CS 0x00000174
982 + #define MSR_IA32_SYSENTER_ESP 0x00000175
983 + #define MSR_IA32_SYSENTER_EIP 0x00000176
984 +diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
985 +index 10a48505abb5..8d56d701b5f7 100644
986 +--- a/arch/x86/include/asm/nospec-branch.h
987 ++++ b/arch/x86/include/asm/nospec-branch.h
988 +@@ -314,7 +314,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
989 + #include <asm/segment.h>
990 +
991 + /**
992 +- * mds_clear_cpu_buffers - Mitigation for MDS vulnerability
993 ++ * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
994 + *
995 + * This uses the otherwise unused and obsolete VERW instruction in
996 + * combination with microcode which triggers a CPU buffer flush when the
997 +@@ -337,7 +337,7 @@ static inline void mds_clear_cpu_buffers(void)
998 + }
999 +
1000 + /**
1001 +- * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability
1002 ++ * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
1003 + *
1004 + * Clear CPU buffers if the corresponding static key is enabled
1005 + */
1006 +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
1007 +index 155e49fc7010..92703fa09c19 100644
1008 +--- a/arch/x86/include/asm/processor.h
1009 ++++ b/arch/x86/include/asm/processor.h
1010 +@@ -880,4 +880,11 @@ enum mds_mitigations {
1011 + MDS_MITIGATION_VMWERV,
1012 + };
1013 +
1014 ++enum taa_mitigations {
1015 ++ TAA_MITIGATION_OFF,
1016 ++ TAA_MITIGATION_UCODE_NEEDED,
1017 ++ TAA_MITIGATION_VERW,
1018 ++ TAA_MITIGATION_TSX_DISABLED,
1019 ++};
1020 ++
1021 + #endif /* _ASM_X86_PROCESSOR_H */
1022 +diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
1023 +index 33b63670bf09..f6e386fe510c 100644
1024 +--- a/arch/x86/kernel/cpu/Makefile
1025 ++++ b/arch/x86/kernel/cpu/Makefile
1026 +@@ -25,7 +25,7 @@ obj-y += bugs.o
1027 + obj-$(CONFIG_PROC_FS) += proc.o
1028 + obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
1029 +
1030 +-obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
1031 ++obj-$(CONFIG_CPU_SUP_INTEL) += intel.o tsx.o
1032 + obj-$(CONFIG_CPU_SUP_AMD) += amd.o
1033 + obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
1034 + obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
1035 +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
1036 +index 2a42fef275ad..827fc38df97a 100644
1037 +--- a/arch/x86/kernel/cpu/bugs.c
1038 ++++ b/arch/x86/kernel/cpu/bugs.c
1039 +@@ -31,11 +31,14 @@
1040 + #include <asm/intel-family.h>
1041 + #include <asm/e820.h>
1042 +
1043 ++#include "cpu.h"
1044 ++
1045 + static void __init spectre_v1_select_mitigation(void);
1046 + static void __init spectre_v2_select_mitigation(void);
1047 + static void __init ssb_select_mitigation(void);
1048 + static void __init l1tf_select_mitigation(void);
1049 + static void __init mds_select_mitigation(void);
1050 ++static void __init taa_select_mitigation(void);
1051 +
1052 + /* The base value of the SPEC_CTRL MSR that always has to be preserved. */
1053 + u64 x86_spec_ctrl_base;
1054 +@@ -102,6 +105,7 @@ void __init check_bugs(void)
1055 + ssb_select_mitigation();
1056 + l1tf_select_mitigation();
1057 + mds_select_mitigation();
1058 ++ taa_select_mitigation();
1059 +
1060 + arch_smt_update();
1061 +
1062 +@@ -265,6 +269,100 @@ static int __init mds_cmdline(char *str)
1063 + }
1064 + early_param("mds", mds_cmdline);
1065 +
1066 ++#undef pr_fmt
1067 ++#define pr_fmt(fmt) "TAA: " fmt
1068 ++
1069 ++/* Default mitigation for TAA-affected CPUs */
1070 ++static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
1071 ++static bool taa_nosmt __ro_after_init;
1072 ++
1073 ++static const char * const taa_strings[] = {
1074 ++ [TAA_MITIGATION_OFF] = "Vulnerable",
1075 ++ [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
1076 ++ [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
1077 ++ [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
1078 ++};
1079 ++
1080 ++static void __init taa_select_mitigation(void)
1081 ++{
1082 ++ u64 ia32_cap;
1083 ++
1084 ++ if (!boot_cpu_has_bug(X86_BUG_TAA)) {
1085 ++ taa_mitigation = TAA_MITIGATION_OFF;
1086 ++ return;
1087 ++ }
1088 ++
1089 ++ /* TSX previously disabled by tsx=off */
1090 ++ if (!boot_cpu_has(X86_FEATURE_RTM)) {
1091 ++ taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
1092 ++ goto out;
1093 ++ }
1094 ++
1095 ++ if (cpu_mitigations_off()) {
1096 ++ taa_mitigation = TAA_MITIGATION_OFF;
1097 ++ return;
1098 ++ }
1099 ++
1100 ++ /* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */
1101 ++ if (taa_mitigation == TAA_MITIGATION_OFF)
1102 ++ goto out;
1103 ++
1104 ++ if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
1105 ++ taa_mitigation = TAA_MITIGATION_VERW;
1106 ++ else
1107 ++ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
1108 ++
1109 ++ /*
1110 ++ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
1111 ++ * A microcode update fixes this behavior to clear CPU buffers. It also
1112 ++ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
1113 ++ * ARCH_CAP_TSX_CTRL_MSR bit.
1114 ++ *
1115 ++ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
1116 ++ * update is required.
1117 ++ */
1118 ++ ia32_cap = x86_read_arch_cap_msr();
1119 ++ if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
1120 ++ !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
1121 ++ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
1122 ++
1123 ++ /*
1124 ++ * TSX is enabled, select alternate mitigation for TAA which is
1125 ++ * the same as MDS. Enable MDS static branch to clear CPU buffers.
1126 ++ *
1127 ++ * For guests that can't determine whether the correct microcode is
1128 ++ * present on host, enable the mitigation for UCODE_NEEDED as well.
1129 ++ */
1130 ++ static_branch_enable(&mds_user_clear);
1131 ++
1132 ++ if (taa_nosmt || cpu_mitigations_auto_nosmt())
1133 ++ cpu_smt_disable(false);
1134 ++
1135 ++out:
1136 ++ pr_info("%s\n", taa_strings[taa_mitigation]);
1137 ++}
1138 ++
1139 ++static int __init tsx_async_abort_parse_cmdline(char *str)
1140 ++{
1141 ++ if (!boot_cpu_has_bug(X86_BUG_TAA))
1142 ++ return 0;
1143 ++
1144 ++ if (!str)
1145 ++ return -EINVAL;
1146 ++
1147 ++ if (!strcmp(str, "off")) {
1148 ++ taa_mitigation = TAA_MITIGATION_OFF;
1149 ++ } else if (!strcmp(str, "full")) {
1150 ++ taa_mitigation = TAA_MITIGATION_VERW;
1151 ++ } else if (!strcmp(str, "full,nosmt")) {
1152 ++ taa_mitigation = TAA_MITIGATION_VERW;
1153 ++ taa_nosmt = true;
1154 ++ }
1155 ++
1156 ++ return 0;
1157 ++}
1158 ++early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
1159 ++
1160 + #undef pr_fmt
1161 + #define pr_fmt(fmt) "Spectre V1 : " fmt
1162 +
1163 +@@ -780,13 +878,10 @@ static void update_mds_branch_idle(void)
1164 + }
1165 +
1166 + #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
1167 ++#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
1168 +
1169 + void arch_smt_update(void)
1170 + {
1171 +- /* Enhanced IBRS implies STIBP. No update required. */
1172 +- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
1173 +- return;
1174 +-
1175 + mutex_lock(&spec_ctrl_mutex);
1176 +
1177 + switch (spectre_v2_user) {
1178 +@@ -812,6 +907,17 @@ void arch_smt_update(void)
1179 + break;
1180 + }
1181 +
1182 ++ switch (taa_mitigation) {
1183 ++ case TAA_MITIGATION_VERW:
1184 ++ case TAA_MITIGATION_UCODE_NEEDED:
1185 ++ if (sched_smt_active())
1186 ++ pr_warn_once(TAA_MSG_SMT);
1187 ++ break;
1188 ++ case TAA_MITIGATION_TSX_DISABLED:
1189 ++ case TAA_MITIGATION_OFF:
1190 ++ break;
1191 ++ }
1192 ++
1193 + mutex_unlock(&spec_ctrl_mutex);
1194 + }
1195 +
1196 +@@ -1127,6 +1233,9 @@ void x86_spec_ctrl_setup_ap(void)
1197 + x86_amd_ssb_disable();
1198 + }
1199 +
1200 ++bool itlb_multihit_kvm_mitigation;
1201 ++EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
1202 ++
1203 + #undef pr_fmt
1204 + #define pr_fmt(fmt) "L1TF: " fmt
1205 +
1206 +@@ -1282,11 +1391,24 @@ static ssize_t l1tf_show_state(char *buf)
1207 + l1tf_vmx_states[l1tf_vmx_mitigation],
1208 + sched_smt_active() ? "vulnerable" : "disabled");
1209 + }
1210 ++
1211 ++static ssize_t itlb_multihit_show_state(char *buf)
1212 ++{
1213 ++ if (itlb_multihit_kvm_mitigation)
1214 ++ return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
1215 ++ else
1216 ++ return sprintf(buf, "KVM: Vulnerable\n");
1217 ++}
1218 + #else
1219 + static ssize_t l1tf_show_state(char *buf)
1220 + {
1221 + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
1222 + }
1223 ++
1224 ++static ssize_t itlb_multihit_show_state(char *buf)
1225 ++{
1226 ++ return sprintf(buf, "Processor vulnerable\n");
1227 ++}
1228 + #endif
1229 +
1230 + static ssize_t mds_show_state(char *buf)
1231 +@@ -1308,6 +1430,21 @@ static ssize_t mds_show_state(char *buf)
1232 + sched_smt_active() ? "vulnerable" : "disabled");
1233 + }
1234 +
1235 ++static ssize_t tsx_async_abort_show_state(char *buf)
1236 ++{
1237 ++ if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
1238 ++ (taa_mitigation == TAA_MITIGATION_OFF))
1239 ++ return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
1240 ++
1241 ++ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
1242 ++ return sprintf(buf, "%s; SMT Host state unknown\n",
1243 ++ taa_strings[taa_mitigation]);
1244 ++ }
1245 ++
1246 ++ return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
1247 ++ sched_smt_active() ? "vulnerable" : "disabled");
1248 ++}
1249 ++
1250 + static char *stibp_state(void)
1251 + {
1252 + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
1253 +@@ -1373,6 +1510,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
1254 + case X86_BUG_MDS:
1255 + return mds_show_state(buf);
1256 +
1257 ++ case X86_BUG_TAA:
1258 ++ return tsx_async_abort_show_state(buf);
1259 ++
1260 ++ case X86_BUG_ITLB_MULTIHIT:
1261 ++ return itlb_multihit_show_state(buf);
1262 ++
1263 + default:
1264 + break;
1265 + }
1266 +@@ -1409,4 +1552,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu
1267 + {
1268 + return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
1269 + }
1270 ++
1271 ++ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
1272 ++{
1273 ++ return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
1274 ++}
1275 ++
1276 ++ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
1277 ++{
1278 ++ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
1279 ++}
1280 + #endif
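For readers following taa_select_mitigation() above, here is a simplified standalone sketch (not part of the patch) of the same decision tree; the boolean inputs stand in for the CPU feature and MSR state the kernel probes, and the ordering matches the function above.

  /* Standalone sketch of the TAA mitigation selection logic shown above. */
  #include <stdbool.h>
  #include <stdio.h>

  enum taa_mitigations { TAA_OFF, TAA_UCODE_NEEDED, TAA_VERW, TAA_TSX_DISABLED };

  static enum taa_mitigations pick_taa_mitigation(bool bug_taa, bool has_rtm,
                                                  bool md_clear, bool mds_no,
                                                  bool tsx_ctrl_msr, bool cmdline_off)
  {
      if (!bug_taa)
          return TAA_OFF;
      if (!has_rtm)                       /* TSX already disabled, e.g. by tsx=off */
          return TAA_TSX_DISABLED;
      if (cmdline_off)                    /* tsx_async_abort=off or mitigations=off */
          return TAA_OFF;
      if (!md_clear || (mds_no && !tsx_ctrl_msr))
          return TAA_UCODE_NEEDED;        /* VERW not guaranteed to clear buffers */
      return TAA_VERW;                    /* clear CPU buffers via VERW */
  }

  int main(void)
  {
      /* Example: affected CPU, TSX on, MD_CLEAR set, MDS_NO=1, TSX_CTRL=1, no cmdline override. */
      printf("selected mode: %d\n",
             pick_taa_mitigation(true, true, true, true, true, false));
      return 0;
  }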
1281 +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1282 +index 12fa16051871..477df9782fdf 100644
1283 +--- a/arch/x86/kernel/cpu/common.c
1284 ++++ b/arch/x86/kernel/cpu/common.c
1285 +@@ -891,13 +891,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
1286 + c->x86_cache_bits = c->x86_phys_bits;
1287 + }
1288 +
1289 +-#define NO_SPECULATION BIT(0)
1290 +-#define NO_MELTDOWN BIT(1)
1291 +-#define NO_SSB BIT(2)
1292 +-#define NO_L1TF BIT(3)
1293 +-#define NO_MDS BIT(4)
1294 +-#define MSBDS_ONLY BIT(5)
1295 +-#define NO_SWAPGS BIT(6)
1296 ++#define NO_SPECULATION BIT(0)
1297 ++#define NO_MELTDOWN BIT(1)
1298 ++#define NO_SSB BIT(2)
1299 ++#define NO_L1TF BIT(3)
1300 ++#define NO_MDS BIT(4)
1301 ++#define MSBDS_ONLY BIT(5)
1302 ++#define NO_SWAPGS BIT(6)
1303 ++#define NO_ITLB_MULTIHIT BIT(7)
1304 +
1305 + #define VULNWL(_vendor, _family, _model, _whitelist) \
1306 + { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
1307 +@@ -915,26 +916,26 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
1308 + VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
1309 +
1310 + /* Intel Family 6 */
1311 +- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION),
1312 +- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION),
1313 +- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION),
1314 +- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION),
1315 +- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION),
1316 +-
1317 +- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
1318 +- VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
1319 +- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
1320 +- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
1321 +- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
1322 +- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
1323 ++ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
1324 ++ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
1325 ++ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
1326 ++ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
1327 ++ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
1328 ++
1329 ++ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1330 ++ VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1331 ++ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1332 ++ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1333 ++ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1334 ++ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1335 +
1336 + VULNWL_INTEL(CORE_YONAH, NO_SSB),
1337 +
1338 +- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
1339 ++ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
1340 +
1341 +- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS),
1342 +- VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS),
1343 +- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS),
1344 ++ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
1345 ++ VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
1346 ++ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
1347 +
1348 + /*
1349 + * Technically, swapgs isn't serializing on AMD (despite it previously
1350 +@@ -945,13 +946,13 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
1351 + */
1352 +
1353 + /* AMD Family 0xf - 0x12 */
1354 +- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
1355 +- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
1356 +- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
1357 +- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
1358 ++ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
1359 ++ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
1360 ++ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
1361 ++ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
1362 +
1363 + /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
1364 +- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS),
1365 ++ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
1366 + {}
1367 + };
1368 +
1369 +@@ -962,19 +963,30 @@ static bool __init cpu_matches(unsigned long which)
1370 + return m && !!(m->driver_data & which);
1371 + }
1372 +
1373 +-static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
1374 ++u64 x86_read_arch_cap_msr(void)
1375 + {
1376 + u64 ia32_cap = 0;
1377 +
1378 ++ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
1379 ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
1380 ++
1381 ++ return ia32_cap;
1382 ++}
1383 ++
1384 ++static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
1385 ++{
1386 ++ u64 ia32_cap = x86_read_arch_cap_msr();
1387 ++
1388 ++ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
1389 ++ if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
1390 ++ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
1391 ++
1392 + if (cpu_matches(NO_SPECULATION))
1393 + return;
1394 +
1395 + setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
1396 + setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
1397 +
1398 +- if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
1399 +- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
1400 +-
1401 + if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
1402 + !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
1403 + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
1404 +@@ -991,6 +1003,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
1405 + if (!cpu_matches(NO_SWAPGS))
1406 + setup_force_cpu_bug(X86_BUG_SWAPGS);
1407 +
1408 ++ /*
1409 ++ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when:
1410 ++ * - TSX is supported or
1411 ++ * - TSX_CTRL is present
1412 ++ *
1413 ++ * TSX_CTRL check is needed for cases when TSX could be disabled before
1414 ++ * the kernel boot e.g. kexec.
1415 ++ * TSX_CTRL check alone is not sufficient for cases when the microcode
1416 ++ * update is not present or running as guest that don't get TSX_CTRL.
1417 ++ * update is not present or when running as a guest that doesn't get TSX_CTRL.
1418 ++ if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
1419 ++ (cpu_has(c, X86_FEATURE_RTM) ||
1420 ++ (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
1421 ++ setup_force_cpu_bug(X86_BUG_TAA);
1422 ++
1423 + if (cpu_matches(NO_MELTDOWN))
1424 + return;
1425 +
1426 +@@ -1409,6 +1436,8 @@ void __init identify_boot_cpu(void)
1427 + enable_sep_cpu();
1428 + #endif
1429 + cpu_detect_tlb(&boot_cpu_data);
1430 ++
1431 ++ tsx_init();
1432 + }
1433 +
1434 + void identify_secondary_cpu(struct cpuinfo_x86 *c)
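
The hunk above derives two new bug bits from MSR_IA32_ARCH_CAPABILITIES: ITLB_MULTIHIT when the CPU is not whitelisted and PSCHANGE_MC_NO is clear, and TAA when TAA_NO is clear and either RTM or the TSX_CTRL MSR is present. A minimal user-space sketch of the TAA decision, separate from the patch itself; the CAP_TAA_NO and CAP_TSX_CTRL_MSR bit positions are assumed stand-ins for the kernel's ARCH_CAP_* definitions:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Assumed stand-ins for two MSR_IA32_ARCH_CAPABILITIES bits. */
  #define CAP_TSX_CTRL_MSR (1ULL << 7)
  #define CAP_TAA_NO       (1ULL << 8)

  /* Same shape as the TAA check added to cpu_set_bug_bits() above. */
  static bool cpu_is_taa_affected(uint64_t ia32_cap, bool has_rtm)
  {
      if (ia32_cap & CAP_TAA_NO)
          return false;                 /* hardware reports TAA as handled */
      return has_rtm || (ia32_cap & CAP_TSX_CTRL_MSR);
  }

  int main(void)
  {
      printf("%d\n", cpu_is_taa_affected(0, true));          /* 1: affected   */
      printf("%d\n", cpu_is_taa_affected(CAP_TAA_NO, true)); /* 0: TAA_NO set */
      return 0;
  }

The ITLB_MULTIHIT check follows the same pattern: consult the whitelist first, then the capability bit.
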
1435 +diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
1436 +index 2275900d4d1b..4350f50b5deb 100644
1437 +--- a/arch/x86/kernel/cpu/cpu.h
1438 ++++ b/arch/x86/kernel/cpu/cpu.h
1439 +@@ -44,6 +44,22 @@ struct _tlb_table {
1440 + extern const struct cpu_dev *const __x86_cpu_dev_start[],
1441 + *const __x86_cpu_dev_end[];
1442 +
1443 ++#ifdef CONFIG_CPU_SUP_INTEL
1444 ++enum tsx_ctrl_states {
1445 ++ TSX_CTRL_ENABLE,
1446 ++ TSX_CTRL_DISABLE,
1447 ++ TSX_CTRL_NOT_SUPPORTED,
1448 ++};
1449 ++
1450 ++extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
1451 ++
1452 ++extern void __init tsx_init(void);
1453 ++extern void tsx_enable(void);
1454 ++extern void tsx_disable(void);
1455 ++#else
1456 ++static inline void tsx_init(void) { }
1457 ++#endif /* CONFIG_CPU_SUP_INTEL */
1458 ++
1459 + extern void get_cpu_cap(struct cpuinfo_x86 *c);
1460 + extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
1461 + extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
1462 +@@ -51,4 +67,6 @@ extern int detect_ht_early(struct cpuinfo_x86 *c);
1463 +
1464 + extern void x86_spec_ctrl_setup_ap(void);
1465 +
1466 ++extern u64 x86_read_arch_cap_msr(void);
1467 ++
1468 + #endif /* ARCH_X86_CPU_H */
1469 +diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
1470 +index 860f2fd9f540..476a9d5c2f35 100644
1471 +--- a/arch/x86/kernel/cpu/intel.c
1472 ++++ b/arch/x86/kernel/cpu/intel.c
1473 +@@ -642,6 +642,11 @@ static void init_intel(struct cpuinfo_x86 *c)
1474 + detect_vmx_virtcap(c);
1475 +
1476 + init_intel_energy_perf(c);
1477 ++
1478 ++ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
1479 ++ tsx_enable();
1480 ++ if (tsx_ctrl_state == TSX_CTRL_DISABLE)
1481 ++ tsx_disable();
1482 + }
1483 +
1484 + #ifdef CONFIG_X86_32
1485 +diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
1486 +new file mode 100644
1487 +index 000000000000..3e20d322bc98
1488 +--- /dev/null
1489 ++++ b/arch/x86/kernel/cpu/tsx.c
1490 +@@ -0,0 +1,140 @@
1491 ++// SPDX-License-Identifier: GPL-2.0
1492 ++/*
1493 ++ * Intel Transactional Synchronization Extensions (TSX) control.
1494 ++ *
1495 ++ * Copyright (C) 2019 Intel Corporation
1496 ++ *
1497 ++ * Author:
1498 ++ * Pawan Gupta <pawan.kumar.gupta@×××××××××××.com>
1499 ++ */
1500 ++
1501 ++#include <linux/cpufeature.h>
1502 ++
1503 ++#include <asm/cmdline.h>
1504 ++
1505 ++#include "cpu.h"
1506 ++
1507 ++enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
1508 ++
1509 ++void tsx_disable(void)
1510 ++{
1511 ++ u64 tsx;
1512 ++
1513 ++ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
1514 ++
1515 ++ /* Force all transactions to immediately abort */
1516 ++ tsx |= TSX_CTRL_RTM_DISABLE;
1517 ++
1518 ++ /*
1519 ++ * Ensure TSX support is not enumerated in CPUID.
1520 ++ * This is visible to userspace and will ensure they
1521 ++ * do not waste resources trying TSX transactions that
1522 ++ * will always abort.
1523 ++ */
1524 ++ tsx |= TSX_CTRL_CPUID_CLEAR;
1525 ++
1526 ++ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
1527 ++}
1528 ++
1529 ++void tsx_enable(void)
1530 ++{
1531 ++ u64 tsx;
1532 ++
1533 ++ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
1534 ++
1535 ++ /* Enable the RTM feature in the cpu */
1536 ++ tsx &= ~TSX_CTRL_RTM_DISABLE;
1537 ++
1538 ++ /*
1539 ++ * Ensure TSX support is enumerated in CPUID.
1540 ++ * This is visible to userspace and will ensure they
1541 ++ * can enumerate and use the TSX feature.
1542 ++ */
1543 ++ tsx &= ~TSX_CTRL_CPUID_CLEAR;
1544 ++
1545 ++ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
1546 ++}
1547 ++
1548 ++static bool __init tsx_ctrl_is_supported(void)
1549 ++{
1550 ++ u64 ia32_cap = x86_read_arch_cap_msr();
1551 ++
1552 ++ /*
1553 ++ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
1554 ++ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
1555 ++ *
1556 ++ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
1557 ++ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
1558 ++ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
1559 ++ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
1560 ++ * tsx= cmdline requests will do nothing on CPUs without
1561 ++ * MSR_IA32_TSX_CTRL support.
1562 ++ */
1563 ++ return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
1564 ++}
1565 ++
1566 ++static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
1567 ++{
1568 ++ if (boot_cpu_has_bug(X86_BUG_TAA))
1569 ++ return TSX_CTRL_DISABLE;
1570 ++
1571 ++ return TSX_CTRL_ENABLE;
1572 ++}
1573 ++
1574 ++void __init tsx_init(void)
1575 ++{
1576 ++ char arg[5] = {};
1577 ++ int ret;
1578 ++
1579 ++ if (!tsx_ctrl_is_supported())
1580 ++ return;
1581 ++
1582 ++ ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
1583 ++ if (ret >= 0) {
1584 ++ if (!strcmp(arg, "on")) {
1585 ++ tsx_ctrl_state = TSX_CTRL_ENABLE;
1586 ++ } else if (!strcmp(arg, "off")) {
1587 ++ tsx_ctrl_state = TSX_CTRL_DISABLE;
1588 ++ } else if (!strcmp(arg, "auto")) {
1589 ++ tsx_ctrl_state = x86_get_tsx_auto_mode();
1590 ++ } else {
1591 ++ tsx_ctrl_state = TSX_CTRL_DISABLE;
1592 ++ pr_err("tsx: invalid option, defaulting to off\n");
1593 ++ }
1594 ++ } else {
1595 ++ /* tsx= not provided */
1596 ++ if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO))
1597 ++ tsx_ctrl_state = x86_get_tsx_auto_mode();
1598 ++ else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF))
1599 ++ tsx_ctrl_state = TSX_CTRL_DISABLE;
1600 ++ else
1601 ++ tsx_ctrl_state = TSX_CTRL_ENABLE;
1602 ++ }
1603 ++
1604 ++ if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
1605 ++ tsx_disable();
1606 ++
1607 ++ /*
1608 ++ * tsx_disable() will change the state of the
1609 ++ * RTM CPUID bit. Clear it here since it is now
1610 ++ * expected to be not set.
1611 ++ */
1612 ++ setup_clear_cpu_cap(X86_FEATURE_RTM);
1613 ++ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
1614 ++
1615 ++ /*
1616 ++ * HW defaults TSX to be enabled at bootup.
1617 ++ * We may still need the TSX enable support
1618 ++ * during init for special cases like
1619 ++ * kexec after TSX is disabled.
1620 ++ */
1621 ++ tsx_enable();
1622 ++
1623 ++ /*
1624 ++ * tsx_enable() will change the state of the
1625 ++ * RTM CPUID bit. Force it here since it is now
1626 ++ * expected to be set.
1627 ++ */
1628 ++ setup_force_cpu_cap(X86_FEATURE_RTM);
1629 ++ }
1630 ++}
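
The new tsx.c above only ever flips two bits of MSR_IA32_TSX_CTRL. A stand-alone sketch of that read-modify-write, separate from the patch; the RTM_DISABLE and CPUID_CLEAR bit positions are assumptions made for illustration:

  #include <stdint.h>
  #include <stdio.h>

  /* Assumed layout of the two MSR_IA32_TSX_CTRL control bits. */
  #define RTM_DISABLE (1ULL << 0)   /* force all RTM transactions to abort */
  #define CPUID_CLEAR (1ULL << 1)   /* hide TSX from CPUID enumeration     */

  static uint64_t tsx_ctrl_apply(uint64_t msr, int enable)
  {
      if (enable)
          return msr & ~(RTM_DISABLE | CPUID_CLEAR);
      return msr | RTM_DISABLE | CPUID_CLEAR;
  }

  int main(void)
  {
      uint64_t msr = 0;

      msr = tsx_ctrl_apply(msr, 0);
      printf("disabled: %#llx\n", (unsigned long long)msr); /* 0x3 */
      msr = tsx_ctrl_apply(msr, 1);
      printf("enabled:  %#llx\n", (unsigned long long)msr); /* 0   */
      return 0;
  }

The tsx=on|off|auto command line handling and the Kconfig defaults parsed in tsx_init() only select which of the two paths runs at boot.
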
1631 +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
1632 +index fc8236fd2495..18c5b4920e92 100644
1633 +--- a/arch/x86/kvm/cpuid.c
1634 ++++ b/arch/x86/kvm/cpuid.c
1635 +@@ -466,8 +466,16 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1636 + /* PKU is not yet implemented for shadow paging. */
1637 + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
1638 + entry->ecx &= ~F(PKU);
1639 ++
1640 + entry->edx &= kvm_cpuid_7_0_edx_x86_features;
1641 + cpuid_mask(&entry->edx, CPUID_7_EDX);
1642 ++ if (boot_cpu_has(X86_FEATURE_IBPB) &&
1643 ++ boot_cpu_has(X86_FEATURE_IBRS))
1644 ++ entry->edx |= F(SPEC_CTRL);
1645 ++ if (boot_cpu_has(X86_FEATURE_STIBP))
1646 ++ entry->edx |= F(INTEL_STIBP);
1647 ++ if (boot_cpu_has(X86_FEATURE_SSBD))
1648 ++ entry->edx |= F(SPEC_CTRL_SSBD);
1649 + /*
1650 + * We emulate ARCH_CAPABILITIES in software even
1651 + * if the host doesn't support it.
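
The cpuid.c change above lets KVM advertise the speculation-control features to guests whenever the host CPU actually has them. A compact sketch of that host-flag-to-guest-CPUID mapping, separate from the patch; the CPUID.(EAX=7,ECX=0):EDX bit positions used here are assumptions for illustration:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Assumed CPUID.(EAX=7,ECX=0):EDX bit positions. */
  #define F_SPEC_CTRL      (1u << 26)
  #define F_INTEL_STIBP    (1u << 27)
  #define F_SPEC_CTRL_SSBD (1u << 31)

  struct host_caps { bool ibpb, ibrs, stibp, ssbd; };

  /* Mirrors the conditional ORs added to __do_cpuid_ent() above. */
  static uint32_t guest_cpuid_7_edx(uint32_t edx, const struct host_caps *h)
  {
      if (h->ibpb && h->ibrs)
          edx |= F_SPEC_CTRL;
      if (h->stibp)
          edx |= F_INTEL_STIBP;
      if (h->ssbd)
          edx |= F_SPEC_CTRL_SSBD;
      return edx;
  }

  int main(void)
  {
      struct host_caps h = { .ibpb = true, .ibrs = true, .ssbd = true };
      printf("%#x\n", guest_cpuid_7_edx(0, &h)); /* SPEC_CTRL | SPEC_CTRL_SSBD */
      return 0;
  }
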
1652 +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
1653 +index 676edfc19a95..f0f180158c26 100644
1654 +--- a/arch/x86/kvm/mmu.c
1655 ++++ b/arch/x86/kvm/mmu.c
1656 +@@ -37,6 +37,7 @@
1657 + #include <linux/srcu.h>
1658 + #include <linux/slab.h>
1659 + #include <linux/uaccess.h>
1660 ++#include <linux/kthread.h>
1661 +
1662 + #include <asm/page.h>
1663 + #include <asm/cmpxchg.h>
1664 +@@ -44,6 +45,30 @@
1665 + #include <asm/vmx.h>
1666 + #include <asm/kvm_page_track.h>
1667 +
1668 ++extern bool itlb_multihit_kvm_mitigation;
1669 ++
1670 ++static int __read_mostly nx_huge_pages = -1;
1671 ++static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
1672 ++
1673 ++static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
1674 ++static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
1675 ++
1676 ++static struct kernel_param_ops nx_huge_pages_ops = {
1677 ++ .set = set_nx_huge_pages,
1678 ++ .get = param_get_bool,
1679 ++};
1680 ++
1681 ++static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
1682 ++ .set = set_nx_huge_pages_recovery_ratio,
1683 ++ .get = param_get_uint,
1684 ++};
1685 ++
1686 ++module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
1687 ++__MODULE_PARM_TYPE(nx_huge_pages, "bool");
1688 ++module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
1689 ++ &nx_huge_pages_recovery_ratio, 0644);
1690 ++__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
1691 ++
1692 + /*
1693 + * When setting this variable to true it enables Two-Dimensional-Paging
1694 + * where the hardware walks 2 page tables:
1695 +@@ -131,9 +156,6 @@ module_param(dbg, bool, 0644);
1696 +
1697 + #include <trace/events/kvm.h>
1698 +
1699 +-#define CREATE_TRACE_POINTS
1700 +-#include "mmutrace.h"
1701 +-
1702 + #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
1703 + #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
1704 +
1705 +@@ -142,6 +164,20 @@ module_param(dbg, bool, 0644);
1706 + /* make pte_list_desc fit well in cache line */
1707 + #define PTE_LIST_EXT 3
1708 +
1709 ++/*
1710 ++ * Return values of handle_mmio_page_fault and mmu.page_fault:
1711 ++ * RET_PF_RETRY: let CPU fault again on the address.
1712 ++ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
1713 ++ *
1714 ++ * For handle_mmio_page_fault only:
1715 ++ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
1716 ++ */
1717 ++enum {
1718 ++ RET_PF_RETRY = 0,
1719 ++ RET_PF_EMULATE = 1,
1720 ++ RET_PF_INVALID = 2,
1721 ++};
1722 ++
1723 + struct pte_list_desc {
1724 + u64 *sptes[PTE_LIST_EXT];
1725 + struct pte_list_desc *more;
1726 +@@ -179,14 +215,23 @@ static u64 __read_mostly shadow_mmio_mask;
1727 + static u64 __read_mostly shadow_present_mask;
1728 +
1729 + static void mmu_spte_set(u64 *sptep, u64 spte);
1730 ++static bool is_executable_pte(u64 spte);
1731 + static void mmu_free_roots(struct kvm_vcpu *vcpu);
1732 +
1733 ++#define CREATE_TRACE_POINTS
1734 ++#include "mmutrace.h"
1735 ++
1736 + void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
1737 + {
1738 + shadow_mmio_mask = mmio_mask;
1739 + }
1740 + EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
1741 +
1742 ++static bool is_nx_huge_page_enabled(void)
1743 ++{
1744 ++ return READ_ONCE(nx_huge_pages);
1745 ++}
1746 ++
1747 + /*
1748 + * the low bit of the generation number is always presumed to be zero.
1749 + * This disables mmio caching during memslot updates. The concept is
1750 +@@ -324,6 +369,11 @@ static int is_last_spte(u64 pte, int level)
1751 + return 0;
1752 + }
1753 +
1754 ++static bool is_executable_pte(u64 spte)
1755 ++{
1756 ++ return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
1757 ++}
1758 ++
1759 + static kvm_pfn_t spte_to_pfn(u64 pte)
1760 + {
1761 + return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1762 +@@ -767,10 +817,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
1763 +
1764 + static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
1765 + {
1766 +- if (sp->role.direct)
1767 +- BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
1768 +- else
1769 ++ if (!sp->role.direct) {
1770 + sp->gfns[index] = gfn;
1771 ++ return;
1772 ++ }
1773 ++
1774 ++ if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
1775 ++ pr_err_ratelimited("gfn mismatch under direct page %llx "
1776 ++ "(expected %llx, got %llx)\n",
1777 ++ sp->gfn,
1778 ++ kvm_mmu_page_get_gfn(sp, index), gfn);
1779 + }
1780 +
1781 + /*
1782 +@@ -829,6 +885,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1783 + kvm_mmu_gfn_disallow_lpage(slot, gfn);
1784 + }
1785 +
1786 ++static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1787 ++{
1788 ++ if (sp->lpage_disallowed)
1789 ++ return;
1790 ++
1791 ++ ++kvm->stat.nx_lpage_splits;
1792 ++ list_add_tail(&sp->lpage_disallowed_link,
1793 ++ &kvm->arch.lpage_disallowed_mmu_pages);
1794 ++ sp->lpage_disallowed = true;
1795 ++}
1796 ++
1797 + static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1798 + {
1799 + struct kvm_memslots *slots;
1800 +@@ -846,6 +913,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1801 + kvm_mmu_gfn_allow_lpage(slot, gfn);
1802 + }
1803 +
1804 ++static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1805 ++{
1806 ++ --kvm->stat.nx_lpage_splits;
1807 ++ sp->lpage_disallowed = false;
1808 ++ list_del(&sp->lpage_disallowed_link);
1809 ++}
1810 ++
1811 + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
1812 + struct kvm_memory_slot *slot)
1813 + {
1814 +@@ -2382,6 +2456,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1815 + kvm_reload_remote_mmus(kvm);
1816 + }
1817 +
1818 ++ if (sp->lpage_disallowed)
1819 ++ unaccount_huge_nx_page(kvm, sp);
1820 ++
1821 + sp->role.invalid = 1;
1822 + return ret;
1823 + }
1824 +@@ -2533,6 +2610,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1825 + if (!speculative)
1826 + spte |= shadow_accessed_mask;
1827 +
1828 ++ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
1829 ++ is_nx_huge_page_enabled()) {
1830 ++ pte_access &= ~ACC_EXEC_MASK;
1831 ++ }
1832 ++
1833 + if (pte_access & ACC_EXEC_MASK)
1834 + spte |= shadow_x_mask;
1835 + else
1836 +@@ -2598,13 +2680,13 @@ done:
1837 + return ret;
1838 + }
1839 +
1840 +-static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
1841 +- int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
1842 +- bool speculative, bool host_writable)
1843 ++static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
1844 ++ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
1845 ++ bool speculative, bool host_writable)
1846 + {
1847 + int was_rmapped = 0;
1848 + int rmap_count;
1849 +- bool emulate = false;
1850 ++ int ret = RET_PF_RETRY;
1851 +
1852 + pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
1853 + *sptep, write_fault, gfn);
1854 +@@ -2634,18 +2716,15 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
1855 + if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
1856 + true, host_writable)) {
1857 + if (write_fault)
1858 +- emulate = true;
1859 ++ ret = RET_PF_EMULATE;
1860 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1861 + }
1862 +
1863 + if (unlikely(is_mmio_spte(*sptep)))
1864 +- emulate = true;
1865 ++ ret = RET_PF_EMULATE;
1866 +
1867 + pgprintk("%s: setting spte %llx\n", __func__, *sptep);
1868 +- pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
1869 +- is_large_pte(*sptep)? "2MB" : "4kB",
1870 +- *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
1871 +- *sptep, sptep);
1872 ++ trace_kvm_mmu_set_spte(level, gfn, sptep);
1873 + if (!was_rmapped && is_large_pte(*sptep))
1874 + ++vcpu->kvm->stat.lpages;
1875 +
1876 +@@ -2657,9 +2736,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
1877 + }
1878 + }
1879 +
1880 +- kvm_release_pfn_clean(pfn);
1881 +-
1882 +- return emulate;
1883 ++ return ret;
1884 + }
1885 +
1886 + static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
1887 +@@ -2693,9 +2770,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
1888 + if (ret <= 0)
1889 + return -1;
1890 +
1891 +- for (i = 0; i < ret; i++, gfn++, start++)
1892 ++ for (i = 0; i < ret; i++, gfn++, start++) {
1893 + mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
1894 + page_to_pfn(pages[i]), true, true);
1895 ++ put_page(pages[i]);
1896 ++ }
1897 +
1898 + return 0;
1899 + }
1900 +@@ -2743,40 +2822,71 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
1901 + __direct_pte_prefetch(vcpu, sp, sptep);
1902 + }
1903 +
1904 +-static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
1905 +- int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
1906 ++static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
1907 ++ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
1908 + {
1909 +- struct kvm_shadow_walk_iterator iterator;
1910 ++ int level = *levelp;
1911 ++ u64 spte = *it.sptep;
1912 ++
1913 ++ if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
1914 ++ is_nx_huge_page_enabled() &&
1915 ++ is_shadow_present_pte(spte) &&
1916 ++ !is_large_pte(spte)) {
1917 ++ /*
1918 ++ * A small SPTE exists for this pfn, but FNAME(fetch)
1919 ++ * and __direct_map would like to create a large PTE
1920 ++ * instead: just force them to go down another level,
1921 ++ * patching the next 9 bits of the address back into
1922 ++ * pfn.
1923 ++ */
1924 ++ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
1925 ++ *pfnp |= gfn & page_mask;
1926 ++ (*levelp)--;
1927 ++ }
1928 ++}
1929 ++
1930 ++static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
1931 ++ int map_writable, int level, kvm_pfn_t pfn,
1932 ++ bool prefault, bool lpage_disallowed)
1933 ++{
1934 ++ struct kvm_shadow_walk_iterator it;
1935 + struct kvm_mmu_page *sp;
1936 +- int emulate = 0;
1937 +- gfn_t pseudo_gfn;
1938 ++ int ret;
1939 ++ gfn_t gfn = gpa >> PAGE_SHIFT;
1940 ++ gfn_t base_gfn = gfn;
1941 +
1942 + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1943 +- return 0;
1944 ++ return RET_PF_RETRY;
1945 +
1946 +- for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
1947 +- if (iterator.level == level) {
1948 +- emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
1949 +- write, level, gfn, pfn, prefault,
1950 +- map_writable);
1951 +- direct_pte_prefetch(vcpu, iterator.sptep);
1952 +- ++vcpu->stat.pf_fixed;
1953 +- break;
1954 +- }
1955 ++ trace_kvm_mmu_spte_requested(gpa, level, pfn);
1956 ++ for_each_shadow_entry(vcpu, gpa, it) {
1957 ++ /*
1958 ++ * We cannot overwrite existing page tables with an NX
1959 ++ * large page, as the leaf could be executable.
1960 ++ */
1961 ++ disallowed_hugepage_adjust(it, gfn, &pfn, &level);
1962 +
1963 +- drop_large_spte(vcpu, iterator.sptep);
1964 +- if (!is_shadow_present_pte(*iterator.sptep)) {
1965 +- u64 base_addr = iterator.addr;
1966 ++ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
1967 ++ if (it.level == level)
1968 ++ break;
1969 +
1970 +- base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
1971 +- pseudo_gfn = base_addr >> PAGE_SHIFT;
1972 +- sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
1973 +- iterator.level - 1, 1, ACC_ALL);
1974 ++ drop_large_spte(vcpu, it.sptep);
1975 ++ if (!is_shadow_present_pte(*it.sptep)) {
1976 ++ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
1977 ++ it.level - 1, true, ACC_ALL);
1978 +
1979 +- link_shadow_page(vcpu, iterator.sptep, sp);
1980 ++ link_shadow_page(vcpu, it.sptep, sp);
1981 ++ if (lpage_disallowed)
1982 ++ account_huge_nx_page(vcpu->kvm, sp);
1983 + }
1984 + }
1985 +- return emulate;
1986 ++
1987 ++ ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
1988 ++ write, level, base_gfn, pfn, prefault,
1989 ++ map_writable);
1990 ++ direct_pte_prefetch(vcpu, it.sptep);
1991 ++ ++vcpu->stat.pf_fixed;
1992 ++ return ret;
1993 + }
1994 +
1995 + static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
1996 +@@ -2798,25 +2908,23 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
1997 + * Do not cache the mmio info caused by writing the readonly gfn
1998 + * into the spte otherwise read access on readonly gfn also can
1999 + * caused mmio page fault and treat it as mmio access.
2000 +- * Return 1 to tell kvm to emulate it.
2001 + */
2002 + if (pfn == KVM_PFN_ERR_RO_FAULT)
2003 +- return 1;
2004 ++ return RET_PF_EMULATE;
2005 +
2006 + if (pfn == KVM_PFN_ERR_HWPOISON) {
2007 + kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
2008 +- return 0;
2009 ++ return RET_PF_RETRY;
2010 + }
2011 +
2012 + return -EFAULT;
2013 + }
2014 +
2015 + static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2016 +- gfn_t *gfnp, kvm_pfn_t *pfnp,
2017 ++ gfn_t gfn, kvm_pfn_t *pfnp,
2018 + int *levelp)
2019 + {
2020 + kvm_pfn_t pfn = *pfnp;
2021 +- gfn_t gfn = *gfnp;
2022 + int level = *levelp;
2023 +
2024 + /*
2025 +@@ -2843,8 +2951,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2026 + mask = KVM_PAGES_PER_HPAGE(level) - 1;
2027 + VM_BUG_ON((gfn & mask) != (pfn & mask));
2028 + if (pfn & mask) {
2029 +- gfn &= ~mask;
2030 +- *gfnp = gfn;
2031 + kvm_release_pfn_clean(pfn);
2032 + pfn &= ~mask;
2033 + kvm_get_pfn(pfn);
2034 +@@ -3012,11 +3118,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2035 + {
2036 + int r;
2037 + int level;
2038 +- bool force_pt_level = false;
2039 ++ bool force_pt_level;
2040 + kvm_pfn_t pfn;
2041 + unsigned long mmu_seq;
2042 + bool map_writable, write = error_code & PFERR_WRITE_MASK;
2043 ++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
2044 ++ is_nx_huge_page_enabled();
2045 +
2046 ++ force_pt_level = lpage_disallowed;
2047 + level = mapping_level(vcpu, gfn, &force_pt_level);
2048 + if (likely(!force_pt_level)) {
2049 + /*
2050 +@@ -3031,32 +3140,30 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2051 + }
2052 +
2053 + if (fast_page_fault(vcpu, v, level, error_code))
2054 +- return 0;
2055 ++ return RET_PF_RETRY;
2056 +
2057 + mmu_seq = vcpu->kvm->mmu_notifier_seq;
2058 + smp_rmb();
2059 +
2060 + if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2061 +- return 0;
2062 ++ return RET_PF_RETRY;
2063 +
2064 + if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
2065 + return r;
2066 +
2067 ++ r = RET_PF_RETRY;
2068 + spin_lock(&vcpu->kvm->mmu_lock);
2069 + if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
2070 + goto out_unlock;
2071 + make_mmu_pages_available(vcpu);
2072 + if (likely(!force_pt_level))
2073 +- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2074 +- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
2075 +- spin_unlock(&vcpu->kvm->mmu_lock);
2076 +-
2077 +- return r;
2078 +-
2079 ++ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
2080 ++ r = __direct_map(vcpu, v, write, map_writable, level, pfn,
2081 ++ prefault, false);
2082 + out_unlock:
2083 + spin_unlock(&vcpu->kvm->mmu_lock);
2084 + kvm_release_pfn_clean(pfn);
2085 +- return 0;
2086 ++ return r;
2087 + }
2088 +
2089 +
2090 +@@ -3383,38 +3490,38 @@ exit:
2091 + return reserved;
2092 + }
2093 +
2094 +-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
2095 ++static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
2096 + {
2097 + u64 spte;
2098 + bool reserved;
2099 +
2100 + if (mmio_info_in_cache(vcpu, addr, direct))
2101 +- return RET_MMIO_PF_EMULATE;
2102 ++ return RET_PF_EMULATE;
2103 +
2104 + reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
2105 + if (WARN_ON(reserved))
2106 +- return RET_MMIO_PF_BUG;
2107 ++ return -EINVAL;
2108 +
2109 + if (is_mmio_spte(spte)) {
2110 + gfn_t gfn = get_mmio_spte_gfn(spte);
2111 + unsigned access = get_mmio_spte_access(spte);
2112 +
2113 + if (!check_mmio_spte(vcpu, spte))
2114 +- return RET_MMIO_PF_INVALID;
2115 ++ return RET_PF_INVALID;
2116 +
2117 + if (direct)
2118 + addr = 0;
2119 +
2120 + trace_handle_mmio_page_fault(addr, gfn, access);
2121 + vcpu_cache_mmio_info(vcpu, addr, gfn, access);
2122 +- return RET_MMIO_PF_EMULATE;
2123 ++ return RET_PF_EMULATE;
2124 + }
2125 +
2126 + /*
2127 + * If the page table is zapped by other cpus, let CPU fault again on
2128 + * the address.
2129 + */
2130 +- return RET_MMIO_PF_RETRY;
2131 ++ return RET_PF_RETRY;
2132 + }
2133 + EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
2134 +
2135 +@@ -3464,7 +3571,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2136 + pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2137 +
2138 + if (page_fault_handle_page_track(vcpu, error_code, gfn))
2139 +- return 1;
2140 ++ return RET_PF_EMULATE;
2141 +
2142 + r = mmu_topup_memory_caches(vcpu);
2143 + if (r)
2144 +@@ -3548,18 +3655,21 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2145 + unsigned long mmu_seq;
2146 + int write = error_code & PFERR_WRITE_MASK;
2147 + bool map_writable;
2148 ++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
2149 ++ is_nx_huge_page_enabled();
2150 +
2151 + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2152 +
2153 + if (page_fault_handle_page_track(vcpu, error_code, gfn))
2154 +- return 1;
2155 ++ return RET_PF_EMULATE;
2156 +
2157 + r = mmu_topup_memory_caches(vcpu);
2158 + if (r)
2159 + return r;
2160 +
2161 +- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
2162 +- PT_DIRECTORY_LEVEL);
2163 ++ force_pt_level =
2164 ++ lpage_disallowed ||
2165 ++ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
2166 + level = mapping_level(vcpu, gfn, &force_pt_level);
2167 + if (likely(!force_pt_level)) {
2168 + if (level > PT_DIRECTORY_LEVEL &&
2169 +@@ -3569,32 +3679,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2170 + }
2171 +
2172 + if (fast_page_fault(vcpu, gpa, level, error_code))
2173 +- return 0;
2174 ++ return RET_PF_RETRY;
2175 +
2176 + mmu_seq = vcpu->kvm->mmu_notifier_seq;
2177 + smp_rmb();
2178 +
2179 + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2180 +- return 0;
2181 ++ return RET_PF_RETRY;
2182 +
2183 + if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
2184 + return r;
2185 +
2186 ++ r = RET_PF_RETRY;
2187 + spin_lock(&vcpu->kvm->mmu_lock);
2188 + if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
2189 + goto out_unlock;
2190 + make_mmu_pages_available(vcpu);
2191 + if (likely(!force_pt_level))
2192 +- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2193 +- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
2194 +- spin_unlock(&vcpu->kvm->mmu_lock);
2195 +-
2196 +- return r;
2197 +-
2198 ++ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
2199 ++ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
2200 ++ prefault, lpage_disallowed);
2201 + out_unlock:
2202 + spin_unlock(&vcpu->kvm->mmu_lock);
2203 + kvm_release_pfn_clean(pfn);
2204 +- return 0;
2205 ++ return r;
2206 + }
2207 +
2208 + static void nonpaging_init_context(struct kvm_vcpu *vcpu,
2209 +@@ -4510,23 +4618,24 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
2210 + enum emulation_result er;
2211 + bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
2212 +
2213 ++ r = RET_PF_INVALID;
2214 + if (unlikely(error_code & PFERR_RSVD_MASK)) {
2215 + r = handle_mmio_page_fault(vcpu, cr2, direct);
2216 +- if (r == RET_MMIO_PF_EMULATE) {
2217 ++ if (r == RET_PF_EMULATE) {
2218 + emulation_type = 0;
2219 + goto emulate;
2220 + }
2221 +- if (r == RET_MMIO_PF_RETRY)
2222 +- return 1;
2223 +- if (r < 0)
2224 +- return r;
2225 + }
2226 +
2227 +- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
2228 ++ if (r == RET_PF_INVALID) {
2229 ++ r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
2230 ++ WARN_ON(r == RET_PF_INVALID);
2231 ++ }
2232 ++
2233 ++ if (r == RET_PF_RETRY)
2234 ++ return 1;
2235 + if (r < 0)
2236 + return r;
2237 +- if (!r)
2238 +- return 1;
2239 +
2240 + if (mmio_info_in_cache(vcpu, cr2, direct))
2241 + emulation_type = 0;
2242 +@@ -4965,7 +5074,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
2243 + int nr_to_scan = sc->nr_to_scan;
2244 + unsigned long freed = 0;
2245 +
2246 +- spin_lock(&kvm_lock);
2247 ++ mutex_lock(&kvm_lock);
2248 +
2249 + list_for_each_entry(kvm, &vm_list, vm_list) {
2250 + int idx;
2251 +@@ -5015,7 +5124,7 @@ unlock:
2252 + break;
2253 + }
2254 +
2255 +- spin_unlock(&kvm_lock);
2256 ++ mutex_unlock(&kvm_lock);
2257 + return freed;
2258 + }
2259 +
2260 +@@ -5039,8 +5148,58 @@ static void mmu_destroy_caches(void)
2261 + kmem_cache_destroy(mmu_page_header_cache);
2262 + }
2263 +
2264 ++static bool get_nx_auto_mode(void)
2265 ++{
2266 ++ /* Return true when CPU has the bug, and mitigations are ON */
2267 ++ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
2268 ++}
2269 ++
2270 ++static void __set_nx_huge_pages(bool val)
2271 ++{
2272 ++ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
2273 ++}
2274 ++
2275 ++static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
2276 ++{
2277 ++ bool old_val = nx_huge_pages;
2278 ++ bool new_val;
2279 ++
2280 ++ /* In "auto" mode deploy workaround only if CPU has the bug. */
2281 ++ if (sysfs_streq(val, "off"))
2282 ++ new_val = 0;
2283 ++ else if (sysfs_streq(val, "force"))
2284 ++ new_val = 1;
2285 ++ else if (sysfs_streq(val, "auto"))
2286 ++ new_val = get_nx_auto_mode();
2287 ++ else if (strtobool(val, &new_val) < 0)
2288 ++ return -EINVAL;
2289 ++
2290 ++ __set_nx_huge_pages(new_val);
2291 ++
2292 ++ if (new_val != old_val) {
2293 ++ struct kvm *kvm;
2294 ++ int idx;
2295 ++
2296 ++ mutex_lock(&kvm_lock);
2297 ++
2298 ++ list_for_each_entry(kvm, &vm_list, vm_list) {
2299 ++ idx = srcu_read_lock(&kvm->srcu);
2300 ++ kvm_mmu_invalidate_zap_all_pages(kvm);
2301 ++ srcu_read_unlock(&kvm->srcu, idx);
2302 ++
2303 ++ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
2304 ++ }
2305 ++ mutex_unlock(&kvm_lock);
2306 ++ }
2307 ++
2308 ++ return 0;
2309 ++}
2310 ++
2311 + int kvm_mmu_module_init(void)
2312 + {
2313 ++ if (nx_huge_pages == -1)
2314 ++ __set_nx_huge_pages(get_nx_auto_mode());
2315 ++
2316 + pte_list_desc_cache = kmem_cache_create("pte_list_desc",
2317 + sizeof(struct pte_list_desc),
2318 + 0, SLAB_ACCOUNT, NULL);
2319 +@@ -5104,3 +5263,116 @@ void kvm_mmu_module_exit(void)
2320 + unregister_shrinker(&mmu_shrinker);
2321 + mmu_audit_disable();
2322 + }
2323 ++
2324 ++static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
2325 ++{
2326 ++ unsigned int old_val;
2327 ++ int err;
2328 ++
2329 ++ old_val = nx_huge_pages_recovery_ratio;
2330 ++ err = param_set_uint(val, kp);
2331 ++ if (err)
2332 ++ return err;
2333 ++
2334 ++ if (READ_ONCE(nx_huge_pages) &&
2335 ++ !old_val && nx_huge_pages_recovery_ratio) {
2336 ++ struct kvm *kvm;
2337 ++
2338 ++ mutex_lock(&kvm_lock);
2339 ++
2340 ++ list_for_each_entry(kvm, &vm_list, vm_list)
2341 ++ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
2342 ++
2343 ++ mutex_unlock(&kvm_lock);
2344 ++ }
2345 ++
2346 ++ return err;
2347 ++}
2348 ++
2349 ++static void kvm_recover_nx_lpages(struct kvm *kvm)
2350 ++{
2351 ++ int rcu_idx;
2352 ++ struct kvm_mmu_page *sp;
2353 ++ unsigned int ratio;
2354 ++ LIST_HEAD(invalid_list);
2355 ++ ulong to_zap;
2356 ++
2357 ++ rcu_idx = srcu_read_lock(&kvm->srcu);
2358 ++ spin_lock(&kvm->mmu_lock);
2359 ++
2360 ++ ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
2361 ++ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
2362 ++ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
2363 ++ /*
2364 ++ * We use a separate list instead of just using active_mmu_pages
2365 ++ * because the number of lpage_disallowed pages is expected to
2366 ++ * be relatively small compared to the total.
2367 ++ */
2368 ++ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
2369 ++ struct kvm_mmu_page,
2370 ++ lpage_disallowed_link);
2371 ++ WARN_ON_ONCE(!sp->lpage_disallowed);
2372 ++ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2373 ++ WARN_ON_ONCE(sp->lpage_disallowed);
2374 ++
2375 ++ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
2376 ++ kvm_mmu_commit_zap_page(kvm, &invalid_list);
2377 ++ if (to_zap)
2378 ++ cond_resched_lock(&kvm->mmu_lock);
2379 ++ }
2380 ++ }
2381 ++
2382 ++ spin_unlock(&kvm->mmu_lock);
2383 ++ srcu_read_unlock(&kvm->srcu, rcu_idx);
2384 ++}
2385 ++
2386 ++static long get_nx_lpage_recovery_timeout(u64 start_time)
2387 ++{
2388 ++ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
2389 ++ ? start_time + 60 * HZ - get_jiffies_64()
2390 ++ : MAX_SCHEDULE_TIMEOUT;
2391 ++}
2392 ++
2393 ++static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
2394 ++{
2395 ++ u64 start_time;
2396 ++ long remaining_time;
2397 ++
2398 ++ while (true) {
2399 ++ start_time = get_jiffies_64();
2400 ++ remaining_time = get_nx_lpage_recovery_timeout(start_time);
2401 ++
2402 ++ set_current_state(TASK_INTERRUPTIBLE);
2403 ++ while (!kthread_should_stop() && remaining_time > 0) {
2404 ++ schedule_timeout(remaining_time);
2405 ++ remaining_time = get_nx_lpage_recovery_timeout(start_time);
2406 ++ set_current_state(TASK_INTERRUPTIBLE);
2407 ++ }
2408 ++
2409 ++ set_current_state(TASK_RUNNING);
2410 ++
2411 ++ if (kthread_should_stop())
2412 ++ return 0;
2413 ++
2414 ++ kvm_recover_nx_lpages(kvm);
2415 ++ }
2416 ++}
2417 ++
2418 ++int kvm_mmu_post_init_vm(struct kvm *kvm)
2419 ++{
2420 ++ int err;
2421 ++
2422 ++ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
2423 ++ "kvm-nx-lpage-recovery",
2424 ++ &kvm->arch.nx_lpage_recovery_thread);
2425 ++ if (!err)
2426 ++ kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
2427 ++
2428 ++ return err;
2429 ++}
2430 ++
2431 ++void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
2432 ++{
2433 ++ if (kvm->arch.nx_lpage_recovery_thread)
2434 ++ kthread_stop(kvm->arch.nx_lpage_recovery_thread);
2435 ++}
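
For a sense of scale on the recovery thread introduced above: kvm_nx_lpage_recovery_worker() wakes once per 60*HZ period, and kvm_recover_nx_lpages() zaps at most DIV_ROUND_UP(nx_lpage_splits, ratio) pages per pass, so with the default nx_huge_pages_recovery_ratio of 60 roughly 1/60th of the accumulated splits is reclaimed each minute. A minimal sketch of that arithmetic, separate from the patch:

  #include <stdio.h>

  #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

  /* Mirrors the to_zap computation in kvm_recover_nx_lpages(). */
  static unsigned long nx_recovery_to_zap(unsigned long splits, unsigned int ratio)
  {
      return ratio ? DIV_ROUND_UP(splits, ratio) : 0;
  }

  int main(void)
  {
      printf("%lu\n", nx_recovery_to_zap(10000, 60)); /* 167 pages per pass   */
      printf("%lu\n", nx_recovery_to_zap(10000, 0));  /* 0: recovery disabled */
      return 0;
  }
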
2436 +diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
2437 +index c92834c55c59..e584689e7d46 100644
2438 +--- a/arch/x86/kvm/mmu.h
2439 ++++ b/arch/x86/kvm/mmu.h
2440 +@@ -56,23 +56,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
2441 + void
2442 + reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
2443 +
2444 +-/*
2445 +- * Return values of handle_mmio_page_fault:
2446 +- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
2447 +- * directly.
2448 +- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
2449 +- * fault path update the mmio spte.
2450 +- * RET_MMIO_PF_RETRY: let CPU fault again on the address.
2451 +- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
2452 +- */
2453 +-enum {
2454 +- RET_MMIO_PF_EMULATE = 1,
2455 +- RET_MMIO_PF_INVALID = 2,
2456 +- RET_MMIO_PF_RETRY = 0,
2457 +- RET_MMIO_PF_BUG = -1
2458 +-};
2459 +-
2460 +-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
2461 + void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
2462 + void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
2463 + bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
2464 +@@ -202,4 +185,8 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
2465 + void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
2466 + bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
2467 + struct kvm_memory_slot *slot, u64 gfn);
2468 ++
2469 ++int kvm_mmu_post_init_vm(struct kvm *kvm);
2470 ++void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
2471 ++
2472 + #endif
2473 +diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
2474 +index 5a24b846a1cb..756b14ecc957 100644
2475 +--- a/arch/x86/kvm/mmutrace.h
2476 ++++ b/arch/x86/kvm/mmutrace.h
2477 +@@ -322,6 +322,65 @@ TRACE_EVENT(
2478 + __entry->kvm_gen == __entry->spte_gen
2479 + )
2480 + );
2481 ++
2482 ++TRACE_EVENT(
2483 ++ kvm_mmu_set_spte,
2484 ++ TP_PROTO(int level, gfn_t gfn, u64 *sptep),
2485 ++ TP_ARGS(level, gfn, sptep),
2486 ++
2487 ++ TP_STRUCT__entry(
2488 ++ __field(u64, gfn)
2489 ++ __field(u64, spte)
2490 ++ __field(u64, sptep)
2491 ++ __field(u8, level)
2492 ++ /* These depend on page entry type, so compute them now. */
2493 ++ __field(bool, r)
2494 ++ __field(bool, x)
2495 ++ __field(u8, u)
2496 ++ ),
2497 ++
2498 ++ TP_fast_assign(
2499 ++ __entry->gfn = gfn;
2500 ++ __entry->spte = *sptep;
2501 ++ __entry->sptep = virt_to_phys(sptep);
2502 ++ __entry->level = level;
2503 ++ __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
2504 ++ __entry->x = is_executable_pte(__entry->spte);
2505 ++ __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
2506 ++ ),
2507 ++
2508 ++ TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
2509 ++ __entry->gfn, __entry->spte,
2510 ++ __entry->r ? "r" : "-",
2511 ++ __entry->spte & PT_PRESENT_MASK ? "w" : "-",
2512 ++ __entry->x ? "x" : "-",
2513 ++ __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
2514 ++ __entry->level, __entry->sptep
2515 ++ )
2516 ++);
2517 ++
2518 ++TRACE_EVENT(
2519 ++ kvm_mmu_spte_requested,
2520 ++ TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn),
2521 ++ TP_ARGS(addr, level, pfn),
2522 ++
2523 ++ TP_STRUCT__entry(
2524 ++ __field(u64, gfn)
2525 ++ __field(u64, pfn)
2526 ++ __field(u8, level)
2527 ++ ),
2528 ++
2529 ++ TP_fast_assign(
2530 ++ __entry->gfn = addr >> PAGE_SHIFT;
2531 ++ __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
2532 ++ __entry->level = level;
2533 ++ ),
2534 ++
2535 ++ TP_printk("gfn %llx pfn %llx level %d",
2536 ++ __entry->gfn, __entry->pfn, __entry->level
2537 ++ )
2538 ++);
2539 ++
2540 + #endif /* _TRACE_KVMMMU_H */
2541 +
2542 + #undef TRACE_INCLUDE_PATH
2543 +diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
2544 +index 37363900297d..e03225e707b2 100644
2545 +--- a/arch/x86/kvm/paging_tmpl.h
2546 ++++ b/arch/x86/kvm/paging_tmpl.h
2547 +@@ -499,6 +499,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2548 + mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
2549 + true, true);
2550 +
2551 ++ kvm_release_pfn_clean(pfn);
2552 + return true;
2553 + }
2554 +
2555 +@@ -572,12 +573,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
2556 + static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
2557 + struct guest_walker *gw,
2558 + int write_fault, int hlevel,
2559 +- kvm_pfn_t pfn, bool map_writable, bool prefault)
2560 ++ kvm_pfn_t pfn, bool map_writable, bool prefault,
2561 ++ bool lpage_disallowed)
2562 + {
2563 + struct kvm_mmu_page *sp = NULL;
2564 + struct kvm_shadow_walk_iterator it;
2565 + unsigned direct_access, access = gw->pt_access;
2566 +- int top_level, emulate;
2567 ++ int top_level, ret;
2568 ++ gfn_t gfn, base_gfn;
2569 +
2570 + direct_access = gw->pte_access;
2571 +
2572 +@@ -622,36 +625,49 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
2573 + link_shadow_page(vcpu, it.sptep, sp);
2574 + }
2575 +
2576 +- for (;
2577 +- shadow_walk_okay(&it) && it.level > hlevel;
2578 +- shadow_walk_next(&it)) {
2579 +- gfn_t direct_gfn;
2580 ++ /*
2581 ++ * FNAME(page_fault) might have clobbered the bottom bits of
2582 ++ * gw->gfn, restore them from the virtual address.
2583 ++ */
2584 ++ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
2585 ++ base_gfn = gfn;
2586 +
2587 ++ trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
2588 ++
2589 ++ for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
2590 + clear_sp_write_flooding_count(it.sptep);
2591 +- validate_direct_spte(vcpu, it.sptep, direct_access);
2592 +
2593 +- drop_large_spte(vcpu, it.sptep);
2594 ++ /*
2595 ++ * We cannot overwrite existing page tables with an NX
2596 ++ * large page, as the leaf could be executable.
2597 ++ */
2598 ++ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
2599 +
2600 +- if (is_shadow_present_pte(*it.sptep))
2601 +- continue;
2602 ++ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
2603 ++ if (it.level == hlevel)
2604 ++ break;
2605 +
2606 +- direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
2607 ++ validate_direct_spte(vcpu, it.sptep, direct_access);
2608 +
2609 +- sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
2610 +- true, direct_access);
2611 +- link_shadow_page(vcpu, it.sptep, sp);
2612 ++ drop_large_spte(vcpu, it.sptep);
2613 ++
2614 ++ if (!is_shadow_present_pte(*it.sptep)) {
2615 ++ sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
2616 ++ it.level - 1, true, direct_access);
2617 ++ link_shadow_page(vcpu, it.sptep, sp);
2618 ++ if (lpage_disallowed)
2619 ++ account_huge_nx_page(vcpu->kvm, sp);
2620 ++ }
2621 + }
2622 +
2623 +- clear_sp_write_flooding_count(it.sptep);
2624 +- emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
2625 +- it.level, gw->gfn, pfn, prefault, map_writable);
2626 ++ ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
2627 ++ it.level, base_gfn, pfn, prefault, map_writable);
2628 + FNAME(pte_prefetch)(vcpu, gw, it.sptep);
2629 +-
2630 +- return emulate;
2631 ++ ++vcpu->stat.pf_fixed;
2632 ++ return ret;
2633 +
2634 + out_gpte_changed:
2635 +- kvm_release_pfn_clean(pfn);
2636 +- return 0;
2637 ++ return RET_PF_RETRY;
2638 + }
2639 +
2640 + /*
2641 +@@ -717,9 +733,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
2642 + int r;
2643 + kvm_pfn_t pfn;
2644 + int level = PT_PAGE_TABLE_LEVEL;
2645 +- bool force_pt_level = false;
2646 + unsigned long mmu_seq;
2647 + bool map_writable, is_self_change_mapping;
2648 ++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
2649 ++ is_nx_huge_page_enabled();
2650 ++ bool force_pt_level = lpage_disallowed;
2651 +
2652 + pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
2653 +
2654 +@@ -746,12 +764,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
2655 + if (!prefault)
2656 + inject_page_fault(vcpu, &walker.fault);
2657 +
2658 +- return 0;
2659 ++ return RET_PF_RETRY;
2660 + }
2661 +
2662 + if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
2663 + shadow_page_table_clear_flood(vcpu, addr);
2664 +- return 1;
2665 ++ return RET_PF_EMULATE;
2666 + }
2667 +
2668 + vcpu->arch.write_fault_to_shadow_pgtable = false;
2669 +@@ -773,7 +791,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
2670 +
2671 + if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
2672 + &map_writable))
2673 +- return 0;
2674 ++ return RET_PF_RETRY;
2675 +
2676 + if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
2677 + walker.gfn, pfn, walker.pte_access, &r))
2678 +@@ -799,6 +817,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
2679 + walker.pte_access &= ~ACC_EXEC_MASK;
2680 + }
2681 +
2682 ++ r = RET_PF_RETRY;
2683 + spin_lock(&vcpu->kvm->mmu_lock);
2684 + if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
2685 + goto out_unlock;
2686 +@@ -806,19 +825,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
2687 + kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
2688 + make_mmu_pages_available(vcpu);
2689 + if (!force_pt_level)
2690 +- transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
2691 ++ transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
2692 + r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
2693 +- level, pfn, map_writable, prefault);
2694 +- ++vcpu->stat.pf_fixed;
2695 ++ level, pfn, map_writable, prefault, lpage_disallowed);
2696 + kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
2697 +- spin_unlock(&vcpu->kvm->mmu_lock);
2698 +-
2699 +- return r;
2700 +
2701 + out_unlock:
2702 + spin_unlock(&vcpu->kvm->mmu_lock);
2703 + kvm_release_pfn_clean(pfn);
2704 +- return 0;
2705 ++ return r;
2706 + }
2707 +
2708 + static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
2709 +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
2710 +index f7a7b98b3271..1079228e4fef 100644
2711 +--- a/arch/x86/kvm/svm.c
2712 ++++ b/arch/x86/kvm/svm.c
2713 +@@ -590,8 +590,14 @@ static int get_npt_level(void)
2714 + static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
2715 + {
2716 + vcpu->arch.efer = efer;
2717 +- if (!npt_enabled && !(efer & EFER_LMA))
2718 +- efer &= ~EFER_LME;
2719 ++
2720 ++ if (!npt_enabled) {
2721 ++ /* Shadow paging assumes NX to be available. */
2722 ++ efer |= EFER_NX;
2723 ++
2724 ++ if (!(efer & EFER_LMA))
2725 ++ efer &= ~EFER_LME;
2726 ++ }
2727 +
2728 + to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
2729 + mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
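
Both the svm.c hunk above and the vmx.c hunk below encode the same assumption: when shadow paging is in use (no NPT/EPT), the EFER value the guest runs under must have NX set. A stand-alone sketch of the adjusted svm_set_efer() logic, separate from the patch; the EFER bit values are written out here only for illustration:

  #include <stdint.h>
  #include <stdio.h>

  /* EFER bits as used in this sketch. */
  #define EFER_LME (1ULL << 8)
  #define EFER_LMA (1ULL << 10)
  #define EFER_NX  (1ULL << 11)

  /* Mirrors the !npt_enabled branch added to svm_set_efer() above. */
  static uint64_t shadow_paging_efer(uint64_t efer, int npt_enabled)
  {
      if (!npt_enabled) {
          efer |= EFER_NX;            /* shadow paging relies on NX     */
          if (!(efer & EFER_LMA))
              efer &= ~EFER_LME;      /* keep the existing LME handling */
      }
      return efer;
  }

  int main(void)
  {
      printf("%#llx\n", (unsigned long long)shadow_paging_efer(0, 0)); /* NX forced */
      return 0;
  }
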
2730 +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
2731 +index 6b66d1f0d185..4c0d6d0d6337 100644
2732 +--- a/arch/x86/kvm/vmx.c
2733 ++++ b/arch/x86/kvm/vmx.c
2734 +@@ -2219,17 +2219,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2735 + u64 guest_efer = vmx->vcpu.arch.efer;
2736 + u64 ignore_bits = 0;
2737 +
2738 +- if (!enable_ept) {
2739 +- /*
2740 +- * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
2741 +- * host CPUID is more efficient than testing guest CPUID
2742 +- * or CR4. Host SMEP is anyway a requirement for guest SMEP.
2743 +- */
2744 +- if (boot_cpu_has(X86_FEATURE_SMEP))
2745 +- guest_efer |= EFER_NX;
2746 +- else if (!(guest_efer & EFER_NX))
2747 +- ignore_bits |= EFER_NX;
2748 +- }
2749 ++ /* Shadow paging assumes NX to be available. */
2750 ++ if (!enable_ept)
2751 ++ guest_efer |= EFER_NX;
2752 +
2753 + /*
2754 + * LMA and LME handled by hardware; SCE meaningless outside long mode.
2755 +@@ -6556,16 +6548,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
2756 + NULL, 0) == EMULATE_DONE;
2757 + }
2758 +
2759 +- ret = handle_mmio_page_fault(vcpu, gpa, true);
2760 +- if (likely(ret == RET_MMIO_PF_EMULATE))
2761 +- return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
2762 +- EMULATE_DONE;
2763 +-
2764 +- if (unlikely(ret == RET_MMIO_PF_INVALID))
2765 +- return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
2766 +-
2767 +- if (unlikely(ret == RET_MMIO_PF_RETRY))
2768 +- return 1;
2769 ++ ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
2770 ++ if (ret >= 0)
2771 ++ return ret;
2772 +
2773 + /* It is the real ept misconfig */
2774 + WARN_ON(1);
2775 +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2776 +index 0b6517f5821b..06cd710e1d45 100644
2777 +--- a/arch/x86/kvm/x86.c
2778 ++++ b/arch/x86/kvm/x86.c
2779 +@@ -191,6 +191,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
2780 + { "mmu_unsync", VM_STAT(mmu_unsync) },
2781 + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
2782 + { "largepages", VM_STAT(lpages) },
2783 ++ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits) },
2784 + { NULL }
2785 + };
2786 +
2787 +@@ -587,7 +588,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
2788 + gfn_t gfn;
2789 + int r;
2790 +
2791 +- if (is_long_mode(vcpu) || !is_pae(vcpu))
2792 ++ if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
2793 + return false;
2794 +
2795 + if (!test_bit(VCPU_EXREG_PDPTR,
2796 +@@ -1031,6 +1032,14 @@ u64 kvm_get_arch_capabilities(void)
2797 +
2798 + rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
2799 +
2800 ++ /*
2801 ++ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
2802 ++ * the nested hypervisor runs with NX huge pages. If it is not,
2803 ++ * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
2804 ++ * L1 guests, so it need not worry about its own (L2) guests.
2805 ++ */
2806 ++ data |= ARCH_CAP_PSCHANGE_MC_NO;
2807 ++
2808 + /*
2809 + * If we're doing cache flushes (either "always" or "cond")
2810 + * we will do one whenever the guest does a vmlaunch/vmresume.
2811 +@@ -1043,8 +1052,35 @@ u64 kvm_get_arch_capabilities(void)
2812 + if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
2813 + data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
2814 +
2815 ++ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
2816 ++ data |= ARCH_CAP_RDCL_NO;
2817 ++ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
2818 ++ data |= ARCH_CAP_SSB_NO;
2819 ++ if (!boot_cpu_has_bug(X86_BUG_MDS))
2820 ++ data |= ARCH_CAP_MDS_NO;
2821 ++
2822 ++ /*
2823 ++ * On TAA affected systems, export MDS_NO=0 when:
2824 ++ * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
2825 ++ * - Updated microcode is present. This is detected by
2826 ++ * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
2827 ++ * that VERW clears CPU buffers.
2828 ++ *
2829 ++ * When MDS_NO=0 is exported, guests deploy clear CPU buffer
2830 ++ * mitigation and don't complain:
2831 ++ *
2832 ++ * "Vulnerable: Clear CPU buffers attempted, no microcode"
2833 ++ *
2834 ++ * If TSX is disabled on the system, guests are also mitigated against
2835 ++ * TAA and clear CPU buffer mitigation is not required for guests.
2836 ++ */
2837 ++ if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) &&
2838 ++ (data & ARCH_CAP_TSX_CTRL_MSR))
2839 ++ data &= ~ARCH_CAP_MDS_NO;
2840 ++
2841 + return data;
2842 + }
2843 ++
2844 + EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
2845 +
2846 + static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
2847 +@@ -5951,17 +5987,17 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
2848 +
2849 + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
2850 +
2851 +- spin_lock(&kvm_lock);
2852 ++ mutex_lock(&kvm_lock);
2853 + list_for_each_entry(kvm, &vm_list, vm_list) {
2854 + kvm_for_each_vcpu(i, vcpu, kvm) {
2855 + if (vcpu->cpu != freq->cpu)
2856 + continue;
2857 + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2858 +- if (vcpu->cpu != smp_processor_id())
2859 ++ if (vcpu->cpu != raw_smp_processor_id())
2860 + send_ipi = 1;
2861 + }
2862 + }
2863 +- spin_unlock(&kvm_lock);
2864 ++ mutex_unlock(&kvm_lock);
2865 +
2866 + if (freq->old < freq->new && send_ipi) {
2867 + /*
2868 +@@ -6099,12 +6135,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
2869 + struct kvm_vcpu *vcpu;
2870 + int i;
2871 +
2872 +- spin_lock(&kvm_lock);
2873 ++ mutex_lock(&kvm_lock);
2874 + list_for_each_entry(kvm, &vm_list, vm_list)
2875 + kvm_for_each_vcpu(i, vcpu, kvm)
2876 + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2877 + atomic_set(&kvm_guest_has_master_clock, 0);
2878 +- spin_unlock(&kvm_lock);
2879 ++ mutex_unlock(&kvm_lock);
2880 + }
2881 +
2882 + static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
2883 +@@ -7491,7 +7527,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2884 + kvm_update_cpuid(vcpu);
2885 +
2886 + idx = srcu_read_lock(&vcpu->kvm->srcu);
2887 +- if (!is_long_mode(vcpu) && is_pae(vcpu)) {
2888 ++ if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) {
2889 + load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
2890 + mmu_reset_needed = 1;
2891 + }
2892 +@@ -8072,6 +8108,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
2893 + INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
2894 + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
2895 + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
2896 ++ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
2897 + INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
2898 + atomic_set(&kvm->arch.noncoherent_dma_count, 0);
2899 +
2900 +@@ -8100,6 +8137,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
2901 + return 0;
2902 + }
2903 +
2904 ++int kvm_arch_post_init_vm(struct kvm *kvm)
2905 ++{
2906 ++ return kvm_mmu_post_init_vm(kvm);
2907 ++}
2908 ++
2909 + static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
2910 + {
2911 + int r;
2912 +@@ -8206,6 +8248,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
2913 + }
2914 + EXPORT_SYMBOL_GPL(x86_set_memory_region);
2915 +
2916 ++void kvm_arch_pre_destroy_vm(struct kvm *kvm)
2917 ++{
2918 ++ kvm_mmu_pre_destroy_vm(kvm);
2919 ++}
2920 ++
2921 + void kvm_arch_destroy_vm(struct kvm *kvm)
2922 + {
2923 + if (current->mm == kvm->mm) {
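
The kvm_get_arch_capabilities() changes above synthesize the guest-visible MSR_IA32_ARCH_CAPABILITIES from the host's bug state: PSCHANGE_MC_NO is always set, the *_NO bits are set when the host is unaffected, and MDS_NO is withdrawn on TAA-affected hosts that keep TSX enabled so guests still apply the VERW-based buffer clearing. A condensed sketch of that logic, separate from the patch; the CAP_* mask values are assumed stand-ins:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Assumed stand-ins for the ARCH_CAP_* bits referenced above. */
  #define CAP_RDCL_NO        (1ULL << 0)
  #define CAP_SSB_NO         (1ULL << 4)
  #define CAP_MDS_NO         (1ULL << 5)
  #define CAP_PSCHANGE_MC_NO (1ULL << 6)
  #define CAP_TSX_CTRL_MSR   (1ULL << 7)

  struct host { bool bug_meltdown, bug_ssb, bug_mds, bug_taa, has_rtm; };

  static uint64_t guest_arch_caps(uint64_t host_caps, const struct host *h)
  {
      uint64_t data = host_caps;

      data |= CAP_PSCHANGE_MC_NO;          /* nx_huge_pages covers L2 guests */
      if (!h->bug_meltdown) data |= CAP_RDCL_NO;
      if (!h->bug_ssb)      data |= CAP_SSB_NO;
      if (!h->bug_mds)      data |= CAP_MDS_NO;

      /* TAA-affected host, TSX on, TSX_CTRL microcode present:
       * hide MDS_NO so the guest applies the clear-buffers mitigation. */
      if (h->bug_taa && h->has_rtm && (data & CAP_TSX_CTRL_MSR))
          data &= ~CAP_MDS_NO;

      return data;
  }

  int main(void)
  {
      struct host h = { .bug_taa = true, .has_rtm = true };
      printf("%#llx\n", (unsigned long long)guest_arch_caps(CAP_TSX_CTRL_MSR, &h));
      return 0;
  }
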
2924 +diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
2925 +index 3b123735a1c4..677c5f36674b 100644
2926 +--- a/drivers/base/cpu.c
2927 ++++ b/drivers/base/cpu.c
2928 +@@ -537,12 +537,27 @@ ssize_t __weak cpu_show_mds(struct device *dev,
2929 + return sprintf(buf, "Not affected\n");
2930 + }
2931 +
2932 ++ssize_t __weak cpu_show_tsx_async_abort(struct device *dev,
2933 ++ struct device_attribute *attr,
2934 ++ char *buf)
2935 ++{
2936 ++ return sprintf(buf, "Not affected\n");
2937 ++}
2938 ++
2939 ++ssize_t __weak cpu_show_itlb_multihit(struct device *dev,
2940 ++ struct device_attribute *attr, char *buf)
2941 ++{
2942 ++ return sprintf(buf, "Not affected\n");
2943 ++}
2944 ++
2945 + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
2946 + static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
2947 + static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
2948 + static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
2949 + static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
2950 + static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);
2951 ++static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL);
2952 ++static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL);
2953 +
2954 + static struct attribute *cpu_root_vulnerabilities_attrs[] = {
2955 + &dev_attr_meltdown.attr,
2956 +@@ -551,6 +566,8 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
2957 + &dev_attr_spec_store_bypass.attr,
2958 + &dev_attr_l1tf.attr,
2959 + &dev_attr_mds.attr,
2960 ++ &dev_attr_tsx_async_abort.attr,
2961 ++ &dev_attr_itlb_multihit.attr,
2962 + NULL
2963 + };
2964 +
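For illustration only (commentary on the patch, not part of it): with the two attributes above wired into cpu_root_vulnerabilities_attrs[], the new files appear as /sys/devices/system/cpu/vulnerabilities/tsx_async_abort and /sys/devices/system/cpu/vulnerabilities/itlb_multihit, and the __weak handlers report "Not affected" until an architecture supplies its own cpu_show_*() implementation. A minimal userspace sketch for reading one of them; the messages and error handling are illustrative:

#include <stdio.h>

/* Hypothetical userspace reader for one of the new vulnerability files. */
int main(void)
{
	const char *path =
		"/sys/devices/system/cpu/vulnerabilities/itlb_multihit";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);	/* absent on kernels without these attributes */
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("%s: %s", path, line);	/* e.g. "Not affected" */
	fclose(f);
	return 0;
}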
2965 +diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
2966 +index a2f6953a86f5..0a21fb86fd67 100644
2967 +--- a/drivers/bluetooth/hci_ldisc.c
2968 ++++ b/drivers/bluetooth/hci_ldisc.c
2969 +@@ -653,15 +653,14 @@ static int hci_uart_set_proto(struct hci_uart *hu, int id)
2970 + return err;
2971 +
2972 + hu->proto = p;
2973 +- set_bit(HCI_UART_PROTO_READY, &hu->flags);
2974 +
2975 + err = hci_uart_register_dev(hu);
2976 + if (err) {
2977 +- clear_bit(HCI_UART_PROTO_READY, &hu->flags);
2978 + p->close(hu);
2979 + return err;
2980 + }
2981 +
2982 ++ set_bit(HCI_UART_PROTO_READY, &hu->flags);
2983 + return 0;
2984 + }
2985 +
2986 +diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
2987 +index 95e28ecfde0a..99c7cf4822c3 100644
2988 +--- a/drivers/usb/gadget/udc/core.c
2989 ++++ b/drivers/usb/gadget/udc/core.c
2990 +@@ -817,6 +817,8 @@ int usb_gadget_map_request_by_dev(struct device *dev,
2991 + dev_err(dev, "failed to map buffer\n");
2992 + return -EFAULT;
2993 + }
2994 ++
2995 ++ req->dma_mapped = 1;
2996 + }
2997 +
2998 + return 0;
2999 +@@ -841,9 +843,10 @@ void usb_gadget_unmap_request_by_dev(struct device *dev,
3000 + is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
3001 +
3002 + req->num_mapped_sgs = 0;
3003 +- } else {
3004 ++ } else if (req->dma_mapped) {
3005 + dma_unmap_single(dev, req->dma, req->length,
3006 + is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
3007 ++ req->dma_mapped = 0;
3008 + }
3009 + }
3010 + EXPORT_SYMBOL_GPL(usb_gadget_unmap_request_by_dev);
3011 +diff --git a/include/linux/cpu.h b/include/linux/cpu.h
3012 +index b27c9b2e683f..e19bbc38a722 100644
3013 +--- a/include/linux/cpu.h
3014 ++++ b/include/linux/cpu.h
3015 +@@ -56,6 +56,11 @@ extern ssize_t cpu_show_l1tf(struct device *dev,
3016 + struct device_attribute *attr, char *buf);
3017 + extern ssize_t cpu_show_mds(struct device *dev,
3018 + struct device_attribute *attr, char *buf);
3019 ++extern ssize_t cpu_show_tsx_async_abort(struct device *dev,
3020 ++ struct device_attribute *attr,
3021 ++ char *buf);
3022 ++extern ssize_t cpu_show_itlb_multihit(struct device *dev,
3023 ++ struct device_attribute *attr, char *buf);
3024 +
3025 + extern __printf(4, 5)
3026 + struct device *cpu_device_create(struct device *parent, void *drvdata,
3027 +@@ -282,28 +287,7 @@ static inline int cpuhp_smt_enable(void) { return 0; }
3028 + static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; }
3029 + #endif
3030 +
3031 +-/*
3032 +- * These are used for a global "mitigations=" cmdline option for toggling
3033 +- * optional CPU mitigations.
3034 +- */
3035 +-enum cpu_mitigations {
3036 +- CPU_MITIGATIONS_OFF,
3037 +- CPU_MITIGATIONS_AUTO,
3038 +- CPU_MITIGATIONS_AUTO_NOSMT,
3039 +-};
3040 +-
3041 +-extern enum cpu_mitigations cpu_mitigations;
3042 +-
3043 +-/* mitigations=off */
3044 +-static inline bool cpu_mitigations_off(void)
3045 +-{
3046 +- return cpu_mitigations == CPU_MITIGATIONS_OFF;
3047 +-}
3048 +-
3049 +-/* mitigations=auto,nosmt */
3050 +-static inline bool cpu_mitigations_auto_nosmt(void)
3051 +-{
3052 +- return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
3053 +-}
3054 ++extern bool cpu_mitigations_off(void);
3055 ++extern bool cpu_mitigations_auto_nosmt(void);
3056 +
3057 + #endif /* _LINUX_CPU_H_ */
3058 +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
3059 +index eb55374b73f3..0590e7d47b02 100644
3060 +--- a/include/linux/kvm_host.h
3061 ++++ b/include/linux/kvm_host.h
3062 +@@ -129,7 +129,7 @@ static inline bool is_error_page(struct page *page)
3063 +
3064 + extern struct kmem_cache *kvm_vcpu_cache;
3065 +
3066 +-extern spinlock_t kvm_lock;
3067 ++extern struct mutex kvm_lock;
3068 + extern struct list_head vm_list;
3069 +
3070 + struct kvm_io_range {
3071 +@@ -1208,4 +1208,10 @@ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu)
3072 + }
3073 + #endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */
3074 +
3075 ++typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
3076 ++
3077 ++int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
3078 ++ uintptr_t data, const char *name,
3079 ++ struct task_struct **thread_ptr);
3080 ++
3081 + #endif
3082 +diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
3083 +index e4516e9ded0f..4b810bc7ae63 100644
3084 +--- a/include/linux/usb/gadget.h
3085 ++++ b/include/linux/usb/gadget.h
3086 +@@ -48,6 +48,7 @@ struct usb_ep;
3087 + * by adding a zero length packet as needed;
3088 + * @short_not_ok: When reading data, makes short packets be
3089 + * treated as errors (queue stops advancing till cleanup).
3090 ++ * @dma_mapped: Indicates if request has been mapped to DMA (internal)
3091 + * @complete: Function called when request completes, so this request and
3092 + * its buffer may be re-used. The function will always be called with
3093 + * interrupts disabled, and it must not sleep.
3094 +@@ -103,6 +104,7 @@ struct usb_request {
3095 + unsigned no_interrupt:1;
3096 + unsigned zero:1;
3097 + unsigned short_not_ok:1;
3098 ++ unsigned dma_mapped:1;
3099 +
3100 + void (*complete)(struct usb_ep *ep,
3101 + struct usb_request *req);
3102 +diff --git a/kernel/cpu.c b/kernel/cpu.c
3103 +index c947bb35b89f..0ed3e9deda30 100644
3104 +--- a/kernel/cpu.c
3105 ++++ b/kernel/cpu.c
3106 +@@ -2235,7 +2235,18 @@ void __init boot_cpu_hotplug_init(void)
3107 + this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
3108 + }
3109 +
3110 +-enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
3111 ++/*
3112 ++ * These are used for a global "mitigations=" cmdline option for toggling
3113 ++ * optional CPU mitigations.
3114 ++ */
3115 ++enum cpu_mitigations {
3116 ++ CPU_MITIGATIONS_OFF,
3117 ++ CPU_MITIGATIONS_AUTO,
3118 ++ CPU_MITIGATIONS_AUTO_NOSMT,
3119 ++};
3120 ++
3121 ++static enum cpu_mitigations cpu_mitigations __ro_after_init =
3122 ++ CPU_MITIGATIONS_AUTO;
3123 +
3124 + static int __init mitigations_parse_cmdline(char *arg)
3125 + {
3126 +@@ -2252,3 +2263,17 @@ static int __init mitigations_parse_cmdline(char *arg)
3127 + return 0;
3128 + }
3129 + early_param("mitigations", mitigations_parse_cmdline);
3130 ++
3131 ++/* mitigations=off */
3132 ++bool cpu_mitigations_off(void)
3133 ++{
3134 ++ return cpu_mitigations == CPU_MITIGATIONS_OFF;
3135 ++}
3136 ++EXPORT_SYMBOL_GPL(cpu_mitigations_off);
3137 ++
3138 ++/* mitigations=auto,nosmt */
3139 ++bool cpu_mitigations_auto_nosmt(void)
3140 ++{
3141 ++ return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
3142 ++}
3143 ++EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
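For illustration only (commentary, not part of the patch): after this change cpu_mitigations_off() and cpu_mitigations_auto_nosmt() are real exported functions rather than inline helpers, so modular code can also consult the global "mitigations=" setting. A hedged sketch of a caller; the example_select_mitigation() wrapper and messages are hypothetical:

#include <linux/cpu.h>
#include <linux/printk.h>

/* Hypothetical: pick a mitigation mode based on the "mitigations=" option. */
static void example_select_mitigation(void)
{
	if (cpu_mitigations_off()) {		/* mitigations=off */
		pr_info("example: mitigation disabled on the command line\n");
		return;
	}

	if (cpu_mitigations_auto_nosmt())	/* mitigations=auto,nosmt */
		pr_info("example: mitigation enabled, SMT should be disabled\n");
	else					/* mitigations=auto (default) */
		pr_info("example: mitigation enabled\n");
}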
3144 +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
3145 +index c72586a094ed..0fc93519e63e 100644
3146 +--- a/virt/kvm/kvm_main.c
3147 ++++ b/virt/kvm/kvm_main.c
3148 +@@ -49,6 +49,7 @@
3149 + #include <linux/slab.h>
3150 + #include <linux/sort.h>
3151 + #include <linux/bsearch.h>
3152 ++#include <linux/kthread.h>
3153 +
3154 + #include <asm/processor.h>
3155 + #include <asm/io.h>
3156 +@@ -87,7 +88,7 @@ module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
3157 + * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
3158 + */
3159 +
3160 +-DEFINE_SPINLOCK(kvm_lock);
3161 ++DEFINE_MUTEX(kvm_lock);
3162 + static DEFINE_RAW_SPINLOCK(kvm_count_lock);
3163 + LIST_HEAD(vm_list);
3164 +
3165 +@@ -612,6 +613,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
3166 + return 0;
3167 + }
3168 +
3169 ++/*
3170 ++ * Called after the VM is otherwise initialized, but just before adding it to
3171 ++ * the vm_list.
3172 ++ */
3173 ++int __weak kvm_arch_post_init_vm(struct kvm *kvm)
3174 ++{
3175 ++ return 0;
3176 ++}
3177 ++
3178 ++/*
3179 ++ * Called just after removing the VM from the vm_list, but before doing any
3180 ++ * other destruction.
3181 ++ */
3182 ++void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
3183 ++{
3184 ++}
3185 ++
3186 + static struct kvm *kvm_create_vm(unsigned long type)
3187 + {
3188 + int r, i;
3189 +@@ -659,22 +677,31 @@ static struct kvm *kvm_create_vm(unsigned long type)
3190 + kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
3191 + GFP_KERNEL);
3192 + if (!kvm->buses[i])
3193 +- goto out_err;
3194 ++ goto out_err_no_mmu_notifier;
3195 + }
3196 +
3197 + r = kvm_init_mmu_notifier(kvm);
3198 ++ if (r)
3199 ++ goto out_err_no_mmu_notifier;
3200 ++
3201 ++ r = kvm_arch_post_init_vm(kvm);
3202 + if (r)
3203 + goto out_err;
3204 +
3205 +- spin_lock(&kvm_lock);
3206 ++ mutex_lock(&kvm_lock);
3207 + list_add(&kvm->vm_list, &vm_list);
3208 +- spin_unlock(&kvm_lock);
3209 ++ mutex_unlock(&kvm_lock);
3210 +
3211 + preempt_notifier_inc();
3212 +
3213 + return kvm;
3214 +
3215 + out_err:
3216 ++#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
3217 ++ if (kvm->mmu_notifier.ops)
3218 ++ mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
3219 ++#endif
3220 ++out_err_no_mmu_notifier:
3221 + cleanup_srcu_struct(&kvm->irq_srcu);
3222 + out_err_no_irq_srcu:
3223 + cleanup_srcu_struct(&kvm->srcu);
3224 +@@ -724,9 +751,11 @@ static void kvm_destroy_vm(struct kvm *kvm)
3225 +
3226 + kvm_destroy_vm_debugfs(kvm);
3227 + kvm_arch_sync_events(kvm);
3228 +- spin_lock(&kvm_lock);
3229 ++ mutex_lock(&kvm_lock);
3230 + list_del(&kvm->vm_list);
3231 +- spin_unlock(&kvm_lock);
3232 ++ mutex_unlock(&kvm_lock);
3233 ++ kvm_arch_pre_destroy_vm(kvm);
3234 ++
3235 + kvm_free_irq_routing(kvm);
3236 + for (i = 0; i < KVM_NR_BUSES; i++) {
3237 + if (kvm->buses[i])
3238 +@@ -3752,13 +3781,13 @@ static int vm_stat_get(void *_offset, u64 *val)
3239 + u64 tmp_val;
3240 +
3241 + *val = 0;
3242 +- spin_lock(&kvm_lock);
3243 ++ mutex_lock(&kvm_lock);
3244 + list_for_each_entry(kvm, &vm_list, vm_list) {
3245 + stat_tmp.kvm = kvm;
3246 + vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
3247 + *val += tmp_val;
3248 + }
3249 +- spin_unlock(&kvm_lock);
3250 ++ mutex_unlock(&kvm_lock);
3251 + return 0;
3252 + }
3253 +
3254 +@@ -3772,13 +3801,13 @@ static int vcpu_stat_get(void *_offset, u64 *val)
3255 + u64 tmp_val;
3256 +
3257 + *val = 0;
3258 +- spin_lock(&kvm_lock);
3259 ++ mutex_lock(&kvm_lock);
3260 + list_for_each_entry(kvm, &vm_list, vm_list) {
3261 + stat_tmp.kvm = kvm;
3262 + vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
3263 + *val += tmp_val;
3264 + }
3265 +- spin_unlock(&kvm_lock);
3266 ++ mutex_unlock(&kvm_lock);
3267 + return 0;
3268 + }
3269 +
3270 +@@ -3987,3 +4016,86 @@ void kvm_exit(void)
3271 + kvm_vfio_ops_exit();
3272 + }
3273 + EXPORT_SYMBOL_GPL(kvm_exit);
3274 ++
3275 ++struct kvm_vm_worker_thread_context {
3276 ++ struct kvm *kvm;
3277 ++ struct task_struct *parent;
3278 ++ struct completion init_done;
3279 ++ kvm_vm_thread_fn_t thread_fn;
3280 ++ uintptr_t data;
3281 ++ int err;
3282 ++};
3283 ++
3284 ++static int kvm_vm_worker_thread(void *context)
3285 ++{
3286 ++ /*
3287 ++ * The init_context is allocated on the stack of the parent thread, so
3288 ++ * we have to locally copy anything that is needed beyond initialization
3289 ++ */
3290 ++ struct kvm_vm_worker_thread_context *init_context = context;
3291 ++ struct kvm *kvm = init_context->kvm;
3292 ++ kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
3293 ++ uintptr_t data = init_context->data;
3294 ++ int err;
3295 ++
3296 ++ err = kthread_park(current);
3297 ++ /* kthread_park(current) is never supposed to return an error */
3298 ++ WARN_ON(err != 0);
3299 ++ if (err)
3300 ++ goto init_complete;
3301 ++
3302 ++ err = cgroup_attach_task_all(init_context->parent, current);
3303 ++ if (err) {
3304 ++ kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
3305 ++ __func__, err);
3306 ++ goto init_complete;
3307 ++ }
3308 ++
3309 ++ set_user_nice(current, task_nice(init_context->parent));
3310 ++
3311 ++init_complete:
3312 ++ init_context->err = err;
3313 ++ complete(&init_context->init_done);
3314 ++ init_context = NULL;
3315 ++
3316 ++ if (err)
3317 ++ return err;
3318 ++
3319 ++ /* Wait to be woken up by the spawner before proceeding. */
3320 ++ kthread_parkme();
3321 ++
3322 ++ if (!kthread_should_stop())
3323 ++ err = thread_fn(kvm, data);
3324 ++
3325 ++ return err;
3326 ++}
3327 ++
3328 ++int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
3329 ++ uintptr_t data, const char *name,
3330 ++ struct task_struct **thread_ptr)
3331 ++{
3332 ++ struct kvm_vm_worker_thread_context init_context = {};
3333 ++ struct task_struct *thread;
3334 ++
3335 ++ *thread_ptr = NULL;
3336 ++ init_context.kvm = kvm;
3337 ++ init_context.parent = current;
3338 ++ init_context.thread_fn = thread_fn;
3339 ++ init_context.data = data;
3340 ++ init_completion(&init_context.init_done);
3341 ++
3342 ++ thread = kthread_run(kvm_vm_worker_thread, &init_context,
3343 ++ "%s-%d", name, task_pid_nr(current));
3344 ++ if (IS_ERR(thread))
3345 ++ return PTR_ERR(thread);
3346 ++
3347 ++ /* kthread_run is never supposed to return NULL */
3348 ++ WARN_ON(thread == NULL);
3349 ++
3350 ++ wait_for_completion(&init_context.init_done);
3351 ++
3352 ++ if (!init_context.err)
3353 ++ *thread_ptr = thread;
3354 ++
3355 ++ return init_context.err;
3356 ++}
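For illustration only (commentary, not part of the patch): kvm_vm_create_worker_thread() returns with the worker parked, propagates any cgroup-attach failure from kvm_vm_worker_thread(), and otherwise hands the task back through *thread_ptr; the caller is expected to unpark it and later stop it with kthread_stop(). In this patch the callers are the kvm_mmu_post_init_vm()/kvm_mmu_pre_destroy_vm() hooks added for x86. A hedged usage sketch with hypothetical names:

#include <linux/kthread.h>
#include <linux/kvm_host.h>

/* Hypothetical worker: thread_fn is expected to loop until asked to stop. */
static int example_worker_fn(struct kvm *kvm, uintptr_t data)
{
	while (!kthread_should_stop()) {
		/* ... periodic per-VM maintenance work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

/* Hypothetical post-init hook, in the spirit of kvm_mmu_post_init_vm(). */
static int example_post_init_vm(struct kvm *kvm, struct task_struct **worker)
{
	int err;

	err = kvm_vm_create_worker_thread(kvm, example_worker_fn, 0,
					  "kvm-example-worker", worker);
	if (err)
		return err;

	kthread_unpark(*worker);	/* worker stays parked until this */
	return 0;
}

/* Hypothetical pre-destroy hook, in the spirit of kvm_mmu_pre_destroy_vm(). */
static void example_pre_destroy_vm(struct task_struct *worker)
{
	if (worker)
		kthread_stop(worker);
}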