Gentoo Archives: gentoo-commits

From: Alice Ferrazzi <alicef@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.9 commit in: /
Date: Fri, 05 Jan 2018 15:04:10
Message-Id: 1515164634.a08c6f0923abc66cb0192f849780a30c3016e946.alicef@gentoo
1 commit: a08c6f0923abc66cb0192f849780a30c3016e946
2 Author: Alice Ferrazzi <alicef <AT> gentoo <DOT> org>
3 AuthorDate: Fri Jan 5 15:03:54 2018 +0000
4 Commit: Alice Ferrazzi <alicef <AT> gentoo <DOT> org>
5 CommitDate: Fri Jan 5 15:03:54 2018 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=a08c6f09
7
8 linux kernel 4.9.75
9
10 0000_README | 4 +
11 1074_linux-4.9.75.patch | 2577 +++++++++++++++++++++++++++++++++++++++++++++++
12 2 files changed, 2581 insertions(+)
13
14 diff --git a/0000_README b/0000_README
15 index 350d2c5..eed3372 100644
16 --- a/0000_README
17 +++ b/0000_README
18 @@ -339,6 +339,10 @@ Patch: 1073_linux-4.9.74.patch
19 From: http://www.kernel.org
20 Desc: Linux 4.9.74
21
22 +Patch: 1074_linux-4.9.75.patch
23 +From: http://www.kernel.org
24 +Desc: Linux 4.9.75
25 +
26 Patch: 1500_XATTR_USER_PREFIX.patch
27 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
28 Desc: Support for namespace user.pax.* on tmpfs.
29
30 diff --git a/1074_linux-4.9.75.patch b/1074_linux-4.9.75.patch
31 new file mode 100644
32 index 0000000..6299f19
33 --- /dev/null
34 +++ b/1074_linux-4.9.75.patch
35 @@ -0,0 +1,2577 @@
36 +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
37 +index 152ec4e87b57..5d2676d043de 100644
38 +--- a/Documentation/kernel-parameters.txt
39 ++++ b/Documentation/kernel-parameters.txt
40 +@@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
41 +
42 + nojitter [IA-64] Disables jitter checking for ITC timers.
43 +
44 ++ nopti [X86-64] Disable KAISER isolation of kernel from user.
45 ++
46 + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
47 +
48 + no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
49 +@@ -3325,6 +3327,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
50 + pt. [PARIDE]
51 + See Documentation/blockdev/paride.txt.
52 +
53 ++ pti= [X86_64]
54 ++ Control KAISER user/kernel address space isolation:
55 ++ on - enable
56 ++ off - disable
57 ++ auto - default setting
58 ++
59 + pty.legacy_count=
60 + [KNL] Number of legacy pty's. Overwrites compiled-in
61 + default number.
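For orientation, the two options documented above gate the whole feature: "nopti" or "pti=off" disables the KAISER isolation, "pti=on" forces it, and "pti=auto" keeps the default. A minimal standalone C model of that decision is sketched below; it is illustrative only and not part of the patch (the kernel's real logic lives in kaiser_check_boottime_disable() in arch/x86/mm/kaiser.c and parses the command line more carefully, e.g. respecting word boundaries).

#include <stdio.h>
#include <string.h>

/* Illustrative model only: "nopti" or "pti=off" disables isolation,
 * "pti=on" forces it on, "pti=auto" (or no option) keeps the default.
 * Unlike the kernel parser, strstr() ignores word boundaries. */
static int pti_enabled(const char *cmdline, int default_on)
{
    const char *p;

    if (strstr(cmdline, "nopti"))
        return 0;
    p = strstr(cmdline, "pti=");
    if (p) {
        if (!strncmp(p + 4, "off", 3))
            return 0;
        if (!strncmp(p + 4, "on", 2))
            return 1;
        /* "auto" or anything else: fall through to the default */
    }
    return default_on;
}

int main(void)
{
    printf("%d\n", pti_enabled("root=/dev/sda1 pti=off", 1)); /* 0 */
    printf("%d\n", pti_enabled("quiet nopti", 1));            /* 0 */
    printf("%d\n", pti_enabled("pti=auto", 1));               /* 1 */
    return 0;
}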
62 +diff --git a/Makefile b/Makefile
63 +index 075e429732e7..acbc1b032db2 100644
64 +--- a/Makefile
65 ++++ b/Makefile
66 +@@ -1,6 +1,6 @@
67 + VERSION = 4
68 + PATCHLEVEL = 9
69 +-SUBLEVEL = 74
70 ++SUBLEVEL = 75
71 + EXTRAVERSION =
72 + NAME = Roaring Lionus
73 +
74 +diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
75 +index 766a5211f827..2728e1b7e4a6 100644
76 +--- a/arch/x86/boot/compressed/misc.h
77 ++++ b/arch/x86/boot/compressed/misc.h
78 +@@ -9,6 +9,7 @@
79 + */
80 + #undef CONFIG_PARAVIRT
81 + #undef CONFIG_PARAVIRT_SPINLOCKS
82 ++#undef CONFIG_PAGE_TABLE_ISOLATION
83 + #undef CONFIG_KASAN
84 +
85 + #include <linux/linkage.h>
86 +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
87 +index e7b0e7ff4c58..af4e58132d91 100644
88 +--- a/arch/x86/entry/entry_64.S
89 ++++ b/arch/x86/entry/entry_64.S
90 +@@ -36,6 +36,7 @@
91 + #include <asm/smap.h>
92 + #include <asm/pgtable_types.h>
93 + #include <asm/export.h>
94 ++#include <asm/kaiser.h>
95 + #include <linux/err.h>
96 +
97 + /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
98 +@@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64)
99 + * it is too small to ever cause noticeable irq latency.
100 + */
101 + SWAPGS_UNSAFE_STACK
102 ++ SWITCH_KERNEL_CR3_NO_STACK
103 + /*
104 + * A hypervisor implementation might want to use a label
105 + * after the swapgs, so that it can do the swapgs
106 +@@ -228,6 +230,14 @@ entry_SYSCALL_64_fastpath:
107 + movq RIP(%rsp), %rcx
108 + movq EFLAGS(%rsp), %r11
109 + RESTORE_C_REGS_EXCEPT_RCX_R11
110 ++ /*
111 ++ * This opens a window where we have a user CR3, but are
112 ++ * running in the kernel. This makes using the CS
113 ++ * register useless for telling whether or not we need to
114 ++ * switch CR3 in NMIs. Normal interrupts are OK because
115 ++ * they are off here.
116 ++ */
117 ++ SWITCH_USER_CR3
118 + movq RSP(%rsp), %rsp
119 + USERGS_SYSRET64
120 +
121 +@@ -323,10 +333,26 @@ return_from_SYSCALL_64:
122 + syscall_return_via_sysret:
123 + /* rcx and r11 are already restored (see code above) */
124 + RESTORE_C_REGS_EXCEPT_RCX_R11
125 ++ /*
126 ++ * This opens a window where we have a user CR3, but are
127 ++ * running in the kernel. This makes using the CS
128 ++ * register useless for telling whether or not we need to
129 ++ * switch CR3 in NMIs. Normal interrupts are OK because
130 ++ * they are off here.
131 ++ */
132 ++ SWITCH_USER_CR3
133 + movq RSP(%rsp), %rsp
134 + USERGS_SYSRET64
135 +
136 + opportunistic_sysret_failed:
137 ++ /*
138 ++ * This opens a window where we have a user CR3, but are
139 ++ * running in the kernel. This makes using the CS
140 ++ * register useless for telling whether or not we need to
141 ++ * switch CR3 in NMIs. Normal interrupts are OK because
142 ++ * they are off here.
143 ++ */
144 ++ SWITCH_USER_CR3
145 + SWAPGS
146 + jmp restore_c_regs_and_iret
147 + END(entry_SYSCALL_64)
148 +@@ -424,6 +450,7 @@ ENTRY(ret_from_fork)
149 + movq %rsp, %rdi
150 + call syscall_return_slowpath /* returns with IRQs disabled */
151 + TRACE_IRQS_ON /* user mode is traced as IRQS on */
152 ++ SWITCH_USER_CR3
153 + SWAPGS
154 + jmp restore_regs_and_iret
155 +
156 +@@ -478,6 +505,7 @@ END(irq_entries_start)
157 + * tracking that we're in kernel mode.
158 + */
159 + SWAPGS
160 ++ SWITCH_KERNEL_CR3
161 +
162 + /*
163 + * We need to tell lockdep that IRQs are off. We can't do this until
164 +@@ -535,6 +563,7 @@ GLOBAL(retint_user)
165 + mov %rsp,%rdi
166 + call prepare_exit_to_usermode
167 + TRACE_IRQS_IRETQ
168 ++ SWITCH_USER_CR3
169 + SWAPGS
170 + jmp restore_regs_and_iret
171 +
172 +@@ -612,6 +641,7 @@ native_irq_return_ldt:
173 +
174 + pushq %rdi /* Stash user RDI */
175 + SWAPGS
176 ++ SWITCH_KERNEL_CR3
177 + movq PER_CPU_VAR(espfix_waddr), %rdi
178 + movq %rax, (0*8)(%rdi) /* user RAX */
179 + movq (1*8)(%rsp), %rax /* user RIP */
180 +@@ -638,6 +668,7 @@ native_irq_return_ldt:
181 + * still points to an RO alias of the ESPFIX stack.
182 + */
183 + orq PER_CPU_VAR(espfix_stack), %rax
184 ++ SWITCH_USER_CR3
185 + SWAPGS
186 + movq %rax, %rsp
187 +
188 +@@ -1022,7 +1053,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
189 + /*
190 + * Save all registers in pt_regs, and switch gs if needed.
191 + * Use slow, but surefire "are we in kernel?" check.
192 +- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
193 ++ *
194 ++ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
195 ++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
196 ++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
197 ++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
198 + */
199 + ENTRY(paranoid_entry)
200 + cld
201 +@@ -1035,7 +1070,26 @@ ENTRY(paranoid_entry)
202 + js 1f /* negative -> in kernel */
203 + SWAPGS
204 + xorl %ebx, %ebx
205 +-1: ret
206 ++1:
207 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
208 ++ /*
209 ++ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
210 ++ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
211 ++ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
212 ++ * unconditionally, but we need to find out whether the reverse
213 ++ * should be done on return (conveyed to paranoid_exit in %ebx).
214 ++ */
215 ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
216 ++ testl $KAISER_SHADOW_PGD_OFFSET, %eax
217 ++ jz 2f
218 ++ orl $2, %ebx
219 ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
220 ++ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
221 ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
222 ++ movq %rax, %cr3
223 ++2:
224 ++#endif
225 ++ ret
226 + END(paranoid_entry)
227 +
228 + /*
229 +@@ -1048,19 +1102,26 @@ END(paranoid_entry)
230 + * be complicated. Fortunately, there's no good reason
231 + * to try to handle preemption here.
232 + *
233 +- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
234 ++ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
235 ++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
236 ++ * ebx=2: needs both swapgs and SWITCH_USER_CR3
237 ++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
238 + */
239 + ENTRY(paranoid_exit)
240 + DISABLE_INTERRUPTS(CLBR_NONE)
241 + TRACE_IRQS_OFF_DEBUG
242 +- testl %ebx, %ebx /* swapgs needed? */
243 ++ TRACE_IRQS_IRETQ_DEBUG
244 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
245 ++ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
246 ++ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
247 ++ jz paranoid_exit_no_switch
248 ++ SWITCH_USER_CR3
249 ++paranoid_exit_no_switch:
250 ++#endif
251 ++ testl $1, %ebx /* swapgs needed? */
252 + jnz paranoid_exit_no_swapgs
253 +- TRACE_IRQS_IRETQ
254 + SWAPGS_UNSAFE_STACK
255 +- jmp paranoid_exit_restore
256 + paranoid_exit_no_swapgs:
257 +- TRACE_IRQS_IRETQ_DEBUG
258 +-paranoid_exit_restore:
259 + RESTORE_EXTRA_REGS
260 + RESTORE_C_REGS
261 + REMOVE_PT_GPREGS_FROM_STACK 8
262 +@@ -1075,6 +1136,13 @@ ENTRY(error_entry)
263 + cld
264 + SAVE_C_REGS 8
265 + SAVE_EXTRA_REGS 8
266 ++ /*
267 ++ * error_entry() always returns with a kernel gsbase and
268 ++ * CR3. We must also have a kernel CR3/gsbase before
269 ++ * calling TRACE_IRQS_*. Just unconditionally switch to
270 ++ * the kernel CR3 here.
271 ++ */
272 ++ SWITCH_KERNEL_CR3
273 + xorl %ebx, %ebx
274 + testb $3, CS+8(%rsp)
275 + jz .Lerror_kernelspace
276 +@@ -1235,6 +1303,10 @@ ENTRY(nmi)
277 + */
278 +
279 + SWAPGS_UNSAFE_STACK
280 ++ /*
281 ++ * percpu variables are mapped with user CR3, so no need
282 ++ * to switch CR3 here.
283 ++ */
284 + cld
285 + movq %rsp, %rdx
286 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
287 +@@ -1268,12 +1340,34 @@ ENTRY(nmi)
288 +
289 + movq %rsp, %rdi
290 + movq $-1, %rsi
291 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
292 ++ /* Unconditionally use kernel CR3 for do_nmi() */
293 ++ /* %rax is saved above, so OK to clobber here */
294 ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
295 ++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
296 ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
297 ++ pushq %rax
298 ++ /* mask off "user" bit of pgd address and 12 PCID bits: */
299 ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
300 ++ movq %rax, %cr3
301 ++2:
302 ++#endif
303 + call do_nmi
304 +
305 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
306 ++ /*
307 ++ * Unconditionally restore CR3. I know we return to
308 ++ * kernel code that needs user CR3, but do we ever return
309 ++ * to "user mode" where we need the kernel CR3?
310 ++ */
311 ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
312 ++#endif
313 ++
314 + /*
315 + * Return back to user mode. We must *not* do the normal exit
316 +- * work, because we don't want to enable interrupts. Fortunately,
317 +- * do_nmi doesn't modify pt_regs.
318 ++ * work, because we don't want to enable interrupts. Do not
319 ++ * switch to user CR3: we might be going back to kernel code
320 ++ * that had a user CR3 set.
321 + */
322 + SWAPGS
323 + jmp restore_c_regs_and_iret
324 +@@ -1470,22 +1564,55 @@ end_repeat_nmi:
325 + ALLOC_PT_GPREGS_ON_STACK
326 +
327 + /*
328 +- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
329 +- * as we should not be calling schedule in NMI context.
330 +- * Even with normal interrupts enabled. An NMI should not be
331 +- * setting NEED_RESCHED or anything that normal interrupts and
332 +- * exceptions might do.
333 ++ * Use the same approach as paranoid_entry to handle SWAPGS, but
334 ++ * without CR3 handling since we do that differently in NMIs. No
335 ++ * need to use paranoid_exit as we should not be calling schedule
336 ++ * in NMI context. Even with normal interrupts enabled. An NMI
337 ++ * should not be setting NEED_RESCHED or anything that normal
338 ++ * interrupts and exceptions might do.
339 + */
340 +- call paranoid_entry
341 +-
342 +- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
343 ++ cld
344 ++ SAVE_C_REGS
345 ++ SAVE_EXTRA_REGS
346 ++ movl $1, %ebx
347 ++ movl $MSR_GS_BASE, %ecx
348 ++ rdmsr
349 ++ testl %edx, %edx
350 ++ js 1f /* negative -> in kernel */
351 ++ SWAPGS
352 ++ xorl %ebx, %ebx
353 ++1:
354 + movq %rsp, %rdi
355 + movq $-1, %rsi
356 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
357 ++ /* Unconditionally use kernel CR3 for do_nmi() */
358 ++ /* %rax is saved above, so OK to clobber here */
359 ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
360 ++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
361 ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
362 ++ pushq %rax
363 ++ /* mask off "user" bit of pgd address and 12 PCID bits: */
364 ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
365 ++ movq %rax, %cr3
366 ++2:
367 ++#endif
368 ++
369 ++ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
370 + call do_nmi
371 +
372 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
373 ++ /*
374 ++ * Unconditionally restore CR3. We might be returning to
375 ++ * kernel code that needs user CR3, like just before
376 ++ * a sysret.
377 ++ */
378 ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
379 ++#endif
380 ++
381 + testl %ebx, %ebx /* swapgs needed? */
382 + jnz nmi_restore
383 + nmi_swapgs:
384 ++ /* We fixed up CR3 above, so no need to switch it here */
385 + SWAPGS_UNSAFE_STACK
386 + nmi_restore:
387 + RESTORE_EXTRA_REGS
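The %ebx protocol described in the paranoid_entry/paranoid_exit comments above packs two independent decisions into one register: bit 0 set means swapgs is not needed on exit, and bit 1 set means SWITCH_USER_CR3 is needed. The standalone sketch below (illustrative only, not part of the patch) models how paranoid_exit consumes the four possible values.

#include <stdio.h>

/* ebx bit 0 set -> swapgs is NOT needed on exit
 * ebx bit 1 set -> SWITCH_USER_CR3 IS needed on exit
 * so: 0 = swapgs only, 1 = neither, 2 = both, 3 = CR3 switch only. */
static void paranoid_exit_model(unsigned ebx)
{
    printf("ebx=%u:", ebx);
    if (ebx & 0x2)
        printf(" SWITCH_USER_CR3");
    if (!(ebx & 0x1))
        printf(" SWAPGS");
    printf("\n");
}

int main(void)
{
    for (unsigned ebx = 0; ebx < 4; ebx++)
        paranoid_exit_model(ebx);
    return 0;
}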
388 +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
389 +index e1721dafbcb1..d76a97653980 100644
390 +--- a/arch/x86/entry/entry_64_compat.S
391 ++++ b/arch/x86/entry/entry_64_compat.S
392 +@@ -13,6 +13,8 @@
393 + #include <asm/irqflags.h>
394 + #include <asm/asm.h>
395 + #include <asm/smap.h>
396 ++#include <asm/pgtable_types.h>
397 ++#include <asm/kaiser.h>
398 + #include <linux/linkage.h>
399 + #include <linux/err.h>
400 +
401 +@@ -48,6 +50,7 @@
402 + ENTRY(entry_SYSENTER_compat)
403 + /* Interrupts are off on entry. */
404 + SWAPGS_UNSAFE_STACK
405 ++ SWITCH_KERNEL_CR3_NO_STACK
406 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
407 +
408 + /*
409 +@@ -184,6 +187,7 @@ ENDPROC(entry_SYSENTER_compat)
410 + ENTRY(entry_SYSCALL_compat)
411 + /* Interrupts are off on entry. */
412 + SWAPGS_UNSAFE_STACK
413 ++ SWITCH_KERNEL_CR3_NO_STACK
414 +
415 + /* Stash user ESP and switch to the kernel stack. */
416 + movl %esp, %r8d
417 +@@ -259,6 +263,7 @@ sysret32_from_system_call:
418 + xorq %r8, %r8
419 + xorq %r9, %r9
420 + xorq %r10, %r10
421 ++ SWITCH_USER_CR3
422 + movq RSP-ORIG_RAX(%rsp), %rsp
423 + swapgs
424 + sysretl
425 +@@ -297,7 +302,7 @@ ENTRY(entry_INT80_compat)
426 + PARAVIRT_ADJUST_EXCEPTION_FRAME
427 + ASM_CLAC /* Do this early to minimize exposure */
428 + SWAPGS
429 +-
430 ++ SWITCH_KERNEL_CR3_NO_STACK
431 + /*
432 + * User tracing code (ptrace or signal handlers) might assume that
433 + * the saved RAX contains a 32-bit number when we're invoking a 32-bit
434 +@@ -338,6 +343,7 @@ ENTRY(entry_INT80_compat)
435 +
436 + /* Go back to user mode. */
437 + TRACE_IRQS_ON
438 ++ SWITCH_USER_CR3
439 + SWAPGS
440 + jmp restore_regs_and_iret
441 + END(entry_INT80_compat)
442 +diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
443 +index 9dfeeeca0ea8..8e7a3f1df3a5 100644
444 +--- a/arch/x86/events/intel/ds.c
445 ++++ b/arch/x86/events/intel/ds.c
446 +@@ -2,11 +2,15 @@
447 + #include <linux/types.h>
448 + #include <linux/slab.h>
449 +
450 ++#include <asm/kaiser.h>
451 + #include <asm/perf_event.h>
452 + #include <asm/insn.h>
453 +
454 + #include "../perf_event.h"
455 +
456 ++static
457 ++DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
458 ++
459 + /* The size of a BTS record in bytes: */
460 + #define BTS_RECORD_SIZE 24
461 +
462 +@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
463 +
464 + static DEFINE_PER_CPU(void *, insn_buffer);
465 +
466 ++static void *dsalloc(size_t size, gfp_t flags, int node)
467 ++{
468 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
469 ++ unsigned int order = get_order(size);
470 ++ struct page *page;
471 ++ unsigned long addr;
472 ++
473 ++ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
474 ++ if (!page)
475 ++ return NULL;
476 ++ addr = (unsigned long)page_address(page);
477 ++ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
478 ++ __free_pages(page, order);
479 ++ addr = 0;
480 ++ }
481 ++ return (void *)addr;
482 ++#else
483 ++ return kmalloc_node(size, flags | __GFP_ZERO, node);
484 ++#endif
485 ++}
486 ++
487 ++static void dsfree(const void *buffer, size_t size)
488 ++{
489 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
490 ++ if (!buffer)
491 ++ return;
492 ++ kaiser_remove_mapping((unsigned long)buffer, size);
493 ++ free_pages((unsigned long)buffer, get_order(size));
494 ++#else
495 ++ kfree(buffer);
496 ++#endif
497 ++}
498 ++
499 + static int alloc_pebs_buffer(int cpu)
500 + {
501 + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
502 +@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
503 + if (!x86_pmu.pebs)
504 + return 0;
505 +
506 +- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
507 ++ buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
508 + if (unlikely(!buffer))
509 + return -ENOMEM;
510 +
511 +@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
512 + if (x86_pmu.intel_cap.pebs_format < 2) {
513 + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
514 + if (!ibuffer) {
515 +- kfree(buffer);
516 ++ dsfree(buffer, x86_pmu.pebs_buffer_size);
517 + return -ENOMEM;
518 + }
519 + per_cpu(insn_buffer, cpu) = ibuffer;
520 +@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
521 + kfree(per_cpu(insn_buffer, cpu));
522 + per_cpu(insn_buffer, cpu) = NULL;
523 +
524 +- kfree((void *)(unsigned long)ds->pebs_buffer_base);
525 ++ dsfree((void *)(unsigned long)ds->pebs_buffer_base,
526 ++ x86_pmu.pebs_buffer_size);
527 + ds->pebs_buffer_base = 0;
528 + }
529 +
530 +@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
531 + if (!x86_pmu.bts)
532 + return 0;
533 +
534 +- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
535 ++ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
536 + if (unlikely(!buffer)) {
537 + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
538 + return -ENOMEM;
539 +@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
540 + if (!ds || !x86_pmu.bts)
541 + return;
542 +
543 +- kfree((void *)(unsigned long)ds->bts_buffer_base);
544 ++ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
545 + ds->bts_buffer_base = 0;
546 + }
547 +
548 + static int alloc_ds_buffer(int cpu)
549 + {
550 +- int node = cpu_to_node(cpu);
551 +- struct debug_store *ds;
552 +-
553 +- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
554 +- if (unlikely(!ds))
555 +- return -ENOMEM;
556 ++ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
557 +
558 ++ memset(ds, 0, sizeof(*ds));
559 + per_cpu(cpu_hw_events, cpu).ds = ds;
560 +
561 + return 0;
562 +@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
563 + return;
564 +
565 + per_cpu(cpu_hw_events, cpu).ds = NULL;
566 +- kfree(ds);
567 + }
568 +
569 + void release_ds_buffers(void)
570 +diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
571 +index e01f7f7ccb0c..84ae170bc3d0 100644
572 +--- a/arch/x86/include/asm/cmdline.h
573 ++++ b/arch/x86/include/asm/cmdline.h
574 +@@ -2,5 +2,7 @@
575 + #define _ASM_X86_CMDLINE_H
576 +
577 + int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
578 ++int cmdline_find_option(const char *cmdline_ptr, const char *option,
579 ++ char *buffer, int bufsize);
580 +
581 + #endif /* _ASM_X86_CMDLINE_H */
582 +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
583 +index ed10b5bf9b93..454a37adb823 100644
584 +--- a/arch/x86/include/asm/cpufeatures.h
585 ++++ b/arch/x86/include/asm/cpufeatures.h
586 +@@ -189,6 +189,7 @@
587 +
588 + #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
589 + #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
590 ++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
591 +
592 + #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
593 + #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
594 +@@ -197,6 +198,9 @@
595 + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
596 + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
597 +
598 ++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
599 ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
600 ++
601 + /* Virtualization flags: Linux defined, word 8 */
602 + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
603 + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
604 +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
605 +index 12080d87da3b..2ed5a2b3f8f7 100644
606 +--- a/arch/x86/include/asm/desc.h
607 ++++ b/arch/x86/include/asm/desc.h
608 +@@ -43,7 +43,7 @@ struct gdt_page {
609 + struct desc_struct gdt[GDT_ENTRIES];
610 + } __attribute__((aligned(PAGE_SIZE)));
611 +
612 +-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
613 ++DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
614 +
615 + static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
616 + {
617 +diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
618 +index b90e1053049b..0817d63bce41 100644
619 +--- a/arch/x86/include/asm/hw_irq.h
620 ++++ b/arch/x86/include/asm/hw_irq.h
621 +@@ -178,7 +178,7 @@ extern char irq_entries_start[];
622 + #define VECTOR_RETRIGGERED ((void *)~0UL)
623 +
624 + typedef struct irq_desc* vector_irq_t[NR_VECTORS];
625 +-DECLARE_PER_CPU(vector_irq_t, vector_irq);
626 ++DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
627 +
628 + #endif /* !ASSEMBLY_ */
629 +
630 +diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
631 +new file mode 100644
632 +index 000000000000..802bbbdfe143
633 +--- /dev/null
634 ++++ b/arch/x86/include/asm/kaiser.h
635 +@@ -0,0 +1,141 @@
636 ++#ifndef _ASM_X86_KAISER_H
637 ++#define _ASM_X86_KAISER_H
638 ++
639 ++#include <uapi/asm/processor-flags.h> /* For PCID constants */
640 ++
641 ++/*
642 ++ * This file includes the definitions for the KAISER feature.
643 ++ * KAISER is a countermeasure against x86_64 side-channel attacks on
644 ++ * kernel virtual memory. It gives every process a shadow pgd: the shadow
645 ++ * pgd maps only a minimal set of kernel code and data, but the whole of
646 ++ * user memory. On a switch into the kernel, or when an interrupt is
647 ++ * handled, the pgd is switched to the normal one; on return to user
648 ++ * mode, the shadow pgd is used instead, so kernel addresses are absent
649 ++ * from the user-visible translations and user space cannot attack kernel memory.
650 ++ *
651 ++ * A minimalistic kernel mapping holds the parts needed to be mapped in user
652 ++ * mode, such as the entry/exit functions of the user space, or the stacks.
653 ++ */
654 ++
655 ++#define KAISER_SHADOW_PGD_OFFSET 0x1000
656 ++
657 ++#ifdef __ASSEMBLY__
658 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
659 ++
660 ++.macro _SWITCH_TO_KERNEL_CR3 reg
661 ++movq %cr3, \reg
662 ++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
663 ++/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
664 ++ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
665 ++movq \reg, %cr3
666 ++.endm
667 ++
668 ++.macro _SWITCH_TO_USER_CR3 reg regb
669 ++/*
670 ++ * regb must be the low byte portion of reg: because we have arranged
671 ++ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
672 ++ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
673 ++ * not enabled): so that the one register can update both memory and cr3.
674 ++ */
675 ++movq %cr3, \reg
676 ++orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
677 ++js 9f
678 ++/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
679 ++movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
680 ++9:
681 ++movq \reg, %cr3
682 ++.endm
683 ++
684 ++.macro SWITCH_KERNEL_CR3
685 ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
686 ++_SWITCH_TO_KERNEL_CR3 %rax
687 ++popq %rax
688 ++8:
689 ++.endm
690 ++
691 ++.macro SWITCH_USER_CR3
692 ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
693 ++_SWITCH_TO_USER_CR3 %rax %al
694 ++popq %rax
695 ++8:
696 ++.endm
697 ++
698 ++.macro SWITCH_KERNEL_CR3_NO_STACK
699 ++ALTERNATIVE "jmp 8f", \
700 ++ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
701 ++ X86_FEATURE_KAISER
702 ++_SWITCH_TO_KERNEL_CR3 %rax
703 ++movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
704 ++8:
705 ++.endm
706 ++
707 ++#else /* CONFIG_PAGE_TABLE_ISOLATION */
708 ++
709 ++.macro SWITCH_KERNEL_CR3
710 ++.endm
711 ++.macro SWITCH_USER_CR3
712 ++.endm
713 ++.macro SWITCH_KERNEL_CR3_NO_STACK
714 ++.endm
715 ++
716 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
717 ++
718 ++#else /* __ASSEMBLY__ */
719 ++
720 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
721 ++/*
722 ++ * Upon kernel/user mode switch, it may happen that the address
723 ++ * space has to be switched before the registers have been
724 ++ * stored. To change the address space, another register is
725 ++ * needed. A register therefore has to be stored/restored.
726 ++ */
727 ++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
728 ++
729 ++DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
730 ++
731 ++extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
732 ++
733 ++extern int kaiser_enabled;
734 ++extern void __init kaiser_check_boottime_disable(void);
735 ++#else
736 ++#define kaiser_enabled 0
737 ++static inline void __init kaiser_check_boottime_disable(void) {}
738 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
739 ++
740 ++/*
741 ++ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
742 ++ * so as to build with tests on kaiser_enabled instead of #ifdefs.
743 ++ */
744 ++
745 ++/**
746 ++ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
747 ++ * @addr: the start address of the range
748 ++ * @size: the size of the range
749 ++ * @flags: The mapping flags of the pages
750 ++ *
752 ++ * The mapping is done at a global scope, so no further
753 ++ * synchronization is required. The pages have to be unmapped
754 ++ * manually again when they are no longer needed.
754 ++ */
755 ++extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
756 ++
757 ++/**
758 ++ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
759 ++ * @addr: the start address of the range
760 ++ * @size: the size of the range
761 ++ */
762 ++extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
763 ++
764 ++/**
765 ++ * kaiser_init - Initialize the shadow mapping
766 ++ *
768 ++ * Most parts of the shadow mapping can be mapped at boot
768 ++ * time. Only per-process things like the thread stacks
769 ++ * or a new LDT have to be mapped at runtime. These boot-
770 ++ * time mappings are permanent and never unmapped.
771 ++ */
772 ++extern void kaiser_init(void);
773 ++
774 ++#endif /* __ASSEMBLY */
775 ++
776 ++#endif /* _ASM_X86_KAISER_H */
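As a usage sketch of the kaiser_add_mapping()/kaiser_remove_mapping() interfaces declared above, the pair of helpers below mirrors the dsalloc()/dsfree() functions this patch adds to arch/x86/events/intel/ds.c. It is illustrative only and not part of the patch; the helper names are invented and the include list is an assumption.

#include <linux/mm.h>
#include <linux/gfp.h>
#include <asm/kaiser.h>

/* Allocate a buffer that must stay visible in the shadow (user) page
 * tables, e.g. because the CPU writes to it while the user CR3 is loaded. */
static void *alloc_user_mapped(size_t size, gfp_t flags, int node)
{
    unsigned int order = get_order(size);
    struct page *page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
    unsigned long addr;

    if (!page)
        return NULL;
    addr = (unsigned long)page_address(page);
    /* Also map the buffer into the shadow (user) page tables. */
    if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
        __free_pages(page, order);
        return NULL;
    }
    return (void *)addr;
}

static void free_user_mapped(const void *buf, size_t size)
{
    if (!buf)
        return;
    kaiser_remove_mapping((unsigned long)buf, size);
    free_pages((unsigned long)buf, get_order(size));
}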
777 +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
778 +index 437feb436efa..2536f90cd30c 100644
779 +--- a/arch/x86/include/asm/pgtable.h
780 ++++ b/arch/x86/include/asm/pgtable.h
781 +@@ -18,6 +18,12 @@
782 + #ifndef __ASSEMBLY__
783 + #include <asm/x86_init.h>
784 +
785 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
786 ++extern int kaiser_enabled;
787 ++#else
788 ++#define kaiser_enabled 0
789 ++#endif
790 ++
791 + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
792 + void ptdump_walk_pgd_level_checkwx(void);
793 +
794 +@@ -690,7 +696,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
795 +
796 + static inline int pgd_bad(pgd_t pgd)
797 + {
798 +- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
799 ++ pgdval_t ignore_flags = _PAGE_USER;
800 ++ /*
801 ++ * We set NX on KAISER pgds that map userspace memory so
802 ++ * that userspace can not meaningfully use the kernel
803 ++ * page table by accident; it will fault on the first
804 ++ * instruction it tries to run. See native_set_pgd().
805 ++ */
806 ++ if (kaiser_enabled)
807 ++ ignore_flags |= _PAGE_NX;
808 ++
809 ++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
810 + }
811 +
812 + static inline int pgd_none(pgd_t pgd)
813 +@@ -903,7 +919,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
814 + */
815 + static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
816 + {
817 +- memcpy(dst, src, count * sizeof(pgd_t));
818 ++ memcpy(dst, src, count * sizeof(pgd_t));
819 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
820 ++ if (kaiser_enabled) {
821 ++ /* Clone the shadow pgd part as well */
822 ++ memcpy(native_get_shadow_pgd(dst),
823 ++ native_get_shadow_pgd(src),
824 ++ count * sizeof(pgd_t));
825 ++ }
826 ++#endif
827 + }
828 +
829 + #define PTE_SHIFT ilog2(PTRS_PER_PTE)
830 +diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
831 +index 1cc82ece9ac1..ce97c8c6a310 100644
832 +--- a/arch/x86/include/asm/pgtable_64.h
833 ++++ b/arch/x86/include/asm/pgtable_64.h
834 +@@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
835 + native_set_pud(pud, native_make_pud(0));
836 + }
837 +
838 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
839 ++extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
840 ++
841 ++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
842 ++{
843 ++#ifdef CONFIG_DEBUG_VM
844 ++ /* linux/mmdebug.h may not have been included at this point */
845 ++ BUG_ON(!kaiser_enabled);
846 ++#endif
847 ++ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
848 ++}
849 ++#else
850 ++static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
851 ++{
852 ++ return pgd;
853 ++}
854 ++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
855 ++{
856 ++ BUILD_BUG_ON(1);
857 ++ return NULL;
858 ++}
859 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
860 ++
861 + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
862 + {
863 +- *pgdp = pgd;
864 ++ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
865 + }
866 +
867 + static inline void native_pgd_clear(pgd_t *pgd)
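native_get_shadow_pgd() above is pure address arithmetic: because each PGD allocation is 8 KiB long and 8 KiB aligned (see the NEXT_PGD_PAGE/KAISER_USER_PGD_FILL changes to head_64.S later in this patch), OR-ing PAGE_SIZE into a kernel pgd pointer yields the matching slot in the shadow (user) half. A standalone model of that trick, illustrative only and with a made-up address:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

/* With an 8 KiB-aligned, 8 KiB-long PGD, the kernel half occupies the
 * first 4 KiB page and the shadow (user) half the second one, so the
 * shadow slot for an entry is the same offset with bit 12 set. */
static uint64_t shadow_pgd_entry(uint64_t kernel_pgd_entry)
{
    return kernel_pgd_entry | PAGE_SIZE;
}

int main(void)
{
    uint64_t pgd_base = 0xffff880012344000ULL;  /* hypothetical, 8 KiB aligned */
    uint64_t entry    = pgd_base + 8 * 5;       /* 6th pgd_t in the page */

    printf("kernel entry: %#llx\n", (unsigned long long)entry);
    printf("shadow entry: %#llx\n", (unsigned long long)shadow_pgd_entry(entry));
    return 0;
}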
868 +diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
869 +index 8b4de22d6429..f1c8ac468292 100644
870 +--- a/arch/x86/include/asm/pgtable_types.h
871 ++++ b/arch/x86/include/asm/pgtable_types.h
872 +@@ -119,7 +119,7 @@
873 + #define _PAGE_DEVMAP (_AT(pteval_t, 0))
874 + #endif
875 +
876 +-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
877 ++#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
878 +
879 + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
880 + _PAGE_ACCESSED | _PAGE_DIRTY)
881 +@@ -137,6 +137,33 @@
882 + _PAGE_SOFT_DIRTY)
883 + #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
884 +
885 ++/* The ASID is the lower 12 bits of CR3 */
886 ++#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
887 ++
888 ++/* Mask for all the PCID-related bits in CR3: */
889 ++#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
890 ++#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
891 ++
892 ++#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
893 ++/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
894 ++#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
895 ++
896 ++#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
897 ++#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
898 ++#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
899 ++#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
900 ++#else
901 ++#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
902 ++/*
903 ++ * PCIDs are unsupported on 32-bit and none of these bits can be
904 ++ * set in CR3:
905 ++ */
906 ++#define X86_CR3_PCID_KERN_FLUSH (0)
907 ++#define X86_CR3_PCID_USER_FLUSH (0)
908 ++#define X86_CR3_PCID_KERN_NOFLUSH (0)
909 ++#define X86_CR3_PCID_USER_NOFLUSH (0)
910 ++#endif
911 ++
912 + /*
913 + * The cache modes defined here are used to translate between pure SW usage
914 + * and the HW defined cache mode bits and/or PAT entries.
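Together with KAISER_SHADOW_PGD_OFFSET (0x1000, from asm/kaiser.h earlier in this patch), the constants above describe the CR3 layout KAISER relies on: the low 12 bits carry the PCID/ASID, bit 12 selects the shadow PGD page, and bit 63 is the NOFLUSH hint. The standalone sketch below (illustrative only, hypothetical base address) shows the masking that _SWITCH_TO_KERNEL_CR3 performs to turn a user CR3 value back into the kernel one.

#include <stdio.h>
#include <stdint.h>

#define X86_CR3_PCID_ASID_MASK   ((1ULL << 12) - 1)  /* low 12 bits: PCID/ASID */
#define KAISER_SHADOW_PGD_OFFSET 0x1000ULL           /* bit 12: shadow pgd page */
#define X86_CR3_PCID_NOFLUSH     (1ULL << 63)        /* bit 63: keep old TLB entries */
#define X86_CR3_PCID_ASID_USER   0x80ULL

int main(void)
{
    /* Hypothetical user-mode CR3: shadow pgd page, user ASID, NOFLUSH set. */
    uint64_t user_cr3 = 0x12344000ULL | KAISER_SHADOW_PGD_OFFSET |
                        X86_CR3_PCID_ASID_USER | X86_CR3_PCID_NOFLUSH;
    /* What _SWITCH_TO_KERNEL_CR3 computes: clear the ASID bits and the
     * shadow offset; the mask leaves NOFLUSH alone and the macro re-sets
     * it via "bts $63" when PCID is in use. */
    uint64_t kernel_cr3 = user_cr3 &
                          ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);

    printf("user   CR3: %#llx\n", (unsigned long long)user_cr3);
    printf("kernel CR3: %#llx\n", (unsigned long long)kernel_cr3);
    return 0;
}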
915 +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
916 +index 83db0eae9979..8cb52ee3ade6 100644
917 +--- a/arch/x86/include/asm/processor.h
918 ++++ b/arch/x86/include/asm/processor.h
919 +@@ -308,7 +308,7 @@ struct tss_struct {
920 +
921 + } ____cacheline_aligned;
922 +
923 +-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
924 ++DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
925 +
926 + #ifdef CONFIG_X86_32
927 + DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
928 +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
929 +index 7d2ea6b1f7d9..94146f665a3c 100644
930 +--- a/arch/x86/include/asm/tlbflush.h
931 ++++ b/arch/x86/include/asm/tlbflush.h
932 +@@ -132,6 +132,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
933 + cr4_set_bits(mask);
934 + }
935 +
936 ++/*
937 ++ * Declare a couple of kaiser interfaces here for convenience,
938 ++ * to avoid the need for asm/kaiser.h in unexpected places.
939 ++ */
940 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
941 ++extern int kaiser_enabled;
942 ++extern void kaiser_setup_pcid(void);
943 ++extern void kaiser_flush_tlb_on_return_to_user(void);
944 ++#else
945 ++#define kaiser_enabled 0
946 ++static inline void kaiser_setup_pcid(void)
947 ++{
948 ++}
949 ++static inline void kaiser_flush_tlb_on_return_to_user(void)
950 ++{
951 ++}
952 ++#endif
953 ++
954 + static inline void __native_flush_tlb(void)
955 + {
956 + /*
957 +@@ -140,6 +158,8 @@ static inline void __native_flush_tlb(void)
958 + * back:
959 + */
960 + preempt_disable();
961 ++ if (kaiser_enabled)
962 ++ kaiser_flush_tlb_on_return_to_user();
963 + native_write_cr3(native_read_cr3());
964 + preempt_enable();
965 + }
966 +@@ -149,20 +169,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
967 + unsigned long cr4;
968 +
969 + cr4 = this_cpu_read(cpu_tlbstate.cr4);
970 +- /* clear PGE */
971 +- native_write_cr4(cr4 & ~X86_CR4_PGE);
972 +- /* write old PGE again and flush TLBs */
973 +- native_write_cr4(cr4);
974 ++ if (cr4 & X86_CR4_PGE) {
975 ++ /* clear PGE and flush TLB of all entries */
976 ++ native_write_cr4(cr4 & ~X86_CR4_PGE);
977 ++ /* restore PGE as it was before */
978 ++ native_write_cr4(cr4);
979 ++ } else {
980 ++ /* do it with cr3, letting kaiser flush user PCID */
981 ++ __native_flush_tlb();
982 ++ }
983 + }
984 +
985 + static inline void __native_flush_tlb_global(void)
986 + {
987 + unsigned long flags;
988 +
989 +- if (static_cpu_has(X86_FEATURE_INVPCID)) {
990 ++ if (this_cpu_has(X86_FEATURE_INVPCID)) {
991 + /*
992 + * Using INVPCID is considerably faster than a pair of writes
993 + * to CR4 sandwiched inside an IRQ flag save/restore.
994 ++ *
995 ++ * Note, this works with CR4.PCIDE=0 or 1.
996 + */
997 + invpcid_flush_all();
998 + return;
999 +@@ -174,24 +201,45 @@ static inline void __native_flush_tlb_global(void)
1000 + * be called from deep inside debugging code.)
1001 + */
1002 + raw_local_irq_save(flags);
1003 +-
1004 + __native_flush_tlb_global_irq_disabled();
1005 +-
1006 + raw_local_irq_restore(flags);
1007 + }
1008 +
1009 + static inline void __native_flush_tlb_single(unsigned long addr)
1010 + {
1011 +- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
1012 ++ /*
1013 ++ * SIMICS #GP's if you run INVPCID with type 2/3
1014 ++ * and X86_CR4_PCIDE clear. Shame!
1015 ++ *
1016 ++ * The ASIDs used below are hard-coded. But, we must not
1017 ++ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
1018 ++ * invlpg in the case we are called early.
1019 ++ */
1020 ++
1021 ++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
1022 ++ if (kaiser_enabled)
1023 ++ kaiser_flush_tlb_on_return_to_user();
1024 ++ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
1025 ++ return;
1026 ++ }
1027 ++ /* Flush the address out of both PCIDs. */
1028 ++ /*
1029 ++ * An optimization here might be to determine addresses
1030 ++ * that are only kernel-mapped and only flush the kernel
1031 ++ * ASID. But, userspace flushes are probably much more
1032 ++ * important performance-wise.
1033 ++ *
1034 ++ * Make sure to do only a single invpcid when KAISER is
1035 ++ * disabled and we have only a single ASID.
1036 ++ */
1037 ++ if (kaiser_enabled)
1038 ++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
1039 ++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
1040 + }
1041 +
1042 + static inline void __flush_tlb_all(void)
1043 + {
1044 +- if (boot_cpu_has(X86_FEATURE_PGE))
1045 +- __flush_tlb_global();
1046 +- else
1047 +- __flush_tlb();
1048 +-
1049 ++ __flush_tlb_global();
1050 + /*
1051 + * Note: if we somehow had PCID but not PGE, then this wouldn't work --
1052 + * we'd end up flushing kernel translations for the current ASID but
1053 +diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
1054 +index 567de50a4c2a..6768d1321016 100644
1055 +--- a/arch/x86/include/uapi/asm/processor-flags.h
1056 ++++ b/arch/x86/include/uapi/asm/processor-flags.h
1057 +@@ -77,7 +77,8 @@
1058 + #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
1059 + #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
1060 + #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
1061 +-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
1062 ++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
1063 ++#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
1064 +
1065 + /*
1066 + * Intel CPU features in CR4
1067 +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1068 +index 91588be529b9..918e44772b04 100644
1069 +--- a/arch/x86/kernel/cpu/common.c
1070 ++++ b/arch/x86/kernel/cpu/common.c
1071 +@@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = {
1072 +
1073 + static const struct cpu_dev *this_cpu = &default_cpu;
1074 +
1075 +-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
1076 ++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
1077 + #ifdef CONFIG_X86_64
1078 + /*
1079 + * We need valid kernel segments for data and code in long mode too
1080 +@@ -327,8 +327,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
1081 + static void setup_pcid(struct cpuinfo_x86 *c)
1082 + {
1083 + if (cpu_has(c, X86_FEATURE_PCID)) {
1084 +- if (cpu_has(c, X86_FEATURE_PGE)) {
1085 ++ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
1086 + cr4_set_bits(X86_CR4_PCIDE);
1087 ++ /*
1088 ++ * INVPCID has two "groups" of types:
1089 ++ * 1/2: Invalidate an individual address
1090 ++ * 3/4: Invalidate all contexts
1091 ++ *
1092 ++ * 1/2 take a PCID, but 3/4 do not. So, 3/4
1093 ++ * ignore the PCID argument in the descriptor.
1094 ++ * But, we have to be careful not to call 1/2
1095 ++ * with an actual non-zero PCID in them before
1096 ++ * we do the above cr4_set_bits().
1097 ++ */
1098 ++ if (cpu_has(c, X86_FEATURE_INVPCID))
1099 ++ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
1100 + } else {
1101 + /*
1102 + * flush_tlb_all(), as currently implemented, won't
1103 +@@ -341,6 +354,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
1104 + clear_cpu_cap(c, X86_FEATURE_PCID);
1105 + }
1106 + }
1107 ++ kaiser_setup_pcid();
1108 + }
1109 +
1110 + /*
1111 +@@ -1365,7 +1379,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
1112 + [DEBUG_STACK - 1] = DEBUG_STKSZ
1113 + };
1114 +
1115 +-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
1116 ++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
1117 + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
1118 +
1119 + /* May not be marked __init: used by software suspend */
1120 +@@ -1523,6 +1537,14 @@ void cpu_init(void)
1121 + * try to read it.
1122 + */
1123 + cr4_init_shadow();
1124 ++ if (!kaiser_enabled) {
1125 ++ /*
1126 ++ * secondary_startup_64() deferred setting PGE in cr4:
1127 ++ * probe_page_size_mask() sets it on the boot cpu,
1128 ++ * but it needs to be set on each secondary cpu.
1129 ++ */
1130 ++ cr4_set_bits(X86_CR4_PGE);
1131 ++ }
1132 +
1133 + /*
1134 + * Load microcode on this cpu if a valid microcode is available.
1135 +diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
1136 +index 04f89caef9c4..e33b38541be3 100644
1137 +--- a/arch/x86/kernel/espfix_64.c
1138 ++++ b/arch/x86/kernel/espfix_64.c
1139 +@@ -41,6 +41,7 @@
1140 + #include <asm/pgalloc.h>
1141 + #include <asm/setup.h>
1142 + #include <asm/espfix.h>
1143 ++#include <asm/kaiser.h>
1144 +
1145 + /*
1146 + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
1147 +@@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
1148 + /* Install the espfix pud into the kernel page directory */
1149 + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
1150 + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
1151 ++ /*
1152 ++ * Just copy the top-level PGD that is mapping the espfix
1153 ++ * area to ensure it is mapped into the shadow user page
1154 ++ * tables.
1155 ++ */
1156 ++ if (kaiser_enabled) {
1157 ++ set_pgd(native_get_shadow_pgd(pgd_p),
1158 ++ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
1159 ++ }
1160 +
1161 + /* Randomize the locations */
1162 + init_espfix_random();
1163 +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
1164 +index b4421cc191b0..67cd7c1b99da 100644
1165 +--- a/arch/x86/kernel/head_64.S
1166 ++++ b/arch/x86/kernel/head_64.S
1167 +@@ -190,8 +190,8 @@ ENTRY(secondary_startup_64)
1168 + movq $(init_level4_pgt - __START_KERNEL_map), %rax
1169 + 1:
1170 +
1171 +- /* Enable PAE mode and PGE */
1172 +- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
1173 ++ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
1174 ++ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
1175 + movq %rcx, %cr4
1176 +
1177 + /* Setup early boot stage 4 level pagetables. */
1178 +@@ -405,6 +405,27 @@ GLOBAL(early_recursion_flag)
1179 + .balign PAGE_SIZE; \
1180 + GLOBAL(name)
1181 +
1182 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1183 ++/*
1184 ++ * Each PGD needs to be 8k long and 8k aligned. We do not
1185 ++ * ever go out to userspace with these, so we do not
1186 ++ * strictly *need* the second page, but this allows us to
1187 ++ * have a single set_pgd() implementation that does not
1188 ++ * need to worry about whether it has 4k or 8k to work
1189 ++ * with.
1190 ++ *
1191 ++ * This ensures PGDs are 8k long:
1192 ++ */
1193 ++#define KAISER_USER_PGD_FILL 512
1194 ++/* This ensures they are 8k-aligned: */
1195 ++#define NEXT_PGD_PAGE(name) \
1196 ++ .balign 2 * PAGE_SIZE; \
1197 ++GLOBAL(name)
1198 ++#else
1199 ++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
1200 ++#define KAISER_USER_PGD_FILL 0
1201 ++#endif
1202 ++
1203 + /* Automate the creation of 1 to 1 mapping pmd entries */
1204 + #define PMDS(START, PERM, COUNT) \
1205 + i = 0 ; \
1206 +@@ -414,9 +435,10 @@ GLOBAL(name)
1207 + .endr
1208 +
1209 + __INITDATA
1210 +-NEXT_PAGE(early_level4_pgt)
1211 ++NEXT_PGD_PAGE(early_level4_pgt)
1212 + .fill 511,8,0
1213 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
1214 ++ .fill KAISER_USER_PGD_FILL,8,0
1215 +
1216 + NEXT_PAGE(early_dynamic_pgts)
1217 + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
1218 +@@ -424,16 +446,18 @@ NEXT_PAGE(early_dynamic_pgts)
1219 + .data
1220 +
1221 + #ifndef CONFIG_XEN
1222 +-NEXT_PAGE(init_level4_pgt)
1223 ++NEXT_PGD_PAGE(init_level4_pgt)
1224 + .fill 512,8,0
1225 ++ .fill KAISER_USER_PGD_FILL,8,0
1226 + #else
1227 +-NEXT_PAGE(init_level4_pgt)
1228 ++NEXT_PGD_PAGE(init_level4_pgt)
1229 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1230 + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
1231 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1232 + .org init_level4_pgt + L4_START_KERNEL*8, 0
1233 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
1234 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
1235 ++ .fill KAISER_USER_PGD_FILL,8,0
1236 +
1237 + NEXT_PAGE(level3_ident_pgt)
1238 + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1239 +@@ -444,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt)
1240 + */
1241 + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
1242 + #endif
1243 ++ .fill KAISER_USER_PGD_FILL,8,0
1244 +
1245 + NEXT_PAGE(level3_kernel_pgt)
1246 + .fill L3_START_KERNEL,8,0
1247 +diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
1248 +index 1423ab1b0312..f480b38a03c3 100644
1249 +--- a/arch/x86/kernel/irqinit.c
1250 ++++ b/arch/x86/kernel/irqinit.c
1251 +@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
1252 + .flags = IRQF_NO_THREAD,
1253 + };
1254 +
1255 +-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
1256 ++DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
1257 + [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
1258 + };
1259 +
1260 +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
1261 +index 5f70014ca602..8bc68cfc0d33 100644
1262 +--- a/arch/x86/kernel/ldt.c
1263 ++++ b/arch/x86/kernel/ldt.c
1264 +@@ -16,6 +16,7 @@
1265 + #include <linux/slab.h>
1266 + #include <linux/vmalloc.h>
1267 + #include <linux/uaccess.h>
1268 ++#include <linux/kaiser.h>
1269 +
1270 + #include <asm/ldt.h>
1271 + #include <asm/desc.h>
1272 +@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
1273 + set_ldt(pc->ldt->entries, pc->ldt->size);
1274 + }
1275 +
1276 ++static void __free_ldt_struct(struct ldt_struct *ldt)
1277 ++{
1278 ++ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
1279 ++ vfree(ldt->entries);
1280 ++ else
1281 ++ free_page((unsigned long)ldt->entries);
1282 ++ kfree(ldt);
1283 ++}
1284 ++
1285 + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
1286 + static struct ldt_struct *alloc_ldt_struct(int size)
1287 + {
1288 + struct ldt_struct *new_ldt;
1289 + int alloc_size;
1290 ++ int ret;
1291 +
1292 + if (size > LDT_ENTRIES)
1293 + return NULL;
1294 +@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
1295 + return NULL;
1296 + }
1297 +
1298 ++ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
1299 ++ __PAGE_KERNEL);
1300 + new_ldt->size = size;
1301 ++ if (ret) {
1302 ++ __free_ldt_struct(new_ldt);
1303 ++ return NULL;
1304 ++ }
1305 + return new_ldt;
1306 + }
1307 +
1308 +@@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
1309 + if (likely(!ldt))
1310 + return;
1311 +
1312 ++ kaiser_remove_mapping((unsigned long)ldt->entries,
1313 ++ ldt->size * LDT_ENTRY_SIZE);
1314 + paravirt_free_ldt(ldt->entries, ldt->size);
1315 +- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
1316 +- vfree(ldt->entries);
1317 +- else
1318 +- free_page((unsigned long)ldt->entries);
1319 +- kfree(ldt);
1320 ++ __free_ldt_struct(ldt);
1321 + }
1322 +
1323 + /*
1324 +diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
1325 +index bb3840cedb4f..ee43b36075c7 100644
1326 +--- a/arch/x86/kernel/paravirt_patch_64.c
1327 ++++ b/arch/x86/kernel/paravirt_patch_64.c
1328 +@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
1329 + DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
1330 + DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
1331 + DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
1332 +-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
1333 + DEF_NATIVE(pv_cpu_ops, clts, "clts");
1334 + DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
1335 +
1336 +@@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
1337 + PATCH_SITE(pv_mmu_ops, read_cr3);
1338 + PATCH_SITE(pv_mmu_ops, write_cr3);
1339 + PATCH_SITE(pv_cpu_ops, clts);
1340 +- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
1341 + PATCH_SITE(pv_cpu_ops, wbinvd);
1342 + #if defined(CONFIG_PARAVIRT_SPINLOCKS)
1343 + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
1344 +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
1345 +index 8e10e72bf6ee..a55b32007785 100644
1346 +--- a/arch/x86/kernel/process.c
1347 ++++ b/arch/x86/kernel/process.c
1348 +@@ -41,7 +41,7 @@
1349 + * section. Since TSS's are completely CPU-local, we want them
1350 + * on exact cacheline boundaries, to eliminate cacheline ping-pong.
1351 + */
1352 +-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
1353 ++__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
1354 + .x86_tss = {
1355 + .sp0 = TOP_OF_INIT_STACK,
1356 + #ifdef CONFIG_X86_32
1357 +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
1358 +index feaab07fa124..6b55012d02a3 100644
1359 +--- a/arch/x86/kernel/setup.c
1360 ++++ b/arch/x86/kernel/setup.c
1361 +@@ -114,6 +114,7 @@
1362 + #include <asm/microcode.h>
1363 + #include <asm/mmu_context.h>
1364 + #include <asm/kaslr.h>
1365 ++#include <asm/kaiser.h>
1366 +
1367 + /*
1368 + * max_low_pfn_mapped: highest direct mapped pfn under 4GB
1369 +@@ -1019,6 +1020,12 @@ void __init setup_arch(char **cmdline_p)
1370 + */
1371 + init_hypervisor_platform();
1372 +
1373 ++ /*
1374 ++ * This needs to happen right after XENPV is set on xen and
1375 ++ * kaiser_enabled is checked below in cleanup_highmap().
1376 ++ */
1377 ++ kaiser_check_boottime_disable();
1378 ++
1379 + x86_init.resources.probe_roms();
1380 +
1381 + /* after parse_early_param, so could debug it */
1382 +diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
1383 +index 1c113db9ed57..2bb5ee464df3 100644
1384 +--- a/arch/x86/kernel/tracepoint.c
1385 ++++ b/arch/x86/kernel/tracepoint.c
1386 +@@ -9,10 +9,12 @@
1387 + #include <linux/atomic.h>
1388 +
1389 + atomic_t trace_idt_ctr = ATOMIC_INIT(0);
1390 ++__aligned(PAGE_SIZE)
1391 + struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
1392 + (unsigned long) trace_idt_table };
1393 +
1394 + /* No need to be aligned, but done to keep all IDTs defined the same way. */
1395 ++__aligned(PAGE_SIZE)
1396 + gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
1397 +
1398 + static int trace_irq_vector_refcount;
1399 +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
1400 +index 7e28e6c877d9..73304b1a03cc 100644
1401 +--- a/arch/x86/kvm/x86.c
1402 ++++ b/arch/x86/kvm/x86.c
1403 +@@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1404 + return 1;
1405 +
1406 + /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
1407 +- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1408 ++ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
1409 ++ !is_long_mode(vcpu))
1410 + return 1;
1411 + }
1412 +
1413 +diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
1414 +index 5cc78bf57232..3261abb21ef4 100644
1415 +--- a/arch/x86/lib/cmdline.c
1416 ++++ b/arch/x86/lib/cmdline.c
1417 +@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
1418 + return 0; /* Buffer overrun */
1419 + }
1420 +
1421 ++/*
1422 ++ * Find a non-boolean option (i.e. option=argument). In accordance with
1423 ++ * standard Linux practice, if this option is repeated, this returns the
1424 ++ * last instance on the command line.
1425 ++ *
1426 ++ * @cmdline: the cmdline string
1427 ++ * @max_cmdline_size: the maximum size of cmdline
1428 ++ * @option: option string to look for
1429 ++ * @buffer: memory buffer to return the option argument
1430 ++ * @bufsize: size of the supplied memory buffer
1431 ++ *
1432 ++ * Returns the length of the argument (regardless of whether it was
1433 ++ * truncated to fit in the buffer), or -1 if the option was not found.
1434 ++ */
1435 ++static int
1436 ++__cmdline_find_option(const char *cmdline, int max_cmdline_size,
1437 ++ const char *option, char *buffer, int bufsize)
1438 ++{
1439 ++ char c;
1440 ++ int pos = 0, len = -1;
1441 ++ const char *opptr = NULL;
1442 ++ char *bufptr = buffer;
1443 ++ enum {
1444 ++ st_wordstart = 0, /* Start of word/after whitespace */
1445 ++ st_wordcmp, /* Comparing this word */
1446 ++ st_wordskip, /* Miscompare, skip */
1447 ++ st_bufcpy, /* Copying this to buffer */
1448 ++ } state = st_wordstart;
1449 ++
1450 ++ if (!cmdline)
1451 ++ return -1; /* No command line */
1452 ++
1453 ++ /*
1454 ++ * This 'pos' check ensures we do not overrun
1455 ++ * a non-NULL-terminated 'cmdline'
1456 ++ */
1457 ++ while (pos++ < max_cmdline_size) {
1458 ++ c = *(char *)cmdline++;
1459 ++ if (!c)
1460 ++ break;
1461 ++
1462 ++ switch (state) {
1463 ++ case st_wordstart:
1464 ++ if (myisspace(c))
1465 ++ break;
1466 ++
1467 ++ state = st_wordcmp;
1468 ++ opptr = option;
1469 ++ /* fall through */
1470 ++
1471 ++ case st_wordcmp:
1472 ++ if ((c == '=') && !*opptr) {
1473 ++ /*
1474 ++ * We matched all the way to the end of the
1475 ++ * option we were looking for, prepare to
1476 ++ * copy the argument.
1477 ++ */
1478 ++ len = 0;
1479 ++ bufptr = buffer;
1480 ++ state = st_bufcpy;
1481 ++ break;
1482 ++ } else if (c == *opptr++) {
1483 ++ /*
1484 ++ * We are currently matching, so continue
1485 ++ * to the next character on the cmdline.
1486 ++ */
1487 ++ break;
1488 ++ }
1489 ++ state = st_wordskip;
1490 ++ /* fall through */
1491 ++
1492 ++ case st_wordskip:
1493 ++ if (myisspace(c))
1494 ++ state = st_wordstart;
1495 ++ break;
1496 ++
1497 ++ case st_bufcpy:
1498 ++ if (myisspace(c)) {
1499 ++ state = st_wordstart;
1500 ++ } else {
1501 ++ /*
1502 ++ * Increment len, but don't overrun the
1503 ++ * supplied buffer and leave room for the
1504 ++ * NULL terminator.
1505 ++ */
1506 ++ if (++len < bufsize)
1507 ++ *bufptr++ = c;
1508 ++ }
1509 ++ break;
1510 ++ }
1511 ++ }
1512 ++
1513 ++ if (bufsize)
1514 ++ *bufptr = '\0';
1515 ++
1516 ++ return len;
1517 ++}
1518 ++
1519 + int cmdline_find_option_bool(const char *cmdline, const char *option)
1520 + {
1521 + return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
1522 + }
1523 ++
1524 ++int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
1525 ++ int bufsize)
1526 ++{
1527 ++ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
1528 ++ buffer, bufsize);
1529 ++}
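A hedged usage sketch of the new helper (not part of the patch): the caller supplies a buffer, gets back the argument length or -1 when the option is absent, and the buffer comes back NUL-terminated. The function below is invented for illustration; it assumes boot_command_line as declared in <linux/init.h>.

#include <linux/init.h>     /* boot_command_line (assumed declaration site) */
#include <linux/string.h>
#include <asm/cmdline.h>

/* Invented example: report how "pti=" was set on the boot command line.
 * Returns 1 for "on", -1 for "off", 0 otherwise. */
static int __init example_parse_pti(void)
{
    char arg[8];
    int len = cmdline_find_option(boot_command_line, "pti",
                                  arg, sizeof(arg));

    if (len < 0)
        return 0;                 /* no "pti=" given */
    if (!strncmp(arg, "off", 3))
        return -1;
    if (!strncmp(arg, "on", 2))
        return 1;
    return 0;                     /* "auto" or anything else */
}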
1530 +diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
1531 +index 96d2b847e09e..c548b46100cb 100644
1532 +--- a/arch/x86/mm/Makefile
1533 ++++ b/arch/x86/mm/Makefile
1534 +@@ -37,5 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
1535 +
1536 + obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
1537 + obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
1538 +-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
1539 +-
1540 ++obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
1541 ++obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o
1542 +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
1543 +index 0381638168d1..1e779bca4f3e 100644
1544 +--- a/arch/x86/mm/init.c
1545 ++++ b/arch/x86/mm/init.c
1546 +@@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void)
1547 + cr4_set_bits_and_update_boot(X86_CR4_PSE);
1548 +
1549 + /* Enable PGE if available */
1550 +- if (boot_cpu_has(X86_FEATURE_PGE)) {
1551 ++ if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) {
1552 + cr4_set_bits_and_update_boot(X86_CR4_PGE);
1553 + __supported_pte_mask |= _PAGE_GLOBAL;
1554 + } else
1555 +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
1556 +index 3e27ded6ac65..7df8e3a79dc0 100644
1557 +--- a/arch/x86/mm/init_64.c
1558 ++++ b/arch/x86/mm/init_64.c
1559 +@@ -324,6 +324,16 @@ void __init cleanup_highmap(void)
1560 + continue;
1561 + if (vaddr < (unsigned long) _text || vaddr > end)
1562 + set_pmd(pmd, __pmd(0));
1563 ++ else if (kaiser_enabled) {
1564 ++ /*
1565 ++ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
1566 ++ * clear that now. This is not important, so long as
1567 ++ * CR4.PGE remains clear, but it removes an anomaly.
1568 ++ * Physical mapping setup below avoids _PAGE_GLOBAL
1569 ++ * by use of massage_pgprot() inside pfn_pte() etc.
1570 ++ */
1571 ++ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
1572 ++ }
1573 + }
1574 + }
1575 +
1576 +diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
1577 +new file mode 100644
1578 +index 000000000000..d8376b4ad9f0
1579 +--- /dev/null
1580 ++++ b/arch/x86/mm/kaiser.c
1581 +@@ -0,0 +1,455 @@
1582 ++#include <linux/bug.h>
1583 ++#include <linux/kernel.h>
1584 ++#include <linux/errno.h>
1585 ++#include <linux/string.h>
1586 ++#include <linux/types.h>
1587 ++#include <linux/bug.h>
1588 ++#include <linux/init.h>
1589 ++#include <linux/interrupt.h>
1590 ++#include <linux/spinlock.h>
1591 ++#include <linux/mm.h>
1592 ++#include <linux/uaccess.h>
1593 ++
1594 ++#undef pr_fmt
1595 ++#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
1596 ++
1597 ++#include <asm/kaiser.h>
1598 ++#include <asm/tlbflush.h> /* to verify its kaiser declarations */
1599 ++#include <asm/pgtable.h>
1600 ++#include <asm/pgalloc.h>
1601 ++#include <asm/desc.h>
1602 ++#include <asm/cmdline.h>
1603 ++
1604 ++int kaiser_enabled __read_mostly = 1;
1605 ++EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
1606 ++
1607 ++__visible
1608 ++DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
1609 ++
1610 ++/*
1611 ++ * These can have bit 63 set, so we can not just use a plain "or"
1612 ++ * instruction to get their value or'd into CR3. It would take
1613 ++ * another register. So, we use a memory reference to these instead.
1614 ++ *
1615 ++ * This is also handy because systems that do not support PCIDs
1616 ++ * just end up or'ing a 0 into their CR3, which does no harm.
1617 ++ */
1618 ++DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
1619 ++
1620 ++/*
1621 ++ * At runtime, the only things we map are some things for CPU
1622 ++ * hotplug, and stacks for new processes. No two CPUs will ever
1623 ++ * be populating the same addresses, so we only need to ensure
1624 ++ * that we protect between two CPUs trying to allocate and
1625 ++ * populate the same page table page.
1626 ++ *
1627 ++ * Only take this lock when doing a set_p[4um]d(), but it is not
1628 ++ * needed for doing a set_pte(). We assume that only the *owner*
1629 ++ * of a given allocation will be doing this for _their_
1630 ++ * allocation.
1631 ++ *
1632 ++ * This ensures that once a system has been running for a while
1633 ++ * and there have been stacks all over and these page tables
1634 ++ * are fully populated, there will be no further acquisitions of
1635 ++ * this lock.
1636 ++ */
1637 ++static DEFINE_SPINLOCK(shadow_table_allocation_lock);
1638 ++
1639 ++/*
1640 ++ * Returns -1 on error.
1641 ++ */
1642 ++static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
1643 ++{
1644 ++ pgd_t *pgd;
1645 ++ pud_t *pud;
1646 ++ pmd_t *pmd;
1647 ++ pte_t *pte;
1648 ++
1649 ++ pgd = pgd_offset_k(vaddr);
1650 ++ /*
1651 ++ * We made all the kernel PGDs present in kaiser_init().
1652 ++ * We expect them to stay that way.
1653 ++ */
1654 ++ BUG_ON(pgd_none(*pgd));
1655 ++ /*
1656 ++ * PGDs are either 512GB or 128TB on all x86_64
1657 ++ * configurations. We don't handle these.
1658 ++ */
1659 ++ BUG_ON(pgd_large(*pgd));
1660 ++
1661 ++ pud = pud_offset(pgd, vaddr);
1662 ++ if (pud_none(*pud)) {
1663 ++ WARN_ON_ONCE(1);
1664 ++ return -1;
1665 ++ }
1666 ++
1667 ++ if (pud_large(*pud))
1668 ++ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
1669 ++
1670 ++ pmd = pmd_offset(pud, vaddr);
1671 ++ if (pmd_none(*pmd)) {
1672 ++ WARN_ON_ONCE(1);
1673 ++ return -1;
1674 ++ }
1675 ++
1676 ++ if (pmd_large(*pmd))
1677 ++ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
1678 ++
1679 ++ pte = pte_offset_kernel(pmd, vaddr);
1680 ++ if (pte_none(*pte)) {
1681 ++ WARN_ON_ONCE(1);
1682 ++ return -1;
1683 ++ }
1684 ++
1685 ++ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
1686 ++}
1687 ++
1688 ++/*
1689 ++ * This is a relatively normal page table walk, except that it
1690 ++ * also tries to allocate page table pages along the way.
1691 ++ *
1692 ++ * Returns a pointer to a PTE on success, or NULL on failure.
1693 ++ */
1694 ++static pte_t *kaiser_pagetable_walk(unsigned long address)
1695 ++{
1696 ++ pmd_t *pmd;
1697 ++ pud_t *pud;
1698 ++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
1699 ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
1700 ++
1701 ++ if (pgd_none(*pgd)) {
1702 ++ WARN_ONCE(1, "All shadow pgds should have been populated");
1703 ++ return NULL;
1704 ++ }
1705 ++ BUILD_BUG_ON(pgd_large(*pgd) != 0);
1706 ++
1707 ++ pud = pud_offset(pgd, address);
1708 ++ /* The shadow page tables do not use large mappings: */
1709 ++ if (pud_large(*pud)) {
1710 ++ WARN_ON(1);
1711 ++ return NULL;
1712 ++ }
1713 ++ if (pud_none(*pud)) {
1714 ++ unsigned long new_pmd_page = __get_free_page(gfp);
1715 ++ if (!new_pmd_page)
1716 ++ return NULL;
1717 ++ spin_lock(&shadow_table_allocation_lock);
1718 ++ if (pud_none(*pud)) {
1719 ++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
1720 ++ __inc_zone_page_state(virt_to_page((void *)
1721 ++ new_pmd_page), NR_KAISERTABLE);
1722 ++ } else
1723 ++ free_page(new_pmd_page);
1724 ++ spin_unlock(&shadow_table_allocation_lock);
1725 ++ }
1726 ++
1727 ++ pmd = pmd_offset(pud, address);
1728 ++ /* The shadow page tables do not use large mappings: */
1729 ++ if (pmd_large(*pmd)) {
1730 ++ WARN_ON(1);
1731 ++ return NULL;
1732 ++ }
1733 ++ if (pmd_none(*pmd)) {
1734 ++ unsigned long new_pte_page = __get_free_page(gfp);
1735 ++ if (!new_pte_page)
1736 ++ return NULL;
1737 ++ spin_lock(&shadow_table_allocation_lock);
1738 ++ if (pmd_none(*pmd)) {
1739 ++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
1740 ++ __inc_zone_page_state(virt_to_page((void *)
1741 ++ new_pte_page), NR_KAISERTABLE);
1742 ++ } else
1743 ++ free_page(new_pte_page);
1744 ++ spin_unlock(&shadow_table_allocation_lock);
1745 ++ }
1746 ++
1747 ++ return pte_offset_kernel(pmd, address);
1748 ++}
1749 ++
1750 ++static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
1751 ++ unsigned long flags)
1752 ++{
1753 ++ int ret = 0;
1754 ++ pte_t *pte;
1755 ++ unsigned long start_addr = (unsigned long)__start_addr;
1756 ++ unsigned long address = start_addr & PAGE_MASK;
1757 ++ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
1758 ++ unsigned long target_address;
1759 ++
1760 ++ /*
1761 ++ * It is convenient for callers to pass in __PAGE_KERNEL etc,
1762 ++ * and there is no actual harm from setting _PAGE_GLOBAL, so
1763 ++ * long as CR4.PGE is not set. But it is nonetheless troubling
1764 ++ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
1765 ++ * requires that not to be #defined to 0): so mask it off here.
1766 ++ */
1767 ++ flags &= ~_PAGE_GLOBAL;
1768 ++
1769 ++ for (; address < end_addr; address += PAGE_SIZE) {
1770 ++ target_address = get_pa_from_mapping(address);
1771 ++ if (target_address == -1) {
1772 ++ ret = -EIO;
1773 ++ break;
1774 ++ }
1775 ++ pte = kaiser_pagetable_walk(address);
1776 ++ if (!pte) {
1777 ++ ret = -ENOMEM;
1778 ++ break;
1779 ++ }
1780 ++ if (pte_none(*pte)) {
1781 ++ set_pte(pte, __pte(flags | target_address));
1782 ++ } else {
1783 ++ pte_t tmp;
1784 ++ set_pte(&tmp, __pte(flags | target_address));
1785 ++ WARN_ON_ONCE(!pte_same(*pte, tmp));
1786 ++ }
1787 ++ }
1788 ++ return ret;
1789 ++}
1790 ++
1791 ++static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
1792 ++{
1793 ++ unsigned long size = end - start;
1794 ++
1795 ++ return kaiser_add_user_map(start, size, flags);
1796 ++}
1797 ++
1798 ++/*
1799 ++ * Ensure that the top level of the (shadow) page tables are
1800 ++ * entirely populated. This ensures that all processes that get
1801 ++ * forked have the same entries. This way, we do not have to
1802 ++ * ever go set up new entries in older processes.
1803 ++ *
1804 ++ * Note: we never free these, so there are no updates to them
1805 ++ * after this.
1806 ++ */
1807 ++static void __init kaiser_init_all_pgds(void)
1808 ++{
1809 ++ pgd_t *pgd;
1810 ++ int i = 0;
1811 ++
1812 ++ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
1813 ++ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
1814 ++ pgd_t new_pgd;
1815 ++ pud_t *pud = pud_alloc_one(&init_mm,
1816 ++ PAGE_OFFSET + i * PGDIR_SIZE);
1817 ++ if (!pud) {
1818 ++ WARN_ON(1);
1819 ++ break;
1820 ++ }
1821 ++ inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
1822 ++ new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
1823 ++ /*
1824 ++ * Make sure not to stomp on some other pgd entry.
1825 ++ */
1826 ++ if (!pgd_none(pgd[i])) {
1827 ++ WARN_ON(1);
1828 ++ continue;
1829 ++ }
1830 ++ set_pgd(pgd + i, new_pgd);
1831 ++ }
1832 ++}
1833 ++
1834 ++#define kaiser_add_user_map_early(start, size, flags) do { \
1835 ++ int __ret = kaiser_add_user_map(start, size, flags); \
1836 ++ WARN_ON(__ret); \
1837 ++} while (0)
1838 ++
1839 ++#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
1840 ++ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
1841 ++ WARN_ON(__ret); \
1842 ++} while (0)
1843 ++
1844 ++void __init kaiser_check_boottime_disable(void)
1845 ++{
1846 ++ bool enable = true;
1847 ++ char arg[5];
1848 ++ int ret;
1849 ++
1850 ++ if (boot_cpu_has(X86_FEATURE_XENPV))
1851 ++ goto silent_disable;
1852 ++
1853 ++ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
1854 ++ if (ret > 0) {
1855 ++ if (!strncmp(arg, "on", 2))
1856 ++ goto enable;
1857 ++
1858 ++ if (!strncmp(arg, "off", 3))
1859 ++ goto disable;
1860 ++
1861 ++ if (!strncmp(arg, "auto", 4))
1862 ++ goto skip;
1863 ++ }
1864 ++
1865 ++ if (cmdline_find_option_bool(boot_command_line, "nopti"))
1866 ++ goto disable;
1867 ++
1868 ++skip:
1869 ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1870 ++ goto disable;
1871 ++
1872 ++enable:
1873 ++ if (enable)
1874 ++ setup_force_cpu_cap(X86_FEATURE_KAISER);
1875 ++
1876 ++ return;
1877 ++
1878 ++disable:
1879 ++ pr_info("disabled\n");
1880 ++
1881 ++silent_disable:
1882 ++ kaiser_enabled = 0;
1883 ++ setup_clear_cpu_cap(X86_FEATURE_KAISER);
1884 ++}
1885 ++
1886 ++/*
1887 ++ * If anything in here fails, we will likely die on one of the
1888 ++ * first kernel->user transitions and init will die. But, we
1889 ++ * will have most of the kernel up by then and should be able to
1890 ++ * get a clean warning out of it. If we BUG_ON() here, we run
1891 ++ * the risk of crashing before we have good console output.
1892 ++ */
1893 ++void __init kaiser_init(void)
1894 ++{
1895 ++ int cpu;
1896 ++
1897 ++ if (!kaiser_enabled)
1898 ++ return;
1899 ++
1900 ++ kaiser_init_all_pgds();
1901 ++
1902 ++ for_each_possible_cpu(cpu) {
1903 ++ void *percpu_vaddr = __per_cpu_user_mapped_start +
1904 ++ per_cpu_offset(cpu);
1905 ++ unsigned long percpu_sz = __per_cpu_user_mapped_end -
1906 ++ __per_cpu_user_mapped_start;
1907 ++ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
1908 ++ __PAGE_KERNEL);
1909 ++ }
1910 ++
1911 ++ /*
1912 ++ * Map the entry/exit text section, which is needed at
1913 ++ * switches from user to and from kernel.
1914 ++ */
1915 ++ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
1916 ++ __PAGE_KERNEL_RX);
1917 ++
1918 ++#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
1919 ++ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
1920 ++ __irqentry_text_end,
1921 ++ __PAGE_KERNEL_RX);
1922 ++#endif
1923 ++ kaiser_add_user_map_early((void *)idt_descr.address,
1924 ++ sizeof(gate_desc) * NR_VECTORS,
1925 ++ __PAGE_KERNEL_RO);
1926 ++#ifdef CONFIG_TRACING
1927 ++ kaiser_add_user_map_early(&trace_idt_descr,
1928 ++ sizeof(trace_idt_descr),
1929 ++ __PAGE_KERNEL);
1930 ++ kaiser_add_user_map_early(&trace_idt_table,
1931 ++ sizeof(gate_desc) * NR_VECTORS,
1932 ++ __PAGE_KERNEL);
1933 ++#endif
1934 ++ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
1935 ++ __PAGE_KERNEL);
1936 ++ kaiser_add_user_map_early(&debug_idt_table,
1937 ++ sizeof(gate_desc) * NR_VECTORS,
1938 ++ __PAGE_KERNEL);
1939 ++
1940 ++ pr_info("enabled\n");
1941 ++}
1942 ++
1943 ++/* Add a mapping to the shadow mapping, and synchronize the mappings */
1944 ++int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
1945 ++{
1946 ++ if (!kaiser_enabled)
1947 ++ return 0;
1948 ++ return kaiser_add_user_map((const void *)addr, size, flags);
1949 ++}
1950 ++
1951 ++void kaiser_remove_mapping(unsigned long start, unsigned long size)
1952 ++{
1953 ++ extern void unmap_pud_range_nofree(pgd_t *pgd,
1954 ++ unsigned long start, unsigned long end);
1955 ++ unsigned long end = start + size;
1956 ++ unsigned long addr, next;
1957 ++ pgd_t *pgd;
1958 ++
1959 ++ if (!kaiser_enabled)
1960 ++ return;
1961 ++ pgd = native_get_shadow_pgd(pgd_offset_k(start));
1962 ++ for (addr = start; addr < end; pgd++, addr = next) {
1963 ++ next = pgd_addr_end(addr, end);
1964 ++ unmap_pud_range_nofree(pgd, addr, next);
1965 ++ }
1966 ++}
1967 ++
1968 ++/*
1969 ++ * Page table pages are page-aligned. The lower half of the top
1970 ++ * level is used for userspace and the top half for the kernel.
1971 ++ * This returns true for user pages that need to get copied into
1972 ++ * both the user and kernel copies of the page tables, and false
1973 ++ * for kernel pages that should only be in the kernel copy.
1974 ++ */
1975 ++static inline bool is_userspace_pgd(pgd_t *pgdp)
1976 ++{
1977 ++ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
1978 ++}
1979 ++
1980 ++pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
1981 ++{
1982 ++ if (!kaiser_enabled)
1983 ++ return pgd;
1984 ++ /*
1985 ++ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
1986 ++ * skip cases like kexec and EFI which make temporary low mappings.
1987 ++ */
1988 ++ if (pgd.pgd & _PAGE_USER) {
1989 ++ if (is_userspace_pgd(pgdp)) {
1990 ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
1991 ++ /*
1992 ++ * Even if the entry is *mapping* userspace, ensure
1993 ++ * that userspace can not use it. This way, if we
1994 ++ * get out to userspace running on the kernel CR3,
1995 ++ * userspace will crash instead of running.
1996 ++ */
1997 ++ if (__supported_pte_mask & _PAGE_NX)
1998 ++ pgd.pgd |= _PAGE_NX;
1999 ++ }
2000 ++ } else if (!pgd.pgd) {
2001 ++ /*
2002 ++ * pgd_clear() cannot check _PAGE_USER, and is even used to
2003 ++ * clear corrupted pgd entries: so just rely on cases like
2004 ++ * kexec and EFI never to be using pgd_clear().
2005 ++ */
2006 ++ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
2007 ++ is_userspace_pgd(pgdp))
2008 ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
2009 ++ }
2010 ++ return pgd;
2011 ++}
2012 ++
2013 ++void kaiser_setup_pcid(void)
2014 ++{
2015 ++ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
2016 ++
2017 ++ if (this_cpu_has(X86_FEATURE_PCID))
2018 ++ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
2019 ++ /*
2020 ++ * These variables are used by the entry/exit
2021 ++ * code to change PCID and pgd and TLB flushing.
2022 ++ */
2023 ++ this_cpu_write(x86_cr3_pcid_user, user_cr3);
2024 ++}
2025 ++
2026 ++/*
2027 ++ * Make a note that this cpu will need to flush USER tlb on return to user.
2028 ++ * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
2029 ++ */
2030 ++void kaiser_flush_tlb_on_return_to_user(void)
2031 ++{
2032 ++ if (this_cpu_has(X86_FEATURE_PCID))
2033 ++ this_cpu_write(x86_cr3_pcid_user,
2034 ++ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
2035 ++}
2036 ++EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
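
kaiser_check_boottime_disable() above fixes the precedence of the boot options: Xen PV guests are silently disabled, an explicit pti=on / pti=off / pti=auto argument is honoured first (so nopti is only consulted when no pti= argument was given at all), and the auto/default path switches isolation off on AMD CPUs. A user-space restatement of that decision, where pti_should_enable() is a hypothetical helper written only for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    enum pti_choice { PTI_NONE, PTI_FORCE_ON, PTI_FORCE_OFF, PTI_AUTO };

    /* Same precedence as kaiser_check_boottime_disable(), as plain logic. */
    static bool pti_should_enable(enum pti_choice arg, bool nopti,
                                  bool xen_pv, bool cpu_is_amd)
    {
        if (xen_pv)
            return false;            /* silently disabled under Xen PV */
        if (arg == PTI_FORCE_ON)
            return true;             /* pti=on wins over everything else */
        if (arg == PTI_FORCE_OFF)
            return false;            /* pti=off */
        if (arg == PTI_NONE && nopti)
            return false;            /* nopti only matters without pti= */
        /* pti=auto or no option at all: on unless the CPU is AMD */
        return !cpu_is_amd;
    }

    int main(void)
    {
        printf("%d\n", pti_should_enable(PTI_NONE, false, false, false));    /* 1 */
        printf("%d\n", pti_should_enable(PTI_AUTO, true,  false, false));    /* 1: pti=auto beats nopti */
        printf("%d\n", pti_should_enable(PTI_NONE, false, false, true));     /* 0: AMD defaults to off */
        printf("%d\n", pti_should_enable(PTI_FORCE_ON, true, false, true));  /* 1: forced on */
        return 0;
    }

(The AMD default here reflects this 4.9.75 backport; it is a property of the patch above, not of every kernel version.)
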
2037 +diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
2038 +index aed206475aa7..319183d93602 100644
2039 +--- a/arch/x86/mm/kaslr.c
2040 ++++ b/arch/x86/mm/kaslr.c
2041 +@@ -189,6 +189,6 @@ void __meminit init_trampoline(void)
2042 + *pud_tramp = *pud;
2043 + }
2044 +
2045 +- set_pgd(&trampoline_pgd_entry,
2046 +- __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
2047 ++ /* Avoid set_pgd(), in case it's complicated by CONFIG_PAGE_TABLE_ISOLATION */
2048 ++ trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp));
2049 + }
2050 +diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
2051 +index e3353c97d086..73dcb0e18c1b 100644
2052 +--- a/arch/x86/mm/pageattr.c
2053 ++++ b/arch/x86/mm/pageattr.c
2054 +@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
2055 + #define CPA_FLUSHTLB 1
2056 + #define CPA_ARRAY 2
2057 + #define CPA_PAGES_ARRAY 4
2058 ++#define CPA_FREE_PAGETABLES 8
2059 +
2060 + #ifdef CONFIG_PROC_FS
2061 + static unsigned long direct_pages_count[PG_LEVEL_NUM];
2062 +@@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
2063 + return 0;
2064 + }
2065 +
2066 +-static bool try_to_free_pte_page(pte_t *pte)
2067 ++static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
2068 + {
2069 + int i;
2070 +
2071 ++ if (!(cpa->flags & CPA_FREE_PAGETABLES))
2072 ++ return false;
2073 ++
2074 + for (i = 0; i < PTRS_PER_PTE; i++)
2075 + if (!pte_none(pte[i]))
2076 + return false;
2077 +@@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte)
2078 + return true;
2079 + }
2080 +
2081 +-static bool try_to_free_pmd_page(pmd_t *pmd)
2082 ++static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
2083 + {
2084 + int i;
2085 +
2086 ++ if (!(cpa->flags & CPA_FREE_PAGETABLES))
2087 ++ return false;
2088 ++
2089 + for (i = 0; i < PTRS_PER_PMD; i++)
2090 + if (!pmd_none(pmd[i]))
2091 + return false;
2092 +@@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd)
2093 + return true;
2094 + }
2095 +
2096 +-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
2097 ++static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
2098 ++ unsigned long start,
2099 ++ unsigned long end)
2100 + {
2101 + pte_t *pte = pte_offset_kernel(pmd, start);
2102 +
2103 +@@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
2104 + pte++;
2105 + }
2106 +
2107 +- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
2108 ++ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
2109 + pmd_clear(pmd);
2110 + return true;
2111 + }
2112 + return false;
2113 + }
2114 +
2115 +-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
2116 ++static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
2117 + unsigned long start, unsigned long end)
2118 + {
2119 +- if (unmap_pte_range(pmd, start, end))
2120 +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
2121 ++ if (unmap_pte_range(cpa, pmd, start, end))
2122 ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
2123 + pud_clear(pud);
2124 + }
2125 +
2126 +-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2127 ++static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
2128 ++ unsigned long start, unsigned long end)
2129 + {
2130 + pmd_t *pmd = pmd_offset(pud, start);
2131 +
2132 +@@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2133 + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
2134 + unsigned long pre_end = min_t(unsigned long, end, next_page);
2135 +
2136 +- __unmap_pmd_range(pud, pmd, start, pre_end);
2137 ++ __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
2138 +
2139 + start = pre_end;
2140 + pmd++;
2141 +@@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2142 + if (pmd_large(*pmd))
2143 + pmd_clear(pmd);
2144 + else
2145 +- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
2146 ++ __unmap_pmd_range(cpa, pud, pmd,
2147 ++ start, start + PMD_SIZE);
2148 +
2149 + start += PMD_SIZE;
2150 + pmd++;
2151 +@@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2152 + * 4K leftovers?
2153 + */
2154 + if (start < end)
2155 +- return __unmap_pmd_range(pud, pmd, start, end);
2156 ++ return __unmap_pmd_range(cpa, pud, pmd, start, end);
2157 +
2158 + /*
2159 + * Try again to free the PMD page if haven't succeeded above.
2160 + */
2161 + if (!pud_none(*pud))
2162 +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
2163 ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
2164 + pud_clear(pud);
2165 + }
2166 +
2167 +-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2168 ++static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
2169 ++ unsigned long start,
2170 ++ unsigned long end)
2171 + {
2172 + pud_t *pud = pud_offset(pgd, start);
2173 +
2174 +@@ -834,7 +847,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2175 + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
2176 + unsigned long pre_end = min_t(unsigned long, end, next_page);
2177 +
2178 +- unmap_pmd_range(pud, start, pre_end);
2179 ++ unmap_pmd_range(cpa, pud, start, pre_end);
2180 +
2181 + start = pre_end;
2182 + pud++;
2183 +@@ -848,7 +861,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2184 + if (pud_large(*pud))
2185 + pud_clear(pud);
2186 + else
2187 +- unmap_pmd_range(pud, start, start + PUD_SIZE);
2188 ++ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
2189 +
2190 + start += PUD_SIZE;
2191 + pud++;
2192 +@@ -858,7 +871,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2193 + * 2M leftovers?
2194 + */
2195 + if (start < end)
2196 +- unmap_pmd_range(pud, start, end);
2197 ++ unmap_pmd_range(cpa, pud, start, end);
2198 +
2199 + /*
2200 + * No need to try to free the PUD page because we'll free it in
2201 +@@ -866,6 +879,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2202 + */
2203 + }
2204 +
2205 ++static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2206 ++{
2207 ++ struct cpa_data cpa = {
2208 ++ .flags = CPA_FREE_PAGETABLES,
2209 ++ };
2210 ++
2211 ++ __unmap_pud_range(&cpa, pgd, start, end);
2212 ++}
2213 ++
2214 ++void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
2215 ++{
2216 ++ struct cpa_data cpa = {
2217 ++ .flags = 0,
2218 ++ };
2219 ++
2220 ++ __unmap_pud_range(&cpa, pgd, start, end);
2221 ++}
2222 ++
2223 + static int alloc_pte_page(pmd_t *pmd)
2224 + {
2225 + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
2226 +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
2227 +index 3feec5af4e67..5aaec8effc5f 100644
2228 +--- a/arch/x86/mm/pgtable.c
2229 ++++ b/arch/x86/mm/pgtable.c
2230 +@@ -344,14 +344,22 @@ static inline void _pgd_free(pgd_t *pgd)
2231 + kmem_cache_free(pgd_cache, pgd);
2232 + }
2233 + #else
2234 ++
2235 ++/*
2236 ++ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
2237 ++ * both 8k in size and 8k-aligned. That lets us just flip bit 12
2238 ++ * in a pointer to swap between the two 4k halves.
2239 ++ */
2240 ++#define PGD_ALLOCATION_ORDER kaiser_enabled
2241 ++
2242 + static inline pgd_t *_pgd_alloc(void)
2243 + {
2244 +- return (pgd_t *)__get_free_page(PGALLOC_GFP);
2245 ++ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
2246 + }
2247 +
2248 + static inline void _pgd_free(pgd_t *pgd)
2249 + {
2250 +- free_page((unsigned long)pgd);
2251 ++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
2252 + }
2253 + #endif /* CONFIG_X86_PAE */
2254 +
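
The PGD_ALLOCATION_ORDER comment above is the central trick of the pgd layout: with kaiser_enabled, _pgd_alloc() returns an order-1, 8 KiB-aligned block, so the kernel pgd page and its shadow (user) twin differ only in bit 12 of their address, which is the flip that native_get_shadow_pgd() performs elsewhere in this patch. A minimal user-space illustration of the address arithmetic (illustrative only, not kernel code):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        /* order-1, 8 KiB-aligned block standing in for the kernel pgd page */
        uint64_t *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);
        if (!pgd)
            return 1;

        /* the shadow half sits one page above: setting bit 12 is enough */
        uint64_t *shadow = (uint64_t *)((uintptr_t)pgd | PAGE_SIZE);

        printf("kernel pgd page: %p\n", (void *)pgd);
        printf("shadow pgd page: %p (= pgd + PAGE_SIZE)\n", (void *)shadow);

        free(pgd);
        return 0;
    }

Because the allocation is 8 KiB-aligned, bit 12 of the kernel half is always zero, so OR-ing in PAGE_SIZE and adding PAGE_SIZE are the same operation; when kaiser_enabled is 0 the order drops back to 0 and only the usual single 4 KiB page is allocated.
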
2255 +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
2256 +index 53b72fb4e781..41205de487e7 100644
2257 +--- a/arch/x86/mm/tlb.c
2258 ++++ b/arch/x86/mm/tlb.c
2259 +@@ -6,13 +6,14 @@
2260 + #include <linux/interrupt.h>
2261 + #include <linux/export.h>
2262 + #include <linux/cpu.h>
2263 ++#include <linux/debugfs.h>
2264 +
2265 + #include <asm/tlbflush.h>
2266 + #include <asm/mmu_context.h>
2267 + #include <asm/cache.h>
2268 + #include <asm/apic.h>
2269 + #include <asm/uv/uv.h>
2270 +-#include <linux/debugfs.h>
2271 ++#include <asm/kaiser.h>
2272 +
2273 + /*
2274 + * TLB flushing, formerly SMP-only
2275 +@@ -34,6 +35,36 @@ struct flush_tlb_info {
2276 + unsigned long flush_end;
2277 + };
2278 +
2279 ++static void load_new_mm_cr3(pgd_t *pgdir)
2280 ++{
2281 ++ unsigned long new_mm_cr3 = __pa(pgdir);
2282 ++
2283 ++ if (kaiser_enabled) {
2284 ++ /*
2285 ++ * We reuse the same PCID for different tasks, so we must
2286 ++ * flush all the entries for the PCID out when we change tasks.
2287 ++ * Flush KERN below, flush USER when returning to userspace in
2288 ++ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
2289 ++ *
2290 ++ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
2291 ++ * do it here, but can only be used if X86_FEATURE_INVPCID is
2292 ++ * available - and many machines support pcid without invpcid.
2293 ++ *
2294 ++ * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
2295 ++ * would be needed in the write_cr3() below - if PCIDs enabled.
2296 ++ */
2297 ++ BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
2298 ++ kaiser_flush_tlb_on_return_to_user();
2299 ++ }
2300 ++
2301 ++ /*
2302 ++ * Caution: many callers of this function expect
2303 ++ * that load_cr3() is serializing and orders TLB
2304 ++ * fills with respect to the mm_cpumask writes.
2305 ++ */
2306 ++ write_cr3(new_mm_cr3);
2307 ++}
2308 ++
2309 + /*
2310 + * We cannot call mmdrop() because we are in interrupt context,
2311 + * instead update mm->cpu_vm_mask.
2312 +@@ -45,7 +76,7 @@ void leave_mm(int cpu)
2313 + BUG();
2314 + if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
2315 + cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
2316 +- load_cr3(swapper_pg_dir);
2317 ++ load_new_mm_cr3(swapper_pg_dir);
2318 + /*
2319 + * This gets called in the idle path where RCU
2320 + * functions differently. Tracing normally
2321 +@@ -120,7 +151,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
2322 + * ordering guarantee we need.
2323 + *
2324 + */
2325 +- load_cr3(next->pgd);
2326 ++ load_new_mm_cr3(next->pgd);
2327 +
2328 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
2329 +
2330 +@@ -167,7 +198,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
2331 + * As above, load_cr3() is serializing and orders TLB
2332 + * fills with respect to the mm_cpumask write.
2333 + */
2334 +- load_cr3(next->pgd);
2335 ++ load_new_mm_cr3(next->pgd);
2336 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
2337 + load_mm_cr4(next);
2338 + load_mm_ldt(next);
2339 +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
2340 +index dc81e5287ebf..2e6000a4eb2c 100644
2341 +--- a/include/asm-generic/vmlinux.lds.h
2342 ++++ b/include/asm-generic/vmlinux.lds.h
2343 +@@ -778,7 +778,14 @@
2344 + */
2345 + #define PERCPU_INPUT(cacheline) \
2346 + VMLINUX_SYMBOL(__per_cpu_start) = .; \
2347 ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
2348 + *(.data..percpu..first) \
2349 ++ . = ALIGN(cacheline); \
2350 ++ *(.data..percpu..user_mapped) \
2351 ++ *(.data..percpu..user_mapped..shared_aligned) \
2352 ++ . = ALIGN(PAGE_SIZE); \
2353 ++ *(.data..percpu..user_mapped..page_aligned) \
2354 ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
2355 + . = ALIGN(PAGE_SIZE); \
2356 + *(.data..percpu..page_aligned) \
2357 + . = ALIGN(cacheline); \
2358 +diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
2359 +new file mode 100644
2360 +index 000000000000..58c55b1589d0
2361 +--- /dev/null
2362 ++++ b/include/linux/kaiser.h
2363 +@@ -0,0 +1,52 @@
2364 ++#ifndef _LINUX_KAISER_H
2365 ++#define _LINUX_KAISER_H
2366 ++
2367 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2368 ++#include <asm/kaiser.h>
2369 ++
2370 ++static inline int kaiser_map_thread_stack(void *stack)
2371 ++{
2372 ++ /*
2373 ++ * Map that page of kernel stack on which we enter from user context.
2374 ++ */
2375 ++ return kaiser_add_mapping((unsigned long)stack +
2376 ++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
2377 ++}
2378 ++
2379 ++static inline void kaiser_unmap_thread_stack(void *stack)
2380 ++{
2381 ++ /*
2382 ++ * Note: may be called even when kaiser_map_thread_stack() failed.
2383 ++ */
2384 ++ kaiser_remove_mapping((unsigned long)stack +
2385 ++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
2386 ++}
2387 ++#else
2388 ++
2389 ++/*
2390 ++ * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
2391 ++ * includes architectures that support KAISER, but have it disabled.
2392 ++ */
2393 ++
2394 ++static inline void kaiser_init(void)
2395 ++{
2396 ++}
2397 ++static inline int kaiser_add_mapping(unsigned long addr,
2398 ++ unsigned long size, unsigned long flags)
2399 ++{
2400 ++ return 0;
2401 ++}
2402 ++static inline void kaiser_remove_mapping(unsigned long start,
2403 ++ unsigned long size)
2404 ++{
2405 ++}
2406 ++static inline int kaiser_map_thread_stack(void *stack)
2407 ++{
2408 ++ return 0;
2409 ++}
2410 ++static inline void kaiser_unmap_thread_stack(void *stack)
2411 ++{
2412 ++}
2413 ++
2414 ++#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
2415 ++#endif /* _LINUX_KAISER_H */
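
kaiser_map_thread_stack() above maps only the topmost page of a task's kernel stack into the shadow tables, the page on which the entry code lands when coming in from user context; kernel/fork.c later in this patch calls it from dup_task_struct() and undoes it in free_thread_stack(). A small user-space sketch of the arithmetic (made-up stack address; THREAD_SIZE assumed to be 16 KiB, the usual x86_64 value without KASAN):

    #include <stdio.h>

    #define PAGE_SIZE   4096ULL
    #define THREAD_SIZE (4 * PAGE_SIZE)     /* assumed 16 KiB kernel stack */

    int main(void)
    {
        unsigned long long stack  = 0xffff880012340000ULL;  /* hypothetical base */
        unsigned long long mapped = stack + THREAD_SIZE - PAGE_SIZE;

        printf("stack base : %#llx\n", stack);
        printf("mapped page: %#llx (one PAGE_SIZE slice)\n", mapped);
        return 0;
    }
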
2416 +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
2417 +index fff21a82780c..490f5a83f947 100644
2418 +--- a/include/linux/mmzone.h
2419 ++++ b/include/linux/mmzone.h
2420 +@@ -124,8 +124,9 @@ enum zone_stat_item {
2421 + NR_SLAB_UNRECLAIMABLE,
2422 + NR_PAGETABLE, /* used for pagetables */
2423 + NR_KERNEL_STACK_KB, /* measured in KiB */
2424 +- /* Second 128 byte cacheline */
2425 ++ NR_KAISERTABLE,
2426 + NR_BOUNCE,
2427 ++ /* Second 128 byte cacheline */
2428 + #if IS_ENABLED(CONFIG_ZSMALLOC)
2429 + NR_ZSPAGES, /* allocated in zsmalloc */
2430 + #endif
2431 +diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
2432 +index 8f16299ca068..8902f23bb770 100644
2433 +--- a/include/linux/percpu-defs.h
2434 ++++ b/include/linux/percpu-defs.h
2435 +@@ -35,6 +35,12 @@
2436 +
2437 + #endif
2438 +
2439 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2440 ++#define USER_MAPPED_SECTION "..user_mapped"
2441 ++#else
2442 ++#define USER_MAPPED_SECTION ""
2443 ++#endif
2444 ++
2445 + /*
2446 + * Base implementations of per-CPU variable declarations and definitions, where
2447 + * the section in which the variable is to be placed is provided by the
2448 +@@ -115,6 +121,12 @@
2449 + #define DEFINE_PER_CPU(type, name) \
2450 + DEFINE_PER_CPU_SECTION(type, name, "")
2451 +
2452 ++#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
2453 ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
2454 ++
2455 ++#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
2456 ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
2457 ++
2458 + /*
2459 + * Declaration/definition used for per-CPU variables that must come first in
2460 + * the set of variables.
2461 +@@ -144,6 +156,14 @@
2462 + DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
2463 + ____cacheline_aligned_in_smp
2464 +
2465 ++#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
2466 ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
2467 ++ ____cacheline_aligned_in_smp
2468 ++
2469 ++#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
2470 ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
2471 ++ ____cacheline_aligned_in_smp
2472 ++
2473 + #define DECLARE_PER_CPU_ALIGNED(type, name) \
2474 + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
2475 + ____cacheline_aligned
2476 +@@ -162,11 +182,21 @@
2477 + #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
2478 + DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
2479 + __aligned(PAGE_SIZE)
2480 ++/*
2481 ++ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
2482 ++ */
2483 ++#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
2484 ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
2485 ++ __aligned(PAGE_SIZE)
2486 ++
2487 ++#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
2488 ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
2489 ++ __aligned(PAGE_SIZE)
2490 +
2491 + /*
2492 + * Declaration/definition used for per-CPU variables that must be read mostly.
2493 + */
2494 +-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
     ++#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
2495 ++#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
2496 + DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
2497 +
2498 + #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
2499 +diff --git a/init/main.c b/init/main.c
2500 +index 25bac88bc66e..99f026565608 100644
2501 +--- a/init/main.c
2502 ++++ b/init/main.c
2503 +@@ -80,6 +80,7 @@
2504 + #include <linux/integrity.h>
2505 + #include <linux/proc_ns.h>
2506 + #include <linux/io.h>
2507 ++#include <linux/kaiser.h>
2508 +
2509 + #include <asm/io.h>
2510 + #include <asm/bugs.h>
2511 +@@ -473,6 +474,7 @@ static void __init mm_init(void)
2512 + pgtable_init();
2513 + vmalloc_init();
2514 + ioremap_huge_init();
2515 ++ kaiser_init();
2516 + }
2517 +
2518 + asmlinkage __visible void __init start_kernel(void)
2519 +diff --git a/kernel/fork.c b/kernel/fork.c
2520 +index 9321b1ad3335..70e10cb49be0 100644
2521 +--- a/kernel/fork.c
2522 ++++ b/kernel/fork.c
2523 +@@ -58,6 +58,7 @@
2524 + #include <linux/tsacct_kern.h>
2525 + #include <linux/cn_proc.h>
2526 + #include <linux/freezer.h>
2527 ++#include <linux/kaiser.h>
2528 + #include <linux/delayacct.h>
2529 + #include <linux/taskstats_kern.h>
2530 + #include <linux/random.h>
2531 +@@ -213,6 +214,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
2532 +
2533 + static inline void free_thread_stack(struct task_struct *tsk)
2534 + {
2535 ++ kaiser_unmap_thread_stack(tsk->stack);
2536 + #ifdef CONFIG_VMAP_STACK
2537 + if (task_stack_vm_area(tsk)) {
2538 + unsigned long flags;
2539 +@@ -495,6 +497,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
2540 + * functions again.
2541 + */
2542 + tsk->stack = stack;
2543 ++
2544 ++ err = kaiser_map_thread_stack(tsk->stack);
2545 ++ if (err)
2546 ++ goto free_stack;
2547 + #ifdef CONFIG_VMAP_STACK
2548 + tsk->stack_vm_area = stack_vm_area;
2549 + #endif
2550 +diff --git a/mm/vmstat.c b/mm/vmstat.c
2551 +index 604f26a4f696..6a088df04b29 100644
2552 +--- a/mm/vmstat.c
2553 ++++ b/mm/vmstat.c
2554 +@@ -932,6 +932,7 @@ const char * const vmstat_text[] = {
2555 + "nr_slab_unreclaimable",
2556 + "nr_page_table_pages",
2557 + "nr_kernel_stack",
2558 ++ "nr_overhead",
2559 + "nr_bounce",
2560 + #if IS_ENABLED(CONFIG_ZSMALLOC)
2561 + "nr_zspages",
2562 +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
2563 +index 97f9cac98348..e86a34fd5484 100644
2564 +--- a/net/ipv4/tcp_bbr.c
2565 ++++ b/net/ipv4/tcp_bbr.c
2566 +@@ -843,6 +843,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk)
2567 + */
2568 + static u32 bbr_undo_cwnd(struct sock *sk)
2569 + {
2570 ++ struct bbr *bbr = inet_csk_ca(sk);
2571 ++
2572 ++ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
2573 ++ bbr->full_bw_cnt = 0;
2574 ++ bbr_reset_lt_bw_sampling(sk);
2575 + return tcp_sk(sk)->snd_cwnd;
2576 + }
2577 +
2578 +diff --git a/security/Kconfig b/security/Kconfig
2579 +index 118f4549404e..32f36b40e9f0 100644
2580 +--- a/security/Kconfig
2581 ++++ b/security/Kconfig
2582 +@@ -31,6 +31,16 @@ config SECURITY
2583 +
2584 + If you are unsure how to answer this question, answer N.
2585 +
2586 ++config PAGE_TABLE_ISOLATION
2587 ++ bool "Remove the kernel mapping in user mode"
2588 ++ default y
2589 ++ depends on X86_64 && SMP
2590 ++ help
2591 ++ This enforces a strict kernel and user space isolation, in order
2592 ++ to close hardware side channels on kernel address information.
2593 ++
2594 ++ If you are unsure how to answer this question, answer Y.
2595 ++
2596 + config SECURITYFS
2597 + bool "Enable the securityfs filesystem"
2598 + help
2599 +diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
2600 +index a39629206864..f79669a38c0c 100644
2601 +--- a/tools/arch/x86/include/asm/cpufeatures.h
2602 ++++ b/tools/arch/x86/include/asm/cpufeatures.h
2603 +@@ -197,6 +197,9 @@
2604 + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
2605 + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
2606 +
2607 ++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
2608 ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
2609 ++
2610 + /* Virtualization flags: Linux defined, word 8 */
2611 + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
2612 + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */