Gentoo Archives: gentoo-commits

From: Alice Ferrazzi <alicef@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.4 commit in: /
Date: Fri, 05 Jan 2018 15:05:55
Message-Id: 1515164738.c7d7705101af05e259b3b84ffc59a60ff2b96142.alicef@gentoo
1 commit: c7d7705101af05e259b3b84ffc59a60ff2b96142
2 Author: Alice Ferrazzi <alicef <AT> gentoo <DOT> org>
3 AuthorDate: Fri Jan 5 15:05:38 2018 +0000
4 Commit: Alice Ferrazzi <alicef <AT> gentoo <DOT> org>
5 CommitDate: Fri Jan 5 15:05:38 2018 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=c7d77051
7
8 Linux kernel 4.4.110
9
10 0000_README | 4 +
11 1109_linux-4.4.110.patch | 2814 ++++++++++++++++++++++++++++++++++++++++++++++
12 2 files changed, 2818 insertions(+)
13
14 diff --git a/0000_README b/0000_README
15 index 3be106c..46149de 100644
16 --- a/0000_README
17 +++ b/0000_README
18 @@ -479,6 +479,10 @@ Patch: 1108_linux-4.4.109.patch
19 From: http://www.kernel.org
20 Desc: Linux 4.4.109
21
22 +Patch: 1109_linux-4.4.110.patch
23 +From: http://www.kernel.org
24 +Desc: Linux 4.4.110
25 +
26 Patch: 1500_XATTR_USER_PREFIX.patch
27 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
28 Desc: Support for namespace user.pax.* on tmpfs.
29
30 diff --git a/1109_linux-4.4.110.patch b/1109_linux-4.4.110.patch
31 new file mode 100644
32 index 0000000..1c226ed
33 --- /dev/null
34 +++ b/1109_linux-4.4.110.patch
35 @@ -0,0 +1,2814 @@
36 +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
37 +index b4a83a490212..5977c4d71356 100644
38 +--- a/Documentation/kernel-parameters.txt
39 ++++ b/Documentation/kernel-parameters.txt
40 +@@ -2523,6 +2523,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
41 +
42 + nojitter [IA-64] Disables jitter checking for ITC timers.
43 +
44 ++ nopti [X86-64] Disable KAISER isolation of kernel from user.
45 ++
46 + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
47 +
48 + no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
49 +@@ -3054,6 +3056,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
50 + pt. [PARIDE]
51 + See Documentation/blockdev/paride.txt.
52 +
53 ++ pti= [X86_64]
54 ++ Control KAISER user/kernel address space isolation:
55 ++ on - enable
56 ++ off - disable
57 ++ auto - default setting
58 ++
59 + pty.legacy_count=
60 + [KNL] Number of legacy pty's. Overwrites compiled-in
61 + default number.
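
The nopti and pti= options documented in the hunk above are parsed from the kernel command line very early during boot. As a rough illustration only (the real check is kaiser_check_boottime_disable(), which later hunks in this patch declare in asm/kaiser.h and call from setup_arch(); its body lives in arch/x86/mm/kaiser.c, outside the portion of the patch shown on this page), the parsing could be done with the cmdline_find_option() helpers whose prototypes this patch adds to arch/x86/include/asm/cmdline.h:

/*
 * Illustrative sketch only -- not code from this patch.  The option names
 * and the X86_FEATURE_KAISER clearing follow what the patch describes;
 * details of the real helper differ.
 */
static void __init example_check_pti_cmdline(void)
{
	char arg[5];
	int ret;

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0 && !strncmp(arg, "off", 3))
		goto disable;

	return;			/* "on", "auto" or absent: leave KAISER enabled */

disable:
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}
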
62 +diff --git a/Makefile b/Makefile
63 +index 5d67056e24dd..b028c106535b 100644
64 +--- a/Makefile
65 ++++ b/Makefile
66 +@@ -1,6 +1,6 @@
67 + VERSION = 4
68 + PATCHLEVEL = 4
69 +-SUBLEVEL = 109
70 ++SUBLEVEL = 110
71 + EXTRAVERSION =
72 + NAME = Blurry Fish Butt
73 +
74 +diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
75 +index 3783dc3e10b3..4abb284a5b9c 100644
76 +--- a/arch/x86/boot/compressed/misc.h
77 ++++ b/arch/x86/boot/compressed/misc.h
78 +@@ -9,6 +9,7 @@
79 + */
80 + #undef CONFIG_PARAVIRT
81 + #undef CONFIG_PARAVIRT_SPINLOCKS
82 ++#undef CONFIG_PAGE_TABLE_ISOLATION
83 + #undef CONFIG_KASAN
84 +
85 + #include <linux/linkage.h>
86 +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
87 +index cc0f2f5da19b..952b23b5d4e9 100644
88 +--- a/arch/x86/entry/entry_64.S
89 ++++ b/arch/x86/entry/entry_64.S
90 +@@ -35,6 +35,7 @@
91 + #include <asm/asm.h>
92 + #include <asm/smap.h>
93 + #include <asm/pgtable_types.h>
94 ++#include <asm/kaiser.h>
95 + #include <linux/err.h>
96 +
97 + /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
98 +@@ -135,6 +136,7 @@ ENTRY(entry_SYSCALL_64)
99 + * it is too small to ever cause noticeable irq latency.
100 + */
101 + SWAPGS_UNSAFE_STACK
102 ++ SWITCH_KERNEL_CR3_NO_STACK
103 + /*
104 + * A hypervisor implementation might want to use a label
105 + * after the swapgs, so that it can do the swapgs
106 +@@ -207,9 +209,17 @@ entry_SYSCALL_64_fastpath:
107 + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
108 + jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
109 +
110 +- RESTORE_C_REGS_EXCEPT_RCX_R11
111 + movq RIP(%rsp), %rcx
112 + movq EFLAGS(%rsp), %r11
113 ++ RESTORE_C_REGS_EXCEPT_RCX_R11
114 ++ /*
115 ++ * This opens a window where we have a user CR3, but are
116 ++ * running in the kernel. This makes using the CS
117 ++ * register useless for telling whether or not we need to
118 ++ * switch CR3 in NMIs. Normal interrupts are OK because
119 ++ * they are off here.
120 ++ */
121 ++ SWITCH_USER_CR3
122 + movq RSP(%rsp), %rsp
123 + /*
124 + * 64-bit SYSRET restores rip from rcx,
125 +@@ -347,10 +357,26 @@ GLOBAL(int_ret_from_sys_call)
126 + syscall_return_via_sysret:
127 + /* rcx and r11 are already restored (see code above) */
128 + RESTORE_C_REGS_EXCEPT_RCX_R11
129 ++ /*
130 ++ * This opens a window where we have a user CR3, but are
131 ++ * running in the kernel. This makes using the CS
132 ++ * register useless for telling whether or not we need to
133 ++ * switch CR3 in NMIs. Normal interrupts are OK because
134 ++ * they are off here.
135 ++ */
136 ++ SWITCH_USER_CR3
137 + movq RSP(%rsp), %rsp
138 + USERGS_SYSRET64
139 +
140 + opportunistic_sysret_failed:
141 ++ /*
142 ++ * This opens a window where we have a user CR3, but are
143 ++ * running in the kernel. This makes using the CS
144 ++ * register useless for telling whether or not we need to
145 ++ * switch CR3 in NMIs. Normal interrupts are OK because
146 ++ * they are off here.
147 ++ */
148 ++ SWITCH_USER_CR3
149 + SWAPGS
150 + jmp restore_c_regs_and_iret
151 + END(entry_SYSCALL_64)
152 +@@ -509,6 +535,7 @@ END(irq_entries_start)
153 + * tracking that we're in kernel mode.
154 + */
155 + SWAPGS
156 ++ SWITCH_KERNEL_CR3
157 +
158 + /*
159 + * We need to tell lockdep that IRQs are off. We can't do this until
160 +@@ -568,6 +595,7 @@ GLOBAL(retint_user)
161 + mov %rsp,%rdi
162 + call prepare_exit_to_usermode
163 + TRACE_IRQS_IRETQ
164 ++ SWITCH_USER_CR3
165 + SWAPGS
166 + jmp restore_regs_and_iret
167 +
168 +@@ -625,6 +653,7 @@ native_irq_return_ldt:
169 + pushq %rax
170 + pushq %rdi
171 + SWAPGS
172 ++ SWITCH_KERNEL_CR3
173 + movq PER_CPU_VAR(espfix_waddr), %rdi
174 + movq %rax, (0*8)(%rdi) /* RAX */
175 + movq (2*8)(%rsp), %rax /* RIP */
176 +@@ -640,6 +669,7 @@ native_irq_return_ldt:
177 + andl $0xffff0000, %eax
178 + popq %rdi
179 + orq PER_CPU_VAR(espfix_stack), %rax
180 ++ SWITCH_USER_CR3
181 + SWAPGS
182 + movq %rax, %rsp
183 + popq %rax
184 +@@ -995,7 +1025,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
185 + /*
186 + * Save all registers in pt_regs, and switch gs if needed.
187 + * Use slow, but surefire "are we in kernel?" check.
188 +- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
189 ++ *
190 ++ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
191 ++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
192 ++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
193 ++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
194 + */
195 + ENTRY(paranoid_entry)
196 + cld
197 +@@ -1008,7 +1042,26 @@ ENTRY(paranoid_entry)
198 + js 1f /* negative -> in kernel */
199 + SWAPGS
200 + xorl %ebx, %ebx
201 +-1: ret
202 ++1:
203 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
204 ++ /*
205 ++ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
206 ++ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
207 ++ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
208 ++ * unconditionally, but we need to find out whether the reverse
209 ++ * should be done on return (conveyed to paranoid_exit in %ebx).
210 ++ */
211 ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
212 ++ testl $KAISER_SHADOW_PGD_OFFSET, %eax
213 ++ jz 2f
214 ++ orl $2, %ebx
215 ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
216 ++ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
217 ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
218 ++ movq %rax, %cr3
219 ++2:
220 ++#endif
221 ++ ret
222 + END(paranoid_entry)
223 +
224 + /*
225 +@@ -1021,19 +1074,26 @@ END(paranoid_entry)
226 + * be complicated. Fortunately, there's no good reason
227 + * to try to handle preemption here.
228 + *
229 +- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
230 ++ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
231 ++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
232 ++ * ebx=2: needs both swapgs and SWITCH_USER_CR3
233 ++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
234 + */
235 + ENTRY(paranoid_exit)
236 + DISABLE_INTERRUPTS(CLBR_NONE)
237 + TRACE_IRQS_OFF_DEBUG
238 +- testl %ebx, %ebx /* swapgs needed? */
239 ++ TRACE_IRQS_IRETQ_DEBUG
240 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
241 ++ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
242 ++ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
243 ++ jz paranoid_exit_no_switch
244 ++ SWITCH_USER_CR3
245 ++paranoid_exit_no_switch:
246 ++#endif
247 ++ testl $1, %ebx /* swapgs needed? */
248 + jnz paranoid_exit_no_swapgs
249 +- TRACE_IRQS_IRETQ
250 + SWAPGS_UNSAFE_STACK
251 +- jmp paranoid_exit_restore
252 + paranoid_exit_no_swapgs:
253 +- TRACE_IRQS_IRETQ_DEBUG
254 +-paranoid_exit_restore:
255 + RESTORE_EXTRA_REGS
256 + RESTORE_C_REGS
257 + REMOVE_PT_GPREGS_FROM_STACK 8
258 +@@ -1048,6 +1108,13 @@ ENTRY(error_entry)
259 + cld
260 + SAVE_C_REGS 8
261 + SAVE_EXTRA_REGS 8
262 ++ /*
263 ++ * error_entry() always returns with a kernel gsbase and
264 ++ * CR3. We must also have a kernel CR3/gsbase before
265 ++ * calling TRACE_IRQS_*. Just unconditionally switch to
266 ++ * the kernel CR3 here.
267 ++ */
268 ++ SWITCH_KERNEL_CR3
269 + xorl %ebx, %ebx
270 + testb $3, CS+8(%rsp)
271 + jz .Lerror_kernelspace
272 +@@ -1210,6 +1277,10 @@ ENTRY(nmi)
273 + */
274 +
275 + SWAPGS_UNSAFE_STACK
276 ++ /*
277 ++ * percpu variables are mapped with user CR3, so no need
278 ++ * to switch CR3 here.
279 ++ */
280 + cld
281 + movq %rsp, %rdx
282 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
283 +@@ -1243,12 +1314,34 @@ ENTRY(nmi)
284 +
285 + movq %rsp, %rdi
286 + movq $-1, %rsi
287 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
288 ++ /* Unconditionally use kernel CR3 for do_nmi() */
289 ++ /* %rax is saved above, so OK to clobber here */
290 ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
291 ++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
292 ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
293 ++ pushq %rax
294 ++ /* mask off "user" bit of pgd address and 12 PCID bits: */
295 ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
296 ++ movq %rax, %cr3
297 ++2:
298 ++#endif
299 + call do_nmi
300 +
301 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
302 ++ /*
303 ++ * Unconditionally restore CR3. I know we return to
304 ++ * kernel code that needs user CR3, but do we ever return
305 ++ * to "user mode" where we need the kernel CR3?
306 ++ */
307 ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
308 ++#endif
309 ++
310 + /*
311 + * Return back to user mode. We must *not* do the normal exit
312 +- * work, because we don't want to enable interrupts. Fortunately,
313 +- * do_nmi doesn't modify pt_regs.
314 ++ * work, because we don't want to enable interrupts. Do not
315 ++ * switch to user CR3: we might be going back to kernel code
316 ++ * that had a user CR3 set.
317 + */
318 + SWAPGS
319 + jmp restore_c_regs_and_iret
320 +@@ -1445,22 +1538,55 @@ end_repeat_nmi:
321 + ALLOC_PT_GPREGS_ON_STACK
322 +
323 + /*
324 +- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
325 +- * as we should not be calling schedule in NMI context.
326 +- * Even with normal interrupts enabled. An NMI should not be
327 +- * setting NEED_RESCHED or anything that normal interrupts and
328 +- * exceptions might do.
329 ++ * Use the same approach as paranoid_entry to handle SWAPGS, but
330 ++ * without CR3 handling since we do that differently in NMIs. No
331 ++ * need to use paranoid_exit as we should not be calling schedule
332 ++ * in NMI context. Even with normal interrupts enabled. An NMI
333 ++ * should not be setting NEED_RESCHED or anything that normal
334 ++ * interrupts and exceptions might do.
335 + */
336 +- call paranoid_entry
337 +-
338 +- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
339 ++ cld
340 ++ SAVE_C_REGS
341 ++ SAVE_EXTRA_REGS
342 ++ movl $1, %ebx
343 ++ movl $MSR_GS_BASE, %ecx
344 ++ rdmsr
345 ++ testl %edx, %edx
346 ++ js 1f /* negative -> in kernel */
347 ++ SWAPGS
348 ++ xorl %ebx, %ebx
349 ++1:
350 + movq %rsp, %rdi
351 + movq $-1, %rsi
352 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
353 ++ /* Unconditionally use kernel CR3 for do_nmi() */
354 ++ /* %rax is saved above, so OK to clobber here */
355 ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
356 ++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
357 ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
358 ++ pushq %rax
359 ++ /* mask off "user" bit of pgd address and 12 PCID bits: */
360 ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
361 ++ movq %rax, %cr3
362 ++2:
363 ++#endif
364 ++
365 ++ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
366 + call do_nmi
367 +
368 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
369 ++ /*
370 ++ * Unconditionally restore CR3. We might be returning to
371 ++ * kernel code that needs user CR3, like just before
372 ++ * a sysret.
373 ++ */
374 ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
375 ++#endif
376 ++
377 + testl %ebx, %ebx /* swapgs needed? */
378 + jnz nmi_restore
379 + nmi_swapgs:
380 ++ /* We fixed up CR3 above, so no need to switch it here */
381 + SWAPGS_UNSAFE_STACK
382 + nmi_restore:
383 + RESTORE_EXTRA_REGS
384 +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
385 +index 15cfebaa7688..d03bf0e28b8b 100644
386 +--- a/arch/x86/entry/entry_64_compat.S
387 ++++ b/arch/x86/entry/entry_64_compat.S
388 +@@ -13,6 +13,8 @@
389 + #include <asm/irqflags.h>
390 + #include <asm/asm.h>
391 + #include <asm/smap.h>
392 ++#include <asm/pgtable_types.h>
393 ++#include <asm/kaiser.h>
394 + #include <linux/linkage.h>
395 + #include <linux/err.h>
396 +
397 +@@ -50,6 +52,7 @@ ENDPROC(native_usergs_sysret32)
398 + ENTRY(entry_SYSENTER_compat)
399 + /* Interrupts are off on entry. */
400 + SWAPGS_UNSAFE_STACK
401 ++ SWITCH_KERNEL_CR3_NO_STACK
402 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
403 +
404 + /*
405 +@@ -161,6 +164,7 @@ ENDPROC(entry_SYSENTER_compat)
406 + ENTRY(entry_SYSCALL_compat)
407 + /* Interrupts are off on entry. */
408 + SWAPGS_UNSAFE_STACK
409 ++ SWITCH_KERNEL_CR3_NO_STACK
410 +
411 + /* Stash user ESP and switch to the kernel stack. */
412 + movl %esp, %r8d
413 +@@ -208,6 +212,7 @@ ENTRY(entry_SYSCALL_compat)
414 + /* Opportunistic SYSRET */
415 + sysret32_from_system_call:
416 + TRACE_IRQS_ON /* User mode traces as IRQs on. */
417 ++ SWITCH_USER_CR3
418 + movq RBX(%rsp), %rbx /* pt_regs->rbx */
419 + movq RBP(%rsp), %rbp /* pt_regs->rbp */
420 + movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
421 +@@ -269,6 +274,7 @@ ENTRY(entry_INT80_compat)
422 + PARAVIRT_ADJUST_EXCEPTION_FRAME
423 + ASM_CLAC /* Do this early to minimize exposure */
424 + SWAPGS
425 ++ SWITCH_KERNEL_CR3_NO_STACK
426 +
427 + /*
428 + * User tracing code (ptrace or signal handlers) might assume that
429 +@@ -311,6 +317,7 @@ ENTRY(entry_INT80_compat)
430 +
431 + /* Go back to user mode. */
432 + TRACE_IRQS_ON
433 ++ SWITCH_USER_CR3
434 + SWAPGS
435 + jmp restore_regs_and_iret
436 + END(entry_INT80_compat)
437 +diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
438 +index ca94fa649251..5dd363d54348 100644
439 +--- a/arch/x86/entry/vdso/vclock_gettime.c
440 ++++ b/arch/x86/entry/vdso/vclock_gettime.c
441 +@@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void)
442 + }
443 + #endif
444 +
445 ++#ifdef CONFIG_PARAVIRT_CLOCK
446 ++extern u8 pvclock_page
447 ++ __attribute__((visibility("hidden")));
448 ++#endif
449 ++
450 + #ifndef BUILD_VDSO32
451 +
452 + #include <linux/kernel.h>
453 +@@ -62,63 +67,65 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
454 +
455 + #ifdef CONFIG_PARAVIRT_CLOCK
456 +
457 +-static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
458 ++static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
459 + {
460 +- const struct pvclock_vsyscall_time_info *pvti_base;
461 +- int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
462 +- int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
463 +-
464 +- BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
465 +-
466 +- pvti_base = (struct pvclock_vsyscall_time_info *)
467 +- __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
468 +-
469 +- return &pvti_base[offset];
470 ++ return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
471 + }
472 +
473 + static notrace cycle_t vread_pvclock(int *mode)
474 + {
475 +- const struct pvclock_vsyscall_time_info *pvti;
476 ++ const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
477 + cycle_t ret;
478 +- u64 last;
479 +- u32 version;
480 +- u8 flags;
481 +- unsigned cpu, cpu1;
482 +-
483 ++ u64 tsc, pvti_tsc;
484 ++ u64 last, delta, pvti_system_time;
485 ++ u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
486 +
487 + /*
488 +- * Note: hypervisor must guarantee that:
489 +- * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
490 +- * 2. that per-CPU pvclock time info is updated if the
491 +- * underlying CPU changes.
492 +- * 3. that version is increased whenever underlying CPU
493 +- * changes.
494 ++ * Note: The kernel and hypervisor must guarantee that cpu ID
495 ++ * number maps 1:1 to per-CPU pvclock time info.
496 ++ *
497 ++ * Because the hypervisor is entirely unaware of guest userspace
498 ++ * preemption, it cannot guarantee that per-CPU pvclock time
499 ++ * info is updated if the underlying CPU changes or that that
500 ++ * version is increased whenever underlying CPU changes.
501 + *
502 ++ * On KVM, we are guaranteed that pvti updates for any vCPU are
503 ++ * atomic as seen by *all* vCPUs. This is an even stronger
504 ++ * guarantee than we get with a normal seqlock.
505 ++ *
506 ++ * On Xen, we don't appear to have that guarantee, but Xen still
507 ++ * supplies a valid seqlock using the version field.
508 ++
509 ++ * We only do pvclock vdso timing at all if
510 ++ * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
511 ++ * mean that all vCPUs have matching pvti and that the TSC is
512 ++ * synced, so we can just look at vCPU 0's pvti.
513 + */
514 +- do {
515 +- cpu = __getcpu() & VGETCPU_CPU_MASK;
516 +- /* TODO: We can put vcpu id into higher bits of pvti.version.
517 +- * This will save a couple of cycles by getting rid of
518 +- * __getcpu() calls (Gleb).
519 +- */
520 +-
521 +- pvti = get_pvti(cpu);
522 +-
523 +- version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
524 +-
525 +- /*
526 +- * Test we're still on the cpu as well as the version.
527 +- * We could have been migrated just after the first
528 +- * vgetcpu but before fetching the version, so we
529 +- * wouldn't notice a version change.
530 +- */
531 +- cpu1 = __getcpu() & VGETCPU_CPU_MASK;
532 +- } while (unlikely(cpu != cpu1 ||
533 +- (pvti->pvti.version & 1) ||
534 +- pvti->pvti.version != version));
535 +-
536 +- if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
537 ++
538 ++ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
539 + *mode = VCLOCK_NONE;
540 ++ return 0;
541 ++ }
542 ++
543 ++ do {
544 ++ version = pvti->version;
545 ++
546 ++ /* This is also a read barrier, so we'll read version first. */
547 ++ tsc = rdtsc_ordered();
548 ++
549 ++ pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
550 ++ pvti_tsc_shift = pvti->tsc_shift;
551 ++ pvti_system_time = pvti->system_time;
552 ++ pvti_tsc = pvti->tsc_timestamp;
553 ++
554 ++ /* Make sure that the version double-check is last. */
555 ++ smp_rmb();
556 ++ } while (unlikely((version & 1) || version != pvti->version));
557 ++
558 ++ delta = tsc - pvti_tsc;
559 ++ ret = pvti_system_time +
560 ++ pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
561 ++ pvti_tsc_shift);
562 +
563 + /* refer to tsc.c read_tsc() comment for rationale */
564 + last = gtod->cycle_last;
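
The rewritten vread_pvclock() above snapshots the vCPU-0 pvti fields under the version seqlock and then converts the TSC delta to nanoseconds via pvclock_scale_delta(). A simplified sketch of that conversion (not the kernel's actual helper, which does the multiply at full width so the intermediate product cannot overflow):

/*
 * Illustrative sketch only: ns = system_time + scale(tsc - tsc_timestamp),
 * where the scale factor is 2^tsc_shift * tsc_to_system_mul / 2^32.
 */
static u64 example_pvclock_to_ns(u64 tsc, u64 pvti_tsc, u64 pvti_system_time,
				 u32 pvti_tsc_to_system_mul, s8 pvti_tsc_shift)
{
	u64 delta = tsc - pvti_tsc;	/* cycles since the hypervisor's snapshot */

	if (pvti_tsc_shift < 0)
		delta >>= -pvti_tsc_shift;
	else
		delta <<= pvti_tsc_shift;

	/* tsc_to_system_mul is a 32.32 fixed-point cycles-to-nanoseconds factor */
	return pvti_system_time + ((delta * pvti_tsc_to_system_mul) >> 32);
}
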
565 +diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
566 +index de2c921025f5..4158acc17df0 100644
567 +--- a/arch/x86/entry/vdso/vdso-layout.lds.S
568 ++++ b/arch/x86/entry/vdso/vdso-layout.lds.S
569 +@@ -25,7 +25,7 @@ SECTIONS
570 + * segment.
571 + */
572 +
573 +- vvar_start = . - 2 * PAGE_SIZE;
574 ++ vvar_start = . - 3 * PAGE_SIZE;
575 + vvar_page = vvar_start;
576 +
577 + /* Place all vvars at the offsets in asm/vvar.h. */
578 +@@ -36,6 +36,7 @@ SECTIONS
579 + #undef EMIT_VVAR
580 +
581 + hpet_page = vvar_start + PAGE_SIZE;
582 ++ pvclock_page = vvar_start + 2 * PAGE_SIZE;
583 +
584 + . = SIZEOF_HEADERS;
585 +
586 +diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
587 +index 785d9922b106..491020b2826d 100644
588 +--- a/arch/x86/entry/vdso/vdso2c.c
589 ++++ b/arch/x86/entry/vdso/vdso2c.c
590 +@@ -73,6 +73,7 @@ enum {
591 + sym_vvar_start,
592 + sym_vvar_page,
593 + sym_hpet_page,
594 ++ sym_pvclock_page,
595 + sym_VDSO_FAKE_SECTION_TABLE_START,
596 + sym_VDSO_FAKE_SECTION_TABLE_END,
597 + };
598 +@@ -80,6 +81,7 @@ enum {
599 + const int special_pages[] = {
600 + sym_vvar_page,
601 + sym_hpet_page,
602 ++ sym_pvclock_page,
603 + };
604 +
605 + struct vdso_sym {
606 +@@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = {
607 + [sym_vvar_start] = {"vvar_start", true},
608 + [sym_vvar_page] = {"vvar_page", true},
609 + [sym_hpet_page] = {"hpet_page", true},
610 ++ [sym_pvclock_page] = {"pvclock_page", true},
611 + [sym_VDSO_FAKE_SECTION_TABLE_START] = {
612 + "VDSO_FAKE_SECTION_TABLE_START", false
613 + },
614 +diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
615 +index 64df47148160..aa828191c654 100644
616 +--- a/arch/x86/entry/vdso/vma.c
617 ++++ b/arch/x86/entry/vdso/vma.c
618 +@@ -100,6 +100,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
619 + .name = "[vvar]",
620 + .pages = no_pages,
621 + };
622 ++ struct pvclock_vsyscall_time_info *pvti;
623 +
624 + if (calculate_addr) {
625 + addr = vdso_addr(current->mm->start_stack,
626 +@@ -169,6 +170,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
627 + }
628 + #endif
629 +
630 ++ pvti = pvclock_pvti_cpu0_va();
631 ++ if (pvti && image->sym_pvclock_page) {
632 ++ ret = remap_pfn_range(vma,
633 ++ text_start + image->sym_pvclock_page,
634 ++ __pa(pvti) >> PAGE_SHIFT,
635 ++ PAGE_SIZE,
636 ++ PAGE_READONLY);
637 ++
638 ++ if (ret)
639 ++ goto up_fail;
640 ++ }
641 ++
642 + up_fail:
643 + if (ret)
644 + current->mm->context.vdso = NULL;
645 +diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
646 +index e01f7f7ccb0c..84ae170bc3d0 100644
647 +--- a/arch/x86/include/asm/cmdline.h
648 ++++ b/arch/x86/include/asm/cmdline.h
649 +@@ -2,5 +2,7 @@
650 + #define _ASM_X86_CMDLINE_H
651 +
652 + int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
653 ++int cmdline_find_option(const char *cmdline_ptr, const char *option,
654 ++ char *buffer, int bufsize);
655 +
656 + #endif /* _ASM_X86_CMDLINE_H */
657 +diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
658 +index f7ba9fbf12ee..f6605712ca90 100644
659 +--- a/arch/x86/include/asm/cpufeature.h
660 ++++ b/arch/x86/include/asm/cpufeature.h
661 +@@ -187,6 +187,7 @@
662 + #define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */
663 + #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
664 + #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
665 ++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
666 + #define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */
667 + #define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */
668 + #define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
669 +@@ -199,6 +200,9 @@
670 + #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
671 + #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
672 +
673 ++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
674 ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
675 ++
676 + /* Virtualization flags: Linux defined, word 8 */
677 + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
678 + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
679 +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
680 +index 4e10d73cf018..880db91d9457 100644
681 +--- a/arch/x86/include/asm/desc.h
682 ++++ b/arch/x86/include/asm/desc.h
683 +@@ -43,7 +43,7 @@ struct gdt_page {
684 + struct desc_struct gdt[GDT_ENTRIES];
685 + } __attribute__((aligned(PAGE_SIZE)));
686 +
687 +-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
688 ++DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
689 +
690 + static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
691 + {
692 +diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
693 +index 59caa55fb9b5..ee52ff858699 100644
694 +--- a/arch/x86/include/asm/hw_irq.h
695 ++++ b/arch/x86/include/asm/hw_irq.h
696 +@@ -187,7 +187,7 @@ extern char irq_entries_start[];
697 + #define VECTOR_RETRIGGERED ((void *)~0UL)
698 +
699 + typedef struct irq_desc* vector_irq_t[NR_VECTORS];
700 +-DECLARE_PER_CPU(vector_irq_t, vector_irq);
701 ++DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
702 +
703 + #endif /* !ASSEMBLY_ */
704 +
705 +diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
706 +new file mode 100644
707 +index 000000000000..802bbbdfe143
708 +--- /dev/null
709 ++++ b/arch/x86/include/asm/kaiser.h
710 +@@ -0,0 +1,141 @@
711 ++#ifndef _ASM_X86_KAISER_H
712 ++#define _ASM_X86_KAISER_H
713 ++
714 ++#include <uapi/asm/processor-flags.h> /* For PCID constants */
715 ++
716 ++/*
717 ++ * This file includes the definitions for the KAISER feature.
718 ++ * KAISER is a counter measure against x86_64 side channel attacks on
719 ++ * the kernel virtual memory. It has a shadow pgd for every process: the
720 ++ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
721 ++ * user memory. Within a kernel context switch, or when an interrupt is handled,
722 ++ * the pgd is switched to the normal one. When the system switches to user mode,
723 ++ * the shadow pgd is enabled. By this, the virtual memory caches are freed,
724 ++ * and the user may not attack the whole kernel memory.
725 ++ *
726 ++ * A minimalistic kernel mapping holds the parts needed to be mapped in user
727 ++ * mode, such as the entry/exit functions of the user space, or the stacks.
728 ++ */
729 ++
730 ++#define KAISER_SHADOW_PGD_OFFSET 0x1000
731 ++
732 ++#ifdef __ASSEMBLY__
733 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
734 ++
735 ++.macro _SWITCH_TO_KERNEL_CR3 reg
736 ++movq %cr3, \reg
737 ++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
738 ++/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
739 ++ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
740 ++movq \reg, %cr3
741 ++.endm
742 ++
743 ++.macro _SWITCH_TO_USER_CR3 reg regb
744 ++/*
745 ++ * regb must be the low byte portion of reg: because we have arranged
746 ++ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
747 ++ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
748 ++ * not enabled): so that the one register can update both memory and cr3.
749 ++ */
750 ++movq %cr3, \reg
751 ++orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
752 ++js 9f
753 ++/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
754 ++movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
755 ++9:
756 ++movq \reg, %cr3
757 ++.endm
758 ++
759 ++.macro SWITCH_KERNEL_CR3
760 ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
761 ++_SWITCH_TO_KERNEL_CR3 %rax
762 ++popq %rax
763 ++8:
764 ++.endm
765 ++
766 ++.macro SWITCH_USER_CR3
767 ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
768 ++_SWITCH_TO_USER_CR3 %rax %al
769 ++popq %rax
770 ++8:
771 ++.endm
772 ++
773 ++.macro SWITCH_KERNEL_CR3_NO_STACK
774 ++ALTERNATIVE "jmp 8f", \
775 ++ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
776 ++ X86_FEATURE_KAISER
777 ++_SWITCH_TO_KERNEL_CR3 %rax
778 ++movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
779 ++8:
780 ++.endm
781 ++
782 ++#else /* CONFIG_PAGE_TABLE_ISOLATION */
783 ++
784 ++.macro SWITCH_KERNEL_CR3
785 ++.endm
786 ++.macro SWITCH_USER_CR3
787 ++.endm
788 ++.macro SWITCH_KERNEL_CR3_NO_STACK
789 ++.endm
790 ++
791 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
792 ++
793 ++#else /* __ASSEMBLY__ */
794 ++
795 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
796 ++/*
797 ++ * Upon kernel/user mode switch, it may happen that the address
798 ++ * space has to be switched before the registers have been
799 ++ * stored. To change the address space, another register is
800 ++ * needed. A register therefore has to be stored/restored.
801 ++*/
802 ++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
803 ++
804 ++DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
805 ++
806 ++extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
807 ++
808 ++extern int kaiser_enabled;
809 ++extern void __init kaiser_check_boottime_disable(void);
810 ++#else
811 ++#define kaiser_enabled 0
812 ++static inline void __init kaiser_check_boottime_disable(void) {}
813 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
814 ++
815 ++/*
816 ++ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
817 ++ * so as to build with tests on kaiser_enabled instead of #ifdefs.
818 ++ */
819 ++
820 ++/**
821 ++ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
822 ++ * @addr: the start address of the range
823 ++ * @size: the size of the range
824 ++ * @flags: The mapping flags of the pages
825 ++ *
826 ++ * The mapping is done on a global scope, so no bigger
827 ++ * synchronization has to be done. the pages have to be
828 ++ * manually unmapped again when they are not needed any longer.
829 ++ */
830 ++extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
831 ++
832 ++/**
833 ++ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
834 ++ * @addr: the start address of the range
835 ++ * @size: the size of the range
836 ++ */
837 ++extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
838 ++
839 ++/**
840 ++ * kaiser_init - Initialize the shadow mapping
841 ++ *
842 ++ * Most parts of the shadow mapping can be mapped upon boot
843 ++ * time. Only per-process things like the thread stacks
844 ++ * or a new LDT have to be mapped at runtime. These boot-
845 ++ * time mappings are permanent and never unmapped.
846 ++ */
847 ++extern void kaiser_init(void);
848 ++
849 ++#endif /* __ASSEMBLY */
850 ++
851 ++#endif /* _ASM_X86_KAISER_H */
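
Written out in C, the CR3 values that the _SWITCH_TO_KERNEL_CR3 and _SWITCH_TO_USER_CR3 macros above construct look roughly like the sketch below. This is illustrative only, not code from the patch; the constants correspond to KAISER_SHADOW_PGD_OFFSET above and the X86_CR3_PCID_* definitions added to pgtable_types.h further down, and the per-CPU x86_cr3_pcid_user value is assumed to already hold the shadow-pgd offset plus the user PCID/NOFLUSH bits, as the macro comment above explains.

/* Illustrative sketch only -- a C rendering of the assembler macros above. */
#define EX_SHADOW_PGD_OFFSET	0x1000UL		/* user pgd sits one 4k page above the kernel pgd */
#define EX_PCID_ASID_MASK	((1UL << 12) - 1)	/* CR3 bits 0-11: the PCID/ASID */
#define EX_PCID_NOFLUSH		(1UL << 63)		/* keep TLB entries for this PCID on the CR3 write */

/* _SWITCH_TO_KERNEL_CR3: drop the user ASID and the shadow-pgd bit ... */
static unsigned long example_kernel_cr3(unsigned long cr3, int have_pcid)
{
	cr3 &= ~(EX_PCID_ASID_MASK | EX_SHADOW_PGD_OFFSET);
	/* ... and, when PCID is in use, avoid flushing the kernel's TLB entries */
	return have_pcid ? (cr3 | EX_PCID_NOFLUSH) : cr3;
}

/*
 * _SWITCH_TO_USER_CR3: starting from the kernel CR3 (ASID 0, shadow bit
 * clear), OR in x86_cr3_pcid_user, which selects the shadow pgd and the
 * user PCID -- normally with NOFLUSH set, unless a deferred flush is
 * pending (the "js 9f" path in the macro above).
 */
static unsigned long example_user_cr3(unsigned long kernel_cr3,
				      unsigned long x86_cr3_pcid_user)
{
	return kernel_cr3 | x86_cr3_pcid_user;
}
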
852 +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
853 +index 6ec0c8b2e9df..84c62d950023 100644
854 +--- a/arch/x86/include/asm/pgtable.h
855 ++++ b/arch/x86/include/asm/pgtable.h
856 +@@ -18,6 +18,12 @@
857 + #ifndef __ASSEMBLY__
858 + #include <asm/x86_init.h>
859 +
860 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
861 ++extern int kaiser_enabled;
862 ++#else
863 ++#define kaiser_enabled 0
864 ++#endif
865 ++
866 + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
867 + void ptdump_walk_pgd_level_checkwx(void);
868 +
869 +@@ -653,7 +659,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
870 +
871 + static inline int pgd_bad(pgd_t pgd)
872 + {
873 +- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
874 ++ pgdval_t ignore_flags = _PAGE_USER;
875 ++ /*
876 ++ * We set NX on KAISER pgds that map userspace memory so
877 ++ * that userspace can not meaningfully use the kernel
878 ++ * page table by accident; it will fault on the first
879 ++ * instruction it tries to run. See native_set_pgd().
880 ++ */
881 ++ if (kaiser_enabled)
882 ++ ignore_flags |= _PAGE_NX;
883 ++
884 ++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
885 + }
886 +
887 + static inline int pgd_none(pgd_t pgd)
888 +@@ -855,7 +871,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
889 + */
890 + static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
891 + {
892 +- memcpy(dst, src, count * sizeof(pgd_t));
893 ++ memcpy(dst, src, count * sizeof(pgd_t));
894 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
895 ++ if (kaiser_enabled) {
896 ++ /* Clone the shadow pgd part as well */
897 ++ memcpy(native_get_shadow_pgd(dst),
898 ++ native_get_shadow_pgd(src),
899 ++ count * sizeof(pgd_t));
900 ++ }
901 ++#endif
902 + }
903 +
904 + #define PTE_SHIFT ilog2(PTRS_PER_PTE)
905 +diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
906 +index 2ee781114d34..c810226e741a 100644
907 +--- a/arch/x86/include/asm/pgtable_64.h
908 ++++ b/arch/x86/include/asm/pgtable_64.h
909 +@@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
910 + native_set_pud(pud, native_make_pud(0));
911 + }
912 +
913 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
914 ++extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
915 ++
916 ++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
917 ++{
918 ++#ifdef CONFIG_DEBUG_VM
919 ++ /* linux/mmdebug.h may not have been included at this point */
920 ++ BUG_ON(!kaiser_enabled);
921 ++#endif
922 ++ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
923 ++}
924 ++#else
925 ++static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
926 ++{
927 ++ return pgd;
928 ++}
929 ++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
930 ++{
931 ++ BUILD_BUG_ON(1);
932 ++ return NULL;
933 ++}
934 ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
935 ++
936 + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
937 + {
938 +- *pgdp = pgd;
939 ++ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
940 + }
941 +
942 + static inline void native_pgd_clear(pgd_t *pgd)
943 +diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
944 +index 79c91853e50e..8dba273da25a 100644
945 +--- a/arch/x86/include/asm/pgtable_types.h
946 ++++ b/arch/x86/include/asm/pgtable_types.h
947 +@@ -89,7 +89,7 @@
948 + #define _PAGE_NX (_AT(pteval_t, 0))
949 + #endif
950 +
951 +-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
952 ++#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
953 +
954 + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
955 + _PAGE_ACCESSED | _PAGE_DIRTY)
956 +@@ -102,6 +102,33 @@
957 + _PAGE_SOFT_DIRTY)
958 + #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
959 +
960 ++/* The ASID is the lower 12 bits of CR3 */
961 ++#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
962 ++
963 ++/* Mask for all the PCID-related bits in CR3: */
964 ++#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
965 ++#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
966 ++
967 ++#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
968 ++/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
969 ++#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
970 ++
971 ++#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
972 ++#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
973 ++#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
974 ++#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
975 ++#else
976 ++#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
977 ++/*
978 ++ * PCIDs are unsupported on 32-bit and none of these bits can be
979 ++ * set in CR3:
980 ++ */
981 ++#define X86_CR3_PCID_KERN_FLUSH (0)
982 ++#define X86_CR3_PCID_USER_FLUSH (0)
983 ++#define X86_CR3_PCID_KERN_NOFLUSH (0)
984 ++#define X86_CR3_PCID_USER_NOFLUSH (0)
985 ++#endif
986 ++
987 + /*
988 + * The cache modes defined here are used to translate between pure SW usage
989 + * and the HW defined cache mode bits and/or PAT entries.
990 +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
991 +index 2d5a50cb61a2..f3bdaed0188f 100644
992 +--- a/arch/x86/include/asm/processor.h
993 ++++ b/arch/x86/include/asm/processor.h
994 +@@ -305,7 +305,7 @@ struct tss_struct {
995 +
996 + } ____cacheline_aligned;
997 +
998 +-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
999 ++DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
1000 +
1001 + #ifdef CONFIG_X86_32
1002 + DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
1003 +diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
1004 +index baad72e4c100..6045cef376c2 100644
1005 +--- a/arch/x86/include/asm/pvclock.h
1006 ++++ b/arch/x86/include/asm/pvclock.h
1007 +@@ -4,6 +4,15 @@
1008 + #include <linux/clocksource.h>
1009 + #include <asm/pvclock-abi.h>
1010 +
1011 ++#ifdef CONFIG_PARAVIRT_CLOCK
1012 ++extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
1013 ++#else
1014 ++static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
1015 ++{
1016 ++ return NULL;
1017 ++}
1018 ++#endif
1019 ++
1020 + /* some helper functions for xen and kvm pv clock sources */
1021 + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
1022 + u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
1023 +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
1024 +index 9fc5968da820..a691b66cc40a 100644
1025 +--- a/arch/x86/include/asm/tlbflush.h
1026 ++++ b/arch/x86/include/asm/tlbflush.h
1027 +@@ -131,6 +131,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
1028 + cr4_set_bits(mask);
1029 + }
1030 +
1031 ++/*
1032 ++ * Declare a couple of kaiser interfaces here for convenience,
1033 ++ * to avoid the need for asm/kaiser.h in unexpected places.
1034 ++ */
1035 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1036 ++extern int kaiser_enabled;
1037 ++extern void kaiser_setup_pcid(void);
1038 ++extern void kaiser_flush_tlb_on_return_to_user(void);
1039 ++#else
1040 ++#define kaiser_enabled 0
1041 ++static inline void kaiser_setup_pcid(void)
1042 ++{
1043 ++}
1044 ++static inline void kaiser_flush_tlb_on_return_to_user(void)
1045 ++{
1046 ++}
1047 ++#endif
1048 ++
1049 + static inline void __native_flush_tlb(void)
1050 + {
1051 + /*
1052 +@@ -139,6 +157,8 @@ static inline void __native_flush_tlb(void)
1053 + * back:
1054 + */
1055 + preempt_disable();
1056 ++ if (kaiser_enabled)
1057 ++ kaiser_flush_tlb_on_return_to_user();
1058 + native_write_cr3(native_read_cr3());
1059 + preempt_enable();
1060 + }
1061 +@@ -148,20 +168,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
1062 + unsigned long cr4;
1063 +
1064 + cr4 = this_cpu_read(cpu_tlbstate.cr4);
1065 +- /* clear PGE */
1066 +- native_write_cr4(cr4 & ~X86_CR4_PGE);
1067 +- /* write old PGE again and flush TLBs */
1068 +- native_write_cr4(cr4);
1069 ++ if (cr4 & X86_CR4_PGE) {
1070 ++ /* clear PGE and flush TLB of all entries */
1071 ++ native_write_cr4(cr4 & ~X86_CR4_PGE);
1072 ++ /* restore PGE as it was before */
1073 ++ native_write_cr4(cr4);
1074 ++ } else {
1075 ++ /* do it with cr3, letting kaiser flush user PCID */
1076 ++ __native_flush_tlb();
1077 ++ }
1078 + }
1079 +
1080 + static inline void __native_flush_tlb_global(void)
1081 + {
1082 + unsigned long flags;
1083 +
1084 +- if (static_cpu_has(X86_FEATURE_INVPCID)) {
1085 ++ if (this_cpu_has(X86_FEATURE_INVPCID)) {
1086 + /*
1087 + * Using INVPCID is considerably faster than a pair of writes
1088 + * to CR4 sandwiched inside an IRQ flag save/restore.
1089 ++ *
1090 ++ * Note, this works with CR4.PCIDE=0 or 1.
1091 + */
1092 + invpcid_flush_all();
1093 + return;
1094 +@@ -173,24 +200,45 @@ static inline void __native_flush_tlb_global(void)
1095 + * be called from deep inside debugging code.)
1096 + */
1097 + raw_local_irq_save(flags);
1098 +-
1099 + __native_flush_tlb_global_irq_disabled();
1100 +-
1101 + raw_local_irq_restore(flags);
1102 + }
1103 +
1104 + static inline void __native_flush_tlb_single(unsigned long addr)
1105 + {
1106 +- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
1107 ++ /*
1108 ++ * SIMICS #GP's if you run INVPCID with type 2/3
1109 ++ * and X86_CR4_PCIDE clear. Shame!
1110 ++ *
1111 ++ * The ASIDs used below are hard-coded. But, we must not
1112 ++ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
1113 ++ * invlpg in the case we are called early.
1114 ++ */
1115 ++
1116 ++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
1117 ++ if (kaiser_enabled)
1118 ++ kaiser_flush_tlb_on_return_to_user();
1119 ++ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
1120 ++ return;
1121 ++ }
1122 ++ /* Flush the address out of both PCIDs. */
1123 ++ /*
1124 ++ * An optimization here might be to determine addresses
1125 ++ * that are only kernel-mapped and only flush the kernel
1126 ++ * ASID. But, userspace flushes are probably much more
1127 ++ * important performance-wise.
1128 ++ *
1129 ++ * Make sure to do only a single invpcid when KAISER is
1130 ++ * disabled and we have only a single ASID.
1131 ++ */
1132 ++ if (kaiser_enabled)
1133 ++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
1134 ++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
1135 + }
1136 +
1137 + static inline void __flush_tlb_all(void)
1138 + {
1139 +- if (cpu_has_pge)
1140 +- __flush_tlb_global();
1141 +- else
1142 +- __flush_tlb();
1143 +-
1144 ++ __flush_tlb_global();
1145 + /*
1146 + * Note: if we somehow had PCID but not PGE, then this wouldn't work --
1147 + * we'd end up flushing kernel translations for the current ASID but
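
kaiser_flush_tlb_on_return_to_user(), declared in the hunk above, is implemented in arch/x86/mm/kaiser.c in the part of the patch not shown on this page. Conceptually it only arms the deferred flush that the _SWITCH_TO_USER_CR3 macro in asm/kaiser.h consumes on the next return to user space -- roughly:

/*
 * Illustrative sketch only, not the patch's actual implementation.
 * Dropping the NOFLUSH bit from the per-CPU x86_cr3_pcid_user value makes
 * the next SWITCH_USER_CR3 load a flushing CR3 for the user PCID; the
 * macro then re-arms NOFLUSH for subsequent returns.
 */
static void example_flush_user_pcid_on_return(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			       X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
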
1148 +diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
1149 +index 756de9190aec..deabaf9759b6 100644
1150 +--- a/arch/x86/include/asm/vdso.h
1151 ++++ b/arch/x86/include/asm/vdso.h
1152 +@@ -22,6 +22,7 @@ struct vdso_image {
1153 +
1154 + long sym_vvar_page;
1155 + long sym_hpet_page;
1156 ++ long sym_pvclock_page;
1157 + long sym_VDSO32_NOTE_MASK;
1158 + long sym___kernel_sigreturn;
1159 + long sym___kernel_rt_sigreturn;
1160 +diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
1161 +index 79887abcb5e1..1361779f44fe 100644
1162 +--- a/arch/x86/include/uapi/asm/processor-flags.h
1163 ++++ b/arch/x86/include/uapi/asm/processor-flags.h
1164 +@@ -77,7 +77,8 @@
1165 + #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
1166 + #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
1167 + #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
1168 +-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
1169 ++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
1170 ++#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
1171 +
1172 + /*
1173 + * Intel CPU features in CR4
1174 +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1175 +index aa1e7246b06b..cc154ac64f00 100644
1176 +--- a/arch/x86/kernel/cpu/common.c
1177 ++++ b/arch/x86/kernel/cpu/common.c
1178 +@@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = {
1179 +
1180 + static const struct cpu_dev *this_cpu = &default_cpu;
1181 +
1182 +-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
1183 ++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
1184 + #ifdef CONFIG_X86_64
1185 + /*
1186 + * We need valid kernel segments for data and code in long mode too
1187 +@@ -324,8 +324,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
1188 + static void setup_pcid(struct cpuinfo_x86 *c)
1189 + {
1190 + if (cpu_has(c, X86_FEATURE_PCID)) {
1191 +- if (cpu_has(c, X86_FEATURE_PGE)) {
1192 ++ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
1193 + cr4_set_bits(X86_CR4_PCIDE);
1194 ++ /*
1195 ++ * INVPCID has two "groups" of types:
1196 ++ * 1/2: Invalidate an individual address
1197 ++ * 3/4: Invalidate all contexts
1198 ++ *
1199 ++ * 1/2 take a PCID, but 3/4 do not. So, 3/4
1200 ++ * ignore the PCID argument in the descriptor.
1201 ++ * But, we have to be careful not to call 1/2
1202 ++ * with an actual non-zero PCID in them before
1203 ++ * we do the above cr4_set_bits().
1204 ++ */
1205 ++ if (cpu_has(c, X86_FEATURE_INVPCID))
1206 ++ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
1207 + } else {
1208 + /*
1209 + * flush_tlb_all(), as currently implemented, won't
1210 +@@ -338,6 +351,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
1211 + clear_cpu_cap(c, X86_FEATURE_PCID);
1212 + }
1213 + }
1214 ++ kaiser_setup_pcid();
1215 + }
1216 +
1217 + /*
1218 +@@ -1229,7 +1243,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
1219 + [DEBUG_STACK - 1] = DEBUG_STKSZ
1220 + };
1221 +
1222 +-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
1223 ++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
1224 + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
1225 +
1226 + /* May not be marked __init: used by software suspend */
1227 +@@ -1392,6 +1406,14 @@ void cpu_init(void)
1228 + * try to read it.
1229 + */
1230 + cr4_init_shadow();
1231 ++ if (!kaiser_enabled) {
1232 ++ /*
1233 ++ * secondary_startup_64() deferred setting PGE in cr4:
1234 ++ * probe_page_size_mask() sets it on the boot cpu,
1235 ++ * but it needs to be set on each secondary cpu.
1236 ++ */
1237 ++ cr4_set_bits(X86_CR4_PGE);
1238 ++ }
1239 +
1240 + /*
1241 + * Load microcode on this cpu if a valid microcode is available.
1242 +diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
1243 +index 1e7de3cefc9c..f01b3a12dce0 100644
1244 +--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
1245 ++++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
1246 +@@ -2,11 +2,15 @@
1247 + #include <linux/types.h>
1248 + #include <linux/slab.h>
1249 +
1250 ++#include <asm/kaiser.h>
1251 + #include <asm/perf_event.h>
1252 + #include <asm/insn.h>
1253 +
1254 + #include "perf_event.h"
1255 +
1256 ++static
1257 ++DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
1258 ++
1259 + /* The size of a BTS record in bytes: */
1260 + #define BTS_RECORD_SIZE 24
1261 +
1262 +@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
1263 +
1264 + static DEFINE_PER_CPU(void *, insn_buffer);
1265 +
1266 ++static void *dsalloc(size_t size, gfp_t flags, int node)
1267 ++{
1268 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1269 ++ unsigned int order = get_order(size);
1270 ++ struct page *page;
1271 ++ unsigned long addr;
1272 ++
1273 ++ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
1274 ++ if (!page)
1275 ++ return NULL;
1276 ++ addr = (unsigned long)page_address(page);
1277 ++ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
1278 ++ __free_pages(page, order);
1279 ++ addr = 0;
1280 ++ }
1281 ++ return (void *)addr;
1282 ++#else
1283 ++ return kmalloc_node(size, flags | __GFP_ZERO, node);
1284 ++#endif
1285 ++}
1286 ++
1287 ++static void dsfree(const void *buffer, size_t size)
1288 ++{
1289 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1290 ++ if (!buffer)
1291 ++ return;
1292 ++ kaiser_remove_mapping((unsigned long)buffer, size);
1293 ++ free_pages((unsigned long)buffer, get_order(size));
1294 ++#else
1295 ++ kfree(buffer);
1296 ++#endif
1297 ++}
1298 ++
1299 + static int alloc_pebs_buffer(int cpu)
1300 + {
1301 + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
1302 +@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
1303 + if (!x86_pmu.pebs)
1304 + return 0;
1305 +
1306 +- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
1307 ++ buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
1308 + if (unlikely(!buffer))
1309 + return -ENOMEM;
1310 +
1311 +@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
1312 + if (x86_pmu.intel_cap.pebs_format < 2) {
1313 + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
1314 + if (!ibuffer) {
1315 +- kfree(buffer);
1316 ++ dsfree(buffer, x86_pmu.pebs_buffer_size);
1317 + return -ENOMEM;
1318 + }
1319 + per_cpu(insn_buffer, cpu) = ibuffer;
1320 +@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
1321 + kfree(per_cpu(insn_buffer, cpu));
1322 + per_cpu(insn_buffer, cpu) = NULL;
1323 +
1324 +- kfree((void *)(unsigned long)ds->pebs_buffer_base);
1325 ++ dsfree((void *)(unsigned long)ds->pebs_buffer_base,
1326 ++ x86_pmu.pebs_buffer_size);
1327 + ds->pebs_buffer_base = 0;
1328 + }
1329 +
1330 +@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
1331 + if (!x86_pmu.bts)
1332 + return 0;
1333 +
1334 +- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
1335 ++ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
1336 + if (unlikely(!buffer)) {
1337 + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
1338 + return -ENOMEM;
1339 +@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
1340 + if (!ds || !x86_pmu.bts)
1341 + return;
1342 +
1343 +- kfree((void *)(unsigned long)ds->bts_buffer_base);
1344 ++ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
1345 + ds->bts_buffer_base = 0;
1346 + }
1347 +
1348 + static int alloc_ds_buffer(int cpu)
1349 + {
1350 +- int node = cpu_to_node(cpu);
1351 +- struct debug_store *ds;
1352 +-
1353 +- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
1354 +- if (unlikely(!ds))
1355 +- return -ENOMEM;
1356 ++ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
1357 +
1358 ++ memset(ds, 0, sizeof(*ds));
1359 + per_cpu(cpu_hw_events, cpu).ds = ds;
1360 +
1361 + return 0;
1362 +@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
1363 + return;
1364 +
1365 + per_cpu(cpu_hw_events, cpu).ds = NULL;
1366 +- kfree(ds);
1367 + }
1368 +
1369 + void release_ds_buffers(void)
1370 +diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
1371 +index 4d38416e2a7f..b02cb2ec6726 100644
1372 +--- a/arch/x86/kernel/espfix_64.c
1373 ++++ b/arch/x86/kernel/espfix_64.c
1374 +@@ -41,6 +41,7 @@
1375 + #include <asm/pgalloc.h>
1376 + #include <asm/setup.h>
1377 + #include <asm/espfix.h>
1378 ++#include <asm/kaiser.h>
1379 +
1380 + /*
1381 + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
1382 +@@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
1383 + /* Install the espfix pud into the kernel page directory */
1384 + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
1385 + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
1386 ++ /*
1387 ++ * Just copy the top-level PGD that is mapping the espfix
1388 ++ * area to ensure it is mapped into the shadow user page
1389 ++ * tables.
1390 ++ */
1391 ++ if (kaiser_enabled) {
1392 ++ set_pgd(native_get_shadow_pgd(pgd_p),
1393 ++ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
1394 ++ }
1395 +
1396 + /* Randomize the locations */
1397 + init_espfix_random();
1398 +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
1399 +index ffdc0e860390..4034e905741a 100644
1400 +--- a/arch/x86/kernel/head_64.S
1401 ++++ b/arch/x86/kernel/head_64.S
1402 +@@ -183,8 +183,8 @@ ENTRY(secondary_startup_64)
1403 + movq $(init_level4_pgt - __START_KERNEL_map), %rax
1404 + 1:
1405 +
1406 +- /* Enable PAE mode and PGE */
1407 +- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
1408 ++ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
1409 ++ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
1410 + movq %rcx, %cr4
1411 +
1412 + /* Setup early boot stage 4 level pagetables. */
1413 +@@ -441,6 +441,27 @@ early_idt_ripmsg:
1414 + .balign PAGE_SIZE; \
1415 + GLOBAL(name)
1416 +
1417 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
1418 ++/*
1419 ++ * Each PGD needs to be 8k long and 8k aligned. We do not
1420 ++ * ever go out to userspace with these, so we do not
1421 ++ * strictly *need* the second page, but this allows us to
1422 ++ * have a single set_pgd() implementation that does not
1423 ++ * need to worry about whether it has 4k or 8k to work
1424 ++ * with.
1425 ++ *
1426 ++ * This ensures PGDs are 8k long:
1427 ++ */
1428 ++#define KAISER_USER_PGD_FILL 512
1429 ++/* This ensures they are 8k-aligned: */
1430 ++#define NEXT_PGD_PAGE(name) \
1431 ++ .balign 2 * PAGE_SIZE; \
1432 ++GLOBAL(name)
1433 ++#else
1434 ++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
1435 ++#define KAISER_USER_PGD_FILL 0
1436 ++#endif
1437 ++
1438 + /* Automate the creation of 1 to 1 mapping pmd entries */
1439 + #define PMDS(START, PERM, COUNT) \
1440 + i = 0 ; \
1441 +@@ -450,9 +471,10 @@ GLOBAL(name)
1442 + .endr
1443 +
1444 + __INITDATA
1445 +-NEXT_PAGE(early_level4_pgt)
1446 ++NEXT_PGD_PAGE(early_level4_pgt)
1447 + .fill 511,8,0
1448 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
1449 ++ .fill KAISER_USER_PGD_FILL,8,0
1450 +
1451 + NEXT_PAGE(early_dynamic_pgts)
1452 + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
1453 +@@ -460,16 +482,18 @@ NEXT_PAGE(early_dynamic_pgts)
1454 + .data
1455 +
1456 + #ifndef CONFIG_XEN
1457 +-NEXT_PAGE(init_level4_pgt)
1458 ++NEXT_PGD_PAGE(init_level4_pgt)
1459 + .fill 512,8,0
1460 ++ .fill KAISER_USER_PGD_FILL,8,0
1461 + #else
1462 +-NEXT_PAGE(init_level4_pgt)
1463 ++NEXT_PGD_PAGE(init_level4_pgt)
1464 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1465 + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
1466 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1467 + .org init_level4_pgt + L4_START_KERNEL*8, 0
1468 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
1469 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
1470 ++ .fill KAISER_USER_PGD_FILL,8,0
1471 +
1472 + NEXT_PAGE(level3_ident_pgt)
1473 + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1474 +@@ -480,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
1475 + */
1476 + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
1477 + #endif
1478 ++ .fill KAISER_USER_PGD_FILL,8,0
1479 +
1480 + NEXT_PAGE(level3_kernel_pgt)
1481 + .fill L3_START_KERNEL,8,0
1482 +diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
1483 +index 1423ab1b0312..f480b38a03c3 100644
1484 +--- a/arch/x86/kernel/irqinit.c
1485 ++++ b/arch/x86/kernel/irqinit.c
1486 +@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
1487 + .flags = IRQF_NO_THREAD,
1488 + };
1489 +
1490 +-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
1491 ++DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
1492 + [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
1493 + };
1494 +
1495 +diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
1496 +index 2bd81e302427..ec1b06dc82d2 100644
1497 +--- a/arch/x86/kernel/kvmclock.c
1498 ++++ b/arch/x86/kernel/kvmclock.c
1499 +@@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock);
1500 + static struct pvclock_vsyscall_time_info *hv_clock;
1501 + static struct pvclock_wall_clock wall_clock;
1502 +
1503 ++struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
1504 ++{
1505 ++ return hv_clock;
1506 ++}
1507 ++
1508 + /*
1509 + * The wallclock is the time of day when we booted. Since then, some time may
1510 + * have elapsed since the hypervisor wrote the data. So we try to account for
1511 +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
1512 +index d6279593bcdd..bc429365b72a 100644
1513 +--- a/arch/x86/kernel/ldt.c
1514 ++++ b/arch/x86/kernel/ldt.c
1515 +@@ -16,6 +16,7 @@
1516 + #include <linux/slab.h>
1517 + #include <linux/vmalloc.h>
1518 + #include <linux/uaccess.h>
1519 ++#include <linux/kaiser.h>
1520 +
1521 + #include <asm/ldt.h>
1522 + #include <asm/desc.h>
1523 +@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
1524 + set_ldt(pc->ldt->entries, pc->ldt->size);
1525 + }
1526 +
1527 ++static void __free_ldt_struct(struct ldt_struct *ldt)
1528 ++{
1529 ++ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
1530 ++ vfree(ldt->entries);
1531 ++ else
1532 ++ free_page((unsigned long)ldt->entries);
1533 ++ kfree(ldt);
1534 ++}
1535 ++
1536 + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
1537 + static struct ldt_struct *alloc_ldt_struct(int size)
1538 + {
1539 + struct ldt_struct *new_ldt;
1540 + int alloc_size;
1541 ++ int ret;
1542 +
1543 + if (size > LDT_ENTRIES)
1544 + return NULL;
1545 +@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
1546 + return NULL;
1547 + }
1548 +
1549 ++ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
1550 ++ __PAGE_KERNEL);
1551 + new_ldt->size = size;
1552 ++ if (ret) {
1553 ++ __free_ldt_struct(new_ldt);
1554 ++ return NULL;
1555 ++ }
1556 + return new_ldt;
1557 + }
1558 +
1559 +@@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
1560 + if (likely(!ldt))
1561 + return;
1562 +
1563 ++ kaiser_remove_mapping((unsigned long)ldt->entries,
1564 ++ ldt->size * LDT_ENTRY_SIZE);
1565 + paravirt_free_ldt(ldt->entries, ldt->size);
1566 +- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
1567 +- vfree(ldt->entries);
1568 +- else
1569 +- free_page((unsigned long)ldt->entries);
1570 +- kfree(ldt);
1571 ++ __free_ldt_struct(ldt);
1572 + }
1573 +
1574 + /*
1575 +diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
1576 +index 8aa05583bc42..0677bf8d3a42 100644
1577 +--- a/arch/x86/kernel/paravirt_patch_64.c
1578 ++++ b/arch/x86/kernel/paravirt_patch_64.c
1579 +@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
1580 + DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
1581 + DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
1582 + DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
1583 +-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
1584 + DEF_NATIVE(pv_cpu_ops, clts, "clts");
1585 + DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
1586 +
1587 +@@ -62,7 +61,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
1588 + PATCH_SITE(pv_mmu_ops, read_cr3);
1589 + PATCH_SITE(pv_mmu_ops, write_cr3);
1590 + PATCH_SITE(pv_cpu_ops, clts);
1591 +- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
1592 + PATCH_SITE(pv_cpu_ops, wbinvd);
1593 + #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
1594 + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
1595 +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
1596 +index 9f7c21c22477..7c5c5dc90ffa 100644
1597 +--- a/arch/x86/kernel/process.c
1598 ++++ b/arch/x86/kernel/process.c
1599 +@@ -39,7 +39,7 @@
1600 + * section. Since TSS's are completely CPU-local, we want them
1601 + * on exact cacheline boundaries, to eliminate cacheline ping-pong.
1602 + */
1603 +-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
1604 ++__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
1605 + .x86_tss = {
1606 + .sp0 = TOP_OF_INIT_STACK,
1607 + #ifdef CONFIG_X86_32
1608 +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
1609 +index e67b834279b2..bbaae4cf9e8e 100644
1610 +--- a/arch/x86/kernel/setup.c
1611 ++++ b/arch/x86/kernel/setup.c
1612 +@@ -112,6 +112,7 @@
1613 + #include <asm/alternative.h>
1614 + #include <asm/prom.h>
1615 + #include <asm/microcode.h>
1616 ++#include <asm/kaiser.h>
1617 +
1618 + /*
1619 + * max_low_pfn_mapped: highest direct mapped pfn under 4GB
1620 +@@ -1016,6 +1017,12 @@ void __init setup_arch(char **cmdline_p)
1621 + */
1622 + init_hypervisor_platform();
1623 +
1624 ++ /*
1625 ++ * This needs to happen right after XENPV is set on xen and
1626 ++ * kaiser_enabled is checked below in cleanup_highmap().
1627 ++ */
1628 ++ kaiser_check_boottime_disable();
1629 ++
1630 + x86_init.resources.probe_roms();
1631 +
1632 + /* after parse_early_param, so could debug it */
1633 +diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
1634 +index 1c113db9ed57..2bb5ee464df3 100644
1635 +--- a/arch/x86/kernel/tracepoint.c
1636 ++++ b/arch/x86/kernel/tracepoint.c
1637 +@@ -9,10 +9,12 @@
1638 + #include <linux/atomic.h>
1639 +
1640 + atomic_t trace_idt_ctr = ATOMIC_INIT(0);
1641 ++__aligned(PAGE_SIZE)
1642 + struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
1643 + (unsigned long) trace_idt_table };
1644 +
1645 + /* No need to be aligned, but done to keep all IDTs defined the same way. */
1646 ++__aligned(PAGE_SIZE)
1647 + gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
1648 +
1649 + static int trace_irq_vector_refcount;
1650 +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
1651 +index 796f1ec67469..ccf17dbfea09 100644
1652 +--- a/arch/x86/kvm/x86.c
1653 ++++ b/arch/x86/kvm/x86.c
1654 +@@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1655 + return 1;
1656 +
1657 + /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
1658 +- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1659 ++ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
1660 ++ !is_long_mode(vcpu))
1661 + return 1;
1662 + }
1663 +
1664 +diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
1665 +index 422db000d727..a744506856b1 100644
1666 +--- a/arch/x86/lib/cmdline.c
1667 ++++ b/arch/x86/lib/cmdline.c
1668 +@@ -82,3 +82,108 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
1669 +
1670 + return 0; /* Buffer overrun */
1671 + }
1672 ++
1673 ++/*
1674 ++ * Find a non-boolean option (i.e. option=argument). In accordance with
1675 ++ * standard Linux practice, if this option is repeated, this returns the
1676 ++ * last instance on the command line.
1677 ++ *
1678 ++ * @cmdline: the cmdline string
1679 ++ * @max_cmdline_size: the maximum size of cmdline
1680 ++ * @option: option string to look for
1681 ++ * @buffer: memory buffer to return the option argument
1682 ++ * @bufsize: size of the supplied memory buffer
1683 ++ *
1684 ++ * Returns the length of the argument (regardless of if it was
1685 ++ * truncated to fit in the buffer), or -1 on not found.
1686 ++ */
1687 ++static int
1688 ++__cmdline_find_option(const char *cmdline, int max_cmdline_size,
1689 ++ const char *option, char *buffer, int bufsize)
1690 ++{
1691 ++ char c;
1692 ++ int pos = 0, len = -1;
1693 ++ const char *opptr = NULL;
1694 ++ char *bufptr = buffer;
1695 ++ enum {
1696 ++ st_wordstart = 0, /* Start of word/after whitespace */
1697 ++ st_wordcmp, /* Comparing this word */
1698 ++ st_wordskip, /* Miscompare, skip */
1699 ++ st_bufcpy, /* Copying this to buffer */
1700 ++ } state = st_wordstart;
1701 ++
1702 ++ if (!cmdline)
1703 ++ return -1; /* No command line */
1704 ++
1705 ++ /*
1706 ++ * This 'pos' check ensures we do not overrun
1707 ++ * a non-NULL-terminated 'cmdline'
1708 ++ */
1709 ++ while (pos++ < max_cmdline_size) {
1710 ++ c = *(char *)cmdline++;
1711 ++ if (!c)
1712 ++ break;
1713 ++
1714 ++ switch (state) {
1715 ++ case st_wordstart:
1716 ++ if (myisspace(c))
1717 ++ break;
1718 ++
1719 ++ state = st_wordcmp;
1720 ++ opptr = option;
1721 ++ /* fall through */
1722 ++
1723 ++ case st_wordcmp:
1724 ++ if ((c == '=') && !*opptr) {
1725 ++ /*
1726 ++ * We matched all the way to the end of the
1727 ++ * option we were looking for, prepare to
1728 ++ * copy the argument.
1729 ++ */
1730 ++ len = 0;
1731 ++ bufptr = buffer;
1732 ++ state = st_bufcpy;
1733 ++ break;
1734 ++ } else if (c == *opptr++) {
1735 ++ /*
1736 ++ * We are currently matching, so continue
1737 ++ * to the next character on the cmdline.
1738 ++ */
1739 ++ break;
1740 ++ }
1741 ++ state = st_wordskip;
1742 ++ /* fall through */
1743 ++
1744 ++ case st_wordskip:
1745 ++ if (myisspace(c))
1746 ++ state = st_wordstart;
1747 ++ break;
1748 ++
1749 ++ case st_bufcpy:
1750 ++ if (myisspace(c)) {
1751 ++ state = st_wordstart;
1752 ++ } else {
1753 ++ /*
1754 ++ * Increment len, but don't overrun the
1755 ++ * supplied buffer and leave room for the
1756 ++ * NULL terminator.
1757 ++ */
1758 ++ if (++len < bufsize)
1759 ++ *bufptr++ = c;
1760 ++ }
1761 ++ break;
1762 ++ }
1763 ++ }
1764 ++
1765 ++ if (bufsize)
1766 ++ *bufptr = '\0';
1767 ++
1768 ++ return len;
1769 ++}
1770 ++
1771 ++int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
1772 ++ int bufsize)
1773 ++{
1774 ++ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
1775 ++ buffer, bufsize);
1776 ++}
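
The helper added above scans the boot command line as a small state machine: it matches "option=" words (the last occurrence wins), copies the argument into the caller's buffer, NUL-terminates it, and returns the argument length, or -1 when the option is absent. Below is a minimal, hedged sketch of how a caller consumes the exported cmdline_find_option(); the option name "foo" and the wrapper function are made up for illustration (headers assumed to be the usual cmdline/setup includes), while the real caller in this patch is kaiser_check_boottime_disable() further down, which looks up "pti":

    /* Illustrative only: "foo" and parse_foo() are hypothetical. */
    static void __init parse_foo(void)
    {
            char arg[5];
            int len;

            /* len is the argument length, or -1 if "foo=" is absent. */
            len = cmdline_find_option(boot_command_line, "foo", arg, sizeof(arg));
            if (len == 2 && !strncmp(arg, "on", 2))
                    pr_info("foo=on\n");
            else if (len == 3 && !strncmp(arg, "off", 3))
                    pr_info("foo=off\n");
    }

Note that the length is returned even when the argument was truncated to fit the buffer, so callers should size the buffer for the longest expected value plus the terminator.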
1777 +diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
1778 +index 1ae7c141f778..61e6cead9c4a 100644
1779 +--- a/arch/x86/mm/Makefile
1780 ++++ b/arch/x86/mm/Makefile
1781 +@@ -32,3 +32,4 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
1782 + obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
1783 +
1784 + obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
1785 ++obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o
1786 +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
1787 +index ed4b372860e4..2bd45ae91eb3 100644
1788 +--- a/arch/x86/mm/init.c
1789 ++++ b/arch/x86/mm/init.c
1790 +@@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void)
1791 + cr4_set_bits_and_update_boot(X86_CR4_PSE);
1792 +
1793 + /* Enable PGE if available */
1794 +- if (cpu_has_pge) {
1795 ++ if (cpu_has_pge && !kaiser_enabled) {
1796 + cr4_set_bits_and_update_boot(X86_CR4_PGE);
1797 + __supported_pte_mask |= _PAGE_GLOBAL;
1798 + } else
1799 +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
1800 +index ec081fe0ce2c..d76ec9348cff 100644
1801 +--- a/arch/x86/mm/init_64.c
1802 ++++ b/arch/x86/mm/init_64.c
1803 +@@ -395,6 +395,16 @@ void __init cleanup_highmap(void)
1804 + continue;
1805 + if (vaddr < (unsigned long) _text || vaddr > end)
1806 + set_pmd(pmd, __pmd(0));
1807 ++ else if (kaiser_enabled) {
1808 ++ /*
1809 ++ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
1810 ++ * clear that now. This is not important, so long as
1811 ++ * CR4.PGE remains clear, but it removes an anomaly.
1812 ++ * Physical mapping setup below avoids _PAGE_GLOBAL
1813 ++ * by use of massage_pgprot() inside pfn_pte() etc.
1814 ++ */
1815 ++ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
1816 ++ }
1817 + }
1818 + }
1819 +
1820 +diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
1821 +new file mode 100644
1822 +index 000000000000..b0b3a69f1c7f
1823 +--- /dev/null
1824 ++++ b/arch/x86/mm/kaiser.c
1825 +@@ -0,0 +1,456 @@
1826 ++#include <linux/bug.h>
1827 ++#include <linux/kernel.h>
1828 ++#include <linux/errno.h>
1829 ++#include <linux/string.h>
1830 ++#include <linux/types.h>
1831 ++#include <linux/bug.h>
1832 ++#include <linux/init.h>
1833 ++#include <linux/interrupt.h>
1834 ++#include <linux/spinlock.h>
1835 ++#include <linux/mm.h>
1836 ++#include <linux/uaccess.h>
1837 ++#include <linux/ftrace.h>
1838 ++
1839 ++#undef pr_fmt
1840 ++#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
1841 ++
1842 ++#include <asm/kaiser.h>
1843 ++#include <asm/tlbflush.h> /* to verify its kaiser declarations */
1844 ++#include <asm/pgtable.h>
1845 ++#include <asm/pgalloc.h>
1846 ++#include <asm/desc.h>
1847 ++#include <asm/cmdline.h>
1848 ++
1849 ++int kaiser_enabled __read_mostly = 1;
1850 ++EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
1851 ++
1852 ++__visible
1853 ++DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
1854 ++
1855 ++/*
1856 ++ * These can have bit 63 set, so we can not just use a plain "or"
1857 ++ * instruction to get their value or'd into CR3. It would take
1858 ++ * another register. So, we use a memory reference to these instead.
1859 ++ *
1860 ++ * This is also handy because systems that do not support PCIDs
1861 ++ * just end up or'ing a 0 into their CR3, which does no harm.
1862 ++ */
1863 ++DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
1864 ++
1865 ++/*
1866 ++ * At runtime, the only things we map are some things for CPU
1867 ++ * hotplug, and stacks for new processes. No two CPUs will ever
1868 ++ * be populating the same addresses, so we only need to ensure
1869 ++ * that we protect between two CPUs trying to allocate and
1870 ++ * populate the same page table page.
1871 ++ *
1872 ++ * Only take this lock when doing a set_p[4um]d(), but it is not
1873 ++ * needed for doing a set_pte(). We assume that only the *owner*
1874 ++ * of a given allocation will be doing this for _their_
1875 ++ * allocation.
1876 ++ *
1877 ++ * This ensures that once a system has been running for a while
1878 ++ * and there have been stacks all over and these page tables
1879 ++ * are fully populated, there will be no further acquisitions of
1880 ++ * this lock.
1881 ++ */
1882 ++static DEFINE_SPINLOCK(shadow_table_allocation_lock);
1883 ++
1884 ++/*
1885 ++ * Returns -1 on error.
1886 ++ */
1887 ++static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
1888 ++{
1889 ++ pgd_t *pgd;
1890 ++ pud_t *pud;
1891 ++ pmd_t *pmd;
1892 ++ pte_t *pte;
1893 ++
1894 ++ pgd = pgd_offset_k(vaddr);
1895 ++ /*
1896 ++ * We made all the kernel PGDs present in kaiser_init().
1897 ++ * We expect them to stay that way.
1898 ++ */
1899 ++ BUG_ON(pgd_none(*pgd));
1900 ++ /*
1901 ++ * PGDs are either 512GB or 128TB on all x86_64
1902 ++ * configurations. We don't handle these.
1903 ++ */
1904 ++ BUG_ON(pgd_large(*pgd));
1905 ++
1906 ++ pud = pud_offset(pgd, vaddr);
1907 ++ if (pud_none(*pud)) {
1908 ++ WARN_ON_ONCE(1);
1909 ++ return -1;
1910 ++ }
1911 ++
1912 ++ if (pud_large(*pud))
1913 ++ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
1914 ++
1915 ++ pmd = pmd_offset(pud, vaddr);
1916 ++ if (pmd_none(*pmd)) {
1917 ++ WARN_ON_ONCE(1);
1918 ++ return -1;
1919 ++ }
1920 ++
1921 ++ if (pmd_large(*pmd))
1922 ++ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
1923 ++
1924 ++ pte = pte_offset_kernel(pmd, vaddr);
1925 ++ if (pte_none(*pte)) {
1926 ++ WARN_ON_ONCE(1);
1927 ++ return -1;
1928 ++ }
1929 ++
1930 ++ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
1931 ++}
1932 ++
1933 ++/*
1934 ++ * This is a relatively normal page table walk, except that it
1935 ++ * also tries to allocate page tables pages along the way.
1936 ++ *
1937 ++ * Returns a pointer to a PTE on success, or NULL on failure.
1938 ++ */
1939 ++static pte_t *kaiser_pagetable_walk(unsigned long address)
1940 ++{
1941 ++ pmd_t *pmd;
1942 ++ pud_t *pud;
1943 ++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
1944 ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
1945 ++
1946 ++ if (pgd_none(*pgd)) {
1947 ++ WARN_ONCE(1, "All shadow pgds should have been populated");
1948 ++ return NULL;
1949 ++ }
1950 ++ BUILD_BUG_ON(pgd_large(*pgd) != 0);
1951 ++
1952 ++ pud = pud_offset(pgd, address);
1953 ++ /* The shadow page tables do not use large mappings: */
1954 ++ if (pud_large(*pud)) {
1955 ++ WARN_ON(1);
1956 ++ return NULL;
1957 ++ }
1958 ++ if (pud_none(*pud)) {
1959 ++ unsigned long new_pmd_page = __get_free_page(gfp);
1960 ++ if (!new_pmd_page)
1961 ++ return NULL;
1962 ++ spin_lock(&shadow_table_allocation_lock);
1963 ++ if (pud_none(*pud)) {
1964 ++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
1965 ++ __inc_zone_page_state(virt_to_page((void *)
1966 ++ new_pmd_page), NR_KAISERTABLE);
1967 ++ } else
1968 ++ free_page(new_pmd_page);
1969 ++ spin_unlock(&shadow_table_allocation_lock);
1970 ++ }
1971 ++
1972 ++ pmd = pmd_offset(pud, address);
1973 ++ /* The shadow page tables do not use large mappings: */
1974 ++ if (pmd_large(*pmd)) {
1975 ++ WARN_ON(1);
1976 ++ return NULL;
1977 ++ }
1978 ++ if (pmd_none(*pmd)) {
1979 ++ unsigned long new_pte_page = __get_free_page(gfp);
1980 ++ if (!new_pte_page)
1981 ++ return NULL;
1982 ++ spin_lock(&shadow_table_allocation_lock);
1983 ++ if (pmd_none(*pmd)) {
1984 ++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
1985 ++ __inc_zone_page_state(virt_to_page((void *)
1986 ++ new_pte_page), NR_KAISERTABLE);
1987 ++ } else
1988 ++ free_page(new_pte_page);
1989 ++ spin_unlock(&shadow_table_allocation_lock);
1990 ++ }
1991 ++
1992 ++ return pte_offset_kernel(pmd, address);
1993 ++}
1994 ++
1995 ++static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
1996 ++ unsigned long flags)
1997 ++{
1998 ++ int ret = 0;
1999 ++ pte_t *pte;
2000 ++ unsigned long start_addr = (unsigned long )__start_addr;
2001 ++ unsigned long address = start_addr & PAGE_MASK;
2002 ++ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
2003 ++ unsigned long target_address;
2004 ++
2005 ++ /*
2006 ++ * It is convenient for callers to pass in __PAGE_KERNEL etc,
2007 ++ * and there is no actual harm from setting _PAGE_GLOBAL, so
2008 ++ * long as CR4.PGE is not set. But it is nonetheless troubling
2009 ++ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
2010 ++ * requires that not to be #defined to 0): so mask it off here.
2011 ++ */
2012 ++ flags &= ~_PAGE_GLOBAL;
2013 ++
2014 ++ for (; address < end_addr; address += PAGE_SIZE) {
2015 ++ target_address = get_pa_from_mapping(address);
2016 ++ if (target_address == -1) {
2017 ++ ret = -EIO;
2018 ++ break;
2019 ++ }
2020 ++ pte = kaiser_pagetable_walk(address);
2021 ++ if (!pte) {
2022 ++ ret = -ENOMEM;
2023 ++ break;
2024 ++ }
2025 ++ if (pte_none(*pte)) {
2026 ++ set_pte(pte, __pte(flags | target_address));
2027 ++ } else {
2028 ++ pte_t tmp;
2029 ++ set_pte(&tmp, __pte(flags | target_address));
2030 ++ WARN_ON_ONCE(!pte_same(*pte, tmp));
2031 ++ }
2032 ++ }
2033 ++ return ret;
2034 ++}
2035 ++
2036 ++static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
2037 ++{
2038 ++ unsigned long size = end - start;
2039 ++
2040 ++ return kaiser_add_user_map(start, size, flags);
2041 ++}
2042 ++
2043 ++/*
2044 ++ * Ensure that the top level of the (shadow) page tables are
2045 ++ * entirely populated. This ensures that all processes that get
2046 ++ * forked have the same entries. This way, we do not have to
2047 ++ * ever go set up new entries in older processes.
2048 ++ *
2049 ++ * Note: we never free these, so there are no updates to them
2050 ++ * after this.
2051 ++ */
2052 ++static void __init kaiser_init_all_pgds(void)
2053 ++{
2054 ++ pgd_t *pgd;
2055 ++ int i = 0;
2056 ++
2057 ++ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
2058 ++ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
2059 ++ pgd_t new_pgd;
2060 ++ pud_t *pud = pud_alloc_one(&init_mm,
2061 ++ PAGE_OFFSET + i * PGDIR_SIZE);
2062 ++ if (!pud) {
2063 ++ WARN_ON(1);
2064 ++ break;
2065 ++ }
2066 ++ inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
2067 ++ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
2068 ++ /*
2069 ++ * Make sure not to stomp on some other pgd entry.
2070 ++ */
2071 ++ if (!pgd_none(pgd[i])) {
2072 ++ WARN_ON(1);
2073 ++ continue;
2074 ++ }
2075 ++ set_pgd(pgd + i, new_pgd);
2076 ++ }
2077 ++}
2078 ++
2079 ++#define kaiser_add_user_map_early(start, size, flags) do { \
2080 ++ int __ret = kaiser_add_user_map(start, size, flags); \
2081 ++ WARN_ON(__ret); \
2082 ++} while (0)
2083 ++
2084 ++#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
2085 ++ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
2086 ++ WARN_ON(__ret); \
2087 ++} while (0)
2088 ++
2089 ++void __init kaiser_check_boottime_disable(void)
2090 ++{
2091 ++ bool enable = true;
2092 ++ char arg[5];
2093 ++ int ret;
2094 ++
2095 ++ if (boot_cpu_has(X86_FEATURE_XENPV))
2096 ++ goto silent_disable;
2097 ++
2098 ++ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
2099 ++ if (ret > 0) {
2100 ++ if (!strncmp(arg, "on", 2))
2101 ++ goto enable;
2102 ++
2103 ++ if (!strncmp(arg, "off", 3))
2104 ++ goto disable;
2105 ++
2106 ++ if (!strncmp(arg, "auto", 4))
2107 ++ goto skip;
2108 ++ }
2109 ++
2110 ++ if (cmdline_find_option_bool(boot_command_line, "nopti"))
2111 ++ goto disable;
2112 ++
2113 ++skip:
2114 ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
2115 ++ goto disable;
2116 ++
2117 ++enable:
2118 ++ if (enable)
2119 ++ setup_force_cpu_cap(X86_FEATURE_KAISER);
2120 ++
2121 ++ return;
2122 ++
2123 ++disable:
2124 ++ pr_info("disabled\n");
2125 ++
2126 ++silent_disable:
2127 ++ kaiser_enabled = 0;
2128 ++ setup_clear_cpu_cap(X86_FEATURE_KAISER);
2129 ++}
2130 ++
2131 ++/*
2132 ++ * If anything in here fails, we will likely die on one of the
2133 ++ * first kernel->user transitions and init will die. But, we
2134 ++ * will have most of the kernel up by then and should be able to
2135 ++ * get a clean warning out of it. If we BUG_ON() here, we run
2136 ++ * the risk of being before we have good console output.
2137 ++ */
2138 ++void __init kaiser_init(void)
2139 ++{
2140 ++ int cpu;
2141 ++
2142 ++ if (!kaiser_enabled)
2143 ++ return;
2144 ++
2145 ++ kaiser_init_all_pgds();
2146 ++
2147 ++ for_each_possible_cpu(cpu) {
2148 ++ void *percpu_vaddr = __per_cpu_user_mapped_start +
2149 ++ per_cpu_offset(cpu);
2150 ++ unsigned long percpu_sz = __per_cpu_user_mapped_end -
2151 ++ __per_cpu_user_mapped_start;
2152 ++ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
2153 ++ __PAGE_KERNEL);
2154 ++ }
2155 ++
2156 ++ /*
2157 ++ * Map the entry/exit text section, which is needed at
2158 ++ * switches from user to and from kernel.
2159 ++ */
2160 ++ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
2161 ++ __PAGE_KERNEL_RX);
2162 ++
2163 ++#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
2164 ++ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
2165 ++ __irqentry_text_end,
2166 ++ __PAGE_KERNEL_RX);
2167 ++#endif
2168 ++ kaiser_add_user_map_early((void *)idt_descr.address,
2169 ++ sizeof(gate_desc) * NR_VECTORS,
2170 ++ __PAGE_KERNEL_RO);
2171 ++#ifdef CONFIG_TRACING
2172 ++ kaiser_add_user_map_early(&trace_idt_descr,
2173 ++ sizeof(trace_idt_descr),
2174 ++ __PAGE_KERNEL);
2175 ++ kaiser_add_user_map_early(&trace_idt_table,
2176 ++ sizeof(gate_desc) * NR_VECTORS,
2177 ++ __PAGE_KERNEL);
2178 ++#endif
2179 ++ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
2180 ++ __PAGE_KERNEL);
2181 ++ kaiser_add_user_map_early(&debug_idt_table,
2182 ++ sizeof(gate_desc) * NR_VECTORS,
2183 ++ __PAGE_KERNEL);
2184 ++
2185 ++ pr_info("enabled\n");
2186 ++}
2187 ++
2188 ++/* Add a mapping to the shadow mapping, and synchronize the mappings */
2189 ++int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
2190 ++{
2191 ++ if (!kaiser_enabled)
2192 ++ return 0;
2193 ++ return kaiser_add_user_map((const void *)addr, size, flags);
2194 ++}
2195 ++
2196 ++void kaiser_remove_mapping(unsigned long start, unsigned long size)
2197 ++{
2198 ++ extern void unmap_pud_range_nofree(pgd_t *pgd,
2199 ++ unsigned long start, unsigned long end);
2200 ++ unsigned long end = start + size;
2201 ++ unsigned long addr, next;
2202 ++ pgd_t *pgd;
2203 ++
2204 ++ if (!kaiser_enabled)
2205 ++ return;
2206 ++ pgd = native_get_shadow_pgd(pgd_offset_k(start));
2207 ++ for (addr = start; addr < end; pgd++, addr = next) {
2208 ++ next = pgd_addr_end(addr, end);
2209 ++ unmap_pud_range_nofree(pgd, addr, next);
2210 ++ }
2211 ++}
2212 ++
2213 ++/*
2214 ++ * Page table pages are page-aligned. The lower half of the top
2215 ++ * level is used for userspace and the top half for the kernel.
2216 ++ * This returns true for user pages that need to get copied into
2217 ++ * both the user and kernel copies of the page tables, and false
2218 ++ * for kernel pages that should only be in the kernel copy.
2219 ++ */
2220 ++static inline bool is_userspace_pgd(pgd_t *pgdp)
2221 ++{
2222 ++ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
2223 ++}
2224 ++
2225 ++pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
2226 ++{
2227 ++ if (!kaiser_enabled)
2228 ++ return pgd;
2229 ++ /*
2230 ++ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
2231 ++ * skip cases like kexec and EFI which make temporary low mappings.
2232 ++ */
2233 ++ if (pgd.pgd & _PAGE_USER) {
2234 ++ if (is_userspace_pgd(pgdp)) {
2235 ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
2236 ++ /*
2237 ++ * Even if the entry is *mapping* userspace, ensure
2238 ++ * that userspace can not use it. This way, if we
2239 ++ * get out to userspace running on the kernel CR3,
2240 ++ * userspace will crash instead of running.
2241 ++ */
2242 ++ if (__supported_pte_mask & _PAGE_NX)
2243 ++ pgd.pgd |= _PAGE_NX;
2244 ++ }
2245 ++ } else if (!pgd.pgd) {
2246 ++ /*
2247 ++ * pgd_clear() cannot check _PAGE_USER, and is even used to
2248 ++ * clear corrupted pgd entries: so just rely on cases like
2249 ++ * kexec and EFI never to be using pgd_clear().
2250 ++ */
2251 ++ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
2252 ++ is_userspace_pgd(pgdp))
2253 ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
2254 ++ }
2255 ++ return pgd;
2256 ++}
2257 ++
2258 ++void kaiser_setup_pcid(void)
2259 ++{
2260 ++ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
2261 ++
2262 ++ if (this_cpu_has(X86_FEATURE_PCID))
2263 ++ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
2264 ++ /*
2265 ++ * These variables are used by the entry/exit
2266 ++ * code to change PCID and pgd and TLB flushing.
2267 ++ */
2268 ++ this_cpu_write(x86_cr3_pcid_user, user_cr3);
2269 ++}
2270 ++
2271 ++/*
2272 ++ * Make a note that this cpu will need to flush USER tlb on return to user.
2273 ++ * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
2274 ++ */
2275 ++void kaiser_flush_tlb_on_return_to_user(void)
2276 ++{
2277 ++ if (this_cpu_has(X86_FEATURE_PCID))
2278 ++ this_cpu_write(x86_cr3_pcid_user,
2279 ++ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
2280 ++}
2281 ++EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
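
Taken together, kaiser.c exposes a small mapping API: anything the CPU must still reach while running on the user copy of the page tables is mirrored with kaiser_add_mapping() right after it is allocated and torn down with kaiser_remove_mapping() just before it is freed (both degrade to no-ops when kaiser_enabled is 0). A hedged sketch of that calling pattern, with made-up names; the real users in this patch are the LDT (arch/x86/kernel/ldt.c above) and the thread stacks (kernel/fork.c below):

    /* Illustrative only: my_buf and its helpers are hypothetical. */
    static void *my_alloc_user_visible(void)
    {
            void *my_buf = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);

            if (!my_buf)
                    return NULL;
            /* Mirror the page into the shadow (user) page tables. */
            if (kaiser_add_mapping((unsigned long)my_buf, PAGE_SIZE,
                                   __PAGE_KERNEL)) {
                    free_page((unsigned long)my_buf);
                    return NULL;
            }
            return my_buf;
    }

    static void my_free_user_visible(void *my_buf)
    {
            /* Unmap from the shadow tables before the page can be reused. */
            kaiser_remove_mapping((unsigned long)my_buf, PAGE_SIZE);
            free_page((unsigned long)my_buf);
    }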
2282 +diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
2283 +index 4e5ac46adc9d..81ec7c02f968 100644
2284 +--- a/arch/x86/mm/kasan_init_64.c
2285 ++++ b/arch/x86/mm/kasan_init_64.c
2286 +@@ -121,11 +121,16 @@ void __init kasan_init(void)
2287 + kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
2288 + (void *)KASAN_SHADOW_END);
2289 +
2290 +- memset(kasan_zero_page, 0, PAGE_SIZE);
2291 +-
2292 + load_cr3(init_level4_pgt);
2293 + __flush_tlb_all();
2294 +- init_task.kasan_depth = 0;
2295 +
2296 ++ /*
2297 ++ * kasan_zero_page has been used as early shadow memory, thus it may
2298 ++ * contain some garbage. Now we can clear it, since after the TLB flush
2299 ++ * no one should write to it.
2300 ++ */
2301 ++ memset(kasan_zero_page, 0, PAGE_SIZE);
2302 ++
2303 ++ init_task.kasan_depth = 0;
2304 + pr_info("KernelAddressSanitizer initialized\n");
2305 + }
2306 +diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
2307 +index b599a780a5a9..79377e2a7bcd 100644
2308 +--- a/arch/x86/mm/pageattr.c
2309 ++++ b/arch/x86/mm/pageattr.c
2310 +@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
2311 + #define CPA_FLUSHTLB 1
2312 + #define CPA_ARRAY 2
2313 + #define CPA_PAGES_ARRAY 4
2314 ++#define CPA_FREE_PAGETABLES 8
2315 +
2316 + #ifdef CONFIG_PROC_FS
2317 + static unsigned long direct_pages_count[PG_LEVEL_NUM];
2318 +@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
2319 + return 0;
2320 + }
2321 +
2322 +-static bool try_to_free_pte_page(pte_t *pte)
2323 ++static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
2324 + {
2325 + int i;
2326 +
2327 ++ if (!(cpa->flags & CPA_FREE_PAGETABLES))
2328 ++ return false;
2329 ++
2330 + for (i = 0; i < PTRS_PER_PTE; i++)
2331 + if (!pte_none(pte[i]))
2332 + return false;
2333 +@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte)
2334 + return true;
2335 + }
2336 +
2337 +-static bool try_to_free_pmd_page(pmd_t *pmd)
2338 ++static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
2339 + {
2340 + int i;
2341 +
2342 ++ if (!(cpa->flags & CPA_FREE_PAGETABLES))
2343 ++ return false;
2344 ++
2345 + for (i = 0; i < PTRS_PER_PMD; i++)
2346 + if (!pmd_none(pmd[i]))
2347 + return false;
2348 +@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud)
2349 + return true;
2350 + }
2351 +
2352 +-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
2353 ++static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
2354 ++ unsigned long start,
2355 ++ unsigned long end)
2356 + {
2357 + pte_t *pte = pte_offset_kernel(pmd, start);
2358 +
2359 +@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
2360 + pte++;
2361 + }
2362 +
2363 +- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
2364 ++ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
2365 + pmd_clear(pmd);
2366 + return true;
2367 + }
2368 + return false;
2369 + }
2370 +
2371 +-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
2372 ++static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
2373 + unsigned long start, unsigned long end)
2374 + {
2375 +- if (unmap_pte_range(pmd, start, end))
2376 +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
2377 ++ if (unmap_pte_range(cpa, pmd, start, end))
2378 ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
2379 + pud_clear(pud);
2380 + }
2381 +
2382 +-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2383 ++static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
2384 ++ unsigned long start, unsigned long end)
2385 + {
2386 + pmd_t *pmd = pmd_offset(pud, start);
2387 +
2388 +@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2389 + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
2390 + unsigned long pre_end = min_t(unsigned long, end, next_page);
2391 +
2392 +- __unmap_pmd_range(pud, pmd, start, pre_end);
2393 ++ __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
2394 +
2395 + start = pre_end;
2396 + pmd++;
2397 +@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2398 + if (pmd_large(*pmd))
2399 + pmd_clear(pmd);
2400 + else
2401 +- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
2402 ++ __unmap_pmd_range(cpa, pud, pmd,
2403 ++ start, start + PMD_SIZE);
2404 +
2405 + start += PMD_SIZE;
2406 + pmd++;
2407 +@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2408 + * 4K leftovers?
2409 + */
2410 + if (start < end)
2411 +- return __unmap_pmd_range(pud, pmd, start, end);
2412 ++ return __unmap_pmd_range(cpa, pud, pmd, start, end);
2413 +
2414 + /*
2415 + * Try again to free the PMD page if haven't succeeded above.
2416 + */
2417 + if (!pud_none(*pud))
2418 +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
2419 ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
2420 + pud_clear(pud);
2421 + }
2422 +
2423 +-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2424 ++static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
2425 ++ unsigned long start,
2426 ++ unsigned long end)
2427 + {
2428 + pud_t *pud = pud_offset(pgd, start);
2429 +
2430 +@@ -840,7 +853,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2431 + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
2432 + unsigned long pre_end = min_t(unsigned long, end, next_page);
2433 +
2434 +- unmap_pmd_range(pud, start, pre_end);
2435 ++ unmap_pmd_range(cpa, pud, start, pre_end);
2436 +
2437 + start = pre_end;
2438 + pud++;
2439 +@@ -854,7 +867,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2440 + if (pud_large(*pud))
2441 + pud_clear(pud);
2442 + else
2443 +- unmap_pmd_range(pud, start, start + PUD_SIZE);
2444 ++ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
2445 +
2446 + start += PUD_SIZE;
2447 + pud++;
2448 +@@ -864,7 +877,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2449 + * 2M leftovers?
2450 + */
2451 + if (start < end)
2452 +- unmap_pmd_range(pud, start, end);
2453 ++ unmap_pmd_range(cpa, pud, start, end);
2454 +
2455 + /*
2456 + * No need to try to free the PUD page because we'll free it in
2457 +@@ -872,6 +885,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2458 + */
2459 + }
2460 +
2461 ++static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2462 ++{
2463 ++ struct cpa_data cpa = {
2464 ++ .flags = CPA_FREE_PAGETABLES,
2465 ++ };
2466 ++
2467 ++ __unmap_pud_range(&cpa, pgd, start, end);
2468 ++}
2469 ++
2470 ++void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
2471 ++{
2472 ++ struct cpa_data cpa = {
2473 ++ .flags = 0,
2474 ++ };
2475 ++
2476 ++ __unmap_pud_range(&cpa, pgd, start, end);
2477 ++}
2478 ++
2479 + static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
2480 + {
2481 + pgd_t *pgd_entry = root + pgd_index(addr);
2482 +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
2483 +index fb0a9dd1d6e4..dbc27a2b4ad5 100644
2484 +--- a/arch/x86/mm/pgtable.c
2485 ++++ b/arch/x86/mm/pgtable.c
2486 +@@ -6,7 +6,7 @@
2487 + #include <asm/fixmap.h>
2488 + #include <asm/mtrr.h>
2489 +
2490 +-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
2491 ++#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
2492 +
2493 + #ifdef CONFIG_HIGHPTE
2494 + #define PGALLOC_USER_GFP __GFP_HIGHMEM
2495 +@@ -340,14 +340,24 @@ static inline void _pgd_free(pgd_t *pgd)
2496 + kmem_cache_free(pgd_cache, pgd);
2497 + }
2498 + #else
2499 ++
2500 ++/*
2501 ++ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
2502 ++ * both 8k in size and 8k-aligned. That lets us just flip bit 12
2503 ++ * in a pointer to swap between the two 4k halves.
2504 ++ */
2505 ++#define PGD_ALLOCATION_ORDER kaiser_enabled
2506 ++
2507 + static inline pgd_t *_pgd_alloc(void)
2508 + {
2509 +- return (pgd_t *)__get_free_page(PGALLOC_GFP);
2510 ++ /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
2511 ++ return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
2512 ++ PGD_ALLOCATION_ORDER);
2513 + }
2514 +
2515 + static inline void _pgd_free(pgd_t *pgd)
2516 + {
2517 +- free_page((unsigned long)pgd);
2518 ++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
2519 + }
2520 + #endif /* CONFIG_X86_PAE */
2521 +
2522 +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
2523 +index 7a4cdb632508..7cad01af6dcd 100644
2524 +--- a/arch/x86/mm/tlb.c
2525 ++++ b/arch/x86/mm/tlb.c
2526 +@@ -6,13 +6,14 @@
2527 + #include <linux/interrupt.h>
2528 + #include <linux/module.h>
2529 + #include <linux/cpu.h>
2530 ++#include <linux/debugfs.h>
2531 +
2532 + #include <asm/tlbflush.h>
2533 + #include <asm/mmu_context.h>
2534 + #include <asm/cache.h>
2535 + #include <asm/apic.h>
2536 + #include <asm/uv/uv.h>
2537 +-#include <linux/debugfs.h>
2538 ++#include <asm/kaiser.h>
2539 +
2540 + /*
2541 + * TLB flushing, formerly SMP-only
2542 +@@ -34,6 +35,36 @@ struct flush_tlb_info {
2543 + unsigned long flush_end;
2544 + };
2545 +
2546 ++static void load_new_mm_cr3(pgd_t *pgdir)
2547 ++{
2548 ++ unsigned long new_mm_cr3 = __pa(pgdir);
2549 ++
2550 ++ if (kaiser_enabled) {
2551 ++ /*
2552 ++ * We reuse the same PCID for different tasks, so we must
2553 ++ * flush all the entries for the PCID out when we change tasks.
2554 ++ * Flush KERN below, flush USER when returning to userspace in
2555 ++ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
2556 ++ *
2557 ++ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
2558 ++ * do it here, but can only be used if X86_FEATURE_INVPCID is
2559 ++ * available - and many machines support pcid without invpcid.
2560 ++ *
2561 ++ * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
2562 ++ * would be needed in the write_cr3() below - if PCIDs enabled.
2563 ++ */
2564 ++ BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
2565 ++ kaiser_flush_tlb_on_return_to_user();
2566 ++ }
2567 ++
2568 ++ /*
2569 ++ * Caution: many callers of this function expect
2570 ++ * that load_cr3() is serializing and orders TLB
2571 ++ * fills with respect to the mm_cpumask writes.
2572 ++ */
2573 ++ write_cr3(new_mm_cr3);
2574 ++}
2575 ++
2576 + /*
2577 + * We cannot call mmdrop() because we are in interrupt context,
2578 + * instead update mm->cpu_vm_mask.
2579 +@@ -45,7 +76,7 @@ void leave_mm(int cpu)
2580 + BUG();
2581 + if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
2582 + cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
2583 +- load_cr3(swapper_pg_dir);
2584 ++ load_new_mm_cr3(swapper_pg_dir);
2585 + /*
2586 + * This gets called in the idle path where RCU
2587 + * functions differently. Tracing normally
2588 +@@ -105,7 +136,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
2589 + * ordering guarantee we need.
2590 + *
2591 + */
2592 +- load_cr3(next->pgd);
2593 ++ load_new_mm_cr3(next->pgd);
2594 +
2595 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
2596 +
2597 +@@ -152,7 +183,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
2598 + * As above, load_cr3() is serializing and orders TLB
2599 + * fills with respect to the mm_cpumask write.
2600 + */
2601 +- load_cr3(next->pgd);
2602 ++ load_new_mm_cr3(next->pgd);
2603 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
2604 + load_mm_cr4(next);
2605 + load_mm_ldt(next);
2606 +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
2607 +index ef2e8c97e183..a461b6604fd9 100644
2608 +--- a/include/asm-generic/vmlinux.lds.h
2609 ++++ b/include/asm-generic/vmlinux.lds.h
2610 +@@ -725,7 +725,14 @@
2611 + */
2612 + #define PERCPU_INPUT(cacheline) \
2613 + VMLINUX_SYMBOL(__per_cpu_start) = .; \
2614 ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
2615 + *(.data..percpu..first) \
2616 ++ . = ALIGN(cacheline); \
2617 ++ *(.data..percpu..user_mapped) \
2618 ++ *(.data..percpu..user_mapped..shared_aligned) \
2619 ++ . = ALIGN(PAGE_SIZE); \
2620 ++ *(.data..percpu..user_mapped..page_aligned) \
2621 ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
2622 + . = ALIGN(PAGE_SIZE); \
2623 + *(.data..percpu..page_aligned) \
2624 + . = ALIGN(cacheline); \
2625 +diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
2626 +new file mode 100644
2627 +index 000000000000..58c55b1589d0
2628 +--- /dev/null
2629 ++++ b/include/linux/kaiser.h
2630 +@@ -0,0 +1,52 @@
2631 ++#ifndef _LINUX_KAISER_H
2632 ++#define _LINUX_KAISER_H
2633 ++
2634 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2635 ++#include <asm/kaiser.h>
2636 ++
2637 ++static inline int kaiser_map_thread_stack(void *stack)
2638 ++{
2639 ++ /*
2640 ++ * Map that page of kernel stack on which we enter from user context.
2641 ++ */
2642 ++ return kaiser_add_mapping((unsigned long)stack +
2643 ++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
2644 ++}
2645 ++
2646 ++static inline void kaiser_unmap_thread_stack(void *stack)
2647 ++{
2648 ++ /*
2649 ++ * Note: may be called even when kaiser_map_thread_stack() failed.
2650 ++ */
2651 ++ kaiser_remove_mapping((unsigned long)stack +
2652 ++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
2653 ++}
2654 ++#else
2655 ++
2656 ++/*
2657 ++ * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
2658 ++ * includes architectures that support KAISER, but have it disabled.
2659 ++ */
2660 ++
2661 ++static inline void kaiser_init(void)
2662 ++{
2663 ++}
2664 ++static inline int kaiser_add_mapping(unsigned long addr,
2665 ++ unsigned long size, unsigned long flags)
2666 ++{
2667 ++ return 0;
2668 ++}
2669 ++static inline void kaiser_remove_mapping(unsigned long start,
2670 ++ unsigned long size)
2671 ++{
2672 ++}
2673 ++static inline int kaiser_map_thread_stack(void *stack)
2674 ++{
2675 ++ return 0;
2676 ++}
2677 ++static inline void kaiser_unmap_thread_stack(void *stack)
2678 ++{
2679 ++}
2680 ++
2681 ++#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
2682 ++#endif /* _LINUX_KAISER_H */
2683 +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
2684 +index ff88d6189411..b93b578cfa42 100644
2685 +--- a/include/linux/mmzone.h
2686 ++++ b/include/linux/mmzone.h
2687 +@@ -131,8 +131,9 @@ enum zone_stat_item {
2688 + NR_SLAB_RECLAIMABLE,
2689 + NR_SLAB_UNRECLAIMABLE,
2690 + NR_PAGETABLE, /* used for pagetables */
2691 +- NR_KERNEL_STACK,
2692 + /* Second 128 byte cacheline */
2693 ++ NR_KERNEL_STACK,
2694 ++ NR_KAISERTABLE,
2695 + NR_UNSTABLE_NFS, /* NFS unstable pages */
2696 + NR_BOUNCE,
2697 + NR_VMSCAN_WRITE,
2698 +diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
2699 +index 8f16299ca068..8902f23bb770 100644
2700 +--- a/include/linux/percpu-defs.h
2701 ++++ b/include/linux/percpu-defs.h
2702 +@@ -35,6 +35,12 @@
2703 +
2704 + #endif
2705 +
2706 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION
2707 ++#define USER_MAPPED_SECTION "..user_mapped"
2708 ++#else
2709 ++#define USER_MAPPED_SECTION ""
2710 ++#endif
2711 ++
2712 + /*
2713 + * Base implementations of per-CPU variable declarations and definitions, where
2714 + * the section in which the variable is to be placed is provided by the
2715 +@@ -115,6 +121,12 @@
2716 + #define DEFINE_PER_CPU(type, name) \
2717 + DEFINE_PER_CPU_SECTION(type, name, "")
2718 +
2719 ++#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
2720 ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
2721 ++
2722 ++#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
2723 ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
2724 ++
2725 + /*
2726 + * Declaration/definition used for per-CPU variables that must come first in
2727 + * the set of variables.
2728 +@@ -144,6 +156,14 @@
2729 + DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
2730 + ____cacheline_aligned_in_smp
2731 +
2732 ++#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
2733 ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
2734 ++ ____cacheline_aligned_in_smp
2735 ++
2736 ++#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
2737 ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
2738 ++ ____cacheline_aligned_in_smp
2739 ++
2740 + #define DECLARE_PER_CPU_ALIGNED(type, name) \
2741 + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
2742 + ____cacheline_aligned
2743 +@@ -162,11 +182,21 @@
2744 + #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
2745 + DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
2746 + __aligned(PAGE_SIZE)
2747 ++/*
2748 ++ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
2749 ++ */
2750 ++#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
2751 ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
2752 ++ __aligned(PAGE_SIZE)
2753 ++
2754 ++#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
2755 ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
2756 ++ __aligned(PAGE_SIZE)
2757 +
2758 + /*
2759 + * Declaration/definition used for per-CPU variables that must be read mostly.
2760 + */
2761 +-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
2762 ++#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
2763 + DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
2764 +
2765 + #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
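
The new *_USER_MAPPED per-CPU variants above place their variables in the .data..percpu..user_mapped* input sections that the vmlinux.lds.h hunk collects between __per_cpu_user_mapped_start and __per_cpu_user_mapped_end, which kaiser_init() then maps into the shadow page tables for every possible CPU. A short, hedged example of defining such a variable; the variable name is made up, while the real users in this patch are vector_irq, cpu_tss and unsafe_stack_register_backup:

    /* In a header shared by users of the variable: */
    DECLARE_PER_CPU_USER_MAPPED(unsigned long, my_entry_scratch);

    /* In exactly one .c file; this lands in .data..percpu..user_mapped,
     * so it stays addressable while running on the user CR3: */
    DEFINE_PER_CPU_USER_MAPPED(unsigned long, my_entry_scratch);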
2766 +diff --git a/init/main.c b/init/main.c
2767 +index 9e64d7097f1a..49926d95442f 100644
2768 +--- a/init/main.c
2769 ++++ b/init/main.c
2770 +@@ -81,6 +81,7 @@
2771 + #include <linux/integrity.h>
2772 + #include <linux/proc_ns.h>
2773 + #include <linux/io.h>
2774 ++#include <linux/kaiser.h>
2775 +
2776 + #include <asm/io.h>
2777 + #include <asm/bugs.h>
2778 +@@ -492,6 +493,7 @@ static void __init mm_init(void)
2779 + pgtable_init();
2780 + vmalloc_init();
2781 + ioremap_huge_init();
2782 ++ kaiser_init();
2783 + }
2784 +
2785 + asmlinkage __visible void __init start_kernel(void)
2786 +diff --git a/kernel/fork.c b/kernel/fork.c
2787 +index 68cfda1c1800..ac00f14208b7 100644
2788 +--- a/kernel/fork.c
2789 ++++ b/kernel/fork.c
2790 +@@ -58,6 +58,7 @@
2791 + #include <linux/tsacct_kern.h>
2792 + #include <linux/cn_proc.h>
2793 + #include <linux/freezer.h>
2794 ++#include <linux/kaiser.h>
2795 + #include <linux/delayacct.h>
2796 + #include <linux/taskstats_kern.h>
2797 + #include <linux/random.h>
2798 +@@ -169,6 +170,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
2799 +
2800 + static inline void free_thread_info(struct thread_info *ti)
2801 + {
2802 ++ kaiser_unmap_thread_stack(ti);
2803 + free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
2804 + }
2805 + # else
2806 +@@ -352,6 +354,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
2807 + goto free_ti;
2808 +
2809 + tsk->stack = ti;
2810 ++
2811 ++ err = kaiser_map_thread_stack(tsk->stack);
2812 ++ if (err)
2813 ++ goto free_ti;
2814 + #ifdef CONFIG_SECCOMP
2815 + /*
2816 + * We must handle setting up seccomp filters once we're under
2817 +diff --git a/mm/vmstat.c b/mm/vmstat.c
2818 +index c344e3609c53..324b7e90b4c5 100644
2819 +--- a/mm/vmstat.c
2820 ++++ b/mm/vmstat.c
2821 +@@ -736,6 +736,7 @@ const char * const vmstat_text[] = {
2822 + "nr_slab_unreclaimable",
2823 + "nr_page_table_pages",
2824 + "nr_kernel_stack",
2825 ++ "nr_overhead",
2826 + "nr_unstable",
2827 + "nr_bounce",
2828 + "nr_vmscan_write",
2829 +diff --git a/security/Kconfig b/security/Kconfig
2830 +index e45237897b43..a3ebb6ee5bd5 100644
2831 +--- a/security/Kconfig
2832 ++++ b/security/Kconfig
2833 +@@ -31,6 +31,16 @@ config SECURITY
2834 +
2835 + If you are unsure how to answer this question, answer N.
2836 +
2837 ++config PAGE_TABLE_ISOLATION
2838 ++ bool "Remove the kernel mapping in user mode"
2839 ++ default y
2840 ++ depends on X86_64 && SMP
2841 ++ help
2842 ++ This enforces a strict kernel and user space isolation, in order
2843 ++ to close hardware side channels on kernel address information.
2844 ++
2845 ++ If you are unsure how to answer this question, answer Y.
2846 ++
2847 + config SECURITYFS
2848 + bool "Enable the securityfs filesystem"
2849 + help