Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:5.13 commit in: /
Date: Mon, 12 Jul 2021 11:36:24
Message-Id: 1626089740.5104561ec6c463e866a47f10f183292270afe519.mpagano@gentoo
1 commit: 5104561ec6c463e866a47f10f183292270afe519
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Mon Jul 12 11:35:40 2021 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Mon Jul 12 11:35:40 2021 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=5104561e
7
8 Add BMQ v5.13-r1 and gentoo default (to off) patch
9
10 Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>
11
12 0000_README | 8 +
13 5020_BMQ-and-PDS-io-scheduler-v5.13-r1.patch | 9523 ++++++++++++++++++++++++++
14 5021_BMQ-and-PDS-gentoo-defaults.patch | 13 +
15 3 files changed, 9544 insertions(+)
16
17 diff --git a/0000_README b/0000_README
18 index d3e2ab4..f92bd44 100644
19 --- a/0000_README
20 +++ b/0000_README
21 @@ -75,3 +75,11 @@ Patch: 5010_enable-cpu-optimizations-universal.patch
22 From: https://github.com/graysky2/kernel_gcc_patch/
23 Desc: Kernel >= 5.8 patch enables gcc = v9+ optimizations for additional CPUs.
24
25 +Patch: 5020_BMQ-and-PDS-io-scheduler-v5.13-r1.patch
26 +From: https://gitlab.com/alfredchen/linux-prjc
27 +Desc: BMQ(BitMap Queue) Scheduler. A new CPU scheduler developed from PDS(incld). Inspired by the scheduler in zircon.
28 +
29 +Patch: 5021_BMQ-and-PDS-gentoo-defaults.patch
30 +From: https://gitweb.gentoo.org/proj/linux-patches.git/
31 +Desc: Set defaults for BMQ. Add archs as people test, default to N
32 +
33
34 diff --git a/5020_BMQ-and-PDS-io-scheduler-v5.13-r1.patch b/5020_BMQ-and-PDS-io-scheduler-v5.13-r1.patch
35 new file mode 100644
36 index 0000000..82d7f5a
37 --- /dev/null
38 +++ b/5020_BMQ-and-PDS-io-scheduler-v5.13-r1.patch
39 @@ -0,0 +1,9523 @@
40 +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
41 +index cb89dbdedc46..37192ffbd3f8 100644
42 +--- a/Documentation/admin-guide/kernel-parameters.txt
43 ++++ b/Documentation/admin-guide/kernel-parameters.txt
44 +@@ -4878,6 +4878,12 @@
45 +
46 + sbni= [NET] Granch SBNI12 leased line adapter
47 +
48 ++ sched_timeslice=
49 ++ [KNL] Time slice in ms for BMQ/PDS scheduler.
50 ++ Format: integer 2, 4
51 ++ Default: 4
52 ++ See Documentation/scheduler/sched-BMQ.txt
53 ++
54 + sched_verbose [KNL] Enables verbose scheduler debug messages.
55 +
56 + schedstats= [KNL,X86] Enable or disable scheduled statistics.
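As a usage illustration, assuming the patch is applied and SCHED_ALT/BMQ is enabled: the time slice can be switched to 2 ms by appending the parameter to the kernel command line (the kernel image and root device below are placeholders). Per sched_timeslice() in alt_core.c further down in this patch, any value other than 2 simply falls back to the default of 4 ms.

    linux /boot/vmlinuz-5.13.1-gentoo root=/dev/sda2 ro sched_timeslice=2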
57 +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
58 +index 68b21395a743..0c14a4544fd6 100644
59 +--- a/Documentation/admin-guide/sysctl/kernel.rst
60 ++++ b/Documentation/admin-guide/sysctl/kernel.rst
61 +@@ -1527,3 +1527,13 @@ is 10 seconds.
62 +
63 + The softlockup threshold is (``2 * watchdog_thresh``). Setting this
64 + tunable to zero will disable lockup detection altogether.
65 ++
66 ++yield_type:
67 ++===========
68 ++
69 ++BMQ/PDS CPU scheduler only. This determines what type of yield a call
70 ++to sched_yield() will perform.
71 ++
72 ++ 0 - No yield.
73 ++ 1 - Deboost and requeue task. (default)
74 ++ 2 - Set run queue skip task.
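For illustration, since this entry documents a /proc/sys/kernel tunable, the yield behaviour can be inspected and changed at runtime once the BMQ/PDS scheduler is built in (the path assumes the sysctl is registered as kernel.yield_type, as this file's location suggests):

    # cat /proc/sys/kernel/yield_type
    1
    # echo 2 > /proc/sys/kernel/yield_type    (equivalently: sysctl -w kernel.yield_type=2)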
75 +diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt
76 +new file mode 100644
77 +index 000000000000..05c84eec0f31
78 +--- /dev/null
79 ++++ b/Documentation/scheduler/sched-BMQ.txt
80 +@@ -0,0 +1,110 @@
81 ++ BitMap queue CPU Scheduler
82 ++ --------------------------
83 ++
84 ++CONTENT
85 ++========
86 ++
87 ++ Background
88 ++ Design
89 ++ Overview
90 ++ Task policy
91 ++ Priority management
92 ++ BitMap Queue
93 ++ CPU Assignment and Migration
94 ++
95 ++
96 ++Background
97 ++==========
98 ++
99 ++The BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution
100 ++of the previous Priority and Deadline based Skiplist multiple queue scheduler
101 ++(PDS), and is inspired by the Zircon scheduler. Its goal is to keep the scheduler
102 ++code simple while remaining efficient and scalable for interactive tasks such as
103 ++desktop use, movie playback and gaming.
104 ++
105 ++Design
106 ++======
107 ++
108 ++Overview
109 ++--------
110 ++
111 ++BMQ uses a per-CPU run queue design: each (logical) CPU has its own run queue,
112 ++and each CPU is responsible for scheduling the tasks that are put into its
113 ++run queue.
114 ++
115 ++The run queue is a set of priority queues. Note that these queues are FIFO
116 ++queues for non-rt tasks and priority queues for rt tasks; see BitMap Queue
117 ++below for details. BMQ is optimized for non-rt tasks, given the fact that
118 ++most applications are non-rt tasks. Whether a queue is FIFO or priority,
119 ++each queue is an ordered list of runnable tasks awaiting execution, and the
120 ++data structures are the same. When it is time for a new task to run, the
121 ++scheduler simply looks for the lowest numbered queue that contains a task
122 ++and runs the first task from the head of that queue. The per-CPU idle task
123 ++is also kept in the run queue, so the scheduler can always find a task to
124 ++run from its run queue.
125 ++
126 ++Each task will be assigned the same timeslice (default 4 ms) when it is picked
127 ++to start running. A task will be reinserted at the end of the appropriate priority
128 ++queue when it uses its whole timeslice. When the scheduler selects a new task
129 ++from the priority queue it sets the CPU's preemption timer for the remainder of
130 ++the previous timeslice. When that timer fires the scheduler will stop execution
131 ++on that task, select another task and start over again.
132 ++
133 ++If a task blocks waiting for a shared resource then it's taken out of its
134 ++priority queue and is placed in a wait queue for the shared resource. When it
135 ++is unblocked it will be reinserted in the appropriate priority queue of an
136 ++eligible CPU.
137 ++
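To make the queue-selection step described above concrete, here is a minimal user-space C sketch of "find the lowest numbered non-empty queue, then run the task at its head", using one bit per priority queue. This is an illustration only -- the real implementation is sched_rq_first_task() in kernel/sched/alt_core.c later in this patch, which uses find_first_bit() over rq->queue.bitmap.

    #include <stdio.h>

    /* Bit i set in 'bitmap' means priority queue i holds at least one runnable task. */
    static int pick_next_queue(unsigned long bitmap)
    {
            if (!bitmap)
                    return -1;              /* cannot happen in BMQ: the idle task is always queued */
            return __builtin_ctzl(bitmap);  /* index of the lowest set bit = highest-priority queue */
    }

    int main(void)
    {
            unsigned long bitmap = 0;

            bitmap |= 1UL << 20;            /* a task queued at priority level 20 */
            bitmap |= 1UL << 38;            /* another task at level 38 */
            printf("run the head of queue %d\n", pick_next_queue(bitmap));   /* prints 20 */
            return 0;
    }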
138 ++Task policy
139 ++-----------
140 ++
141 ++BMQ supports the DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policies, like
142 ++the mainline CFS scheduler. But BMQ is heavily optimized for non-rt tasks, that
143 ++is, NORMAL/BATCH/IDLE policy tasks. Below are the implementation details of each
144 ++policy.
145 ++
146 ++DEADLINE
147 ++ It is squashed as priority 0 FIFO task.
148 ++
149 ++FIFO/RR
150 ++ All RT tasks share one single priority queue in the BMQ run queue design. The
151 ++complexity of the insert operation is O(n). BMQ is not designed for systems that
152 ++run mostly rt policy tasks.
153 ++
154 ++NORMAL/BATCH/IDLE
155 ++ BATCH and IDLE tasks are treated as the same policy. They compete for CPU with
156 ++NORMAL policy tasks, but they just don't get boosted. To control the priority of
157 ++NORMAL/BATCH/IDLE tasks, simply use the nice level.
158 ++
159 ++ISO
160 ++ The ISO policy is not supported in BMQ. Please use a nice level -20 NORMAL
161 ++policy task instead.
162 ++
163 ++Priority management
164 ++-------------------
165 ++
166 ++RT tasks have priorities from 0 to 99. For non-rt tasks, there are two different
167 ++factors used to determine the effective priority of a task. The effective
168 ++priority is what is used to determine which queue the task will be in.
169 ++
170 ++The first factor is simply the task's static priority, which is assigned from
171 ++the task's nice level: [-20, 19] from userland's point of view and [0, 39]
172 ++internally.
173 ++
174 ++The second factor is the priority boost. This is a value bounded between
175 ++[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority; it is
176 ++modified in the following cases:
177 ++
178 ++*When a thread has used up its entire timeslice, always deboost its boost by
179 ++increasing it by one.
180 ++*When a thread gives up cpu control (voluntarily or involuntarily) to reschedule,
181 ++and its switch-in time (time after it was last switched in and ran) is below the
182 ++threshold based on its priority boost, boost its boost by decreasing it by one,
183 ++staying within the bounds given above.
184 ++
185 ++The intent in this system is to ensure that interactive threads are serviced
186 ++quickly. These are usually the threads that interact directly with the user
187 ++and cause user-perceivable latency. These threads usually do little work and
188 ++spend most of their time blocked awaiting another user event. So they get the
189 ++priority boost from unblocking while background threads that do most of the
190 ++processing receive the priority penalty for using their entire timeslice.
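As a rough illustration of how the two factors combine, the sketch below clamps a boost value into [-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] and offsets the static priority with it to pick a queue. This is a deliberately simplified, hypothetical model -- the actual mapping is done by task_sched_prio_idx() and the boost handling in the bmq.h header that alt_core.c includes under CONFIG_SCHED_BMQ, and the numbers below are only examples.

    #include <stdio.h>

    #define MAX_PRIORITY_ADJ 7                     /* BMQ value from this patch */

    static int clamp_boost(int boost)
    {
            if (boost < -MAX_PRIORITY_ADJ) return -MAX_PRIORITY_ADJ;
            if (boost >  MAX_PRIORITY_ADJ) return  MAX_PRIORITY_ADJ;
            return boost;
    }

    /* Toy effective priority: static priority (0..39 from nice) offset by the boost.
     * A lower value means an earlier, higher-priority queue. */
    static int effective_prio(int static_prio, int boost)
    {
            return static_prio + clamp_boost(boost);
    }

    int main(void)
    {
            /* nice-0 task that keeps blocking before its timeslice ends: boost drifts down */
            printf("interactive-ish: queue %d\n", effective_prio(20, -7));   /* 13 */
            /* nice-0 task that always burns its full timeslice: boost drifts up */
            printf("batch-like:      queue %d\n", effective_prio(20, +7));   /* 27 */
            return 0;
    }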
191 +diff --git a/fs/proc/base.c b/fs/proc/base.c
192 +index 9cbd915025ad..f4f05b4cb2af 100644
193 +--- a/fs/proc/base.c
194 ++++ b/fs/proc/base.c
195 +@@ -476,7 +476,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
196 + seq_puts(m, "0 0 0\n");
197 + else
198 + seq_printf(m, "%llu %llu %lu\n",
199 +- (unsigned long long)task->se.sum_exec_runtime,
200 ++ (unsigned long long)tsk_seruntime(task),
201 + (unsigned long long)task->sched_info.run_delay,
202 + task->sched_info.pcount);
203 +
204 +diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h
205 +index 8874f681b056..59eb72bf7d5f 100644
206 +--- a/include/asm-generic/resource.h
207 ++++ b/include/asm-generic/resource.h
208 +@@ -23,7 +23,7 @@
209 + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \
210 + [RLIMIT_SIGPENDING] = { 0, 0 }, \
211 + [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \
212 +- [RLIMIT_NICE] = { 0, 0 }, \
213 ++ [RLIMIT_NICE] = { 30, 30 }, \
214 + [RLIMIT_RTPRIO] = { 0, 0 }, \
215 + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \
216 + }
217 +diff --git a/include/linux/sched.h b/include/linux/sched.h
218 +index 32813c345115..35f7cfe6539a 100644
219 +--- a/include/linux/sched.h
220 ++++ b/include/linux/sched.h
221 +@@ -678,12 +678,18 @@ struct task_struct {
222 + unsigned int ptrace;
223 +
224 + #ifdef CONFIG_SMP
225 +- int on_cpu;
226 + struct __call_single_node wake_entry;
227 ++#endif
228 ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT)
229 ++ int on_cpu;
230 ++#endif
231 ++
232 ++#ifdef CONFIG_SMP
233 + #ifdef CONFIG_THREAD_INFO_IN_TASK
234 + /* Current CPU: */
235 + unsigned int cpu;
236 + #endif
237 ++#ifndef CONFIG_SCHED_ALT
238 + unsigned int wakee_flips;
239 + unsigned long wakee_flip_decay_ts;
240 + struct task_struct *last_wakee;
241 +@@ -697,6 +703,7 @@ struct task_struct {
242 + */
243 + int recent_used_cpu;
244 + int wake_cpu;
245 ++#endif /* !CONFIG_SCHED_ALT */
246 + #endif
247 + int on_rq;
248 +
249 +@@ -705,13 +712,28 @@ struct task_struct {
250 + int normal_prio;
251 + unsigned int rt_priority;
252 +
253 ++#ifdef CONFIG_SCHED_ALT
254 ++ u64 last_ran;
255 ++ s64 time_slice;
256 ++ int sq_idx;
257 ++ struct list_head sq_node;
258 ++#ifdef CONFIG_SCHED_BMQ
259 ++ int boost_prio;
260 ++#endif /* CONFIG_SCHED_BMQ */
261 ++#ifdef CONFIG_SCHED_PDS
262 ++ u64 deadline;
263 ++#endif /* CONFIG_SCHED_PDS */
264 ++ /* sched_clock time spent running */
265 ++ u64 sched_time;
266 ++#else /* !CONFIG_SCHED_ALT */
267 + const struct sched_class *sched_class;
268 + struct sched_entity se;
269 + struct sched_rt_entity rt;
270 ++ struct sched_dl_entity dl;
271 ++#endif
272 + #ifdef CONFIG_CGROUP_SCHED
273 + struct task_group *sched_task_group;
274 + #endif
275 +- struct sched_dl_entity dl;
276 +
277 + #ifdef CONFIG_UCLAMP_TASK
278 + /*
279 +@@ -1407,6 +1429,15 @@ struct task_struct {
280 + */
281 + };
282 +
283 ++#ifdef CONFIG_SCHED_ALT
284 ++#define tsk_seruntime(t) ((t)->sched_time)
285 ++/* replace the uncertian rt_timeout with 0UL */
286 ++#define tsk_rttimeout(t) (0UL)
287 ++#else /* CFS */
288 ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime)
289 ++#define tsk_rttimeout(t) ((t)->rt.timeout)
290 ++#endif /* !CONFIG_SCHED_ALT */
291 ++
292 + static inline struct pid *task_pid(struct task_struct *task)
293 + {
294 + return task->thread_pid;
295 +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
296 +index 1aff00b65f3c..216fdf2fe90c 100644
297 +--- a/include/linux/sched/deadline.h
298 ++++ b/include/linux/sched/deadline.h
299 +@@ -1,5 +1,24 @@
300 + /* SPDX-License-Identifier: GPL-2.0 */
301 +
302 ++#ifdef CONFIG_SCHED_ALT
303 ++
304 ++static inline int dl_task(struct task_struct *p)
305 ++{
306 ++ return 0;
307 ++}
308 ++
309 ++#ifdef CONFIG_SCHED_BMQ
310 ++#define __tsk_deadline(p) (0UL)
311 ++#endif
312 ++
313 ++#ifdef CONFIG_SCHED_PDS
314 ++#define __tsk_deadline(p) ((((u64) ((p)->prio))<<56) | (p)->deadline)
315 ++#endif
316 ++
317 ++#else
318 ++
319 ++#define __tsk_deadline(p) ((p)->dl.deadline)
320 ++
321 + /*
322 + * SCHED_DEADLINE tasks has negative priorities, reflecting
323 + * the fact that any of them has higher prio than RT and
324 +@@ -19,6 +38,7 @@ static inline int dl_task(struct task_struct *p)
325 + {
326 + return dl_prio(p->prio);
327 + }
328 ++#endif /* CONFIG_SCHED_ALT */
329 +
330 + static inline bool dl_time_before(u64 a, u64 b)
331 + {
332 +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
333 +index ab83d85e1183..6af9ae681116 100644
334 +--- a/include/linux/sched/prio.h
335 ++++ b/include/linux/sched/prio.h
336 +@@ -18,6 +18,32 @@
337 + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
338 + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
339 +
340 ++#ifdef CONFIG_SCHED_ALT
341 ++
342 ++/* Undefine MAX_PRIO and DEFAULT_PRIO */
343 ++#undef MAX_PRIO
344 ++#undef DEFAULT_PRIO
345 ++
346 ++/* +/- priority levels from the base priority */
347 ++#ifdef CONFIG_SCHED_BMQ
348 ++#define MAX_PRIORITY_ADJ (7)
349 ++
350 ++#define MIN_NORMAL_PRIO (MAX_RT_PRIO)
351 ++#define MAX_PRIO (MIN_NORMAL_PRIO + NICE_WIDTH)
352 ++#define DEFAULT_PRIO (MIN_NORMAL_PRIO + NICE_WIDTH / 2)
353 ++#endif
354 ++
355 ++#ifdef CONFIG_SCHED_PDS
356 ++#define MAX_PRIORITY_ADJ (0)
357 ++
358 ++#define MIN_NORMAL_PRIO (128)
359 ++#define NORMAL_PRIO_NUM (64)
360 ++#define MAX_PRIO (MIN_NORMAL_PRIO + NORMAL_PRIO_NUM)
361 ++#define DEFAULT_PRIO (MAX_PRIO - NICE_WIDTH / 2)
362 ++#endif
363 ++
364 ++#endif /* CONFIG_SCHED_ALT */
365 ++
366 + /*
367 + * Convert user-nice values [ -20 ... 0 ... 19 ]
368 + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
369 +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
370 +index e5af028c08b4..0a7565d0d3cf 100644
371 +--- a/include/linux/sched/rt.h
372 ++++ b/include/linux/sched/rt.h
373 +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
374 +
375 + if (policy == SCHED_FIFO || policy == SCHED_RR)
376 + return true;
377 ++#ifndef CONFIG_SCHED_ALT
378 + if (policy == SCHED_DEADLINE)
379 + return true;
380 ++#endif
381 + return false;
382 + }
383 +
384 +diff --git a/init/Kconfig b/init/Kconfig
385 +index a61c92066c2e..7746c8d4610b 100644
386 +--- a/init/Kconfig
387 ++++ b/init/Kconfig
388 +@@ -783,9 +783,39 @@ config GENERIC_SCHED_CLOCK
389 +
390 + menu "Scheduler features"
391 +
392 ++menuconfig SCHED_ALT
393 ++ bool "Alternative CPU Schedulers"
394 ++ default y
395 ++ help
396 ++ This feature enables the alternative CPU schedulers.
397 ++
398 ++if SCHED_ALT
399 ++
400 ++choice
401 ++ prompt "Alternative CPU Scheduler"
402 ++ default SCHED_BMQ
403 ++
404 ++config SCHED_BMQ
405 ++ bool "BMQ CPU scheduler"
406 ++ help
407 ++ The BitMap Queue CPU scheduler for excellent interactivity and
408 ++ responsiveness on the desktop and solid scalability on normal
409 ++ hardware and commodity servers.
410 ++
411 ++config SCHED_PDS
412 ++ bool "PDS CPU scheduler"
413 ++ help
414 ++ The Priority and Deadline based Skip list multiple queue CPU
415 ++ Scheduler.
416 ++
417 ++endchoice
418 ++
419 ++endif
420 ++
421 + config UCLAMP_TASK
422 + bool "Enable utilization clamping for RT/FAIR tasks"
423 + depends on CPU_FREQ_GOV_SCHEDUTIL
424 ++ depends on !SCHED_ALT
425 + help
426 + This feature enables the scheduler to track the clamped utilization
427 + of each CPU based on RUNNABLE tasks scheduled on that CPU.
428 +@@ -871,6 +901,7 @@ config NUMA_BALANCING
429 + depends on ARCH_SUPPORTS_NUMA_BALANCING
430 + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
431 + depends on SMP && NUMA && MIGRATION
432 ++ depends on !SCHED_ALT
433 + help
434 + This option adds support for automatic NUMA aware memory/task placement.
435 + The mechanism is quite primitive and is based on migrating memory when
436 +@@ -963,6 +994,7 @@ config FAIR_GROUP_SCHED
437 + depends on CGROUP_SCHED
438 + default CGROUP_SCHED
439 +
440 ++if !SCHED_ALT
441 + config CFS_BANDWIDTH
442 + bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
443 + depends on FAIR_GROUP_SCHED
444 +@@ -985,6 +1017,7 @@ config RT_GROUP_SCHED
445 + realtime bandwidth for them.
446 + See Documentation/scheduler/sched-rt-group.rst for more information.
447 +
448 ++endif #!SCHED_ALT
449 + endif #CGROUP_SCHED
450 +
451 + config UCLAMP_TASK_GROUP
452 +@@ -1228,6 +1261,7 @@ config CHECKPOINT_RESTORE
453 +
454 + config SCHED_AUTOGROUP
455 + bool "Automatic process group scheduling"
456 ++ depends on !SCHED_ALT
457 + select CGROUPS
458 + select CGROUP_SCHED
459 + select FAIR_GROUP_SCHED
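For reference, selecting BMQ after this Kconfig change amounts to a configuration fragment like the one below (illustrative; per the 0000_README entry above, the separate Gentoo defaults patch turns the default off, so the option has to be enabled explicitly). Note also that UCLAMP_TASK, NUMA_BALANCING, SCHED_AUTOGROUP and the CFS_BANDWIDTH/RT_GROUP_SCHED group-scheduling options all gain a dependency on !SCHED_ALT in the hunks above, so they drop out of the configuration once SCHED_ALT is selected.

    CONFIG_SCHED_ALT=y
    CONFIG_SCHED_BMQ=y
    # CONFIG_SCHED_PDS is not set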
460 +diff --git a/init/init_task.c b/init/init_task.c
461 +index 8b08c2e19cbb..0dfa1a63dc4e 100644
462 +--- a/init/init_task.c
463 ++++ b/init/init_task.c
464 +@@ -75,9 +75,15 @@ struct task_struct init_task
465 + .stack = init_stack,
466 + .usage = REFCOUNT_INIT(2),
467 + .flags = PF_KTHREAD,
468 ++#ifdef CONFIG_SCHED_ALT
469 ++ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ,
470 ++ .static_prio = DEFAULT_PRIO,
471 ++ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ,
472 ++#else
473 + .prio = MAX_PRIO - 20,
474 + .static_prio = MAX_PRIO - 20,
475 + .normal_prio = MAX_PRIO - 20,
476 ++#endif
477 + .policy = SCHED_NORMAL,
478 + .cpus_ptr = &init_task.cpus_mask,
479 + .cpus_mask = CPU_MASK_ALL,
480 +@@ -87,6 +93,17 @@ struct task_struct init_task
481 + .restart_block = {
482 + .fn = do_no_restart_syscall,
483 + },
484 ++#ifdef CONFIG_SCHED_ALT
485 ++ .sq_node = LIST_HEAD_INIT(init_task.sq_node),
486 ++#ifdef CONFIG_SCHED_BMQ
487 ++ .boost_prio = 0,
488 ++ .sq_idx = 15,
489 ++#endif
490 ++#ifdef CONFIG_SCHED_PDS
491 ++ .deadline = 0,
492 ++#endif
493 ++ .time_slice = HZ,
494 ++#else
495 + .se = {
496 + .group_node = LIST_HEAD_INIT(init_task.se.group_node),
497 + },
498 +@@ -94,6 +111,7 @@ struct task_struct init_task
499 + .run_list = LIST_HEAD_INIT(init_task.rt.run_list),
500 + .time_slice = RR_TIMESLICE,
501 + },
502 ++#endif
503 + .tasks = LIST_HEAD_INIT(init_task.tasks),
504 + #ifdef CONFIG_SMP
505 + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO),
506 +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
507 +index adb5190c4429..8c02bce63146 100644
508 +--- a/kernel/cgroup/cpuset.c
509 ++++ b/kernel/cgroup/cpuset.c
510 +@@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
511 + return ret;
512 + }
513 +
514 +-#ifdef CONFIG_SMP
515 ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT)
516 + /*
517 + * Helper routine for generate_sched_domains().
518 + * Do cpusets a, b have overlapping effective cpus_allowed masks?
519 +@@ -1032,7 +1032,7 @@ static void rebuild_sched_domains_locked(void)
520 + /* Have scheduler rebuild the domains */
521 + partition_and_rebuild_sched_domains(ndoms, doms, attr);
522 + }
523 +-#else /* !CONFIG_SMP */
524 ++#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */
525 + static void rebuild_sched_domains_locked(void)
526 + {
527 + }
528 +diff --git a/kernel/delayacct.c b/kernel/delayacct.c
529 +index 27725754ac99..769d773c7182 100644
530 +--- a/kernel/delayacct.c
531 ++++ b/kernel/delayacct.c
532 +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
533 + */
534 + t1 = tsk->sched_info.pcount;
535 + t2 = tsk->sched_info.run_delay;
536 +- t3 = tsk->se.sum_exec_runtime;
537 ++ t3 = tsk_seruntime(tsk);
538 +
539 + d->cpu_count += t1;
540 +
541 +diff --git a/kernel/exit.c b/kernel/exit.c
542 +index 65809fac3038..9504db57d878 100644
543 +--- a/kernel/exit.c
544 ++++ b/kernel/exit.c
545 +@@ -122,7 +122,7 @@ static void __exit_signal(struct task_struct *tsk)
546 + sig->curr_target = next_thread(tsk);
547 + }
548 +
549 +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
550 ++ add_device_randomness((const void*) &tsk_seruntime(tsk),
551 + sizeof(unsigned long long));
552 +
553 + /*
554 +@@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk)
555 + sig->inblock += task_io_get_inblock(tsk);
556 + sig->oublock += task_io_get_oublock(tsk);
557 + task_io_accounting_add(&sig->ioac, &tsk->ioac);
558 +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
559 ++ sig->sum_sched_runtime += tsk_seruntime(tsk);
560 + sig->nr_threads--;
561 + __unhash_process(tsk, group_dead);
562 + write_sequnlock(&sig->stats_lock);
563 +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
564 +index 3a4beb9395c4..98a709628cb3 100644
565 +--- a/kernel/livepatch/transition.c
566 ++++ b/kernel/livepatch/transition.c
567 +@@ -307,7 +307,11 @@ static bool klp_try_switch_task(struct task_struct *task)
568 + */
569 + rq = task_rq_lock(task, &flags);
570 +
571 ++#ifdef CONFIG_SCHED_ALT
572 ++ if (task_running(task) && task != current) {
573 ++#else
574 + if (task_running(rq, task) && task != current) {
575 ++#endif
576 + snprintf(err_buf, STACK_ERR_BUF_SIZE,
577 + "%s: %s:%d is running\n", __func__, task->comm,
578 + task->pid);
579 +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
580 +index 406818196a9f..31c46750fa94 100644
581 +--- a/kernel/locking/rtmutex.c
582 ++++ b/kernel/locking/rtmutex.c
583 +@@ -227,14 +227,18 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
584 + * Only use with rt_mutex_waiter_{less,equal}()
585 + */
586 + #define task_to_waiter(p) \
587 +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
588 ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) }
589 +
590 + static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
591 + struct rt_mutex_waiter *right)
592 + {
593 ++#ifdef CONFIG_SCHED_PDS
594 ++ return (left->deadline < right->deadline);
595 ++#else
596 + if (left->prio < right->prio)
597 + return 1;
598 +
599 ++#ifndef CONFIG_SCHED_BMQ
600 + /*
601 + * If both waiters have dl_prio(), we check the deadlines of the
602 + * associated tasks.
603 +@@ -243,16 +247,22 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
604 + */
605 + if (dl_prio(left->prio))
606 + return dl_time_before(left->deadline, right->deadline);
607 ++#endif
608 +
609 + return 0;
610 ++#endif
611 + }
612 +
613 + static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
614 + struct rt_mutex_waiter *right)
615 + {
616 ++#ifdef CONFIG_SCHED_PDS
617 ++ return (left->deadline == right->deadline);
618 ++#else
619 + if (left->prio != right->prio)
620 + return 0;
621 +
622 ++#ifndef CONFIG_SCHED_BMQ
623 + /*
624 + * If both waiters have dl_prio(), we check the deadlines of the
625 + * associated tasks.
626 +@@ -261,8 +271,10 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
627 + */
628 + if (dl_prio(left->prio))
629 + return left->deadline == right->deadline;
630 ++#endif
631 +
632 + return 1;
633 ++#endif
634 + }
635 +
636 + #define __node_2_waiter(node) \
637 +@@ -654,7 +666,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
638 + * the values of the node being removed.
639 + */
640 + waiter->prio = task->prio;
641 +- waiter->deadline = task->dl.deadline;
642 ++ waiter->deadline = __tsk_deadline(task);
643 +
644 + rt_mutex_enqueue(lock, waiter);
645 +
646 +@@ -925,7 +937,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock,
647 + waiter->task = task;
648 + waiter->lock = lock;
649 + waiter->prio = task->prio;
650 +- waiter->deadline = task->dl.deadline;
651 ++ waiter->deadline = __tsk_deadline(task);
652 +
653 + /* Get the top priority waiter on the lock */
654 + if (rt_mutex_has_waiters(lock))
655 +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
656 +index 5fc9c9b70862..06b60d612535 100644
657 +--- a/kernel/sched/Makefile
658 ++++ b/kernel/sched/Makefile
659 +@@ -22,14 +22,21 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
660 + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
661 + endif
662 +
663 +-obj-y += core.o loadavg.o clock.o cputime.o
664 +-obj-y += idle.o fair.o rt.o deadline.o
665 +-obj-y += wait.o wait_bit.o swait.o completion.o
666 +-
667 +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
668 ++ifdef CONFIG_SCHED_ALT
669 ++obj-y += alt_core.o
670 ++obj-$(CONFIG_SCHED_DEBUG) += alt_debug.o
671 ++else
672 ++obj-y += core.o
673 ++obj-y += fair.o rt.o deadline.o
674 ++obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o
675 + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
676 +-obj-$(CONFIG_SCHEDSTATS) += stats.o
677 ++endif
678 + obj-$(CONFIG_SCHED_DEBUG) += debug.o
679 ++obj-y += loadavg.o clock.o cputime.o
680 ++obj-y += idle.o
681 ++obj-y += wait.o wait_bit.o swait.o completion.o
682 ++obj-$(CONFIG_SMP) += cpupri.o pelt.o topology.o
683 ++obj-$(CONFIG_SCHEDSTATS) += stats.o
684 + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
685 + obj-$(CONFIG_CPU_FREQ) += cpufreq.o
686 + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
687 +diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c
688 +new file mode 100644
689 +index 000000000000..b65b12c6014f
690 +--- /dev/null
691 ++++ b/kernel/sched/alt_core.c
692 +@@ -0,0 +1,7249 @@
693 ++/*
694 ++ * kernel/sched/alt_core.c
695 ++ *
696 ++ * Core alternative kernel scheduler code and related syscalls
697 ++ *
698 ++ * Copyright (C) 1991-2002 Linus Torvalds
699 ++ *
700 ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes
701 ++ * a whole lot of those previous things.
702 ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel
703 ++ * scheduler by Alfred Chen.
704 ++ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen.
705 ++ */
706 ++#define CREATE_TRACE_POINTS
707 ++#include <trace/events/sched.h>
708 ++#undef CREATE_TRACE_POINTS
709 ++
710 ++#include "sched.h"
711 ++
712 ++#include <linux/sched/rt.h>
713 ++
714 ++#include <linux/context_tracking.h>
715 ++#include <linux/compat.h>
716 ++#include <linux/blkdev.h>
717 ++#include <linux/delayacct.h>
718 ++#include <linux/freezer.h>
719 ++#include <linux/init_task.h>
720 ++#include <linux/kprobes.h>
721 ++#include <linux/mmu_context.h>
722 ++#include <linux/nmi.h>
723 ++#include <linux/profile.h>
724 ++#include <linux/rcupdate_wait.h>
725 ++#include <linux/security.h>
726 ++#include <linux/syscalls.h>
727 ++#include <linux/wait_bit.h>
728 ++
729 ++#include <linux/kcov.h>
730 ++#include <linux/scs.h>
731 ++
732 ++#include <asm/switch_to.h>
733 ++
734 ++#include "../workqueue_internal.h"
735 ++#include "../../fs/io-wq.h"
736 ++#include "../smpboot.h"
737 ++
738 ++#include "pelt.h"
739 ++#include "smp.h"
740 ++
741 ++/*
742 ++ * Export tracepoints that act as a bare tracehook (ie: have no trace event
743 ++ * associated with them) to allow external modules to probe them.
744 ++ */
745 ++EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
746 ++
747 ++#ifdef CONFIG_SCHED_DEBUG
748 ++#define sched_feat(x) (1)
749 ++/*
750 ++ * Print a warning if need_resched is set for the given duration (if
751 ++ * LATENCY_WARN is enabled).
752 ++ *
753 ++ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
754 ++ * per boot.
755 ++ */
756 ++__read_mostly int sysctl_resched_latency_warn_ms = 100;
757 ++__read_mostly int sysctl_resched_latency_warn_once = 1;
758 ++#else
759 ++#define sched_feat(x) (0)
760 ++#endif /* CONFIG_SCHED_DEBUG */
761 ++
762 ++#define ALT_SCHED_VERSION "v5.13-r1"
763 ++
764 ++/* rt_prio(prio) defined in include/linux/sched/rt.h */
765 ++#define rt_task(p) rt_prio((p)->prio)
766 ++#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR)
767 ++#define task_has_rt_policy(p) (rt_policy((p)->policy))
768 ++
769 ++#define STOP_PRIO (MAX_RT_PRIO - 1)
770 ++
771 ++/* Default time slice is 4 ms; it can be set via the kernel parameter "sched_timeslice" */
772 ++u64 sched_timeslice_ns __read_mostly = (4 << 20);
773 ++
774 ++static inline void requeue_task(struct task_struct *p, struct rq *rq);
775 ++
776 ++#ifdef CONFIG_SCHED_BMQ
777 ++#include "bmq.h"
778 ++#endif
779 ++#ifdef CONFIG_SCHED_PDS
780 ++#include "pds.h"
781 ++#endif
782 ++
783 ++static int __init sched_timeslice(char *str)
784 ++{
785 ++ int timeslice_ms;
786 ++
787 ++ get_option(&str, &timeslice_ms);
788 ++ if (2 != timeslice_ms)
789 ++ timeslice_ms = 4;
790 ++ sched_timeslice_ns = timeslice_ms << 20;
791 ++ sched_timeslice_imp(timeslice_ms);
792 ++
793 ++ return 0;
794 ++}
795 ++early_param("sched_timeslice", sched_timeslice);
796 ++
797 ++/* Reschedule if less than this many μs left */
798 ++#define RESCHED_NS (100 << 10)
799 ++
800 ++/**
801 ++ * sched_yield_type - Choose what sort of yield sched_yield will perform.
802 ++ * 0: No yield.
803 ++ * 1: Deboost and requeue task. (default)
804 ++ * 2: Set rq skip task.
805 ++ */
806 ++int sched_yield_type __read_mostly = 1;
807 ++
808 ++#ifdef CONFIG_SMP
809 ++static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp;
810 ++
811 ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_affinity_masks);
812 ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask);
813 ++
814 ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_topo_masks);
815 ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask);
816 ++
817 ++#ifdef CONFIG_SCHED_SMT
818 ++DEFINE_STATIC_KEY_FALSE(sched_smt_present);
819 ++EXPORT_SYMBOL_GPL(sched_smt_present);
820 ++#endif
821 ++
822 ++/*
823 ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of
824 ++ * the domain), this allows us to quickly tell if two cpus are in the same cache
825 ++ * domain, see cpus_share_cache().
826 ++ */
827 ++DEFINE_PER_CPU(int, sd_llc_id);
828 ++#endif /* CONFIG_SMP */
829 ++
830 ++static DEFINE_MUTEX(sched_hotcpu_mutex);
831 ++
832 ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
833 ++
834 ++#ifndef prepare_arch_switch
835 ++# define prepare_arch_switch(next) do { } while (0)
836 ++#endif
837 ++#ifndef finish_arch_post_lock_switch
838 ++# define finish_arch_post_lock_switch() do { } while (0)
839 ++#endif
840 ++
841 ++#define IDLE_WM (IDLE_TASK_SCHED_PRIO)
842 ++
843 ++#ifdef CONFIG_SCHED_SMT
844 ++static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp;
845 ++#endif
846 ++static cpumask_t sched_rq_watermark[SCHED_BITS] ____cacheline_aligned_in_smp;
847 ++
848 ++/* sched_queue related functions */
849 ++static inline void sched_queue_init(struct sched_queue *q)
850 ++{
851 ++ int i;
852 ++
853 ++ bitmap_zero(q->bitmap, SCHED_BITS);
854 ++ for(i = 0; i < SCHED_BITS; i++)
855 ++ INIT_LIST_HEAD(&q->heads[i]);
856 ++}
857 ++
858 ++/*
859 ++ * Init idle task and put into queue structure of rq
860 ++ * IMPORTANT: may be called multiple times for a single cpu
861 ++ */
862 ++static inline void sched_queue_init_idle(struct sched_queue *q,
863 ++ struct task_struct *idle)
864 ++{
865 ++ idle->sq_idx = IDLE_TASK_SCHED_PRIO;
866 ++ INIT_LIST_HEAD(&q->heads[idle->sq_idx]);
867 ++ list_add(&idle->sq_node, &q->heads[idle->sq_idx]);
868 ++}
869 ++
870 ++/* water mark related functions */
871 ++static inline void update_sched_rq_watermark(struct rq *rq)
872 ++{
873 ++ unsigned long watermark = find_first_bit(rq->queue.bitmap, SCHED_QUEUE_BITS);
874 ++ unsigned long last_wm = rq->watermark;
875 ++ unsigned long i;
876 ++ int cpu;
877 ++
878 ++ if (watermark == last_wm)
879 ++ return;
880 ++
881 ++ rq->watermark = watermark;
882 ++ cpu = cpu_of(rq);
883 ++ if (watermark < last_wm) {
884 ++ for (i = watermark + 1; i <= last_wm; i++)
885 ++ cpumask_andnot(&sched_rq_watermark[i],
886 ++ &sched_rq_watermark[i], cpumask_of(cpu));
887 ++#ifdef CONFIG_SCHED_SMT
888 ++ if (static_branch_likely(&sched_smt_present) &&
889 ++ IDLE_WM == last_wm)
890 ++ cpumask_andnot(&sched_sg_idle_mask,
891 ++ &sched_sg_idle_mask, cpu_smt_mask(cpu));
892 ++#endif
893 ++ return;
894 ++ }
895 ++ /* last_wm < watermark */
896 ++ for (i = last_wm + 1; i <= watermark; i++)
897 ++ cpumask_set_cpu(cpu, &sched_rq_watermark[i]);
898 ++#ifdef CONFIG_SCHED_SMT
899 ++ if (static_branch_likely(&sched_smt_present) && IDLE_WM == watermark) {
900 ++ cpumask_t tmp;
901 ++
902 ++ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]);
903 ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu)))
904 ++ cpumask_or(&sched_sg_idle_mask, cpu_smt_mask(cpu),
905 ++ &sched_sg_idle_mask);
906 ++ }
907 ++#endif
908 ++}
909 ++
910 ++/*
911 ++ * This routine assumes that the idle task is always in the queue
912 ++ */
913 ++static inline struct task_struct *sched_rq_first_task(struct rq *rq)
914 ++{
915 ++ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_QUEUE_BITS);
916 ++ const struct list_head *head = &rq->queue.heads[sched_prio2idx(idx, rq)];
917 ++
918 ++ return list_first_entry(head, struct task_struct, sq_node);
919 ++}
920 ++
921 ++static inline struct task_struct *
922 ++sched_rq_next_task(struct task_struct *p, struct rq *rq)
923 ++{
924 ++ unsigned long idx = p->sq_idx;
925 ++ struct list_head *head = &rq->queue.heads[idx];
926 ++
927 ++ if (list_is_last(&p->sq_node, head)) {
928 ++ idx = find_next_bit(rq->queue.bitmap, SCHED_QUEUE_BITS,
929 ++ sched_idx2prio(idx, rq) + 1);
930 ++ head = &rq->queue.heads[sched_prio2idx(idx, rq)];
931 ++
932 ++ return list_first_entry(head, struct task_struct, sq_node);
933 ++ }
934 ++
935 ++ return list_next_entry(p, sq_node);
936 ++}
937 ++
938 ++static inline struct task_struct *rq_runnable_task(struct rq *rq)
939 ++{
940 ++ struct task_struct *next = sched_rq_first_task(rq);
941 ++
942 ++ if (unlikely(next == rq->skip))
943 ++ next = sched_rq_next_task(next, rq);
944 ++
945 ++ return next;
946 ++}
947 ++
948 ++/*
949 ++ * Serialization rules:
950 ++ *
951 ++ * Lock order:
952 ++ *
953 ++ * p->pi_lock
954 ++ * rq->lock
955 ++ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
956 ++ *
957 ++ * rq1->lock
958 ++ * rq2->lock where: rq1 < rq2
959 ++ *
960 ++ * Regular state:
961 ++ *
962 ++ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
963 ++ * local CPU's rq->lock, it optionally removes the task from the runqueue and
964 ++ * always looks at the local rq data structures to find the most eligible task
965 ++ * to run next.
966 ++ *
967 ++ * Task enqueue is also under rq->lock, possibly taken from another CPU.
968 ++ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
969 ++ * the local CPU to avoid bouncing the runqueue state around [ see
970 ++ * ttwu_queue_wakelist() ]
971 ++ *
972 ++ * Task wakeup, specifically wakeups that involve migration, are horribly
973 ++ * complicated to avoid having to take two rq->locks.
974 ++ *
975 ++ * Special state:
976 ++ *
977 ++ * System-calls and anything external will use task_rq_lock() which acquires
978 ++ * both p->pi_lock and rq->lock. As a consequence the state they change is
979 ++ * stable while holding either lock:
980 ++ *
981 ++ * - sched_setaffinity()/
982 ++ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
983 ++ * - set_user_nice(): p->se.load, p->*prio
984 ++ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
985 ++ * p->se.load, p->rt_priority,
986 ++ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
987 ++ * - sched_setnuma(): p->numa_preferred_nid
988 ++ * - sched_move_task()/
989 ++ * cpu_cgroup_fork(): p->sched_task_group
990 ++ * - uclamp_update_active() p->uclamp*
991 ++ *
992 ++ * p->state <- TASK_*:
993 ++ *
994 ++ * is changed locklessly using set_current_state(), __set_current_state() or
995 ++ * set_special_state(), see their respective comments, or by
996 ++ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
997 ++ * concurrent self.
998 ++ *
999 ++ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
1000 ++ *
1001 ++ * is set by activate_task() and cleared by deactivate_task(), under
1002 ++ * rq->lock. Non-zero indicates the task is runnable, the special
1003 ++ * ON_RQ_MIGRATING state is used for migration without holding both
1004 ++ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
1005 ++ *
1006 ++ * p->on_cpu <- { 0, 1 }:
1007 ++ *
1008 ++ * is set by prepare_task() and cleared by finish_task() such that it will be
1009 ++ * set before p is scheduled-in and cleared after p is scheduled-out, both
1010 ++ * under rq->lock. Non-zero indicates the task is running on its CPU.
1011 ++ *
1012 ++ * [ The astute reader will observe that it is possible for two tasks on one
1013 ++ * CPU to have ->on_cpu = 1 at the same time. ]
1014 ++ *
1015 ++ * task_cpu(p): is changed by set_task_cpu(), the rules are:
1016 ++ *
1017 ++ * - Don't call set_task_cpu() on a blocked task:
1018 ++ *
1019 ++ * We don't care what CPU we're not running on, this simplifies hotplug,
1020 ++ * the CPU assignment of blocked tasks isn't required to be valid.
1021 ++ *
1022 ++ * - for try_to_wake_up(), called under p->pi_lock:
1023 ++ *
1024 ++ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
1025 ++ *
1026 ++ * - for migration called under rq->lock:
1027 ++ * [ see task_on_rq_migrating() in task_rq_lock() ]
1028 ++ *
1029 ++ * o move_queued_task()
1030 ++ * o detach_task()
1031 ++ *
1032 ++ * - for migration called under double_rq_lock():
1033 ++ *
1034 ++ * o __migrate_swap_task()
1035 ++ * o push_rt_task() / pull_rt_task()
1036 ++ * o push_dl_task() / pull_dl_task()
1037 ++ * o dl_task_offline_migration()
1038 ++ *
1039 ++ */
1040 ++
1041 ++/*
1042 ++ * Context: p->pi_lock
1043 ++ */
1044 ++static inline struct rq
1045 ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock)
1046 ++{
1047 ++ struct rq *rq;
1048 ++ for (;;) {
1049 ++ rq = task_rq(p);
1050 ++ if (p->on_cpu || task_on_rq_queued(p)) {
1051 ++ raw_spin_lock(&rq->lock);
1052 ++ if (likely((p->on_cpu || task_on_rq_queued(p))
1053 ++ && rq == task_rq(p))) {
1054 ++ *plock = &rq->lock;
1055 ++ return rq;
1056 ++ }
1057 ++ raw_spin_unlock(&rq->lock);
1058 ++ } else if (task_on_rq_migrating(p)) {
1059 ++ do {
1060 ++ cpu_relax();
1061 ++ } while (unlikely(task_on_rq_migrating(p)));
1062 ++ } else {
1063 ++ *plock = NULL;
1064 ++ return rq;
1065 ++ }
1066 ++ }
1067 ++}
1068 ++
1069 ++static inline void
1070 ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock)
1071 ++{
1072 ++ if (NULL != lock)
1073 ++ raw_spin_unlock(lock);
1074 ++}
1075 ++
1076 ++static inline struct rq
1077 ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock,
1078 ++ unsigned long *flags)
1079 ++{
1080 ++ struct rq *rq;
1081 ++ for (;;) {
1082 ++ rq = task_rq(p);
1083 ++ if (p->on_cpu || task_on_rq_queued(p)) {
1084 ++ raw_spin_lock_irqsave(&rq->lock, *flags);
1085 ++ if (likely((p->on_cpu || task_on_rq_queued(p))
1086 ++ && rq == task_rq(p))) {
1087 ++ *plock = &rq->lock;
1088 ++ return rq;
1089 ++ }
1090 ++ raw_spin_unlock_irqrestore(&rq->lock, *flags);
1091 ++ } else if (task_on_rq_migrating(p)) {
1092 ++ do {
1093 ++ cpu_relax();
1094 ++ } while (unlikely(task_on_rq_migrating(p)));
1095 ++ } else {
1096 ++ raw_spin_lock_irqsave(&p->pi_lock, *flags);
1097 ++ if (likely(!p->on_cpu && !p->on_rq &&
1098 ++ rq == task_rq(p))) {
1099 ++ *plock = &p->pi_lock;
1100 ++ return rq;
1101 ++ }
1102 ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1103 ++ }
1104 ++ }
1105 ++}
1106 ++
1107 ++static inline void
1108 ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock,
1109 ++ unsigned long *flags)
1110 ++{
1111 ++ raw_spin_unlock_irqrestore(lock, *flags);
1112 ++}
1113 ++
1114 ++/*
1115 ++ * __task_rq_lock - lock the rq @p resides on.
1116 ++ */
1117 ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1118 ++ __acquires(rq->lock)
1119 ++{
1120 ++ struct rq *rq;
1121 ++
1122 ++ lockdep_assert_held(&p->pi_lock);
1123 ++
1124 ++ for (;;) {
1125 ++ rq = task_rq(p);
1126 ++ raw_spin_lock(&rq->lock);
1127 ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
1128 ++ return rq;
1129 ++ raw_spin_unlock(&rq->lock);
1130 ++
1131 ++ while (unlikely(task_on_rq_migrating(p)))
1132 ++ cpu_relax();
1133 ++ }
1134 ++}
1135 ++
1136 ++/*
1137 ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
1138 ++ */
1139 ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1140 ++ __acquires(p->pi_lock)
1141 ++ __acquires(rq->lock)
1142 ++{
1143 ++ struct rq *rq;
1144 ++
1145 ++ for (;;) {
1146 ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
1147 ++ rq = task_rq(p);
1148 ++ raw_spin_lock(&rq->lock);
1149 ++ /*
1150 ++ * move_queued_task() task_rq_lock()
1151 ++ *
1152 ++ * ACQUIRE (rq->lock)
1153 ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
1154 ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
1155 ++ * [S] ->cpu = new_cpu [L] task_rq()
1156 ++ * [L] ->on_rq
1157 ++ * RELEASE (rq->lock)
1158 ++ *
1159 ++ * If we observe the old CPU in task_rq_lock(), the acquire of
1160 ++ * the old rq->lock will fully serialize against the stores.
1161 ++ *
1162 ++ * If we observe the new CPU in task_rq_lock(), the address
1163 ++ * dependency headed by '[L] rq = task_rq()' and the acquire
1164 ++ * will pair with the WMB to ensure we then also see migrating.
1165 ++ */
1166 ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
1167 ++ return rq;
1168 ++ }
1169 ++ raw_spin_unlock(&rq->lock);
1170 ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
1171 ++
1172 ++ while (unlikely(task_on_rq_migrating(p)))
1173 ++ cpu_relax();
1174 ++ }
1175 ++}
1176 ++
1177 ++static inline void
1178 ++rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
1179 ++ __acquires(rq->lock)
1180 ++{
1181 ++ raw_spin_lock_irqsave(&rq->lock, rf->flags);
1182 ++}
1183 ++
1184 ++static inline void
1185 ++rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
1186 ++ __releases(rq->lock)
1187 ++{
1188 ++ raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
1189 ++}
1190 ++
1191 ++/*
1192 ++ * RQ-clock updating methods:
1193 ++ */
1194 ++
1195 ++static void update_rq_clock_task(struct rq *rq, s64 delta)
1196 ++{
1197 ++/*
1198 ++ * In theory, the compile should just see 0 here, and optimize out the call
1199 ++ * to sched_rt_avg_update. But I don't trust it...
1200 ++ */
1201 ++ s64 __maybe_unused steal = 0, irq_delta = 0;
1202 ++
1203 ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1204 ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1205 ++
1206 ++ /*
1207 ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into
1208 ++ * this case when a previous update_rq_clock() happened inside a
1209 ++ * {soft,}irq region.
1210 ++ *
1211 ++ * When this happens, we stop ->clock_task and only update the
1212 ++ * prev_irq_time stamp to account for the part that fit, so that a next
1213 ++ * update will consume the rest. This ensures ->clock_task is
1214 ++ * monotonic.
1215 ++ *
1216 ++ * It does however cause some slight miss-attribution of {soft,}irq
1217 ++ * time, a more accurate solution would be to update the irq_time using
1218 ++ * the current rq->clock timestamp, except that would require using
1219 ++ * atomic ops.
1220 ++ */
1221 ++ if (irq_delta > delta)
1222 ++ irq_delta = delta;
1223 ++
1224 ++ rq->prev_irq_time += irq_delta;
1225 ++ delta -= irq_delta;
1226 ++#endif
1227 ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1228 ++ if (static_key_false((&paravirt_steal_rq_enabled))) {
1229 ++ steal = paravirt_steal_clock(cpu_of(rq));
1230 ++ steal -= rq->prev_steal_time_rq;
1231 ++
1232 ++ if (unlikely(steal > delta))
1233 ++ steal = delta;
1234 ++
1235 ++ rq->prev_steal_time_rq += steal;
1236 ++ delta -= steal;
1237 ++ }
1238 ++#endif
1239 ++
1240 ++ rq->clock_task += delta;
1241 ++
1242 ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
1243 ++ if ((irq_delta + steal))
1244 ++ update_irq_load_avg(rq, irq_delta + steal);
1245 ++#endif
1246 ++}
1247 ++
1248 ++static inline void update_rq_clock(struct rq *rq)
1249 ++{
1250 ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
1251 ++
1252 ++ if (unlikely(delta <= 0))
1253 ++ return;
1254 ++ rq->clock += delta;
1255 ++ update_rq_time_edge(rq);
1256 ++ update_rq_clock_task(rq, delta);
1257 ++}
1258 ++
1259 ++#ifdef CONFIG_NO_HZ_FULL
1260 ++/*
1261 ++ * Tick may be needed by tasks in the runqueue depending on their policy and
1262 ++ * requirements. If tick is needed, lets send the target an IPI to kick it out
1263 ++ * of nohz mode if necessary.
1264 ++ */
1265 ++static inline void sched_update_tick_dependency(struct rq *rq)
1266 ++{
1267 ++ int cpu = cpu_of(rq);
1268 ++
1269 ++ if (!tick_nohz_full_cpu(cpu))
1270 ++ return;
1271 ++
1272 ++ if (rq->nr_running < 2)
1273 ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
1274 ++ else
1275 ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
1276 ++}
1277 ++#else /* !CONFIG_NO_HZ_FULL */
1278 ++static inline void sched_update_tick_dependency(struct rq *rq) { }
1279 ++#endif
1280 ++
1281 ++/*
1282 ++ * Add/Remove/Requeue task to/from the runqueue routines
1283 ++ * Context: rq->lock
1284 ++ */
1285 ++#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \
1286 ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); \
1287 ++ sched_info_dequeued(rq, p); \
1288 ++ \
1289 ++ list_del(&p->sq_node); \
1290 ++ if (list_empty(&rq->queue.heads[p->sq_idx])) { \
1291 ++ clear_bit(sched_idx2prio(p->sq_idx, rq), \
1292 ++ rq->queue.bitmap); \
1293 ++ func; \
1294 ++ }
1295 ++
1296 ++#define __SCHED_ENQUEUE_TASK(p, rq, flags) \
1297 ++ sched_info_queued(rq, p); \
1298 ++ psi_enqueue(p, flags); \
1299 ++ \
1300 ++ p->sq_idx = task_sched_prio_idx(p, rq); \
1301 ++ list_add_tail(&p->sq_node, &rq->queue.heads[p->sq_idx]); \
1302 ++ set_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap);
1303 ++
1304 ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags)
1305 ++{
1306 ++ lockdep_assert_held(&rq->lock);
1307 ++
1308 ++ /*printk(KERN_INFO "sched: dequeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/
1309 ++ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n",
1310 ++ task_cpu(p), cpu_of(rq));
1311 ++
1312 ++ __SCHED_DEQUEUE_TASK(p, rq, flags, update_sched_rq_watermark(rq));
1313 ++ --rq->nr_running;
1314 ++#ifdef CONFIG_SMP
1315 ++ if (1 == rq->nr_running)
1316 ++ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask);
1317 ++#endif
1318 ++
1319 ++ sched_update_tick_dependency(rq);
1320 ++}
1321 ++
1322 ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags)
1323 ++{
1324 ++ lockdep_assert_held(&rq->lock);
1325 ++
1326 ++ /*printk(KERN_INFO "sched: enqueue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/
1327 ++ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n",
1328 ++ task_cpu(p), cpu_of(rq));
1329 ++
1330 ++ __SCHED_ENQUEUE_TASK(p, rq, flags);
1331 ++ update_sched_rq_watermark(rq);
1332 ++ ++rq->nr_running;
1333 ++#ifdef CONFIG_SMP
1334 ++ if (2 == rq->nr_running)
1335 ++ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask);
1336 ++#endif
1337 ++
1338 ++ sched_update_tick_dependency(rq);
1339 ++}
1340 ++
1341 ++static inline void requeue_task(struct task_struct *p, struct rq *rq)
1342 ++{
1343 ++ int idx;
1344 ++
1345 ++ lockdep_assert_held(&rq->lock);
1346 ++ /*printk(KERN_INFO "sched: requeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/
1347 ++ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n",
1348 ++ cpu_of(rq), task_cpu(p));
1349 ++
1350 ++ idx = task_sched_prio_idx(p, rq);
1351 ++
1352 ++ list_del(&p->sq_node);
1353 ++ list_add_tail(&p->sq_node, &rq->queue.heads[idx]);
1354 ++ if (idx != p->sq_idx) {
1355 ++ if (list_empty(&rq->queue.heads[p->sq_idx]))
1356 ++ clear_bit(sched_idx2prio(p->sq_idx, rq),
1357 ++ rq->queue.bitmap);
1358 ++ p->sq_idx = idx;
1359 ++ set_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap);
1360 ++ update_sched_rq_watermark(rq);
1361 ++ }
1362 ++}
1363 ++
1364 ++/*
1365 ++ * cmpxchg based fetch_or, macro so it works for different integer types
1366 ++ */
1367 ++#define fetch_or(ptr, mask) \
1368 ++ ({ \
1369 ++ typeof(ptr) _ptr = (ptr); \
1370 ++ typeof(mask) _mask = (mask); \
1371 ++ typeof(*_ptr) _old, _val = *_ptr; \
1372 ++ \
1373 ++ for (;;) { \
1374 ++ _old = cmpxchg(_ptr, _val, _val | _mask); \
1375 ++ if (_old == _val) \
1376 ++ break; \
1377 ++ _val = _old; \
1378 ++ } \
1379 ++ _old; \
1380 ++})
1381 ++
1382 ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
1383 ++/*
1384 ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
1385 ++ * this avoids any races wrt polling state changes and thereby avoids
1386 ++ * spurious IPIs.
1387 ++ */
1388 ++static bool set_nr_and_not_polling(struct task_struct *p)
1389 ++{
1390 ++ struct thread_info *ti = task_thread_info(p);
1391 ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
1392 ++}
1393 ++
1394 ++/*
1395 ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
1396 ++ *
1397 ++ * If this returns true, then the idle task promises to call
1398 ++ * sched_ttwu_pending() and reschedule soon.
1399 ++ */
1400 ++static bool set_nr_if_polling(struct task_struct *p)
1401 ++{
1402 ++ struct thread_info *ti = task_thread_info(p);
1403 ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
1404 ++
1405 ++ for (;;) {
1406 ++ if (!(val & _TIF_POLLING_NRFLAG))
1407 ++ return false;
1408 ++ if (val & _TIF_NEED_RESCHED)
1409 ++ return true;
1410 ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
1411 ++ if (old == val)
1412 ++ break;
1413 ++ val = old;
1414 ++ }
1415 ++ return true;
1416 ++}
1417 ++
1418 ++#else
1419 ++static bool set_nr_and_not_polling(struct task_struct *p)
1420 ++{
1421 ++ set_tsk_need_resched(p);
1422 ++ return true;
1423 ++}
1424 ++
1425 ++#ifdef CONFIG_SMP
1426 ++static bool set_nr_if_polling(struct task_struct *p)
1427 ++{
1428 ++ return false;
1429 ++}
1430 ++#endif
1431 ++#endif
1432 ++
1433 ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
1434 ++{
1435 ++ struct wake_q_node *node = &task->wake_q;
1436 ++
1437 ++ /*
1438 ++ * Atomically grab the task, if ->wake_q is !nil already it means
1439 ++ * it's already queued (either by us or someone else) and will get the
1440 ++ * wakeup due to that.
1441 ++ *
1442 ++ * In order to ensure that a pending wakeup will observe our pending
1443 ++ * state, even in the failed case, an explicit smp_mb() must be used.
1444 ++ */
1445 ++ smp_mb__before_atomic();
1446 ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
1447 ++ return false;
1448 ++
1449 ++ /*
1450 ++ * The head is context local, there can be no concurrency.
1451 ++ */
1452 ++ *head->lastp = node;
1453 ++ head->lastp = &node->next;
1454 ++ return true;
1455 ++}
1456 ++
1457 ++/**
1458 ++ * wake_q_add() - queue a wakeup for 'later' waking.
1459 ++ * @head: the wake_q_head to add @task to
1460 ++ * @task: the task to queue for 'later' wakeup
1461 ++ *
1462 ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
1463 ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
1464 ++ * instantly.
1465 ++ *
1466 ++ * This function must be used as-if it were wake_up_process(); IOW the task
1467 ++ * must be ready to be woken at this location.
1468 ++ */
1469 ++void wake_q_add(struct wake_q_head *head, struct task_struct *task)
1470 ++{
1471 ++ if (__wake_q_add(head, task))
1472 ++ get_task_struct(task);
1473 ++}
1474 ++
1475 ++/**
1476 ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
1477 ++ * @head: the wake_q_head to add @task to
1478 ++ * @task: the task to queue for 'later' wakeup
1479 ++ *
1480 ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
1481 ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
1482 ++ * instantly.
1483 ++ *
1484 ++ * This function must be used as-if it were wake_up_process(); IOW the task
1485 ++ * must be ready to be woken at this location.
1486 ++ *
1487 ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
1488 ++ * that already hold reference to @task can call the 'safe' version and trust
1489 ++ * wake_q to do the right thing depending whether or not the @task is already
1490 ++ * queued for wakeup.
1491 ++ */
1492 ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
1493 ++{
1494 ++ if (!__wake_q_add(head, task))
1495 ++ put_task_struct(task);
1496 ++}
1497 ++
1498 ++void wake_up_q(struct wake_q_head *head)
1499 ++{
1500 ++ struct wake_q_node *node = head->first;
1501 ++
1502 ++ while (node != WAKE_Q_TAIL) {
1503 ++ struct task_struct *task;
1504 ++
1505 ++ task = container_of(node, struct task_struct, wake_q);
1506 ++ BUG_ON(!task);
1507 ++ /* task can safely be re-inserted now: */
1508 ++ node = node->next;
1509 ++ task->wake_q.next = NULL;
1510 ++
1511 ++ /*
1512 ++ * wake_up_process() executes a full barrier, which pairs with
1513 ++ * the queueing in wake_q_add() so as not to miss wakeups.
1514 ++ */
1515 ++ wake_up_process(task);
1516 ++ put_task_struct(task);
1517 ++ }
1518 ++}
1519 ++
1520 ++/*
1521 ++ * resched_curr - mark rq's current task 'to be rescheduled now'.
1522 ++ *
1523 ++ * On UP this means the setting of the need_resched flag, on SMP it
1524 ++ * might also involve a cross-CPU call to trigger the scheduler on
1525 ++ * the target CPU.
1526 ++ */
1527 ++void resched_curr(struct rq *rq)
1528 ++{
1529 ++ struct task_struct *curr = rq->curr;
1530 ++ int cpu;
1531 ++
1532 ++ lockdep_assert_held(&rq->lock);
1533 ++
1534 ++ if (test_tsk_need_resched(curr))
1535 ++ return;
1536 ++
1537 ++ cpu = cpu_of(rq);
1538 ++ if (cpu == smp_processor_id()) {
1539 ++ set_tsk_need_resched(curr);
1540 ++ set_preempt_need_resched();
1541 ++ return;
1542 ++ }
1543 ++
1544 ++ if (set_nr_and_not_polling(curr))
1545 ++ smp_send_reschedule(cpu);
1546 ++ else
1547 ++ trace_sched_wake_idle_without_ipi(cpu);
1548 ++}
1549 ++
1550 ++void resched_cpu(int cpu)
1551 ++{
1552 ++ struct rq *rq = cpu_rq(cpu);
1553 ++ unsigned long flags;
1554 ++
1555 ++ raw_spin_lock_irqsave(&rq->lock, flags);
1556 ++ if (cpu_online(cpu) || cpu == smp_processor_id())
1557 ++ resched_curr(cpu_rq(cpu));
1558 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
1559 ++}
1560 ++
1561 ++#ifdef CONFIG_SMP
1562 ++#ifdef CONFIG_NO_HZ_COMMON
1563 ++void nohz_balance_enter_idle(int cpu) {}
1564 ++
1565 ++void select_nohz_load_balancer(int stop_tick) {}
1566 ++
1567 ++void set_cpu_sd_state_idle(void) {}
1568 ++
1569 ++/*
1570 ++ * In the semi idle case, use the nearest busy CPU for migrating timers
1571 ++ * from an idle CPU. This is good for power-savings.
1572 ++ *
1573 ++ * We don't do similar optimization for completely idle system, as
1574 ++ * selecting an idle CPU will add more delays to the timers than intended
1575 ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc).
1576 ++ */
1577 ++int get_nohz_timer_target(void)
1578 ++{
1579 ++ int i, cpu = smp_processor_id(), default_cpu = -1;
1580 ++ struct cpumask *mask;
1581 ++
1582 ++ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
1583 ++ if (!idle_cpu(cpu))
1584 ++ return cpu;
1585 ++ default_cpu = cpu;
1586 ++ }
1587 ++
1588 ++ for (mask = per_cpu(sched_cpu_affinity_masks, cpu) + 1;
1589 ++ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++)
1590 ++ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER))
1591 ++ if (!idle_cpu(i))
1592 ++ return i;
1593 ++
1594 ++ if (default_cpu == -1)
1595 ++ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
1596 ++ cpu = default_cpu;
1597 ++
1598 ++ return cpu;
1599 ++}
1600 ++
1601 ++/*
1602 ++ * When add_timer_on() enqueues a timer into the timer wheel of an
1603 ++ * idle CPU then this timer might expire before the next timer event
1604 ++ * which is scheduled to wake up that CPU. In case of a completely
1605 ++ * idle system the next event might even be infinite time into the
1606 ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1607 ++ * leaves the inner idle loop so the newly added timer is taken into
1608 ++ * account when the CPU goes back to idle and evaluates the timer
1609 ++ * wheel for the next timer event.
1610 ++ */
1611 ++static inline void wake_up_idle_cpu(int cpu)
1612 ++{
1613 ++ struct rq *rq = cpu_rq(cpu);
1614 ++
1615 ++ if (cpu == smp_processor_id())
1616 ++ return;
1617 ++
1618 ++ if (set_nr_and_not_polling(rq->idle))
1619 ++ smp_send_reschedule(cpu);
1620 ++ else
1621 ++ trace_sched_wake_idle_without_ipi(cpu);
1622 ++}
1623 ++
1624 ++static inline bool wake_up_full_nohz_cpu(int cpu)
1625 ++{
1626 ++ /*
1627 ++ * We just need the target to call irq_exit() and re-evaluate
1628 ++ * the next tick. The nohz full kick at least implies that.
1629 ++ * If needed we can still optimize that later with an
1630 ++ * empty IRQ.
1631 ++ */
1632 ++ if (cpu_is_offline(cpu))
1633 ++ return true; /* Don't try to wake offline CPUs. */
1634 ++ if (tick_nohz_full_cpu(cpu)) {
1635 ++ if (cpu != smp_processor_id() ||
1636 ++ tick_nohz_tick_stopped())
1637 ++ tick_nohz_full_kick_cpu(cpu);
1638 ++ return true;
1639 ++ }
1640 ++
1641 ++ return false;
1642 ++}
1643 ++
1644 ++void wake_up_nohz_cpu(int cpu)
1645 ++{
1646 ++ if (!wake_up_full_nohz_cpu(cpu))
1647 ++ wake_up_idle_cpu(cpu);
1648 ++}
1649 ++
1650 ++static void nohz_csd_func(void *info)
1651 ++{
1652 ++ struct rq *rq = info;
1653 ++ int cpu = cpu_of(rq);
1654 ++ unsigned int flags;
1655 ++
1656 ++ /*
1657 ++ * Release the rq::nohz_csd.
1658 ++ */
1659 ++ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
1660 ++ WARN_ON(!(flags & NOHZ_KICK_MASK));
1661 ++
1662 ++ rq->idle_balance = idle_cpu(cpu);
1663 ++ if (rq->idle_balance && !need_resched()) {
1664 ++ rq->nohz_idle_balance = flags;
1665 ++ raise_softirq_irqoff(SCHED_SOFTIRQ);
1666 ++ }
1667 ++}
1668 ++
1669 ++#endif /* CONFIG_NO_HZ_COMMON */
1670 ++#endif /* CONFIG_SMP */
1671 ++
1672 ++static inline void check_preempt_curr(struct rq *rq)
1673 ++{
1674 ++ if (sched_rq_first_task(rq) != rq->curr)
1675 ++ resched_curr(rq);
1676 ++}
1677 ++
1678 ++#ifdef CONFIG_SCHED_HRTICK
1679 ++/*
1680 ++ * Use HR-timers to deliver accurate preemption points.
1681 ++ */
1682 ++
1683 ++static void hrtick_clear(struct rq *rq)
1684 ++{
1685 ++ if (hrtimer_active(&rq->hrtick_timer))
1686 ++ hrtimer_cancel(&rq->hrtick_timer);
1687 ++}
1688 ++
1689 ++/*
1690 ++ * High-resolution timer tick.
1691 ++ * Runs from hardirq context with interrupts disabled.
1692 ++ */
1693 ++static enum hrtimer_restart hrtick(struct hrtimer *timer)
1694 ++{
1695 ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1696 ++
1697 ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1698 ++
1699 ++ raw_spin_lock(&rq->lock);
1700 ++ resched_curr(rq);
1701 ++ raw_spin_unlock(&rq->lock);
1702 ++
1703 ++ return HRTIMER_NORESTART;
1704 ++}
1705 ++
1706 ++/*
1707 ++ * Use hrtick when:
1708 ++ * - enabled by features
1709 ++ * - hrtimer is actually high res
1710 ++ */
1711 ++static inline int hrtick_enabled(struct rq *rq)
1712 ++{
1713 ++ /**
1714 ++ * Alt schedule FW doesn't support sched_feat yet
1715 ++ if (!sched_feat(HRTICK))
1716 ++ return 0;
1717 ++ */
1718 ++ if (!cpu_active(cpu_of(rq)))
1719 ++ return 0;
1720 ++ return hrtimer_is_hres_active(&rq->hrtick_timer);
1721 ++}
1722 ++
1723 ++#ifdef CONFIG_SMP
1724 ++
1725 ++static void __hrtick_restart(struct rq *rq)
1726 ++{
1727 ++ struct hrtimer *timer = &rq->hrtick_timer;
1728 ++ ktime_t time = rq->hrtick_time;
1729 ++
1730 ++ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
1731 ++}
1732 ++
1733 ++/*
1734 ++ * called from hardirq (IPI) context
1735 ++ */
1736 ++static void __hrtick_start(void *arg)
1737 ++{
1738 ++ struct rq *rq = arg;
1739 ++
1740 ++ raw_spin_lock(&rq->lock);
1741 ++ __hrtick_restart(rq);
1742 ++ raw_spin_unlock(&rq->lock);
1743 ++}
1744 ++
1745 ++/*
1746 ++ * Called to set the hrtick timer state.
1747 ++ *
1748 ++ * called with rq->lock held and irqs disabled
1749 ++ */
1750 ++void hrtick_start(struct rq *rq, u64 delay)
1751 ++{
1752 ++ struct hrtimer *timer = &rq->hrtick_timer;
1753 ++ s64 delta;
1754 ++
1755 ++ /*
1756 ++ * Don't schedule slices shorter than 10000ns, that just
1757 ++ * doesn't make sense and can cause timer DoS.
1758 ++ */
1759 ++ delta = max_t(s64, delay, 10000LL);
1760 ++
1761 ++ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
1762 ++
1763 ++ if (rq == this_rq())
1764 ++ __hrtick_restart(rq);
1765 ++ else
1766 ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
1767 ++}
1768 ++
1769 ++#else
1770 ++/*
1771 ++ * Called to set the hrtick timer state.
1772 ++ *
1773 ++ * called with rq->lock held and irqs disabled
1774 ++ */
1775 ++void hrtick_start(struct rq *rq, u64 delay)
1776 ++{
1777 ++ /*
1778 ++ * Don't schedule slices shorter than 10000ns, that just
1779 ++ * doesn't make sense. Rely on vruntime for fairness.
1780 ++ */
1781 ++ delay = max_t(u64, delay, 10000LL);
1782 ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
1783 ++ HRTIMER_MODE_REL_PINNED_HARD);
1784 ++}
1785 ++#endif /* CONFIG_SMP */
1786 ++
1787 ++static void hrtick_rq_init(struct rq *rq)
1788 ++{
1789 ++#ifdef CONFIG_SMP
1790 ++ INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
1791 ++#endif
1792 ++
1793 ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
1794 ++ rq->hrtick_timer.function = hrtick;
1795 ++}
1796 ++#else /* CONFIG_SCHED_HRTICK */
1797 ++static inline int hrtick_enabled(struct rq *rq)
1798 ++{
1799 ++ return 0;
1800 ++}
1801 ++
1802 ++static inline void hrtick_clear(struct rq *rq)
1803 ++{
1804 ++}
1805 ++
1806 ++static inline void hrtick_rq_init(struct rq *rq)
1807 ++{
1808 ++}
1809 ++#endif /* CONFIG_SCHED_HRTICK */
1810 ++
1811 ++/*
1812 ++ * Calculate the expected normal priority: i.e. priority
1813 ++ * without taking RT-inheritance into account. Might be
1814 ++ * boosted by interactivity modifiers. Changes upon fork,
1815 ++ * setprio syscalls, and whenever the interactivity
1816 ++ * estimator recalculates.
1817 ++ */
1818 ++static inline int normal_prio(struct task_struct *p)
1819 ++{
1820 ++ return task_has_rt_policy(p) ? (MAX_RT_PRIO - 1 - p->rt_priority) :
1821 ++ p->static_prio + MAX_PRIORITY_ADJ;
1822 ++}
1823 ++
1824 ++/*
1825 ++ * Calculate the current priority, i.e. the priority
1826 ++ * taken into account by the scheduler. This value might
1827 ++ * be boosted by RT tasks as it will be RT if the task got
1828 ++ * RT-boosted. If not then it returns p->normal_prio.
1829 ++ */
1830 ++static int effective_prio(struct task_struct *p)
1831 ++{
1832 ++ p->normal_prio = normal_prio(p);
1833 ++ /*
1834 ++ * If we are RT tasks or we were boosted to RT priority,
1835 ++ * keep the priority unchanged. Otherwise, update priority
1836 ++ * to the normal priority:
1837 ++ */
1838 ++ if (!rt_prio(p->prio))
1839 ++ return p->normal_prio;
1840 ++ return p->prio;
1841 ++}
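++
++/*
++ * Example of the mapping above: a SCHED_FIFO task with rt_priority 1 gets
++ * normal_prio MAX_RT_PRIO - 2, while a SCHED_NORMAL task ends up at
++ * static_prio + MAX_PRIORITY_ADJ, a constant defined elsewhere in this
++ * patch (presumably the headroom for the interactivity boost mentioned
++ * above).
++ */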
1842 ++
1843 ++/*
1844 ++ * activate_task - move a task to the runqueue.
1845 ++ *
1846 ++ * Context: rq->lock
1847 ++ */
1848 ++static void activate_task(struct task_struct *p, struct rq *rq)
1849 ++{
1850 ++ enqueue_task(p, rq, ENQUEUE_WAKEUP);
1851 ++ p->on_rq = TASK_ON_RQ_QUEUED;
1852 ++
1853 ++ /*
1854 ++ * If in_iowait is set, the code below may not trigger any cpufreq
1855 ++ * utilization updates, so do it here explicitly with the IOWAIT flag
1856 ++ * passed.
1857 ++ */
1858 ++ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT * p->in_iowait);
1859 ++}
1860 ++
1861 ++/*
1862 ++ * deactivate_task - remove a task from the runqueue.
1863 ++ *
1864 ++ * Context: rq->lock
1865 ++ */
1866 ++static inline void deactivate_task(struct task_struct *p, struct rq *rq)
1867 ++{
1868 ++ dequeue_task(p, rq, DEQUEUE_SLEEP);
1869 ++ p->on_rq = 0;
1870 ++ cpufreq_update_util(rq, 0);
1871 ++}
1872 ++
1873 ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1874 ++{
1875 ++#ifdef CONFIG_SMP
1876 ++ /*
1877 ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be
1878 ++ * successfully executed on another CPU. We must ensure that updates of
1879 ++ * per-task data have been completed by this moment.
1880 ++ */
1881 ++ smp_wmb();
1882 ++
1883 ++#ifdef CONFIG_THREAD_INFO_IN_TASK
1884 ++ WRITE_ONCE(p->cpu, cpu);
1885 ++#else
1886 ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu);
1887 ++#endif
1888 ++#endif
1889 ++}
1890 ++
1891 ++static inline bool is_migration_disabled(struct task_struct *p)
1892 ++{
1893 ++#ifdef CONFIG_SMP
1894 ++ return p->migration_disabled;
1895 ++#else
1896 ++ return false;
1897 ++#endif
1898 ++}
1899 ++
1900 ++#define SCA_CHECK 0x01
1901 ++
1902 ++#ifdef CONFIG_SMP
1903 ++
1904 ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1905 ++{
1906 ++#ifdef CONFIG_SCHED_DEBUG
1907 ++ /*
1908 ++ * We should never call set_task_cpu() on a blocked task,
1909 ++ * ttwu() will sort out the placement.
1910 ++ */
1911 ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1912 ++ !p->on_rq);
1913 ++#ifdef CONFIG_LOCKDEP
1914 ++ /*
1915 ++ * The caller should hold either p->pi_lock or rq->lock, when changing
1916 ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
1917 ++ *
1918 ++ * sched_move_task() holds both and thus holding either pins the cgroup,
1919 ++ * see task_group().
1920 ++ */
1921 ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1922 ++ lockdep_is_held(&task_rq(p)->lock)));
1923 ++#endif
1924 ++ /*
1925 ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
1926 ++ */
1927 ++ WARN_ON_ONCE(!cpu_online(new_cpu));
1928 ++
1929 ++ WARN_ON_ONCE(is_migration_disabled(p));
1930 ++#endif
1931 ++ if (task_cpu(p) == new_cpu)
1932 ++ return;
1933 ++ trace_sched_migrate_task(p, new_cpu);
1934 ++ rseq_migrate(p);
1935 ++ perf_event_task_migrate(p);
1936 ++
1937 ++ __set_task_cpu(p, new_cpu);
1938 ++}
1939 ++
1940 ++#define MDF_FORCE_ENABLED 0x80
1941 ++
1942 ++static void
1943 ++__do_set_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask)
1944 ++{
1945 ++ /*
1946 ++ * This here violates the locking rules for affinity, since we're only
1947 ++ * supposed to change these variables while holding both rq->lock and
1948 ++ * p->pi_lock.
1949 ++ *
1950 ++ * HOWEVER, it magically works, because ttwu() is the only code that
1951 ++ * accesses these variables under p->pi_lock and only does so after
1952 ++ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
1953 ++ * before finish_task().
1954 ++ *
1955 ++ * XXX do further audits, this smells like something putrid.
1956 ++ */
1957 ++ SCHED_WARN_ON(!p->on_cpu);
1958 ++ p->cpus_ptr = new_mask;
1959 ++}
1960 ++
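++/*
++ * migrate_disable()/migrate_enable() bracket a region in which the current
++ * task may be preempted but must not be moved to another CPU, e.g.:
++ *
++ *   migrate_disable();
++ *   <touch per-CPU state that must stay on this CPU>
++ *   migrate_enable();
++ *
++ * Calls nest: only the outermost migrate_enable() clears
++ * p->migration_disabled again and drops this rq's nr_pinned count, as the
++ * two implementations below show.
++ */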
1961 ++void migrate_disable(void)
1962 ++{
1963 ++ struct task_struct *p = current;
1964 ++ int cpu;
1965 ++
1966 ++ if (p->migration_disabled) {
1967 ++ p->migration_disabled++;
1968 ++ return;
1969 ++ }
1970 ++
1971 ++ preempt_disable();
1972 ++ cpu = smp_processor_id();
1973 ++ if (cpumask_test_cpu(cpu, &p->cpus_mask)) {
1974 ++ cpu_rq(cpu)->nr_pinned++;
1975 ++ p->migration_disabled = 1;
1976 ++ p->migration_flags &= ~MDF_FORCE_ENABLED;
1977 ++
1978 ++ /*
1979 ++ * Violates locking rules! see comment in __do_set_cpus_ptr().
1980 ++ */
1981 ++ if (p->cpus_ptr == &p->cpus_mask)
1982 ++ __do_set_cpus_ptr(p, cpumask_of(cpu));
1983 ++ }
1984 ++ preempt_enable();
1985 ++}
1986 ++EXPORT_SYMBOL_GPL(migrate_disable);
1987 ++
1988 ++void migrate_enable(void)
1989 ++{
1990 ++ struct task_struct *p = current;
1991 ++
1992 ++ if (0 == p->migration_disabled)
1993 ++ return;
1994 ++
1995 ++ if (p->migration_disabled > 1) {
1996 ++ p->migration_disabled--;
1997 ++ return;
1998 ++ }
1999 ++
2000 ++ /*
2001 ++ * Ensure stop_task runs either before or after this, and that
2002 ++ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
2003 ++ */
2004 ++ preempt_disable();
2005 ++ /*
2006 ++ * Assumption: current should be running on an allowed CPU
2007 ++ */
2008 ++ WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &p->cpus_mask));
2009 ++ if (p->cpus_ptr != &p->cpus_mask)
2010 ++ __do_set_cpus_ptr(p, &p->cpus_mask);
2011 ++ /*
2012 ++ * Mustn't clear migration_disabled() until cpus_ptr points back at the
2013 ++ * regular cpus_mask, otherwise things that race (eg.
2014 ++ * select_fallback_rq) get confused.
2015 ++ */
2016 ++ barrier();
2017 ++ p->migration_disabled = 0;
2018 ++ this_rq()->nr_pinned--;
2019 ++ preempt_enable();
2020 ++}
2021 ++EXPORT_SYMBOL_GPL(migrate_enable);
2022 ++
2023 ++static inline bool rq_has_pinned_tasks(struct rq *rq)
2024 ++{
2025 ++ return rq->nr_pinned;
2026 ++}
2027 ++
2028 ++/*
2029 ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
2030 ++ * __set_cpus_allowed_ptr() and select_fallback_rq().
2031 ++ */
2032 ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
2033 ++{
2034 ++ /* When not in the task's cpumask, no point in looking further. */
2035 ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
2036 ++ return false;
2037 ++
2038 ++ /* migrate_disabled() must be allowed to finish. */
2039 ++ if (is_migration_disabled(p))
2040 ++ return cpu_online(cpu);
2041 ++
2042 ++ /* Non-kernel threads are not allowed during either online or offline. */
2043 ++ if (!(p->flags & PF_KTHREAD))
2044 ++ return cpu_active(cpu);
2045 ++
2046 ++ /* KTHREAD_IS_PER_CPU is always allowed. */
2047 ++ if (kthread_is_per_cpu(p))
2048 ++ return cpu_online(cpu);
2049 ++
2050 ++ /* Regular kernel threads don't get to stay during offline. */
2051 ++ if (cpu_dying(cpu))
2052 ++ return false;
2053 ++
2054 ++ /* But are allowed during online. */
2055 ++ return cpu_online(cpu);
2056 ++}
2057 ++
2058 ++/*
2059 ++ * This is how migration works:
2060 ++ *
2061 ++ * 1) we invoke migration_cpu_stop() on the target CPU using
2062 ++ * stop_one_cpu().
2063 ++ * 2) stopper starts to run (implicitly forcing the migrated thread
2064 ++ * off the CPU)
2065 ++ * 3) it checks whether the migrated task is still in the wrong runqueue.
2066 ++ * 4) if it's in the wrong runqueue then the migration thread removes
2067 ++ * it and puts it into the right queue.
2068 ++ * 5) stopper completes and stop_one_cpu() returns and the migration
2069 ++ * is done.
2070 ++ */
2071 ++
2072 ++/*
2073 ++ * move_queued_task - move a queued task to new rq.
2074 ++ *
2075 ++ * Returns (locked) new rq. Old rq's lock is released.
2076 ++ */
2077 ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int
2078 ++ new_cpu)
2079 ++{
2080 ++ lockdep_assert_held(&rq->lock);
2081 ++
2082 ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
2083 ++ dequeue_task(p, rq, 0);
2084 ++ set_task_cpu(p, new_cpu);
2085 ++ raw_spin_unlock(&rq->lock);
2086 ++
2087 ++ rq = cpu_rq(new_cpu);
2088 ++
2089 ++ raw_spin_lock(&rq->lock);
2090 ++ BUG_ON(task_cpu(p) != new_cpu);
2091 ++ sched_task_sanity_check(p, rq);
2092 ++ enqueue_task(p, rq, 0);
2093 ++ p->on_rq = TASK_ON_RQ_QUEUED;
2094 ++ check_preempt_curr(rq);
2095 ++
2096 ++ return rq;
2097 ++}
2098 ++
2099 ++struct migration_arg {
2100 ++ struct task_struct *task;
2101 ++ int dest_cpu;
2102 ++};
2103 ++
2104 ++/*
2105 ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing
2106 ++ * this because either it can't run here any more (set_cpus_allowed()
2107 ++ * away from this CPU, or CPU going down), or because we're
2108 ++ * attempting to rebalance this task on exec (sched_exec).
2109 ++ *
2110 ++ * So we race with normal scheduler movements, but that's OK, as long
2111 ++ * as the task is no longer on this CPU.
2112 ++ */
2113 ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int
2114 ++ dest_cpu)
2115 ++{
2116 ++ /* Affinity changed (again). */
2117 ++ if (!is_cpu_allowed(p, dest_cpu))
2118 ++ return rq;
2119 ++
2120 ++ update_rq_clock(rq);
2121 ++ return move_queued_task(rq, p, dest_cpu);
2122 ++}
2123 ++
2124 ++/*
2125 ++ * migration_cpu_stop - this will be executed by a highprio stopper thread
2126 ++ * and performs thread migration by bumping thread off CPU then
2127 ++ * 'pushing' onto another runqueue.
2128 ++ */
2129 ++static int migration_cpu_stop(void *data)
2130 ++{
2131 ++ struct migration_arg *arg = data;
2132 ++ struct task_struct *p = arg->task;
2133 ++ struct rq *rq = this_rq();
2134 ++ unsigned long flags;
2135 ++
2136 ++ /*
2137 ++ * The original target CPU might have gone down and we might
2138 ++ * be on another CPU but it doesn't matter.
2139 ++ */
2140 ++ local_irq_save(flags);
2141 ++ /*
2142 ++ * We need to explicitly wake pending tasks before running
2143 ++ * __migrate_task() such that we will not miss enforcing cpus_ptr
2144 ++ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
2145 ++ */
2146 ++ flush_smp_call_function_from_idle();
2147 ++
2148 ++ raw_spin_lock(&p->pi_lock);
2149 ++ raw_spin_lock(&rq->lock);
2150 ++ /*
2151 ++ * If task_rq(p) != rq, it cannot be migrated here, because we're
2152 ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
2153 ++ * we're holding p->pi_lock.
2154 ++ */
2155 ++ if (task_rq(p) == rq && task_on_rq_queued(p))
2156 ++ rq = __migrate_task(rq, p, arg->dest_cpu);
2157 ++ raw_spin_unlock(&rq->lock);
2158 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2159 ++
2160 ++ return 0;
2161 ++}
2162 ++
2163 ++static inline void
2164 ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
2165 ++{
2166 ++ cpumask_copy(&p->cpus_mask, new_mask);
2167 ++ p->nr_cpus_allowed = cpumask_weight(new_mask);
2168 ++}
2169 ++
2170 ++static void
2171 ++__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2172 ++{
2173 ++ lockdep_assert_held(&p->pi_lock);
2174 ++ set_cpus_allowed_common(p, new_mask);
2175 ++}
2176 ++
2177 ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2178 ++{
2179 ++ __do_set_cpus_allowed(p, new_mask);
2180 ++}
2181 ++
2182 ++#endif
2183 ++
2184 ++/**
2185 ++ * task_curr - is this task currently executing on a CPU?
2186 ++ * @p: the task in question.
2187 ++ *
2188 ++ * Return: 1 if the task is currently executing. 0 otherwise.
2189 ++ */
2190 ++inline int task_curr(const struct task_struct *p)
2191 ++{
2192 ++ return cpu_curr(task_cpu(p)) == p;
2193 ++}
2194 ++
2195 ++#ifdef CONFIG_SMP
2196 ++/*
2197 ++ * wait_task_inactive - wait for a thread to unschedule.
2198 ++ *
2199 ++ * If @match_state is nonzero, it's the @p->state value just checked and
2200 ++ * not expected to change. If it changes, i.e. @p might have woken up,
2201 ++ * then return zero. When we succeed in waiting for @p to be off its CPU,
2202 ++ * we return a positive number (its total switch count). If a second call
2203 ++ * a short while later returns the same number, the caller can be sure that
2204 ++ * @p has remained unscheduled the whole time.
2205 ++ *
2206 ++ * The caller must ensure that the task *will* unschedule sometime soon,
2207 ++ * else this function might spin for a *long* time. This function can't
2208 ++ * be called with interrupts off, or it may introduce deadlock with
2209 ++ * smp_call_function() if an IPI is sent by the same process we are
2210 ++ * waiting to become inactive.
2211 ++ */
2212 ++unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2213 ++{
2214 ++ unsigned long flags;
2215 ++ bool running, on_rq;
2216 ++ unsigned long ncsw;
2217 ++ struct rq *rq;
2218 ++ raw_spinlock_t *lock;
2219 ++
2220 ++ for (;;) {
2221 ++ rq = task_rq(p);
2222 ++
2223 ++ /*
2224 ++ * If the task is actively running on another CPU
2225 ++ * still, just relax and busy-wait without holding
2226 ++ * any locks.
2227 ++ *
2228 ++ * NOTE! Since we don't hold any locks, it's not
2229 ++ * even sure that "rq" stays as the right runqueue!
2230 ++ * But we don't care, since this will return false
2231 ++ * if the runqueue has changed and p is actually now
2232 ++ * running somewhere else!
2233 ++ */
2234 ++ while (task_running(p) && p == rq->curr) {
2235 ++ if (match_state && unlikely(p->state != match_state))
2236 ++ return 0;
2237 ++ cpu_relax();
2238 ++ }
2239 ++
2240 ++ /*
2241 ++ * Ok, time to look more closely! We need the rq
2242 ++ * lock now, to be *sure*. If we're wrong, we'll
2243 ++ * just go back and repeat.
2244 ++ */
2245 ++ task_access_lock_irqsave(p, &lock, &flags);
2246 ++ trace_sched_wait_task(p);
2247 ++ running = task_running(p);
2248 ++ on_rq = p->on_rq;
2249 ++ ncsw = 0;
2250 ++ if (!match_state || p->state == match_state)
2251 ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2252 ++ task_access_unlock_irqrestore(p, lock, &flags);
2253 ++
2254 ++ /*
2255 ++ * If it changed from the expected state, bail out now.
2256 ++ */
2257 ++ if (unlikely(!ncsw))
2258 ++ break;
2259 ++
2260 ++ /*
2261 ++ * Was it really running after all now that we
2262 ++ * checked with the proper locks actually held?
2263 ++ *
2264 ++ * Oops. Go back and try again..
2265 ++ */
2266 ++ if (unlikely(running)) {
2267 ++ cpu_relax();
2268 ++ continue;
2269 ++ }
2270 ++
2271 ++ /*
2272 ++ * It's not enough that it's not actively running,
2273 ++ * it must be off the runqueue _entirely_, and not
2274 ++ * preempted!
2275 ++ *
2276 ++ * So if it was still runnable (but just not actively
2277 ++ * running right now), it's preempted, and we should
2278 ++ * yield - it could be a while.
2279 ++ */
2280 ++ if (unlikely(on_rq)) {
2281 ++ ktime_t to = NSEC_PER_SEC / HZ;
2282 ++
2283 ++ set_current_state(TASK_UNINTERRUPTIBLE);
2284 ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2285 ++ continue;
2286 ++ }
2287 ++
2288 ++ /*
2289 ++ * Ahh, all good. It wasn't running, and it wasn't
2290 ++ * runnable, which means that it will never become
2291 ++ * running in the future either. We're all done!
2292 ++ */
2293 ++ break;
2294 ++ }
2295 ++
2296 ++ return ncsw;
2297 ++}
2298 ++
2299 ++/***
2300 ++ * kick_process - kick a running thread to enter/exit the kernel
2301 ++ * @p: the to-be-kicked thread
2302 ++ *
2303 ++ * Cause a process which is running on another CPU to enter
2304 ++ * kernel-mode, without any delay. (to get signals handled.)
2305 ++ *
2306 ++ * NOTE: this function doesn't have to take the runqueue lock,
2307 ++ * because all it wants to ensure is that the remote task enters
2308 ++ * the kernel. If the IPI races and the task has been migrated
2309 ++ * to another CPU then no harm is done and the purpose has been
2310 ++ * achieved as well.
2311 ++ */
2312 ++void kick_process(struct task_struct *p)
2313 ++{
2314 ++ int cpu;
2315 ++
2316 ++ preempt_disable();
2317 ++ cpu = task_cpu(p);
2318 ++ if ((cpu != smp_processor_id()) && task_curr(p))
2319 ++ smp_send_reschedule(cpu);
2320 ++ preempt_enable();
2321 ++}
2322 ++EXPORT_SYMBOL_GPL(kick_process);
2323 ++
2324 ++/*
2325 ++ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
2326 ++ *
2327 ++ * A few notes on cpu_active vs cpu_online:
2328 ++ *
2329 ++ * - cpu_active must be a subset of cpu_online
2330 ++ *
2331 ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
2332 ++ * see __set_cpus_allowed_ptr(). At this point the newly online
2333 ++ * CPU isn't yet part of the sched domains, and balancing will not
2334 ++ * see it.
2335 ++ *
2336 ++ * - on cpu-down we clear cpu_active() to mask the sched domains and
2337 ++ * avoid the load balancer to place new tasks on the to be removed
2338 ++ * CPU. Existing tasks will remain running there and will be taken
2339 ++ * off.
2340 ++ *
2341 ++ * This means that fallback selection must not select !active CPUs.
2342 ++ * And can assume that any active CPU must be online. Conversely
2343 ++ * select_task_rq() below may allow selection of !active CPUs in order
2344 ++ * to satisfy the above rules.
2345 ++ */
2346 ++static int select_fallback_rq(int cpu, struct task_struct *p)
2347 ++{
2348 ++ int nid = cpu_to_node(cpu);
2349 ++ const struct cpumask *nodemask = NULL;
2350 ++ enum { cpuset, possible, fail } state = cpuset;
2351 ++ int dest_cpu;
2352 ++
2353 ++ /*
2354 ++ * If the node that the CPU is on has been offlined, cpu_to_node()
2355 ++ * will return -1. There is no CPU on the node, and we should
2356 ++ * select the CPU on the other node.
2357 ++ */
2358 ++ if (nid != -1) {
2359 ++ nodemask = cpumask_of_node(nid);
2360 ++
2361 ++ /* Look for allowed, online CPU in same node. */
2362 ++ for_each_cpu(dest_cpu, nodemask) {
2363 ++ if (!cpu_active(dest_cpu))
2364 ++ continue;
2365 ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
2366 ++ return dest_cpu;
2367 ++ }
2368 ++ }
2369 ++
2370 ++ for (;;) {
2371 ++ /* Any allowed, online CPU? */
2372 ++ for_each_cpu(dest_cpu, p->cpus_ptr) {
2373 ++ if (!is_cpu_allowed(p, dest_cpu))
2374 ++ continue;
2375 ++ goto out;
2376 ++ }
2377 ++
2378 ++ /* No more Mr. Nice Guy. */
2379 ++ switch (state) {
2380 ++ case cpuset:
2381 ++ if (IS_ENABLED(CONFIG_CPUSETS)) {
2382 ++ cpuset_cpus_allowed_fallback(p);
2383 ++ state = possible;
2384 ++ break;
2385 ++ }
2386 ++ fallthrough;
2387 ++ case possible:
2388 ++ /*
2389 ++ * XXX When called from select_task_rq() we only
2390 ++ * hold p->pi_lock and again violate locking order.
2391 ++ *
2392 ++ * More yuck to audit.
2393 ++ */
2394 ++ do_set_cpus_allowed(p, cpu_possible_mask);
2395 ++ state = fail;
2396 ++ break;
2397 ++
2398 ++ case fail:
2399 ++ BUG();
2400 ++ break;
2401 ++ }
2402 ++ }
2403 ++
2404 ++out:
2405 ++ if (state != cpuset) {
2406 ++ /*
2407 ++ * Don't tell them about moving exiting tasks or
2408 ++ * kernel threads (both mm NULL), since they never
2409 ++ * leave kernel.
2410 ++ */
2411 ++ if (p->mm && printk_ratelimit()) {
2412 ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n",
2413 ++ task_pid_nr(p), p->comm, cpu);
2414 ++ }
2415 ++ }
2416 ++
2417 ++ return dest_cpu;
2418 ++}
2419 ++
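++/*
++ * select_task_rq - pick a run queue for a waking task.
++ *
++ * chk_mask is the task's allowed CPUs restricted to cpu_active_mask; if that
++ * intersection is empty we fall back to select_fallback_rq(). Otherwise the
++ * candidate masks are tried in order (sched_sg_idle_mask when SMT is
++ * enabled, then sched_rq_watermark[IDLE_WM], then
++ * sched_rq_watermark[task_sched_prio(p) + 1]) and the first non-empty
++ * intersection is handed to best_mask_cpu() together with the task's
++ * current CPU; if none matches, the pick is made from the full
++ * allowed-and-active mask.
++ */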
2420 ++static inline int select_task_rq(struct task_struct *p)
2421 ++{
2422 ++ cpumask_t chk_mask, tmp;
2423 ++
2424 ++ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_active_mask)))
2425 ++ return select_fallback_rq(task_cpu(p), p);
2426 ++
2427 ++ if (
2428 ++#ifdef CONFIG_SCHED_SMT
2429 ++ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) ||
2430 ++#endif
2431 ++ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) ||
2432 ++ cpumask_and(&tmp, &chk_mask,
2433 ++ &sched_rq_watermark[task_sched_prio(p) + 1]))
2434 ++ return best_mask_cpu(task_cpu(p), &tmp);
2435 ++
2436 ++ return best_mask_cpu(task_cpu(p), &chk_mask);
2437 ++}
2438 ++
2439 ++void sched_set_stop_task(int cpu, struct task_struct *stop)
2440 ++{
2441 ++ static struct lock_class_key stop_pi_lock;
2442 ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO };
2443 ++ struct sched_param start_param = { .sched_priority = 0 };
2444 ++ struct task_struct *old_stop = cpu_rq(cpu)->stop;
2445 ++
2446 ++ if (stop) {
2447 ++ /*
2448 ++ * Make it appear like a SCHED_FIFO task, it's something
2449 ++ * userspace knows about and won't get confused about.
2450 ++ *
2451 ++ * Also, it will make PI more or less work without too
2452 ++ * much confusion -- but then, stop work should not
2453 ++ * rely on PI working anyway.
2454 ++ */
2455 ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
2456 ++
2457 ++ /*
2458 ++ * The PI code calls rt_mutex_setprio() with ->pi_lock held to
2459 ++ * adjust the effective priority of a task. As a result,
2460 ++ * rt_mutex_setprio() can trigger (RT) balancing operations,
2461 ++ * which can then trigger wakeups of the stop thread to push
2462 ++ * around the current task.
2463 ++ *
2464 ++ * The stop task itself will never be part of the PI-chain, it
2465 ++ * never blocks, therefore that ->pi_lock recursion is safe.
2466 ++ * Tell lockdep about this by placing the stop->pi_lock in its
2467 ++ * own class.
2468 ++ */
2469 ++ lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
2470 ++ }
2471 ++
2472 ++ cpu_rq(cpu)->stop = stop;
2473 ++
2474 ++ if (old_stop) {
2475 ++ /*
2476 ++ * Reset it back to a normal scheduling policy so that
2477 ++ * it can die in pieces.
2478 ++ */
2479 ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
2480 ++ }
2481 ++}
2482 ++
2483 ++/*
2484 ++ * Change a given task's CPU affinity. Migrate the thread to a
2485 ++ * proper CPU and schedule it away if the CPU it's executing on
2486 ++ * is removed from the allowed bitmask.
2487 ++ *
2488 ++ * NOTE: the caller must have a valid reference to the task, the
2489 ++ * task must not exit() & deallocate itself prematurely. The
2490 ++ * call is not atomic; no spinlocks may be held.
2491 ++ */
2492 ++static int __set_cpus_allowed_ptr(struct task_struct *p,
2493 ++ const struct cpumask *new_mask,
2494 ++ u32 flags)
2495 ++{
2496 ++ const struct cpumask *cpu_valid_mask = cpu_active_mask;
2497 ++ int dest_cpu;
2498 ++ unsigned long irq_flags;
2499 ++ struct rq *rq;
2500 ++ raw_spinlock_t *lock;
2501 ++ int ret = 0;
2502 ++
2503 ++ raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
2504 ++ rq = __task_access_lock(p, &lock);
2505 ++
2506 ++ if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
2507 ++ /*
2508 ++ * Kernel threads are allowed on online && !active CPUs,
2509 ++ * however, during cpu-hot-unplug, even these might get pushed
2510 ++ * away if not KTHREAD_IS_PER_CPU.
2511 ++ *
2512 ++ * Specifically, migration_disabled() tasks must not fail the
2513 ++ * cpumask_any_and_distribute() pick below, esp. so on
2514 ++ * SCA_MIGRATE_ENABLE, otherwise we'll not call
2515 ++ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
2516 ++ */
2517 ++ cpu_valid_mask = cpu_online_mask;
2518 ++ }
2519 ++
2520 ++ /*
2521 ++ * Must re-check here, to close a race against __kthread_bind(),
2522 ++ * sched_setaffinity() is not guaranteed to observe the flag.
2523 ++ */
2524 ++ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2525 ++ ret = -EINVAL;
2526 ++ goto out;
2527 ++ }
2528 ++
2529 ++ if (cpumask_equal(&p->cpus_mask, new_mask))
2530 ++ goto out;
2531 ++
2532 ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
2533 ++ if (dest_cpu >= nr_cpu_ids) {
2534 ++ ret = -EINVAL;
2535 ++ goto out;
2536 ++ }
2537 ++
2538 ++ __do_set_cpus_allowed(p, new_mask);
2539 ++
2540 ++ /* Can the task run on the task's current CPU? If so, we're done */
2541 ++ if (cpumask_test_cpu(task_cpu(p), new_mask))
2542 ++ goto out;
2543 ++
2544 ++ if (p->migration_disabled) {
2545 ++ if (likely(p->cpus_ptr != &p->cpus_mask))
2546 ++ __do_set_cpus_ptr(p, &p->cpus_mask);
2547 ++ p->migration_disabled = 0;
2548 ++ p->migration_flags |= MDF_FORCE_ENABLED;
2549 ++ /* When p is migrate_disabled, rq->lock should be held */
2550 ++ rq->nr_pinned--;
2551 ++ }
2552 ++
2553 ++ if (task_running(p) || p->state == TASK_WAKING) {
2554 ++ struct migration_arg arg = { p, dest_cpu };
2555 ++
2556 ++ /* Need help from migration thread: drop lock and wait. */
2557 ++ __task_access_unlock(p, lock);
2558 ++ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
2559 ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
2560 ++ return 0;
2561 ++ }
2562 ++ if (task_on_rq_queued(p)) {
2563 ++ /*
2564 ++ * OK, since we're going to drop the lock immediately
2565 ++ * afterwards anyway.
2566 ++ */
2567 ++ update_rq_clock(rq);
2568 ++ rq = move_queued_task(rq, p, dest_cpu);
2569 ++ lock = &rq->lock;
2570 ++ }
2571 ++
2572 ++out:
2573 ++ __task_access_unlock(p, lock);
2574 ++ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
2575 ++
2576 ++ return ret;
2577 ++}
2578 ++
2579 ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2580 ++{
2581 ++ return __set_cpus_allowed_ptr(p, new_mask, 0);
2582 ++}
2583 ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
2584 ++
2585 ++#else /* CONFIG_SMP */
2586 ++
2587 ++static inline int select_task_rq(struct task_struct *p)
2588 ++{
2589 ++ return 0;
2590 ++}
2591 ++
2592 ++static inline int
2593 ++__set_cpus_allowed_ptr(struct task_struct *p,
2594 ++ const struct cpumask *new_mask,
2595 ++ u32 flags)
2596 ++{
2597 ++ return set_cpus_allowed_ptr(p, new_mask);
2598 ++}
2599 ++
2600 ++static inline bool rq_has_pinned_tasks(struct rq *rq)
2601 ++{
2602 ++ return false;
2603 ++}
2604 ++
2605 ++#endif /* !CONFIG_SMP */
2606 ++
2607 ++static void
2608 ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2609 ++{
2610 ++ struct rq *rq;
2611 ++
2612 ++ if (!schedstat_enabled())
2613 ++ return;
2614 ++
2615 ++ rq = this_rq();
2616 ++
2617 ++#ifdef CONFIG_SMP
2618 ++ if (cpu == rq->cpu)
2619 ++ __schedstat_inc(rq->ttwu_local);
2620 ++ else {
2621 ++ /** Alt schedule FW ToDo:
2622 ++ * How to do ttwu_wake_remote
2623 ++ */
2624 ++ }
2625 ++#endif /* CONFIG_SMP */
2626 ++
2627 ++ __schedstat_inc(rq->ttwu_count);
2628 ++}
2629 ++
2630 ++/*
2631 ++ * Mark the task runnable and perform wakeup-preemption.
2632 ++ */
2633 ++static inline void
2634 ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2635 ++{
2636 ++ check_preempt_curr(rq);
2637 ++ p->state = TASK_RUNNING;
2638 ++ trace_sched_wakeup(p);
2639 ++}
2640 ++
2641 ++static inline void
2642 ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2643 ++{
2644 ++ if (p->sched_contributes_to_load)
2645 ++ rq->nr_uninterruptible--;
2646 ++
2647 ++ if (
2648 ++#ifdef CONFIG_SMP
2649 ++ !(wake_flags & WF_MIGRATED) &&
2650 ++#endif
2651 ++ p->in_iowait) {
2652 ++ delayacct_blkio_end(p);
2653 ++ atomic_dec(&task_rq(p)->nr_iowait);
2654 ++ }
2655 ++
2656 ++ activate_task(p, rq);
2657 ++ ttwu_do_wakeup(rq, p, 0);
2658 ++}
2659 ++
2660 ++/*
2661 ++ * Consider @p being inside a wait loop:
2662 ++ *
2663 ++ * for (;;) {
2664 ++ * set_current_state(TASK_UNINTERRUPTIBLE);
2665 ++ *
2666 ++ * if (CONDITION)
2667 ++ * break;
2668 ++ *
2669 ++ * schedule();
2670 ++ * }
2671 ++ * __set_current_state(TASK_RUNNING);
2672 ++ *
2673 ++ * with the wakeup arriving between set_current_state() and schedule(). In
2674 ++ * this case @p is still runnable, so all that needs doing is to change
2675 ++ * p->state back to TASK_RUNNING in an atomic manner.
2676 ++ *
2677 ++ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
2678 ++ * then schedule() must still happen and p->state can be changed to
2679 ++ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
2680 ++ * need to do a full wakeup with enqueue.
2681 ++ *
2682 ++ * Returns: %true when the wakeup is done,
2683 ++ * %false otherwise.
2684 ++ */
2685 ++static int ttwu_runnable(struct task_struct *p, int wake_flags)
2686 ++{
2687 ++ struct rq *rq;
2688 ++ raw_spinlock_t *lock;
2689 ++ int ret = 0;
2690 ++
2691 ++ rq = __task_access_lock(p, &lock);
2692 ++ if (task_on_rq_queued(p)) {
2693 ++ /* check_preempt_curr() may use rq clock */
2694 ++ update_rq_clock(rq);
2695 ++ ttwu_do_wakeup(rq, p, wake_flags);
2696 ++ ret = 1;
2697 ++ }
2698 ++ __task_access_unlock(p, lock);
2699 ++
2700 ++ return ret;
2701 ++}
2702 ++
2703 ++#ifdef CONFIG_SMP
2704 ++void sched_ttwu_pending(void *arg)
2705 ++{
2706 ++ struct llist_node *llist = arg;
2707 ++ struct rq *rq = this_rq();
2708 ++ struct task_struct *p, *t;
2709 ++ struct rq_flags rf;
2710 ++
2711 ++ if (!llist)
2712 ++ return;
2713 ++
2714 ++ /*
2715 ++ * rq::ttwu_pending is a racy indication of outstanding wakeups.
2716 ++ * It races such that false-negatives are possible, since they
2717 ++ * are shorter lived than false-positives would be.
2718 ++ */
2719 ++ WRITE_ONCE(rq->ttwu_pending, 0);
2720 ++
2721 ++ rq_lock_irqsave(rq, &rf);
2722 ++ update_rq_clock(rq);
2723 ++
2724 ++ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
2725 ++ if (WARN_ON_ONCE(p->on_cpu))
2726 ++ smp_cond_load_acquire(&p->on_cpu, !VAL);
2727 ++
2728 ++ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
2729 ++ set_task_cpu(p, cpu_of(rq));
2730 ++
2731 ++ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0);
2732 ++ }
2733 ++
2734 ++ rq_unlock_irqrestore(rq, &rf);
2735 ++}
2736 ++
2737 ++void send_call_function_single_ipi(int cpu)
2738 ++{
2739 ++ struct rq *rq = cpu_rq(cpu);
2740 ++
2741 ++ if (!set_nr_if_polling(rq->idle))
2742 ++ arch_send_call_function_single_ipi(cpu);
2743 ++ else
2744 ++ trace_sched_wake_idle_without_ipi(cpu);
2745 ++}
2746 ++
2747 ++/*
2748 ++ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
2749 ++ * necessary. The wakee CPU on receipt of the IPI will queue the task
2750 ++ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
2751 ++ * of the wakeup instead of the waker.
2752 ++ */
2753 ++static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2754 ++{
2755 ++ struct rq *rq = cpu_rq(cpu);
2756 ++
2757 ++ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
2758 ++
2759 ++ WRITE_ONCE(rq->ttwu_pending, 1);
2760 ++ __smp_call_single_queue(cpu, &p->wake_entry.llist);
2761 ++}
2762 ++
2763 ++static inline bool ttwu_queue_cond(int cpu, int wake_flags)
2764 ++{
2765 ++ /*
2766 ++ * Do not complicate things with the async wake_list while the CPU is
2767 ++ * in hotplug state.
2768 ++ */
2769 ++ if (!cpu_active(cpu))
2770 ++ return false;
2771 ++
2772 ++ /*
2773 ++ * If the CPU does not share cache, then queue the task on the
2774 ++ * remote rqs wakelist to avoid accessing remote data.
2775 ++ */
2776 ++ if (!cpus_share_cache(smp_processor_id(), cpu))
2777 ++ return true;
2778 ++
2779 ++ /*
2780 ++ * If the task is descheduling and the only running task on the
2781 ++ * CPU then use the wakelist to offload the task activation to
2782 ++ * the soon-to-be-idle CPU as the current CPU is likely busy.
2783 ++ * nr_running is checked to avoid unnecessary task stacking.
2784 ++ */
2785 ++ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
2786 ++ return true;
2787 ++
2788 ++ return false;
2789 ++}
2790 ++
2791 ++static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2792 ++{
2793 ++ if (__is_defined(ALT_SCHED_TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
2794 ++ if (WARN_ON_ONCE(cpu == smp_processor_id()))
2795 ++ return false;
2796 ++
2797 ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2798 ++ __ttwu_queue_wakelist(p, cpu, wake_flags);
2799 ++ return true;
2800 ++ }
2801 ++
2802 ++ return false;
2803 ++}
2804 ++
2805 ++void wake_up_if_idle(int cpu)
2806 ++{
2807 ++ struct rq *rq = cpu_rq(cpu);
2808 ++ unsigned long flags;
2809 ++
2810 ++ rcu_read_lock();
2811 ++
2812 ++ if (!is_idle_task(rcu_dereference(rq->curr)))
2813 ++ goto out;
2814 ++
2815 ++ if (set_nr_if_polling(rq->idle)) {
2816 ++ trace_sched_wake_idle_without_ipi(cpu);
2817 ++ } else {
2818 ++ raw_spin_lock_irqsave(&rq->lock, flags);
2819 ++ if (is_idle_task(rq->curr))
2820 ++ smp_send_reschedule(cpu);
2821 ++ /* Else CPU is not idle, do nothing here */
2822 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
2823 ++ }
2824 ++
2825 ++out:
2826 ++ rcu_read_unlock();
2827 ++}
2828 ++
2829 ++bool cpus_share_cache(int this_cpu, int that_cpu)
2830 ++{
2831 ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
2832 ++}
2833 ++#else /* !CONFIG_SMP */
2834 ++
2835 ++static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2836 ++{
2837 ++ return false;
2838 ++}
2839 ++
2840 ++#endif /* CONFIG_SMP */
2841 ++
2842 ++static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
2843 ++{
2844 ++ struct rq *rq = cpu_rq(cpu);
2845 ++
2846 ++ if (ttwu_queue_wakelist(p, cpu, wake_flags))
2847 ++ return;
2848 ++
2849 ++ raw_spin_lock(&rq->lock);
2850 ++ update_rq_clock(rq);
2851 ++ ttwu_do_activate(rq, p, wake_flags);
2852 ++ raw_spin_unlock(&rq->lock);
2853 ++}
2854 ++
2855 ++/*
2856 ++ * Notes on Program-Order guarantees on SMP systems.
2857 ++ *
2858 ++ * MIGRATION
2859 ++ *
2860 ++ * The basic program-order guarantee on SMP systems is that when a task [t]
2861 ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent
2862 ++ * execution on its new CPU [c1].
2863 ++ *
2864 ++ * For migration (of runnable tasks) this is provided by the following means:
2865 ++ *
2866 ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t
2867 ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and
2868 ++ * rq(c1)->lock (if not at the same time, then in that order).
2869 ++ * C) LOCK of the rq(c1)->lock scheduling in task
2870 ++ *
2871 ++ * Transitivity guarantees that B happens after A and C after B.
2872 ++ * Note: we only require RCpc transitivity.
2873 ++ * Note: the CPU doing B need not be c0 or c1
2874 ++ *
2875 ++ * Example:
2876 ++ *
2877 ++ *   CPU0            CPU1            CPU2
2878 ++ *
2879 ++ *   LOCK rq(0)->lock
2880 ++ *   sched-out X
2881 ++ *   sched-in Y
2882 ++ *   UNLOCK rq(0)->lock
2883 ++ *
2884 ++ *                                   LOCK rq(0)->lock // orders against CPU0
2885 ++ *                                   dequeue X
2886 ++ *                                   UNLOCK rq(0)->lock
2887 ++ *
2888 ++ *                                   LOCK rq(1)->lock
2889 ++ *                                   enqueue X
2890 ++ *                                   UNLOCK rq(1)->lock
2891 ++ *
2892 ++ *                   LOCK rq(1)->lock // orders against CPU2
2893 ++ *                   sched-out Z
2894 ++ *                   sched-in X
2895 ++ *                   UNLOCK rq(1)->lock
2896 ++ *
2897 ++ *
2898 ++ * BLOCKING -- aka. SLEEP + WAKEUP
2899 ++ *
2900 ++ * For blocking we (obviously) need to provide the same guarantee as for
2901 ++ * migration. However the means are completely different as there is no lock
2902 ++ * chain to provide order. Instead we do:
2903 ++ *
2904 ++ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
2905 ++ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
2906 ++ *
2907 ++ * Example:
2908 ++ *
2909 ++ *   CPU0 (schedule)  CPU1 (try_to_wake_up) CPU2 (schedule)
2910 ++ *
2911 ++ *   LOCK rq(0)->lock LOCK X->pi_lock
2912 ++ *   dequeue X
2913 ++ *   sched-out X
2914 ++ *   smp_store_release(X->on_cpu, 0);
2915 ++ *
2916 ++ *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
2917 ++ *                    X->state = WAKING
2918 ++ *                    set_task_cpu(X,2)
2919 ++ *
2920 ++ *                    LOCK rq(2)->lock
2921 ++ *                    enqueue X
2922 ++ *                    X->state = RUNNING
2923 ++ *                    UNLOCK rq(2)->lock
2924 ++ *
2925 ++ *                                          LOCK rq(2)->lock // orders against CPU1
2926 ++ *                                          sched-out Z
2927 ++ *                                          sched-in X
2928 ++ *                                          UNLOCK rq(2)->lock
2929 ++ *
2930 ++ *                    UNLOCK X->pi_lock
2931 ++ *   UNLOCK rq(0)->lock
2932 ++ *
2933 ++ *
2934 ++ * However; for wakeups there is a second guarantee we must provide, namely we
2935 ++ * must observe the state that led to our wakeup. That is, not only must our
2936 ++ * task observe its own prior state, it must also observe the stores prior to
2937 ++ * its wakeup.
2938 ++ *
2939 ++ * This means that any means of doing remote wakeups must order the CPU doing
2940 ++ * the wakeup against the CPU the task is going to end up running on. This,
2941 ++ * however, is already required for the regular Program-Order guarantee above,
2942 ++ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
2943 ++ *
2944 ++ */
2945 ++
2946 ++/**
2947 ++ * try_to_wake_up - wake up a thread
2948 ++ * @p: the thread to be awakened
2949 ++ * @state: the mask of task states that can be woken
2950 ++ * @wake_flags: wake modifier flags (WF_*)
2951 ++ *
2952 ++ * Conceptually does:
2953 ++ *
2954 ++ * If (@state & @p->state) @p->state = TASK_RUNNING.
2955 ++ *
2956 ++ * If the task was not queued/runnable, also place it back on a runqueue.
2957 ++ *
2958 ++ * This function is atomic against schedule() which would dequeue the task.
2959 ++ *
2960 ++ * It issues a full memory barrier before accessing @p->state, see the comment
2961 ++ * with set_current_state().
2962 ++ *
2963 ++ * Uses p->pi_lock to serialize against concurrent wake-ups.
2964 ++ *
2965 ++ * Relies on p->pi_lock stabilizing:
2966 ++ * - p->sched_class
2967 ++ * - p->cpus_ptr
2968 ++ * - p->sched_task_group
2969 ++ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
2970 ++ *
2971 ++ * Tries really hard to only take one task_rq(p)->lock for performance.
2972 ++ * Takes rq->lock in:
2973 ++ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
2974 ++ * - ttwu_queue() -- new rq, for enqueue of the task;
2975 ++ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
2976 ++ *
2977 ++ * As a consequence we race really badly with just about everything. See the
2978 ++ * many memory barriers and their comments for details.
2979 ++ *
2980 ++ * Return: %true if @p->state changes (an actual wakeup was done),
2981 ++ * %false otherwise.
2982 ++ */
2983 ++static int try_to_wake_up(struct task_struct *p, unsigned int state,
2984 ++ int wake_flags)
2985 ++{
2986 ++ unsigned long flags;
2987 ++ int cpu, success = 0;
2988 ++
2989 ++ preempt_disable();
2990 ++ if (p == current) {
2991 ++ /*
2992 ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
2993 ++ * == smp_processor_id()'. Together this means we can special
2994 ++ * case the whole 'p->on_rq && ttwu_runnable()' case below
2995 ++ * without taking any locks.
2996 ++ *
2997 ++ * In particular:
2998 ++ * - we rely on Program-Order guarantees for all the ordering,
2999 ++ * - we're serialized against set_special_state() by virtue of
3000 ++ * it disabling IRQs (this allows not taking ->pi_lock).
3001 ++ */
3002 ++ if (!(p->state & state))
3003 ++ goto out;
3004 ++
3005 ++ success = 1;
3006 ++ trace_sched_waking(p);
3007 ++ p->state = TASK_RUNNING;
3008 ++ trace_sched_wakeup(p);
3009 ++ goto out;
3010 ++ }
3011 ++
3012 ++ /*
3013 ++ * If we are going to wake up a thread waiting for CONDITION we
3014 ++ * need to ensure that CONDITION=1 done by the caller can not be
3015 ++ * reordered with p->state check below. This pairs with smp_store_mb()
3016 ++ * in set_current_state() that the waiting thread does.
3017 ++ */
3018 ++ raw_spin_lock_irqsave(&p->pi_lock, flags);
3019 ++ smp_mb__after_spinlock();
3020 ++ if (!(p->state & state))
3021 ++ goto unlock;
3022 ++
3023 ++ trace_sched_waking(p);
3024 ++
3025 ++ /* We're going to change ->state: */
3026 ++ success = 1;
3027 ++
3028 ++ /*
3029 ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would
3030 ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck
3031 ++ * in smp_cond_load_acquire() below.
3032 ++ *
3033 ++ * sched_ttwu_pending() try_to_wake_up()
3034 ++ * STORE p->on_rq = 1 LOAD p->state
3035 ++ * UNLOCK rq->lock
3036 ++ *
3037 ++ * __schedule() (switch to task 'p')
3038 ++ * LOCK rq->lock smp_rmb();
3039 ++ * smp_mb__after_spinlock();
3040 ++ * UNLOCK rq->lock
3041 ++ *
3042 ++ * [task p]
3043 ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
3044 ++ *
3045 ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
3046 ++ * __schedule(). See the comment for smp_mb__after_spinlock().
3047 ++ *
3048 ++ * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
3049 ++ */
3050 ++ smp_rmb();
3051 ++ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3052 ++ goto unlock;
3053 ++
3054 ++#ifdef CONFIG_SMP
3055 ++ /*
3056 ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
3057 ++ * possible to, falsely, observe p->on_cpu == 0.
3058 ++ *
3059 ++ * One must be running (->on_cpu == 1) in order to remove oneself
3060 ++ * from the runqueue.
3061 ++ *
3062 ++ * __schedule() (switch to task 'p') try_to_wake_up()
3063 ++ * STORE p->on_cpu = 1 LOAD p->on_rq
3064 ++ * UNLOCK rq->lock
3065 ++ *
3066 ++ * __schedule() (put 'p' to sleep)
3067 ++ * LOCK rq->lock smp_rmb();
3068 ++ * smp_mb__after_spinlock();
3069 ++ * STORE p->on_rq = 0 LOAD p->on_cpu
3070 ++ *
3071 ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
3072 ++ * __schedule(). See the comment for smp_mb__after_spinlock().
3073 ++ *
3074 ++ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3075 ++ * schedule()'s deactivate_task() has 'happened' and p will no longer
3076 ++ * care about its own p->state. See the comment in __schedule().
3077 ++ */
3078 ++ smp_acquire__after_ctrl_dep();
3079 ++
3080 ++ /*
3081 ++ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3082 ++ * == 0), which means we need to do an enqueue, change p->state to
3083 ++ * TASK_WAKING such that we can unlock p->pi_lock before doing the
3084 ++ * enqueue, such as ttwu_queue_wakelist().
3085 ++ */
3086 ++ p->state = TASK_WAKING;
3087 ++
3088 ++ /*
3089 ++ * If the owning (remote) CPU is still in the middle of schedule() with
3090 ++ * this task as prev, consider queueing p on the remote CPU's wake_list
3091 ++ * which potentially sends an IPI instead of spinning on p->on_cpu to
3092 ++ * let the waker make forward progress. This is safe because IRQs are
3093 ++ * disabled and the IPI will deliver after on_cpu is cleared.
3094 ++ *
3095 ++ * Ensure we load task_cpu(p) after p->on_cpu:
3096 ++ *
3097 ++ * set_task_cpu(p, cpu);
3098 ++ * STORE p->cpu = @cpu
3099 ++ * __schedule() (switch to task 'p')
3100 ++ * LOCK rq->lock
3101 ++ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
3102 ++ * STORE p->on_cpu = 1 LOAD p->cpu
3103 ++ *
3104 ++ * to ensure we observe the correct CPU on which the task is currently
3105 ++ * scheduling.
3106 ++ */
3107 ++ if (smp_load_acquire(&p->on_cpu) &&
3108 ++ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3109 ++ goto unlock;
3110 ++
3111 ++ /*
3112 ++ * If the owning (remote) CPU is still in the middle of schedule() with
3113 ++ * this task as prev, wait until it's done referencing the task.
3114 ++ *
3115 ++ * Pairs with the smp_store_release() in finish_task().
3116 ++ *
3117 ++ * This ensures that tasks getting woken will be fully ordered against
3118 ++ * their previous state and preserve Program Order.
3119 ++ */
3120 ++ smp_cond_load_acquire(&p->on_cpu, !VAL);
3121 ++
3122 ++ sched_task_ttwu(p);
3123 ++
3124 ++ cpu = select_task_rq(p);
3125 ++
3126 ++ if (cpu != task_cpu(p)) {
3127 ++ if (p->in_iowait) {
3128 ++ delayacct_blkio_end(p);
3129 ++ atomic_dec(&task_rq(p)->nr_iowait);
3130 ++ }
3131 ++
3132 ++ wake_flags |= WF_MIGRATED;
3133 ++ psi_ttwu_dequeue(p);
3134 ++ set_task_cpu(p, cpu);
3135 ++ }
3136 ++#else
3137 ++ cpu = task_cpu(p);
3138 ++#endif /* CONFIG_SMP */
3139 ++
3140 ++ ttwu_queue(p, cpu, wake_flags);
3141 ++unlock:
3142 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3143 ++out:
3144 ++ if (success)
3145 ++ ttwu_stat(p, task_cpu(p), wake_flags);
3146 ++ preempt_enable();
3147 ++
3148 ++ return success;
3149 ++}
3150 ++
3151 ++/**
3152 ++ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3153 ++ * @p: Process for which the function is to be invoked, can be @current.
3154 ++ * @func: Function to invoke.
3155 ++ * @arg: Argument to function.
3156 ++ *
3157 ++ * If the specified task can be quickly locked into a definite state
3158 ++ * (either sleeping or on a given runqueue), arrange to keep it in that
3159 ++ * state while invoking @func(@arg). This function can use ->on_rq and
3160 ++ * task_curr() to work out what the state is, if required. Given that
3161 ++ * @func can be invoked with a runqueue lock held, it had better be quite
3162 ++ * lightweight.
3163 ++ *
3164 ++ * Returns:
3165 ++ * @false if the task slipped out from under the locks.
3166 ++ * @true if the task was locked onto a runqueue or is sleeping.
3167 ++ * However, @func can override this by returning @false.
3168 ++ */
3169 ++bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
3170 ++{
3171 ++ struct rq_flags rf;
3172 ++ bool ret = false;
3173 ++ struct rq *rq;
3174 ++
3175 ++ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3176 ++ if (p->on_rq) {
3177 ++ rq = __task_rq_lock(p, &rf);
3178 ++ if (task_rq(p) == rq)
3179 ++ ret = func(p, arg);
3180 ++ __task_rq_unlock(rq, &rf);
3181 ++ } else {
3182 ++ switch (p->state) {
3183 ++ case TASK_RUNNING:
3184 ++ case TASK_WAKING:
3185 ++ break;
3186 ++ default:
3187 ++ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3188 ++ if (!p->on_rq)
3189 ++ ret = func(p, arg);
3190 ++ }
3191 ++ }
3192 ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3193 ++ return ret;
3194 ++}
3195 ++
3196 ++/**
3197 ++ * wake_up_process - Wake up a specific process
3198 ++ * @p: The process to be woken up.
3199 ++ *
3200 ++ * Attempt to wake up the nominated process and move it to the set of runnable
3201 ++ * processes.
3202 ++ *
3203 ++ * Return: 1 if the process was woken up, 0 if it was already running.
3204 ++ *
3205 ++ * This function executes a full memory barrier before accessing the task state.
3206 ++ */
3207 ++int wake_up_process(struct task_struct *p)
3208 ++{
3209 ++ return try_to_wake_up(p, TASK_NORMAL, 0);
3210 ++}
3211 ++EXPORT_SYMBOL(wake_up_process);
3212 ++
3213 ++int wake_up_state(struct task_struct *p, unsigned int state)
3214 ++{
3215 ++ return try_to_wake_up(p, state, 0);
3216 ++}
3217 ++
3218 ++/*
3219 ++ * Perform scheduler related setup for a newly forked process p.
3220 ++ * p is forked by current.
3221 ++ *
3222 ++ * __sched_fork() is basic setup used by init_idle() too:
3223 ++ */
3224 ++static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p)
3225 ++{
3226 ++ p->on_rq = 0;
3227 ++ p->on_cpu = 0;
3228 ++ p->utime = 0;
3229 ++ p->stime = 0;
3230 ++ p->sched_time = 0;
3231 ++
3232 ++#ifdef CONFIG_PREEMPT_NOTIFIERS
3233 ++ INIT_HLIST_HEAD(&p->preempt_notifiers);
3234 ++#endif
3235 ++
3236 ++#ifdef CONFIG_COMPACTION
3237 ++ p->capture_control = NULL;
3238 ++#endif
3239 ++#ifdef CONFIG_SMP
3240 ++ p->wake_entry.u_flags = CSD_TYPE_TTWU;
3241 ++#endif
3242 ++}
3243 ++
3244 ++/*
3245 ++ * fork()/clone()-time setup:
3246 ++ */
3247 ++int sched_fork(unsigned long clone_flags, struct task_struct *p)
3248 ++{
3249 ++ unsigned long flags;
3250 ++ struct rq *rq;
3251 ++
3252 ++ __sched_fork(clone_flags, p);
3253 ++ /*
3254 ++ * We mark the process as NEW here. This guarantees that
3255 ++ * nobody will actually run it, and a signal or other external
3256 ++ * event cannot wake it up and insert it on the runqueue either.
3257 ++ */
3258 ++ p->state = TASK_NEW;
3259 ++
3260 ++ /*
3261 ++ * Make sure we do not leak PI boosting priority to the child.
3262 ++ */
3263 ++ p->prio = current->normal_prio;
3264 ++
3265 ++ /*
3266 ++ * Revert to default priority/policy on fork if requested.
3267 ++ */
3268 ++ if (unlikely(p->sched_reset_on_fork)) {
3269 ++ if (task_has_rt_policy(p)) {
3270 ++ p->policy = SCHED_NORMAL;
3271 ++ p->static_prio = NICE_TO_PRIO(0);
3272 ++ p->rt_priority = 0;
3273 ++ } else if (PRIO_TO_NICE(p->static_prio) < 0)
3274 ++ p->static_prio = NICE_TO_PRIO(0);
3275 ++
3276 ++ p->prio = p->normal_prio = normal_prio(p);
3277 ++
3278 ++ /*
3279 ++ * We don't need the reset flag anymore after the fork. It has
3280 ++ * fulfilled its duty:
3281 ++ */
3282 ++ p->sched_reset_on_fork = 0;
3283 ++ }
3284 ++
3285 ++ /*
3286 ++ * The child is not yet in the pid-hash so no cgroup attach races,
3287 ++ * and the cgroup is pinned to this child because cgroup_fork()
3288 ++ * is run before sched_fork().
3289 ++ *
3290 ++ * Silence PROVE_RCU.
3291 ++ */
3292 ++ raw_spin_lock_irqsave(&p->pi_lock, flags);
3293 ++ /*
3294 ++ * Share the timeslice between parent and child so that the
3295 ++ * total amount of pending timeslices in the system doesn't change,
3296 ++ * resulting in more scheduling fairness.
3297 ++ */
3298 ++ rq = this_rq();
3299 ++ raw_spin_lock(&rq->lock);
3300 ++
3301 ++ rq->curr->time_slice /= 2;
3302 ++ p->time_slice = rq->curr->time_slice;
3303 ++#ifdef CONFIG_SCHED_HRTICK
3304 ++ hrtick_start(rq, rq->curr->time_slice);
3305 ++#endif
3306 ++
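++ /*
++ * If the halved slice inherited above is too small to be worth
++ * running (below RESCHED_NS), give the child a full
++ * sched_timeslice_ns slice instead and mark this CPU for
++ * rescheduling.
++ */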
3307 ++ if (p->time_slice < RESCHED_NS) {
3308 ++ p->time_slice = sched_timeslice_ns;
3309 ++ resched_curr(rq);
3310 ++ }
3311 ++ sched_task_fork(p, rq);
3312 ++ raw_spin_unlock(&rq->lock);
3313 ++
3314 ++ rseq_migrate(p);
3315 ++ /*
3316 ++ * We're setting the CPU for the first time and we don't migrate,
3317 ++ * so use __set_task_cpu().
3318 ++ */
3319 ++ __set_task_cpu(p, cpu_of(rq));
3320 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3321 ++
3322 ++#ifdef CONFIG_SCHED_INFO
3323 ++ if (unlikely(sched_info_on()))
3324 ++ memset(&p->sched_info, 0, sizeof(p->sched_info));
3325 ++#endif
3326 ++ init_task_preempt_count(p);
3327 ++
3328 ++ return 0;
3329 ++}
3330 ++
3331 ++void sched_post_fork(struct task_struct *p) {}
3332 ++
3333 ++#ifdef CONFIG_SCHEDSTATS
3334 ++
3335 ++DEFINE_STATIC_KEY_FALSE(sched_schedstats);
3336 ++static bool __initdata __sched_schedstats = false;
3337 ++
3338 ++static void set_schedstats(bool enabled)
3339 ++{
3340 ++ if (enabled)
3341 ++ static_branch_enable(&sched_schedstats);
3342 ++ else
3343 ++ static_branch_disable(&sched_schedstats);
3344 ++}
3345 ++
3346 ++void force_schedstat_enabled(void)
3347 ++{
3348 ++ if (!schedstat_enabled()) {
3349 ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
3350 ++ static_branch_enable(&sched_schedstats);
3351 ++ }
3352 ++}
3353 ++
3354 ++static int __init setup_schedstats(char *str)
3355 ++{
3356 ++ int ret = 0;
3357 ++ if (!str)
3358 ++ goto out;
3359 ++
3360 ++ /*
3361 ++ * This code is called before jump labels have been set up, so we can't
3362 ++ * change the static branch directly just yet. Instead set a temporary
3363 ++ * variable so init_schedstats() can do it later.
3364 ++ */
3365 ++ if (!strcmp(str, "enable")) {
3366 ++ __sched_schedstats = true;
3367 ++ ret = 1;
3368 ++ } else if (!strcmp(str, "disable")) {
3369 ++ __sched_schedstats = false;
3370 ++ ret = 1;
3371 ++ }
3372 ++out:
3373 ++ if (!ret)
3374 ++ pr_warn("Unable to parse schedstats=\n");
3375 ++
3376 ++ return ret;
3377 ++}
3378 ++__setup("schedstats=", setup_schedstats);
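++
++/*
++ * schedstats can thus be toggled at boot time with "schedstats=enable" or
++ * "schedstats=disable" on the kernel command line, or later through the
++ * kernel.sched_schedstats sysctl handled by sysctl_schedstats() below.
++ */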
3379 ++
3380 ++static void __init init_schedstats(void)
3381 ++{
3382 ++ set_schedstats(__sched_schedstats);
3383 ++}
3384 ++
3385 ++#ifdef CONFIG_PROC_SYSCTL
3386 ++int sysctl_schedstats(struct ctl_table *table, int write,
3387 ++ void __user *buffer, size_t *lenp, loff_t *ppos)
3388 ++{
3389 ++ struct ctl_table t;
3390 ++ int err;
3391 ++ int state = static_branch_likely(&sched_schedstats);
3392 ++
3393 ++ if (write && !capable(CAP_SYS_ADMIN))
3394 ++ return -EPERM;
3395 ++
3396 ++ t = *table;
3397 ++ t.data = &state;
3398 ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3399 ++ if (err < 0)
3400 ++ return err;
3401 ++ if (write)
3402 ++ set_schedstats(state);
3403 ++ return err;
3404 ++}
3405 ++#endif /* CONFIG_PROC_SYSCTL */
3406 ++#else /* !CONFIG_SCHEDSTATS */
3407 ++static inline void init_schedstats(void) {}
3408 ++#endif /* CONFIG_SCHEDSTATS */
3409 ++
3410 ++/*
3411 ++ * wake_up_new_task - wake up a newly created task for the first time.
3412 ++ *
3413 ++ * This function will do some initial scheduler statistics housekeeping
3414 ++ * that must be done for every newly created context, then puts the task
3415 ++ * on the runqueue and wakes it.
3416 ++ */
3417 ++void wake_up_new_task(struct task_struct *p)
3418 ++{
3419 ++ unsigned long flags;
3420 ++ struct rq *rq;
3421 ++
3422 ++ raw_spin_lock_irqsave(&p->pi_lock, flags);
3423 ++ p->state = TASK_RUNNING;
3424 ++ rq = cpu_rq(select_task_rq(p));
3425 ++#ifdef CONFIG_SMP
3426 ++ rseq_migrate(p);
3427 ++ /*
3428 ++ * Fork balancing, do it here and not earlier because:
3429 ++ * - cpus_ptr can change in the fork path
3430 ++ * - any previously selected CPU might disappear through hotplug
3431 ++ *
3432 ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
3433 ++ * as we're not fully set-up yet.
3434 ++ */
3435 ++ __set_task_cpu(p, cpu_of(rq));
3436 ++#endif
3437 ++
3438 ++ raw_spin_lock(&rq->lock);
3439 ++ update_rq_clock(rq);
3440 ++
3441 ++ activate_task(p, rq);
3442 ++ trace_sched_wakeup_new(p);
3443 ++ check_preempt_curr(rq);
3444 ++
3445 ++ raw_spin_unlock(&rq->lock);
3446 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3447 ++}
3448 ++
3449 ++#ifdef CONFIG_PREEMPT_NOTIFIERS
3450 ++
3451 ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
3452 ++
3453 ++void preempt_notifier_inc(void)
3454 ++{
3455 ++ static_branch_inc(&preempt_notifier_key);
3456 ++}
3457 ++EXPORT_SYMBOL_GPL(preempt_notifier_inc);
3458 ++
3459 ++void preempt_notifier_dec(void)
3460 ++{
3461 ++ static_branch_dec(&preempt_notifier_key);
3462 ++}
3463 ++EXPORT_SYMBOL_GPL(preempt_notifier_dec);
3464 ++
3465 ++/**
3466 ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled
3467 ++ * @notifier: notifier struct to register
3468 ++ */
3469 ++void preempt_notifier_register(struct preempt_notifier *notifier)
3470 ++{
3471 ++ if (!static_branch_unlikely(&preempt_notifier_key))
3472 ++ WARN(1, "registering preempt_notifier while notifiers disabled\n");
3473 ++
3474 ++ hlist_add_head(&notifier->link, &current->preempt_notifiers);
3475 ++}
3476 ++EXPORT_SYMBOL_GPL(preempt_notifier_register);
3477 ++
3478 ++/**
3479 ++ * preempt_notifier_unregister - no longer interested in preemption notifications
3480 ++ * @notifier: notifier struct to unregister
3481 ++ *
3482 ++ * This is *not* safe to call from within a preemption notifier.
3483 ++ */
3484 ++void preempt_notifier_unregister(struct preempt_notifier *notifier)
3485 ++{
3486 ++ hlist_del(&notifier->link);
3487 ++}
3488 ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
3489 ++
3490 ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
3491 ++{
3492 ++ struct preempt_notifier *notifier;
3493 ++
3494 ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
3495 ++ notifier->ops->sched_in(notifier, raw_smp_processor_id());
3496 ++}
3497 ++
3498 ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3499 ++{
3500 ++ if (static_branch_unlikely(&preempt_notifier_key))
3501 ++ __fire_sched_in_preempt_notifiers(curr);
3502 ++}
3503 ++
3504 ++static void
3505 ++__fire_sched_out_preempt_notifiers(struct task_struct *curr,
3506 ++ struct task_struct *next)
3507 ++{
3508 ++ struct preempt_notifier *notifier;
3509 ++
3510 ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
3511 ++ notifier->ops->sched_out(notifier, next);
3512 ++}
3513 ++
3514 ++static __always_inline void
3515 ++fire_sched_out_preempt_notifiers(struct task_struct *curr,
3516 ++ struct task_struct *next)
3517 ++{
3518 ++ if (static_branch_unlikely(&preempt_notifier_key))
3519 ++ __fire_sched_out_preempt_notifiers(curr, next);
3520 ++}
3521 ++
3522 ++#else /* !CONFIG_PREEMPT_NOTIFIERS */
3523 ++
3524 ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3525 ++{
3526 ++}
3527 ++
3528 ++static inline void
3529 ++fire_sched_out_preempt_notifiers(struct task_struct *curr,
3530 ++ struct task_struct *next)
3531 ++{
3532 ++}
3533 ++
3534 ++#endif /* CONFIG_PREEMPT_NOTIFIERS */
3535 ++
3536 ++static inline void prepare_task(struct task_struct *next)
3537 ++{
3538 ++ /*
3539 ++ * Claim the task as running, we do this before switching to it
3540 ++ * such that any running task will have this set.
3541 ++ *
3542 ++ * See the ttwu() WF_ON_CPU case and its ordering comment.
3543 ++ */
3544 ++ WRITE_ONCE(next->on_cpu, 1);
3545 ++}
3546 ++
3547 ++static inline void finish_task(struct task_struct *prev)
3548 ++{
3549 ++#ifdef CONFIG_SMP
3550 ++ /*
3551 ++ * This must be the very last reference to @prev from this CPU. After
3552 ++ * p->on_cpu is cleared, the task can be moved to a different CPU. We
3553 ++ * must ensure this doesn't happen until the switch is completely
3554 ++ * finished.
3555 ++ *
3556 ++ * In particular, the load of prev->state in finish_task_switch() must
3557 ++ * happen before this.
3558 ++ *
3559 ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
3560 ++ */
3561 ++ smp_store_release(&prev->on_cpu, 0);
3562 ++#else
3563 ++ prev->on_cpu = 0;
3564 ++#endif
3565 ++}
3566 ++
3567 ++#ifdef CONFIG_SMP
3568 ++
3569 ++static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
3570 ++{
3571 ++ void (*func)(struct rq *rq);
3572 ++ struct callback_head *next;
3573 ++
3574 ++ lockdep_assert_held(&rq->lock);
3575 ++
3576 ++ while (head) {
3577 ++ func = (void (*)(struct rq *))head->func;
3578 ++ next = head->next;
3579 ++ head->next = NULL;
3580 ++ head = next;
3581 ++
3582 ++ func(rq);
3583 ++ }
3584 ++}
3585 ++
3586 ++static void balance_push(struct rq *rq);
3587 ++
3588 ++struct callback_head balance_push_callback = {
3589 ++ .next = NULL,
3590 ++ .func = (void (*)(struct callback_head *))balance_push,
3591 ++};
3592 ++
3593 ++static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
3594 ++{
3595 ++ struct callback_head *head = rq->balance_callback;
3596 ++
3597 ++ if (head) {
3598 ++ lockdep_assert_held(&rq->lock);
3599 ++ rq->balance_callback = NULL;
3600 ++ }
3601 ++
3602 ++ return head;
3603 ++}
3604 ++
3605 ++static void __balance_callbacks(struct rq *rq)
3606 ++{
3607 ++ do_balance_callbacks(rq, splice_balance_callbacks(rq));
3608 ++}
3609 ++
3610 ++static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
3611 ++{
3612 ++ unsigned long flags;
3613 ++
3614 ++ if (unlikely(head)) {
3615 ++ raw_spin_lock_irqsave(&rq->lock, flags);
3616 ++ do_balance_callbacks(rq, head);
3617 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
3618 ++ }
3619 ++}
3620 ++
3621 ++#else
3622 ++
3623 ++static inline void __balance_callbacks(struct rq *rq)
3624 ++{
3625 ++}
3626 ++
3627 ++static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
3628 ++{
3629 ++ return NULL;
3630 ++}
3631 ++
3632 ++static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
3633 ++{
3634 ++}
3635 ++
3636 ++#endif
3637 ++
3638 ++static inline void
3639 ++prepare_lock_switch(struct rq *rq, struct task_struct *next)
3640 ++{
3641 ++ /*
3642 ++ * The runqueue lock will be released by the next
3643 ++ * task (which is an invalid locking op but in the case
3644 ++ * of the scheduler it's an obvious special-case), so we
3645 ++ * do an early lockdep release here:
3646 ++ */
3647 ++ spin_release(&rq->lock.dep_map, _THIS_IP_);
3648 ++#ifdef CONFIG_DEBUG_SPINLOCK
3649 ++ /* this is a valid case when another task releases the spinlock */
3650 ++ rq->lock.owner = next;
3651 ++#endif
3652 ++}
3653 ++
3654 ++static inline void finish_lock_switch(struct rq *rq)
3655 ++{
3656 ++ /*
3657 ++ * If we are tracking spinlock dependencies then we have to
3658 ++ * fix up the runqueue lock - which gets 'carried over' from
3659 ++ * prev into current:
3660 ++ */
3661 ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
3662 ++ __balance_callbacks(rq);
3663 ++ raw_spin_unlock_irq(&rq->lock);
3664 ++}
3665 ++
3666 ++/*
3667 ++ * NOP if the arch has not defined these:
3668 ++ */
3669 ++
3670 ++#ifndef prepare_arch_switch
3671 ++# define prepare_arch_switch(next) do { } while (0)
3672 ++#endif
3673 ++
3674 ++#ifndef finish_arch_post_lock_switch
3675 ++# define finish_arch_post_lock_switch() do { } while (0)
3676 ++#endif
3677 ++
3678 ++static inline void kmap_local_sched_out(void)
3679 ++{
3680 ++#ifdef CONFIG_KMAP_LOCAL
3681 ++ if (unlikely(current->kmap_ctrl.idx))
3682 ++ __kmap_local_sched_out();
3683 ++#endif
3684 ++}
3685 ++
3686 ++static inline void kmap_local_sched_in(void)
3687 ++{
3688 ++#ifdef CONFIG_KMAP_LOCAL
3689 ++ if (unlikely(current->kmap_ctrl.idx))
3690 ++ __kmap_local_sched_in();
3691 ++#endif
3692 ++}
3693 ++
3694 ++/**
3695 ++ * prepare_task_switch - prepare to switch tasks
3696 ++ * @rq: the runqueue preparing to switch
3697 ++ * @next: the task we are going to switch to.
3698 ++ *
3699 ++ * This is called with the rq lock held and interrupts off. It must
3700 ++ * be paired with a subsequent finish_task_switch after the context
3701 ++ * switch.
3702 ++ *
3703 ++ * prepare_task_switch sets up locking and calls architecture specific
3704 ++ * hooks.
3705 ++ */
3706 ++static inline void
3707 ++prepare_task_switch(struct rq *rq, struct task_struct *prev,
3708 ++ struct task_struct *next)
3709 ++{
3710 ++ kcov_prepare_switch(prev);
3711 ++ sched_info_switch(rq, prev, next);
3712 ++ perf_event_task_sched_out(prev, next);
3713 ++ rseq_preempt(prev);
3714 ++ fire_sched_out_preempt_notifiers(prev, next);
3715 ++ kmap_local_sched_out();
3716 ++ prepare_task(next);
3717 ++ prepare_arch_switch(next);
3718 ++}
3719 ++
3720 ++/**
3721 ++ * finish_task_switch - clean up after a task-switch
3722 ++ * @rq: runqueue associated with task-switch
3723 ++ * @prev: the thread we just switched away from.
3724 ++ *
3725 ++ * finish_task_switch must be called after the context switch, paired
3726 ++ * with a prepare_task_switch call before the context switch.
3727 ++ * finish_task_switch will reconcile locking set up by prepare_task_switch,
3728 ++ * and do any other architecture-specific cleanup actions.
3729 ++ *
3730 ++ * Note that we may have delayed dropping an mm in context_switch(). If
3731 ++ * so, we finish that here outside of the runqueue lock. (Doing it
3732 ++ * with the lock held can cause deadlocks; see schedule() for
3733 ++ * details.)
3734 ++ *
3735 ++ * The context switch has flipped the stack from under us and restored the
3736 ++ * local variables which were saved when this task called schedule() in the
3737 ++ * past. prev == current is still correct but we need to recalculate this_rq
3738 ++ * because prev may have moved to another CPU.
3739 ++ */
3740 ++static struct rq *finish_task_switch(struct task_struct *prev)
3741 ++ __releases(rq->lock)
3742 ++{
3743 ++ struct rq *rq = this_rq();
3744 ++ struct mm_struct *mm = rq->prev_mm;
3745 ++ long prev_state;
3746 ++
3747 ++ /*
3748 ++ * The previous task will have left us with a preempt_count of 2
3749 ++ * because it left us after:
3750 ++ *
3751 ++ * schedule()
3752 ++ * preempt_disable(); // 1
3753 ++ * __schedule()
3754 ++ * raw_spin_lock_irq(&rq->lock) // 2
3755 ++ *
3756 ++ * Also, see FORK_PREEMPT_COUNT.
3757 ++ */
3758 ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
3759 ++ "corrupted preempt_count: %s/%d/0x%x\n",
3760 ++ current->comm, current->pid, preempt_count()))
3761 ++ preempt_count_set(FORK_PREEMPT_COUNT);
3762 ++
3763 ++ rq->prev_mm = NULL;
3764 ++
3765 ++ /*
3766 ++ * A task struct has one reference for the use as "current".
3767 ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
3768 ++ * schedule one last time. The schedule call will never return, and
3769 ++ * the scheduled task must drop that reference.
3770 ++ *
3771 ++ * We must observe prev->state before clearing prev->on_cpu (in
3772 ++ * finish_task), otherwise a concurrent wakeup can get prev
3773 ++ * running on another CPU and we could race with its RUNNING -> DEAD
3774 ++ * transition, resulting in a double drop.
3775 ++ */
3776 ++ prev_state = prev->state;
3777 ++ vtime_task_switch(prev);
3778 ++ perf_event_task_sched_in(prev, current);
3779 ++ finish_task(prev);
3780 ++ finish_lock_switch(rq);
3781 ++ finish_arch_post_lock_switch();
3782 ++ kcov_finish_switch(current);
3783 ++ /*
3784 ++ * kmap_local_sched_out() is invoked with rq::lock held and
3785 ++ * interrupts disabled. There is no requirement for that, but the
3786 ++ * sched out code does not have an interrupt enabled section.
3787 ++ * Restoring the maps on sched in does not require interrupts being
3788 ++ * disabled either.
3789 ++ */
3790 ++ kmap_local_sched_in();
3791 ++
3792 ++ fire_sched_in_preempt_notifiers(current);
3793 ++ /*
3794 ++ * When switching through a kernel thread, the loop in
3795 ++ * membarrier_{private,global}_expedited() may have observed that
3796 ++ * kernel thread and not issued an IPI. It is therefore possible to
3797 ++ * schedule between user->kernel->user threads without passing through
3798 ++ * switch_mm(). Membarrier requires a barrier after storing to
3799 ++ * rq->curr, before returning to userspace, so provide them here:
3800 ++ *
3801 ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
3802 ++ * provided by mmdrop(),
3803 ++ * - a sync_core for SYNC_CORE.
3804 ++ */
3805 ++ if (mm) {
3806 ++ membarrier_mm_sync_core_before_usermode(mm);
3807 ++ mmdrop(mm);
3808 ++ }
3809 ++ if (unlikely(prev_state == TASK_DEAD)) {
3810 ++ /*
3811 ++ * Remove function-return probe instances associated with this
3812 ++ * task and put them back on the free list.
3813 ++ */
3814 ++ kprobe_flush_task(prev);
3815 ++
3816 ++ /* Task is done with its stack. */
3817 ++ put_task_stack(prev);
3818 ++
3819 ++ put_task_struct_rcu_user(prev);
3820 ++ }
3821 ++
3822 ++ tick_nohz_task_switch();
3823 ++ return rq;
3824 ++}
3825 ++
3826 ++/**
3827 ++ * schedule_tail - first thing a freshly forked thread must call.
3828 ++ * @prev: the thread we just switched away from.
3829 ++ */
3830 ++asmlinkage __visible void schedule_tail(struct task_struct *prev)
3831 ++ __releases(rq->lock)
3832 ++{
3833 ++ /*
3834 ++ * New tasks start with FORK_PREEMPT_COUNT, see there and
3835 ++ * finish_task_switch() for details.
3836 ++ *
3837 ++ * finish_task_switch() will drop rq->lock() and lower preempt_count
3838 ++ * and the preempt_enable() will end up enabling preemption (on
3839 ++ * PREEMPT_COUNT kernels).
3840 ++ */
3841 ++
3842 ++ finish_task_switch(prev);
3843 ++ preempt_enable();
3844 ++
3845 ++ if (current->set_child_tid)
3846 ++ put_user(task_pid_vnr(current), current->set_child_tid);
3847 ++
3848 ++ calculate_sigpending();
3849 ++}
3850 ++
3851 ++/*
3852 ++ * context_switch - switch to the new MM and the new thread's register state.
3853 ++ */
3854 ++static __always_inline struct rq *
3855 ++context_switch(struct rq *rq, struct task_struct *prev,
3856 ++ struct task_struct *next)
3857 ++{
3858 ++ prepare_task_switch(rq, prev, next);
3859 ++
3860 ++ /*
3861 ++ * For paravirt, this is coupled with an exit in switch_to to
3862 ++ * combine the page table reload and the switch backend into
3863 ++ * one hypercall.
3864 ++ */
3865 ++ arch_start_context_switch(prev);
3866 ++
3867 ++ /*
3868 ++ * kernel -> kernel lazy + transfer active
3869 ++ * user -> kernel lazy + mmgrab() active
3870 ++ *
3871 ++ * kernel -> user switch + mmdrop() active
3872 ++ * user -> user switch
3873 ++ */
3874 ++ if (!next->mm) { // to kernel
3875 ++ enter_lazy_tlb(prev->active_mm, next);
3876 ++
3877 ++ next->active_mm = prev->active_mm;
3878 ++ if (prev->mm) // from user
3879 ++ mmgrab(prev->active_mm);
3880 ++ else
3881 ++ prev->active_mm = NULL;
3882 ++ } else { // to user
3883 ++ membarrier_switch_mm(rq, prev->active_mm, next->mm);
3884 ++ /*
3885 ++ * sys_membarrier() requires an smp_mb() between setting
3886 ++ * rq->curr / membarrier_switch_mm() and returning to userspace.
3887 ++ *
3888 ++ * The below provides this either through switch_mm(), or in
3889 ++ * case 'prev->active_mm == next->mm' through
3890 ++ * finish_task_switch()'s mmdrop().
3891 ++ */
3892 ++ switch_mm_irqs_off(prev->active_mm, next->mm, next);
3893 ++
3894 ++ if (!prev->mm) { // from kernel
3895 ++ /* will mmdrop() in finish_task_switch(). */
3896 ++ rq->prev_mm = prev->active_mm;
3897 ++ prev->active_mm = NULL;
3898 ++ }
3899 ++ }
3900 ++
3901 ++ prepare_lock_switch(rq, next);
3902 ++
3903 ++ /* Here we just switch the register state and the stack. */
3904 ++ switch_to(prev, next, prev);
3905 ++ barrier();
3906 ++
3907 ++ return finish_task_switch(prev);
3908 ++}
3909 ++
3910 ++/*
3911 ++ * nr_running, nr_uninterruptible and nr_context_switches:
3912 ++ *
3913 ++ * externally visible scheduler statistics: current number of runnable
3914 ++ * threads, total number of context switches performed since bootup.
3915 ++ */
3916 ++unsigned long nr_running(void)
3917 ++{
3918 ++ unsigned long i, sum = 0;
3919 ++
3920 ++ for_each_online_cpu(i)
3921 ++ sum += cpu_rq(i)->nr_running;
3922 ++
3923 ++ return sum;
3924 ++}
3925 ++
3926 ++/*
3927 ++ * Check if only the current task is running on the CPU.
3928 ++ *
3929 ++ * Caution: this function does not check that the caller has disabled
3930 ++ * preemption, thus the result might have a time-of-check-to-time-of-use
3931 ++ * race. The caller is responsible for using it correctly, for example:
3932 ++ *
3933 ++ * - from a non-preemptible section (of course)
3934 ++ *
3935 ++ * - from a thread that is bound to a single CPU
3936 ++ *
3937 ++ * - in a loop with very short iterations (e.g. a polling loop)
3938 ++ */
3939 ++bool single_task_running(void)
3940 ++{
3941 ++ return raw_rq()->nr_running == 1;
3942 ++}
3943 ++EXPORT_SYMBOL(single_task_running);
3944 ++
3945 ++unsigned long long nr_context_switches(void)
3946 ++{
3947 ++ int i;
3948 ++ unsigned long long sum = 0;
3949 ++
3950 ++ for_each_possible_cpu(i)
3951 ++ sum += cpu_rq(i)->nr_switches;
3952 ++
3953 ++ return sum;
3954 ++}
3955 ++
3956 ++/*
3957 ++ * Consumers of these two interfaces, like for example the cpuidle menu
3958 ++ * governor, are using nonsensical data: they prefer shallow idle state
3959 ++ * selection for a CPU that has IO-wait, even though that CPU might not even
3960 ++ * end up running the task when it does become runnable.
3961 ++ */
3962 ++
3963 ++unsigned long nr_iowait_cpu(int cpu)
3964 ++{
3965 ++ return atomic_read(&cpu_rq(cpu)->nr_iowait);
3966 ++}
3967 ++
3968 ++/*
3969 ++ * IO-wait accounting, and how it's mostly bollocks (on SMP).
3970 ++ *
3971 ++ * The idea behind IO-wait accounting is to account the idle time that we could
3972 ++ * have spent running if it were not for IO. That is, if we were to improve the
3973 ++ * storage performance, we'd have a proportional reduction in IO-wait time.
3974 ++ *
3975 ++ * This all works nicely on UP, where, when a task blocks on IO, we account
3976 ++ * idle time as IO-wait, because if the storage were faster, it could've been
3977 ++ * running and we'd not be idle.
3978 ++ *
3979 ++ * This has been extended to SMP, by doing the same for each CPU. This however
3980 ++ * is broken.
3981 ++ *
3982 ++ * Imagine for instance the case where two tasks block on one CPU; only that
3983 ++ * CPU will have IO-wait accounted, while the other has regular idle. Even
3984 ++ * though, if the storage were faster, both could've run at the same time,
3985 ++ * utilising both CPUs.
3986 ++ *
3987 ++ * This means, that when looking globally, the current IO-wait accounting on
3988 ++ * SMP is a lower bound, by reason of under-accounting.
3989 ++ *
3990 ++ * Worse, since the numbers are provided per CPU, they are sometimes
3991 ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
3992 ++ * associated with any one particular CPU; it can wake up on a different CPU
3993 ++ * than the one it blocked on. This means the per-CPU IO-wait number is meaningless.
3994 ++ *
3995 ++ * Task CPU affinities can make all that even more 'interesting'.
3996 ++ */
3997 ++
3998 ++unsigned long nr_iowait(void)
3999 ++{
4000 ++ unsigned long i, sum = 0;
4001 ++
4002 ++ for_each_possible_cpu(i)
4003 ++ sum += nr_iowait_cpu(i);
4004 ++
4005 ++ return sum;
4006 ++}
4007 ++
4008 ++#ifdef CONFIG_SMP
4009 ++
4010 ++/*
4011 ++ * sched_exec - execve() is a valuable balancing opportunity, because at
4012 ++ * this point the task has the smallest effective memory and cache
4013 ++ * footprint.
4014 ++ */
4015 ++void sched_exec(void)
4016 ++{
4017 ++ struct task_struct *p = current;
4018 ++ unsigned long flags;
4019 ++ int dest_cpu;
4020 ++ struct rq *rq;
4021 ++
4022 ++ raw_spin_lock_irqsave(&p->pi_lock, flags);
4023 ++ rq = this_rq();
4024 ++
4025 ++ if (rq != task_rq(p) || rq->nr_running < 2)
4026 ++ goto unlock;
4027 ++
4028 ++ dest_cpu = select_task_rq(p);
4029 ++ if (dest_cpu == smp_processor_id())
4030 ++ goto unlock;
4031 ++
4032 ++ if (likely(cpu_active(dest_cpu))) {
4033 ++ struct migration_arg arg = { p, dest_cpu };
4034 ++
4035 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4036 ++ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
4037 ++ return;
4038 ++ }
4039 ++unlock:
4040 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4041 ++}
4042 ++
4043 ++#endif
4044 ++
4045 ++DEFINE_PER_CPU(struct kernel_stat, kstat);
4046 ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
4047 ++
4048 ++EXPORT_PER_CPU_SYMBOL(kstat);
4049 ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
4050 ++
4051 ++static inline void update_curr(struct rq *rq, struct task_struct *p)
4052 ++{
4053 ++ s64 ns = rq->clock_task - p->last_ran;
4054 ++
4055 ++ p->sched_time += ns;
4056 ++ account_group_exec_runtime(p, ns);
4057 ++
4058 ++ p->time_slice -= ns;
4059 ++ p->last_ran = rq->clock_task;
4060 ++}
4061 ++
4062 ++/*
4063 ++ * Return accounted runtime for the task.
4064 ++ * Return separately the current's pending runtime that have not been
4065 ++ * accounted yet.
4066 ++ */
4067 ++unsigned long long task_sched_runtime(struct task_struct *p)
4068 ++{
4069 ++ unsigned long flags;
4070 ++ struct rq *rq;
4071 ++ raw_spinlock_t *lock;
4072 ++ u64 ns;
4073 ++
4074 ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
4075 ++ /*
4076 ++ * 64-bit doesn't need locks to atomically read a 64-bit value.
4077 ++ * So we have an optimization chance when the task's delta_exec is 0.
4078 ++ * Reading ->on_cpu is racy, but this is ok.
4079 ++ *
4080 ++ * If we race with it leaving CPU, we'll take a lock. So we're correct.
4081 ++ * If we race with it entering CPU, unaccounted time is 0. This is
4082 ++ * indistinguishable from the read occurring a few cycles earlier.
4083 ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
4084 ++ * been accounted, so we're correct here as well.
4085 ++ */
4086 ++ if (!p->on_cpu || !task_on_rq_queued(p))
4087 ++ return tsk_seruntime(p);
4088 ++#endif
4089 ++
4090 ++ rq = task_access_lock_irqsave(p, &lock, &flags);
4091 ++ /*
4092 ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would
4093 ++ * project cycles that may never be accounted to this
4094 ++ * thread, breaking clock_gettime().
4095 ++ */
4096 ++ if (p == rq->curr && task_on_rq_queued(p)) {
4097 ++ update_rq_clock(rq);
4098 ++ update_curr(rq, p);
4099 ++ }
4100 ++ ns = tsk_seruntime(p);
4101 ++ task_access_unlock_irqrestore(p, lock, &flags);
4102 ++
4103 ++ return ns;
4104 ++}
4105 ++
4106 ++/* This manages tasks that have run out of timeslice during a scheduler_tick */
4107 ++static inline void scheduler_task_tick(struct rq *rq)
4108 ++{
4109 ++ struct task_struct *p = rq->curr;
4110 ++
4111 ++ if (is_idle_task(p))
4112 ++ return;
4113 ++
4114 ++ update_curr(rq, p);
4115 ++ cpufreq_update_util(rq, 0);
4116 ++
4117 ++ /*
4118 ++ * Tasks that have less than RESCHED_NS of time slice left will be
4119 ++ * rescheduled.
4120 ++ */
4121 ++ if (p->time_slice >= RESCHED_NS)
4122 ++ return;
4123 ++ set_tsk_need_resched(p);
4124 ++ set_preempt_need_resched();
4125 ++}
4126 ++
4127 ++#ifdef CONFIG_SCHED_DEBUG
4128 ++static u64 cpu_resched_latency(struct rq *rq)
4129 ++{
4130 ++ int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
4131 ++ u64 resched_latency, now = rq_clock(rq);
4132 ++ static bool warned_once;
4133 ++
4134 ++ if (sysctl_resched_latency_warn_once && warned_once)
4135 ++ return 0;
4136 ++
4137 ++ if (!need_resched() || !latency_warn_ms)
4138 ++ return 0;
4139 ++
4140 ++ if (system_state == SYSTEM_BOOTING)
4141 ++ return 0;
4142 ++
4143 ++ if (!rq->last_seen_need_resched_ns) {
4144 ++ rq->last_seen_need_resched_ns = now;
4145 ++ rq->ticks_without_resched = 0;
4146 ++ return 0;
4147 ++ }
4148 ++
4149 ++ rq->ticks_without_resched++;
4150 ++ resched_latency = now - rq->last_seen_need_resched_ns;
4151 ++ if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
4152 ++ return 0;
4153 ++
4154 ++ warned_once = true;
4155 ++
4156 ++ return resched_latency;
4157 ++}
4158 ++
4159 ++static int __init setup_resched_latency_warn_ms(char *str)
4160 ++{
4161 ++ long val;
4162 ++
4163 ++ if ((kstrtol(str, 0, &val))) {
4164 ++ pr_warn("Unable to set resched_latency_warn_ms\n");
4165 ++ return 1;
4166 ++ }
4167 ++
4168 ++ sysctl_resched_latency_warn_ms = val;
4169 ++ return 1;
4170 ++}
4171 ++__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
4172 ++#else
4173 ++static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
4174 ++#endif /* CONFIG_SCHED_DEBUG */
4175 ++
4176 ++/*
4177 ++ * This function gets called by the timer code, with HZ frequency.
4178 ++ * We call it with interrupts disabled.
4179 ++ */
4180 ++void scheduler_tick(void)
4181 ++{
4182 ++ int cpu __maybe_unused = smp_processor_id();
4183 ++ struct rq *rq = cpu_rq(cpu);
4184 ++ u64 resched_latency;
4185 ++
4186 ++ arch_scale_freq_tick();
4187 ++ sched_clock_tick();
4188 ++
4189 ++ raw_spin_lock(&rq->lock);
4190 ++ update_rq_clock(rq);
4191 ++
4192 ++ scheduler_task_tick(rq);
4193 ++ if (sched_feat(LATENCY_WARN))
4194 ++ resched_latency = cpu_resched_latency(rq);
4195 ++ calc_global_load_tick(rq);
4196 ++
4197 ++ rq->last_tick = rq->clock;
4198 ++ raw_spin_unlock(&rq->lock);
4199 ++
4200 ++ if (sched_feat(LATENCY_WARN) && resched_latency)
4201 ++ resched_latency_warn(cpu, resched_latency);
4202 ++
4203 ++ perf_event_task_tick();
4204 ++}
4205 ++
4206 ++#ifdef CONFIG_SCHED_SMT
4207 ++static inline int active_load_balance_cpu_stop(void *data)
4208 ++{
4209 ++ struct rq *rq = this_rq();
4210 ++ struct task_struct *p = data;
4211 ++ cpumask_t tmp;
4212 ++ unsigned long flags;
4213 ++
4214 ++ local_irq_save(flags);
4215 ++
4216 ++ raw_spin_lock(&p->pi_lock);
4217 ++ raw_spin_lock(&rq->lock);
4218 ++
4219 ++ rq->active_balance = 0;
4220 ++ /* _something_ may have changed the task, double check again */
4221 ++ if (task_on_rq_queued(p) && task_rq(p) == rq &&
4222 ++ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask) &&
4223 ++ !is_migration_disabled(p)) {
4224 ++ int cpu = cpu_of(rq);
4225 ++ int dcpu = __best_mask_cpu(cpu, &tmp,
4226 ++ per_cpu(sched_cpu_llc_mask, cpu));
4227 ++ rq = move_queued_task(rq, p, dcpu);
4228 ++ }
4229 ++
4230 ++ raw_spin_unlock(&rq->lock);
4231 ++ raw_spin_unlock(&p->pi_lock);
4232 ++
4233 ++ local_irq_restore(flags);
4234 ++
4235 ++ return 0;
4236 ++}
4237 ++
4238 ++/* sg_balance_trigger - trigger sibling group balance for @cpu */
4239 ++static inline int sg_balance_trigger(const int cpu)
4240 ++{
4241 ++ struct rq *rq = cpu_rq(cpu);
4242 ++ unsigned long flags;
4243 ++ struct task_struct *curr;
4244 ++ int res;
4245 ++
4246 ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags))
4247 ++ return 0;
4248 ++ curr = rq->curr;
4249 ++ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&
4250 ++ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&
4251 ++ !is_migration_disabled(curr) && (!rq->active_balance);
4252 ++
4253 ++ if (res)
4254 ++ rq->active_balance = 1;
4255 ++
4256 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
4257 ++
4258 ++ if (res)
4259 ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop,
4260 ++ curr, &rq->active_balance_work);
4261 ++ return res;
4262 ++}
4263 ++
4264 ++/*
4265 ++ * sg_balance_check - sibling group balance check for run queue @rq
4266 ++ */
4267 ++static inline void sg_balance_check(struct rq *rq)
4268 ++{
4269 ++ cpumask_t chk;
4270 ++ int cpu;
4271 ++
4272 ++ /* exit when no sg in idle */
4273 ++ if (cpumask_empty(&sched_sg_idle_mask))
4274 ++ return;
4275 ++
4276 ++ /* exit when cpu is offline */
4277 ++ if (unlikely(!rq->online))
4278 ++ return;
4279 ++
4280 ++ cpu = cpu_of(rq);
4281 ++ /*
4282 ++ * Only a cpu in the sibling idle group will do the checking and then
4283 ++ * find potential cpus to which the currently running task can migrate
4284 ++ */
4285 ++ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) &&
4286 ++ cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) &&
4287 ++ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) {
4288 ++ int i, tried = 0;
4289 ++
4290 ++ for_each_cpu_wrap(i, &chk, cpu) {
4291 ++ if (cpumask_subset(cpu_smt_mask(i), &chk)) {
4292 ++ if (sg_balance_trigger(i))
4293 ++ return;
4294 ++ if (tried)
4295 ++ return;
4296 ++ tried++;
4297 ++ }
4298 ++ }
4299 ++ }
4300 ++}
4301 ++#endif /* CONFIG_SCHED_SMT */
4302 ++
4303 ++#ifdef CONFIG_NO_HZ_FULL
4304 ++
4305 ++struct tick_work {
4306 ++ int cpu;
4307 ++ atomic_t state;
4308 ++ struct delayed_work work;
4309 ++};
4310 ++/* Values for ->state, see diagram below. */
4311 ++#define TICK_SCHED_REMOTE_OFFLINE 0
4312 ++#define TICK_SCHED_REMOTE_OFFLINING 1
4313 ++#define TICK_SCHED_REMOTE_RUNNING 2
4314 ++
4315 ++/*
4316 ++ * State diagram for ->state:
4317 ++ *
4318 ++ *
4319 ++ * TICK_SCHED_REMOTE_OFFLINE
4320 ++ * | ^
4321 ++ * | |
4322 ++ * | | sched_tick_remote()
4323 ++ * | |
4324 ++ * | |
4325 ++ * +--TICK_SCHED_REMOTE_OFFLINING
4326 ++ * | ^
4327 ++ * | |
4328 ++ * sched_tick_start() | | sched_tick_stop()
4329 ++ * | |
4330 ++ * V |
4331 ++ * TICK_SCHED_REMOTE_RUNNING
4332 ++ *
4333 ++ *
4334 ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
4335 ++ * and sched_tick_start() are happy to leave the state in RUNNING.
4336 ++ */
4337 ++
4338 ++static struct tick_work __percpu *tick_work_cpu;
4339 ++
4340 ++static void sched_tick_remote(struct work_struct *work)
4341 ++{
4342 ++ struct delayed_work *dwork = to_delayed_work(work);
4343 ++ struct tick_work *twork = container_of(dwork, struct tick_work, work);
4344 ++ int cpu = twork->cpu;
4345 ++ struct rq *rq = cpu_rq(cpu);
4346 ++ struct task_struct *curr;
4347 ++ unsigned long flags;
4348 ++ u64 delta;
4349 ++ int os;
4350 ++
4351 ++ /*
4352 ++ * Handle the tick only if it appears the remote CPU is running in full
4353 ++ * dynticks mode. The check is racy by nature, but missing a tick or
4354 ++ * having one too many is no big deal because the scheduler tick updates
4355 ++ * statistics and checks timeslices in a time-independent way, regardless
4356 ++ * of when exactly it is running.
4357 ++ */
4358 ++ if (!tick_nohz_tick_stopped_cpu(cpu))
4359 ++ goto out_requeue;
4360 ++
4361 ++ raw_spin_lock_irqsave(&rq->lock, flags);
4362 ++ curr = rq->curr;
4363 ++ if (cpu_is_offline(cpu))
4364 ++ goto out_unlock;
4365 ++
4366 ++ update_rq_clock(rq);
4367 ++ if (!is_idle_task(curr)) {
4368 ++ /*
4369 ++ * Make sure the next tick runs within a reasonable
4370 ++ * amount of time.
4371 ++ */
4372 ++ delta = rq_clock_task(rq) - curr->last_ran;
4373 ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4374 ++ }
4375 ++ scheduler_task_tick(rq);
4376 ++
4377 ++ calc_load_nohz_remote(rq);
4378 ++out_unlock:
4379 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
4380 ++
4381 ++out_requeue:
4382 ++ /*
4383 ++ * Run the remote tick once per second (1Hz). This arbitrary
4384 ++ * frequency is large enough to avoid overload but short enough
4385 ++ * to keep scheduler internal stats reasonably up to date. But
4386 ++ * first update state to reflect hotplug activity if required.
4387 ++ */
4388 ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
4389 ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
4390 ++ if (os == TICK_SCHED_REMOTE_RUNNING)
4391 ++ queue_delayed_work(system_unbound_wq, dwork, HZ);
4392 ++}
4393 ++
4394 ++static void sched_tick_start(int cpu)
4395 ++{
4396 ++ int os;
4397 ++ struct tick_work *twork;
4398 ++
4399 ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK))
4400 ++ return;
4401 ++
4402 ++ WARN_ON_ONCE(!tick_work_cpu);
4403 ++
4404 ++ twork = per_cpu_ptr(tick_work_cpu, cpu);
4405 ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
4406 ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
4407 ++ if (os == TICK_SCHED_REMOTE_OFFLINE) {
4408 ++ twork->cpu = cpu;
4409 ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
4410 ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
4411 ++ }
4412 ++}
4413 ++
4414 ++#ifdef CONFIG_HOTPLUG_CPU
4415 ++static void sched_tick_stop(int cpu)
4416 ++{
4417 ++ struct tick_work *twork;
4418 ++
4419 ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK))
4420 ++ return;
4421 ++
4422 ++ WARN_ON_ONCE(!tick_work_cpu);
4423 ++
4424 ++ twork = per_cpu_ptr(tick_work_cpu, cpu);
4425 ++ cancel_delayed_work_sync(&twork->work);
4426 ++}
4427 ++#endif /* CONFIG_HOTPLUG_CPU */
4428 ++
4429 ++int __init sched_tick_offload_init(void)
4430 ++{
4431 ++ tick_work_cpu = alloc_percpu(struct tick_work);
4432 ++ BUG_ON(!tick_work_cpu);
4433 ++ return 0;
4434 ++}
4435 ++
4436 ++#else /* !CONFIG_NO_HZ_FULL */
4437 ++static inline void sched_tick_start(int cpu) { }
4438 ++static inline void sched_tick_stop(int cpu) { }
4439 ++#endif
4440 ++
4441 ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
4442 ++ defined(CONFIG_PREEMPT_TRACER))
4443 ++/*
4444 ++ * If the value passed in is equal to the current preempt count
4445 ++ * then we just disabled preemption. Start timing the latency.
4446 ++ */
4447 ++static inline void preempt_latency_start(int val)
4448 ++{
4449 ++ if (preempt_count() == val) {
4450 ++ unsigned long ip = get_lock_parent_ip();
4451 ++#ifdef CONFIG_DEBUG_PREEMPT
4452 ++ current->preempt_disable_ip = ip;
4453 ++#endif
4454 ++ trace_preempt_off(CALLER_ADDR0, ip);
4455 ++ }
4456 ++}
4457 ++
4458 ++void preempt_count_add(int val)
4459 ++{
4460 ++#ifdef CONFIG_DEBUG_PREEMPT
4461 ++ /*
4462 ++ * Underflow?
4463 ++ */
4464 ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4465 ++ return;
4466 ++#endif
4467 ++ __preempt_count_add(val);
4468 ++#ifdef CONFIG_DEBUG_PREEMPT
4469 ++ /*
4470 ++ * Spinlock count overflowing soon?
4471 ++ */
4472 ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4473 ++ PREEMPT_MASK - 10);
4474 ++#endif
4475 ++ preempt_latency_start(val);
4476 ++}
4477 ++EXPORT_SYMBOL(preempt_count_add);
4478 ++NOKPROBE_SYMBOL(preempt_count_add);
4479 ++
4480 ++/*
4481 ++ * If the value passed in equals the current preempt count
4482 ++ * then we just enabled preemption. Stop timing the latency.
4483 ++ */
4484 ++static inline void preempt_latency_stop(int val)
4485 ++{
4486 ++ if (preempt_count() == val)
4487 ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
4488 ++}
4489 ++
4490 ++void preempt_count_sub(int val)
4491 ++{
4492 ++#ifdef CONFIG_DEBUG_PREEMPT
4493 ++ /*
4494 ++ * Underflow?
4495 ++ */
4496 ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4497 ++ return;
4498 ++ /*
4499 ++ * Is the spinlock portion underflowing?
4500 ++ */
4501 ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4502 ++ !(preempt_count() & PREEMPT_MASK)))
4503 ++ return;
4504 ++#endif
4505 ++
4506 ++ preempt_latency_stop(val);
4507 ++ __preempt_count_sub(val);
4508 ++}
4509 ++EXPORT_SYMBOL(preempt_count_sub);
4510 ++NOKPROBE_SYMBOL(preempt_count_sub);
4511 ++
4512 ++#else
4513 ++static inline void preempt_latency_start(int val) { }
4514 ++static inline void preempt_latency_stop(int val) { }
4515 ++#endif
4516 ++
4517 ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
4518 ++{
4519 ++#ifdef CONFIG_DEBUG_PREEMPT
4520 ++ return p->preempt_disable_ip;
4521 ++#else
4522 ++ return 0;
4523 ++#endif
4524 ++}
4525 ++
4526 ++/*
4527 ++ * Print scheduling while atomic bug:
4528 ++ */
4529 ++static noinline void __schedule_bug(struct task_struct *prev)
4530 ++{
4531 ++ /* Save this before calling printk(), since that will clobber it */
4532 ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
4533 ++
4534 ++ if (oops_in_progress)
4535 ++ return;
4536 ++
4537 ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4538 ++ prev->comm, prev->pid, preempt_count());
4539 ++
4540 ++ debug_show_held_locks(prev);
4541 ++ print_modules();
4542 ++ if (irqs_disabled())
4543 ++ print_irqtrace_events(prev);
4544 ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
4545 ++ && in_atomic_preempt_off()) {
4546 ++ pr_err("Preemption disabled at:");
4547 ++ print_ip_sym(KERN_ERR, preempt_disable_ip);
4548 ++ }
4549 ++ if (panic_on_warn)
4550 ++ panic("scheduling while atomic\n");
4551 ++
4552 ++ dump_stack();
4553 ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4554 ++}
4555 ++
4556 ++/*
4557 ++ * Various schedule()-time debugging checks and statistics:
4558 ++ */
4559 ++static inline void schedule_debug(struct task_struct *prev, bool preempt)
4560 ++{
4561 ++#ifdef CONFIG_SCHED_STACK_END_CHECK
4562 ++ if (task_stack_end_corrupted(prev))
4563 ++ panic("corrupted stack end detected inside scheduler\n");
4564 ++
4565 ++ if (task_scs_end_corrupted(prev))
4566 ++ panic("corrupted shadow stack detected inside scheduler\n");
4567 ++#endif
4568 ++
4569 ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
4570 ++ if (!preempt && prev->state && prev->non_block_count) {
4571 ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
4572 ++ prev->comm, prev->pid, prev->non_block_count);
4573 ++ dump_stack();
4574 ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4575 ++ }
4576 ++#endif
4577 ++
4578 ++ if (unlikely(in_atomic_preempt_off())) {
4579 ++ __schedule_bug(prev);
4580 ++ preempt_count_set(PREEMPT_DISABLED);
4581 ++ }
4582 ++ rcu_sleep_check();
4583 ++ SCHED_WARN_ON(ct_state() == CONTEXT_USER);
4584 ++
4585 ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4586 ++
4587 ++ schedstat_inc(this_rq()->sched_count);
4588 ++}
4589 ++
4590 ++/*
4591 ++ * Compile time debug macro
4592 ++ * #define ALT_SCHED_DEBUG
4593 ++ */
4594 ++
4595 ++#ifdef ALT_SCHED_DEBUG
4596 ++void alt_sched_debug(void)
4597 ++{
4598 ++ printk(KERN_INFO "sched: pending: 0x%04lx, idle: 0x%04lx, sg_idle: 0x%04lx\n",
4599 ++ sched_rq_pending_mask.bits[0],
4600 ++ sched_rq_watermark[IDLE_WM].bits[0],
4601 ++ sched_sg_idle_mask.bits[0]);
4602 ++}
4603 ++#else
4604 ++inline void alt_sched_debug(void) {}
4605 ++#endif
4606 ++
4607 ++#ifdef CONFIG_SMP
4608 ++
4609 ++#define SCHED_RQ_NR_MIGRATION (32U)
4610 ++/*
4611 ++ * Migrate pending tasks in @rq to @dest_cpu
4612 ++ * Will try to migrate at most the minimum of half of @rq's nr_running
4613 ++ * tasks and SCHED_RQ_NR_MIGRATION to @dest_cpu
4614 ++ */
4615 ++static inline int
4616 ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu)
4617 ++{
4618 ++ struct task_struct *p, *skip = rq->curr;
4619 ++ int nr_migrated = 0;
4620 ++ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION);
4621 ++
4622 ++ while (skip != rq->idle && nr_tries &&
4623 ++ (p = sched_rq_next_task(skip, rq)) != rq->idle) {
4624 ++ skip = sched_rq_next_task(p, rq);
4625 ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) {
4626 ++ __SCHED_DEQUEUE_TASK(p, rq, 0, );
4627 ++ set_task_cpu(p, dest_cpu);
4628 ++ __SCHED_ENQUEUE_TASK(p, dest_rq, 0);
4629 ++ nr_migrated++;
4630 ++ }
4631 ++ nr_tries--;
4632 ++ }
4633 ++
4634 ++ return nr_migrated;
4635 ++}
4636 ++
4637 ++static inline int take_other_rq_tasks(struct rq *rq, int cpu)
4638 ++{
4639 ++ struct cpumask *affinity_mask, *end_mask;
4640 ++
4641 ++ if (unlikely(!rq->online))
4642 ++ return 0;
4643 ++
4644 ++ if (cpumask_empty(&sched_rq_pending_mask))
4645 ++ return 0;
4646 ++
4647 ++ affinity_mask = per_cpu(sched_cpu_affinity_masks, cpu) + 1;
4648 ++ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu);
4649 ++ do {
4650 ++ int i;
4651 ++ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) {
4652 ++ int nr_migrated;
4653 ++ struct rq *src_rq;
4654 ++
4655 ++ src_rq = cpu_rq(i);
4656 ++ if (!do_raw_spin_trylock(&src_rq->lock))
4657 ++ continue;
4658 ++ spin_acquire(&src_rq->lock.dep_map,
4659 ++ SINGLE_DEPTH_NESTING, 1, _RET_IP_);
4660 ++
4661 ++ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) {
4662 ++ src_rq->nr_running -= nr_migrated;
4663 ++ if (src_rq->nr_running < 2)
4664 ++ cpumask_clear_cpu(i, &sched_rq_pending_mask);
4665 ++
4666 ++ rq->nr_running += nr_migrated;
4667 ++ if (rq->nr_running > 1)
4668 ++ cpumask_set_cpu(cpu, &sched_rq_pending_mask);
4669 ++
4670 ++ update_sched_rq_watermark(rq);
4671 ++ cpufreq_update_util(rq, 0);
4672 ++
4673 ++ spin_release(&src_rq->lock.dep_map, _RET_IP_);
4674 ++ do_raw_spin_unlock(&src_rq->lock);
4675 ++
4676 ++ return 1;
4677 ++ }
4678 ++
4679 ++ spin_release(&src_rq->lock.dep_map, _RET_IP_);
4680 ++ do_raw_spin_unlock(&src_rq->lock);
4681 ++ }
4682 ++ } while (++affinity_mask < end_mask);
4683 ++
4684 ++ return 0;
4685 ++}
4686 ++#endif
4687 ++
4688 ++/*
4689 ++ * Timeslices below RESCHED_NS are considered as good as expired as there's no
4690 ++ * point rescheduling when there's so little time left.
4691 ++ */
4692 ++static inline void check_curr(struct task_struct *p, struct rq *rq)
4693 ++{
4694 ++ if (unlikely(rq->idle == p))
4695 ++ return;
4696 ++
4697 ++ update_curr(rq, p);
4698 ++
4699 ++ if (p->time_slice < RESCHED_NS)
4700 ++ time_slice_expired(p, rq);
4701 ++}
4702 ++
4703 ++static inline struct task_struct *
4704 ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev)
4705 ++{
4706 ++ struct task_struct *next;
4707 ++
4708 ++ if (unlikely(rq->skip)) {
4709 ++ next = rq_runnable_task(rq);
4710 ++ if (next == rq->idle) {
4711 ++#ifdef CONFIG_SMP
4712 ++ if (!take_other_rq_tasks(rq, cpu)) {
4713 ++#endif
4714 ++ rq->skip = NULL;
4715 ++ schedstat_inc(rq->sched_goidle);
4716 ++ return next;
4717 ++#ifdef CONFIG_SMP
4718 ++ }
4719 ++ next = rq_runnable_task(rq);
4720 ++#endif
4721 ++ }
4722 ++ rq->skip = NULL;
4723 ++#ifdef CONFIG_HIGH_RES_TIMERS
4724 ++ hrtick_start(rq, next->time_slice);
4725 ++#endif
4726 ++ return next;
4727 ++ }
4728 ++
4729 ++ next = sched_rq_first_task(rq);
4730 ++ if (next == rq->idle) {
4731 ++#ifdef CONFIG_SMP
4732 ++ if (!take_other_rq_tasks(rq, cpu)) {
4733 ++#endif
4734 ++ schedstat_inc(rq->sched_goidle);
4735 ++ /*printk(KERN_INFO "sched: choose_next_task(%d) idle %px\n", cpu, next);*/
4736 ++ return next;
4737 ++#ifdef CONFIG_SMP
4738 ++ }
4739 ++ next = sched_rq_first_task(rq);
4740 ++#endif
4741 ++ }
4742 ++#ifdef CONFIG_HIGH_RES_TIMERS
4743 ++ hrtick_start(rq, next->time_slice);
4744 ++#endif
4745 ++ /*printk(KERN_INFO "sched: choose_next_task(%d) next %px\n", cpu,
4746 ++ * next);*/
4747 ++ return next;
4748 ++}
4749 ++
4750 ++/*
4751 ++ * schedule() is the main scheduler function.
4752 ++ *
4753 ++ * The main means of driving the scheduler and thus entering this function are:
4754 ++ *
4755 ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
4756 ++ *
4757 ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
4758 ++ * paths. For example, see arch/x86/entry_64.S.
4759 ++ *
4760 ++ * To drive preemption between tasks, the scheduler sets the flag in timer
4761 ++ * interrupt handler scheduler_tick().
4762 ++ *
4763 ++ * 3. Wakeups don't really cause entry into schedule(). They add a
4764 ++ * task to the run-queue and that's it.
4765 ++ *
4766 ++ * Now, if the new task added to the run-queue preempts the current
4767 ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
4768 ++ * called on the nearest possible occasion:
4769 ++ *
4770 ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
4771 ++ *
4772 ++ * - in syscall or exception context, at the next outermost
4773 ++ * preempt_enable(). (this might be as soon as the wake_up()'s
4774 ++ * spin_unlock()!)
4775 ++ *
4776 ++ * - in IRQ context, return from interrupt-handler to
4777 ++ * preemptible context
4778 ++ *
4779 ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
4780 ++ * then at the next:
4781 ++ *
4782 ++ * - cond_resched() call
4783 ++ * - explicit schedule() call
4784 ++ * - return from syscall or exception to user-space
4785 ++ * - return from interrupt-handler to user-space
4786 ++ *
4787 ++ * WARNING: must be called with preemption disabled!
4788 ++ */
4789 ++static void __sched notrace __schedule(bool preempt)
4790 ++{
4791 ++ struct task_struct *prev, *next;
4792 ++ unsigned long *switch_count;
4793 ++ unsigned long prev_state;
4794 ++ struct rq *rq;
4795 ++ int cpu;
4796 ++
4797 ++ cpu = smp_processor_id();
4798 ++ rq = cpu_rq(cpu);
4799 ++ prev = rq->curr;
4800 ++
4801 ++ schedule_debug(prev, preempt);
4802 ++
4803 ++ /* bypass sched_feat(HRTICK) checking, which Alt schedule FW doesn't support */
4804 ++ hrtick_clear(rq);
4805 ++
4806 ++ local_irq_disable();
4807 ++ rcu_note_context_switch(preempt);
4808 ++
4809 ++ /*
4810 ++ * Make sure that signal_pending_state()->signal_pending() below
4811 ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4812 ++ * done by the caller to avoid the race with signal_wake_up():
4813 ++ *
4814 ++ * __set_current_state(@state) signal_wake_up()
4815 ++ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
4816 ++ * wake_up_state(p, state)
4817 ++ * LOCK rq->lock LOCK p->pi_state
4818 ++ * smp_mb__after_spinlock() smp_mb__after_spinlock()
4819 ++ * if (signal_pending_state()) if (p->state & @state)
4820 ++ *
4821 ++ * Also, the membarrier system call requires a full memory barrier
4822 ++ * after coming from user-space, before storing to rq->curr.
4823 ++ */
4824 ++ raw_spin_lock(&rq->lock);
4825 ++ smp_mb__after_spinlock();
4826 ++
4827 ++ update_rq_clock(rq);
4828 ++
4829 ++ switch_count = &prev->nivcsw;
4830 ++ /*
4831 ++ * We must load prev->state once (task_struct::state is volatile), such
4832 ++ * that:
4833 ++ *
4834 ++ * - we form a control dependency vs deactivate_task() below.
4835 ++ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
4836 ++ */
4837 ++ prev_state = prev->state;
4838 ++ if (!preempt && prev_state && prev_state == prev->state) {
4839 ++ if (signal_pending_state(prev_state, prev)) {
4840 ++ prev->state = TASK_RUNNING;
4841 ++ } else {
4842 ++ prev->sched_contributes_to_load =
4843 ++ (prev_state & TASK_UNINTERRUPTIBLE) &&
4844 ++ !(prev_state & TASK_NOLOAD) &&
4845 ++ !(prev->flags & PF_FROZEN);
4846 ++
4847 ++ if (prev->sched_contributes_to_load)
4848 ++ rq->nr_uninterruptible++;
4849 ++
4850 ++ /*
4851 ++ * __schedule() ttwu()
4852 ++ * prev_state = prev->state; if (p->on_rq && ...)
4853 ++ * if (prev_state) goto out;
4854 ++ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
4855 ++ * p->state = TASK_WAKING
4856 ++ *
4857 ++ * Where __schedule() and ttwu() have matching control dependencies.
4858 ++ *
4859 ++ * After this, schedule() must not care about p->state any more.
4860 ++ */
4861 ++ sched_task_deactivate(prev, rq);
4862 ++ deactivate_task(prev, rq);
4863 ++
4864 ++ if (prev->in_iowait) {
4865 ++ atomic_inc(&rq->nr_iowait);
4866 ++ delayacct_blkio_start();
4867 ++ }
4868 ++ }
4869 ++ switch_count = &prev->nvcsw;
4870 ++ }
4871 ++
4872 ++ check_curr(prev, rq);
4873 ++
4874 ++ next = choose_next_task(rq, cpu, prev);
4875 ++ clear_tsk_need_resched(prev);
4876 ++ clear_preempt_need_resched();
4877 ++#ifdef CONFIG_SCHED_DEBUG
4878 ++ rq->last_seen_need_resched_ns = 0;
4879 ++#endif
4880 ++
4881 ++ if (likely(prev != next)) {
4882 ++ next->last_ran = rq->clock_task;
4883 ++ rq->last_ts_switch = rq->clock;
4884 ++
4885 ++ rq->nr_switches++;
4886 ++ /*
4887 ++ * RCU users of rcu_dereference(rq->curr) may not see
4888 ++ * changes to task_struct made by pick_next_task().
4889 ++ */
4890 ++ RCU_INIT_POINTER(rq->curr, next);
4891 ++ /*
4892 ++ * The membarrier system call requires each architecture
4893 ++ * to have a full memory barrier after updating
4894 ++ * rq->curr, before returning to user-space.
4895 ++ *
4896 ++ * Here are the schemes providing that barrier on the
4897 ++ * various architectures:
4898 ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
4899 ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
4900 ++ * - finish_lock_switch() for weakly-ordered
4901 ++ * architectures where spin_unlock is a full barrier,
4902 ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock
4903 ++ * is a RELEASE barrier),
4904 ++ */
4905 ++ ++*switch_count;
4906 ++
4907 ++ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
4908 ++
4909 ++ trace_sched_switch(preempt, prev, next);
4910 ++
4911 ++ /* Also unlocks the rq: */
4912 ++ rq = context_switch(rq, prev, next);
4913 ++ } else {
4914 ++ __balance_callbacks(rq);
4915 ++ raw_spin_unlock_irq(&rq->lock);
4916 ++ }
4917 ++
4918 ++#ifdef CONFIG_SCHED_SMT
4919 ++ sg_balance_check(rq);
4920 ++#endif
4921 ++}
4922 ++
4923 ++void __noreturn do_task_dead(void)
4924 ++{
4925 ++ /* Causes final put_task_struct in finish_task_switch(): */
4926 ++ set_special_state(TASK_DEAD);
4927 ++
4928 ++ /* Tell freezer to ignore us: */
4929 ++ current->flags |= PF_NOFREEZE;
4930 ++
4931 ++ __schedule(false);
4932 ++ BUG();
4933 ++
4934 ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
4935 ++ for (;;)
4936 ++ cpu_relax();
4937 ++}
4938 ++
4939 ++static inline void sched_submit_work(struct task_struct *tsk)
4940 ++{
4941 ++ unsigned int task_flags;
4942 ++
4943 ++ if (!tsk->state)
4944 ++ return;
4945 ++
4946 ++ task_flags = tsk->flags;
4947 ++ /*
4948 ++ * If a worker went to sleep, notify and ask workqueue whether
4949 ++ * it wants to wake up a task to maintain concurrency.
4950 ++ * As this function is called inside the schedule() context,
4951 ++ * we disable preemption to avoid it calling schedule() again
4952 ++ * in the possible wakeup of a kworker and because wq_worker_sleeping()
4953 ++ * requires it.
4954 ++ */
4955 ++ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
4956 ++ preempt_disable();
4957 ++ if (task_flags & PF_WQ_WORKER)
4958 ++ wq_worker_sleeping(tsk);
4959 ++ else
4960 ++ io_wq_worker_sleeping(tsk);
4961 ++ preempt_enable_no_resched();
4962 ++ }
4963 ++
4964 ++ if (tsk_is_pi_blocked(tsk))
4965 ++ return;
4966 ++
4967 ++ /*
4968 ++ * If we are going to sleep and we have plugged IO queued,
4969 ++ * make sure to submit it to avoid deadlocks.
4970 ++ */
4971 ++ if (blk_needs_flush_plug(tsk))
4972 ++ blk_schedule_flush_plug(tsk);
4973 ++}
4974 ++
4975 ++static void sched_update_worker(struct task_struct *tsk)
4976 ++{
4977 ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
4978 ++ if (tsk->flags & PF_WQ_WORKER)
4979 ++ wq_worker_running(tsk);
4980 ++ else
4981 ++ io_wq_worker_running(tsk);
4982 ++ }
4983 ++}
4984 ++
4985 ++asmlinkage __visible void __sched schedule(void)
4986 ++{
4987 ++ struct task_struct *tsk = current;
4988 ++
4989 ++ sched_submit_work(tsk);
4990 ++ do {
4991 ++ preempt_disable();
4992 ++ __schedule(false);
4993 ++ sched_preempt_enable_no_resched();
4994 ++ } while (need_resched());
4995 ++ sched_update_worker(tsk);
4996 ++}
4997 ++EXPORT_SYMBOL(schedule);
4998 ++
4999 ++/*
5000 ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
5001 ++ * state (have scheduled out non-voluntarily) by making sure that all
5002 ++ * tasks have either left the run queue or have gone into user space.
5003 ++ * As idle tasks do not do either, they must not ever be preempted
5004 ++ * (schedule out non-voluntarily).
5005 ++ *
5006 ++ * schedule_idle() is similar to schedule_preempt_disabled() except that it
5007 ++ * never enables preemption because it does not call sched_submit_work().
5008 ++ */
5009 ++void __sched schedule_idle(void)
5010 ++{
5011 ++ /*
5012 ++ * As this skips calling sched_submit_work(), which the idle task does
5013 ++ * regardless because that function is a nop when the task is in a
5014 ++ * TASK_RUNNING state, make sure this isn't used someplace that the
5015 ++ * current task can be in any other state. Note, idle is always in the
5016 ++ * TASK_RUNNING state.
5017 ++ */
5018 ++ WARN_ON_ONCE(current->state);
5019 ++ do {
5020 ++ __schedule(false);
5021 ++ } while (need_resched());
5022 ++}
5023 ++
5024 ++#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
5025 ++asmlinkage __visible void __sched schedule_user(void)
5026 ++{
5027 ++ /*
5028 ++ * If we come here after a random call to set_need_resched(),
5029 ++ * or we have been woken up remotely but the IPI has not yet arrived,
5030 ++ * we haven't yet exited the RCU idle mode. Do it here manually until
5031 ++ * we find a better solution.
5032 ++ *
5033 ++ * NB: There are buggy callers of this function. Ideally we
5034 ++ * should warn if prev_state != CONTEXT_USER, but that will trigger
5035 ++ * too frequently to make sense yet.
5036 ++ */
5037 ++ enum ctx_state prev_state = exception_enter();
5038 ++ schedule();
5039 ++ exception_exit(prev_state);
5040 ++}
5041 ++#endif
5042 ++
5043 ++/**
5044 ++ * schedule_preempt_disabled - called with preemption disabled
5045 ++ *
5046 ++ * Returns with preemption disabled. Note: preempt_count must be 1
5047 ++ */
5048 ++void __sched schedule_preempt_disabled(void)
5049 ++{
5050 ++ sched_preempt_enable_no_resched();
5051 ++ schedule();
5052 ++ preempt_disable();
5053 ++}
5054 ++
5055 ++static void __sched notrace preempt_schedule_common(void)
5056 ++{
5057 ++ do {
5058 ++ /*
5059 ++ * Because the function tracer can trace preempt_count_sub()
5060 ++ * and it also uses preempt_enable/disable_notrace(), if
5061 ++ * NEED_RESCHED is set, the preempt_enable_notrace() called
5062 ++ * by the function tracer will call this function again and
5063 ++ * cause infinite recursion.
5064 ++ *
5065 ++ * Preemption must be disabled here before the function
5066 ++ * tracer can trace. Break up preempt_disable() into two
5067 ++ * calls. One to disable preemption without fear of being
5068 ++ * traced. The other to still record the preemption latency,
5069 ++ * which can also be traced by the function tracer.
5070 ++ */
5071 ++ preempt_disable_notrace();
5072 ++ preempt_latency_start(1);
5073 ++ __schedule(true);
5074 ++ preempt_latency_stop(1);
5075 ++ preempt_enable_no_resched_notrace();
5076 ++
5077 ++ /*
5078 ++ * Check again in case we missed a preemption opportunity
5079 ++ * between schedule and now.
5080 ++ */
5081 ++ } while (need_resched());
5082 ++}
5083 ++
5084 ++#ifdef CONFIG_PREEMPTION
5085 ++/*
5086 ++ * This is the entry point to schedule() from in-kernel preemption
5087 ++ * off of preempt_enable.
5088 ++ */
5089 ++asmlinkage __visible void __sched notrace preempt_schedule(void)
5090 ++{
5091 ++ /*
5092 ++ * If there is a non-zero preempt_count or interrupts are disabled,
5093 ++ * we do not want to preempt the current task. Just return..
5094 ++ */
5095 ++ if (likely(!preemptible()))
5096 ++ return;
5097 ++
5098 ++ preempt_schedule_common();
5099 ++}
5100 ++NOKPROBE_SYMBOL(preempt_schedule);
5101 ++EXPORT_SYMBOL(preempt_schedule);
5102 ++
5103 ++#ifdef CONFIG_PREEMPT_DYNAMIC
5104 ++DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
5105 ++EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
5106 ++#endif
5107 ++
5108 ++
5109 ++/**
5110 ++ * preempt_schedule_notrace - preempt_schedule called by tracing
5111 ++ *
5112 ++ * The tracing infrastructure uses preempt_enable_notrace to prevent
5113 ++ * recursion and tracing preempt enabling caused by the tracing
5114 ++ * infrastructure itself. But as tracing can happen in areas coming
5115 ++ * from userspace or just about to enter userspace, a preempt enable
5116 ++ * can occur before user_exit() is called. This will cause the scheduler
5117 ++ * to be called when the system is still in usermode.
5118 ++ *
5119 ++ * To prevent this, the preempt_enable_notrace will use this function
5120 ++ * instead of preempt_schedule() to exit user context if needed before
5121 ++ * calling the scheduler.
5122 ++ */
5123 ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
5124 ++{
5125 ++ enum ctx_state prev_ctx;
5126 ++
5127 ++ if (likely(!preemptible()))
5128 ++ return;
5129 ++
5130 ++ do {
5131 ++ /*
5132 ++ * Because the function tracer can trace preempt_count_sub()
5133 ++ * and it also uses preempt_enable/disable_notrace(), if
5134 ++ * NEED_RESCHED is set, the preempt_enable_notrace() called
5135 ++ * by the function tracer will call this function again and
5136 ++ * cause infinite recursion.
5137 ++ *
5138 ++ * Preemption must be disabled here before the function
5139 ++ * tracer can trace. Break up preempt_disable() into two
5140 ++ * calls. One to disable preemption without fear of being
5141 ++ * traced. The other to still record the preemption latency,
5142 ++ * which can also be traced by the function tracer.
5143 ++ */
5144 ++ preempt_disable_notrace();
5145 ++ preempt_latency_start(1);
5146 ++ /*
5147 ++ * Needs preempt disabled in case user_exit() is traced
5148 ++ * and the tracer calls preempt_enable_notrace() causing
5149 ++ * an infinite recursion.
5150 ++ */
5151 ++ prev_ctx = exception_enter();
5152 ++ __schedule(true);
5153 ++ exception_exit(prev_ctx);
5154 ++
5155 ++ preempt_latency_stop(1);
5156 ++ preempt_enable_no_resched_notrace();
5157 ++ } while (need_resched());
5158 ++}
5159 ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
5160 ++
5161 ++#ifdef CONFIG_PREEMPT_DYNAMIC
5162 ++DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
5163 ++EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
5164 ++#endif
5165 ++
5166 ++#endif /* CONFIG_PREEMPTION */
5167 ++
5168 ++#ifdef CONFIG_PREEMPT_DYNAMIC
5169 ++
5170 ++#include <linux/entry-common.h>
5171 ++
5172 ++/*
5173 ++ * SC:cond_resched
5174 ++ * SC:might_resched
5175 ++ * SC:preempt_schedule
5176 ++ * SC:preempt_schedule_notrace
5177 ++ * SC:irqentry_exit_cond_resched
5178 ++ *
5179 ++ *
5180 ++ * NONE:
5181 ++ * cond_resched <- __cond_resched
5182 ++ * might_resched <- RET0
5183 ++ * preempt_schedule <- NOP
5184 ++ * preempt_schedule_notrace <- NOP
5185 ++ * irqentry_exit_cond_resched <- NOP
5186 ++ *
5187 ++ * VOLUNTARY:
5188 ++ * cond_resched <- __cond_resched
5189 ++ * might_resched <- __cond_resched
5190 ++ * preempt_schedule <- NOP
5191 ++ * preempt_schedule_notrace <- NOP
5192 ++ * irqentry_exit_cond_resched <- NOP
5193 ++ *
5194 ++ * FULL:
5195 ++ * cond_resched <- RET0
5196 ++ * might_resched <- RET0
5197 ++ * preempt_schedule <- preempt_schedule
5198 ++ * preempt_schedule_notrace <- preempt_schedule_notrace
5199 ++ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
5200 ++ */
5201 ++
5202 ++enum {
5203 ++ preempt_dynamic_none = 0,
5204 ++ preempt_dynamic_voluntary,
5205 ++ preempt_dynamic_full,
5206 ++};
5207 ++
5208 ++int preempt_dynamic_mode = preempt_dynamic_full;
5209 ++
5210 ++int sched_dynamic_mode(const char *str)
5211 ++{
5212 ++ if (!strcmp(str, "none"))
5213 ++ return preempt_dynamic_none;
5214 ++
5215 ++ if (!strcmp(str, "voluntary"))
5216 ++ return preempt_dynamic_voluntary;
5217 ++
5218 ++ if (!strcmp(str, "full"))
5219 ++ return preempt_dynamic_full;
5220 ++
5221 ++ return -EINVAL;
5222 ++}
5223 ++
5224 ++void sched_dynamic_update(int mode)
5225 ++{
5226 ++ /*
5227 ++ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
5228 ++ * the ZERO state, which is invalid.
5229 ++ */
5230 ++ static_call_update(cond_resched, __cond_resched);
5231 ++ static_call_update(might_resched, __cond_resched);
5232 ++ static_call_update(preempt_schedule, __preempt_schedule_func);
5233 ++ static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
5234 ++ static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
5235 ++
5236 ++ switch (mode) {
5237 ++ case preempt_dynamic_none:
5238 ++ static_call_update(cond_resched, __cond_resched);
5239 ++ static_call_update(might_resched, (void *)&__static_call_return0);
5240 ++ static_call_update(preempt_schedule, NULL);
5241 ++ static_call_update(preempt_schedule_notrace, NULL);
5242 ++ static_call_update(irqentry_exit_cond_resched, NULL);
5243 ++ pr_info("Dynamic Preempt: none\n");
5244 ++ break;
5245 ++
5246 ++ case preempt_dynamic_voluntary:
5247 ++ static_call_update(cond_resched, __cond_resched);
5248 ++ static_call_update(might_resched, __cond_resched);
5249 ++ static_call_update(preempt_schedule, NULL);
5250 ++ static_call_update(preempt_schedule_notrace, NULL);
5251 ++ static_call_update(irqentry_exit_cond_resched, NULL);
5252 ++ pr_info("Dynamic Preempt: voluntary\n");
5253 ++ break;
5254 ++
5255 ++ case preempt_dynamic_full:
5256 ++ static_call_update(cond_resched, (void *)&__static_call_return0);
5257 ++ static_call_update(might_resched, (void *)&__static_call_return0);
5258 ++ static_call_update(preempt_schedule, __preempt_schedule_func);
5259 ++ static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
5260 ++ static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
5261 ++ pr_info("Dynamic Preempt: full\n");
5262 ++ break;
5263 ++ }
5264 ++
5265 ++ preempt_dynamic_mode = mode;
5266 ++}
5267 ++
5268 ++static int __init setup_preempt_mode(char *str)
5269 ++{
5270 ++ int mode = sched_dynamic_mode(str);
5271 ++ if (mode < 0) {
5272 ++ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
5273 ++ return 1;
5274 ++ }
5275 ++
5276 ++ sched_dynamic_update(mode);
5277 ++ return 0;
5278 ++}
5279 ++__setup("preempt=", setup_preempt_mode);
5280 ++
5281 ++#endif /* CONFIG_PREEMPT_DYNAMIC */
5282 ++
5283 ++/*
5284 ++ * This is the entry point to schedule() from kernel preemption
5285 ++ * off of irq context.
5286 ++ * Note that this is called and returns with irqs disabled. This will
5287 ++ * protect us against recursive calling from irq.
5288 ++ */
5289 ++asmlinkage __visible void __sched preempt_schedule_irq(void)
5290 ++{
5291 ++ enum ctx_state prev_state;
5292 ++
5293 ++ /* Catch callers which need to be fixed */
5294 ++ BUG_ON(preempt_count() || !irqs_disabled());
5295 ++
5296 ++ prev_state = exception_enter();
5297 ++
5298 ++ do {
5299 ++ preempt_disable();
5300 ++ local_irq_enable();
5301 ++ __schedule(true);
5302 ++ local_irq_disable();
5303 ++ sched_preempt_enable_no_resched();
5304 ++ } while (need_resched());
5305 ++
5306 ++ exception_exit(prev_state);
5307 ++}
5308 ++
5309 ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
5310 ++ void *key)
5311 ++{
5312 ++ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
5313 ++ return try_to_wake_up(curr->private, mode, wake_flags);
5314 ++}
5315 ++EXPORT_SYMBOL(default_wake_function);
5316 ++
5317 ++static inline void check_task_changed(struct task_struct *p, struct rq *rq)
5318 ++{
5319 ++ /* Trigger resched if task sched_prio has been modified. */
5320 ++ if (task_on_rq_queued(p) && task_sched_prio_idx(p, rq) != p->sq_idx) {
5321 ++ requeue_task(p, rq);
5322 ++ check_preempt_curr(rq);
5323 ++ }
5324 ++}
5325 ++
5326 ++#ifdef CONFIG_RT_MUTEXES
5327 ++
5328 ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
5329 ++{
5330 ++ if (pi_task)
5331 ++ prio = min(prio, pi_task->prio);
5332 ++
5333 ++ return prio;
5334 ++}
5335 ++
5336 ++static inline int rt_effective_prio(struct task_struct *p, int prio)
5337 ++{
5338 ++ struct task_struct *pi_task = rt_mutex_get_top_task(p);
5339 ++
5340 ++ return __rt_effective_prio(pi_task, prio);
5341 ++}
5342 ++
5343 ++/*
5344 ++ * rt_mutex_setprio - set the current priority of a task
5345 ++ * @p: task to boost
5346 ++ * @pi_task: donor task
5347 ++ *
5348 ++ * This function changes the 'effective' priority of a task. It does
5349 ++ * not touch ->normal_prio like __setscheduler().
5350 ++ *
5351 ++ * Used by the rt_mutex code to implement priority inheritance
5352 ++ * logic. Call site only calls if the priority of the task changed.
5353 ++ */
5354 ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
5355 ++{
5356 ++ int prio;
5357 ++ struct rq *rq;
5358 ++ raw_spinlock_t *lock;
5359 ++
5360 ++ /* XXX used to be waiter->prio, not waiter->task->prio */
5361 ++ prio = __rt_effective_prio(pi_task, p->normal_prio);
5362 ++
5363 ++ /*
5364 ++ * If nothing changed; bail early.
5365 ++ */
5366 ++ if (p->pi_top_task == pi_task && prio == p->prio)
5367 ++ return;
5368 ++
5369 ++ rq = __task_access_lock(p, &lock);
5370 ++ /*
5371 ++ * Set under pi_lock && rq->lock, such that the value can be used under
5372 ++ * either lock.
5373 ++ *
5374 ++ * Note that there is loads of trickery involved in making this pointer cache work
5375 ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
5376 ++ * ensure a task is de-boosted (pi_task is set to NULL) before the
5377 ++ * task is allowed to run again (and can exit). This ensures the pointer
5378 ++ * points to a blocked task -- which guarantees the task is present.
5379 ++ */
5380 ++ p->pi_top_task = pi_task;
5381 ++
5382 ++ /*
5383 ++ * For FIFO/RR we only need to set prio, if that matches we're done.
5384 ++ */
5385 ++ if (prio == p->prio)
5386 ++ goto out_unlock;
5387 ++
5388 ++ /*
5389 ++ * Idle task boosting is a nono in general. There is one
5390 ++ * exception, when PREEMPT_RT and NOHZ is active:
5391 ++ *
5392 ++ * The idle task calls get_next_timer_interrupt() and holds
5393 ++ * the timer wheel base->lock on the CPU and another CPU wants
5394 ++ * to access the timer (probably to cancel it). We can safely
5395 ++ * ignore the boosting request, as the idle CPU runs this code
5396 ++ * with interrupts disabled and will complete the lock
5397 ++ * protected section without being interrupted. So there is no
5398 ++ * real need to boost.
5399 ++ */
5400 ++ if (unlikely(p == rq->idle)) {
5401 ++ WARN_ON(p != rq->curr);
5402 ++ WARN_ON(p->pi_blocked_on);
5403 ++ goto out_unlock;
5404 ++ }
5405 ++
5406 ++ trace_sched_pi_setprio(p, pi_task);
5407 ++ p->prio = prio;
5408 ++
5409 ++ check_task_changed(p, rq);
5410 ++out_unlock:
5411 ++ /* Avoid rq from going away on us: */
5412 ++ preempt_disable();
5413 ++
5414 ++ __balance_callbacks(rq);
5415 ++ __task_access_unlock(p, lock);
5416 ++
5417 ++ preempt_enable();
5418 ++}
5419 ++#else
5420 ++static inline int rt_effective_prio(struct task_struct *p, int prio)
5421 ++{
5422 ++ return prio;
5423 ++}
5424 ++#endif
5425 ++
5426 ++void set_user_nice(struct task_struct *p, long nice)
5427 ++{
5428 ++ unsigned long flags;
5429 ++ struct rq *rq;
5430 ++ raw_spinlock_t *lock;
5431 ++
5432 ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
5433 ++ return;
5434 ++ /*
5435 ++ * We have to be careful, if called from sys_setpriority(),
5436 ++ * the task might be in the middle of scheduling on another CPU.
5437 ++ */
5438 ++ raw_spin_lock_irqsave(&p->pi_lock, flags);
5439 ++ rq = __task_access_lock(p, &lock);
5440 ++
5441 ++ p->static_prio = NICE_TO_PRIO(nice);
5442 ++ /*
5443 ++ * The RT priorities are set via sched_setscheduler(), but we still
5444 ++ * allow the 'normal' nice value to be set - but as expected
5445 ++ * it won't have any effect on scheduling until the task is
5446 ++ * no longer SCHED_NORMAL/SCHED_BATCH:
5447 ++ */
5448 ++ if (task_has_rt_policy(p))
5449 ++ goto out_unlock;
5450 ++
5451 ++ p->prio = effective_prio(p);
5452 ++
5453 ++ check_task_changed(p, rq);
5454 ++out_unlock:
5455 ++ __task_access_unlock(p, lock);
5456 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5457 ++}
5458 ++EXPORT_SYMBOL(set_user_nice);
5459 ++
5460 ++/*
5461 ++ * can_nice - check if a task can reduce its nice value
5462 ++ * @p: task
5463 ++ * @nice: nice value
5464 ++ */
5465 ++int can_nice(const struct task_struct *p, const int nice)
5466 ++{
5467 ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */
5468 ++ int nice_rlim = nice_to_rlimit(nice);
5469 ++
5470 ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
5471 ++ capable(CAP_SYS_NICE));
5472 ++}
5473 ++
5474 ++#ifdef __ARCH_WANT_SYS_NICE
5475 ++
5476 ++/*
5477 ++ * sys_nice - change the priority of the current process.
5478 ++ * @increment: priority increment
5479 ++ *
5480 ++ * sys_setpriority is a more generic, but much slower function that
5481 ++ * does similar things.
5482 ++ */
5483 ++SYSCALL_DEFINE1(nice, int, increment)
5484 ++{
5485 ++ long nice, retval;
5486 ++
5487 ++ /*
5488 ++ * Setpriority might change our priority at the same moment.
5489 ++ * We don't have to worry. Conceptually one call occurs first
5490 ++ * and we have a single winner.
5491 ++ */
5492 ++
5493 ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
5494 ++ nice = task_nice(current) + increment;
5495 ++
5496 ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
5497 ++ if (increment < 0 && !can_nice(current, nice))
5498 ++ return -EPERM;
5499 ++
5500 ++ retval = security_task_setnice(current, nice);
5501 ++ if (retval)
5502 ++ return retval;
5503 ++
5504 ++ set_user_nice(current, nice);
5505 ++ return 0;
5506 ++}
5507 ++
5508 ++#endif
5509 ++
5510 ++/**
5511 ++ * task_prio - return the priority value of a given task.
5512 ++ * @p: the task in question.
5513 ++ *
5514 ++ * Return: The priority value as seen by users in /proc.
5515 ++ *
5516 ++ * sched policy              return value   kernel prio    user prio/nice
5517 ++ *
5518 ++ * (BMQ)normal, batch, idle  [0 ... 53]     [100 ... 139]  0/[-20 ... 19]/[-7 ... 7]
5519 ++ * (PDS)normal, batch, idle  [0 ... 39]     100            0/[-20 ... 19]
5520 ++ * fifo, rr                  [-1 ... -100]  [99 ... 0]     [0 ... 99]
5521 ++ */
5522 ++int task_prio(const struct task_struct *p)
5523 ++{
5524 ++ return (p->prio < MAX_RT_PRIO) ? p->prio - MAX_RT_PRIO :
5525 ++ task_sched_prio_normal(p, task_rq(p));
5526 ++}
5527 ++
5528 ++/**
5529 ++ * idle_cpu - is a given CPU idle currently?
5530 ++ * @cpu: the processor in question.
5531 ++ *
5532 ++ * Return: 1 if the CPU is currently idle. 0 otherwise.
5533 ++ */
5534 ++int idle_cpu(int cpu)
5535 ++{
5536 ++ struct rq *rq = cpu_rq(cpu);
5537 ++
5538 ++ if (rq->curr != rq->idle)
5539 ++ return 0;
5540 ++
5541 ++ if (rq->nr_running)
5542 ++ return 0;
5543 ++
5544 ++#ifdef CONFIG_SMP
5545 ++ if (rq->ttwu_pending)
5546 ++ return 0;
5547 ++#endif
5548 ++
5549 ++ return 1;
5550 ++}
5551 ++
5552 ++/**
5553 ++ * idle_task - return the idle task for a given CPU.
5554 ++ * @cpu: the processor in question.
5555 ++ *
5556 ++ * Return: The idle task for the cpu @cpu.
5557 ++ */
5558 ++struct task_struct *idle_task(int cpu)
5559 ++{
5560 ++ return cpu_rq(cpu)->idle;
5561 ++}
5562 ++
5563 ++/**
5564 ++ * find_process_by_pid - find a process with a matching PID value.
5565 ++ * @pid: the pid in question.
5566 ++ *
5567 ++ * The task of @pid, if found. %NULL otherwise.
5568 ++ */
5569 ++static inline struct task_struct *find_process_by_pid(pid_t pid)
5570 ++{
5571 ++ return pid ? find_task_by_vpid(pid) : current;
5572 ++}
5573 ++
5574 ++/*
5575 ++ * sched_setparam() passes in -1 for its policy, to let the functions
5576 ++ * it calls know not to change it.
5577 ++ */
5578 ++#define SETPARAM_POLICY -1
5579 ++
5580 ++static void __setscheduler_params(struct task_struct *p,
5581 ++ const struct sched_attr *attr)
5582 ++{
5583 ++ int policy = attr->sched_policy;
5584 ++
5585 ++ if (policy == SETPARAM_POLICY)
5586 ++ policy = p->policy;
5587 ++
5588 ++ p->policy = policy;
5589 ++
5590 ++ /*
5591 ++ * Allow the normal nice value to be set, but it will not have any
5592 ++ * effect on scheduling until the task is no longer SCHED_NORMAL/
5593 ++ * SCHED_BATCH.
5594 ++ */
5595 ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
5596 ++
5597 ++ /*
5598 ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when
5599 ++ * !rt_policy. Always setting this ensures that things like
5600 ++ * getparam()/getattr() don't report silly values for !rt tasks.
5601 ++ */
5602 ++ p->rt_priority = attr->sched_priority;
5603 ++ p->normal_prio = normal_prio(p);
5604 ++}
5605 ++
5606 ++/* Actually do priority change: must hold rq lock. */
5607 ++static void __setscheduler(struct rq *rq, struct task_struct *p,
5608 ++ const struct sched_attr *attr, bool keep_boost)
5609 ++{
5610 ++ __setscheduler_params(p, attr);
5611 ++
5612 ++ /*
5613 ++ * Keep a potential priority boosting if called from
5614 ++ * sched_setscheduler().
5615 ++ */
5616 ++ p->prio = normal_prio(p);
5617 ++ if (keep_boost)
5618 ++ p->prio = rt_effective_prio(p, p->prio);
5619 ++}
5620 ++
5621 ++/*
5622 ++ * check the target process has a UID that matches the current process's
5623 ++ */
5624 ++static bool check_same_owner(struct task_struct *p)
5625 ++{
5626 ++ const struct cred *cred = current_cred(), *pcred;
5627 ++ bool match;
5628 ++
5629 ++ rcu_read_lock();
5630 ++ pcred = __task_cred(p);
5631 ++ match = (uid_eq(cred->euid, pcred->euid) ||
5632 ++ uid_eq(cred->euid, pcred->uid));
5633 ++ rcu_read_unlock();
5634 ++ return match;
5635 ++}
5636 ++
5637 ++static int __sched_setscheduler(struct task_struct *p,
5638 ++ const struct sched_attr *attr,
5639 ++ bool user, bool pi)
5640 ++{
5641 ++ const struct sched_attr dl_squash_attr = {
5642 ++ .size = sizeof(struct sched_attr),
5643 ++ .sched_policy = SCHED_FIFO,
5644 ++ .sched_nice = 0,
5645 ++ .sched_priority = 99,
5646 ++ };
5647 ++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority;
5648 ++ int retval, oldpolicy = -1;
5649 ++ int policy = attr->sched_policy;
5650 ++ struct callback_head *head;
5651 ++ unsigned long flags;
5652 ++ struct rq *rq;
5653 ++ int reset_on_fork;
5654 ++ raw_spinlock_t *lock;
5655 ++
5656 ++ /* The pi code expects interrupts enabled */
5657 ++ BUG_ON(pi && in_interrupt());
5658 ++
5659 ++ /*
5660 ++ * Alt schedule FW supports SCHED_DEADLINE by squashing it into prio-0 SCHED_FIFO
5661 ++ */
5662 ++ if (unlikely(SCHED_DEADLINE == policy)) {
5663 ++ attr = &dl_squash_attr;
5664 ++ policy = attr->sched_policy;
5665 ++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority;
5666 ++ }
5667 ++recheck:
5668 ++ /* Double check policy once rq lock held */
5669 ++ if (policy < 0) {
5670 ++ reset_on_fork = p->sched_reset_on_fork;
5671 ++ policy = oldpolicy = p->policy;
5672 ++ } else {
5673 ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK);
5674 ++
5675 ++ if (policy > SCHED_IDLE)
5676 ++ return -EINVAL;
5677 ++ }
5678 ++
5679 ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL))
5680 ++ return -EINVAL;
5681 ++
5682 ++ /*
5683 ++ * Valid priorities for SCHED_FIFO and SCHED_RR are
5684 ++ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL and
5685 ++ * SCHED_BATCH and SCHED_IDLE is 0.
5686 ++ */
5687 ++ if (attr->sched_priority < 0 ||
5688 ++ (p->mm && attr->sched_priority > MAX_RT_PRIO - 1) ||
5689 ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1))
5690 ++ return -EINVAL;
5691 ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) !=
5692 ++ (attr->sched_priority != 0))
5693 ++ return -EINVAL;
5694 ++
5695 ++ /*
5696 ++ * Allow unprivileged RT tasks to decrease priority:
5697 ++ */
5698 ++ if (user && !capable(CAP_SYS_NICE)) {
5699 ++ if (SCHED_FIFO == policy || SCHED_RR == policy) {
5700 ++ unsigned long rlim_rtprio =
5701 ++ task_rlimit(p, RLIMIT_RTPRIO);
5702 ++
5703 ++ /* Can't set/change the rt policy */
5704 ++ if (policy != p->policy && !rlim_rtprio)
5705 ++ return -EPERM;
5706 ++
5707 ++ /* Can't increase priority */
5708 ++ if (attr->sched_priority > p->rt_priority &&
5709 ++ attr->sched_priority > rlim_rtprio)
5710 ++ return -EPERM;
5711 ++ }
5712 ++
5713 ++ /* Can't change other user's priorities */
5714 ++ if (!check_same_owner(p))
5715 ++ return -EPERM;
5716 ++
5717 ++ /* Normal users shall not reset the sched_reset_on_fork flag */
5718 ++ if (p->sched_reset_on_fork && !reset_on_fork)
5719 ++ return -EPERM;
5720 ++ }
5721 ++
5722 ++ if (user) {
5723 ++ retval = security_task_setscheduler(p);
5724 ++ if (retval)
5725 ++ return retval;
5726 ++ }
5727 ++
5728 ++ if (pi)
5729 ++ cpuset_read_lock();
5730 ++
5731 ++ /*
5732 ++ * Make sure no PI-waiters arrive (or leave) while we are
5733 ++ * changing the priority of the task:
5734 ++ */
5735 ++ raw_spin_lock_irqsave(&p->pi_lock, flags);
5736 ++
5737 ++ /*
5738 ++ * To be able to change p->policy safely, task_access_lock()
5739 ++ * must be called.
5740 ++ * If task_access_lock() is used here:
5741 ++ * for a task p which is not running, reading rq->stop is
5742 ++ * racy but acceptable, as ->stop doesn't change much.
5743 ++ * An enhancement could be made to read rq->stop safely.
5744 ++ */
5745 ++ rq = __task_access_lock(p, &lock);
5746 ++
5747 ++ /*
5748 ++ * Changing the policy of the stop threads is a very bad idea
5749 ++ */
5750 ++ if (p == rq->stop) {
5751 ++ retval = -EINVAL;
5752 ++ goto unlock;
5753 ++ }
5754 ++
5755 ++ /*
5756 ++ * If not changing anything there's no need to proceed further:
5757 ++ */
5758 ++ if (unlikely(policy == p->policy)) {
5759 ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
5760 ++ goto change;
5761 ++ if (!rt_policy(policy) &&
5762 ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio)
5763 ++ goto change;
5764 ++
5765 ++ p->sched_reset_on_fork = reset_on_fork;
5766 ++ retval = 0;
5767 ++ goto unlock;
5768 ++ }
5769 ++change:
5770 ++
5771 ++ /* Re-check policy now with rq lock held */
5772 ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5773 ++ policy = oldpolicy = -1;
5774 ++ __task_access_unlock(p, lock);
5775 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5776 ++ if (pi)
5777 ++ cpuset_read_unlock();
5778 ++ goto recheck;
5779 ++ }
5780 ++
5781 ++ p->sched_reset_on_fork = reset_on_fork;
5782 ++
5783 ++ if (pi) {
5784 ++ /*
5785 ++ * Take priority boosted tasks into account. If the new
5786 ++ * effective priority is unchanged, we just store the new
5787 ++ * normal parameters and do not touch the scheduler class and
5788 ++ * the runqueue. This will be done when the task deboosts
5789 ++ * itself.
5790 ++ */
5791 ++ if (rt_effective_prio(p, newprio) == p->prio) {
5792 ++ __setscheduler_params(p, attr);
5793 ++ retval = 0;
5794 ++ goto unlock;
5795 ++ }
5796 ++ }
5797 ++
5798 ++ __setscheduler(rq, p, attr, pi);
5799 ++
5800 ++ check_task_changed(p, rq);
5801 ++
5802 ++ /* Avoid rq from going away on us: */
5803 ++ preempt_disable();
5804 ++ head = splice_balance_callbacks(rq);
5805 ++ __task_access_unlock(p, lock);
5806 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5807 ++
5808 ++ if (pi) {
5809 ++ cpuset_read_unlock();
5810 ++ rt_mutex_adjust_pi(p);
5811 ++ }
5812 ++
5813 ++ /* Run balance callbacks after we've adjusted the PI chain: */
5814 ++ balance_callbacks(rq, head);
5815 ++ preempt_enable();
5816 ++
5817 ++ return 0;
5818 ++
5819 ++unlock:
5820 ++ __task_access_unlock(p, lock);
5821 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5822 ++ if (pi)
5823 ++ cpuset_read_unlock();
5824 ++ return retval;
5825 ++}
5826 ++
5827 ++static int _sched_setscheduler(struct task_struct *p, int policy,
5828 ++ const struct sched_param *param, bool check)
5829 ++{
5830 ++ struct sched_attr attr = {
5831 ++ .sched_policy = policy,
5832 ++ .sched_priority = param->sched_priority,
5833 ++ .sched_nice = PRIO_TO_NICE(p->static_prio),
5834 ++ };
5835 ++
5836 ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
5837 ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
5838 ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5839 ++ policy &= ~SCHED_RESET_ON_FORK;
5840 ++ attr.sched_policy = policy;
5841 ++ }
5842 ++
5843 ++ return __sched_setscheduler(p, &attr, check, true);
5844 ++}
5845 ++
5846 ++/**
5847 ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5848 ++ * @p: the task in question.
5849 ++ * @policy: new policy.
5850 ++ * @param: structure containing the new RT priority.
5851 ++ *
5852 ++ * Use sched_set_fifo(), read its comment.
5853 ++ *
5854 ++ * Return: 0 on success. An error code otherwise.
5855 ++ *
5856 ++ * NOTE that the task may already be dead.
5857 ++ */
5858 ++int sched_setscheduler(struct task_struct *p, int policy,
5859 ++ const struct sched_param *param)
5860 ++{
5861 ++ return _sched_setscheduler(p, policy, param, true);
5862 ++}
5863 ++
5864 ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
5865 ++{
5866 ++ return __sched_setscheduler(p, attr, true, true);
5867 ++}
5868 ++
5869 ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
5870 ++{
5871 ++ return __sched_setscheduler(p, attr, false, true);
5872 ++}
5873 ++
5874 ++/**
5875 ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5876 ++ * @p: the task in question.
5877 ++ * @policy: new policy.
5878 ++ * @param: structure containing the new RT priority.
5879 ++ *
5880 ++ * Just like sched_setscheduler, only don't bother checking if the
5881 ++ * current context has permission. For example, this is needed in
5882 ++ * stop_machine(): we create temporary high priority worker threads,
5883 ++ * but our caller might not have that capability.
5884 ++ *
5885 ++ * Return: 0 on success. An error code otherwise.
5886 ++ */
5887 ++int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5888 ++ const struct sched_param *param)
5889 ++{
5890 ++ return _sched_setscheduler(p, policy, param, false);
5891 ++}
5892 ++
5893 ++/*
5894 ++ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
5895 ++ * incapable of resource management, which is the one thing an OS really should
5896 ++ * be doing.
5897 ++ *
5898 ++ * This is of course the reason it is limited to privileged users only.
5899 ++ *
5900 ++ * Worse still; it is fundamentally impossible to compose static priority
5901 ++ * workloads. You cannot take two correctly working static prio workloads
5902 ++ * and smash them together and still expect them to work.
5903 ++ *
5904 ++ * For this reason 'all' FIFO tasks the kernel creates are basically at:
5905 ++ *
5906 ++ * MAX_RT_PRIO / 2
5907 ++ *
5908 ++ * The administrator _MUST_ configure the system, the kernel simply doesn't
5909 ++ * know enough information to make a sensible choice.
5910 ++ */
5911 ++void sched_set_fifo(struct task_struct *p)
5912 ++{
5913 ++ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
5914 ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5915 ++}
5916 ++EXPORT_SYMBOL_GPL(sched_set_fifo);
5917 ++
5918 ++/*
5919 ++ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
5920 ++ */
5921 ++void sched_set_fifo_low(struct task_struct *p)
5922 ++{
5923 ++ struct sched_param sp = { .sched_priority = 1 };
5924 ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5925 ++}
5926 ++EXPORT_SYMBOL_GPL(sched_set_fifo_low);
5927 ++
5928 ++void sched_set_normal(struct task_struct *p, int nice)
5929 ++{
5930 ++ struct sched_attr attr = {
5931 ++ .sched_policy = SCHED_NORMAL,
5932 ++ .sched_nice = nice,
5933 ++ };
5934 ++ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
5935 ++}
5936 ++EXPORT_SYMBOL_GPL(sched_set_normal);
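/*
 * Illustrative in-kernel sketch (not part of the patch): how a driver
 * kthread might use the helpers above.  The thread function and the
 * nice value are made up for the example; sched_set_fifo() picks the
 * default MAX_RT_PRIO / 2 priority for it.
 */
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/jiffies.h>

static int example_worker(void *unused)
{
	/* Ask for the kernel's default FIFO priority. */
	sched_set_fifo(current);

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);

	/* Drop back to a normal task with a mildly favourable nice value. */
	sched_set_normal(current, -5);
	return 0;
}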
5937 ++
5938 ++static int
5939 ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5940 ++{
5941 ++ struct sched_param lparam;
5942 ++ struct task_struct *p;
5943 ++ int retval;
5944 ++
5945 ++ if (!param || pid < 0)
5946 ++ return -EINVAL;
5947 ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5948 ++ return -EFAULT;
5949 ++
5950 ++ rcu_read_lock();
5951 ++ retval = -ESRCH;
5952 ++ p = find_process_by_pid(pid);
5953 ++ if (likely(p))
5954 ++ get_task_struct(p);
5955 ++ rcu_read_unlock();
5956 ++
5957 ++ if (likely(p)) {
5958 ++ retval = sched_setscheduler(p, policy, &lparam);
5959 ++ put_task_struct(p);
5960 ++ }
5961 ++
5962 ++ return retval;
5963 ++}
5964 ++
5965 ++/*
5966 ++ * Mimics kernel/events/core.c perf_copy_attr().
5967 ++ */
5968 ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
5969 ++{
5970 ++ u32 size;
5971 ++ int ret;
5972 ++
5973 ++ /* Zero the full structure, so that a short copy will be nice: */
5974 ++ memset(attr, 0, sizeof(*attr));
5975 ++
5976 ++ ret = get_user(size, &uattr->size);
5977 ++ if (ret)
5978 ++ return ret;
5979 ++
5980 ++ /* ABI compatibility quirk: */
5981 ++ if (!size)
5982 ++ size = SCHED_ATTR_SIZE_VER0;
5983 ++
5984 ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
5985 ++ goto err_size;
5986 ++
5987 ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
5988 ++ if (ret) {
5989 ++ if (ret == -E2BIG)
5990 ++ goto err_size;
5991 ++ return ret;
5992 ++ }
5993 ++
5994 ++ /*
5995 ++ * XXX: Do we want to be lenient like existing syscalls; or do we want
5996 ++ * to be strict and return an error on out-of-bounds values?
5997 ++ */
5998 ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19);
5999 ++
6000 ++ /* sched/core.c uses zero here but we already know ret is zero */
6001 ++ return 0;
6002 ++
6003 ++err_size:
6004 ++ put_user(sizeof(*attr), &uattr->size);
6005 ++ return -E2BIG;
6006 ++}
6007 ++
6008 ++/**
6009 ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
6010 ++ * @pid: the pid in question.
6011 ++ * @policy: new policy.
6012 ++ * @param: structure containing the new RT priority.
6013 ++ *
6014 ++ * Return: 0 on success. An error code otherwise.
6015 ++ */
6016 ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
6017 ++{
6018 ++ if (policy < 0)
6019 ++ return -EINVAL;
6020 ++
6021 ++ return do_sched_setscheduler(pid, policy, param);
6022 ++}
6023 ++
6024 ++/**
6025 ++ * sys_sched_setparam - set/change the RT priority of a thread
6026 ++ * @pid: the pid in question.
6027 ++ * @param: structure containing the new RT priority.
6028 ++ *
6029 ++ * Return: 0 on success. An error code otherwise.
6030 ++ */
6031 ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
6032 ++{
6033 ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
6034 ++}
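/*
 * Illustrative user-space sketch (not part of the patch): exercising
 * sched_setscheduler(2) and sched_getparam(2) as implemented in this
 * file.  The priority value 10 is an arbitrary example; the call needs
 * CAP_SYS_NICE or a suitable RLIMIT_RTPRIO to succeed.
 */
#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
		perror("sched_setscheduler");
		return 1;
	}

	if (sched_getparam(0, &sp) == -1) {
		perror("sched_getparam");
		return 1;
	}
	printf("policy=%d rt_priority=%d\n",
	       sched_getscheduler(0), sp.sched_priority);
	return 0;
}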
6035 ++
6036 ++/**
6037 ++ * sys_sched_setattr - same as above, but with extended sched_attr
6038 ++ * @pid: the pid in question.
6039 ++ * @uattr: structure containing the extended parameters.
6040 ++ */
6041 ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
6042 ++ unsigned int, flags)
6043 ++{
6044 ++ struct sched_attr attr;
6045 ++ struct task_struct *p;
6046 ++ int retval;
6047 ++
6048 ++ if (!uattr || pid < 0 || flags)
6049 ++ return -EINVAL;
6050 ++
6051 ++ retval = sched_copy_attr(uattr, &attr);
6052 ++ if (retval)
6053 ++ return retval;
6054 ++
6055 ++ if ((int)attr.sched_policy < 0)
6056 ++ return -EINVAL;
6057 ++
6058 ++ rcu_read_lock();
6059 ++ retval = -ESRCH;
6060 ++ p = find_process_by_pid(pid);
6061 ++ if (likely(p))
6062 ++ get_task_struct(p);
6063 ++ rcu_read_unlock();
6064 ++
6065 ++ if (likely(p)) {
6066 ++ retval = sched_setattr(p, &attr);
6067 ++ put_task_struct(p);
6068 ++ }
6069 ++
6070 ++ return retval;
6071 ++}
6072 ++
6073 ++/**
6074 ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
6075 ++ * @pid: the pid in question.
6076 ++ *
6077 ++ * Return: On success, the policy of the thread. Otherwise, a negative error
6078 ++ * code.
6079 ++ */
6080 ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6081 ++{
6082 ++ struct task_struct *p;
6083 ++ int retval = -EINVAL;
6084 ++
6085 ++ if (pid < 0)
6086 ++ goto out_nounlock;
6087 ++
6088 ++ retval = -ESRCH;
6089 ++ rcu_read_lock();
6090 ++ p = find_process_by_pid(pid);
6091 ++ if (p) {
6092 ++ retval = security_task_getscheduler(p);
6093 ++ if (!retval)
6094 ++ retval = p->policy;
6095 ++ }
6096 ++ rcu_read_unlock();
6097 ++
6098 ++out_nounlock:
6099 ++ return retval;
6100 ++}
6101 ++
6102 ++/**
6103 ++ * sys_sched_getparam - get the RT priority of a thread
6104 ++ * @pid: the pid in question.
6105 ++ * @param: structure containing the RT priority.
6106 ++ *
6107 ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
6108 ++ * code.
6109 ++ */
6110 ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6111 ++{
6112 ++ struct sched_param lp = { .sched_priority = 0 };
6113 ++ struct task_struct *p;
6114 ++ int retval = -EINVAL;
6115 ++
6116 ++ if (!param || pid < 0)
6117 ++ goto out_nounlock;
6118 ++
6119 ++ rcu_read_lock();
6120 ++ p = find_process_by_pid(pid);
6121 ++ retval = -ESRCH;
6122 ++ if (!p)
6123 ++ goto out_unlock;
6124 ++
6125 ++ retval = security_task_getscheduler(p);
6126 ++ if (retval)
6127 ++ goto out_unlock;
6128 ++
6129 ++ if (task_has_rt_policy(p))
6130 ++ lp.sched_priority = p->rt_priority;
6131 ++ rcu_read_unlock();
6132 ++
6133 ++ /*
6134 ++ * This one might sleep, we cannot do it with a spinlock held ...
6135 ++ */
6136 ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
6137 ++
6138 ++out_nounlock:
6139 ++ return retval;
6140 ++
6141 ++out_unlock:
6142 ++ rcu_read_unlock();
6143 ++ return retval;
6144 ++}
6145 ++
6146 ++/*
6147 ++ * Copy the kernel size attribute structure (which might be larger
6148 ++ * than what user-space knows about) to user-space.
6149 ++ *
6150 ++ * Note that all cases are valid: user-space buffer can be larger or
6151 ++ * smaller than the kernel-space buffer. The usual case is that both
6152 ++ * have the same size.
6153 ++ */
6154 ++static int
6155 ++sched_attr_copy_to_user(struct sched_attr __user *uattr,
6156 ++ struct sched_attr *kattr,
6157 ++ unsigned int usize)
6158 ++{
6159 ++ unsigned int ksize = sizeof(*kattr);
6160 ++
6161 ++ if (!access_ok(uattr, usize))
6162 ++ return -EFAULT;
6163 ++
6164 ++ /*
6165 ++ * sched_getattr() ABI forwards and backwards compatibility:
6166 ++ *
6167 ++ * If usize == ksize then we just copy everything to user-space and all is good.
6168 ++ *
6169 ++ * If usize < ksize then we only copy as much as user-space has space for,
6170 ++ * this keeps ABI compatibility as well. We skip the rest.
6171 ++ *
6172 ++ * If usize > ksize then user-space is using a newer version of the ABI,
6173 ++ * which part the kernel doesn't know about. Just ignore it - tooling can
6174 ++ * detect the kernel's knowledge of attributes from the attr->size value
6175 ++ * which is set to ksize in this case.
6176 ++ */
6177 ++ kattr->size = min(usize, ksize);
6178 ++
6179 ++ if (copy_to_user(uattr, kattr, kattr->size))
6180 ++ return -EFAULT;
6181 ++
6182 ++ return 0;
6183 ++}
6184 ++
6185 ++/**
6186 ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
6187 ++ * @pid: the pid in question.
6188 ++ * @uattr: structure containing the extended parameters.
6189 ++ * @usize: sizeof(attr) for fwd/bwd comp.
6190 ++ * @flags: for future extension.
6191 ++ */
6192 ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
6193 ++ unsigned int, usize, unsigned int, flags)
6194 ++{
6195 ++ struct sched_attr kattr = { };
6196 ++ struct task_struct *p;
6197 ++ int retval;
6198 ++
6199 ++ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
6200 ++ usize < SCHED_ATTR_SIZE_VER0 || flags)
6201 ++ return -EINVAL;
6202 ++
6203 ++ rcu_read_lock();
6204 ++ p = find_process_by_pid(pid);
6205 ++ retval = -ESRCH;
6206 ++ if (!p)
6207 ++ goto out_unlock;
6208 ++
6209 ++ retval = security_task_getscheduler(p);
6210 ++ if (retval)
6211 ++ goto out_unlock;
6212 ++
6213 ++ kattr.sched_policy = p->policy;
6214 ++ if (p->sched_reset_on_fork)
6215 ++ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
6216 ++ if (task_has_rt_policy(p))
6217 ++ kattr.sched_priority = p->rt_priority;
6218 ++ else
6219 ++ kattr.sched_nice = task_nice(p);
6220 ++
6221 ++#ifdef CONFIG_UCLAMP_TASK
6222 ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
6223 ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
6224 ++#endif
6225 ++
6226 ++ rcu_read_unlock();
6227 ++
6228 ++ return sched_attr_copy_to_user(uattr, &kattr, usize);
6229 ++
6230 ++out_unlock:
6231 ++ rcu_read_unlock();
6232 ++ return retval;
6233 ++}
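/*
 * Illustrative user-space sketch (not part of the patch): driving
 * sys_sched_setattr()/sys_sched_getattr() through syscall(2), since glibc
 * has historically not wrapped them.  The struct below is the original
 * SCHED_ATTR_SIZE_VER0 layout; later kernels append utilization-clamp
 * fields, which the size-based ABI handling above tolerates.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr_v0 {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr_v0 attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_BATCH;
	attr.sched_nice = 5;

	if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1) {
		perror("sched_setattr");
		return 1;
	}

	memset(&attr, 0, sizeof(attr));
	if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0) == -1) {
		perror("sched_getattr");
		return 1;
	}
	printf("policy=%u nice=%d (attr.size reported by kernel: %u)\n",
	       attr.sched_policy, attr.sched_nice, attr.size);
	return 0;
}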
6234 ++
6235 ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6236 ++{
6237 ++ cpumask_var_t cpus_allowed, new_mask;
6238 ++ struct task_struct *p;
6239 ++ int retval;
6240 ++
6241 ++ rcu_read_lock();
6242 ++
6243 ++ p = find_process_by_pid(pid);
6244 ++ if (!p) {
6245 ++ rcu_read_unlock();
6246 ++ return -ESRCH;
6247 ++ }
6248 ++
6249 ++ /* Prevent p going away */
6250 ++ get_task_struct(p);
6251 ++ rcu_read_unlock();
6252 ++
6253 ++ if (p->flags & PF_NO_SETAFFINITY) {
6254 ++ retval = -EINVAL;
6255 ++ goto out_put_task;
6256 ++ }
6257 ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6258 ++ retval = -ENOMEM;
6259 ++ goto out_put_task;
6260 ++ }
6261 ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
6262 ++ retval = -ENOMEM;
6263 ++ goto out_free_cpus_allowed;
6264 ++ }
6265 ++ retval = -EPERM;
6266 ++ if (!check_same_owner(p)) {
6267 ++ rcu_read_lock();
6268 ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
6269 ++ rcu_read_unlock();
6270 ++ goto out_free_new_mask;
6271 ++ }
6272 ++ rcu_read_unlock();
6273 ++ }
6274 ++
6275 ++ retval = security_task_setscheduler(p);
6276 ++ if (retval)
6277 ++ goto out_free_new_mask;
6278 ++
6279 ++ cpuset_cpus_allowed(p, cpus_allowed);
6280 ++ cpumask_and(new_mask, in_mask, cpus_allowed);
6281 ++
6282 ++again:
6283 ++ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
6284 ++
6285 ++ if (!retval) {
6286 ++ cpuset_cpus_allowed(p, cpus_allowed);
6287 ++ if (!cpumask_subset(new_mask, cpus_allowed)) {
6288 ++ /*
6289 ++ * We must have raced with a concurrent cpuset
6290 ++ * update. Just reset the cpus_allowed to the
6291 ++ * cpuset's cpus_allowed
6292 ++ */
6293 ++ cpumask_copy(new_mask, cpus_allowed);
6294 ++ goto again;
6295 ++ }
6296 ++ }
6297 ++out_free_new_mask:
6298 ++ free_cpumask_var(new_mask);
6299 ++out_free_cpus_allowed:
6300 ++ free_cpumask_var(cpus_allowed);
6301 ++out_put_task:
6302 ++ put_task_struct(p);
6303 ++ return retval;
6304 ++}
6305 ++
6306 ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
6307 ++ struct cpumask *new_mask)
6308 ++{
6309 ++ if (len < cpumask_size())
6310 ++ cpumask_clear(new_mask);
6311 ++ else if (len > cpumask_size())
6312 ++ len = cpumask_size();
6313 ++
6314 ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
6315 ++}
6316 ++
6317 ++/**
6318 ++ * sys_sched_setaffinity - set the CPU affinity of a process
6319 ++ * @pid: pid of the process
6320 ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6321 ++ * @user_mask_ptr: user-space pointer to the new CPU mask
6322 ++ *
6323 ++ * Return: 0 on success. An error code otherwise.
6324 ++ */
6325 ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6326 ++ unsigned long __user *, user_mask_ptr)
6327 ++{
6328 ++ cpumask_var_t new_mask;
6329 ++ int retval;
6330 ++
6331 ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
6332 ++ return -ENOMEM;
6333 ++
6334 ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6335 ++ if (retval == 0)
6336 ++ retval = sched_setaffinity(pid, new_mask);
6337 ++ free_cpumask_var(new_mask);
6338 ++ return retval;
6339 ++}
6340 ++
6341 ++long sched_getaffinity(pid_t pid, cpumask_t *mask)
6342 ++{
6343 ++ struct task_struct *p;
6344 ++ raw_spinlock_t *lock;
6345 ++ unsigned long flags;
6346 ++ int retval;
6347 ++
6348 ++ rcu_read_lock();
6349 ++
6350 ++ retval = -ESRCH;
6351 ++ p = find_process_by_pid(pid);
6352 ++ if (!p)
6353 ++ goto out_unlock;
6354 ++
6355 ++ retval = security_task_getscheduler(p);
6356 ++ if (retval)
6357 ++ goto out_unlock;
6358 ++
6359 ++ task_access_lock_irqsave(p, &lock, &flags);
6360 ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
6361 ++ task_access_unlock_irqrestore(p, lock, &flags);
6362 ++
6363 ++out_unlock:
6364 ++ rcu_read_unlock();
6365 ++
6366 ++ return retval;
6367 ++}
6368 ++
6369 ++/**
6370 ++ * sys_sched_getaffinity - get the CPU affinity of a process
6371 ++ * @pid: pid of the process
6372 ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6373 ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask
6374 ++ *
6375 ++ * Return: size of CPU mask copied to user_mask_ptr on success. An
6376 ++ * error code otherwise.
6377 ++ */
6378 ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6379 ++ unsigned long __user *, user_mask_ptr)
6380 ++{
6381 ++ int ret;
6382 ++ cpumask_var_t mask;
6383 ++
6384 ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
6385 ++ return -EINVAL;
6386 ++ if (len & (sizeof(unsigned long)-1))
6387 ++ return -EINVAL;
6388 ++
6389 ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6390 ++ return -ENOMEM;
6391 ++
6392 ++ ret = sched_getaffinity(pid, mask);
6393 ++ if (ret == 0) {
6394 ++ unsigned int retlen = min_t(size_t, len, cpumask_size());
6395 ++
6396 ++ if (copy_to_user(user_mask_ptr, mask, retlen))
6397 ++ ret = -EFAULT;
6398 ++ else
6399 ++ ret = retlen;
6400 ++ }
6401 ++ free_cpumask_var(mask);
6402 ++
6403 ++ return ret;
6404 ++}
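/*
 * Illustrative user-space sketch (not part of the patch): pinning the
 * calling thread to CPU 0 and reading the mask back through the
 * sched_setaffinity()/sched_getaffinity() syscalls defined above.
 * CPU 0 is an arbitrary choice for the example.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;
	int cpu;

	CPU_ZERO(&set);
	CPU_SET(0, &set);

	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		return 1;
	}

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_getaffinity");
		return 1;
	}

	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &set))
			printf("allowed: CPU %d\n", cpu);
	return 0;
}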
6405 ++
6406 ++static void do_sched_yield(void)
6407 ++{
6408 ++ struct rq *rq;
6409 ++ struct rq_flags rf;
6410 ++
6411 ++ if (!sched_yield_type)
6412 ++ return;
6413 ++
6414 ++ rq = this_rq_lock_irq(&rf);
6415 ++
6416 ++ schedstat_inc(rq->yld_count);
6417 ++
6418 ++ if (1 == sched_yield_type) {
6419 ++ if (!rt_task(current))
6420 ++ do_sched_yield_type_1(current, rq);
6421 ++ } else if (2 == sched_yield_type) {
6422 ++ if (rq->nr_running > 1)
6423 ++ rq->skip = current;
6424 ++ }
6425 ++
6426 ++ preempt_disable();
6427 ++ raw_spin_unlock_irq(&rq->lock);
6428 ++ sched_preempt_enable_no_resched();
6429 ++
6430 ++ schedule();
6431 ++}
6432 ++
6433 ++/**
6434 ++ * sys_sched_yield - yield the current processor to other threads.
6435 ++ *
6436 ++ * This function yields the current CPU to other tasks. If there are no
6437 ++ * other threads running on this CPU then this function will return.
6438 ++ *
6439 ++ * Return: 0.
6440 ++ */
6441 ++SYSCALL_DEFINE0(sched_yield)
6442 ++{
6443 ++ do_sched_yield();
6444 ++ return 0;
6445 ++}
6446 ++
6447 ++#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
6448 ++int __sched __cond_resched(void)
6449 ++{
6450 ++ if (should_resched(0)) {
6451 ++ preempt_schedule_common();
6452 ++ return 1;
6453 ++ }
6454 ++#ifndef CONFIG_PREEMPT_RCU
6455 ++ rcu_all_qs();
6456 ++#endif
6457 ++ return 0;
6458 ++}
6459 ++EXPORT_SYMBOL(__cond_resched);
6460 ++#endif
6461 ++
6462 ++#ifdef CONFIG_PREEMPT_DYNAMIC
6463 ++DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
6464 ++EXPORT_STATIC_CALL_TRAMP(cond_resched);
6465 ++
6466 ++DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
6467 ++EXPORT_STATIC_CALL_TRAMP(might_resched);
6468 ++#endif
6469 ++
6470 ++/*
6471 ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6472 ++ * call schedule, and on return reacquire the lock.
6473 ++ *
6474 ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
6475 ++ * operations here to prevent schedule() from being called twice (once via
6476 ++ * spin_unlock(), once by hand).
6477 ++ */
6478 ++int __cond_resched_lock(spinlock_t *lock)
6479 ++{
6480 ++ int resched = should_resched(PREEMPT_LOCK_OFFSET);
6481 ++ int ret = 0;
6482 ++
6483 ++ lockdep_assert_held(lock);
6484 ++
6485 ++ if (spin_needbreak(lock) || resched) {
6486 ++ spin_unlock(lock);
6487 ++ if (resched)
6488 ++ preempt_schedule_common();
6489 ++ else
6490 ++ cpu_relax();
6491 ++ ret = 1;
6492 ++ spin_lock(lock);
6493 ++ }
6494 ++ return ret;
6495 ++}
6496 ++EXPORT_SYMBOL(__cond_resched_lock);
6497 ++
6498 ++int __cond_resched_rwlock_read(rwlock_t *lock)
6499 ++{
6500 ++ int resched = should_resched(PREEMPT_LOCK_OFFSET);
6501 ++ int ret = 0;
6502 ++
6503 ++ lockdep_assert_held_read(lock);
6504 ++
6505 ++ if (rwlock_needbreak(lock) || resched) {
6506 ++ read_unlock(lock);
6507 ++ if (resched)
6508 ++ preempt_schedule_common();
6509 ++ else
6510 ++ cpu_relax();
6511 ++ ret = 1;
6512 ++ read_lock(lock);
6513 ++ }
6514 ++ return ret;
6515 ++}
6516 ++EXPORT_SYMBOL(__cond_resched_rwlock_read);
6517 ++
6518 ++int __cond_resched_rwlock_write(rwlock_t *lock)
6519 ++{
6520 ++ int resched = should_resched(PREEMPT_LOCK_OFFSET);
6521 ++ int ret = 0;
6522 ++
6523 ++ lockdep_assert_held_write(lock);
6524 ++
6525 ++ if (rwlock_needbreak(lock) || resched) {
6526 ++ write_unlock(lock);
6527 ++ if (resched)
6528 ++ preempt_schedule_common();
6529 ++ else
6530 ++ cpu_relax();
6531 ++ ret = 1;
6532 ++ write_lock(lock);
6533 ++ }
6534 ++ return ret;
6535 ++}
6536 ++EXPORT_SYMBOL(__cond_resched_rwlock_write);
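/*
 * Illustrative in-kernel sketch (not part of the patch): the pattern the
 * lock-dropping helpers above exist for -- a long scan under a spinlock
 * that yields the CPU when a reschedule is pending.  The table structure
 * is made up for the example and must stay valid across the brief unlock.
 */
#include <linux/sched.h>
#include <linux/spinlock.h>

struct example_table {
	spinlock_t lock;
	unsigned int nr;
	void **slots;
};

static void example_scan(struct example_table *t)
{
	unsigned int i;

	spin_lock(&t->lock);
	for (i = 0; i < t->nr; i++) {
		/* ... examine t->slots[i] under t->lock ... */

		/* Briefly drops and re-takes t->lock if a resched is due. */
		cond_resched_lock(&t->lock);
	}
	spin_unlock(&t->lock);
}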
6537 ++
6538 ++/**
6539 ++ * yield - yield the current processor to other threads.
6540 ++ *
6541 ++ * Do not ever use this function, there's a 99% chance you're doing it wrong.
6542 ++ *
6543 ++ * The scheduler is at all times free to pick the calling task as the most
6544 ++ * eligible task to run, if removing the yield() call from your code breaks
6545 ++ * it, it's already broken.
6546 ++ *
6547 ++ * Typical broken usage is:
6548 ++ *
6549 ++ * while (!event)
6550 ++ * yield();
6551 ++ *
6552 ++ * where one assumes that yield() will let 'the other' process run that will
6553 ++ * make event true. If the current task is a SCHED_FIFO task that will never
6554 ++ * happen. Never use yield() as a progress guarantee!!
6555 ++ *
6556 ++ * If you want to use yield() to wait for something, use wait_event().
6557 ++ * If you want to use yield() to be 'nice' for others, use cond_resched().
6558 ++ * If you still want to use yield(), do not!
6559 ++ */
6560 ++void __sched yield(void)
6561 ++{
6562 ++ set_current_state(TASK_RUNNING);
6563 ++ do_sched_yield();
6564 ++}
6565 ++EXPORT_SYMBOL(yield);
6566 ++
6567 ++/**
6568 ++ * yield_to - yield the current processor to another thread in
6569 ++ * your thread group, or accelerate that thread toward the
6570 ++ * processor it's on.
6571 ++ * @p: target task
6572 ++ * @preempt: whether task preemption is allowed or not
6573 ++ *
6574 ++ * It's the caller's job to ensure that the target task struct
6575 ++ * can't go away on us before we can do any checks.
6576 ++ *
6577 ++ * In Alt schedule FW, yield_to is not supported.
6578 ++ *
6579 ++ * Return:
6580 ++ * true (>0) if we indeed boosted the target task.
6581 ++ * false (0) if we failed to boost the target.
6582 ++ * -ESRCH if there's no task to yield to.
6583 ++ */
6584 ++int __sched yield_to(struct task_struct *p, bool preempt)
6585 ++{
6586 ++ return 0;
6587 ++}
6588 ++EXPORT_SYMBOL_GPL(yield_to);
6589 ++
6590 ++int io_schedule_prepare(void)
6591 ++{
6592 ++ int old_iowait = current->in_iowait;
6593 ++
6594 ++ current->in_iowait = 1;
6595 ++ blk_schedule_flush_plug(current);
6596 ++
6597 ++ return old_iowait;
6598 ++}
6599 ++
6600 ++void io_schedule_finish(int token)
6601 ++{
6602 ++ current->in_iowait = token;
6603 ++}
6604 ++
6605 ++/*
6606 ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6607 ++ * that process accounting knows that this is a task in IO wait state.
6608 ++ *
6609 ++ * But don't do that if it is a deliberate, throttling IO wait (this task
6610 ++ * has set its backing_dev_info: the queue against which it should throttle)
6611 ++ */
6612 ++
6613 ++long __sched io_schedule_timeout(long timeout)
6614 ++{
6615 ++ int token;
6616 ++ long ret;
6617 ++
6618 ++ token = io_schedule_prepare();
6619 ++ ret = schedule_timeout(timeout);
6620 ++ io_schedule_finish(token);
6621 ++
6622 ++ return ret;
6623 ++}
6624 ++EXPORT_SYMBOL(io_schedule_timeout);
6625 ++
6626 ++void __sched io_schedule(void)
6627 ++{
6628 ++ int token;
6629 ++
6630 ++ token = io_schedule_prepare();
6631 ++ schedule();
6632 ++ io_schedule_finish(token);
6633 ++}
6634 ++EXPORT_SYMBOL(io_schedule);
6635 ++
6636 ++/**
6637 ++ * sys_sched_get_priority_max - return maximum RT priority.
6638 ++ * @policy: scheduling class.
6639 ++ *
6640 ++ * Return: On success, this syscall returns the maximum
6641 ++ * rt_priority that can be used by a given scheduling class.
6642 ++ * On failure, a negative error code is returned.
6643 ++ */
6644 ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6645 ++{
6646 ++ int ret = -EINVAL;
6647 ++
6648 ++ switch (policy) {
6649 ++ case SCHED_FIFO:
6650 ++ case SCHED_RR:
6651 ++ ret = MAX_RT_PRIO - 1;
6652 ++ break;
6653 ++ case SCHED_NORMAL:
6654 ++ case SCHED_BATCH:
6655 ++ case SCHED_IDLE:
6656 ++ ret = 0;
6657 ++ break;
6658 ++ }
6659 ++ return ret;
6660 ++}
6661 ++
6662 ++/**
6663 ++ * sys_sched_get_priority_min - return minimum RT priority.
6664 ++ * @policy: scheduling class.
6665 ++ *
6666 ++ * Return: On success, this syscall returns the minimum
6667 ++ * rt_priority that can be used by a given scheduling class.
6668 ++ * On failure, a negative error code is returned.
6669 ++ */
6670 ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6671 ++{
6672 ++ int ret = -EINVAL;
6673 ++
6674 ++ switch (policy) {
6675 ++ case SCHED_FIFO:
6676 ++ case SCHED_RR:
6677 ++ ret = 1;
6678 ++ break;
6679 ++ case SCHED_NORMAL:
6680 ++ case SCHED_BATCH:
6681 ++ case SCHED_IDLE:
6682 ++ ret = 0;
6683 ++ break;
6684 ++ }
6685 ++ return ret;
6686 ++}
6687 ++
6688 ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
6689 ++{
6690 ++ struct task_struct *p;
6691 ++ int retval;
6692 ++
6693 ++ alt_sched_debug();
6694 ++
6695 ++ if (pid < 0)
6696 ++ return -EINVAL;
6697 ++
6698 ++ retval = -ESRCH;
6699 ++ rcu_read_lock();
6700 ++ p = find_process_by_pid(pid);
6701 ++ if (!p)
6702 ++ goto out_unlock;
6703 ++
6704 ++ retval = security_task_getscheduler(p);
6705 ++ if (retval)
6706 ++ goto out_unlock;
6707 ++ rcu_read_unlock();
6708 ++
6709 ++ *t = ns_to_timespec64(sched_timeslice_ns);
6710 ++ return 0;
6711 ++
6712 ++out_unlock:
6713 ++ rcu_read_unlock();
6714 ++ return retval;
6715 ++}
6716 ++
6717 ++/**
6718 ++ * sys_sched_rr_get_interval - return the default timeslice of a process.
6719 ++ * @pid: pid of the process.
6720 ++ * @interval: userspace pointer to the timeslice value.
6721 ++ *
6722 ++ *
6723 ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
6724 ++ * an error code.
6725 ++ */
6726 ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6727 ++ struct __kernel_timespec __user *, interval)
6728 ++{
6729 ++ struct timespec64 t;
6730 ++ int retval = sched_rr_get_interval(pid, &t);
6731 ++
6732 ++ if (retval == 0)
6733 ++ retval = put_timespec64(&t, interval);
6734 ++
6735 ++ return retval;
6736 ++}
6737 ++
6738 ++#ifdef CONFIG_COMPAT_32BIT_TIME
6739 ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
6740 ++ struct old_timespec32 __user *, interval)
6741 ++{
6742 ++ struct timespec64 t;
6743 ++ int retval = sched_rr_get_interval(pid, &t);
6744 ++
6745 ++ if (retval == 0)
6746 ++ retval = put_old_timespec32(&t, interval);
6747 ++ return retval;
6748 ++}
6749 ++#endif
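/*
 * Illustrative user-space sketch (not part of the patch): reading the
 * timeslice back through sched_rr_get_interval(2).  As the helper above
 * shows, under BMQ/PDS the value reported is sched_timeslice_ns for
 * every task, not just SCHED_RR ones.
 */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == -1) {	/* 0 == calling process */
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}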
6750 ++
6751 ++void sched_show_task(struct task_struct *p)
6752 ++{
6753 ++ unsigned long free = 0;
6754 ++ int ppid;
6755 ++
6756 ++ if (!try_get_task_stack(p))
6757 ++ return;
6758 ++
6759 ++ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
6760 ++
6761 ++ if (p->state == TASK_RUNNING)
6762 ++ pr_cont(" running task ");
6763 ++#ifdef CONFIG_DEBUG_STACK_USAGE
6764 ++ free = stack_not_used(p);
6765 ++#endif
6766 ++ ppid = 0;
6767 ++ rcu_read_lock();
6768 ++ if (pid_alive(p))
6769 ++ ppid = task_pid_nr(rcu_dereference(p->real_parent));
6770 ++ rcu_read_unlock();
6771 ++ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
6772 ++ free, task_pid_nr(p), ppid,
6773 ++ (unsigned long)task_thread_info(p)->flags);
6774 ++
6775 ++ print_worker_info(KERN_INFO, p);
6776 ++ print_stop_info(KERN_INFO, p);
6777 ++ show_stack(p, NULL, KERN_INFO);
6778 ++ put_task_stack(p);
6779 ++}
6780 ++EXPORT_SYMBOL_GPL(sched_show_task);
6781 ++
6782 ++static inline bool
6783 ++state_filter_match(unsigned long state_filter, struct task_struct *p)
6784 ++{
6785 ++ /* no filter, everything matches */
6786 ++ if (!state_filter)
6787 ++ return true;
6788 ++
6789 ++ /* filter, but doesn't match */
6790 ++ if (!(p->state & state_filter))
6791 ++ return false;
6792 ++
6793 ++ /*
6794 ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
6795 ++ * TASK_KILLABLE).
6796 ++ */
6797 ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
6798 ++ return false;
6799 ++
6800 ++ return true;
6801 ++}
6802 ++
6803 ++
6804 ++void show_state_filter(unsigned long state_filter)
6805 ++{
6806 ++ struct task_struct *g, *p;
6807 ++
6808 ++ rcu_read_lock();
6809 ++ for_each_process_thread(g, p) {
6810 ++ /*
6811 ++ * reset the NMI-timeout, listing all files on a slow
6812 ++ * console might take a lot of time:
6813 ++ * Also, reset softlockup watchdogs on all CPUs, because
6814 ++ * another CPU might be blocked waiting for us to process
6815 ++ * an IPI.
6816 ++ */
6817 ++ touch_nmi_watchdog();
6818 ++ touch_all_softlockup_watchdogs();
6819 ++ if (state_filter_match(state_filter, p))
6820 ++ sched_show_task(p);
6821 ++ }
6822 ++
6823 ++#ifdef CONFIG_SCHED_DEBUG
6824 ++ /* TODO: Alt schedule FW should support this
6825 ++ if (!state_filter)
6826 ++ sysrq_sched_debug_show();
6827 ++ */
6828 ++#endif
6829 ++ rcu_read_unlock();
6830 ++ /*
6831 ++ * Only show locks if all tasks are dumped:
6832 ++ */
6833 ++ if (!state_filter)
6834 ++ debug_show_all_locks();
6835 ++}
6836 ++
6837 ++void dump_cpu_task(int cpu)
6838 ++{
6839 ++ pr_info("Task dump for CPU %d:\n", cpu);
6840 ++ sched_show_task(cpu_curr(cpu));
6841 ++}
6842 ++
6843 ++/**
6844 ++ * init_idle - set up an idle thread for a given CPU
6845 ++ * @idle: task in question
6846 ++ * @cpu: CPU the idle task belongs to
6847 ++ *
6848 ++ * NOTE: this function does not set the idle thread's NEED_RESCHED
6849 ++ * flag, to make booting more robust.
6850 ++ */
6851 ++void init_idle(struct task_struct *idle, int cpu)
6852 ++{
6853 ++ struct rq *rq = cpu_rq(cpu);
6854 ++ unsigned long flags;
6855 ++
6856 ++ __sched_fork(0, idle);
6857 ++
6858 ++ raw_spin_lock_irqsave(&idle->pi_lock, flags);
6859 ++ raw_spin_lock(&rq->lock);
6860 ++ update_rq_clock(rq);
6861 ++
6862 ++ idle->last_ran = rq->clock_task;
6863 ++ idle->state = TASK_RUNNING;
6864 ++ idle->flags |= PF_IDLE;
6865 ++ sched_queue_init_idle(&rq->queue, idle);
6866 ++
6867 ++ scs_task_reset(idle);
6868 ++ kasan_unpoison_task_stack(idle);
6869 ++
6870 ++#ifdef CONFIG_SMP
6871 ++ /*
6872 ++ * It's possible that init_idle() gets called multiple times on a task,
6873 ++ * in that case do_set_cpus_allowed() will not do the right thing.
6874 ++ *
6875 ++ * And since this is boot we can forgo the serialisation.
6876 ++ */
6877 ++ set_cpus_allowed_common(idle, cpumask_of(cpu));
6878 ++#endif
6879 ++
6880 ++ /* Silence PROVE_RCU */
6881 ++ rcu_read_lock();
6882 ++ __set_task_cpu(idle, cpu);
6883 ++ rcu_read_unlock();
6884 ++
6885 ++ rq->idle = idle;
6886 ++ rcu_assign_pointer(rq->curr, idle);
6887 ++ idle->on_cpu = 1;
6888 ++
6889 ++ raw_spin_unlock(&rq->lock);
6890 ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
6891 ++
6892 ++ /* Set the preempt count _outside_ the spinlocks! */
6893 ++ init_idle_preempt_count(idle, cpu);
6894 ++
6895 ++ ftrace_graph_init_idle_task(idle, cpu);
6896 ++ vtime_init_idle(idle, cpu);
6897 ++#ifdef CONFIG_SMP
6898 ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
6899 ++#endif
6900 ++}
6901 ++
6902 ++#ifdef CONFIG_SMP
6903 ++
6904 ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
6905 ++ const struct cpumask __maybe_unused *trial)
6906 ++{
6907 ++ return 1;
6908 ++}
6909 ++
6910 ++int task_can_attach(struct task_struct *p,
6911 ++ const struct cpumask *cs_cpus_allowed)
6912 ++{
6913 ++ int ret = 0;
6914 ++
6915 ++ /*
6916 ++ * Kthreads which disallow setaffinity shouldn't be moved
6917 ++ * to a new cpuset; we don't want to change their CPU
6918 ++ * affinity and isolating such threads by their set of
6919 ++ * allowed nodes is unnecessary. Thus, cpusets are not
6920 ++ * applicable for such threads. This prevents checking for
6921 ++ * success of set_cpus_allowed_ptr() on all attached tasks
6922 ++ * before cpus_mask may be changed.
6923 ++ */
6924 ++ if (p->flags & PF_NO_SETAFFINITY)
6925 ++ ret = -EINVAL;
6926 ++
6927 ++ return ret;
6928 ++}
6929 ++
6930 ++bool sched_smp_initialized __read_mostly;
6931 ++
6932 ++#ifdef CONFIG_HOTPLUG_CPU
6933 ++/*
6934 ++ * Ensures that the idle task is using init_mm right before its CPU goes
6935 ++ * offline.
6936 ++ */
6937 ++void idle_task_exit(void)
6938 ++{
6939 ++ struct mm_struct *mm = current->active_mm;
6940 ++
6941 ++ BUG_ON(current != this_rq()->idle);
6942 ++
6943 ++ if (mm != &init_mm) {
6944 ++ switch_mm(mm, &init_mm, current);
6945 ++ finish_arch_post_lock_switch();
6946 ++ }
6947 ++
6948 ++ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
6949 ++}
6950 ++
6951 ++static int __balance_push_cpu_stop(void *arg)
6952 ++{
6953 ++ struct task_struct *p = arg;
6954 ++ struct rq *rq = this_rq();
6955 ++ struct rq_flags rf;
6956 ++ int cpu;
6957 ++
6958 ++ raw_spin_lock_irq(&p->pi_lock);
6959 ++ rq_lock(rq, &rf);
6960 ++
6961 ++ update_rq_clock(rq);
6962 ++
6963 ++ if (task_rq(p) == rq && task_on_rq_queued(p)) {
6964 ++ cpu = select_fallback_rq(rq->cpu, p);
6965 ++ rq = __migrate_task(rq, p, cpu);
6966 ++ }
6967 ++
6968 ++ rq_unlock(rq, &rf);
6969 ++ raw_spin_unlock_irq(&p->pi_lock);
6970 ++
6971 ++ put_task_struct(p);
6972 ++
6973 ++ return 0;
6974 ++}
6975 ++
6976 ++static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
6977 ++
6978 ++/*
6979 ++ * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only
6980 ++ * effective while the CPU is going down (hotplug offline).
6981 ++ */
6982 ++static void balance_push(struct rq *rq)
6983 ++{
6984 ++ struct task_struct *push_task = rq->curr;
6985 ++
6986 ++ lockdep_assert_held(&rq->lock);
6987 ++ SCHED_WARN_ON(rq->cpu != smp_processor_id());
6988 ++
6989 ++ /*
6990 ++ * Ensure the thing is persistent until balance_push_set(.on = false);
6991 ++ */
6992 ++ rq->balance_callback = &balance_push_callback;
6993 ++
6994 ++ /*
6995 ++ * Only active while going offline.
6996 ++ */
6997 ++ if (!cpu_dying(rq->cpu))
6998 ++ return;
6999 ++
7000 ++ /*
7001 ++ * Both the cpu-hotplug and stop task are in this case and are
7002 ++ * required to complete the hotplug process.
7003 ++ *
7004 ++ * XXX: the idle task does not match kthread_is_per_cpu() due to
7005 ++ * histerical raisins.
7006 ++ */
7007 ++ if (rq->idle == push_task ||
7008 ++ kthread_is_per_cpu(push_task) ||
7009 ++ is_migration_disabled(push_task)) {
7010 ++
7011 ++ /*
7012 ++ * If this is the idle task on the outgoing CPU try to wake
7013 ++ * up the hotplug control thread which might wait for the
7014 ++ * last task to vanish. The rcuwait_active() check is
7015 ++ * accurate here because the waiter is pinned on this CPU
7016 ++ * and can't obviously be running in parallel.
7017 ++ *
7018 ++ * On RT kernels this also has to check whether there are
7019 ++ * pinned and scheduled out tasks on the runqueue. They
7020 ++ * need to leave the migrate disabled section first.
7021 ++ */
7022 ++ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
7023 ++ rcuwait_active(&rq->hotplug_wait)) {
7024 ++ raw_spin_unlock(&rq->lock);
7025 ++ rcuwait_wake_up(&rq->hotplug_wait);
7026 ++ raw_spin_lock(&rq->lock);
7027 ++ }
7028 ++ return;
7029 ++ }
7030 ++
7031 ++ get_task_struct(push_task);
7032 ++ /*
7033 ++ * Temporarily drop rq->lock such that we can wake-up the stop task.
7034 ++ * Both preemption and IRQs are still disabled.
7035 ++ */
7036 ++ raw_spin_unlock(&rq->lock);
7037 ++ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
7038 ++ this_cpu_ptr(&push_work));
7039 ++ /*
7040 ++ * At this point need_resched() is true and we'll take the loop in
7041 ++ * schedule(). The next pick is obviously going to be the stop task
7042 ++ * which kthread_is_per_cpu() and will push this task away.
7043 ++ */
7044 ++ raw_spin_lock(&rq->lock);
7045 ++}
7046 ++
7047 ++static void balance_push_set(int cpu, bool on)
7048 ++{
7049 ++ struct rq *rq = cpu_rq(cpu);
7050 ++ struct rq_flags rf;
7051 ++
7052 ++ rq_lock_irqsave(rq, &rf);
7053 ++ if (on) {
7054 ++ WARN_ON_ONCE(rq->balance_callback);
7055 ++ rq->balance_callback = &balance_push_callback;
7056 ++ } else if (rq->balance_callback == &balance_push_callback) {
7057 ++ rq->balance_callback = NULL;
7058 ++ }
7059 ++ rq_unlock_irqrestore(rq, &rf);
7060 ++}
7061 ++
7062 ++/*
7063 ++ * Invoked from a CPU's hotplug control thread after the CPU has been marked
7064 ++ * inactive. All tasks which are not per CPU kernel threads are either
7065 ++ * pushed off this CPU now via balance_push() or placed on a different CPU
7066 ++ * during wakeup. Wait until the CPU is quiescent.
7067 ++ */
7068 ++static void balance_hotplug_wait(void)
7069 ++{
7070 ++ struct rq *rq = this_rq();
7071 ++
7072 ++ rcuwait_wait_event(&rq->hotplug_wait,
7073 ++ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
7074 ++ TASK_UNINTERRUPTIBLE);
7075 ++}
7076 ++
7077 ++#else
7078 ++
7079 ++static void balance_push(struct rq *rq)
7080 ++{
7081 ++}
7082 ++
7083 ++static void balance_push_set(int cpu, bool on)
7084 ++{
7085 ++}
7086 ++
7087 ++static inline void balance_hotplug_wait(void)
7088 ++{
7089 ++}
7090 ++#endif /* CONFIG_HOTPLUG_CPU */
7091 ++
7092 ++static void set_rq_offline(struct rq *rq)
7093 ++{
7094 ++ if (rq->online)
7095 ++ rq->online = false;
7096 ++}
7097 ++
7098 ++static void set_rq_online(struct rq *rq)
7099 ++{
7100 ++ if (!rq->online)
7101 ++ rq->online = true;
7102 ++}
7103 ++
7104 ++/*
7105 ++ * used to mark begin/end of suspend/resume:
7106 ++ */
7107 ++static int num_cpus_frozen;
7108 ++
7109 ++/*
7110 ++ * Update cpusets according to cpu_active mask. If cpusets are
7111 ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7112 ++ * around partition_sched_domains().
7113 ++ *
7114 ++ * If we come here as part of a suspend/resume, don't touch cpusets because we
7115 ++ * want to restore it back to its original state upon resume anyway.
7116 ++ */
7117 ++static void cpuset_cpu_active(void)
7118 ++{
7119 ++ if (cpuhp_tasks_frozen) {
7120 ++ /*
7121 ++ * num_cpus_frozen tracks how many CPUs are involved in suspend
7122 ++ * resume sequence. As long as this is not the last online
7123 ++ * operation in the resume sequence, just build a single sched
7124 ++ * domain, ignoring cpusets.
7125 ++ */
7126 ++ partition_sched_domains(1, NULL, NULL);
7127 ++ if (--num_cpus_frozen)
7128 ++ return;
7129 ++ /*
7130 ++ * This is the last CPU online operation. So fall through and
7131 ++ * restore the original sched domains by considering the
7132 ++ * cpuset configurations.
7133 ++ */
7134 ++ cpuset_force_rebuild();
7135 ++ }
7136 ++
7137 ++ cpuset_update_active_cpus();
7138 ++}
7139 ++
7140 ++static int cpuset_cpu_inactive(unsigned int cpu)
7141 ++{
7142 ++ if (!cpuhp_tasks_frozen) {
7143 ++ cpuset_update_active_cpus();
7144 ++ } else {
7145 ++ num_cpus_frozen++;
7146 ++ partition_sched_domains(1, NULL, NULL);
7147 ++ }
7148 ++ return 0;
7149 ++}
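
Editorial note: the two helpers above lean on num_cpus_frozen so that a suspend/resume
cycle rebuilds the cpuset-derived sched domains only once, on the last CPU brought back
up. A minimal userspace sketch of that counting logic (not part of the patch; function
names are invented for illustration):

#include <stdio.h>
#include <stdbool.h>

static int num_cpus_frozen;

static void toy_cpu_inactive(bool tasks_frozen)
{
	if (tasks_frozen)
		num_cpus_frozen++;	/* suspend path: just count, keep one flat domain */
	else
		printf("runtime hotplug: rebuild cpusets now\n");
}

static void toy_cpu_active(bool tasks_frozen)
{
	if (tasks_frozen && --num_cpus_frozen)
		return;			/* not the last CPU of the resume sequence yet */
	printf("rebuild sched domains from cpusets\n");
}

int main(void)
{
	/* suspend: three CPUs go down with tasks frozen */
	for (int i = 0; i < 3; i++)
		toy_cpu_inactive(true);
	/* resume: only the last toy_cpu_active() call triggers the rebuild */
	for (int i = 0; i < 3; i++)
		toy_cpu_active(true);
	return 0;
}
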
7150 ++
7151 ++int sched_cpu_activate(unsigned int cpu)
7152 ++{
7153 ++ struct rq *rq = cpu_rq(cpu);
7154 ++ unsigned long flags;
7155 ++
7156 ++ /*
7157 ++ * Clear the balance_push callback and prepare to schedule
7158 ++ * regular tasks.
7159 ++ */
7160 ++ balance_push_set(cpu, false);
7161 ++
7162 ++#ifdef CONFIG_SCHED_SMT
7163 ++ /*
7164 ++ * When going up, increment the number of cores with SMT present.
7165 ++ */
7166 ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
7167 ++ static_branch_inc_cpuslocked(&sched_smt_present);
7168 ++#endif
7169 ++ set_cpu_active(cpu, true);
7170 ++
7171 ++ if (sched_smp_initialized)
7172 ++ cpuset_cpu_active();
7173 ++
7174 ++ /*
7175 ++ * Put the rq online, if not already. This happens:
7176 ++ *
7177 ++ * 1) In the early boot process, because we build the real domains
7178 ++ * after all cpus have been brought up.
7179 ++ *
7180 ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
7181 ++ * domains.
7182 ++ */
7183 ++ raw_spin_lock_irqsave(&rq->lock, flags);
7184 ++ set_rq_online(rq);
7185 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
7186 ++
7187 ++ return 0;
7188 ++}
7189 ++
7190 ++int sched_cpu_deactivate(unsigned int cpu)
7191 ++{
7192 ++ struct rq *rq = cpu_rq(cpu);
7193 ++ unsigned long flags;
7194 ++ int ret;
7195 ++
7196 ++ set_cpu_active(cpu, false);
7197 ++
7198 ++ /*
7199 ++ * From this point forward, this CPU will refuse to run any task that
7200 ++ * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
7201 ++ * push those tasks away until this gets cleared, see
7202 ++ * sched_cpu_dying().
7203 ++ */
7204 ++ balance_push_set(cpu, true);
7205 ++
7206 ++ /*
7207 ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
7208 ++ * users of this state to go away such that all new such users will
7209 ++ * observe it.
7210 ++ *
7211 ++ * Specifically, we rely on ttwu to no longer target this CPU, see
7212 ++ * ttwu_queue_cond() and is_cpu_allowed().
7213 ++ *
7214 ++ * Do the sync before parking smpboot threads to take care of the RCU boost case.
7215 ++ */
7216 ++ synchronize_rcu();
7217 ++
7218 ++ raw_spin_lock_irqsave(&rq->lock, flags);
7219 ++ update_rq_clock(rq);
7220 ++ set_rq_offline(rq);
7221 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
7222 ++
7223 ++#ifdef CONFIG_SCHED_SMT
7224 ++ /*
7225 ++ * When going down, decrement the number of cores with SMT present.
7226 ++ */
7227 ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
7228 ++ static_branch_dec_cpuslocked(&sched_smt_present);
7229 ++ if (!static_branch_likely(&sched_smt_present))
7230 ++ cpumask_clear(&sched_sg_idle_mask);
7231 ++ }
7232 ++#endif
7233 ++
7234 ++ if (!sched_smp_initialized)
7235 ++ return 0;
7236 ++
7237 ++ ret = cpuset_cpu_inactive(cpu);
7238 ++ if (ret) {
7239 ++ balance_push_set(cpu, false);
7240 ++ set_cpu_active(cpu, true);
7241 ++ return ret;
7242 ++ }
7243 ++
7244 ++ return 0;
7245 ++}
7246 ++
7247 ++static void sched_rq_cpu_starting(unsigned int cpu)
7248 ++{
7249 ++ struct rq *rq = cpu_rq(cpu);
7250 ++
7251 ++ rq->calc_load_update = calc_load_update;
7252 ++}
7253 ++
7254 ++int sched_cpu_starting(unsigned int cpu)
7255 ++{
7256 ++ sched_rq_cpu_starting(cpu);
7257 ++ sched_tick_start(cpu);
7258 ++ return 0;
7259 ++}
7260 ++
7261 ++#ifdef CONFIG_HOTPLUG_CPU
7262 ++
7263 ++/*
7264 ++ * Invoked immediately before the stopper thread is invoked to bring the
7265 ++ * CPU down completely. At this point all per CPU kthreads except the
7266 ++ * hotplug thread (current) and the stopper thread (inactive) have been
7267 ++ * either parked or have been unbound from the outgoing CPU. Ensure that
7268 ++ * any of those which might be on the way out are gone.
7269 ++ *
7270 ++ * If after this point a bound task is being woken on this CPU then the
7271 ++ * responsible hotplug callback has failed to do its job.
7272 ++ * sched_cpu_dying() will catch it with the appropriate fireworks.
7273 ++ */
7274 ++int sched_cpu_wait_empty(unsigned int cpu)
7275 ++{
7276 ++ balance_hotplug_wait();
7277 ++ return 0;
7278 ++}
7279 ++
7280 ++/*
7281 ++ * Since this CPU is going 'away' for a while, fold any nr_active delta we
7282 ++ * might have. Called from the CPU stopper task after ensuring that the
7283 ++ * stopper is the last running task on the CPU, so nr_active count is
7284 ++ * stable. We need to take the teardown thread which is calling this into
7285 ++ * account, so we hand in adjust = 1 to the load calculation.
7286 ++ *
7287 ++ * Also see the comment "Global load-average calculations".
7288 ++ */
7289 ++static void calc_load_migrate(struct rq *rq)
7290 ++{
7291 ++ long delta = calc_load_fold_active(rq, 1);
7292 ++
7293 ++ if (delta)
7294 ++ atomic_long_add(delta, &calc_load_tasks);
7295 ++}
7296 ++
7297 ++static void dump_rq_tasks(struct rq *rq, const char *loglvl)
7298 ++{
7299 ++ struct task_struct *g, *p;
7300 ++ int cpu = cpu_of(rq);
7301 ++
7302 ++ lockdep_assert_held(&rq->lock);
7303 ++
7304 ++ printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
7305 ++ for_each_process_thread(g, p) {
7306 ++ if (task_cpu(p) != cpu)
7307 ++ continue;
7308 ++
7309 ++ if (!task_on_rq_queued(p))
7310 ++ continue;
7311 ++
7312 ++ printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
7313 ++ }
7314 ++}
7315 ++
7316 ++int sched_cpu_dying(unsigned int cpu)
7317 ++{
7318 ++ struct rq *rq = cpu_rq(cpu);
7319 ++ unsigned long flags;
7320 ++
7321 ++ /* Handle pending wakeups and then migrate everything off */
7322 ++ sched_tick_stop(cpu);
7323 ++
7324 ++ raw_spin_lock_irqsave(&rq->lock, flags);
7325 ++ if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
7326 ++ WARN(true, "Dying CPU not properly vacated!");
7327 ++ dump_rq_tasks(rq, KERN_WARNING);
7328 ++ }
7329 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
7330 ++
7331 ++ calc_load_migrate(rq);
7332 ++ hrtick_clear(rq);
7333 ++ return 0;
7334 ++}
7335 ++#endif
7336 ++
7337 ++#ifdef CONFIG_SMP
7338 ++static void sched_init_topology_cpumask_early(void)
7339 ++{
7340 ++ int cpu;
7341 ++ cpumask_t *tmp;
7342 ++
7343 ++ for_each_possible_cpu(cpu) {
7344 ++ /* init affinity masks */
7345 ++ tmp = per_cpu(sched_cpu_affinity_masks, cpu);
7346 ++
7347 ++ cpumask_copy(tmp, cpumask_of(cpu));
7348 ++ tmp++;
7349 ++ cpumask_copy(tmp, cpu_possible_mask);
7350 ++ cpumask_clear_cpu(cpu, tmp);
7351 ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = ++tmp;
7352 ++ /* init topo masks */
7353 ++ tmp = per_cpu(sched_cpu_topo_masks, cpu);
7354 ++
7355 ++ cpumask_copy(tmp, cpumask_of(cpu));
7356 ++ tmp++;
7357 ++ cpumask_copy(tmp, cpu_possible_mask);
7358 ++ per_cpu(sched_cpu_llc_mask, cpu) = tmp;
7359 ++ /*per_cpu(sd_llc_id, cpu) = cpu;*/
7360 ++ }
7361 ++}
7362 ++
7363 ++#define TOPOLOGY_CPUMASK(name, mask, last) \
7364 ++ if (cpumask_and(chk, chk, mask)) { \
7365 ++ cpumask_copy(topo, mask); \
7366 ++ printk(KERN_INFO "sched: cpu#%02d affinity: 0x%08lx topo: 0x%08lx - "#name,\
7367 ++ cpu, (chk++)->bits[0], (topo++)->bits[0]); \
7368 ++ } \
7369 ++ if (!last) \
7370 ++ cpumask_complement(chk, mask)
7371 ++
7372 ++static void sched_init_topology_cpumask(void)
7373 ++{
7374 ++ int cpu;
7375 ++ cpumask_t *chk, *topo;
7376 ++
7377 ++ for_each_online_cpu(cpu) {
7378 ++ /* take chance to reset time slice for idle tasks */
7379 ++ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns;
7380 ++
7381 ++ chk = per_cpu(sched_cpu_affinity_masks, cpu) + 1;
7382 ++ topo = per_cpu(sched_cpu_topo_masks, cpu) + 1;
7383 ++
7384 ++ cpumask_complement(chk, cpumask_of(cpu));
7385 ++#ifdef CONFIG_SCHED_SMT
7386 ++ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false);
7387 ++#endif
7388 ++ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu));
7389 ++ per_cpu(sched_cpu_llc_mask, cpu) = topo;
7390 ++ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false);
7391 ++
7392 ++ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false);
7393 ++
7394 ++ TOPOLOGY_CPUMASK(others, cpu_online_mask, true);
7395 ++
7396 ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk;
7397 ++ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n",
7398 ++ cpu, per_cpu(sd_llc_id, cpu),
7399 ++ (int) (per_cpu(sched_cpu_llc_mask, cpu) -
7400 ++ per_cpu(sched_cpu_topo_masks, cpu)));
7401 ++ }
7402 ++}
7403 ++#endif
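
Editorial note: the per-CPU masks built above form an ordered series of "levels" (the CPU
itself, then its SMT siblings, then the rest of the core group, then everything else),
where each level only adds CPUs not already covered by a closer one. A self-contained
sketch of that construction, using a 64-bit word as a stand-in for cpumask_t and a
made-up 8-CPU topology (not part of the patch):

#include <stdio.h>
#include <stdint.h>

static uint64_t build_levels(int cpu, uint64_t smt, uint64_t core, uint64_t online,
			     uint64_t level[], int *nr)
{
	uint64_t chk = ~(1ULL << cpu) & online;	/* everything but ourselves */
	uint64_t masks[] = { smt, core, online };

	*nr = 0;
	for (int i = 0; i < 3; i++) {
		uint64_t hit = chk & masks[i];
		if (hit)
			level[(*nr)++] = hit;	/* CPUs newly reachable at this level */
		chk &= ~masks[i];		/* drop what this level already covered */
	}
	return chk;				/* 0 once the online mask has been consumed */
}

int main(void)
{
	uint64_t level[3];
	int nr;

	/* CPU 0: SMT siblings {0,1}, core/LLC {0..3}, online {0..7} */
	build_levels(0, 0x03, 0x0f, 0xff, level, &nr);
	for (int i = 0; i < nr; i++)
		printf("level %d: 0x%02llx\n", i, (unsigned long long)level[i]);
	return 0;
}

This prints 0x02, 0x0c and 0xf0: the sibling first, then the rest of the core group,
then the remote CPUs, which is the order the wakeup path walks them.
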
7404 ++
7405 ++void __init sched_init_smp(void)
7406 ++{
7407 ++ /* Move init over to a non-isolated CPU */
7408 ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
7409 ++ BUG();
7410 ++
7411 ++ sched_init_topology_cpumask();
7412 ++
7413 ++ sched_smp_initialized = true;
7414 ++}
7415 ++#else
7416 ++void __init sched_init_smp(void)
7417 ++{
7418 ++ cpu_rq(0)->idle->time_slice = sched_timeslice_ns;
7419 ++}
7420 ++#endif /* CONFIG_SMP */
7421 ++
7422 ++int in_sched_functions(unsigned long addr)
7423 ++{
7424 ++ return in_lock_functions(addr) ||
7425 ++ (addr >= (unsigned long)__sched_text_start
7426 ++ && addr < (unsigned long)__sched_text_end);
7427 ++}
7428 ++
7429 ++#ifdef CONFIG_CGROUP_SCHED
7430 ++/* task group related information */
7431 ++struct task_group {
7432 ++ struct cgroup_subsys_state css;
7433 ++
7434 ++ struct rcu_head rcu;
7435 ++ struct list_head list;
7436 ++
7437 ++ struct task_group *parent;
7438 ++ struct list_head siblings;
7439 ++ struct list_head children;
7440 ++#ifdef CONFIG_FAIR_GROUP_SCHED
7441 ++ unsigned long shares;
7442 ++#endif
7443 ++};
7444 ++
7445 ++/*
7446 ++ * Default task group.
7447 ++ * Every task in system belongs to this group at bootup.
7448 ++ */
7449 ++struct task_group root_task_group;
7450 ++LIST_HEAD(task_groups);
7451 ++
7452 ++/* Cacheline aligned slab cache for task_group */
7453 ++static struct kmem_cache *task_group_cache __read_mostly;
7454 ++#endif /* CONFIG_CGROUP_SCHED */
7455 ++
7456 ++void __init sched_init(void)
7457 ++{
7458 ++ int i;
7459 ++ struct rq *rq;
7460 ++
7461 ++ printk(KERN_INFO ALT_SCHED_VERSION_MSG);
7462 ++
7463 ++ wait_bit_init();
7464 ++
7465 ++#ifdef CONFIG_SMP
7466 ++ for (i = 0; i < SCHED_BITS; i++)
7467 ++ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask);
7468 ++#endif
7469 ++
7470 ++#ifdef CONFIG_CGROUP_SCHED
7471 ++ task_group_cache = KMEM_CACHE(task_group, 0);
7472 ++
7473 ++ list_add(&root_task_group.list, &task_groups);
7474 ++ INIT_LIST_HEAD(&root_task_group.children);
7475 ++ INIT_LIST_HEAD(&root_task_group.siblings);
7476 ++#endif /* CONFIG_CGROUP_SCHED */
7477 ++ for_each_possible_cpu(i) {
7478 ++ rq = cpu_rq(i);
7479 ++
7480 ++ sched_queue_init(&rq->queue);
7481 ++ rq->watermark = IDLE_WM;
7482 ++ rq->skip = NULL;
7483 ++
7484 ++ raw_spin_lock_init(&rq->lock);
7485 ++ rq->nr_running = rq->nr_uninterruptible = 0;
7486 ++ rq->calc_load_active = 0;
7487 ++ rq->calc_load_update = jiffies + LOAD_FREQ;
7488 ++#ifdef CONFIG_SMP
7489 ++ rq->online = false;
7490 ++ rq->cpu = i;
7491 ++
7492 ++#ifdef CONFIG_SCHED_SMT
7493 ++ rq->active_balance = 0;
7494 ++#endif
7495 ++
7496 ++#ifdef CONFIG_NO_HZ_COMMON
7497 ++ INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
7498 ++#endif
7499 ++ rq->balance_callback = &balance_push_callback;
7500 ++#ifdef CONFIG_HOTPLUG_CPU
7501 ++ rcuwait_init(&rq->hotplug_wait);
7502 ++#endif
7503 ++#endif /* CONFIG_SMP */
7504 ++ rq->nr_switches = 0;
7505 ++
7506 ++ hrtick_rq_init(rq);
7507 ++ atomic_set(&rq->nr_iowait, 0);
7508 ++ }
7509 ++#ifdef CONFIG_SMP
7510 ++ /* Set rq->online for cpu 0 */
7511 ++ cpu_rq(0)->online = true;
7512 ++#endif
7513 ++ /*
7514 ++ * The boot idle thread does lazy MMU switching as well:
7515 ++ */
7516 ++ mmgrab(&init_mm);
7517 ++ enter_lazy_tlb(&init_mm, current);
7518 ++
7519 ++ /*
7520 ++ * Make us the idle thread. Technically, schedule() should not be
7521 ++ * called from this thread, however somewhere below it might be,
7522 ++ * but because we are the idle thread, we just pick up running again
7523 ++ * when this runqueue becomes "idle".
7524 ++ */
7525 ++ init_idle(current, smp_processor_id());
7526 ++
7527 ++ calc_load_update = jiffies + LOAD_FREQ;
7528 ++
7529 ++#ifdef CONFIG_SMP
7530 ++ idle_thread_set_boot_cpu();
7531 ++ balance_push_set(smp_processor_id(), false);
7532 ++
7533 ++ sched_init_topology_cpumask_early();
7534 ++#endif /* SMP */
7535 ++
7536 ++ init_schedstats();
7537 ++
7538 ++ psi_init();
7539 ++}
7540 ++
7541 ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7542 ++static inline int preempt_count_equals(int preempt_offset)
7543 ++{
7544 ++ int nested = preempt_count() + rcu_preempt_depth();
7545 ++
7546 ++ return (nested == preempt_offset);
7547 ++}
7548 ++
7549 ++void __might_sleep(const char *file, int line, int preempt_offset)
7550 ++{
7551 ++ /*
7552 ++ * Blocking primitives will set (and therefore destroy) current->state,
7553 ++ * since we will exit with TASK_RUNNING make sure we enter with it,
7554 ++ * otherwise we will destroy state.
7555 ++ */
7556 ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
7557 ++ "do not call blocking ops when !TASK_RUNNING; "
7558 ++ "state=%lx set at [<%p>] %pS\n",
7559 ++ current->state,
7560 ++ (void *)current->task_state_change,
7561 ++ (void *)current->task_state_change);
7562 ++
7563 ++ ___might_sleep(file, line, preempt_offset);
7564 ++}
7565 ++EXPORT_SYMBOL(__might_sleep);
7566 ++
7567 ++void ___might_sleep(const char *file, int line, int preempt_offset)
7568 ++{
7569 ++ /* Ratelimiting timestamp: */
7570 ++ static unsigned long prev_jiffy;
7571 ++
7572 ++ unsigned long preempt_disable_ip;
7573 ++
7574 ++ /* WARN_ON_ONCE() by default, no rate limit required: */
7575 ++ rcu_sleep_check();
7576 ++
7577 ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
7578 ++ !is_idle_task(current) && !current->non_block_count) ||
7579 ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
7580 ++ oops_in_progress)
7581 ++ return;
7582 ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7583 ++ return;
7584 ++ prev_jiffy = jiffies;
7585 ++
7586 ++ /* Save this before calling printk(), since that will clobber it: */
7587 ++ preempt_disable_ip = get_preempt_disable_ip(current);
7588 ++
7589 ++ printk(KERN_ERR
7590 ++ "BUG: sleeping function called from invalid context at %s:%d\n",
7591 ++ file, line);
7592 ++ printk(KERN_ERR
7593 ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
7594 ++ in_atomic(), irqs_disabled(), current->non_block_count,
7595 ++ current->pid, current->comm);
7596 ++
7597 ++ if (task_stack_end_corrupted(current))
7598 ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
7599 ++
7600 ++ debug_show_held_locks(current);
7601 ++ if (irqs_disabled())
7602 ++ print_irqtrace_events(current);
7603 ++#ifdef CONFIG_DEBUG_PREEMPT
7604 ++ if (!preempt_count_equals(preempt_offset)) {
7605 ++ pr_err("Preemption disabled at:");
7606 ++ print_ip_sym(KERN_ERR, preempt_disable_ip);
7607 ++ }
7608 ++#endif
7609 ++ dump_stack();
7610 ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7611 ++}
7612 ++EXPORT_SYMBOL(___might_sleep);
7613 ++
7614 ++void __cant_sleep(const char *file, int line, int preempt_offset)
7615 ++{
7616 ++ static unsigned long prev_jiffy;
7617 ++
7618 ++ if (irqs_disabled())
7619 ++ return;
7620 ++
7621 ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
7622 ++ return;
7623 ++
7624 ++ if (preempt_count() > preempt_offset)
7625 ++ return;
7626 ++
7627 ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7628 ++ return;
7629 ++ prev_jiffy = jiffies;
7630 ++
7631 ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
7632 ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7633 ++ in_atomic(), irqs_disabled(),
7634 ++ current->pid, current->comm);
7635 ++
7636 ++ debug_show_held_locks(current);
7637 ++ dump_stack();
7638 ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7639 ++}
7640 ++EXPORT_SYMBOL_GPL(__cant_sleep);
7641 ++
7642 ++#ifdef CONFIG_SMP
7643 ++void __cant_migrate(const char *file, int line)
7644 ++{
7645 ++ static unsigned long prev_jiffy;
7646 ++
7647 ++ if (irqs_disabled())
7648 ++ return;
7649 ++
7650 ++ if (is_migration_disabled(current))
7651 ++ return;
7652 ++
7653 ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
7654 ++ return;
7655 ++
7656 ++ if (preempt_count() > 0)
7657 ++ return;
7658 ++
7659 ++ if (current->migration_flags & MDF_FORCE_ENABLED)
7660 ++ return;
7661 ++
7662 ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7663 ++ return;
7664 ++ prev_jiffy = jiffies;
7665 ++
7666 ++ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
7667 ++ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
7668 ++ in_atomic(), irqs_disabled(), is_migration_disabled(current),
7669 ++ current->pid, current->comm);
7670 ++
7671 ++ debug_show_held_locks(current);
7672 ++ dump_stack();
7673 ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7674 ++}
7675 ++EXPORT_SYMBOL_GPL(__cant_migrate);
7676 ++#endif
7677 ++#endif
7678 ++
7679 ++#ifdef CONFIG_MAGIC_SYSRQ
7680 ++void normalize_rt_tasks(void)
7681 ++{
7682 ++ struct task_struct *g, *p;
7683 ++ struct sched_attr attr = {
7684 ++ .sched_policy = SCHED_NORMAL,
7685 ++ };
7686 ++
7687 ++ read_lock(&tasklist_lock);
7688 ++ for_each_process_thread(g, p) {
7689 ++ /*
7690 ++ * Only normalize user tasks:
7691 ++ */
7692 ++ if (p->flags & PF_KTHREAD)
7693 ++ continue;
7694 ++
7695 ++ if (!rt_task(p)) {
7696 ++ /*
7697 ++ * Renice negative nice level userspace
7698 ++ * tasks back to 0:
7699 ++ */
7700 ++ if (task_nice(p) < 0)
7701 ++ set_user_nice(p, 0);
7702 ++ continue;
7703 ++ }
7704 ++
7705 ++ __sched_setscheduler(p, &attr, false, false);
7706 ++ }
7707 ++ read_unlock(&tasklist_lock);
7708 ++}
7709 ++#endif /* CONFIG_MAGIC_SYSRQ */
7710 ++
7711 ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7712 ++/*
7713 ++ * These functions are only useful for the IA64 MCA handling, or kdb.
7714 ++ *
7715 ++ * They can only be called when the whole system has been
7716 ++ * stopped - every CPU needs to be quiescent, and no scheduling
7717 ++ * activity can take place. Using them for anything else would
7718 ++ * be a serious bug, and as a result, they aren't even visible
7719 ++ * under any other configuration.
7720 ++ */
7721 ++
7722 ++/**
7723 ++ * curr_task - return the current task for a given CPU.
7724 ++ * @cpu: the processor in question.
7725 ++ *
7726 ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7727 ++ *
7728 ++ * Return: The current task for @cpu.
7729 ++ */
7730 ++struct task_struct *curr_task(int cpu)
7731 ++{
7732 ++ return cpu_curr(cpu);
7733 ++}
7734 ++
7735 ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7736 ++
7737 ++#ifdef CONFIG_IA64
7738 ++/**
7739 ++ * ia64_set_curr_task - set the current task for a given CPU.
7740 ++ * @cpu: the processor in question.
7741 ++ * @p: the task pointer to set.
7742 ++ *
7743 ++ * Description: This function must only be used when non-maskable interrupts
7744 ++ * are serviced on a separate stack. It allows the architecture to switch the
7745 ++ * notion of the current task on a CPU in a non-blocking manner. This function
7746 ++ * must be called with all CPUs synchronised and interrupts disabled; the
7747 ++ * caller must save the original value of the current task (see
7748 ++ * curr_task() above) and restore that value before reenabling interrupts and
7749 ++ * re-starting the system.
7750 ++ *
7751 ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7752 ++ */
7753 ++void ia64_set_curr_task(int cpu, struct task_struct *p)
7754 ++{
7755 ++ cpu_curr(cpu) = p;
7756 ++}
7757 ++
7758 ++#endif
7759 ++
7760 ++#ifdef CONFIG_CGROUP_SCHED
7761 ++static void sched_free_group(struct task_group *tg)
7762 ++{
7763 ++ kmem_cache_free(task_group_cache, tg);
7764 ++}
7765 ++
7766 ++/* allocate runqueue etc for a new task group */
7767 ++struct task_group *sched_create_group(struct task_group *parent)
7768 ++{
7769 ++ struct task_group *tg;
7770 ++
7771 ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
7772 ++ if (!tg)
7773 ++ return ERR_PTR(-ENOMEM);
7774 ++
7775 ++ return tg;
7776 ++}
7777 ++
7778 ++void sched_online_group(struct task_group *tg, struct task_group *parent)
7779 ++{
7780 ++}
7781 ++
7782 ++/* rcu callback to free various structures associated with a task group */
7783 ++static void sched_free_group_rcu(struct rcu_head *rhp)
7784 ++{
7785 ++ /* Now it should be safe to free those cfs_rqs */
7786 ++ sched_free_group(container_of(rhp, struct task_group, rcu));
7787 ++}
7788 ++
7789 ++void sched_destroy_group(struct task_group *tg)
7790 ++{
7791 ++ /* Wait for possible concurrent references to cfs_rqs to complete */
7792 ++ call_rcu(&tg->rcu, sched_free_group_rcu);
7793 ++}
7794 ++
7795 ++void sched_offline_group(struct task_group *tg)
7796 ++{
7797 ++}
7798 ++
7799 ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7800 ++{
7801 ++ return css ? container_of(css, struct task_group, css) : NULL;
7802 ++}
7803 ++
7804 ++static struct cgroup_subsys_state *
7805 ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7806 ++{
7807 ++ struct task_group *parent = css_tg(parent_css);
7808 ++ struct task_group *tg;
7809 ++
7810 ++ if (!parent) {
7811 ++ /* This is early initialization for the top cgroup */
7812 ++ return &root_task_group.css;
7813 ++ }
7814 ++
7815 ++ tg = sched_create_group(parent);
7816 ++ if (IS_ERR(tg))
7817 ++ return ERR_PTR(-ENOMEM);
7818 ++ return &tg->css;
7819 ++}
7820 ++
7821 ++/* Expose task group only after completing cgroup initialization */
7822 ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7823 ++{
7824 ++ struct task_group *tg = css_tg(css);
7825 ++ struct task_group *parent = css_tg(css->parent);
7826 ++
7827 ++ if (parent)
7828 ++ sched_online_group(tg, parent);
7829 ++ return 0;
7830 ++}
7831 ++
7832 ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
7833 ++{
7834 ++ struct task_group *tg = css_tg(css);
7835 ++
7836 ++ sched_offline_group(tg);
7837 ++}
7838 ++
7839 ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7840 ++{
7841 ++ struct task_group *tg = css_tg(css);
7842 ++
7843 ++ /*
7844 ++ * Relies on the RCU grace period between css_released() and this.
7845 ++ */
7846 ++ sched_free_group(tg);
7847 ++}
7848 ++
7849 ++static void cpu_cgroup_fork(struct task_struct *task)
7850 ++{
7851 ++}
7852 ++
7853 ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
7854 ++{
7855 ++ return 0;
7856 ++}
7857 ++
7858 ++static void cpu_cgroup_attach(struct cgroup_taskset *tset)
7859 ++{
7860 ++}
7861 ++
7862 ++#ifdef CONFIG_FAIR_GROUP_SCHED
7863 ++static DEFINE_MUTEX(shares_mutex);
7864 ++
7865 ++int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7866 ++{
7867 ++ /*
7868 ++ * We can't change the weight of the root cgroup.
7869 ++ */
7870 ++ if (&root_task_group == tg)
7871 ++ return -EINVAL;
7872 ++
7873 ++ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
7874 ++
7875 ++ mutex_lock(&shares_mutex);
7876 ++ if (tg->shares == shares)
7877 ++ goto done;
7878 ++
7879 ++ tg->shares = shares;
7880 ++done:
7881 ++ mutex_unlock(&shares_mutex);
7882 ++ return 0;
7883 ++}
7884 ++
7885 ++static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7886 ++ struct cftype *cftype, u64 shareval)
7887 ++{
7888 ++ if (shareval > scale_load_down(ULONG_MAX))
7889 ++ shareval = MAX_SHARES;
7890 ++ return sched_group_set_shares(css_tg(css), scale_load(shareval));
7891 ++}
7892 ++
7893 ++static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7894 ++ struct cftype *cft)
7895 ++{
7896 ++ struct task_group *tg = css_tg(css);
7897 ++
7898 ++ return (u64) scale_load_down(tg->shares);
7899 ++}
7900 ++#endif
7901 ++
7902 ++static struct cftype cpu_legacy_files[] = {
7903 ++#ifdef CONFIG_FAIR_GROUP_SCHED
7904 ++ {
7905 ++ .name = "shares",
7906 ++ .read_u64 = cpu_shares_read_u64,
7907 ++ .write_u64 = cpu_shares_write_u64,
7908 ++ },
7909 ++#endif
7910 ++ { } /* Terminate */
7911 ++};
7912 ++
7913 ++
7914 ++static struct cftype cpu_files[] = {
7915 ++ { } /* terminate */
7916 ++};
7917 ++
7918 ++static int cpu_extra_stat_show(struct seq_file *sf,
7919 ++ struct cgroup_subsys_state *css)
7920 ++{
7921 ++ return 0;
7922 ++}
7923 ++
7924 ++struct cgroup_subsys cpu_cgrp_subsys = {
7925 ++ .css_alloc = cpu_cgroup_css_alloc,
7926 ++ .css_online = cpu_cgroup_css_online,
7927 ++ .css_released = cpu_cgroup_css_released,
7928 ++ .css_free = cpu_cgroup_css_free,
7929 ++ .css_extra_stat_show = cpu_extra_stat_show,
7930 ++ .fork = cpu_cgroup_fork,
7931 ++ .can_attach = cpu_cgroup_can_attach,
7932 ++ .attach = cpu_cgroup_attach,
7933 ++	/* cgroup v1 gets the legacy "cpu.shares" file set; v2 uses cpu_files below */
7934 ++	.legacy_cftypes	= cpu_legacy_files,
7935 ++ .dfl_cftypes = cpu_files,
7936 ++ .early_init = true,
7937 ++ .threaded = true,
7938 ++};
7939 ++#endif /* CONFIG_CGROUP_SCHED */
7940 ++
7941 ++#undef CREATE_TRACE_POINTS
7942 +diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c
7943 +new file mode 100644
7944 +index 000000000000..1212a031700e
7945 +--- /dev/null
7946 ++++ b/kernel/sched/alt_debug.c
7947 +@@ -0,0 +1,31 @@
7948 ++/*
7949 ++ * kernel/sched/alt_debug.c
7950 ++ *
7951 ++ * Print the alt scheduler debugging details
7952 ++ *
7953 ++ * Author: Alfred Chen
7954 ++ * Date : 2020
7955 ++ */
7956 ++#include "sched.h"
7957 ++
7958 ++/*
7959 ++ * This allows printing both to /proc/sched_debug and
7960 ++ * to the console
7961 ++ */
7962 ++#define SEQ_printf(m, x...) \
7963 ++ do { \
7964 ++ if (m) \
7965 ++ seq_printf(m, x); \
7966 ++ else \
7967 ++ pr_cont(x); \
7968 ++ } while (0)
7969 ++
7970 ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
7971 ++ struct seq_file *m)
7972 ++{
7973 ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
7974 ++ get_nr_threads(p));
7975 ++}
7976 ++
7977 ++void proc_sched_set_task(struct task_struct *p)
7978 ++{}
7979 +diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h
7980 +new file mode 100644
7981 +index 000000000000..f9f79422bf0e
7982 +--- /dev/null
7983 ++++ b/kernel/sched/alt_sched.h
7984 +@@ -0,0 +1,710 @@
7985 ++#ifndef ALT_SCHED_H
7986 ++#define ALT_SCHED_H
7987 ++
7988 ++#include <linux/sched.h>
7989 ++
7990 ++#include <linux/sched/clock.h>
7991 ++#include <linux/sched/cpufreq.h>
7992 ++#include <linux/sched/cputime.h>
7993 ++#include <linux/sched/debug.h>
7994 ++#include <linux/sched/init.h>
7995 ++#include <linux/sched/isolation.h>
7996 ++#include <linux/sched/loadavg.h>
7997 ++#include <linux/sched/mm.h>
7998 ++#include <linux/sched/nohz.h>
7999 ++#include <linux/sched/signal.h>
8000 ++#include <linux/sched/stat.h>
8001 ++#include <linux/sched/sysctl.h>
8002 ++#include <linux/sched/task.h>
8003 ++#include <linux/sched/topology.h>
8004 ++#include <linux/sched/wake_q.h>
8005 ++
8006 ++#include <uapi/linux/sched/types.h>
8007 ++
8008 ++#include <linux/cgroup.h>
8009 ++#include <linux/cpufreq.h>
8010 ++#include <linux/cpuidle.h>
8011 ++#include <linux/cpuset.h>
8012 ++#include <linux/ctype.h>
8013 ++#include <linux/debugfs.h>
8014 ++#include <linux/kthread.h>
8015 ++#include <linux/livepatch.h>
8016 ++#include <linux/membarrier.h>
8017 ++#include <linux/proc_fs.h>
8018 ++#include <linux/psi.h>
8019 ++#include <linux/slab.h>
8020 ++#include <linux/stop_machine.h>
8021 ++#include <linux/suspend.h>
8022 ++#include <linux/swait.h>
8023 ++#include <linux/syscalls.h>
8024 ++#include <linux/tsacct_kern.h>
8025 ++
8026 ++#include <asm/tlb.h>
8027 ++
8028 ++#ifdef CONFIG_PARAVIRT
8029 ++# include <asm/paravirt.h>
8030 ++#endif
8031 ++
8032 ++#include "cpupri.h"
8033 ++
8034 ++#include <trace/events/sched.h>
8035 ++
8036 ++#ifdef CONFIG_SCHED_BMQ
8037 ++/* bits:
8038 ++ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */
8039 ++#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH / 2 + MAX_PRIORITY_ADJ + 1)
8040 ++#endif
8041 ++
8042 ++#ifdef CONFIG_SCHED_PDS
8043 ++/* bits: RT(0-99), reserved(100-127), NORMAL_PRIO_NUM, cpu idle task */
8044 ++#define SCHED_BITS (MIN_NORMAL_PRIO + NORMAL_PRIO_NUM + 1)
8045 ++#endif /* CONFIG_SCHED_PDS */
8046 ++
8047 ++#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1)
8048 ++
8049 ++#ifdef CONFIG_SCHED_DEBUG
8050 ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
8051 ++extern void resched_latency_warn(int cpu, u64 latency);
8052 ++#else
8053 ++# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
8054 ++static inline void resched_latency_warn(int cpu, u64 latency) {}
8055 ++#endif
8056 ++
8057 ++/*
8058 ++ * Increase resolution of nice-level calculations for 64-bit architectures.
8059 ++ * The extra resolution improves shares distribution and load balancing of
8060 ++ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
8061 ++ * hierarchies, especially on larger systems. This is not a user-visible change
8062 ++ * and does not change the user-interface for setting shares/weights.
8063 ++ *
8064 ++ * We increase resolution only if we have enough bits to allow this increased
8065 ++ * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
8066 ++ * are pretty high and the returns do not justify the increased costs.
8067 ++ *
8068 ++ * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
8069 ++ * increase coverage and consistency always enable it on 64-bit platforms.
8070 ++ */
8071 ++#ifdef CONFIG_64BIT
8072 ++# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
8073 ++# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
8074 ++# define scale_load_down(w) \
8075 ++({ \
8076 ++ unsigned long __w = (w); \
8077 ++ if (__w) \
8078 ++ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
8079 ++ __w; \
8080 ++})
8081 ++#else
8082 ++# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
8083 ++# define scale_load(w) (w)
8084 ++# define scale_load_down(w) (w)
8085 ++#endif
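
Editorial note: these macros are plain fixed-point scaling. On 64-bit, weights carry
SCHED_FIXEDPOINT_SHIFT extra bits of precision and scale_load_down() clamps the result
so a live weight never collapses to 0 or 1. A small standalone illustration, assuming
mainline's SCHED_FIXEDPOINT_SHIFT of 10 (not part of the patch):

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT 10		/* mainline value */

static unsigned long scale_load(unsigned long w)
{
	return w << SCHED_FIXEDPOINT_SHIFT;	/* add 10 bits of precision */
}

static unsigned long scale_load_down(unsigned long w)
{
	if (!w)
		return 0;
	w >>= SCHED_FIXEDPOINT_SHIFT;
	return w < 2 ? 2 : w;			/* never let a non-zero weight reach 0 or 1 */
}

int main(void)
{
	unsigned long shares = 1024;		/* default cgroup cpu.shares */

	printf("up:   %lu\n", scale_load(shares));			/* 1048576 */
	printf("down: %lu\n", scale_load_down(scale_load(shares)));	/* back to 1024 */
	printf("tiny: %lu\n", scale_load_down(1));			/* clamped to 2 */
	return 0;
}
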
8086 ++
8087 ++#ifdef CONFIG_FAIR_GROUP_SCHED
8088 ++#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
8089 ++
8090 ++/*
8091 ++ * A weight of 0 or 1 can cause arithmetic problems.
8092 ++ * The weight of a cfs_rq is the sum of the weights of the entities
8093 ++ * queued on it, so the weight of an entity should not be too large;
8094 ++ * the same goes for the shares value of a task group.
8095 ++ * (The default weight is 1024 - so there's no practical
8096 ++ * limitation from this.)
8097 ++ */
8098 ++#define MIN_SHARES (1UL << 1)
8099 ++#define MAX_SHARES (1UL << 18)
8100 ++#endif
8101 ++
8102 ++/* task_struct::on_rq states: */
8103 ++#define TASK_ON_RQ_QUEUED 1
8104 ++#define TASK_ON_RQ_MIGRATING 2
8105 ++
8106 ++static inline int task_on_rq_queued(struct task_struct *p)
8107 ++{
8108 ++ return p->on_rq == TASK_ON_RQ_QUEUED;
8109 ++}
8110 ++
8111 ++static inline int task_on_rq_migrating(struct task_struct *p)
8112 ++{
8113 ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
8114 ++}
8115 ++
8116 ++/*
8117 ++ * wake flags
8118 ++ */
8119 ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
8120 ++#define WF_FORK 0x02 /* child wakeup after fork */
8121 ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */
8122 ++#define WF_ON_CPU 0x08 /* Wakee is on_rq */
8123 ++
8124 ++#define SCHED_QUEUE_BITS (SCHED_BITS - 1)
8125 ++
8126 ++struct sched_queue {
8127 ++ DECLARE_BITMAP(bitmap, SCHED_QUEUE_BITS);
8128 ++ struct list_head heads[SCHED_BITS];
8129 ++};
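
Editorial note: sched_queue is the heart of the "BitMap Queue" idea: one FIFO list per
priority level plus a bitmap recording which lists are non-empty, so picking the next
task is a find-first-bit followed by taking the head of that list. A toy single-word
version of the lookup (types and sizes simplified; not part of the patch):

#include <stdio.h>
#include <stdint.h>

#define NR_PRIOS 64				/* stand-in for SCHED_QUEUE_BITS */

struct toy_queue {
	uint64_t bitmap;			/* bit p set => head[p] holds a task */
	int head[NR_PRIOS];			/* fake "list head": a task id */
};

static void toy_enqueue(struct toy_queue *q, int prio, int task)
{
	q->head[prio] = task;
	q->bitmap |= 1ULL << prio;
}

static int toy_pick_next(const struct toy_queue *q)
{
	if (!q->bitmap)
		return -1;			/* nothing queued: run the idle task */
	int prio = __builtin_ctzll(q->bitmap);	/* lowest index = highest priority */
	return q->head[prio];
}

int main(void)
{
	struct toy_queue q = { 0 };

	toy_enqueue(&q, 20, 101);		/* a normal task */
	toy_enqueue(&q, 5, 202);		/* an RT task at a lower index */
	printf("next task: %d\n", toy_pick_next(&q));	/* 202 wins */
	return 0;
}
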
8130 ++
8131 ++/*
8132 ++ * This is the main, per-CPU runqueue data structure.
8133 ++ * This data should only be modified by the local cpu.
8134 ++ */
8135 ++struct rq {
8136 ++ /* runqueue lock: */
8137 ++ raw_spinlock_t lock;
8138 ++
8139 ++ struct task_struct __rcu *curr;
8140 ++ struct task_struct *idle, *stop, *skip;
8141 ++ struct mm_struct *prev_mm;
8142 ++
8143 ++ struct sched_queue queue;
8144 ++#ifdef CONFIG_SCHED_PDS
8145 ++ u64 time_edge;
8146 ++#endif
8147 ++ unsigned long watermark;
8148 ++
8149 ++ /* switch count */
8150 ++ u64 nr_switches;
8151 ++
8152 ++ atomic_t nr_iowait;
8153 ++
8154 ++#ifdef CONFIG_SCHED_DEBUG
8155 ++ u64 last_seen_need_resched_ns;
8156 ++ int ticks_without_resched;
8157 ++#endif
8158 ++
8159 ++#ifdef CONFIG_MEMBARRIER
8160 ++ int membarrier_state;
8161 ++#endif
8162 ++
8163 ++#ifdef CONFIG_SMP
8164 ++ int cpu; /* cpu of this runqueue */
8165 ++ bool online;
8166 ++
8167 ++ unsigned int ttwu_pending;
8168 ++ unsigned char nohz_idle_balance;
8169 ++ unsigned char idle_balance;
8170 ++
8171 ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
8172 ++ struct sched_avg avg_irq;
8173 ++#endif
8174 ++
8175 ++#ifdef CONFIG_SCHED_SMT
8176 ++ int active_balance;
8177 ++ struct cpu_stop_work active_balance_work;
8178 ++#endif
8179 ++ struct callback_head *balance_callback;
8180 ++#ifdef CONFIG_HOTPLUG_CPU
8181 ++ struct rcuwait hotplug_wait;
8182 ++#endif
8183 ++ unsigned int nr_pinned;
8184 ++#endif /* CONFIG_SMP */
8185 ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING
8186 ++ u64 prev_irq_time;
8187 ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
8188 ++#ifdef CONFIG_PARAVIRT
8189 ++ u64 prev_steal_time;
8190 ++#endif /* CONFIG_PARAVIRT */
8191 ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
8192 ++ u64 prev_steal_time_rq;
8193 ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
8194 ++
8195 ++ /* calc_load related fields */
8196 ++ unsigned long calc_load_update;
8197 ++ long calc_load_active;
8198 ++
8199 ++ u64 clock, last_tick;
8200 ++ u64 last_ts_switch;
8201 ++ u64 clock_task;
8202 ++
8203 ++ unsigned int nr_running;
8204 ++ unsigned long nr_uninterruptible;
8205 ++
8206 ++#ifdef CONFIG_SCHED_HRTICK
8207 ++#ifdef CONFIG_SMP
8208 ++ call_single_data_t hrtick_csd;
8209 ++#endif
8210 ++ struct hrtimer hrtick_timer;
8211 ++ ktime_t hrtick_time;
8212 ++#endif
8213 ++
8214 ++#ifdef CONFIG_SCHEDSTATS
8215 ++
8216 ++ /* latency stats */
8217 ++ struct sched_info rq_sched_info;
8218 ++ unsigned long long rq_cpu_time;
8219 ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
8220 ++
8221 ++ /* sys_sched_yield() stats */
8222 ++ unsigned int yld_count;
8223 ++
8224 ++ /* schedule() stats */
8225 ++ unsigned int sched_switch;
8226 ++ unsigned int sched_count;
8227 ++ unsigned int sched_goidle;
8228 ++
8229 ++ /* try_to_wake_up() stats */
8230 ++ unsigned int ttwu_count;
8231 ++ unsigned int ttwu_local;
8232 ++#endif /* CONFIG_SCHEDSTATS */
8233 ++
8234 ++#ifdef CONFIG_CPU_IDLE
8235 ++ /* Must be inspected within a rcu lock section */
8236 ++ struct cpuidle_state *idle_state;
8237 ++#endif
8238 ++
8239 ++#ifdef CONFIG_NO_HZ_COMMON
8240 ++#ifdef CONFIG_SMP
8241 ++ call_single_data_t nohz_csd;
8242 ++#endif
8243 ++ atomic_t nohz_flags;
8244 ++#endif /* CONFIG_NO_HZ_COMMON */
8245 ++};
8246 ++
8247 ++extern unsigned long calc_load_update;
8248 ++extern atomic_long_t calc_load_tasks;
8249 ++
8250 ++extern void calc_global_load_tick(struct rq *this_rq);
8251 ++extern long calc_load_fold_active(struct rq *this_rq, long adjust);
8252 ++
8253 ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
8254 ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
8255 ++#define this_rq() this_cpu_ptr(&runqueues)
8256 ++#define task_rq(p) cpu_rq(task_cpu(p))
8257 ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
8258 ++#define raw_rq() raw_cpu_ptr(&runqueues)
8259 ++
8260 ++#ifdef CONFIG_SMP
8261 ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
8262 ++void register_sched_domain_sysctl(void);
8263 ++void unregister_sched_domain_sysctl(void);
8264 ++#else
8265 ++static inline void register_sched_domain_sysctl(void)
8266 ++{
8267 ++}
8268 ++static inline void unregister_sched_domain_sysctl(void)
8269 ++{
8270 ++}
8271 ++#endif
8272 ++
8273 ++extern bool sched_smp_initialized;
8274 ++
8275 ++enum {
8276 ++ ITSELF_LEVEL_SPACE_HOLDER,
8277 ++#ifdef CONFIG_SCHED_SMT
8278 ++ SMT_LEVEL_SPACE_HOLDER,
8279 ++#endif
8280 ++ COREGROUP_LEVEL_SPACE_HOLDER,
8281 ++ CORE_LEVEL_SPACE_HOLDER,
8282 ++ OTHER_LEVEL_SPACE_HOLDER,
8283 ++ NR_CPU_AFFINITY_LEVELS
8284 ++};
8285 ++
8286 ++DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_topo_masks);
8287 ++DECLARE_PER_CPU(cpumask_t *, sched_cpu_llc_mask);
8288 ++
8289 ++static inline int __best_mask_cpu(int cpu, const cpumask_t *cpumask,
8290 ++ const cpumask_t *mask)
8291 ++{
8292 ++#if NR_CPUS <= 64
8293 ++ unsigned long t;
8294 ++
8295 ++ while ((t = cpumask->bits[0] & mask->bits[0]) == 0UL)
8296 ++ mask++;
8297 ++
8298 ++ return __ffs(t);
8299 ++#else
8300 ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids)
8301 ++ mask++;
8302 ++ return cpu;
8303 ++#endif
8304 ++}
8305 ++
8306 ++static inline int best_mask_cpu(int cpu, const cpumask_t *mask)
8307 ++{
8308 ++#if NR_CPUS <= 64
8309 ++ unsigned long llc_match;
8310 ++ cpumask_t *chk = per_cpu(sched_cpu_llc_mask, cpu);
8311 ++
8312 ++ if ((llc_match = mask->bits[0] & chk->bits[0])) {
8313 ++ unsigned long match;
8314 ++
8315 ++ chk = per_cpu(sched_cpu_topo_masks, cpu);
8316 ++ if (mask->bits[0] & chk->bits[0])
8317 ++ return cpu;
8318 ++
8319 ++#ifdef CONFIG_SCHED_SMT
8320 ++ chk++;
8321 ++ if ((match = mask->bits[0] & chk->bits[0]))
8322 ++ return __ffs(match);
8323 ++#endif
8324 ++
8325 ++ return __ffs(llc_match);
8326 ++ }
8327 ++
8328 ++ return __best_mask_cpu(cpu, mask, chk + 1);
8329 ++#else
8330 ++ cpumask_t llc_match;
8331 ++ cpumask_t *chk = per_cpu(sched_cpu_llc_mask, cpu);
8332 ++
8333 ++ if (cpumask_and(&llc_match, mask, chk)) {
8334 ++ cpumask_t tmp;
8335 ++
8336 ++ chk = per_cpu(sched_cpu_topo_masks, cpu);
8337 ++ if (cpumask_test_cpu(cpu, mask))
8338 ++ return cpu;
8339 ++
8340 ++#ifdef CONFIG_SCHED_SMT
8341 ++ chk++;
8342 ++ if (cpumask_and(&tmp, mask, chk))
8343 ++ return cpumask_any(&tmp);
8344 ++#endif
8345 ++
8346 ++ return cpumask_any(&llc_match);
8347 ++ }
8348 ++
8349 ++ return __best_mask_cpu(cpu, mask, chk + 1);
8350 ++#endif
8351 ++}
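
Editorial note: both variants of best_mask_cpu() above do the same thing - walk the
per-CPU level masks from nearest to farthest and return a CPU from the first level that
intersects the allowed mask. A compact sketch of the NR_CPUS <= 64 fast path, with an
invented topology and the assumption that the last level always covers the allowed mask
(not part of the patch):

#include <stdio.h>
#include <stdint.h>

static int best_mask_cpu64(uint64_t allowed, const uint64_t *level)
{
	uint64_t hit;

	while ((hit = allowed & *level) == 0)
		level++;			/* fall through to the next, more distant level */
	return __builtin_ctzll(hit);		/* lowest set bit = lowest matching CPU id */
}

int main(void)
{
	/* levels for CPU 0: itself, SMT sibling, same LLC, everything else */
	uint64_t level[] = { 0x01, 0x02, 0x0c, 0xf0 };

	printf("%d\n", best_mask_cpu64(0xf0, level));	/* only far CPUs allowed -> 4 */
	printf("%d\n", best_mask_cpu64(0x0a, level));	/* sibling allowed -> 1 */
	return 0;
}
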
8352 ++
8353 ++extern void flush_smp_call_function_from_idle(void);
8354 ++
8355 ++#else /* !CONFIG_SMP */
8356 ++static inline void flush_smp_call_function_from_idle(void) { }
8357 ++#endif
8358 ++
8359 ++#ifndef arch_scale_freq_tick
8360 ++static __always_inline
8361 ++void arch_scale_freq_tick(void)
8362 ++{
8363 ++}
8364 ++#endif
8365 ++
8366 ++#ifndef arch_scale_freq_capacity
8367 ++static __always_inline
8368 ++unsigned long arch_scale_freq_capacity(int cpu)
8369 ++{
8370 ++ return SCHED_CAPACITY_SCALE;
8371 ++}
8372 ++#endif
8373 ++
8374 ++static inline u64 __rq_clock_broken(struct rq *rq)
8375 ++{
8376 ++ return READ_ONCE(rq->clock);
8377 ++}
8378 ++
8379 ++static inline u64 rq_clock(struct rq *rq)
8380 ++{
8381 ++ /*
8382 ++ * Relax lockdep_assert_held() checking as in VRQ, since a call to
8383 ++ * sched_info_xxxx() may not hold rq->lock:
8384 ++ * lockdep_assert_held(&rq->lock);
8385 ++ */
8386 ++ return rq->clock;
8387 ++}
8388 ++
8389 ++static inline u64 rq_clock_task(struct rq *rq)
8390 ++{
8391 ++ /*
8392 ++ * Relax lockdep_assert_held() checking as in VRQ, call to
8393 ++ * sched_info_xxxx() may not held rq->lock
8394 ++ * lockdep_assert_held(&rq->lock);
8395 ++ */
8396 ++ return rq->clock_task;
8397 ++}
8398 ++
8399 ++/*
8400 ++ * {de,en}queue flags:
8401 ++ *
8402 ++ * DEQUEUE_SLEEP - task is no longer runnable
8403 ++ * ENQUEUE_WAKEUP - task just became runnable
8404 ++ *
8405 ++ */
8406 ++
8407 ++#define DEQUEUE_SLEEP 0x01
8408 ++
8409 ++#define ENQUEUE_WAKEUP 0x01
8410 ++
8411 ++
8412 ++/*
8413 ++ * Below are scheduler APIs which are used in other kernel code.
8414 ++ * They use a dummy rq_flags.
8415 ++ * TODO: BMQ needs to support these APIs for compatibility with mainline
8416 ++ * scheduler code.
8417 ++ */
8418 ++struct rq_flags {
8419 ++ unsigned long flags;
8420 ++};
8421 ++
8422 ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
8423 ++ __acquires(rq->lock);
8424 ++
8425 ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
8426 ++ __acquires(p->pi_lock)
8427 ++ __acquires(rq->lock);
8428 ++
8429 ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
8430 ++ __releases(rq->lock)
8431 ++{
8432 ++ raw_spin_unlock(&rq->lock);
8433 ++}
8434 ++
8435 ++static inline void
8436 ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
8437 ++ __releases(rq->lock)
8438 ++ __releases(p->pi_lock)
8439 ++{
8440 ++ raw_spin_unlock(&rq->lock);
8441 ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
8442 ++}
8443 ++
8444 ++static inline void
8445 ++rq_lock(struct rq *rq, struct rq_flags *rf)
8446 ++ __acquires(rq->lock)
8447 ++{
8448 ++ raw_spin_lock(&rq->lock);
8449 ++}
8450 ++
8451 ++static inline void
8452 ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
8453 ++ __releases(rq->lock)
8454 ++{
8455 ++ raw_spin_unlock_irq(&rq->lock);
8456 ++}
8457 ++
8458 ++static inline void
8459 ++rq_unlock(struct rq *rq, struct rq_flags *rf)
8460 ++ __releases(rq->lock)
8461 ++{
8462 ++ raw_spin_unlock(&rq->lock);
8463 ++}
8464 ++
8465 ++static inline struct rq *
8466 ++this_rq_lock_irq(struct rq_flags *rf)
8467 ++ __acquires(rq->lock)
8468 ++{
8469 ++ struct rq *rq;
8470 ++
8471 ++ local_irq_disable();
8472 ++ rq = this_rq();
8473 ++ raw_spin_lock(&rq->lock);
8474 ++
8475 ++ return rq;
8476 ++}
8477 ++
8478 ++static inline int task_current(struct rq *rq, struct task_struct *p)
8479 ++{
8480 ++ return rq->curr == p;
8481 ++}
8482 ++
8483 ++static inline bool task_running(struct task_struct *p)
8484 ++{
8485 ++ return p->on_cpu;
8486 ++}
8487 ++
8488 ++extern int task_running_nice(struct task_struct *p);
8489 ++
8490 ++extern struct static_key_false sched_schedstats;
8491 ++
8492 ++#ifdef CONFIG_CPU_IDLE
8493 ++static inline void idle_set_state(struct rq *rq,
8494 ++ struct cpuidle_state *idle_state)
8495 ++{
8496 ++ rq->idle_state = idle_state;
8497 ++}
8498 ++
8499 ++static inline struct cpuidle_state *idle_get_state(struct rq *rq)
8500 ++{
8501 ++ WARN_ON(!rcu_read_lock_held());
8502 ++ return rq->idle_state;
8503 ++}
8504 ++#else
8505 ++static inline void idle_set_state(struct rq *rq,
8506 ++ struct cpuidle_state *idle_state)
8507 ++{
8508 ++}
8509 ++
8510 ++static inline struct cpuidle_state *idle_get_state(struct rq *rq)
8511 ++{
8512 ++ return NULL;
8513 ++}
8514 ++#endif
8515 ++
8516 ++static inline int cpu_of(const struct rq *rq)
8517 ++{
8518 ++#ifdef CONFIG_SMP
8519 ++ return rq->cpu;
8520 ++#else
8521 ++ return 0;
8522 ++#endif
8523 ++}
8524 ++
8525 ++#include "stats.h"
8526 ++
8527 ++#ifdef CONFIG_NO_HZ_COMMON
8528 ++#define NOHZ_BALANCE_KICK_BIT 0
8529 ++#define NOHZ_STATS_KICK_BIT 1
8530 ++
8531 ++#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
8532 ++#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
8533 ++
8534 ++#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
8535 ++
8536 ++#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
8537 ++
8538 ++/* TODO: needed?
8539 ++extern void nohz_balance_exit_idle(struct rq *rq);
8540 ++#else
8541 ++static inline void nohz_balance_exit_idle(struct rq *rq) { }
8542 ++*/
8543 ++#endif
8544 ++
8545 ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING
8546 ++struct irqtime {
8547 ++ u64 total;
8548 ++ u64 tick_delta;
8549 ++ u64 irq_start_time;
8550 ++ struct u64_stats_sync sync;
8551 ++};
8552 ++
8553 ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
8554 ++
8555 ++/*
8556 ++ * Returns the irqtime minus the softirq time computed by ksoftirqd.
8557 ++ * Otherwise ksoftirqd's sum_exec_runtime would have its own runtime
8558 ++ * subtracted and never move forward.
8559 ++ */
8560 ++static inline u64 irq_time_read(int cpu)
8561 ++{
8562 ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
8563 ++ unsigned int seq;
8564 ++ u64 total;
8565 ++
8566 ++ do {
8567 ++ seq = __u64_stats_fetch_begin(&irqtime->sync);
8568 ++ total = irqtime->total;
8569 ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
8570 ++
8571 ++ return total;
8572 ++}
8573 ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
8574 ++
8575 ++#ifdef CONFIG_CPU_FREQ
8576 ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
8577 ++
8578 ++/**
8579 ++ * cpufreq_update_util - Take a note about CPU utilization changes.
8580 ++ * @rq: Runqueue to carry out the update for.
8581 ++ * @flags: Update reason flags.
8582 ++ *
8583 ++ * This function is called by the scheduler on the CPU whose utilization is
8584 ++ * being updated.
8585 ++ *
8586 ++ * It can only be called from RCU-sched read-side critical sections.
8587 ++ *
8588 ++ * The way cpufreq is currently arranged requires it to evaluate the CPU
8589 ++ * performance state (frequency/voltage) on a regular basis to prevent it from
8590 ++ * being stuck in a completely inadequate performance level for too long.
8591 ++ * That is not guaranteed to happen if the updates are only triggered from CFS
8592 ++ * and DL, though, because they may not be coming in if only RT tasks are
8593 ++ * active all the time (or there are RT tasks only).
8594 ++ *
8595 ++ * As a workaround for that issue, this function is called periodically by the
8596 ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
8597 ++ * but that really is a band-aid. Going forward it should be replaced with
8598 ++ * solutions targeted more specifically at RT tasks.
8599 ++ */
8600 ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
8601 ++{
8602 ++ struct update_util_data *data;
8603 ++
8604 ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
8605 ++ cpu_of(rq)));
8606 ++ if (data)
8607 ++ data->func(data, rq_clock(rq), flags);
8608 ++}
8609 ++#else
8610 ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
8611 ++#endif /* CONFIG_CPU_FREQ */
8612 ++
8613 ++#ifdef CONFIG_NO_HZ_FULL
8614 ++extern int __init sched_tick_offload_init(void);
8615 ++#else
8616 ++static inline int sched_tick_offload_init(void) { return 0; }
8617 ++#endif
8618 ++
8619 ++#ifdef arch_scale_freq_capacity
8620 ++#ifndef arch_scale_freq_invariant
8621 ++#define arch_scale_freq_invariant() (true)
8622 ++#endif
8623 ++#else /* arch_scale_freq_capacity */
8624 ++#define arch_scale_freq_invariant() (false)
8625 ++#endif
8626 ++
8627 ++extern void schedule_idle(void);
8628 ++
8629 ++#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
8630 ++
8631 ++/*
8632 ++ * !! For sched_setattr_nocheck() (kernel) only !!
8633 ++ *
8634 ++ * This is actually gross. :(
8635 ++ *
8636 ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
8637 ++ * tasks, but still be able to sleep. We need this on platforms that cannot
8638 ++ * atomically change clock frequency. Remove once fast switching will be
8639 ++ * available on such platforms.
8640 ++ *
8641 ++ * SUGOV stands for SchedUtil GOVernor.
8642 ++ */
8643 ++#define SCHED_FLAG_SUGOV 0x10000000
8644 ++
8645 ++#ifdef CONFIG_MEMBARRIER
8646 ++/*
8647 ++ * The scheduler provides memory barriers required by membarrier between:
8648 ++ * - prior user-space memory accesses and store to rq->membarrier_state,
8649 ++ * - store to rq->membarrier_state and following user-space memory accesses.
8650 ++ * In the same way it provides those guarantees around store to rq->curr.
8651 ++ */
8652 ++static inline void membarrier_switch_mm(struct rq *rq,
8653 ++ struct mm_struct *prev_mm,
8654 ++ struct mm_struct *next_mm)
8655 ++{
8656 ++ int membarrier_state;
8657 ++
8658 ++ if (prev_mm == next_mm)
8659 ++ return;
8660 ++
8661 ++ membarrier_state = atomic_read(&next_mm->membarrier_state);
8662 ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state)
8663 ++ return;
8664 ++
8665 ++ WRITE_ONCE(rq->membarrier_state, membarrier_state);
8666 ++}
8667 ++#else
8668 ++static inline void membarrier_switch_mm(struct rq *rq,
8669 ++ struct mm_struct *prev_mm,
8670 ++ struct mm_struct *next_mm)
8671 ++{
8672 ++}
8673 ++#endif
8674 ++
8675 ++#ifdef CONFIG_NUMA
8676 ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
8677 ++#else
8678 ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
8679 ++{
8680 ++ return nr_cpu_ids;
8681 ++}
8682 ++#endif
8683 ++
8684 ++extern void swake_up_all_locked(struct swait_queue_head *q);
8685 ++extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
8686 ++
8687 ++#ifdef CONFIG_PREEMPT_DYNAMIC
8688 ++extern int preempt_dynamic_mode;
8689 ++extern int sched_dynamic_mode(const char *str);
8690 ++extern void sched_dynamic_update(int mode);
8691 ++#endif
8692 ++
8693 ++static inline void nohz_run_idle_balance(int cpu) { }
8694 ++#endif /* ALT_SCHED_H */
8695 +diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h
8696 +new file mode 100644
8697 +index 000000000000..7635c00dde7f
8698 +--- /dev/null
8699 ++++ b/kernel/sched/bmq.h
8700 +@@ -0,0 +1,111 @@
8701 ++#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n"
8702 ++
8703 ++/*
8704 ++ * BMQ only routines
8705 ++ */
8706 ++#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch)
8707 ++#define boost_threshold(p) (sched_timeslice_ns >>\
8708 ++ (15 - MAX_PRIORITY_ADJ - (p)->boost_prio))
8709 ++
8710 ++static inline void boost_task(struct task_struct *p)
8711 ++{
8712 ++ int limit;
8713 ++
8714 ++ switch (p->policy) {
8715 ++ case SCHED_NORMAL:
8716 ++ limit = -MAX_PRIORITY_ADJ;
8717 ++ break;
8718 ++ case SCHED_BATCH:
8719 ++ case SCHED_IDLE:
8720 ++ limit = 0;
8721 ++ break;
8722 ++ default:
8723 ++ return;
8724 ++ }
8725 ++
8726 ++ if (p->boost_prio > limit)
8727 ++ p->boost_prio--;
8728 ++}
8729 ++
8730 ++static inline void deboost_task(struct task_struct *p)
8731 ++{
8732 ++ if (p->boost_prio < MAX_PRIORITY_ADJ)
8733 ++ p->boost_prio++;
8734 ++}
8735 ++
8736 ++/*
8737 ++ * Common interfaces
8738 ++ */
8739 ++static inline void sched_timeslice_imp(const int timeslice_ms) {}
8740 ++
8741 ++static inline int
8742 ++task_sched_prio_normal(const struct task_struct *p, const struct rq *rq)
8743 ++{
8744 ++ return p->prio + p->boost_prio - MAX_RT_PRIO;
8745 ++}
8746 ++
8747 ++static inline int task_sched_prio(const struct task_struct *p)
8748 ++{
8749 ++ return (p->prio < MAX_RT_PRIO)? p->prio : MAX_RT_PRIO / 2 + (p->prio + p->boost_prio) / 2;
8750 ++}
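
Editorial note: task_sched_prio() above maps RT priorities straight through and folds a
normal task's static priority together with its boost_prio (lower means more boosted)
into the upper half of the queue index range. A standalone sketch using mainline's
MAX_RT_PRIO and DEFAULT_PRIO, with a boost range of +/-4 assumed purely for illustration -
the real MAX_PRIORITY_ADJ comes from the patch's prio.h changes (not part of the patch):

#include <stdio.h>

#define MAX_RT_PRIO		100
#define DEFAULT_PRIO		120	/* nice 0 */
#define MAX_PRIORITY_ADJ	4	/* assumed boost range for this example */

static int toy_task_sched_prio(int prio, int boost)
{
	if (prio < MAX_RT_PRIO)
		return prio;				/* RT: index equals priority */
	return MAX_RT_PRIO / 2 + (prio + boost) / 2;	/* normal: compressed range */
}

int main(void)
{
	printf("RT 10               -> %d\n", toy_task_sched_prio(10, 0));
	printf("nice 0, no boost    -> %d\n", toy_task_sched_prio(DEFAULT_PRIO, 0));
	printf("nice 0, max boost   -> %d\n",
	       toy_task_sched_prio(DEFAULT_PRIO, -MAX_PRIORITY_ADJ));
	printf("nice +19, deboosted -> %d\n",
	       toy_task_sched_prio(DEFAULT_PRIO + 19, MAX_PRIORITY_ADJ));
	return 0;
}

Boosted interactive tasks land at a lower index than their deboosted peers, which is how
BMQ trades a bit of fairness for latency without touching the RT range.
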
8751 ++
8752 ++static inline int
8753 ++task_sched_prio_idx(const struct task_struct *p, const struct rq *rq)
8754 ++{
8755 ++ return task_sched_prio(p);
8756 ++}
8757 ++
8758 ++static inline int sched_prio2idx(int prio, struct rq *rq)
8759 ++{
8760 ++ return prio;
8761 ++}
8762 ++
8763 ++static inline int sched_idx2prio(int idx, struct rq *rq)
8764 ++{
8765 ++ return idx;
8766 ++}
8767 ++
8768 ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq)
8769 ++{
8770 ++ p->time_slice = sched_timeslice_ns;
8771 ++
8772 ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) {
8773 ++ if (SCHED_RR != p->policy)
8774 ++ deboost_task(p);
8775 ++ requeue_task(p, rq);
8776 ++ }
8777 ++}
8778 ++
8779 ++static inline void sched_task_sanity_check(struct task_struct *p, struct rq *rq) {}
8780 ++
8781 ++inline int task_running_nice(struct task_struct *p)
8782 ++{
8783 ++ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ);
8784 ++}
8785 ++
8786 ++static void sched_task_fork(struct task_struct *p, struct rq *rq)
8787 ++{
8788 ++ p->boost_prio = (p->boost_prio < 0) ?
8789 ++ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ;
8790 ++}
8791 ++
8792 ++static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq)
8793 ++{
8794 ++ p->boost_prio = MAX_PRIORITY_ADJ;
8795 ++}
8796 ++
8797 ++#ifdef CONFIG_SMP
8798 ++static void sched_task_ttwu(struct task_struct *p)
8799 ++{
8800 ++ if(this_rq()->clock_task - p->last_ran > sched_timeslice_ns)
8801 ++ boost_task(p);
8802 ++}
8803 ++#endif
8804 ++
8805 ++static void sched_task_deactivate(struct task_struct *p, struct rq *rq)
8806 ++{
8807 ++ if (rq_switch_time(rq) < boost_threshold(p))
8808 ++ boost_task(p);
8809 ++}
8810 ++
8811 ++static inline void update_rq_time_edge(struct rq *rq) {}
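
Editorial note: the BMQ half of the patch keeps a per-task boost_prio in the
range [-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ]. Per the hunk above, tasks that
deactivate or wake up after a long sleep get boosted (boost_prio decremented,
raising effective priority), while tasks that exhaust a time slice are
deboosted, and task_sched_prio() folds prio plus boost into a single bitmap
queue index. The user-space sketch below models only that index calculation;
the constant values are assumptions for illustration (the real ones are
defined elsewhere in this patch), and the code is not kernel code.

#include <stdio.h>

/* Assumed values, for illustration only. */
#define MAX_RT_PRIO		100	/* prio < 100 means a realtime task */
#define MAX_PRIORITY_ADJ	4	/* hypothetical BMQ boost range */

/* Model of task_sched_prio() from kernel/sched/bmq.h above. */
static int bmq_sched_prio(int prio, int boost_prio)
{
	if (prio < MAX_RT_PRIO)
		return prio;		/* RT tasks keep their own slot */
	return MAX_RT_PRIO / 2 + (prio + boost_prio) / 2;
}

int main(void)
{
	/* A nice-0 task (prio 120) at different boost levels. */
	for (int boost = -MAX_PRIORITY_ADJ; boost <= MAX_PRIORITY_ADJ; boost++)
		printf("boost %+d -> queue index %d\n",
		       boost, bmq_sched_prio(120, boost));
	return 0;
}

A more negative boost_prio lands the task on a lower (earlier-served) queue
index, which is how interactive tasks jump ahead of batch work under BMQ.
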
8812 +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
8813 +index 4f09afd2f321..805b54e517ff 100644
8814 +--- a/kernel/sched/cpufreq_schedutil.c
8815 ++++ b/kernel/sched/cpufreq_schedutil.c
8816 +@@ -57,6 +57,13 @@ struct sugov_cpu {
8817 + unsigned long bw_dl;
8818 + unsigned long max;
8819 +
8820 ++#ifdef CONFIG_SCHED_ALT
8821 ++	/* For general cpu load util */
8822 ++ s32 load_history;
8823 ++ u64 load_block;
8824 ++ u64 load_stamp;
8825 ++#endif
8826 ++
8827 + /* The field below is for single-CPU policies only: */
8828 + #ifdef CONFIG_NO_HZ_COMMON
8829 + unsigned long saved_idle_calls;
8830 +@@ -160,6 +167,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
8831 + return cpufreq_driver_resolve_freq(policy, freq);
8832 + }
8833 +
8834 ++#ifndef CONFIG_SCHED_ALT
8835 + static void sugov_get_util(struct sugov_cpu *sg_cpu)
8836 + {
8837 + struct rq *rq = cpu_rq(sg_cpu->cpu);
8838 +@@ -171,6 +179,55 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
8839 + FREQUENCY_UTIL, NULL);
8840 + }
8841 +
8842 ++#else /* CONFIG_SCHED_ALT */
8843 ++
8844 ++#define SG_CPU_LOAD_HISTORY_BITS (sizeof(s32) * 8ULL)
8845 ++#define SG_CPU_UTIL_SHIFT (8)
8846 ++#define SG_CPU_LOAD_HISTORY_SHIFT (SG_CPU_LOAD_HISTORY_BITS - 1 - SG_CPU_UTIL_SHIFT)
8847 ++#define SG_CPU_LOAD_HISTORY_TO_UTIL(l) (((l) >> SG_CPU_LOAD_HISTORY_SHIFT) & 0xff)
8848 ++
8849 ++#define LOAD_BLOCK(t) ((t) >> 17)
8850 ++#define LOAD_HALF_BLOCK(t) ((t) >> 16)
8851 ++#define BLOCK_MASK(t) ((t) & ((0x01 << 18) - 1))
8852 ++#define LOAD_BLOCK_BIT(b) (1UL << (SG_CPU_LOAD_HISTORY_BITS - 1 - (b)))
8853 ++#define CURRENT_LOAD_BIT LOAD_BLOCK_BIT(0)
8854 ++
8855 ++static void sugov_get_util(struct sugov_cpu *sg_cpu)
8856 ++{
8857 ++ unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
8858 ++
8859 ++ sg_cpu->max = max;
8860 ++ sg_cpu->bw_dl = 0;
8861 ++ sg_cpu->util = SG_CPU_LOAD_HISTORY_TO_UTIL(sg_cpu->load_history) *
8862 ++ (max >> SG_CPU_UTIL_SHIFT);
8863 ++}
8864 ++
8865 ++static inline void sugov_cpu_load_update(struct sugov_cpu *sg_cpu, u64 time)
8866 ++{
8867 ++ u64 delta = min(LOAD_BLOCK(time) - LOAD_BLOCK(sg_cpu->load_stamp),
8868 ++ SG_CPU_LOAD_HISTORY_BITS - 1);
8869 ++ u64 prev = !!(sg_cpu->load_history & CURRENT_LOAD_BIT);
8870 ++ u64 curr = !!cpu_rq(sg_cpu->cpu)->nr_running;
8871 ++
8872 ++ if (delta) {
8873 ++ sg_cpu->load_history = sg_cpu->load_history >> delta;
8874 ++
8875 ++ if (delta <= SG_CPU_UTIL_SHIFT) {
8876 ++ sg_cpu->load_block += (~BLOCK_MASK(sg_cpu->load_stamp)) * prev;
8877 ++ if (!!LOAD_HALF_BLOCK(sg_cpu->load_block) ^ curr)
8878 ++ sg_cpu->load_history ^= LOAD_BLOCK_BIT(delta);
8879 ++ }
8880 ++
8881 ++ sg_cpu->load_block = BLOCK_MASK(time) * prev;
8882 ++ } else {
8883 ++ sg_cpu->load_block += (time - sg_cpu->load_stamp) * prev;
8884 ++ }
8885 ++ if (prev ^ curr)
8886 ++ sg_cpu->load_history ^= CURRENT_LOAD_BIT;
8887 ++ sg_cpu->load_stamp = time;
8888 ++}
8889 ++#endif /* CONFIG_SCHED_ALT */
8890 ++
8891 + /**
8892 + * sugov_iowait_reset() - Reset the IO boost status of a CPU.
8893 + * @sg_cpu: the sugov data for the CPU to boost
8894 +@@ -311,13 +368,19 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
8895 + */
8896 + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
8897 + {
8898 ++#ifndef CONFIG_SCHED_ALT
8899 + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
8900 + sg_cpu->sg_policy->limits_changed = true;
8901 ++#endif
8902 + }
8903 +
8904 + static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
8905 + u64 time, unsigned int flags)
8906 + {
8907 ++#ifdef CONFIG_SCHED_ALT
8908 ++ sugov_cpu_load_update(sg_cpu, time);
8909 ++#endif /* CONFIG_SCHED_ALT */
8910 ++
8911 + sugov_iowait_boost(sg_cpu, time, flags);
8912 + sg_cpu->last_update = time;
8913 +
8914 +@@ -438,6 +501,10 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
8915 +
8916 + raw_spin_lock(&sg_policy->update_lock);
8917 +
8918 ++#ifdef CONFIG_SCHED_ALT
8919 ++ sugov_cpu_load_update(sg_cpu, time);
8920 ++#endif /* CONFIG_SCHED_ALT */
8921 ++
8922 + sugov_iowait_boost(sg_cpu, time, flags);
8923 + sg_cpu->last_update = time;
8924 +
8925 +@@ -598,6 +665,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
8926 + }
8927 +
8928 + ret = sched_setattr_nocheck(thread, &attr);
8929 ++
8930 + if (ret) {
8931 + kthread_stop(thread);
8932 + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
8933 +@@ -832,7 +900,9 @@ cpufreq_governor_init(schedutil_gov);
8934 + #ifdef CONFIG_ENERGY_MODEL
8935 + static void rebuild_sd_workfn(struct work_struct *work)
8936 + {
8937 ++#ifndef CONFIG_SCHED_ALT
8938 + rebuild_sched_domains_energy();
8939 ++#endif /* CONFIG_SCHED_ALT */
8940 + }
8941 + static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
8942 +
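
Editorial note: with CONFIG_SCHED_ALT the schedutil hunk above drops the
PELT-based sugov_get_util() and instead keeps a 32-bit run/idle history per
CPU: time is cut into 2^17 ns (~131 us) blocks, each elapsed block shifts the
history right and records whether the runqueue had runnable tasks, and the
most recent eight block bits are scaled against arch_scale_cpu_capacity() to
form the frequency-selection utilization. The sketch below is a simplified
user-space model of that bookkeeping (it fills skipped blocks explicitly and
ignores the half-block rounding done via load_block); it is illustrative only.

#include <stdint.h>
#include <stdio.h>

#define HISTORY_BITS	32
#define UTIL_SHIFT	8
#define HISTORY_SHIFT	(HISTORY_BITS - 1 - UTIL_SHIFT)	/* 23 */
#define LOAD_BLOCK(t)	((t) >> 17)			/* ~131 us blocks */
#define CURRENT_BIT	(1u << (HISTORY_BITS - 1))

struct cpu_load {
	uint32_t history;	/* bit 31 = current block, lower bits = older */
	uint64_t stamp;		/* ns timestamp of the last update */
};

/* Simplified model of sugov_cpu_load_update() above. */
static void load_update(struct cpu_load *c, uint64_t now_ns, int running)
{
	uint64_t delta = LOAD_BLOCK(now_ns) - LOAD_BLOCK(c->stamp);
	int prev = !!(c->history & CURRENT_BIT);

	if (delta >= HISTORY_BITS)
		delta = HISTORY_BITS - 1;
	if (delta) {
		/* Age the samples; mark the skipped blocks with prev's state. */
		c->history >>= delta;
		if (prev)
			c->history |= ~0u << (HISTORY_BITS - delta);
	}
	/* Record the state of the block we are currently in. */
	if (running)
		c->history |= CURRENT_BIT;
	else
		c->history &= ~CURRENT_BIT;
	c->stamp = now_ns;
}

/* Mirror of SG_CPU_LOAD_HISTORY_TO_UTIL(), scaled to a capacity of 1024. */
static unsigned int load_to_util(const struct cpu_load *c)
{
	return ((c->history >> HISTORY_SHIFT) & 0xff) * (1024 >> UTIL_SHIFT);
}

int main(void)
{
	struct cpu_load c = { 0, 0 };

	/* ~1 ms of busy time followed by ~1 ms idle, sampled every block. */
	for (uint64_t t = 0; t < 2000000; t += 131072)
		load_update(&c, t, t < 1000000);
	printf("history 0x%08x -> util %u/1024\n", c.history, load_to_util(&c));
	return 0;
}
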
8943 +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
8944 +index 872e481d5098..f920c8b48ec1 100644
8945 +--- a/kernel/sched/cputime.c
8946 ++++ b/kernel/sched/cputime.c
8947 +@@ -123,7 +123,7 @@ void account_user_time(struct task_struct *p, u64 cputime)
8948 + p->utime += cputime;
8949 + account_group_user_time(p, cputime);
8950 +
8951 +- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
8952 ++ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER;
8953 +
8954 + /* Add user time to cpustat. */
8955 + task_group_account_field(p, index, cputime);
8956 +@@ -147,7 +147,7 @@ void account_guest_time(struct task_struct *p, u64 cputime)
8957 + p->gtime += cputime;
8958 +
8959 + /* Add guest time to cpustat. */
8960 +- if (task_nice(p) > 0) {
8961 ++ if (task_running_nice(p)) {
8962 + cpustat[CPUTIME_NICE] += cputime;
8963 + cpustat[CPUTIME_GUEST_NICE] += cputime;
8964 + } else {
8965 +@@ -270,7 +270,7 @@ static inline u64 account_other_time(u64 max)
8966 + #ifdef CONFIG_64BIT
8967 + static inline u64 read_sum_exec_runtime(struct task_struct *t)
8968 + {
8969 +- return t->se.sum_exec_runtime;
8970 ++ return tsk_seruntime(t);
8971 + }
8972 + #else
8973 + static u64 read_sum_exec_runtime(struct task_struct *t)
8974 +@@ -280,7 +280,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t)
8975 + struct rq *rq;
8976 +
8977 + rq = task_rq_lock(t, &rf);
8978 +- ns = t->se.sum_exec_runtime;
8979 ++ ns = tsk_seruntime(t);
8980 + task_rq_unlock(rq, t, &rf);
8981 +
8982 + return ns;
8983 +@@ -612,7 +612,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
8984 + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
8985 + {
8986 + struct task_cputime cputime = {
8987 +- .sum_exec_runtime = p->se.sum_exec_runtime,
8988 ++ .sum_exec_runtime = tsk_seruntime(p),
8989 + };
8990 +
8991 + task_cputime(p, &cputime.utime, &cputime.stime);
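
Editorial note: the cputime.c hunks replace the mainline "task_nice(p) > 0"
test with the scheduler-specific task_running_nice() (BMQ folds boost_prio
into the check, PDS simply compares prio against DEFAULT_PRIO) and read
runtime through tsk_seruntime(), since BMQ/PDS tasks carry no CFS
sched_entity. The sketch below shows, for assumed constants, which nice/boost
combinations BMQ would account as CPUTIME_NICE; the constants are
illustrative, not taken from this patch.

#include <stdio.h>

#define MAX_PRIORITY_ADJ	4	/* assumed BMQ boost range */
#define DEFAULT_PRIO		120	/* prio of a nice-0 SCHED_NORMAL task */
#define NICE_TO_PRIO(n)		(DEFAULT_PRIO + (n))

/* BMQ's task_running_nice() from bmq.h, with explicit arguments. */
static int bmq_running_nice(int nice, int boost_prio)
{
	return NICE_TO_PRIO(nice) + boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ;
}

int main(void)
{
	/* Mainline bins on task_nice(p) > 0; BMQ also folds in boost_prio. */
	for (int nice = 0; nice <= 8; nice += 4)
		for (int boost = -MAX_PRIORITY_ADJ; boost <= MAX_PRIORITY_ADJ; boost += 4)
			printf("nice %d boost %+d -> %s\n", nice, boost,
			       bmq_running_nice(nice, boost) ? "CPUTIME_NICE"
							     : "CPUTIME_USER");
	return 0;
}
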
8992 +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
8993 +index c5aacbd492a1..105433c36b5f 100644
8994 +--- a/kernel/sched/debug.c
8995 ++++ b/kernel/sched/debug.c
8996 +@@ -8,6 +8,7 @@
8997 + */
8998 + #include "sched.h"
8999 +
9000 ++#ifndef CONFIG_SCHED_ALT
9001 + /*
9002 + * This allows printing both to /proc/sched_debug and
9003 + * to the console
9004 +@@ -210,6 +211,7 @@ static const struct file_operations sched_scaling_fops = {
9005 + };
9006 +
9007 + #endif /* SMP */
9008 ++#endif /* !CONFIG_SCHED_ALT */
9009 +
9010 + #ifdef CONFIG_PREEMPT_DYNAMIC
9011 +
9012 +@@ -273,6 +275,7 @@ static const struct file_operations sched_dynamic_fops = {
9013 +
9014 + #endif /* CONFIG_PREEMPT_DYNAMIC */
9015 +
9016 ++#ifndef CONFIG_SCHED_ALT
9017 + __read_mostly bool sched_debug_verbose;
9018 +
9019 + static const struct seq_operations sched_debug_sops;
9020 +@@ -288,6 +291,7 @@ static const struct file_operations sched_debug_fops = {
9021 + .llseek = seq_lseek,
9022 + .release = seq_release,
9023 + };
9024 ++#endif /* !CONFIG_SCHED_ALT */
9025 +
9026 + static struct dentry *debugfs_sched;
9027 +
9028 +@@ -297,12 +301,15 @@ static __init int sched_init_debug(void)
9029 +
9030 + debugfs_sched = debugfs_create_dir("sched", NULL);
9031 +
9032 ++#ifndef CONFIG_SCHED_ALT
9033 + debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
9034 + debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose);
9035 ++#endif /* !CONFIG_SCHED_ALT */
9036 + #ifdef CONFIG_PREEMPT_DYNAMIC
9037 + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
9038 + #endif
9039 +
9040 ++#ifndef CONFIG_SCHED_ALT
9041 + debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
9042 + debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
9043 + debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
9044 +@@ -330,11 +337,13 @@ static __init int sched_init_debug(void)
9045 + #endif
9046 +
9047 + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
9048 ++#endif /* !CONFIG_SCHED_ALT */
9049 +
9050 + return 0;
9051 + }
9052 + late_initcall(sched_init_debug);
9053 +
9054 ++#ifndef CONFIG_SCHED_ALT
9055 + #ifdef CONFIG_SMP
9056 +
9057 + static cpumask_var_t sd_sysctl_cpus;
9058 +@@ -1047,6 +1056,7 @@ void proc_sched_set_task(struct task_struct *p)
9059 + memset(&p->se.statistics, 0, sizeof(p->se.statistics));
9060 + #endif
9061 + }
9062 ++#endif /* !CONFIG_SCHED_ALT */
9063 +
9064 + void resched_latency_warn(int cpu, u64 latency)
9065 + {
9066 +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
9067 +index 7ca3d3d86c2a..23e890141939 100644
9068 +--- a/kernel/sched/idle.c
9069 ++++ b/kernel/sched/idle.c
9070 +@@ -403,6 +403,7 @@ void cpu_startup_entry(enum cpuhp_state state)
9071 + do_idle();
9072 + }
9073 +
9074 ++#ifndef CONFIG_SCHED_ALT
9075 + /*
9076 + * idle-task scheduling class.
9077 + */
9078 +@@ -516,3 +517,4 @@ DEFINE_SCHED_CLASS(idle) = {
9079 + .switched_to = switched_to_idle,
9080 + .update_curr = update_curr_idle,
9081 + };
9082 ++#endif
9083 +diff --git a/kernel/sched/pds.h b/kernel/sched/pds.h
9084 +new file mode 100644
9085 +index 000000000000..06d88e72b543
9086 +--- /dev/null
9087 ++++ b/kernel/sched/pds.h
9088 +@@ -0,0 +1,129 @@
9089 ++#define ALT_SCHED_VERSION_MSG "sched/pds: PDS CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n"
9090 ++
9091 ++static int sched_timeslice_shift = 22;
9092 ++
9093 ++#define NORMAL_PRIO_MOD(x) ((x) & (NORMAL_PRIO_NUM - 1))
9094 ++
9095 ++/*
9096 ++ * Common interfaces
9097 ++ */
9098 ++static inline void sched_timeslice_imp(const int timeslice_ms)
9099 ++{
9100 ++ if (2 == timeslice_ms)
9101 ++ sched_timeslice_shift = 21;
9102 ++}
9103 ++
9104 ++static inline int
9105 ++task_sched_prio_normal(const struct task_struct *p, const struct rq *rq)
9106 ++{
9107 ++ s64 delta = p->deadline - rq->time_edge + NORMAL_PRIO_NUM - NICE_WIDTH;
9108 ++
9109 ++ if (unlikely(delta > NORMAL_PRIO_NUM - 1)) {
9110 ++ pr_info("pds: task_sched_prio_normal delta %lld, deadline %llu, time_edge %llu\n",
9111 ++ delta, p->deadline, rq->time_edge);
9112 ++ return NORMAL_PRIO_NUM - 1;
9113 ++ }
9114 ++
9115 ++ return (delta < 0) ? 0 : delta;
9116 ++}
9117 ++
9118 ++static inline int task_sched_prio(const struct task_struct *p)
9119 ++{
9120 ++ return (p->prio < MAX_RT_PRIO) ? p->prio :
9121 ++ MIN_NORMAL_PRIO + task_sched_prio_normal(p, task_rq(p));
9122 ++}
9123 ++
9124 ++static inline int
9125 ++task_sched_prio_idx(const struct task_struct *p, const struct rq *rq)
9126 ++{
9127 ++ return (p->prio < MAX_RT_PRIO) ? p->prio : MIN_NORMAL_PRIO +
9128 ++ NORMAL_PRIO_MOD(task_sched_prio_normal(p, rq) + rq->time_edge);
9129 ++}
9130 ++
9131 ++static inline int sched_prio2idx(int prio, struct rq *rq)
9132 ++{
9133 ++ return (IDLE_TASK_SCHED_PRIO == prio || prio < MAX_RT_PRIO) ? prio :
9134 ++ MIN_NORMAL_PRIO + NORMAL_PRIO_MOD((prio - MIN_NORMAL_PRIO) +
9135 ++ rq->time_edge);
9136 ++}
9137 ++
9138 ++static inline int sched_idx2prio(int idx, struct rq *rq)
9139 ++{
9140 ++ return (idx < MAX_RT_PRIO) ? idx : MIN_NORMAL_PRIO +
9141 ++ NORMAL_PRIO_MOD((idx - MIN_NORMAL_PRIO) + NORMAL_PRIO_NUM -
9142 ++ NORMAL_PRIO_MOD(rq->time_edge));
9143 ++}
9144 ++
9145 ++static inline void sched_renew_deadline(struct task_struct *p, const struct rq *rq)
9146 ++{
9147 ++ if (p->prio >= MAX_RT_PRIO)
9148 ++ p->deadline = (rq->clock >> sched_timeslice_shift) +
9149 ++ p->static_prio - (MAX_PRIO - NICE_WIDTH);
9150 ++}
9151 ++
9152 ++int task_running_nice(struct task_struct *p)
9153 ++{
9154 ++ return (p->prio > DEFAULT_PRIO);
9155 ++}
9156 ++
9157 ++static inline void update_rq_time_edge(struct rq *rq)
9158 ++{
9159 ++ struct list_head head;
9160 ++ u64 old = rq->time_edge;
9161 ++ u64 now = rq->clock >> sched_timeslice_shift;
9162 ++ u64 prio, delta;
9163 ++
9164 ++ if (now == old)
9165 ++ return;
9166 ++
9167 ++ delta = min_t(u64, NORMAL_PRIO_NUM, now - old);
9168 ++ INIT_LIST_HEAD(&head);
9169 ++
9170 ++ for_each_set_bit(prio, &rq->queue.bitmap[2], delta)
9171 ++ list_splice_tail_init(rq->queue.heads + MIN_NORMAL_PRIO +
9172 ++ NORMAL_PRIO_MOD(prio + old), &head);
9173 ++
9174 ++ rq->queue.bitmap[2] = (NORMAL_PRIO_NUM == delta) ? 0UL :
9175 ++ rq->queue.bitmap[2] >> delta;
9176 ++ rq->time_edge = now;
9177 ++ if (!list_empty(&head)) {
9178 ++ u64 idx = MIN_NORMAL_PRIO + NORMAL_PRIO_MOD(now);
9179 ++ struct task_struct *p;
9180 ++
9181 ++ list_for_each_entry(p, &head, sq_node)
9182 ++ p->sq_idx = idx;
9183 ++
9184 ++ list_splice(&head, rq->queue.heads + idx);
9185 ++ rq->queue.bitmap[2] |= 1UL;
9186 ++ }
9187 ++}
9188 ++
9189 ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq)
9190 ++{
9191 ++ p->time_slice = sched_timeslice_ns;
9192 ++ sched_renew_deadline(p, rq);
9193 ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p))
9194 ++ requeue_task(p, rq);
9195 ++}
9196 ++
9197 ++static inline void sched_task_sanity_check(struct task_struct *p, struct rq *rq)
9198 ++{
9199 ++ u64 max_dl = rq->time_edge + NICE_WIDTH - 1;
9200 ++ if (unlikely(p->deadline > max_dl))
9201 ++ p->deadline = max_dl;
9202 ++}
9203 ++
9204 ++static void sched_task_fork(struct task_struct *p, struct rq *rq)
9205 ++{
9206 ++ sched_renew_deadline(p, rq);
9207 ++}
9208 ++
9209 ++static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq)
9210 ++{
9211 ++ time_slice_expired(p, rq);
9212 ++}
9213 ++
9214 ++#ifdef CONFIG_SMP
9215 ++static void sched_task_ttwu(struct task_struct *p) {}
9216 ++#endif
9217 ++static void sched_task_deactivate(struct task_struct *p, struct rq *rq) {}
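
Editorial note: PDS replaces BMQ's boost with a virtual deadline.
sched_renew_deadline() stamps each task with rq->clock in 2^22 ns (~4 ms)
units plus an offset derived from static_prio (sched_timeslice_imp() drops
the shift to 21 for a 2 ms slice), task_sched_prio_normal() turns the distance
between that deadline and rq->time_edge into a queue level, and
update_rq_time_edge() rotates the circular run queue as time_edge advances.
The sketch below models just the deadline-to-level arithmetic; NORMAL_PRIO_NUM
and friends are assumed values for illustration (the real ones live in
alt_sched.h earlier in this patch).

#include <stdio.h>

/* Assumed values for illustration; see alt_sched.h for the real ones. */
#define MAX_PRIO		140
#define NICE_WIDTH		40
#define NORMAL_PRIO_NUM		64
#define TIMESLICE_SHIFT		22	/* 1 << 22 ns ~= 4 ms per unit */

/* Model of sched_renew_deadline(): virtual deadline in ~4 ms units. */
static unsigned long long renew_deadline(unsigned long long clock_ns,
					 int static_prio)
{
	return (clock_ns >> TIMESLICE_SHIFT) + static_prio -
	       (MAX_PRIO - NICE_WIDTH);
}

/* Model of task_sched_prio_normal(): deadline distance from rq->time_edge. */
static int prio_level(unsigned long long deadline, unsigned long long time_edge)
{
	long long delta = deadline - time_edge + NORMAL_PRIO_NUM - NICE_WIDTH;

	if (delta > NORMAL_PRIO_NUM - 1)
		delta = NORMAL_PRIO_NUM - 1;
	return delta < 0 ? 0 : (int)delta;
}

int main(void)
{
	unsigned long long clock = 1ULL << 32;		/* arbitrary rq clock */
	unsigned long long edge = clock >> TIMESLICE_SHIFT;

	/* static_prio 100, 119, 138 ~= nice -20, -1, +18 */
	for (int sp = 100; sp <= 139; sp += 19) {
		unsigned long long dl = renew_deadline(clock, sp);

		printf("static_prio %d -> deadline %llu, level %d\n",
		       sp, dl, prio_level(dl, edge));
	}
	return 0;
}

Lower static_prio (more negative nice) yields an earlier deadline and thus a
lower queue level, so it is dispatched sooner; as rq->time_edge catches up,
queued tasks are spliced forward by update_rq_time_edge() instead of being
re-sorted.
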
9218 +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
9219 +index a554e3bbab2b..3e56f5e6ff5c 100644
9220 +--- a/kernel/sched/pelt.c
9221 ++++ b/kernel/sched/pelt.c
9222 +@@ -270,6 +270,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
9223 + WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
9224 + }
9225 +
9226 ++#ifndef CONFIG_SCHED_ALT
9227 + /*
9228 + * sched_entity:
9229 + *
9230 +@@ -387,8 +388,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
9231 +
9232 + return 0;
9233 + }
9234 ++#endif
9235 +
9236 +-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
9237 ++#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT)
9238 + /*
9239 + * thermal:
9240 + *
9241 +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
9242 +index cfe94ffd2b38..8a33dc6124aa 100644
9243 +--- a/kernel/sched/pelt.h
9244 ++++ b/kernel/sched/pelt.h
9245 +@@ -1,13 +1,15 @@
9246 + #ifdef CONFIG_SMP
9247 + #include "sched-pelt.h"
9248 +
9249 ++#ifndef CONFIG_SCHED_ALT
9250 + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
9251 + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
9252 + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
9253 + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
9254 + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
9255 ++#endif
9256 +
9257 +-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
9258 ++#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT)
9259 + int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
9260 +
9261 + static inline u64 thermal_load_avg(struct rq *rq)
9262 +@@ -42,6 +44,7 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
9263 + return LOAD_AVG_MAX - 1024 + avg->period_contrib;
9264 + }
9265 +
9266 ++#ifndef CONFIG_SCHED_ALT
9267 + static inline void cfs_se_util_change(struct sched_avg *avg)
9268 + {
9269 + unsigned int enqueued;
9270 +@@ -153,9 +156,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
9271 + return rq_clock_pelt(rq_of(cfs_rq));
9272 + }
9273 + #endif
9274 ++#endif /* CONFIG_SCHED_ALT */
9275 +
9276 + #else
9277 +
9278 ++#ifndef CONFIG_SCHED_ALT
9279 + static inline int
9280 + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
9281 + {
9282 +@@ -173,6 +178,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
9283 + {
9284 + return 0;
9285 + }
9286 ++#endif
9287 +
9288 + static inline int
9289 + update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
9290 +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
9291 +index a189bec13729..02e4234cbc1f 100644
9292 +--- a/kernel/sched/sched.h
9293 ++++ b/kernel/sched/sched.h
9294 +@@ -2,6 +2,10 @@
9295 + /*
9296 + * Scheduler internal types and methods:
9297 + */
9298 ++#ifdef CONFIG_SCHED_ALT
9299 ++#include "alt_sched.h"
9300 ++#else
9301 ++
9302 + #include <linux/sched.h>
9303 +
9304 + #include <linux/sched/autogroup.h>
9305 +@@ -2749,3 +2753,8 @@ extern int sched_dynamic_mode(const char *str);
9306 + extern void sched_dynamic_update(int mode);
9307 + #endif
9308 +
9309 ++static inline int task_running_nice(struct task_struct *p)
9310 ++{
9311 ++ return (task_nice(p) > 0);
9312 ++}
9313 ++#endif /* !CONFIG_SCHED_ALT */
9314 +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
9315 +index 3f93fc3b5648..528b71e144e9 100644
9316 +--- a/kernel/sched/stats.c
9317 ++++ b/kernel/sched/stats.c
9318 +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v)
9319 + } else {
9320 + struct rq *rq;
9321 + #ifdef CONFIG_SMP
9322 ++#ifndef CONFIG_SCHED_ALT
9323 + struct sched_domain *sd;
9324 + int dcount = 0;
9325 ++#endif
9326 + #endif
9327 + cpu = (unsigned long)(v - 2);
9328 + rq = cpu_rq(cpu);
9329 +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
9330 + seq_printf(seq, "\n");
9331 +
9332 + #ifdef CONFIG_SMP
9333 ++#ifndef CONFIG_SCHED_ALT
9334 + /* domain-specific stats */
9335 + rcu_read_lock();
9336 + for_each_domain(cpu, sd) {
9337 +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
9338 + sd->ttwu_move_balance);
9339 + }
9340 + rcu_read_unlock();
9341 ++#endif
9342 + #endif
9343 + }
9344 + return 0;
9345 +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
9346 +index 55a0a243e871..fda2e8fe6ffe 100644
9347 +--- a/kernel/sched/topology.c
9348 ++++ b/kernel/sched/topology.c
9349 +@@ -4,6 +4,7 @@
9350 + */
9351 + #include "sched.h"
9352 +
9353 ++#ifndef CONFIG_SCHED_ALT
9354 + DEFINE_MUTEX(sched_domains_mutex);
9355 +
9356 + /* Protected by sched_domains_mutex: */
9357 +@@ -1272,8 +1273,10 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
9358 + */
9359 +
9360 + static int default_relax_domain_level = -1;
9361 ++#endif /* CONFIG_SCHED_ALT */
9362 + int sched_domain_level_max;
9363 +
9364 ++#ifndef CONFIG_SCHED_ALT
9365 + static int __init setup_relax_domain_level(char *str)
9366 + {
9367 + if (kstrtoint(str, 0, &default_relax_domain_level))
9368 +@@ -1503,6 +1506,7 @@ sd_init(struct sched_domain_topology_level *tl,
9369 +
9370 + return sd;
9371 + }
9372 ++#endif /* CONFIG_SCHED_ALT */
9373 +
9374 + /*
9375 + * Topology list, bottom-up.
9376 +@@ -1532,6 +1536,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl)
9377 + sched_domain_topology = tl;
9378 + }
9379 +
9380 ++#ifndef CONFIG_SCHED_ALT
9381 + #ifdef CONFIG_NUMA
9382 +
9383 + static const struct cpumask *sd_numa_mask(int cpu)
9384 +@@ -2398,3 +2403,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
9385 + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
9386 + mutex_unlock(&sched_domains_mutex);
9387 + }
9388 ++#else /* CONFIG_SCHED_ALT */
9389 ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
9390 ++ struct sched_domain_attr *dattr_new)
9391 ++{}
9392 ++
9393 ++#ifdef CONFIG_NUMA
9394 ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
9395 ++
9396 ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
9397 ++{
9398 ++ return best_mask_cpu(cpu, cpus);
9399 ++}
9400 ++#endif /* CONFIG_NUMA */
9401 ++#endif
9402 +diff --git a/kernel/sysctl.c b/kernel/sysctl.c
9403 +index d4a78e08f6d8..403bd33e5880 100644
9404 +--- a/kernel/sysctl.c
9405 ++++ b/kernel/sysctl.c
9406 +@@ -120,6 +120,10 @@ static unsigned long long_max = LONG_MAX;
9407 + static int one_hundred = 100;
9408 + static int two_hundred = 200;
9409 + static int one_thousand = 1000;
9410 ++#ifdef CONFIG_SCHED_ALT
9411 ++static int __maybe_unused zero = 0;
9412 ++extern int sched_yield_type;
9413 ++#endif
9414 + #ifdef CONFIG_PRINTK
9415 + static int ten_thousand = 10000;
9416 + #endif
9417 +@@ -1729,6 +1733,24 @@ int proc_do_static_key(struct ctl_table *table, int write,
9418 + }
9419 +
9420 + static struct ctl_table kern_table[] = {
9421 ++#ifdef CONFIG_SCHED_ALT
9422 ++/* In ALT, only "sched_schedstats" is supported */
9423 ++#ifdef CONFIG_SCHED_DEBUG
9424 ++#ifdef CONFIG_SMP
9425 ++#ifdef CONFIG_SCHEDSTATS
9426 ++ {
9427 ++ .procname = "sched_schedstats",
9428 ++ .data = NULL,
9429 ++ .maxlen = sizeof(unsigned int),
9430 ++ .mode = 0644,
9431 ++ .proc_handler = sysctl_schedstats,
9432 ++ .extra1 = SYSCTL_ZERO,
9433 ++ .extra2 = SYSCTL_ONE,
9434 ++ },
9435 ++#endif /* CONFIG_SCHEDSTATS */
9436 ++#endif /* CONFIG_SMP */
9437 ++#endif /* CONFIG_SCHED_DEBUG */
9438 ++#else /* !CONFIG_SCHED_ALT */
9439 + {
9440 + .procname = "sched_child_runs_first",
9441 + .data = &sysctl_sched_child_runs_first,
9442 +@@ -1848,6 +1870,7 @@ static struct ctl_table kern_table[] = {
9443 + .extra2 = SYSCTL_ONE,
9444 + },
9445 + #endif
9446 ++#endif /* !CONFIG_SCHED_ALT */
9447 + #ifdef CONFIG_PROVE_LOCKING
9448 + {
9449 + .procname = "prove_locking",
9450 +@@ -2424,6 +2447,17 @@ static struct ctl_table kern_table[] = {
9451 + .proc_handler = proc_dointvec,
9452 + },
9453 + #endif
9454 ++#ifdef CONFIG_SCHED_ALT
9455 ++ {
9456 ++ .procname = "yield_type",
9457 ++ .data = &sched_yield_type,
9458 ++ .maxlen = sizeof (int),
9459 ++ .mode = 0644,
9460 ++ .proc_handler = &proc_dointvec_minmax,
9461 ++ .extra1 = &zero,
9462 ++ .extra2 = &two,
9463 ++ },
9464 ++#endif
9465 + #if defined(CONFIG_S390) && defined(CONFIG_SMP)
9466 + {
9467 + .procname = "spin_retry",
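
Editorial note: the kern_table entry above exposes the scheduler's
sched_yield_type as /proc/sys/kernel/yield_type, clamped to 0..2 by
proc_dointvec_minmax (0 = no yield, 1 = deboost and requeue, 2 = set run
queue skip). A minimal user-space sketch for flipping the knob at runtime is
shown below; the path and range come from the hunk above, the rest is an
assumption-free plain write that needs root.

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	int val = argc > 1 ? atoi(argv[1]) : 1;	/* 1 = deboost and requeue */
	FILE *f = fopen("/proc/sys/kernel/yield_type", "w");

	if (!f) {
		perror("yield_type");
		return 1;
	}
	fprintf(f, "%d\n", val);	/* values outside 0..2 are rejected */
	return fclose(f) ? 1 : 0;
}
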
9468 +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
9469 +index 4a66725b1d4a..cb80ed5c1f5c 100644
9470 +--- a/kernel/time/hrtimer.c
9471 ++++ b/kernel/time/hrtimer.c
9472 +@@ -1940,8 +1940,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
9473 + int ret = 0;
9474 + u64 slack;
9475 +
9476 ++#ifndef CONFIG_SCHED_ALT
9477 + slack = current->timer_slack_ns;
9478 + if (dl_task(current) || rt_task(current))
9479 ++#endif
9480 + slack = 0;
9481 +
9482 + hrtimer_init_sleeper_on_stack(&t, clockid, mode);
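
Editorial note: the hrtimer hunk above wraps the slack assignment and the
dl_task()/rt_task() check in #ifndef CONFIG_SCHED_ALT, so with BMQ/PDS
enabled the following "slack = 0;" runs unconditionally and every
hrtimer_nanosleep() gets zero timer slack. The toy reproduction below only
demonstrates that preprocessor effect; it is not kernel code.

#include <stdio.h>

#define CONFIG_SCHED_ALT 1	/* comment out to model the mainline path */

static unsigned long long nanosleep_slack(unsigned long long timer_slack_ns,
					  int is_rt_or_dl_task)
{
	unsigned long long slack;

#ifndef CONFIG_SCHED_ALT
	slack = timer_slack_ns;
	if (is_rt_or_dl_task)
#endif
		slack = 0;

	return slack;
}

int main(void)
{
	/* Prints 0 with CONFIG_SCHED_ALT, 50000 without it. */
	printf("slack for a normal task: %llu ns\n", nanosleep_slack(50000, 0));
	return 0;
}
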
9483 +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
9484 +index 3bb96a8b49c9..11509fcf1d8a 100644
9485 +--- a/kernel/time/posix-cpu-timers.c
9486 ++++ b/kernel/time/posix-cpu-timers.c
9487 +@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples)
9488 + u64 stime, utime;
9489 +
9490 + task_cputime(p, &utime, &stime);
9491 +- store_samples(samples, stime, utime, p->se.sum_exec_runtime);
9492 ++ store_samples(samples, stime, utime, tsk_seruntime(p));
9493 + }
9494 +
9495 + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
9496 +@@ -801,6 +801,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples,
9497 + }
9498 + }
9499 +
9500 ++#ifndef CONFIG_SCHED_ALT
9501 + static inline void check_dl_overrun(struct task_struct *tsk)
9502 + {
9503 + if (tsk->dl.dl_overrun) {
9504 +@@ -808,6 +809,7 @@ static inline void check_dl_overrun(struct task_struct *tsk)
9505 + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
9506 + }
9507 + }
9508 ++#endif
9509 +
9510 + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
9511 + {
9512 +@@ -835,8 +837,10 @@ static void check_thread_timers(struct task_struct *tsk,
9513 + u64 samples[CPUCLOCK_MAX];
9514 + unsigned long soft;
9515 +
9516 ++#ifndef CONFIG_SCHED_ALT
9517 + if (dl_task(tsk))
9518 + check_dl_overrun(tsk);
9519 ++#endif
9520 +
9521 + if (expiry_cache_is_inactive(pct))
9522 + return;
9523 +@@ -850,7 +854,7 @@ static void check_thread_timers(struct task_struct *tsk,
9524 + soft = task_rlimit(tsk, RLIMIT_RTTIME);
9525 + if (soft != RLIM_INFINITY) {
9526 + /* Task RT timeout is accounted in jiffies. RTTIME is usec */
9527 +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);
9528 ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ);
9529 + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);
9530 +
9531 + /* At the hard limit, send SIGKILL. No further action. */
9532 +@@ -1086,8 +1090,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
9533 + return true;
9534 + }
9535 +
9536 ++#ifndef CONFIG_SCHED_ALT
9537 + if (dl_task(tsk) && tsk->dl.dl_overrun)
9538 + return true;
9539 ++#endif
9540 +
9541 + return false;
9542 + }
9543 +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
9544 +index adf7ef194005..11c8f36e281b 100644
9545 +--- a/kernel/trace/trace_selftest.c
9546 ++++ b/kernel/trace/trace_selftest.c
9547 +@@ -1052,10 +1052,15 @@ static int trace_wakeup_test_thread(void *data)
9548 + {
9549 + /* Make this a -deadline thread */
9550 + static const struct sched_attr attr = {
9551 ++#ifdef CONFIG_SCHED_ALT
9552 ++ /* No deadline on BMQ/PDS, use RR */
9553 ++ .sched_policy = SCHED_RR,
9554 ++#else
9555 + .sched_policy = SCHED_DEADLINE,
9556 + .sched_runtime = 100000ULL,
9557 + .sched_deadline = 10000000ULL,
9558 + .sched_period = 10000000ULL
9559 ++#endif
9560 + };
9561 + struct wakeup_test_data *x = data;
9562 +
9563
9564 diff --git a/5021_BMQ-and-PDS-gentoo-defaults.patch b/5021_BMQ-and-PDS-gentoo-defaults.patch
9565 new file mode 100644
9566 index 0000000..d449eec
9567 --- /dev/null
9568 +++ b/5021_BMQ-and-PDS-gentoo-defaults.patch
9569 @@ -0,0 +1,13 @@
9570 +--- a/init/Kconfig 2021-04-27 07:38:30.556467045 -0400
9571 ++++ b/init/Kconfig 2021-04-27 07:39:32.956412800 -0400
9572 +@@ -780,8 +780,9 @@ config GENERIC_SCHED_CLOCK
9573 + menu "Scheduler features"
9574 +
9575 + menuconfig SCHED_ALT
9576 ++ depends on X86_64
9577 + bool "Alternative CPU Schedulers"
9578 +- default y
9579 ++ default n
9580 + help
9581 + This feature enable alternative CPU scheduler"
9582 +