Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:6.2 commit in: /
Date: Mon, 27 Feb 2023 18:46:02
Message-Id: 1677523522.26c13af1abdb1a80f16f5de399d1029781be3e1d.mpagano@gentoo
1 commit: 26c13af1abdb1a80f16f5de399d1029781be3e1d
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Mon Feb 27 18:45:22 2023 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Mon Feb 27 18:45:22 2023 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=26c13af1
7
8 Add BMQ Scheduler, USE=experimental
9
10 Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>
11
12 0000_README | 8 +
13 5020_BMQ-and-PDS-io-scheduler-v6.2-r0.patch | 10270 ++++++++++++++++++++++++++
14 5021_BMQ-and-PDS-gentoo-defaults.patch | 13 +
15 3 files changed, 10291 insertions(+)
16
17 diff --git a/0000_README b/0000_README
18 index ae528a67..b0db3406 100644
19 --- a/0000_README
20 +++ b/0000_README
21 @@ -90,3 +90,11 @@ Desc: Kernel module that provides a kernel filesystem for uid/gid shifting
22 Patch: 5010_enable-cpu-optimizations-universal.patch
23 From: https://github.com/graysky2/kernel_compiler_patch
24 Desc: Kernel >= 5.15 patch enables gcc = v11.1+ optimizations for additional CPUs.
25 +
26 +Patch: 5020_BMQ-and-PDS-io-scheduler-v6.2-r0.patch
27 +From: https://github.com/Frogging-Family/linux-tkg https://gitlab.com/alfredchen/projectc
28 +Desc: BMQ (BitMap Queue) Scheduler. A new CPU scheduler developed from PDS (included). Inspired by the scheduler in Zircon.
29 +
30 +Patch: 5021_BMQ-and-PDS-gentoo-defaults.patch
31 +From: https://gitweb.gentoo.org/proj/linux-patches.git/
32 +Desc: Set defaults for BMQ. Add archs as people test them; default to N.
33
34 diff --git a/5020_BMQ-and-PDS-io-scheduler-v6.2-r0.patch b/5020_BMQ-and-PDS-io-scheduler-v6.2-r0.patch
35 new file mode 100644
36 index 00000000..fd815b76
37 --- /dev/null
38 +++ b/5020_BMQ-and-PDS-io-scheduler-v6.2-r0.patch
39 @@ -0,0 +1,10270 @@
40 +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
41 +index 6cfa6e3996cf..1b6a407213da 100644
42 +--- a/Documentation/admin-guide/kernel-parameters.txt
43 ++++ b/Documentation/admin-guide/kernel-parameters.txt
44 +@@ -5437,6 +5437,12 @@
45 + sa1100ir [NET]
46 + See drivers/net/irda/sa1100_ir.c.
47 +
48 ++ sched_timeslice=
49 ++ [KNL] Time slice in ms for Project C BMQ/PDS scheduler.
50 ++ Format: integer 2 or 4
51 ++ Default: 4
52 ++ See Documentation/scheduler/sched-BMQ.txt
53 ++
54 + sched_verbose [KNL] Enables verbose scheduler debug messages.
55 +
56 + schedstats= [KNL,X86] Enable or disable scheduled statistics.
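The "integer 2 or 4" format above maps onto a shift-based conversion further down in this patch (sched_timeslice_ns = timeslice_ms << 20 in kernel/sched/alt_core.c), i.e. a power-of-two approximation of milliseconds rather than an exact one. A minimal standalone check of that arithmetic, assuming nothing beyond the shift itself:

  #include <stdio.h>

  int main(void)
  {
      /* "<< 20" multiplies by 1048576, a power-of-two stand-in for 1e6 */
      printf("sched_timeslice=2 -> %d ns (~2.10 ms)\n", 2 << 20);
      printf("sched_timeslice=4 -> %d ns (~4.19 ms)\n", 4 << 20);
      return 0;
  }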
57 +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
58 +index 46e3d62c0eea..fb4568c919d0 100644
59 +--- a/Documentation/admin-guide/sysctl/kernel.rst
60 ++++ b/Documentation/admin-guide/sysctl/kernel.rst
61 +@@ -1597,3 +1597,13 @@ is 10 seconds.
62 +
63 + The softlockup threshold is (``2 * watchdog_thresh``). Setting this
64 + tunable to zero will disable lockup detection altogether.
65 ++
66 ++yield_type:
67 ++===========
68 ++
69 ++BMQ/PDS CPU scheduler only. This determines what type of yield a call
70 ++to sched_yield() will perform.
71 ++
72 ++ 0 - No yield.
73 ++ 1 - Deboost and requeue task. (default)
74 ++ 2 - Set run queue skip task.
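For reference, this tunable would normally be read and written through sysctl(8) or procfs; the path /proc/sys/kernel/yield_type used below is an assumption based on the documentation living in sysctl/kernel.rst. A minimal userspace sketch that reports the current setting:

  #include <stdio.h>

  int main(void)
  {
      FILE *f = fopen("/proc/sys/kernel/yield_type", "r");
      int type;

      if (!f) {
          perror("open yield_type");
          return 1;
      }
      if (fscanf(f, "%d", &type) == 1)
          printf("yield_type = %d (0=no yield, 1=deboost+requeue, 2=rq skip)\n", type);
      fclose(f);
      return 0;
  }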
75 +diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt
76 +new file mode 100644
77 +index 000000000000..05c84eec0f31
78 +--- /dev/null
79 ++++ b/Documentation/scheduler/sched-BMQ.txt
80 +@@ -0,0 +1,110 @@
81 ++ BitMap queue CPU Scheduler
82 ++ --------------------------
83 ++
84 ++CONTENT
85 ++========
86 ++
87 ++ Background
88 ++ Design
89 ++ Overview
90 ++ Task policy
91 ++ Priority management
92 ++ BitMap Queue
93 ++ CPU Assignment and Migration
94 ++
95 ++
96 ++Background
97 ++==========
98 ++
99 ++BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution
100 ++of the earlier Priority and Deadline based Skiplist multiple queue scheduler
101 ++(PDS), and is inspired by the Zircon scheduler. Its goal is to keep the
102 ++scheduler code simple while remaining efficient and scalable for interactive
103 ++tasks such as desktop use, movie playback and gaming.
104 ++
105 ++Design
106 ++======
107 ++
108 ++Overview
109 ++--------
110 ++
111 ++BMQ uses a per-CPU run queue design: each (logical) CPU has its own run queue
112 ++and is responsible for scheduling the tasks that are put into its
113 ++run queue.
114 ++
115 ++The run queue is a set of priority queues. Note that, in terms of data
116 ++structure, these queues are FIFO queues for non-rt tasks and priority queues
117 ++for rt tasks; see BitMap Queue below for details. BMQ is optimized for non-rt
118 ++tasks, given that most applications are non-rt tasks. Whether a queue is FIFO
119 ++or priority, each queue is an ordered list of runnable tasks awaiting execution
120 ++and the data structures are the same. When it is time for a new task to run,
121 ++the scheduler simply looks for the lowest numbered queue that contains a task
122 ++and runs the first task from the head of that queue. The per-CPU idle task is
123 ++also kept in the run queue, so the scheduler can always find a task to run
124 ++from its run queue.
125 ++
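The lookup just described can be reduced to a small standalone sketch (plain userspace C, not the kernel code; the actual implementation uses find_first_bit() and list_first_entry() in kernel/sched/alt_core.c later in this patch): one bit per priority level records whether that level's FIFO list is non-empty, and picking the next task is a find-first-set plus a head dereference.

  #include <stdio.h>

  #define NR_LEVELS 64

  static unsigned long long bitmap;      /* bit n set => level n is non-empty */
  static const char *heads[NR_LEVELS];   /* stand-in for the per-level FIFO lists */

  static const char *pick_next(void)
  {
      int level = __builtin_ffsll(bitmap);       /* lowest set bit, 1-based; 0 if none */

      return level ? heads[level - 1] : "idle";  /* the idle task is always runnable */
  }

  int main(void)
  {
      heads[10] = "interactive-task";
      heads[30] = "batch-task";
      bitmap = (1ULL << 10) | (1ULL << 30);

      printf("next: %s\n", pick_next());         /* the lowest numbered level wins */
      return 0;
  }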
126 ++Each task is assigned the same timeslice (default 4 ms) when it is picked to
127 ++start running. A task is reinserted at the end of the appropriate priority
128 ++queue when it uses up its whole timeslice. When the scheduler selects a new
129 ++task from the priority queue, it sets the CPU's preemption timer for the
130 ++remainder of the previous timeslice. When that timer fires, the scheduler
131 ++stops executing that task, selects another task and starts over again.
132 ++
133 ++If a task blocks waiting for a shared resource then it's taken out of its
134 ++priority queue and is placed in a wait queue for the shared resource. When it
135 ++is unblocked it will be reinserted in the appropriate priority queue of an
136 ++eligible CPU.
137 ++
138 ++Task policy
139 ++-----------
140 ++
141 ++BMQ supports the DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policies,
142 ++like the mainline CFS scheduler, but it is heavily optimized for non-rt tasks,
143 ++that is, NORMAL/BATCH/IDLE policy tasks. Below are the implementation details
144 ++of each policy.
145 ++
146 ++DEADLINE
147 ++ It is squashed into a priority 0 FIFO task.
148 ++
149 ++FIFO/RR
150 ++ All RT tasks share one single priority queue in the BMQ run queue design. The
151 ++complexity of the insert operation is O(n). BMQ is not designed for systems
152 ++that mainly run rt policy tasks.
153 ++
154 ++NORMAL/BATCH/IDLE
155 ++ BATCH and IDLE tasks are treated as the same policy. They compete for CPU
156 ++with NORMAL policy tasks, but they just don't get boosted. To control the
157 ++priority of NORMAL/BATCH/IDLE tasks, simply use nice levels.
158 ++
159 ++ISO
160 ++ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy
161 ++task instead.
162 ++
163 ++Priority management
164 ++-------------------
165 ++
166 ++RT tasks have priorities from 0-99. For non-rt tasks, there are three different
167 ++factors used to determine the effective priority of a task; the effective
168 ++priority is what is used to determine which queue a task will be in.
169 ++
170 ++The first factor is simply the task's static priority, which is assigned from
171 ++the task's nice level: [-20, 19] from userland's point of view and [0, 39]
172 ++internally.
173 ++
174 ++The second factor is the priority boost. This is a value bounded between
175 ++[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority; it is
176 ++modified in the following cases:
177 ++
178 ++*When a thread has used up its entire timeslice, its boost is always deboosted
179 ++by increasing it by one.
180 ++*When a thread gives up CPU control (voluntarily or involuntarily) to
181 ++reschedule, and its switch-in time (the time since it was last switched in and
182 ++run) is below the threshold derived from its priority boost, its boost is
183 ++raised by decreasing it by one, but it is capped at 0 (it won't go negative).
184 ++
185 ++The intent in this system is to ensure that interactive threads are serviced
186 ++quickly. These are usually the threads that interact directly with the user
187 ++and cause user-perceivable latency. These threads usually do little work and
188 ++spend most of their time blocked awaiting another user event. So they get the
189 ++priority boost from unblocking while background threads that do most of the
190 ++processing receive the priority penalty for using their entire timeslice.
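To make the ranges above concrete, here is a toy model (illustrative only; the real BMQ arithmetic lives in the scheduler headers added later in this patch) of how a nice level and a boost value could combine into a queue index, with MAX_PRIORITY_ADJ = 7 taken from the BMQ block in include/linux/sched/prio.h below:

  #include <stdio.h>

  #define MAX_PRIORITY_ADJ 7   /* BMQ value from include/linux/sched/prio.h */

  /* nice [-20, 19] -> static priority [0, 39], as described above */
  static int static_prio(int nice) { return nice + 20; }

  /* toy queue index: lower index = higher priority; the real formula differs */
  static int queue_index(int nice, int boost) { return static_prio(nice) + boost; }

  int main(void)
  {
      printf("nice 0,   boost 0 -> %d\n", queue_index(0, 0));    /* 20: fresh task */
      printf("nice 0,   boost 5 -> %d\n", queue_index(0, 5));    /* 25: burned full timeslices */
      printf("nice -10, boost 0 -> %d\n", queue_index(-10, 0));  /* 10: reniced, higher priority */
      return 0;
  }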
191 +diff --git a/fs/proc/base.c b/fs/proc/base.c
192 +index 9e479d7d202b..2a8530021b23 100644
193 +--- a/fs/proc/base.c
194 ++++ b/fs/proc/base.c
195 +@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
196 + seq_puts(m, "0 0 0\n");
197 + else
198 + seq_printf(m, "%llu %llu %lu\n",
199 +- (unsigned long long)task->se.sum_exec_runtime,
200 ++ (unsigned long long)tsk_seruntime(task),
201 + (unsigned long long)task->sched_info.run_delay,
202 + task->sched_info.pcount);
203 +
204 +diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h
205 +index 8874f681b056..59eb72bf7d5f 100644
206 +--- a/include/asm-generic/resource.h
207 ++++ b/include/asm-generic/resource.h
208 +@@ -23,7 +23,7 @@
209 + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \
210 + [RLIMIT_SIGPENDING] = { 0, 0 }, \
211 + [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \
212 +- [RLIMIT_NICE] = { 0, 0 }, \
213 ++ [RLIMIT_NICE] = { 30, 30 }, \
214 + [RLIMIT_RTPRIO] = { 0, 0 }, \
215 + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \
216 + }
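Raising the default RLIMIT_NICE from 0 to 30 matters because setpriority(2) computes the most favourable nice value an unprivileged process may reach as 20 - rlim_cur, so a limit of 30 lets ordinary tasks renice themselves down to -10 without CAP_SYS_NICE. A quick userspace check:

  #include <stdio.h>
  #include <sys/resource.h>

  int main(void)
  {
      struct rlimit rl;

      if (getrlimit(RLIMIT_NICE, &rl))
          return 1;
      /* per getrlimit(2), the nice ceiling is calculated as 20 - rlim_cur */
      printf("RLIMIT_NICE=%llu -> lowest reachable nice value %lld\n",
             (unsigned long long)rl.rlim_cur, 20LL - (long long)rl.rlim_cur);
      return 0;
  }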
217 +diff --git a/include/linux/sched.h b/include/linux/sched.h
218 +index 853d08f7562b..ad7e050d7455 100644
219 +--- a/include/linux/sched.h
220 ++++ b/include/linux/sched.h
221 +@@ -762,8 +762,14 @@ struct task_struct {
222 + unsigned int ptrace;
223 +
224 + #ifdef CONFIG_SMP
225 +- int on_cpu;
226 + struct __call_single_node wake_entry;
227 ++#endif
228 ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT)
229 ++ int on_cpu;
230 ++#endif
231 ++
232 ++#ifdef CONFIG_SMP
233 ++#ifndef CONFIG_SCHED_ALT
234 + unsigned int wakee_flips;
235 + unsigned long wakee_flip_decay_ts;
236 + struct task_struct *last_wakee;
237 +@@ -777,6 +783,7 @@ struct task_struct {
238 + */
239 + int recent_used_cpu;
240 + int wake_cpu;
241 ++#endif /* !CONFIG_SCHED_ALT */
242 + #endif
243 + int on_rq;
244 +
245 +@@ -785,6 +792,20 @@ struct task_struct {
246 + int normal_prio;
247 + unsigned int rt_priority;
248 +
249 ++#ifdef CONFIG_SCHED_ALT
250 ++ u64 last_ran;
251 ++ s64 time_slice;
252 ++ int sq_idx;
253 ++ struct list_head sq_node;
254 ++#ifdef CONFIG_SCHED_BMQ
255 ++ int boost_prio;
256 ++#endif /* CONFIG_SCHED_BMQ */
257 ++#ifdef CONFIG_SCHED_PDS
258 ++ u64 deadline;
259 ++#endif /* CONFIG_SCHED_PDS */
260 ++ /* sched_clock time spent running */
261 ++ u64 sched_time;
262 ++#else /* !CONFIG_SCHED_ALT */
263 + struct sched_entity se;
264 + struct sched_rt_entity rt;
265 + struct sched_dl_entity dl;
266 +@@ -795,6 +816,7 @@ struct task_struct {
267 + unsigned long core_cookie;
268 + unsigned int core_occupation;
269 + #endif
270 ++#endif /* !CONFIG_SCHED_ALT */
271 +
272 + #ifdef CONFIG_CGROUP_SCHED
273 + struct task_group *sched_task_group;
274 +@@ -1539,6 +1561,15 @@ struct task_struct {
275 + */
276 + };
277 +
278 ++#ifdef CONFIG_SCHED_ALT
279 ++#define tsk_seruntime(t) ((t)->sched_time)
280 ++/* replace the uncertain rt_timeout with 0UL */
281 ++#define tsk_rttimeout(t) (0UL)
282 ++#else /* CFS */
283 ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime)
284 ++#define tsk_rttimeout(t) ((t)->rt.timeout)
285 ++#endif /* !CONFIG_SCHED_ALT */
286 ++
287 + static inline struct pid *task_pid(struct task_struct *task)
288 + {
289 + return task->thread_pid;
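The tsk_seruntime()/tsk_rttimeout() wrappers above exist so that call sites elsewhere in the tree (fs/proc/base.c, kernel/delayacct.c and kernel/exit.c, as other hunks in this patch show) do not need to know which scheduler is built in. The same conditional-macro pattern, reduced to a standalone toy with hypothetical field names:

  #include <stdio.h>

  struct task { unsigned long long se_sum_exec_runtime, sched_time; };

  /* one name for callers; the build configuration picks the backing field */
  #ifdef CONFIG_SCHED_ALT
  #define tsk_seruntime(t) ((t)->sched_time)
  #else
  #define tsk_seruntime(t) ((t)->se_sum_exec_runtime)
  #endif

  int main(void)
  {
      struct task t = { .se_sum_exec_runtime = 100, .sched_time = 200 };

      /* prints 100 unless compiled with -DCONFIG_SCHED_ALT */
      printf("%llu\n", tsk_seruntime(&t));
      return 0;
  }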
290 +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
291 +index 7c83d4d5a971..fa30f98cb2be 100644
292 +--- a/include/linux/sched/deadline.h
293 ++++ b/include/linux/sched/deadline.h
294 +@@ -1,5 +1,24 @@
295 + /* SPDX-License-Identifier: GPL-2.0 */
296 +
297 ++#ifdef CONFIG_SCHED_ALT
298 ++
299 ++static inline int dl_task(struct task_struct *p)
300 ++{
301 ++ return 0;
302 ++}
303 ++
304 ++#ifdef CONFIG_SCHED_BMQ
305 ++#define __tsk_deadline(p) (0UL)
306 ++#endif
307 ++
308 ++#ifdef CONFIG_SCHED_PDS
309 ++#define __tsk_deadline(p) ((((u64) ((p)->prio))<<56) | (p)->deadline)
310 ++#endif
311 ++
312 ++#else
313 ++
314 ++#define __tsk_deadline(p) ((p)->dl.deadline)
315 ++
316 + /*
317 + * SCHED_DEADLINE tasks has negative priorities, reflecting
318 + * the fact that any of them has higher prio than RT and
319 +@@ -21,6 +40,7 @@ static inline int dl_task(struct task_struct *p)
320 + {
321 + return dl_prio(p->prio);
322 + }
323 ++#endif /* CONFIG_SCHED_ALT */
324 +
325 + static inline bool dl_time_before(u64 a, u64 b)
326 + {
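For PDS, __tsk_deadline() above packs the task priority into the top 8 bits of a u64 and the deadline into the bits below it, so a single unsigned comparison (as used by the rtmutex changes later in this patch) orders waiters by priority first and by deadline second. A standalone sketch of that ordering; the mask here is only defensive:

  #include <stdint.h>
  #include <stdio.h>

  /* prio in bits 63..56, deadline below; a smaller packed value sorts first */
  static uint64_t pack(uint64_t prio, uint64_t deadline)
  {
      return (prio << 56) | (deadline & ((1ULL << 56) - 1));
  }

  int main(void)
  {
      /* a numerically lower prio wins even against an earlier deadline */
      printf("%d\n", pack(120, 5000) < pack(121, 1000));   /* 1 */
      /* with equal prio, the earlier deadline wins */
      printf("%d\n", pack(120, 1000) < pack(120, 5000));   /* 1 */
      return 0;
  }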
327 +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
328 +index ab83d85e1183..6af9ae681116 100644
329 +--- a/include/linux/sched/prio.h
330 ++++ b/include/linux/sched/prio.h
331 +@@ -18,6 +18,32 @@
332 + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
333 + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
334 +
335 ++#ifdef CONFIG_SCHED_ALT
336 ++
337 ++/* Undefine MAX_PRIO and DEFAULT_PRIO */
338 ++#undef MAX_PRIO
339 ++#undef DEFAULT_PRIO
340 ++
341 ++/* +/- priority levels from the base priority */
342 ++#ifdef CONFIG_SCHED_BMQ
343 ++#define MAX_PRIORITY_ADJ (7)
344 ++
345 ++#define MIN_NORMAL_PRIO (MAX_RT_PRIO)
346 ++#define MAX_PRIO (MIN_NORMAL_PRIO + NICE_WIDTH)
347 ++#define DEFAULT_PRIO (MIN_NORMAL_PRIO + NICE_WIDTH / 2)
348 ++#endif
349 ++
350 ++#ifdef CONFIG_SCHED_PDS
351 ++#define MAX_PRIORITY_ADJ (0)
352 ++
353 ++#define MIN_NORMAL_PRIO (128)
354 ++#define NORMAL_PRIO_NUM (64)
355 ++#define MAX_PRIO (MIN_NORMAL_PRIO + NORMAL_PRIO_NUM)
356 ++#define DEFAULT_PRIO (MAX_PRIO - NICE_WIDTH / 2)
357 ++#endif
358 ++
359 ++#endif /* CONFIG_SCHED_ALT */
360 ++
361 + /*
362 + * Convert user-nice values [ -20 ... 0 ... 19 ]
363 + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
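Plugging in the mainline values MAX_RT_PRIO = 100 and NICE_WIDTH = 40 (assumed here, both defined in sched/prio.h), the redefinitions above work out to: BMQ gets MIN_NORMAL_PRIO = 100, MAX_PRIO = 140 and DEFAULT_PRIO = 120, while PDS gets MIN_NORMAL_PRIO = 128, MAX_PRIO = 192 and DEFAULT_PRIO = 172. The arithmetic restated as a standalone check:

  #include <stdio.h>

  #define MAX_RT_PRIO 100   /* mainline value, assumed */
  #define NICE_WIDTH   40   /* MAX_NICE - MIN_NICE + 1 = 19 - (-20) + 1 */

  int main(void)
  {
      printf("BMQ: min %d, max %d, default %d\n",
             MAX_RT_PRIO, MAX_RT_PRIO + NICE_WIDTH, MAX_RT_PRIO + NICE_WIDTH / 2);
      printf("PDS: min %d, max %d, default %d\n",
             128, 128 + 64, 128 + 64 - NICE_WIDTH / 2);
      return 0;
  }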
364 +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
365 +index 994c25640e15..8c050a59ece1 100644
366 +--- a/include/linux/sched/rt.h
367 ++++ b/include/linux/sched/rt.h
368 +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
369 +
370 + if (policy == SCHED_FIFO || policy == SCHED_RR)
371 + return true;
372 ++#ifndef CONFIG_SCHED_ALT
373 + if (policy == SCHED_DEADLINE)
374 + return true;
375 ++#endif
376 + return false;
377 + }
378 +
379 +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
380 +index 816df6cc444e..c8da08e18c91 100644
381 +--- a/include/linux/sched/topology.h
382 ++++ b/include/linux/sched/topology.h
383 +@@ -234,7 +234,8 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
384 +
385 + #endif /* !CONFIG_SMP */
386 +
387 +-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
388 ++#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) && \
389 ++ !defined(CONFIG_SCHED_ALT)
390 + extern void rebuild_sched_domains_energy(void);
391 + #else
392 + static inline void rebuild_sched_domains_energy(void)
393 +diff --git a/init/Kconfig b/init/Kconfig
394 +index 44e90b28a30f..af24591984ab 100644
395 +--- a/init/Kconfig
396 ++++ b/init/Kconfig
397 +@@ -821,6 +821,7 @@ menu "Scheduler features"
398 + config UCLAMP_TASK
399 + bool "Enable utilization clamping for RT/FAIR tasks"
400 + depends on CPU_FREQ_GOV_SCHEDUTIL
401 ++ depends on !SCHED_ALT
402 + help
403 + This feature enables the scheduler to track the clamped utilization
404 + of each CPU based on RUNNABLE tasks scheduled on that CPU.
405 +@@ -867,6 +868,35 @@ config UCLAMP_BUCKETS_COUNT
406 +
407 + If in doubt, use the default value.
408 +
409 ++menuconfig SCHED_ALT
410 ++ bool "Alternative CPU Schedulers"
411 ++ default y
412 ++ help
413 ++ This feature enables alternative CPU schedulers.
414 ++
415 ++if SCHED_ALT
416 ++
417 ++choice
418 ++ prompt "Alternative CPU Scheduler"
419 ++ default SCHED_BMQ
420 ++
421 ++config SCHED_BMQ
422 ++ bool "BMQ CPU scheduler"
423 ++ help
424 ++ The BitMap Queue CPU scheduler for excellent interactivity and
425 ++ responsiveness on the desktop and solid scalability on normal
426 ++ hardware and commodity servers.
427 ++
428 ++config SCHED_PDS
429 ++ bool "PDS CPU scheduler"
430 ++ help
431 ++ The Priority and Deadline based Skip list multiple queue CPU
432 ++ Scheduler.
433 ++
434 ++endchoice
435 ++
436 ++endif
437 ++
438 + endmenu
439 +
440 + #
441 +@@ -924,6 +954,7 @@ config NUMA_BALANCING
442 + depends on ARCH_SUPPORTS_NUMA_BALANCING
443 + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
444 + depends on SMP && NUMA && MIGRATION && !PREEMPT_RT
445 ++ depends on !SCHED_ALT
446 + help
447 + This option adds support for automatic NUMA aware memory/task placement.
448 + The mechanism is quite primitive and is based on migrating memory when
449 +@@ -1021,6 +1052,7 @@ config FAIR_GROUP_SCHED
450 + depends on CGROUP_SCHED
451 + default CGROUP_SCHED
452 +
453 ++if !SCHED_ALT
454 + config CFS_BANDWIDTH
455 + bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
456 + depends on FAIR_GROUP_SCHED
457 +@@ -1043,6 +1075,7 @@ config RT_GROUP_SCHED
458 + realtime bandwidth for them.
459 + See Documentation/scheduler/sched-rt-group.rst for more information.
460 +
461 ++endif #!SCHED_ALT
462 + endif #CGROUP_SCHED
463 +
464 + config UCLAMP_TASK_GROUP
465 +@@ -1287,6 +1320,7 @@ config CHECKPOINT_RESTORE
466 +
467 + config SCHED_AUTOGROUP
468 + bool "Automatic process group scheduling"
469 ++ depends on !SCHED_ALT
470 + select CGROUPS
471 + select CGROUP_SCHED
472 + select FAIR_GROUP_SCHED
473 +diff --git a/init/init_task.c b/init/init_task.c
474 +index ff6c4b9bfe6b..19e9c662d1a1 100644
475 +--- a/init/init_task.c
476 ++++ b/init/init_task.c
477 +@@ -75,9 +75,15 @@ struct task_struct init_task
478 + .stack = init_stack,
479 + .usage = REFCOUNT_INIT(2),
480 + .flags = PF_KTHREAD,
481 ++#ifdef CONFIG_SCHED_ALT
482 ++ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ,
483 ++ .static_prio = DEFAULT_PRIO,
484 ++ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ,
485 ++#else
486 + .prio = MAX_PRIO - 20,
487 + .static_prio = MAX_PRIO - 20,
488 + .normal_prio = MAX_PRIO - 20,
489 ++#endif
490 + .policy = SCHED_NORMAL,
491 + .cpus_ptr = &init_task.cpus_mask,
492 + .user_cpus_ptr = NULL,
493 +@@ -88,6 +94,17 @@ struct task_struct init_task
494 + .restart_block = {
495 + .fn = do_no_restart_syscall,
496 + },
497 ++#ifdef CONFIG_SCHED_ALT
498 ++ .sq_node = LIST_HEAD_INIT(init_task.sq_node),
499 ++#ifdef CONFIG_SCHED_BMQ
500 ++ .boost_prio = 0,
501 ++ .sq_idx = 15,
502 ++#endif
503 ++#ifdef CONFIG_SCHED_PDS
504 ++ .deadline = 0,
505 ++#endif
506 ++ .time_slice = HZ,
507 ++#else
508 + .se = {
509 + .group_node = LIST_HEAD_INIT(init_task.se.group_node),
510 + },
511 +@@ -95,6 +112,7 @@ struct task_struct init_task
512 + .run_list = LIST_HEAD_INIT(init_task.rt.run_list),
513 + .time_slice = RR_TIMESLICE,
514 + },
515 ++#endif
516 + .tasks = LIST_HEAD_INIT(init_task.tasks),
517 + #ifdef CONFIG_SMP
518 + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO),
519 +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
520 +index c2f1fd95a821..41654679b1b2 100644
521 +--- a/kernel/Kconfig.preempt
522 ++++ b/kernel/Kconfig.preempt
523 +@@ -117,7 +117,7 @@ config PREEMPT_DYNAMIC
524 +
525 + config SCHED_CORE
526 + bool "Core Scheduling for SMT"
527 +- depends on SCHED_SMT
528 ++ depends on SCHED_SMT && !SCHED_ALT
529 + help
530 + This option permits Core Scheduling, a means of coordinated task
531 + selection across SMT siblings. When enabled -- see
532 +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
533 +index ca826bd1eba3..60e194f1d6d8 100644
534 +--- a/kernel/cgroup/cpuset.c
535 ++++ b/kernel/cgroup/cpuset.c
536 +@@ -791,7 +791,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
537 + return ret;
538 + }
539 +
540 +-#ifdef CONFIG_SMP
541 ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT)
542 + /*
543 + * Helper routine for generate_sched_domains().
544 + * Do cpusets a, b have overlapping effective cpus_allowed masks?
545 +@@ -1187,7 +1187,7 @@ static void rebuild_sched_domains_locked(void)
546 + /* Have scheduler rebuild the domains */
547 + partition_and_rebuild_sched_domains(ndoms, doms, attr);
548 + }
549 +-#else /* !CONFIG_SMP */
550 ++#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */
551 + static void rebuild_sched_domains_locked(void)
552 + {
553 + }
554 +diff --git a/kernel/delayacct.c b/kernel/delayacct.c
555 +index e39cb696cfbd..463423572e09 100644
556 +--- a/kernel/delayacct.c
557 ++++ b/kernel/delayacct.c
558 +@@ -150,7 +150,7 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
559 + */
560 + t1 = tsk->sched_info.pcount;
561 + t2 = tsk->sched_info.run_delay;
562 +- t3 = tsk->se.sum_exec_runtime;
563 ++ t3 = tsk_seruntime(tsk);
564 +
565 + d->cpu_count += t1;
566 +
567 +diff --git a/kernel/exit.c b/kernel/exit.c
568 +index 15dc2ec80c46..1e583e0f89a7 100644
569 +--- a/kernel/exit.c
570 ++++ b/kernel/exit.c
571 +@@ -172,7 +172,7 @@ static void __exit_signal(struct task_struct *tsk)
572 + sig->curr_target = next_thread(tsk);
573 + }
574 +
575 +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
576 ++ add_device_randomness((const void*) &tsk_seruntime(tsk),
577 + sizeof(unsigned long long));
578 +
579 + /*
580 +@@ -193,7 +193,7 @@ static void __exit_signal(struct task_struct *tsk)
581 + sig->inblock += task_io_get_inblock(tsk);
582 + sig->oublock += task_io_get_oublock(tsk);
583 + task_io_accounting_add(&sig->ioac, &tsk->ioac);
584 +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
585 ++ sig->sum_sched_runtime += tsk_seruntime(tsk);
586 + sig->nr_threads--;
587 + __unhash_process(tsk, group_dead);
588 + write_sequnlock(&sig->stats_lock);
589 +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
590 +index 728f434de2bb..0e1082a4e878 100644
591 +--- a/kernel/locking/rtmutex.c
592 ++++ b/kernel/locking/rtmutex.c
593 +@@ -337,21 +337,25 @@ static __always_inline void
594 + waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
595 + {
596 + waiter->prio = __waiter_prio(task);
597 +- waiter->deadline = task->dl.deadline;
598 ++ waiter->deadline = __tsk_deadline(task);
599 + }
600 +
601 + /*
602 + * Only use with rt_mutex_waiter_{less,equal}()
603 + */
604 + #define task_to_waiter(p) \
605 +- &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
606 ++ &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = __tsk_deadline(p) }
607 +
608 + static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
609 + struct rt_mutex_waiter *right)
610 + {
611 ++#ifdef CONFIG_SCHED_PDS
612 ++ return (left->deadline < right->deadline);
613 ++#else
614 + if (left->prio < right->prio)
615 + return 1;
616 +
617 ++#ifndef CONFIG_SCHED_BMQ
618 + /*
619 + * If both waiters have dl_prio(), we check the deadlines of the
620 + * associated tasks.
621 +@@ -360,16 +364,22 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
622 + */
623 + if (dl_prio(left->prio))
624 + return dl_time_before(left->deadline, right->deadline);
625 ++#endif
626 +
627 + return 0;
628 ++#endif
629 + }
630 +
631 + static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
632 + struct rt_mutex_waiter *right)
633 + {
634 ++#ifdef CONFIG_SCHED_PDS
635 ++ return (left->deadline == right->deadline);
636 ++#else
637 + if (left->prio != right->prio)
638 + return 0;
639 +
640 ++#ifndef CONFIG_SCHED_BMQ
641 + /*
642 + * If both waiters have dl_prio(), we check the deadlines of the
643 + * associated tasks.
644 +@@ -378,8 +388,10 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
645 + */
646 + if (dl_prio(left->prio))
647 + return left->deadline == right->deadline;
648 ++#endif
649 +
650 + return 1;
651 ++#endif
652 + }
653 +
654 + static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
655 +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
656 +index 976092b7bd45..31d587c16ec1 100644
657 +--- a/kernel/sched/Makefile
658 ++++ b/kernel/sched/Makefile
659 +@@ -28,7 +28,12 @@ endif
660 + # These compilation units have roughly the same size and complexity - so their
661 + # build parallelizes well and finishes roughly at once:
662 + #
663 ++ifdef CONFIG_SCHED_ALT
664 ++obj-y += alt_core.o
665 ++obj-$(CONFIG_SCHED_DEBUG) += alt_debug.o
666 ++else
667 + obj-y += core.o
668 + obj-y += fair.o
669 ++endif
670 + obj-y += build_policy.o
671 + obj-y += build_utility.o
672 +diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c
673 +new file mode 100644
674 +index 000000000000..f5e9c01f9382
675 +--- /dev/null
676 ++++ b/kernel/sched/alt_core.c
677 +@@ -0,0 +1,8111 @@
678 ++/*
679 ++ * kernel/sched/alt_core.c
680 ++ *
681 ++ * Core alternative kernel scheduler code and related syscalls
682 ++ *
683 ++ * Copyright (C) 1991-2002 Linus Torvalds
684 ++ *
685 ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes
686 ++ * a whole lot of those previous things.
687 ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel
688 ++ * scheduler by Alfred Chen.
689 ++ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen.
690 ++ */
691 ++#include <linux/sched/cputime.h>
692 ++#include <linux/sched/debug.h>
693 ++#include <linux/sched/isolation.h>
694 ++#include <linux/sched/loadavg.h>
695 ++#include <linux/sched/mm.h>
696 ++#include <linux/sched/nohz.h>
697 ++#include <linux/sched/stat.h>
698 ++#include <linux/sched/wake_q.h>
699 ++
700 ++#include <linux/blkdev.h>
701 ++#include <linux/context_tracking.h>
702 ++#include <linux/cpuset.h>
703 ++#include <linux/delayacct.h>
704 ++#include <linux/init_task.h>
705 ++#include <linux/kcov.h>
706 ++#include <linux/kprobes.h>
707 ++#include <linux/nmi.h>
708 ++#include <linux/scs.h>
709 ++
710 ++#include <uapi/linux/sched/types.h>
711 ++
712 ++#include <asm/irq_regs.h>
713 ++#include <asm/switch_to.h>
714 ++
715 ++#define CREATE_TRACE_POINTS
716 ++#include <trace/events/sched.h>
717 ++#undef CREATE_TRACE_POINTS
718 ++
719 ++#include "sched.h"
720 ++
721 ++#include "pelt.h"
722 ++
723 ++#include "../../io_uring/io-wq.h"
724 ++#include "../smpboot.h"
725 ++
726 ++/*
727 ++ * Export tracepoints that act as a bare tracehook (ie: have no trace event
728 ++ * associated with them) to allow external modules to probe them.
729 ++ */
730 ++EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
731 ++
732 ++#ifdef CONFIG_SCHED_DEBUG
733 ++#define sched_feat(x) (1)
734 ++/*
735 ++ * Print a warning if need_resched is set for the given duration (if
736 ++ * LATENCY_WARN is enabled).
737 ++ *
738 ++ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
739 ++ * per boot.
740 ++ */
741 ++__read_mostly int sysctl_resched_latency_warn_ms = 100;
742 ++__read_mostly int sysctl_resched_latency_warn_once = 1;
743 ++#else
744 ++#define sched_feat(x) (0)
745 ++#endif /* CONFIG_SCHED_DEBUG */
746 ++
747 ++#define ALT_SCHED_VERSION "v6.2-r0"
748 ++
749 ++/* rt_prio(prio) defined in include/linux/sched/rt.h */
750 ++#define rt_task(p) rt_prio((p)->prio)
751 ++#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR)
752 ++#define task_has_rt_policy(p) (rt_policy((p)->policy))
753 ++
754 ++#define STOP_PRIO (MAX_RT_PRIO - 1)
755 ++
756 ++/* Default time slice is 4 ms; it can be set via the kernel parameter "sched_timeslice" */
757 ++u64 sched_timeslice_ns __read_mostly = (4 << 20);
758 ++
759 ++static inline void requeue_task(struct task_struct *p, struct rq *rq, int idx);
760 ++
761 ++#ifdef CONFIG_SCHED_BMQ
762 ++#include "bmq.h"
763 ++#endif
764 ++#ifdef CONFIG_SCHED_PDS
765 ++#include "pds.h"
766 ++#endif
767 ++
768 ++struct affinity_context {
769 ++ const struct cpumask *new_mask;
770 ++ struct cpumask *user_mask;
771 ++ unsigned int flags;
772 ++};
773 ++
774 ++static int __init sched_timeslice(char *str)
775 ++{
776 ++ int timeslice_ms;
777 ++
778 ++ get_option(&str, &timeslice_ms);
779 ++ if (2 != timeslice_ms)
780 ++ timeslice_ms = 4;
781 ++ sched_timeslice_ns = timeslice_ms << 20;
782 ++ sched_timeslice_imp(timeslice_ms);
783 ++
784 ++ return 0;
785 ++}
786 ++early_param("sched_timeslice", sched_timeslice);
787 ++
788 ++/* Reschedule if less than this much time (~100 μs) is left */
789 ++#define RESCHED_NS (100 << 10)
790 ++
791 ++/**
792 ++ * sched_yield_type - Choose what sort of yield sched_yield will perform.
793 ++ * 0: No yield.
794 ++ * 1: Deboost and requeue task. (default)
795 ++ * 2: Set rq skip task.
796 ++ */
797 ++int sched_yield_type __read_mostly = 1;
798 ++
799 ++#ifdef CONFIG_SMP
800 ++static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp;
801 ++
802 ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_topo_masks);
803 ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask);
804 ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_topo_end_mask);
805 ++
806 ++#ifdef CONFIG_SCHED_SMT
807 ++DEFINE_STATIC_KEY_FALSE(sched_smt_present);
808 ++EXPORT_SYMBOL_GPL(sched_smt_present);
809 ++#endif
810 ++
811 ++/*
812 ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of
813 ++ * the domain), this allows us to quickly tell if two cpus are in the same cache
814 ++ * domain, see cpus_share_cache().
815 ++ */
816 ++DEFINE_PER_CPU(int, sd_llc_id);
817 ++#endif /* CONFIG_SMP */
818 ++
819 ++static DEFINE_MUTEX(sched_hotcpu_mutex);
820 ++
821 ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
822 ++
823 ++#ifndef prepare_arch_switch
824 ++# define prepare_arch_switch(next) do { } while (0)
825 ++#endif
826 ++#ifndef finish_arch_post_lock_switch
827 ++# define finish_arch_post_lock_switch() do { } while (0)
828 ++#endif
829 ++
830 ++#ifdef CONFIG_SCHED_SMT
831 ++static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp;
832 ++#endif
833 ++static cpumask_t sched_preempt_mask[SCHED_QUEUE_BITS] ____cacheline_aligned_in_smp;
834 ++static cpumask_t *const sched_idle_mask = &sched_preempt_mask[0];
835 ++
836 ++/* task function */
837 ++static inline const struct cpumask *task_user_cpus(struct task_struct *p)
838 ++{
839 ++ if (!p->user_cpus_ptr)
840 ++ return cpu_possible_mask; /* &init_task.cpus_mask */
841 ++ return p->user_cpus_ptr;
842 ++}
843 ++
844 ++/* sched_queue related functions */
845 ++static inline void sched_queue_init(struct sched_queue *q)
846 ++{
847 ++ int i;
848 ++
849 ++ bitmap_zero(q->bitmap, SCHED_QUEUE_BITS);
850 ++ for(i = 0; i < SCHED_BITS; i++)
851 ++ INIT_LIST_HEAD(&q->heads[i]);
852 ++}
853 ++
854 ++/*
855 ++ * Init the idle task and put it into the queue structure of the rq
856 ++ * IMPORTANT: may be called multiple times for a single cpu
857 ++ */
858 ++static inline void sched_queue_init_idle(struct sched_queue *q,
859 ++ struct task_struct *idle)
860 ++{
861 ++ idle->sq_idx = IDLE_TASK_SCHED_PRIO;
862 ++ INIT_LIST_HEAD(&q->heads[idle->sq_idx]);
863 ++ list_add(&idle->sq_node, &q->heads[idle->sq_idx]);
864 ++}
865 ++
866 ++static inline void
867 ++clear_recorded_preempt_mask(int pr, int low, int high, int cpu)
868 ++{
869 ++ if (low < pr && pr <= high)
870 ++ cpumask_clear_cpu(cpu, sched_preempt_mask + SCHED_QUEUE_BITS - pr);
871 ++}
872 ++
873 ++static inline void
874 ++set_recorded_preempt_mask(int pr, int low, int high, int cpu)
875 ++{
876 ++ if (low < pr && pr <= high)
877 ++ cpumask_set_cpu(cpu, sched_preempt_mask + SCHED_QUEUE_BITS - pr);
878 ++}
879 ++
880 ++static atomic_t sched_prio_record = ATOMIC_INIT(0);
881 ++
882 ++/* water mark related functions */
883 ++static inline void update_sched_preempt_mask(struct rq *rq)
884 ++{
885 ++ unsigned long prio = find_first_bit(rq->queue.bitmap, SCHED_QUEUE_BITS);
886 ++ unsigned long last_prio = rq->prio;
887 ++ int cpu, pr;
888 ++
889 ++ if (prio == last_prio)
890 ++ return;
891 ++
892 ++ rq->prio = prio;
893 ++ cpu = cpu_of(rq);
894 ++ pr = atomic_read(&sched_prio_record);
895 ++
896 ++ if (prio < last_prio) {
897 ++ if (IDLE_TASK_SCHED_PRIO == last_prio) {
898 ++ cpumask_clear_cpu(cpu, sched_idle_mask);
899 ++ last_prio -= 2;
900 ++#ifdef CONFIG_SCHED_SMT
901 ++ if (static_branch_likely(&sched_smt_present))
902 ++ cpumask_andnot(&sched_sg_idle_mask,
903 ++ &sched_sg_idle_mask, cpu_smt_mask(cpu));
904 ++#endif
905 ++ }
906 ++ clear_recorded_preempt_mask(pr, prio, last_prio, cpu);
907 ++
908 ++ return;
909 ++ }
910 ++ /* last_prio < prio */
911 ++ if (IDLE_TASK_SCHED_PRIO == prio) {
912 ++ cpumask_set_cpu(cpu, sched_idle_mask);
913 ++ prio -= 2;
914 ++#ifdef CONFIG_SCHED_SMT
915 ++ if (static_branch_likely(&sched_smt_present)) {
916 ++ cpumask_t tmp;
917 ++
918 ++ cpumask_and(&tmp, cpu_smt_mask(cpu), sched_idle_mask);
919 ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu)))
920 ++ cpumask_or(&sched_sg_idle_mask,
921 ++ &sched_sg_idle_mask, cpu_smt_mask(cpu));
922 ++ }
923 ++#endif
924 ++ }
925 ++ set_recorded_preempt_mask(pr, last_prio, prio, cpu);
926 ++}
927 ++
928 ++/*
929 ++ * This routine assumes that the idle task is always in the queue
930 ++ */
931 ++static inline struct task_struct *sched_rq_first_task(struct rq *rq)
932 ++{
933 ++ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_QUEUE_BITS);
934 ++ const struct list_head *head = &rq->queue.heads[sched_prio2idx(idx, rq)];
935 ++
936 ++ return list_first_entry(head, struct task_struct, sq_node);
937 ++}
938 ++
939 ++static inline struct task_struct *
940 ++sched_rq_next_task(struct task_struct *p, struct rq *rq)
941 ++{
942 ++ unsigned long idx = p->sq_idx;
943 ++ struct list_head *head = &rq->queue.heads[idx];
944 ++
945 ++ if (list_is_last(&p->sq_node, head)) {
946 ++ idx = find_next_bit(rq->queue.bitmap, SCHED_QUEUE_BITS,
947 ++ sched_idx2prio(idx, rq) + 1);
948 ++ head = &rq->queue.heads[sched_prio2idx(idx, rq)];
949 ++
950 ++ return list_first_entry(head, struct task_struct, sq_node);
951 ++ }
952 ++
953 ++ return list_next_entry(p, sq_node);
954 ++}
955 ++
956 ++static inline struct task_struct *rq_runnable_task(struct rq *rq)
957 ++{
958 ++ struct task_struct *next = sched_rq_first_task(rq);
959 ++
960 ++ if (unlikely(next == rq->skip))
961 ++ next = sched_rq_next_task(next, rq);
962 ++
963 ++ return next;
964 ++}
965 ++
966 ++/*
967 ++ * Serialization rules:
968 ++ *
969 ++ * Lock order:
970 ++ *
971 ++ * p->pi_lock
972 ++ * rq->lock
973 ++ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
974 ++ *
975 ++ * rq1->lock
976 ++ * rq2->lock where: rq1 < rq2
977 ++ *
978 ++ * Regular state:
979 ++ *
980 ++ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
981 ++ * local CPU's rq->lock, it optionally removes the task from the runqueue and
982 ++ * always looks at the local rq data structures to find the most eligible task
983 ++ * to run next.
984 ++ *
985 ++ * Task enqueue is also under rq->lock, possibly taken from another CPU.
986 ++ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
987 ++ * the local CPU to avoid bouncing the runqueue state around [ see
988 ++ * ttwu_queue_wakelist() ]
989 ++ *
990 ++ * Task wakeup, specifically wakeups that involve migration, are horribly
991 ++ * complicated to avoid having to take two rq->locks.
992 ++ *
993 ++ * Special state:
994 ++ *
995 ++ * System-calls and anything external will use task_rq_lock() which acquires
996 ++ * both p->pi_lock and rq->lock. As a consequence the state they change is
997 ++ * stable while holding either lock:
998 ++ *
999 ++ * - sched_setaffinity()/
1000 ++ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
1001 ++ * - set_user_nice(): p->se.load, p->*prio
1002 ++ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
1003 ++ * p->se.load, p->rt_priority,
1004 ++ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
1005 ++ * - sched_setnuma(): p->numa_preferred_nid
1006 ++ * - sched_move_task(): p->sched_task_group
1007 ++ * - uclamp_update_active() p->uclamp*
1008 ++ *
1009 ++ * p->state <- TASK_*:
1010 ++ *
1011 ++ * is changed locklessly using set_current_state(), __set_current_state() or
1012 ++ * set_special_state(), see their respective comments, or by
1013 ++ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
1014 ++ * concurrent self.
1015 ++ *
1016 ++ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
1017 ++ *
1018 ++ * is set by activate_task() and cleared by deactivate_task(), under
1019 ++ * rq->lock. Non-zero indicates the task is runnable, the special
1020 ++ * ON_RQ_MIGRATING state is used for migration without holding both
1021 ++ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
1022 ++ *
1023 ++ * p->on_cpu <- { 0, 1 }:
1024 ++ *
1025 ++ * is set by prepare_task() and cleared by finish_task() such that it will be
1026 ++ * set before p is scheduled-in and cleared after p is scheduled-out, both
1027 ++ * under rq->lock. Non-zero indicates the task is running on its CPU.
1028 ++ *
1029 ++ * [ The astute reader will observe that it is possible for two tasks on one
1030 ++ * CPU to have ->on_cpu = 1 at the same time. ]
1031 ++ *
1032 ++ * task_cpu(p): is changed by set_task_cpu(), the rules are:
1033 ++ *
1034 ++ * - Don't call set_task_cpu() on a blocked task:
1035 ++ *
1036 ++ * We don't care what CPU we're not running on, this simplifies hotplug,
1037 ++ * the CPU assignment of blocked tasks isn't required to be valid.
1038 ++ *
1039 ++ * - for try_to_wake_up(), called under p->pi_lock:
1040 ++ *
1041 ++ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
1042 ++ *
1043 ++ * - for migration called under rq->lock:
1044 ++ * [ see task_on_rq_migrating() in task_rq_lock() ]
1045 ++ *
1046 ++ * o move_queued_task()
1047 ++ * o detach_task()
1048 ++ *
1049 ++ * - for migration called under double_rq_lock():
1050 ++ *
1051 ++ * o __migrate_swap_task()
1052 ++ * o push_rt_task() / pull_rt_task()
1053 ++ * o push_dl_task() / pull_dl_task()
1054 ++ * o dl_task_offline_migration()
1055 ++ *
1056 ++ */
1057 ++
1058 ++/*
1059 ++ * Context: p->pi_lock
1060 ++ */
1061 ++static inline struct rq
1062 ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock)
1063 ++{
1064 ++ struct rq *rq;
1065 ++ for (;;) {
1066 ++ rq = task_rq(p);
1067 ++ if (p->on_cpu || task_on_rq_queued(p)) {
1068 ++ raw_spin_lock(&rq->lock);
1069 ++ if (likely((p->on_cpu || task_on_rq_queued(p))
1070 ++ && rq == task_rq(p))) {
1071 ++ *plock = &rq->lock;
1072 ++ return rq;
1073 ++ }
1074 ++ raw_spin_unlock(&rq->lock);
1075 ++ } else if (task_on_rq_migrating(p)) {
1076 ++ do {
1077 ++ cpu_relax();
1078 ++ } while (unlikely(task_on_rq_migrating(p)));
1079 ++ } else {
1080 ++ *plock = NULL;
1081 ++ return rq;
1082 ++ }
1083 ++ }
1084 ++}
1085 ++
1086 ++static inline void
1087 ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock)
1088 ++{
1089 ++ if (NULL != lock)
1090 ++ raw_spin_unlock(lock);
1091 ++}
1092 ++
1093 ++static inline struct rq
1094 ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock,
1095 ++ unsigned long *flags)
1096 ++{
1097 ++ struct rq *rq;
1098 ++ for (;;) {
1099 ++ rq = task_rq(p);
1100 ++ if (p->on_cpu || task_on_rq_queued(p)) {
1101 ++ raw_spin_lock_irqsave(&rq->lock, *flags);
1102 ++ if (likely((p->on_cpu || task_on_rq_queued(p))
1103 ++ && rq == task_rq(p))) {
1104 ++ *plock = &rq->lock;
1105 ++ return rq;
1106 ++ }
1107 ++ raw_spin_unlock_irqrestore(&rq->lock, *flags);
1108 ++ } else if (task_on_rq_migrating(p)) {
1109 ++ do {
1110 ++ cpu_relax();
1111 ++ } while (unlikely(task_on_rq_migrating(p)));
1112 ++ } else {
1113 ++ raw_spin_lock_irqsave(&p->pi_lock, *flags);
1114 ++ if (likely(!p->on_cpu && !p->on_rq &&
1115 ++ rq == task_rq(p))) {
1116 ++ *plock = &p->pi_lock;
1117 ++ return rq;
1118 ++ }
1119 ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1120 ++ }
1121 ++ }
1122 ++}
1123 ++
1124 ++static inline void
1125 ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock,
1126 ++ unsigned long *flags)
1127 ++{
1128 ++ raw_spin_unlock_irqrestore(lock, *flags);
1129 ++}
1130 ++
1131 ++/*
1132 ++ * __task_rq_lock - lock the rq @p resides on.
1133 ++ */
1134 ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1135 ++ __acquires(rq->lock)
1136 ++{
1137 ++ struct rq *rq;
1138 ++
1139 ++ lockdep_assert_held(&p->pi_lock);
1140 ++
1141 ++ for (;;) {
1142 ++ rq = task_rq(p);
1143 ++ raw_spin_lock(&rq->lock);
1144 ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
1145 ++ return rq;
1146 ++ raw_spin_unlock(&rq->lock);
1147 ++
1148 ++ while (unlikely(task_on_rq_migrating(p)))
1149 ++ cpu_relax();
1150 ++ }
1151 ++}
1152 ++
1153 ++/*
1154 ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
1155 ++ */
1156 ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1157 ++ __acquires(p->pi_lock)
1158 ++ __acquires(rq->lock)
1159 ++{
1160 ++ struct rq *rq;
1161 ++
1162 ++ for (;;) {
1163 ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
1164 ++ rq = task_rq(p);
1165 ++ raw_spin_lock(&rq->lock);
1166 ++ /*
1167 ++ * move_queued_task() task_rq_lock()
1168 ++ *
1169 ++ * ACQUIRE (rq->lock)
1170 ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
1171 ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
1172 ++ * [S] ->cpu = new_cpu [L] task_rq()
1173 ++ * [L] ->on_rq
1174 ++ * RELEASE (rq->lock)
1175 ++ *
1176 ++ * If we observe the old CPU in task_rq_lock(), the acquire of
1177 ++ * the old rq->lock will fully serialize against the stores.
1178 ++ *
1179 ++ * If we observe the new CPU in task_rq_lock(), the address
1180 ++ * dependency headed by '[L] rq = task_rq()' and the acquire
1181 ++ * will pair with the WMB to ensure we then also see migrating.
1182 ++ */
1183 ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
1184 ++ return rq;
1185 ++ }
1186 ++ raw_spin_unlock(&rq->lock);
1187 ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
1188 ++
1189 ++ while (unlikely(task_on_rq_migrating(p)))
1190 ++ cpu_relax();
1191 ++ }
1192 ++}
1193 ++
1194 ++static inline void
1195 ++rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
1196 ++ __acquires(rq->lock)
1197 ++{
1198 ++ raw_spin_lock_irqsave(&rq->lock, rf->flags);
1199 ++}
1200 ++
1201 ++static inline void
1202 ++rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
1203 ++ __releases(rq->lock)
1204 ++{
1205 ++ raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
1206 ++}
1207 ++
1208 ++void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
1209 ++{
1210 ++ raw_spinlock_t *lock;
1211 ++
1212 ++ /* Matches synchronize_rcu() in __sched_core_enable() */
1213 ++ preempt_disable();
1214 ++
1215 ++ for (;;) {
1216 ++ lock = __rq_lockp(rq);
1217 ++ raw_spin_lock_nested(lock, subclass);
1218 ++ if (likely(lock == __rq_lockp(rq))) {
1219 ++ /* preempt_count *MUST* be > 1 */
1220 ++ preempt_enable_no_resched();
1221 ++ return;
1222 ++ }
1223 ++ raw_spin_unlock(lock);
1224 ++ }
1225 ++}
1226 ++
1227 ++void raw_spin_rq_unlock(struct rq *rq)
1228 ++{
1229 ++ raw_spin_unlock(rq_lockp(rq));
1230 ++}
1231 ++
1232 ++/*
1233 ++ * RQ-clock updating methods:
1234 ++ */
1235 ++
1236 ++static void update_rq_clock_task(struct rq *rq, s64 delta)
1237 ++{
1238 ++/*
1239 ++ * In theory, the compile should just see 0 here, and optimize out the call
1240 ++ * to sched_rt_avg_update. But I don't trust it...
1241 ++ */
1242 ++ s64 __maybe_unused steal = 0, irq_delta = 0;
1243 ++
1244 ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1245 ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1246 ++
1247 ++ /*
1248 ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into
1249 ++ * this case when a previous update_rq_clock() happened inside a
1250 ++ * {soft,}irq region.
1251 ++ *
1252 ++ * When this happens, we stop ->clock_task and only update the
1253 ++ * prev_irq_time stamp to account for the part that fit, so that a next
1254 ++ * update will consume the rest. This ensures ->clock_task is
1255 ++ * monotonic.
1256 ++ *
1257 ++ * It does however cause some slight miss-attribution of {soft,}irq
1258 ++ * time, a more accurate solution would be to update the irq_time using
1259 ++ * the current rq->clock timestamp, except that would require using
1260 ++ * atomic ops.
1261 ++ */
1262 ++ if (irq_delta > delta)
1263 ++ irq_delta = delta;
1264 ++
1265 ++ rq->prev_irq_time += irq_delta;
1266 ++ delta -= irq_delta;
1267 ++ psi_account_irqtime(rq->curr, irq_delta);
1268 ++#endif
1269 ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1270 ++ if (static_key_false((&paravirt_steal_rq_enabled))) {
1271 ++ steal = paravirt_steal_clock(cpu_of(rq));
1272 ++ steal -= rq->prev_steal_time_rq;
1273 ++
1274 ++ if (unlikely(steal > delta))
1275 ++ steal = delta;
1276 ++
1277 ++ rq->prev_steal_time_rq += steal;
1278 ++ delta -= steal;
1279 ++ }
1280 ++#endif
1281 ++
1282 ++ rq->clock_task += delta;
1283 ++
1284 ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
1285 ++ if ((irq_delta + steal))
1286 ++ update_irq_load_avg(rq, irq_delta + steal);
1287 ++#endif
1288 ++}
1289 ++
1290 ++static inline void update_rq_clock(struct rq *rq)
1291 ++{
1292 ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
1293 ++
1294 ++ if (unlikely(delta <= 0))
1295 ++ return;
1296 ++ rq->clock += delta;
1297 ++ update_rq_time_edge(rq);
1298 ++ update_rq_clock_task(rq, delta);
1299 ++}
1300 ++
1301 ++/*
1302 ++ * RQ Load update routine
1303 ++ */
1304 ++#define RQ_LOAD_HISTORY_BITS (sizeof(s32) * 8ULL)
1305 ++#define RQ_UTIL_SHIFT (8)
1306 ++#define RQ_LOAD_HISTORY_TO_UTIL(l) (((l) >> (RQ_LOAD_HISTORY_BITS - 1 - RQ_UTIL_SHIFT)) & 0xff)
1307 ++
1308 ++#define LOAD_BLOCK(t) ((t) >> 17)
1309 ++#define LOAD_HALF_BLOCK(t) ((t) >> 16)
1310 ++#define BLOCK_MASK(t) ((t) & ((0x01 << 18) - 1))
1311 ++#define LOAD_BLOCK_BIT(b) (1UL << (RQ_LOAD_HISTORY_BITS - 1 - (b)))
1312 ++#define CURRENT_LOAD_BIT LOAD_BLOCK_BIT(0)
1313 ++
1314 ++static inline void rq_load_update(struct rq *rq)
1315 ++{
1316 ++ u64 time = rq->clock;
1317 ++ u64 delta = min(LOAD_BLOCK(time) - LOAD_BLOCK(rq->load_stamp),
1318 ++ RQ_LOAD_HISTORY_BITS - 1);
1319 ++ u64 prev = !!(rq->load_history & CURRENT_LOAD_BIT);
1320 ++ u64 curr = !!rq->nr_running;
1321 ++
1322 ++ if (delta) {
1323 ++ rq->load_history = rq->load_history >> delta;
1324 ++
1325 ++ if (delta < RQ_UTIL_SHIFT) {
1326 ++ rq->load_block += (~BLOCK_MASK(rq->load_stamp)) * prev;
1327 ++ if (!!LOAD_HALF_BLOCK(rq->load_block) ^ curr)
1328 ++ rq->load_history ^= LOAD_BLOCK_BIT(delta);
1329 ++ }
1330 ++
1331 ++ rq->load_block = BLOCK_MASK(time) * prev;
1332 ++ } else {
1333 ++ rq->load_block += (time - rq->load_stamp) * prev;
1334 ++ }
1335 ++ if (prev ^ curr)
1336 ++ rq->load_history ^= CURRENT_LOAD_BIT;
1337 ++ rq->load_stamp = time;
1338 ++}
1339 ++
1340 ++unsigned long rq_load_util(struct rq *rq, unsigned long max)
1341 ++{
1342 ++ return RQ_LOAD_HISTORY_TO_UTIL(rq->load_history) * (max >> RQ_UTIL_SHIFT);
1343 ++}
1344 ++
1345 ++#ifdef CONFIG_SMP
1346 ++unsigned long sched_cpu_util(int cpu)
1347 ++{
1348 ++ return rq_load_util(cpu_rq(cpu), arch_scale_cpu_capacity(cpu));
1349 ++}
1350 ++#endif /* CONFIG_SMP */
1351 ++
1352 ++#ifdef CONFIG_CPU_FREQ
1353 ++/**
1354 ++ * cpufreq_update_util - Take a note about CPU utilization changes.
1355 ++ * @rq: Runqueue to carry out the update for.
1356 ++ * @flags: Update reason flags.
1357 ++ *
1358 ++ * This function is called by the scheduler on the CPU whose utilization is
1359 ++ * being updated.
1360 ++ *
1361 ++ * It can only be called from RCU-sched read-side critical sections.
1362 ++ *
1363 ++ * The way cpufreq is currently arranged requires it to evaluate the CPU
1364 ++ * performance state (frequency/voltage) on a regular basis to prevent it from
1365 ++ * being stuck in a completely inadequate performance level for too long.
1366 ++ * That is not guaranteed to happen if the updates are only triggered from CFS
1367 ++ * and DL, though, because they may not be coming in if only RT tasks are
1368 ++ * active all the time (or there are RT tasks only).
1369 ++ *
1370 ++ * As a workaround for that issue, this function is called periodically by the
1371 ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
1372 ++ * but that really is a band-aid. Going forward it should be replaced with
1373 ++ * solutions targeted more specifically at RT tasks.
1374 ++ */
1375 ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
1376 ++{
1377 ++ struct update_util_data *data;
1378 ++
1379 ++#ifdef CONFIG_SMP
1380 ++ rq_load_update(rq);
1381 ++#endif
1382 ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
1383 ++ cpu_of(rq)));
1384 ++ if (data)
1385 ++ data->func(data, rq_clock(rq), flags);
1386 ++}
1387 ++#else
1388 ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
1389 ++{
1390 ++#ifdef CONFIG_SMP
1391 ++ rq_load_update(rq);
1392 ++#endif
1393 ++}
1394 ++#endif /* CONFIG_CPU_FREQ */
1395 ++
1396 ++#ifdef CONFIG_NO_HZ_FULL
1397 ++/*
1398 ++ * Tick may be needed by tasks in the runqueue depending on their policy and
1399 ++ * requirements. If tick is needed, lets send the target an IPI to kick it out
1400 ++ * of nohz mode if necessary.
1401 ++ */
1402 ++static inline void sched_update_tick_dependency(struct rq *rq)
1403 ++{
1404 ++ int cpu = cpu_of(rq);
1405 ++
1406 ++ if (!tick_nohz_full_cpu(cpu))
1407 ++ return;
1408 ++
1409 ++ if (rq->nr_running < 2)
1410 ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
1411 ++ else
1412 ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
1413 ++}
1414 ++#else /* !CONFIG_NO_HZ_FULL */
1415 ++static inline void sched_update_tick_dependency(struct rq *rq) { }
1416 ++#endif
1417 ++
1418 ++bool sched_task_on_rq(struct task_struct *p)
1419 ++{
1420 ++ return task_on_rq_queued(p);
1421 ++}
1422 ++
1423 ++unsigned long get_wchan(struct task_struct *p)
1424 ++{
1425 ++ unsigned long ip = 0;
1426 ++ unsigned int state;
1427 ++
1428 ++ if (!p || p == current)
1429 ++ return 0;
1430 ++
1431 ++ /* Only get wchan if task is blocked and we can keep it that way. */
1432 ++ raw_spin_lock_irq(&p->pi_lock);
1433 ++ state = READ_ONCE(p->__state);
1434 ++ smp_rmb(); /* see try_to_wake_up() */
1435 ++ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
1436 ++ ip = __get_wchan(p);
1437 ++ raw_spin_unlock_irq(&p->pi_lock);
1438 ++
1439 ++ return ip;
1440 ++}
1441 ++
1442 ++/*
1443 ++ * Add/Remove/Requeue task to/from the runqueue routines
1444 ++ * Context: rq->lock
1445 ++ */
1446 ++#define __SCHED_DEQUEUE_TASK(p, rq, flags) \
1447 ++ sched_info_dequeue(rq, p); \
1448 ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); \
1449 ++ \
1450 ++ list_del(&p->sq_node); \
1451 ++ if (list_empty(&rq->queue.heads[p->sq_idx])) \
1452 ++ clear_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap);
1453 ++
1454 ++#define __SCHED_ENQUEUE_TASK(p, rq, flags) \
1455 ++ sched_info_enqueue(rq, p); \
1456 ++ psi_enqueue(p, flags & ENQUEUE_WAKEUP); \
1457 ++ \
1458 ++ p->sq_idx = task_sched_prio_idx(p, rq); \
1459 ++ list_add_tail(&p->sq_node, &rq->queue.heads[p->sq_idx]); \
1460 ++ set_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap);
1461 ++
1462 ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags)
1463 ++{
1464 ++ lockdep_assert_held(&rq->lock);
1465 ++
1466 ++ /*printk(KERN_INFO "sched: dequeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/
1467 ++ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n",
1468 ++ task_cpu(p), cpu_of(rq));
1469 ++
1470 ++ __SCHED_DEQUEUE_TASK(p, rq, flags);
1471 ++ --rq->nr_running;
1472 ++#ifdef CONFIG_SMP
1473 ++ if (1 == rq->nr_running)
1474 ++ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask);
1475 ++#endif
1476 ++
1477 ++ sched_update_tick_dependency(rq);
1478 ++}
1479 ++
1480 ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags)
1481 ++{
1482 ++ lockdep_assert_held(&rq->lock);
1483 ++
1484 ++ /*printk(KERN_INFO "sched: enqueue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/
1485 ++ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n",
1486 ++ task_cpu(p), cpu_of(rq));
1487 ++
1488 ++ __SCHED_ENQUEUE_TASK(p, rq, flags);
1489 ++ update_sched_preempt_mask(rq);
1490 ++ ++rq->nr_running;
1491 ++#ifdef CONFIG_SMP
1492 ++ if (2 == rq->nr_running)
1493 ++ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask);
1494 ++#endif
1495 ++
1496 ++ sched_update_tick_dependency(rq);
1497 ++}
1498 ++
1499 ++static inline void requeue_task(struct task_struct *p, struct rq *rq, int idx)
1500 ++{
1501 ++ lockdep_assert_held(&rq->lock);
1502 ++ /*printk(KERN_INFO "sched: requeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/
1503 ++ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n",
1504 ++ cpu_of(rq), task_cpu(p));
1505 ++
1506 ++ list_del(&p->sq_node);
1507 ++ list_add_tail(&p->sq_node, &rq->queue.heads[idx]);
1508 ++ if (idx != p->sq_idx) {
1509 ++ if (list_empty(&rq->queue.heads[p->sq_idx]))
1510 ++ clear_bit(sched_idx2prio(p->sq_idx, rq),
1511 ++ rq->queue.bitmap);
1512 ++ p->sq_idx = idx;
1513 ++ set_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap);
1514 ++ update_sched_preempt_mask(rq);
1515 ++ }
1516 ++}
1517 ++
1518 ++/*
1519 ++ * cmpxchg based fetch_or, macro so it works for different integer types
1520 ++ */
1521 ++#define fetch_or(ptr, mask) \
1522 ++ ({ \
1523 ++ typeof(ptr) _ptr = (ptr); \
1524 ++ typeof(mask) _mask = (mask); \
1525 ++ typeof(*_ptr) _val = *_ptr; \
1526 ++ \
1527 ++ do { \
1528 ++ } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
1529 ++ _val; \
1530 ++})
1531 ++
1532 ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
1533 ++/*
1534 ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
1535 ++ * this avoids any races wrt polling state changes and thereby avoids
1536 ++ * spurious IPIs.
1537 ++ */
1538 ++static inline bool set_nr_and_not_polling(struct task_struct *p)
1539 ++{
1540 ++ struct thread_info *ti = task_thread_info(p);
1541 ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
1542 ++}
1543 ++
1544 ++/*
1545 ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
1546 ++ *
1547 ++ * If this returns true, then the idle task promises to call
1548 ++ * sched_ttwu_pending() and reschedule soon.
1549 ++ */
1550 ++static bool set_nr_if_polling(struct task_struct *p)
1551 ++{
1552 ++ struct thread_info *ti = task_thread_info(p);
1553 ++ typeof(ti->flags) val = READ_ONCE(ti->flags);
1554 ++
1555 ++ for (;;) {
1556 ++ if (!(val & _TIF_POLLING_NRFLAG))
1557 ++ return false;
1558 ++ if (val & _TIF_NEED_RESCHED)
1559 ++ return true;
1560 ++ if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
1561 ++ break;
1562 ++ }
1563 ++ return true;
1564 ++}
1565 ++
1566 ++#else
1567 ++static inline bool set_nr_and_not_polling(struct task_struct *p)
1568 ++{
1569 ++ set_tsk_need_resched(p);
1570 ++ return true;
1571 ++}
1572 ++
1573 ++#ifdef CONFIG_SMP
1574 ++static inline bool set_nr_if_polling(struct task_struct *p)
1575 ++{
1576 ++ return false;
1577 ++}
1578 ++#endif
1579 ++#endif
1580 ++
1581 ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
1582 ++{
1583 ++ struct wake_q_node *node = &task->wake_q;
1584 ++
1585 ++ /*
1586 ++ * Atomically grab the task, if ->wake_q is !nil already it means
1587 ++ * it's already queued (either by us or someone else) and will get the
1588 ++ * wakeup due to that.
1589 ++ *
1590 ++ * In order to ensure that a pending wakeup will observe our pending
1591 ++ * state, even in the failed case, an explicit smp_mb() must be used.
1592 ++ */
1593 ++ smp_mb__before_atomic();
1594 ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
1595 ++ return false;
1596 ++
1597 ++ /*
1598 ++ * The head is context local, there can be no concurrency.
1599 ++ */
1600 ++ *head->lastp = node;
1601 ++ head->lastp = &node->next;
1602 ++ return true;
1603 ++}
1604 ++
1605 ++/**
1606 ++ * wake_q_add() - queue a wakeup for 'later' waking.
1607 ++ * @head: the wake_q_head to add @task to
1608 ++ * @task: the task to queue for 'later' wakeup
1609 ++ *
1610 ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
1611 ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
1612 ++ * instantly.
1613 ++ *
1614 ++ * This function must be used as-if it were wake_up_process(); IOW the task
1615 ++ * must be ready to be woken at this location.
1616 ++ */
1617 ++void wake_q_add(struct wake_q_head *head, struct task_struct *task)
1618 ++{
1619 ++ if (__wake_q_add(head, task))
1620 ++ get_task_struct(task);
1621 ++}
1622 ++
1623 ++/**
1624 ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
1625 ++ * @head: the wake_q_head to add @task to
1626 ++ * @task: the task to queue for 'later' wakeup
1627 ++ *
1628 ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
1629 ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
1630 ++ * instantly.
1631 ++ *
1632 ++ * This function must be used as-if it were wake_up_process(); IOW the task
1633 ++ * must be ready to be woken at this location.
1634 ++ *
1635 ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
1636 ++ * that already hold reference to @task can call the 'safe' version and trust
1637 ++ * wake_q to do the right thing depending whether or not the @task is already
1638 ++ * queued for wakeup.
1639 ++ */
1640 ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
1641 ++{
1642 ++ if (!__wake_q_add(head, task))
1643 ++ put_task_struct(task);
1644 ++}
1645 ++
1646 ++void wake_up_q(struct wake_q_head *head)
1647 ++{
1648 ++ struct wake_q_node *node = head->first;
1649 ++
1650 ++ while (node != WAKE_Q_TAIL) {
1651 ++ struct task_struct *task;
1652 ++
1653 ++ task = container_of(node, struct task_struct, wake_q);
1654 ++ /* task can safely be re-inserted now: */
1655 ++ node = node->next;
1656 ++ task->wake_q.next = NULL;
1657 ++
1658 ++ /*
1659 ++ * wake_up_process() executes a full barrier, which pairs with
1660 ++ * the queueing in wake_q_add() so as not to miss wakeups.
1661 ++ */
1662 ++ wake_up_process(task);
1663 ++ put_task_struct(task);
1664 ++ }
1665 ++}
1666 ++
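The wake_q code above batches deferred wakeups on a context-local singly linked list: nodes are terminated by the WAKE_Q_TAIL sentinel, lastp always points at the slot the next node is linked into, and wake_up_q() later walks the list, clears each node and wakes the task. A stripped-down, single-threaded sketch of just those list mechanics (the cmpxchg-based add-once check, the task refcounting and the real wake_up_process() call are left out; every demo_* name is invented):

#include <stdio.h>

#define DEMO_Q_TAIL ((struct demo_node *)0x1)	/* sentinel, like WAKE_Q_TAIL */

struct demo_node {
	struct demo_node *next;
	int id;					/* stands in for the task */
};

struct demo_head {
	struct demo_node *first;
	struct demo_node **lastp;
};

static void demo_init(struct demo_head *h)
{
	h->first = DEMO_Q_TAIL;
	h->lastp = &h->first;
}

static int demo_add(struct demo_head *h, struct demo_node *n)
{
	if (n->next)				/* already queued somewhere */
		return 0;
	n->next = DEMO_Q_TAIL;
	*h->lastp = n;				/* link into the tail slot */
	h->lastp = &n->next;
	return 1;
}

static void demo_flush(struct demo_head *h)
{
	struct demo_node *n = h->first;

	while (n != DEMO_Q_TAIL) {
		struct demo_node *next = n->next;

		n->next = NULL;			/* node may be reused afterwards */
		printf("wake task %d\n", n->id);	/* wake_up_process() goes here */
		n = next;
	}
	demo_init(h);
}

int main(void)
{
	struct demo_head q;
	struct demo_node a = { .id = 1 }, b = { .id = 2 };

	demo_init(&q);
	demo_add(&q, &a);
	demo_add(&q, &b);
	demo_flush(&q);
	return 0;
}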
1667 ++/*
1668 ++ * resched_curr - mark rq's current task 'to be rescheduled now'.
1669 ++ *
1670 ++ * On UP this means the setting of the need_resched flag, on SMP it
1671 ++ * might also involve a cross-CPU call to trigger the scheduler on
1672 ++ * the target CPU.
1673 ++ */
1674 ++void resched_curr(struct rq *rq)
1675 ++{
1676 ++ struct task_struct *curr = rq->curr;
1677 ++ int cpu;
1678 ++
1679 ++ lockdep_assert_held(&rq->lock);
1680 ++
1681 ++ if (test_tsk_need_resched(curr))
1682 ++ return;
1683 ++
1684 ++ cpu = cpu_of(rq);
1685 ++ if (cpu == smp_processor_id()) {
1686 ++ set_tsk_need_resched(curr);
1687 ++ set_preempt_need_resched();
1688 ++ return;
1689 ++ }
1690 ++
1691 ++ if (set_nr_and_not_polling(curr))
1692 ++ smp_send_reschedule(cpu);
1693 ++ else
1694 ++ trace_sched_wake_idle_without_ipi(cpu);
1695 ++}
1696 ++
1697 ++void resched_cpu(int cpu)
1698 ++{
1699 ++ struct rq *rq = cpu_rq(cpu);
1700 ++ unsigned long flags;
1701 ++
1702 ++ raw_spin_lock_irqsave(&rq->lock, flags);
1703 ++ if (cpu_online(cpu) || cpu == smp_processor_id())
1704 ++ resched_curr(cpu_rq(cpu));
1705 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
1706 ++}
1707 ++
1708 ++#ifdef CONFIG_SMP
1709 ++#ifdef CONFIG_NO_HZ_COMMON
1710 ++void nohz_balance_enter_idle(int cpu) {}
1711 ++
1712 ++void select_nohz_load_balancer(int stop_tick) {}
1713 ++
1714 ++void set_cpu_sd_state_idle(void) {}
1715 ++
1716 ++/*
1717 ++ * In the semi idle case, use the nearest busy CPU for migrating timers
1718 ++ * from an idle CPU. This is good for power-savings.
1719 ++ *
1720 ++ * We don't do a similar optimization for a completely idle system, as
1721 ++ * selecting an idle CPU would add more delay to the timers than intended
1722 ++ * (as that CPU's timer base may not be up to date wrt jiffies etc.).
1723 ++ */
1724 ++int get_nohz_timer_target(void)
1725 ++{
1726 ++ int i, cpu = smp_processor_id(), default_cpu = -1;
1727 ++ struct cpumask *mask;
1728 ++ const struct cpumask *hk_mask;
1729 ++
1730 ++ if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
1731 ++ if (!idle_cpu(cpu))
1732 ++ return cpu;
1733 ++ default_cpu = cpu;
1734 ++ }
1735 ++
1736 ++ hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
1737 ++
1738 ++ for (mask = per_cpu(sched_cpu_topo_masks, cpu) + 1;
1739 ++ mask < per_cpu(sched_cpu_topo_end_mask, cpu); mask++)
1740 ++ for_each_cpu_and(i, mask, hk_mask)
1741 ++ if (!idle_cpu(i))
1742 ++ return i;
1743 ++
1744 ++ if (default_cpu == -1)
1745 ++ default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
1746 ++ cpu = default_cpu;
1747 ++
1748 ++ return cpu;
1749 ++}
1750 ++
1751 ++/*
1752 ++ * When add_timer_on() enqueues a timer into the timer wheel of an
1753 ++ * idle CPU then this timer might expire before the next timer event
1754 ++ * which is scheduled to wake up that CPU. In case of a completely
1755 ++ * idle system the next event might even be infinite time into the
1756 ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1757 ++ * leaves the inner idle loop so the newly added timer is taken into
1758 ++ * account when the CPU goes back to idle and evaluates the timer
1759 ++ * wheel for the next timer event.
1760 ++ */
1761 ++static inline void wake_up_idle_cpu(int cpu)
1762 ++{
1763 ++ struct rq *rq = cpu_rq(cpu);
1764 ++
1765 ++ if (cpu == smp_processor_id())
1766 ++ return;
1767 ++
1768 ++ if (set_nr_and_not_polling(rq->idle))
1769 ++ smp_send_reschedule(cpu);
1770 ++ else
1771 ++ trace_sched_wake_idle_without_ipi(cpu);
1772 ++}
1773 ++
1774 ++static inline bool wake_up_full_nohz_cpu(int cpu)
1775 ++{
1776 ++ /*
1777 ++ * We just need the target to call irq_exit() and re-evaluate
1778 ++ * the next tick. The nohz full kick at least implies that.
1779 ++ * If needed we can still optimize that later with an
1780 ++ * empty IRQ.
1781 ++ */
1782 ++ if (cpu_is_offline(cpu))
1783 ++ return true; /* Don't try to wake offline CPUs. */
1784 ++ if (tick_nohz_full_cpu(cpu)) {
1785 ++ if (cpu != smp_processor_id() ||
1786 ++ tick_nohz_tick_stopped())
1787 ++ tick_nohz_full_kick_cpu(cpu);
1788 ++ return true;
1789 ++ }
1790 ++
1791 ++ return false;
1792 ++}
1793 ++
1794 ++void wake_up_nohz_cpu(int cpu)
1795 ++{
1796 ++ if (!wake_up_full_nohz_cpu(cpu))
1797 ++ wake_up_idle_cpu(cpu);
1798 ++}
1799 ++
1800 ++static void nohz_csd_func(void *info)
1801 ++{
1802 ++ struct rq *rq = info;
1803 ++ int cpu = cpu_of(rq);
1804 ++ unsigned int flags;
1805 ++
1806 ++ /*
1807 ++ * Release the rq::nohz_csd.
1808 ++ */
1809 ++ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
1810 ++ WARN_ON(!(flags & NOHZ_KICK_MASK));
1811 ++
1812 ++ rq->idle_balance = idle_cpu(cpu);
1813 ++ if (rq->idle_balance && !need_resched()) {
1814 ++ rq->nohz_idle_balance = flags;
1815 ++ raise_softirq_irqoff(SCHED_SOFTIRQ);
1816 ++ }
1817 ++}
1818 ++
1819 ++#endif /* CONFIG_NO_HZ_COMMON */
1820 ++#endif /* CONFIG_SMP */
1821 ++
1822 ++static inline void check_preempt_curr(struct rq *rq)
1823 ++{
1824 ++ if (sched_rq_first_task(rq) != rq->curr)
1825 ++ resched_curr(rq);
1826 ++}
1827 ++
1828 ++#ifdef CONFIG_SCHED_HRTICK
1829 ++/*
1830 ++ * Use HR-timers to deliver accurate preemption points.
1831 ++ */
1832 ++
1833 ++static void hrtick_clear(struct rq *rq)
1834 ++{
1835 ++ if (hrtimer_active(&rq->hrtick_timer))
1836 ++ hrtimer_cancel(&rq->hrtick_timer);
1837 ++}
1838 ++
1839 ++/*
1840 ++ * High-resolution timer tick.
1841 ++ * Runs from hardirq context with interrupts disabled.
1842 ++ */
1843 ++static enum hrtimer_restart hrtick(struct hrtimer *timer)
1844 ++{
1845 ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1846 ++
1847 ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1848 ++
1849 ++ raw_spin_lock(&rq->lock);
1850 ++ resched_curr(rq);
1851 ++ raw_spin_unlock(&rq->lock);
1852 ++
1853 ++ return HRTIMER_NORESTART;
1854 ++}
1855 ++
1856 ++/*
1857 ++ * Use hrtick when:
1858 ++ * - enabled by features
1859 ++ * - hrtimer is actually high res
1860 ++ */
1861 ++static inline int hrtick_enabled(struct rq *rq)
1862 ++{
1863 ++ /**
1864 ++ * Alt schedule FW doesn't support sched_feat yet
1865 ++ if (!sched_feat(HRTICK))
1866 ++ return 0;
1867 ++ */
1868 ++ if (!cpu_active(cpu_of(rq)))
1869 ++ return 0;
1870 ++ return hrtimer_is_hres_active(&rq->hrtick_timer);
1871 ++}
1872 ++
1873 ++#ifdef CONFIG_SMP
1874 ++
1875 ++static void __hrtick_restart(struct rq *rq)
1876 ++{
1877 ++ struct hrtimer *timer = &rq->hrtick_timer;
1878 ++ ktime_t time = rq->hrtick_time;
1879 ++
1880 ++ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
1881 ++}
1882 ++
1883 ++/*
1884 ++ * called from hardirq (IPI) context
1885 ++ */
1886 ++static void __hrtick_start(void *arg)
1887 ++{
1888 ++ struct rq *rq = arg;
1889 ++
1890 ++ raw_spin_lock(&rq->lock);
1891 ++ __hrtick_restart(rq);
1892 ++ raw_spin_unlock(&rq->lock);
1893 ++}
1894 ++
1895 ++/*
1896 ++ * Called to set the hrtick timer state.
1897 ++ *
1898 ++ * called with rq->lock held and irqs disabled
1899 ++ */
1900 ++void hrtick_start(struct rq *rq, u64 delay)
1901 ++{
1902 ++ struct hrtimer *timer = &rq->hrtick_timer;
1903 ++ s64 delta;
1904 ++
1905 ++ /*
1906 ++ * Don't schedule slices shorter than 10000ns, that just
1907 ++ * doesn't make sense and can cause timer DoS.
1908 ++ */
1909 ++ delta = max_t(s64, delay, 10000LL);
1910 ++
1911 ++ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
1912 ++
1913 ++ if (rq == this_rq())
1914 ++ __hrtick_restart(rq);
1915 ++ else
1916 ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
1917 ++}
1918 ++
1919 ++#else
1920 ++/*
1921 ++ * Called to set the hrtick timer state.
1922 ++ *
1923 ++ * called with rq->lock held and irqs disabled
1924 ++ */
1925 ++void hrtick_start(struct rq *rq, u64 delay)
1926 ++{
1927 ++ /*
1928 ++ * Don't schedule slices shorter than 10000ns, that just
1929 ++ * doesn't make sense. Rely on vruntime for fairness.
1930 ++ */
1931 ++ delay = max_t(u64, delay, 10000LL);
1932 ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
1933 ++ HRTIMER_MODE_REL_PINNED_HARD);
1934 ++}
1935 ++#endif /* CONFIG_SMP */
1936 ++
1937 ++static void hrtick_rq_init(struct rq *rq)
1938 ++{
1939 ++#ifdef CONFIG_SMP
1940 ++ INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
1941 ++#endif
1942 ++
1943 ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
1944 ++ rq->hrtick_timer.function = hrtick;
1945 ++}
1946 ++#else /* CONFIG_SCHED_HRTICK */
1947 ++static inline int hrtick_enabled(struct rq *rq)
1948 ++{
1949 ++ return 0;
1950 ++}
1951 ++
1952 ++static inline void hrtick_clear(struct rq *rq)
1953 ++{
1954 ++}
1955 ++
1956 ++static inline void hrtick_rq_init(struct rq *rq)
1957 ++{
1958 ++}
1959 ++#endif /* CONFIG_SCHED_HRTICK */
1960 ++
1961 ++static inline int __normal_prio(int policy, int rt_prio, int static_prio)
1962 ++{
1963 ++ return rt_policy(policy) ? (MAX_RT_PRIO - 1 - rt_prio) :
1964 ++ static_prio + MAX_PRIORITY_ADJ;
1965 ++}
1966 ++
1967 ++/*
1968 ++ * Calculate the expected normal priority: i.e. priority
1969 ++ * without taking RT-inheritance into account. Might be
1970 ++ * boosted by interactivity modifiers. Changes upon fork,
1971 ++ * setprio syscalls, and whenever the interactivity
1972 ++ * estimator recalculates.
1973 ++ */
1974 ++static inline int normal_prio(struct task_struct *p)
1975 ++{
1976 ++ return __normal_prio(p->policy, p->rt_priority, p->static_prio);
1977 ++}
1978 ++
1979 ++/*
1980 ++ * Calculate the current priority, i.e. the priority
1981 ++ * taken into account by the scheduler. This value might
1982 ++ * be boosted by RT tasks as it will be RT if the task got
1983 ++ * RT-boosted. If not then it returns p->normal_prio.
1984 ++ */
1985 ++static int effective_prio(struct task_struct *p)
1986 ++{
1987 ++ p->normal_prio = normal_prio(p);
1988 ++ /*
1989 ++ * If we are RT tasks or we were boosted to RT priority,
1990 ++ * keep the priority unchanged. Otherwise, update priority
1991 ++ * to the normal priority:
1992 ++ */
1993 ++ if (!rt_prio(p->prio))
1994 ++ return p->normal_prio;
1995 ++ return p->prio;
1996 ++}
1997 ++
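As a worked example of the mapping in __normal_prio() above: real-time policies invert rt_priority into the RT range, while normal tasks take static_prio shifted by MAX_PRIORITY_ADJ. The sketch below assumes the mainline value MAX_RT_PRIO == 100 and uses a placeholder value for MAX_PRIORITY_ADJ (its real definition is not shown in this hunk), so the exact numbers are illustrative only:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_MAX_RT_PRIO	100	/* mainline include/linux/sched/prio.h value */
#define DEMO_MAX_PRIORITY_ADJ	4	/* placeholder for the BMQ boost range */

static int demo_normal_prio(bool rt, int rt_prio, int static_prio)
{
	return rt ? (DEMO_MAX_RT_PRIO - 1 - rt_prio)
		  : static_prio + DEMO_MAX_PRIORITY_ADJ;
}

int main(void)
{
	/* SCHED_FIFO, rt_priority 50 -> prio 49 (lower value means higher priority) */
	printf("rt 50  -> %d\n", demo_normal_prio(true, 50, 0));
	/* SCHED_NORMAL, nice 0 -> static_prio 120, shifted by the adjust range */
	printf("nice 0 -> %d\n", demo_normal_prio(false, 0, 120));
	return 0;
}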
1998 ++/*
1999 ++ * activate_task - move a task to the runqueue.
2000 ++ *
2001 ++ * Context: rq->lock
2002 ++ */
2003 ++static void activate_task(struct task_struct *p, struct rq *rq)
2004 ++{
2005 ++ enqueue_task(p, rq, ENQUEUE_WAKEUP);
2006 ++ p->on_rq = TASK_ON_RQ_QUEUED;
2007 ++
2008 ++ /*
2009 ++ * If in_iowait is set, the code below may not trigger any cpufreq
2010 ++ * utilization updates, so do it here explicitly with the IOWAIT flag
2011 ++ * passed.
2012 ++ */
2013 ++ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT * p->in_iowait);
2014 ++}
2015 ++
2016 ++/*
2017 ++ * deactivate_task - remove a task from the runqueue.
2018 ++ *
2019 ++ * Context: rq->lock
2020 ++ */
2021 ++static inline void deactivate_task(struct task_struct *p, struct rq *rq)
2022 ++{
2023 ++ dequeue_task(p, rq, DEQUEUE_SLEEP);
2024 ++ p->on_rq = 0;
2025 ++ cpufreq_update_util(rq, 0);
2026 ++}
2027 ++
2028 ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
2029 ++{
2030 ++#ifdef CONFIG_SMP
2031 ++ /*
2032 ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be
2033 ++ * successfully executed on another CPU. We must ensure that updates of
2034 ++ * per-task data have been completed by this moment.
2035 ++ */
2036 ++ smp_wmb();
2037 ++
2038 ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu);
2039 ++#endif
2040 ++}
2041 ++
2042 ++static inline bool is_migration_disabled(struct task_struct *p)
2043 ++{
2044 ++#ifdef CONFIG_SMP
2045 ++ return p->migration_disabled;
2046 ++#else
2047 ++ return false;
2048 ++#endif
2049 ++}
2050 ++
2051 ++#define SCA_CHECK 0x01
2052 ++#define SCA_USER 0x08
2053 ++
2054 ++#ifdef CONFIG_SMP
2055 ++
2056 ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2057 ++{
2058 ++#ifdef CONFIG_SCHED_DEBUG
2059 ++ unsigned int state = READ_ONCE(p->__state);
2060 ++
2061 ++ /*
2062 ++ * We should never call set_task_cpu() on a blocked task,
2063 ++ * ttwu() will sort out the placement.
2064 ++ */
2065 ++ WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
2066 ++
2067 ++#ifdef CONFIG_LOCKDEP
2068 ++ /*
2069 ++ * The caller should hold either p->pi_lock or rq->lock, when changing
2070 ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2071 ++ *
2072 ++ * sched_move_task() holds both and thus holding either pins the cgroup,
2073 ++ * see task_group().
2074 ++ */
2075 ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2076 ++ lockdep_is_held(&task_rq(p)->lock)));
2077 ++#endif
2078 ++ /*
2079 ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
2080 ++ */
2081 ++ WARN_ON_ONCE(!cpu_online(new_cpu));
2082 ++
2083 ++ WARN_ON_ONCE(is_migration_disabled(p));
2084 ++#endif
2085 ++ trace_sched_migrate_task(p, new_cpu);
2086 ++
2087 ++ if (task_cpu(p) != new_cpu)
2088 ++ {
2089 ++ rseq_migrate(p);
2090 ++ perf_event_task_migrate(p);
2091 ++ }
2092 ++
2093 ++ __set_task_cpu(p, new_cpu);
2094 ++}
2095 ++
2096 ++#define MDF_FORCE_ENABLED 0x80
2097 ++
2098 ++static void
2099 ++__do_set_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask)
2100 ++{
2101 ++ /*
2102 ++ * This here violates the locking rules for affinity, since we're only
2103 ++ * supposed to change these variables while holding both rq->lock and
2104 ++ * p->pi_lock.
2105 ++ *
2106 ++ * HOWEVER, it magically works, because ttwu() is the only code that
2107 ++ * accesses these variables under p->pi_lock and only does so after
2108 ++ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
2109 ++ * before finish_task().
2110 ++ *
2111 ++ * XXX do further audits, this smells like something putrid.
2112 ++ */
2113 ++ SCHED_WARN_ON(!p->on_cpu);
2114 ++ p->cpus_ptr = new_mask;
2115 ++}
2116 ++
2117 ++void migrate_disable(void)
2118 ++{
2119 ++ struct task_struct *p = current;
2120 ++ int cpu;
2121 ++
2122 ++ if (p->migration_disabled) {
2123 ++ p->migration_disabled++;
2124 ++ return;
2125 ++ }
2126 ++
2127 ++ preempt_disable();
2128 ++ cpu = smp_processor_id();
2129 ++ if (cpumask_test_cpu(cpu, &p->cpus_mask)) {
2130 ++ cpu_rq(cpu)->nr_pinned++;
2131 ++ p->migration_disabled = 1;
2132 ++ p->migration_flags &= ~MDF_FORCE_ENABLED;
2133 ++
2134 ++ /*
2135 ++ * Violates locking rules! see comment in __do_set_cpus_ptr().
2136 ++ */
2137 ++ if (p->cpus_ptr == &p->cpus_mask)
2138 ++ __do_set_cpus_ptr(p, cpumask_of(cpu));
2139 ++ }
2140 ++ preempt_enable();
2141 ++}
2142 ++EXPORT_SYMBOL_GPL(migrate_disable);
2143 ++
2144 ++void migrate_enable(void)
2145 ++{
2146 ++ struct task_struct *p = current;
2147 ++
2148 ++ if (0 == p->migration_disabled)
2149 ++ return;
2150 ++
2151 ++ if (p->migration_disabled > 1) {
2152 ++ p->migration_disabled--;
2153 ++ return;
2154 ++ }
2155 ++
2156 ++ if (WARN_ON_ONCE(!p->migration_disabled))
2157 ++ return;
2158 ++
2159 ++ /*
2160 ++ * Ensure stop_task runs either before or after this, and that
2161 ++ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
2162 ++ */
2163 ++ preempt_disable();
2164 ++ /*
2165 ++ * Assumption: current should be running on allowed cpu
2166 ++	 * Assumption: current should be running on an allowed CPU
2167 ++ WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &p->cpus_mask));
2168 ++ if (p->cpus_ptr != &p->cpus_mask)
2169 ++ __do_set_cpus_ptr(p, &p->cpus_mask);
2170 ++ /*
2171 ++ * Mustn't clear migration_disabled() until cpus_ptr points back at the
2172 ++ * regular cpus_mask, otherwise things that race (eg.
2173 ++ * select_fallback_rq) get confused.
2174 ++ */
2175 ++ barrier();
2176 ++ p->migration_disabled = 0;
2177 ++ this_rq()->nr_pinned--;
2178 ++ preempt_enable();
2179 ++}
2180 ++EXPORT_SYMBOL_GPL(migrate_enable);
2181 ++
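migrate_disable()/migrate_enable() above follow a plain nesting discipline: only the outermost disable pins the task (narrowing cpus_ptr and bumping nr_pinned), inner calls merely count, and only the matching outermost enable undoes the pin. A toy single-threaded sketch of just that counting pattern (no locking, no runqueue accounting, no memory barriers; all names invented):

#include <stdio.h>

static int pin_depth;		/* stands in for p->migration_disabled */
static int pinned_cpu = -1;	/* stands in for the narrowed cpus_ptr */

static void demo_pin_disable(int this_cpu)
{
	if (pin_depth++)	/* nested call: just count it */
		return;
	pinned_cpu = this_cpu;	/* outermost call pins to this CPU */
}

static void demo_pin_enable(void)
{
	if (--pin_depth)	/* still nested: nothing to undo yet */
		return;
	pinned_cpu = -1;	/* outermost enable lifts the pin */
}

int main(void)
{
	demo_pin_disable(3);
	demo_pin_disable(3);	/* nested */
	demo_pin_enable();	/* still pinned */
	printf("after inner enable: pinned_cpu = %d\n", pinned_cpu);
	demo_pin_enable();	/* outermost: unpinned */
	printf("after outer enable: pinned_cpu = %d\n", pinned_cpu);
	return 0;
}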
2182 ++static inline bool rq_has_pinned_tasks(struct rq *rq)
2183 ++{
2184 ++ return rq->nr_pinned;
2185 ++}
2186 ++
2187 ++/*
2188 ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
2189 ++ * __set_cpus_allowed_ptr() and select_fallback_rq().
2190 ++ */
2191 ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
2192 ++{
2193 ++ /* When not in the task's cpumask, no point in looking further. */
2194 ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
2195 ++ return false;
2196 ++
2197 ++ /* migrate_disabled() must be allowed to finish. */
2198 ++ if (is_migration_disabled(p))
2199 ++ return cpu_online(cpu);
2200 ++
2201 ++	/* Non-kernel threads are not allowed during either online or offline. */
2202 ++ if (!(p->flags & PF_KTHREAD))
2203 ++ return cpu_active(cpu) && task_cpu_possible(cpu, p);
2204 ++
2205 ++ /* KTHREAD_IS_PER_CPU is always allowed. */
2206 ++ if (kthread_is_per_cpu(p))
2207 ++ return cpu_online(cpu);
2208 ++
2209 ++ /* Regular kernel threads don't get to stay during offline. */
2210 ++ if (cpu_dying(cpu))
2211 ++ return false;
2212 ++
2213 ++ /* But are allowed during online. */
2214 ++ return cpu_online(cpu);
2215 ++}
2216 ++
2217 ++/*
2218 ++ * This is how migration works:
2219 ++ *
2220 ++ * 1) we invoke migration_cpu_stop() on the target CPU using
2221 ++ * stop_one_cpu().
2222 ++ * 2) stopper starts to run (implicitly forcing the migrated thread
2223 ++ * off the CPU)
2224 ++ * 3) it checks whether the migrated task is still in the wrong runqueue.
2225 ++ * 4) if it's in the wrong runqueue then the migration thread removes
2226 ++ * it and puts it into the right queue.
2227 ++ * 5) stopper completes and stop_one_cpu() returns and the migration
2228 ++ * is done.
2229 ++ */
2230 ++
2231 ++/*
2232 ++ * move_queued_task - move a queued task to new rq.
2233 ++ *
2234 ++ * Returns (locked) new rq. Old rq's lock is released.
2235 ++ */
2236 ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int
2237 ++ new_cpu)
2238 ++{
2239 ++ lockdep_assert_held(&rq->lock);
2240 ++
2241 ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
2242 ++ dequeue_task(p, rq, 0);
2243 ++ update_sched_preempt_mask(rq);
2244 ++ set_task_cpu(p, new_cpu);
2245 ++ raw_spin_unlock(&rq->lock);
2246 ++
2247 ++ rq = cpu_rq(new_cpu);
2248 ++
2249 ++ raw_spin_lock(&rq->lock);
2250 ++ WARN_ON_ONCE(task_cpu(p) != new_cpu);
2251 ++ sched_task_sanity_check(p, rq);
2252 ++ enqueue_task(p, rq, 0);
2253 ++ p->on_rq = TASK_ON_RQ_QUEUED;
2254 ++ check_preempt_curr(rq);
2255 ++
2256 ++ return rq;
2257 ++}
2258 ++
2259 ++struct migration_arg {
2260 ++ struct task_struct *task;
2261 ++ int dest_cpu;
2262 ++};
2263 ++
2264 ++/*
2265 ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing
2266 ++ * this because either it can't run here any more (set_cpus_allowed()
2267 ++ * away from this CPU, or CPU going down), or because we're
2268 ++ * attempting to rebalance this task on exec (sched_exec).
2269 ++ *
2270 ++ * So we race with normal scheduler movements, but that's OK, as long
2271 ++ * as the task is no longer on this CPU.
2272 ++ */
2273 ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int
2274 ++ dest_cpu)
2275 ++{
2276 ++ /* Affinity changed (again). */
2277 ++ if (!is_cpu_allowed(p, dest_cpu))
2278 ++ return rq;
2279 ++
2280 ++ update_rq_clock(rq);
2281 ++ return move_queued_task(rq, p, dest_cpu);
2282 ++}
2283 ++
2284 ++/*
2285 ++ * migration_cpu_stop - this will be executed by a highprio stopper thread
2286 ++ * and performs thread migration by bumping thread off CPU then
2287 ++ * 'pushing' onto another runqueue.
2288 ++ */
2289 ++static int migration_cpu_stop(void *data)
2290 ++{
2291 ++ struct migration_arg *arg = data;
2292 ++ struct task_struct *p = arg->task;
2293 ++ struct rq *rq = this_rq();
2294 ++ unsigned long flags;
2295 ++
2296 ++ /*
2297 ++ * The original target CPU might have gone down and we might
2298 ++ * be on another CPU but it doesn't matter.
2299 ++ */
2300 ++ local_irq_save(flags);
2301 ++ /*
2302 ++ * We need to explicitly wake pending tasks before running
2303 ++ * __migrate_task() such that we will not miss enforcing cpus_ptr
2304 ++ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
2305 ++ */
2306 ++ flush_smp_call_function_queue();
2307 ++
2308 ++ raw_spin_lock(&p->pi_lock);
2309 ++ raw_spin_lock(&rq->lock);
2310 ++ /*
2311 ++ * If task_rq(p) != rq, it cannot be migrated here, because we're
2312 ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
2313 ++ * we're holding p->pi_lock.
2314 ++ */
2315 ++ if (task_rq(p) == rq && task_on_rq_queued(p))
2316 ++ rq = __migrate_task(rq, p, arg->dest_cpu);
2317 ++ raw_spin_unlock(&rq->lock);
2318 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2319 ++
2320 ++ return 0;
2321 ++}
2322 ++
2323 ++static inline void
2324 ++set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
2325 ++{
2326 ++ cpumask_copy(&p->cpus_mask, ctx->new_mask);
2327 ++ p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
2328 ++
2329 ++ /*
2330 ++ * Swap in a new user_cpus_ptr if SCA_USER flag set
2331 ++ */
2332 ++ if (ctx->flags & SCA_USER)
2333 ++ swap(p->user_cpus_ptr, ctx->user_mask);
2334 ++}
2335 ++
2336 ++static void
2337 ++__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
2338 ++{
2339 ++ lockdep_assert_held(&p->pi_lock);
2340 ++ set_cpus_allowed_common(p, ctx);
2341 ++}
2342 ++
2343 ++/*
2344 ++ * Used for kthread_bind() and select_fallback_rq(), in both cases the user
2345 ++ * affinity (if any) should be destroyed too.
2346 ++ */
2347 ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2348 ++{
2349 ++ struct affinity_context ac = {
2350 ++ .new_mask = new_mask,
2351 ++ .user_mask = NULL,
2352 ++ .flags = SCA_USER, /* clear the user requested mask */
2353 ++ };
2354 ++ union cpumask_rcuhead {
2355 ++ cpumask_t cpumask;
2356 ++ struct rcu_head rcu;
2357 ++ };
2358 ++
2359 ++ __do_set_cpus_allowed(p, &ac);
2360 ++
2361 ++ /*
2362 ++ * Because this is called with p->pi_lock held, it is not possible
2363 ++ * to use kfree() here (when PREEMPT_RT=y), therefore punt to using
2364 ++ * kfree_rcu().
2365 ++ */
2366 ++ kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
2367 ++}
2368 ++
2369 ++static cpumask_t *alloc_user_cpus_ptr(int node)
2370 ++{
2371 ++ /*
2372 ++ * See do_set_cpus_allowed() above for the rcu_head usage.
2373 ++ */
2374 ++ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
2375 ++
2376 ++ return kmalloc_node(size, GFP_KERNEL, node);
2377 ++}
2378 ++
2379 ++int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
2380 ++ int node)
2381 ++{
2382 ++ cpumask_t *user_mask;
2383 ++ unsigned long flags;
2384 ++
2385 ++ /*
2386 ++ * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
2387 ++ * may differ by now due to racing.
2388 ++ */
2389 ++ dst->user_cpus_ptr = NULL;
2390 ++
2391 ++ /*
2392 ++ * This check is racy and losing the race is a valid situation.
2393 ++ * It is not worth the extra overhead of taking the pi_lock on
2394 ++ * every fork/clone.
2395 ++ */
2396 ++ if (data_race(!src->user_cpus_ptr))
2397 ++ return 0;
2398 ++
2399 ++ user_mask = alloc_user_cpus_ptr(node);
2400 ++ if (!user_mask)
2401 ++ return -ENOMEM;
2402 ++
2403 ++ /*
2404 ++ * Use pi_lock to protect content of user_cpus_ptr
2405 ++ *
2406 ++ * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
2407 ++ * do_set_cpus_allowed().
2408 ++ */
2409 ++ raw_spin_lock_irqsave(&src->pi_lock, flags);
2410 ++ if (src->user_cpus_ptr) {
2411 ++ swap(dst->user_cpus_ptr, user_mask);
2412 ++ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
2413 ++ }
2414 ++ raw_spin_unlock_irqrestore(&src->pi_lock, flags);
2415 ++
2416 ++ if (unlikely(user_mask))
2417 ++ kfree(user_mask);
2418 ++
2419 ++ return 0;
2420 ++}
2421 ++
2422 ++static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
2423 ++{
2424 ++ struct cpumask *user_mask = NULL;
2425 ++
2426 ++ swap(p->user_cpus_ptr, user_mask);
2427 ++
2428 ++ return user_mask;
2429 ++}
2430 ++
2431 ++void release_user_cpus_ptr(struct task_struct *p)
2432 ++{
2433 ++ kfree(clear_user_cpus_ptr(p));
2434 ++}
2435 ++
2436 ++#endif
2437 ++
2438 ++/**
2439 ++ * task_curr - is this task currently executing on a CPU?
2440 ++ * @p: the task in question.
2441 ++ *
2442 ++ * Return: 1 if the task is currently executing. 0 otherwise.
2443 ++ */
2444 ++inline int task_curr(const struct task_struct *p)
2445 ++{
2446 ++ return cpu_curr(task_cpu(p)) == p;
2447 ++}
2448 ++
2449 ++#ifdef CONFIG_SMP
2450 ++/*
2451 ++ * wait_task_inactive - wait for a thread to unschedule.
2452 ++ *
2453 ++ * Wait for the thread to block in any of the states set in @match_state.
2454 ++ * If it changes, i.e. @p might have woken up, then return zero. When we
2455 ++ * succeed in waiting for @p to be off its CPU, we return a positive number
2456 ++ * (its total switch count). If a second call a short while later returns the
2457 ++ * same number, the caller can be sure that @p has remained unscheduled the
2458 ++ * whole time.
2459 ++ *
2460 ++ * The caller must ensure that the task *will* unschedule sometime soon,
2461 ++ * else this function might spin for a *long* time. This function can't
2462 ++ * be called with interrupts off, or it may introduce deadlock with
2463 ++ * smp_call_function() if an IPI is sent by the same process we are
2464 ++ * waiting to become inactive.
2465 ++ */
2466 ++unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
2467 ++{
2468 ++ unsigned long flags;
2469 ++ bool running, on_rq;
2470 ++ unsigned long ncsw;
2471 ++ struct rq *rq;
2472 ++ raw_spinlock_t *lock;
2473 ++
2474 ++ for (;;) {
2475 ++ rq = task_rq(p);
2476 ++
2477 ++ /*
2478 ++ * If the task is actively running on another CPU
2479 ++ * still, just relax and busy-wait without holding
2480 ++ * any locks.
2481 ++ *
2482 ++ * NOTE! Since we don't hold any locks, it's not
2483 ++ * even sure that "rq" stays as the right runqueue!
2484 ++ * But we don't care, since this will return false
2485 ++ * if the runqueue has changed and p is actually now
2486 ++ * running somewhere else!
2487 ++ */
2488 ++ while (task_on_cpu(p) && p == rq->curr) {
2489 ++ if (!(READ_ONCE(p->__state) & match_state))
2490 ++ return 0;
2491 ++ cpu_relax();
2492 ++ }
2493 ++
2494 ++ /*
2495 ++ * Ok, time to look more closely! We need the rq
2496 ++ * lock now, to be *sure*. If we're wrong, we'll
2497 ++ * just go back and repeat.
2498 ++ */
2499 ++ task_access_lock_irqsave(p, &lock, &flags);
2500 ++ trace_sched_wait_task(p);
2501 ++ running = task_on_cpu(p);
2502 ++ on_rq = p->on_rq;
2503 ++ ncsw = 0;
2504 ++ if (READ_ONCE(p->__state) & match_state)
2505 ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2506 ++ task_access_unlock_irqrestore(p, lock, &flags);
2507 ++
2508 ++ /*
2509 ++ * If it changed from the expected state, bail out now.
2510 ++ */
2511 ++ if (unlikely(!ncsw))
2512 ++ break;
2513 ++
2514 ++ /*
2515 ++ * Was it really running after all now that we
2516 ++ * checked with the proper locks actually held?
2517 ++ *
2518 ++ * Oops. Go back and try again..
2519 ++ */
2520 ++ if (unlikely(running)) {
2521 ++ cpu_relax();
2522 ++ continue;
2523 ++ }
2524 ++
2525 ++ /*
2526 ++ * It's not enough that it's not actively running,
2527 ++ * it must be off the runqueue _entirely_, and not
2528 ++ * preempted!
2529 ++ *
2530 ++ * So if it was still runnable (but just not actively
2531 ++ * running right now), it's preempted, and we should
2532 ++ * yield - it could be a while.
2533 ++ */
2534 ++ if (unlikely(on_rq)) {
2535 ++ ktime_t to = NSEC_PER_SEC / HZ;
2536 ++
2537 ++ set_current_state(TASK_UNINTERRUPTIBLE);
2538 ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
2539 ++ continue;
2540 ++ }
2541 ++
2542 ++ /*
2543 ++ * Ahh, all good. It wasn't running, and it wasn't
2544 ++ * runnable, which means that it will never become
2545 ++ * running in the future either. We're all done!
2546 ++ */
2547 ++ break;
2548 ++ }
2549 ++
2550 ++ return ncsw;
2551 ++}
2552 ++
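The ncsw = p->nvcsw | LONG_MIN line above guarantees a nonzero return value even when the voluntary-switch count happens to be zero, while two successive calls can still be compared for equality to show the task never ran in between. A tiny illustration of why ORing in LONG_MIN (the MSB) does that:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned long nvcsw = 0;			/* even a zero switch count ... */
	unsigned long ncsw = nvcsw | LONG_MIN;		/* ... becomes nonzero (MSB set) */

	printf("ncsw = %#lx (nonzero: %d)\n", ncsw, ncsw != 0);
	return 0;
}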
2553 ++/***
2554 ++ * kick_process - kick a running thread to enter/exit the kernel
2555 ++ * @p: the to-be-kicked thread
2556 ++ *
2557 ++ * Cause a process which is running on another CPU to enter
2558 ++ * kernel-mode, without any delay. (to get signals handled.)
2559 ++ *
2560 ++ * NOTE: this function doesn't have to take the runqueue lock,
2561 ++ * because all it wants to ensure is that the remote task enters
2562 ++ * the kernel. If the IPI races and the task has been migrated
2563 ++ * to another CPU then no harm is done and the purpose has been
2564 ++ * achieved as well.
2565 ++ */
2566 ++void kick_process(struct task_struct *p)
2567 ++{
2568 ++ int cpu;
2569 ++
2570 ++ preempt_disable();
2571 ++ cpu = task_cpu(p);
2572 ++ if ((cpu != smp_processor_id()) && task_curr(p))
2573 ++ smp_send_reschedule(cpu);
2574 ++ preempt_enable();
2575 ++}
2576 ++EXPORT_SYMBOL_GPL(kick_process);
2577 ++
2578 ++/*
2579 ++ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
2580 ++ *
2581 ++ * A few notes on cpu_active vs cpu_online:
2582 ++ *
2583 ++ * - cpu_active must be a subset of cpu_online
2584 ++ *
2585 ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
2586 ++ * see __set_cpus_allowed_ptr(). At this point the newly online
2587 ++ * CPU isn't yet part of the sched domains, and balancing will not
2588 ++ * see it.
2589 ++ *
2590 ++ * - on cpu-down we clear cpu_active() to mask the sched domains and
2591 ++ * keep the load balancer from placing new tasks on the to-be-removed
2592 ++ * CPU. Existing tasks will remain running there and will be taken
2593 ++ * off.
2594 ++ *
2595 ++ * This means that fallback selection must not select !active CPUs,
2596 ++ * and can assume that any active CPU must be online. Conversely,
2597 ++ * select_task_rq() below may allow selection of !active CPUs in order
2598 ++ * to satisfy the above rules.
2599 ++ */
2600 ++static int select_fallback_rq(int cpu, struct task_struct *p)
2601 ++{
2602 ++ int nid = cpu_to_node(cpu);
2603 ++ const struct cpumask *nodemask = NULL;
2604 ++ enum { cpuset, possible, fail } state = cpuset;
2605 ++ int dest_cpu;
2606 ++
2607 ++ /*
2608 ++ * If the node that the CPU is on has been offlined, cpu_to_node()
2609 ++ * will return -1. There is no CPU on the node, and we should
2610 ++ * select the CPU on the other node.
2611 ++ */
2612 ++ if (nid != -1) {
2613 ++ nodemask = cpumask_of_node(nid);
2614 ++
2615 ++ /* Look for allowed, online CPU in same node. */
2616 ++ for_each_cpu(dest_cpu, nodemask) {
2617 ++ if (is_cpu_allowed(p, dest_cpu))
2618 ++ return dest_cpu;
2619 ++ }
2620 ++ }
2621 ++
2622 ++ for (;;) {
2623 ++ /* Any allowed, online CPU? */
2624 ++ for_each_cpu(dest_cpu, p->cpus_ptr) {
2625 ++ if (!is_cpu_allowed(p, dest_cpu))
2626 ++ continue;
2627 ++ goto out;
2628 ++ }
2629 ++
2630 ++ /* No more Mr. Nice Guy. */
2631 ++ switch (state) {
2632 ++ case cpuset:
2633 ++ if (cpuset_cpus_allowed_fallback(p)) {
2634 ++ state = possible;
2635 ++ break;
2636 ++ }
2637 ++ fallthrough;
2638 ++ case possible:
2639 ++ /*
2640 ++ * XXX When called from select_task_rq() we only
2641 ++ * hold p->pi_lock and again violate locking order.
2642 ++ *
2643 ++ * More yuck to audit.
2644 ++ */
2645 ++ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
2646 ++ state = fail;
2647 ++ break;
2648 ++
2649 ++ case fail:
2650 ++ BUG();
2651 ++ break;
2652 ++ }
2653 ++ }
2654 ++
2655 ++out:
2656 ++ if (state != cpuset) {
2657 ++ /*
2658 ++ * Don't tell them about moving exiting tasks or
2659 ++ * kernel threads (both mm NULL), since they never
2660 ++ * leave the kernel.
2661 ++ */
2662 ++ if (p->mm && printk_ratelimit()) {
2663 ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n",
2664 ++ task_pid_nr(p), p->comm, cpu);
2665 ++ }
2666 ++ }
2667 ++
2668 ++ return dest_cpu;
2669 ++}
2670 ++
2671 ++static inline void
2672 ++sched_preempt_mask_flush(cpumask_t *mask, int prio)
2673 ++{
2674 ++ int cpu;
2675 ++
2676 ++ cpumask_copy(mask, sched_idle_mask);
2677 ++
2678 ++ for_each_cpu_not(cpu, mask) {
2679 ++ if (prio < cpu_rq(cpu)->prio)
2680 ++ cpumask_set_cpu(cpu, mask);
2681 ++ }
2682 ++}
2683 ++
2684 ++static inline int
2685 ++preempt_mask_check(struct task_struct *p, cpumask_t *allow_mask, cpumask_t *preempt_mask)
2686 ++{
2687 ++ int task_prio = task_sched_prio(p);
2688 ++ cpumask_t *mask = sched_preempt_mask + SCHED_QUEUE_BITS - 1 - task_prio;
2689 ++ int pr = atomic_read(&sched_prio_record);
2690 ++
2691 ++ if (pr != task_prio) {
2692 ++ sched_preempt_mask_flush(mask, task_prio);
2693 ++ atomic_set(&sched_prio_record, task_prio);
2694 ++ }
2695 ++
2696 ++ return cpumask_and(preempt_mask, allow_mask, mask);
2697 ++}
2698 ++
2699 ++static inline int select_task_rq(struct task_struct *p)
2700 ++{
2701 ++ cpumask_t allow_mask, mask;
2702 ++
2703 ++ if (unlikely(!cpumask_and(&allow_mask, p->cpus_ptr, cpu_active_mask)))
2704 ++ return select_fallback_rq(task_cpu(p), p);
2705 ++
2706 ++ if (
2707 ++#ifdef CONFIG_SCHED_SMT
2708 ++ cpumask_and(&mask, &allow_mask, &sched_sg_idle_mask) ||
2709 ++#endif
2710 ++ cpumask_and(&mask, &allow_mask, sched_idle_mask) ||
2711 ++ preempt_mask_check(p, &allow_mask, &mask))
2712 ++ return best_mask_cpu(task_cpu(p), &mask);
2713 ++
2714 ++ return best_mask_cpu(task_cpu(p), &allow_mask);
2715 ++}
2716 ++
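select_task_rq() above walks progressively weaker targets: a fully idle SMT sibling group (when CONFIG_SCHED_SMT), then any idle CPU, then a CPU whose running task has lower priority (the preempt mask), and finally any allowed CPU, with best_mask_cpu() choosing a candidate relative to the task's current CPU. A rough userspace illustration of that cascade using plain bitmaps, which simply picks the lowest set bit instead of the nearest CPU (all names and masks invented):

#include <stdio.h>

static int first_cpu(unsigned long mask)
{
	return mask ? __builtin_ctzl(mask) : -1;	/* lowest set bit, -1 if empty */
}

static int demo_select_cpu(unsigned long allowed, unsigned long sg_idle,
			   unsigned long idle, unsigned long preempt)
{
	unsigned long m;

	if ((m = allowed & sg_idle))	/* whole SMT sibling group idle */
		return first_cpu(m);
	if ((m = allowed & idle))	/* any idle CPU */
		return first_cpu(m);
	if ((m = allowed & preempt))	/* CPU running lower-priority work */
		return first_cpu(m);
	return first_cpu(allowed);	/* last resort: any allowed CPU */
}

int main(void)
{
	/* CPUs 0-3 allowed, no idle SMT group, CPU 2 idle, CPU 1 preemptible */
	printf("picked CPU %d\n", demo_select_cpu(0xf, 0x0, 0x4, 0x2));
	return 0;
}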
2717 ++void sched_set_stop_task(int cpu, struct task_struct *stop)
2718 ++{
2719 ++ static struct lock_class_key stop_pi_lock;
2720 ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO };
2721 ++ struct sched_param start_param = { .sched_priority = 0 };
2722 ++ struct task_struct *old_stop = cpu_rq(cpu)->stop;
2723 ++
2724 ++ if (stop) {
2725 ++ /*
2726 ++	 * Make it appear like a SCHED_FIFO task, it's something
2727 ++ * userspace knows about and won't get confused about.
2728 ++ *
2729 ++ * Also, it will make PI more or less work without too
2730 ++ * much confusion -- but then, stop work should not
2731 ++ * rely on PI working anyway.
2732 ++ */
2733 ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
2734 ++
2735 ++ /*
2736 ++ * The PI code calls rt_mutex_setprio() with ->pi_lock held to
2737 ++ * adjust the effective priority of a task. As a result,
2738 ++ * rt_mutex_setprio() can trigger (RT) balancing operations,
2739 ++ * which can then trigger wakeups of the stop thread to push
2740 ++ * around the current task.
2741 ++ *
2742 ++ * The stop task itself will never be part of the PI-chain, it
2743 ++ * never blocks, therefore that ->pi_lock recursion is safe.
2744 ++ * Tell lockdep about this by placing the stop->pi_lock in its
2745 ++ * own class.
2746 ++ */
2747 ++ lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
2748 ++ }
2749 ++
2750 ++ cpu_rq(cpu)->stop = stop;
2751 ++
2752 ++ if (old_stop) {
2753 ++ /*
2754 ++ * Reset it back to a normal scheduling policy so that
2755 ++ * it can die in pieces.
2756 ++ */
2757 ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
2758 ++ }
2759 ++}
2760 ++
2761 ++static int affine_move_task(struct rq *rq, struct task_struct *p, int dest_cpu,
2762 ++ raw_spinlock_t *lock, unsigned long irq_flags)
2763 ++ __releases(rq->lock)
2764 ++ __releases(p->pi_lock)
2765 ++{
2766 ++ /* Can the task run on the task's current CPU? If so, we're done */
2767 ++ if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2768 ++ if (p->migration_disabled) {
2769 ++ if (likely(p->cpus_ptr != &p->cpus_mask))
2770 ++ __do_set_cpus_ptr(p, &p->cpus_mask);
2771 ++ p->migration_disabled = 0;
2772 ++ p->migration_flags |= MDF_FORCE_ENABLED;
2773 ++ /* When p is migrate_disabled, rq->lock should be held */
2774 ++ rq->nr_pinned--;
2775 ++ }
2776 ++
2777 ++ if (task_on_cpu(p) || READ_ONCE(p->__state) == TASK_WAKING) {
2778 ++ struct migration_arg arg = { p, dest_cpu };
2779 ++
2780 ++ /* Need help from migration thread: drop lock and wait. */
2781 ++ __task_access_unlock(p, lock);
2782 ++ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
2783 ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
2784 ++ return 0;
2785 ++ }
2786 ++ if (task_on_rq_queued(p)) {
2787 ++ /*
2788 ++ * OK, since we're going to drop the lock immediately
2789 ++ * afterwards anyway.
2790 ++ */
2791 ++ update_rq_clock(rq);
2792 ++ rq = move_queued_task(rq, p, dest_cpu);
2793 ++ lock = &rq->lock;
2794 ++ }
2795 ++ }
2796 ++ __task_access_unlock(p, lock);
2797 ++ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
2798 ++ return 0;
2799 ++}
2800 ++
2801 ++static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
2802 ++ struct affinity_context *ctx,
2803 ++ struct rq *rq,
2804 ++ raw_spinlock_t *lock,
2805 ++ unsigned long irq_flags)
2806 ++{
2807 ++ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
2808 ++ const struct cpumask *cpu_valid_mask = cpu_active_mask;
2809 ++ bool kthread = p->flags & PF_KTHREAD;
2810 ++ int dest_cpu;
2811 ++ int ret = 0;
2812 ++
2813 ++ if (kthread || is_migration_disabled(p)) {
2814 ++ /*
2815 ++ * Kernel threads are allowed on online && !active CPUs,
2816 ++ * however, during cpu-hot-unplug, even these might get pushed
2817 ++ * away if not KTHREAD_IS_PER_CPU.
2818 ++ *
2819 ++ * Specifically, migration_disabled() tasks must not fail the
2820 ++ * cpumask_any_and_distribute() pick below, esp. so on
2821 ++ * SCA_MIGRATE_ENABLE, otherwise we'll not call
2822 ++ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
2823 ++ */
2824 ++ cpu_valid_mask = cpu_online_mask;
2825 ++ }
2826 ++
2827 ++ if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
2828 ++ ret = -EINVAL;
2829 ++ goto out;
2830 ++ }
2831 ++
2832 ++ /*
2833 ++ * Must re-check here, to close a race against __kthread_bind(),
2834 ++ * sched_setaffinity() is not guaranteed to observe the flag.
2835 ++ */
2836 ++ if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2837 ++ ret = -EINVAL;
2838 ++ goto out;
2839 ++ }
2840 ++
2841 ++ if (cpumask_equal(&p->cpus_mask, ctx->new_mask))
2842 ++ goto out;
2843 ++
2844 ++ dest_cpu = cpumask_any_and(cpu_valid_mask, ctx->new_mask);
2845 ++ if (dest_cpu >= nr_cpu_ids) {
2846 ++ ret = -EINVAL;
2847 ++ goto out;
2848 ++ }
2849 ++
2850 ++ __do_set_cpus_allowed(p, ctx);
2851 ++
2852 ++ return affine_move_task(rq, p, dest_cpu, lock, irq_flags);
2853 ++
2854 ++out:
2855 ++ __task_access_unlock(p, lock);
2856 ++ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
2857 ++
2858 ++ return ret;
2859 ++}
2860 ++
2861 ++/*
2862 ++ * Change a given task's CPU affinity. Migrate the thread to a proper CPU and
2863 ++ * schedule it away if the CPU it's executing on is removed from the allowed bitmask.
2864 ++ *
2865 ++ * NOTE: the caller must have a valid reference to the task, the
2866 ++ * task must not exit() & deallocate itself prematurely. The
2867 ++ * call is not atomic; no spinlocks may be held.
2868 ++ */
2869 ++static int __set_cpus_allowed_ptr(struct task_struct *p,
2870 ++ struct affinity_context *ctx)
2871 ++{
2872 ++ unsigned long irq_flags;
2873 ++ struct rq *rq;
2874 ++ raw_spinlock_t *lock;
2875 ++
2876 ++ raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
2877 ++ rq = __task_access_lock(p, &lock);
2878 ++ /*
2879 ++ * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
2880 ++ * flags are set.
2881 ++ */
2882 ++ if (p->user_cpus_ptr &&
2883 ++ !(ctx->flags & SCA_USER) &&
2884 ++ cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
2885 ++ ctx->new_mask = rq->scratch_mask;
2886 ++
2888 ++ return __set_cpus_allowed_ptr_locked(p, ctx, rq, lock, irq_flags);
2889 ++}
2890 ++
2891 ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2892 ++{
2893 ++ struct affinity_context ac = {
2894 ++ .new_mask = new_mask,
2895 ++ .flags = 0,
2896 ++ };
2897 ++
2898 ++ return __set_cpus_allowed_ptr(p, &ac);
2899 ++}
2900 ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
2901 ++
2902 ++/*
2903 ++ * Change a given task's CPU affinity to the intersection of its current
2904 ++ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
2905 ++ * If user_cpus_ptr is defined, use it as the basis for restricting CPU
2906 ++ * affinity or use cpu_online_mask instead.
2907 ++ *
2908 ++ * If the resulting mask is empty, leave the affinity unchanged and return
2909 ++ * -EINVAL.
2910 ++ */
2911 ++static int restrict_cpus_allowed_ptr(struct task_struct *p,
2912 ++ struct cpumask *new_mask,
2913 ++ const struct cpumask *subset_mask)
2914 ++{
2915 ++ struct affinity_context ac = {
2916 ++ .new_mask = new_mask,
2917 ++ .flags = 0,
2918 ++ };
2919 ++ unsigned long irq_flags;
2920 ++ raw_spinlock_t *lock;
2921 ++ struct rq *rq;
2922 ++ int err;
2923 ++
2924 ++ raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
2925 ++ rq = __task_access_lock(p, &lock);
2926 ++
2927 ++ if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
2928 ++ err = -EINVAL;
2929 ++ goto err_unlock;
2930 ++ }
2931 ++
2932 ++ return __set_cpus_allowed_ptr_locked(p, &ac, rq, lock, irq_flags);
2933 ++
2934 ++err_unlock:
2935 ++ __task_access_unlock(p, lock);
2936 ++ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
2937 ++ return err;
2938 ++}
2939 ++
2940 ++/*
2941 ++ * Restrict the CPU affinity of task @p so that it is a subset of
2942 ++ * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
2943 ++ * old affinity mask. If the resulting mask is empty, we warn and walk
2944 ++ * up the cpuset hierarchy until we find a suitable mask.
2945 ++ */
2946 ++void force_compatible_cpus_allowed_ptr(struct task_struct *p)
2947 ++{
2948 ++ cpumask_var_t new_mask;
2949 ++ const struct cpumask *override_mask = task_cpu_possible_mask(p);
2950 ++
2951 ++ alloc_cpumask_var(&new_mask, GFP_KERNEL);
2952 ++
2953 ++ /*
2954 ++ * __migrate_task() can fail silently in the face of concurrent
2955 ++ * offlining of the chosen destination CPU, so take the hotplug
2956 ++ * lock to ensure that the migration succeeds.
2957 ++ */
2958 ++ cpus_read_lock();
2959 ++ if (!cpumask_available(new_mask))
2960 ++ goto out_set_mask;
2961 ++
2962 ++ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
2963 ++ goto out_free_mask;
2964 ++
2965 ++ /*
2966 ++ * We failed to find a valid subset of the affinity mask for the
2967 ++ * task, so override it based on its cpuset hierarchy.
2968 ++ */
2969 ++ cpuset_cpus_allowed(p, new_mask);
2970 ++ override_mask = new_mask;
2971 ++
2972 ++out_set_mask:
2973 ++ if (printk_ratelimit()) {
2974 ++ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
2975 ++ task_pid_nr(p), p->comm,
2976 ++ cpumask_pr_args(override_mask));
2977 ++ }
2978 ++
2979 ++ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
2980 ++out_free_mask:
2981 ++ cpus_read_unlock();
2982 ++ free_cpumask_var(new_mask);
2983 ++}
2984 ++
2985 ++static int
2986 ++__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
2987 ++
2988 ++/*
2989 ++ * Restore the affinity of a task @p which was previously restricted by a
2990 ++ * call to force_compatible_cpus_allowed_ptr().
2991 ++ *
2992 ++ * It is the caller's responsibility to serialise this with any calls to
2993 ++ * force_compatible_cpus_allowed_ptr(@p).
2994 ++ */
2995 ++void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
2996 ++{
2997 ++ struct affinity_context ac = {
2998 ++ .new_mask = task_user_cpus(p),
2999 ++ .flags = 0,
3000 ++ };
3001 ++ int ret;
3002 ++
3003 ++ /*
3004 ++ * Try to restore the old affinity mask with __sched_setaffinity().
3005 ++ * Cpuset masking will be done there too.
3006 ++ */
3007 ++ ret = __sched_setaffinity(p, &ac);
3008 ++ WARN_ON_ONCE(ret);
3009 ++}
3010 ++
3011 ++#else /* CONFIG_SMP */
3012 ++
3013 ++static inline int select_task_rq(struct task_struct *p)
3014 ++{
3015 ++ return 0;
3016 ++}
3017 ++
3018 ++static inline int
3019 ++__set_cpus_allowed_ptr(struct task_struct *p,
3020 ++ struct affinity_context *ctx)
3021 ++{
3022 ++ return set_cpus_allowed_ptr(p, ctx->new_mask);
3023 ++}
3024 ++
3025 ++static inline bool rq_has_pinned_tasks(struct rq *rq)
3026 ++{
3027 ++ return false;
3028 ++}
3029 ++
3030 ++static inline cpumask_t *alloc_user_cpus_ptr(int node)
3031 ++{
3032 ++ return NULL;
3033 ++}
3034 ++
3035 ++#endif /* !CONFIG_SMP */
3036 ++
3037 ++static void
3038 ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
3039 ++{
3040 ++ struct rq *rq;
3041 ++
3042 ++ if (!schedstat_enabled())
3043 ++ return;
3044 ++
3045 ++ rq = this_rq();
3046 ++
3047 ++#ifdef CONFIG_SMP
3048 ++ if (cpu == rq->cpu) {
3049 ++ __schedstat_inc(rq->ttwu_local);
3050 ++ __schedstat_inc(p->stats.nr_wakeups_local);
3051 ++ } else {
3052 ++ /** Alt schedule FW ToDo:
3053 ++ * How to do ttwu_wake_remote
3054 ++ */
3055 ++ }
3056 ++#endif /* CONFIG_SMP */
3057 ++
3058 ++ __schedstat_inc(rq->ttwu_count);
3059 ++ __schedstat_inc(p->stats.nr_wakeups);
3060 ++}
3061 ++
3062 ++/*
3063 ++ * Mark the task runnable and perform wakeup-preemption.
3064 ++ */
3065 ++static inline void
3066 ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
3067 ++{
3068 ++ check_preempt_curr(rq);
3069 ++ WRITE_ONCE(p->__state, TASK_RUNNING);
3070 ++ trace_sched_wakeup(p);
3071 ++}
3072 ++
3073 ++static inline void
3074 ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
3075 ++{
3076 ++ if (p->sched_contributes_to_load)
3077 ++ rq->nr_uninterruptible--;
3078 ++
3079 ++ if (
3080 ++#ifdef CONFIG_SMP
3081 ++ !(wake_flags & WF_MIGRATED) &&
3082 ++#endif
3083 ++ p->in_iowait) {
3084 ++ delayacct_blkio_end(p);
3085 ++ atomic_dec(&task_rq(p)->nr_iowait);
3086 ++ }
3087 ++
3088 ++ activate_task(p, rq);
3089 ++ ttwu_do_wakeup(rq, p, 0);
3090 ++}
3091 ++
3092 ++/*
3093 ++ * Consider @p being inside a wait loop:
3094 ++ *
3095 ++ * for (;;) {
3096 ++ * set_current_state(TASK_UNINTERRUPTIBLE);
3097 ++ *
3098 ++ * if (CONDITION)
3099 ++ * break;
3100 ++ *
3101 ++ * schedule();
3102 ++ * }
3103 ++ * __set_current_state(TASK_RUNNING);
3104 ++ *
3105 ++ * between set_current_state() and schedule(). In this case @p is still
3106 ++ * runnable, so all that needs doing is to change p->state back to TASK_RUNNING in
3107 ++ * an atomic manner.
3108 ++ *
3109 ++ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
3110 ++ * then schedule() must still happen and p->state can be changed to
3111 ++ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
3112 ++ * need to do a full wakeup with enqueue.
3113 ++ *
3114 ++ * Returns: %true when the wakeup is done,
3115 ++ * %false otherwise.
3116 ++ */
3117 ++static int ttwu_runnable(struct task_struct *p, int wake_flags)
3118 ++{
3119 ++ struct rq *rq;
3120 ++ raw_spinlock_t *lock;
3121 ++ int ret = 0;
3122 ++
3123 ++ rq = __task_access_lock(p, &lock);
3124 ++ if (task_on_rq_queued(p)) {
3125 ++ /* check_preempt_curr() may use rq clock */
3126 ++ update_rq_clock(rq);
3127 ++ ttwu_do_wakeup(rq, p, wake_flags);
3128 ++ ret = 1;
3129 ++ }
3130 ++ __task_access_unlock(p, lock);
3131 ++
3132 ++ return ret;
3133 ++}
3134 ++
3135 ++#ifdef CONFIG_SMP
3136 ++void sched_ttwu_pending(void *arg)
3137 ++{
3138 ++ struct llist_node *llist = arg;
3139 ++ struct rq *rq = this_rq();
3140 ++ struct task_struct *p, *t;
3141 ++ struct rq_flags rf;
3142 ++
3143 ++ if (!llist)
3144 ++ return;
3145 ++
3146 ++ rq_lock_irqsave(rq, &rf);
3147 ++ update_rq_clock(rq);
3148 ++
3149 ++ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3150 ++ if (WARN_ON_ONCE(p->on_cpu))
3151 ++ smp_cond_load_acquire(&p->on_cpu, !VAL);
3152 ++
3153 ++ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3154 ++ set_task_cpu(p, cpu_of(rq));
3155 ++
3156 ++ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0);
3157 ++ }
3158 ++
3159 ++ /*
3160 ++	 * Must be after enqueueing at least one task such that
3161 ++ * idle_cpu() does not observe a false-negative -- if it does,
3162 ++ * it is possible for select_idle_siblings() to stack a number
3163 ++ * of tasks on this CPU during that window.
3164 ++ *
3165 ++	 * It is ok to clear ttwu_pending when another task is pending;
3166 ++	 * we will receive an IPI after local irqs are enabled and then enqueue it.
3167 ++	 * Since nr_running > 0 by then, idle_cpu() will always get the correct result.
3168 ++ */
3169 ++ WRITE_ONCE(rq->ttwu_pending, 0);
3170 ++ rq_unlock_irqrestore(rq, &rf);
3171 ++}
3172 ++
3173 ++void send_call_function_single_ipi(int cpu)
3174 ++{
3175 ++ struct rq *rq = cpu_rq(cpu);
3176 ++
3177 ++ if (!set_nr_if_polling(rq->idle))
3178 ++ arch_send_call_function_single_ipi(cpu);
3179 ++ else
3180 ++ trace_sched_wake_idle_without_ipi(cpu);
3181 ++}
3182 ++
3183 ++/*
3184 ++ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
3185 ++ * necessary. The wakee CPU on receipt of the IPI will queue the task
3186 ++ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
3187 ++ * of the wakeup instead of the waker.
3188 ++ */
3189 ++static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3190 ++{
3191 ++ struct rq *rq = cpu_rq(cpu);
3192 ++
3193 ++ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
3194 ++
3195 ++ WRITE_ONCE(rq->ttwu_pending, 1);
3196 ++ __smp_call_single_queue(cpu, &p->wake_entry.llist);
3197 ++}
3198 ++
3199 ++static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
3200 ++{
3201 ++ /*
3202 ++ * Do not complicate things with the async wake_list while the CPU is
3203 ++ * in hotplug state.
3204 ++ */
3205 ++ if (!cpu_active(cpu))
3206 ++ return false;
3207 ++
3208 ++ /* Ensure the task will still be allowed to run on the CPU. */
3209 ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
3210 ++ return false;
3211 ++
3212 ++ /*
3213 ++ * If the CPU does not share cache, then queue the task on the
3214 ++ * remote rqs wakelist to avoid accessing remote data.
3215 ++ */
3216 ++ if (!cpus_share_cache(smp_processor_id(), cpu))
3217 ++ return true;
3218 ++
3219 ++ if (cpu == smp_processor_id())
3220 ++ return false;
3221 ++
3222 ++ /*
3223 ++	 * If the wakee cpu is idle, or the task is descheduling and is the
3224 ++	 * only running task on the CPU, then use the wakelist to offload
3225 ++ * the task activation to the idle (or soon-to-be-idle) CPU as
3226 ++ * the current CPU is likely busy. nr_running is checked to
3227 ++ * avoid unnecessary task stacking.
3228 ++ *
3229 ++ * Note that we can only get here with (wakee) p->on_rq=0,
3230 ++ * p->on_cpu can be whatever, we've done the dequeue, so
3231 ++ * the wakee has been accounted out of ->nr_running.
3232 ++ */
3233 ++ if (!cpu_rq(cpu)->nr_running)
3234 ++ return true;
3235 ++
3236 ++ return false;
3237 ++}
3238 ++
3239 ++static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3240 ++{
3241 ++ if (__is_defined(ALT_SCHED_TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) {
3242 ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
3243 ++ __ttwu_queue_wakelist(p, cpu, wake_flags);
3244 ++ return true;
3245 ++ }
3246 ++
3247 ++ return false;
3248 ++}
3249 ++
3250 ++void wake_up_if_idle(int cpu)
3251 ++{
3252 ++ struct rq *rq = cpu_rq(cpu);
3253 ++ unsigned long flags;
3254 ++
3255 ++ rcu_read_lock();
3256 ++
3257 ++ if (!is_idle_task(rcu_dereference(rq->curr)))
3258 ++ goto out;
3259 ++
3260 ++ raw_spin_lock_irqsave(&rq->lock, flags);
3261 ++ if (is_idle_task(rq->curr))
3262 ++ resched_curr(rq);
3263 ++ /* Else CPU is not idle, do nothing here */
3264 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
3265 ++
3266 ++out:
3267 ++ rcu_read_unlock();
3268 ++}
3269 ++
3270 ++bool cpus_share_cache(int this_cpu, int that_cpu)
3271 ++{
3272 ++ if (this_cpu == that_cpu)
3273 ++ return true;
3274 ++
3275 ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
3276 ++}
3277 ++#else /* !CONFIG_SMP */
3278 ++
3279 ++static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3280 ++{
3281 ++ return false;
3282 ++}
3283 ++
3284 ++#endif /* CONFIG_SMP */
3285 ++
3286 ++static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
3287 ++{
3288 ++ struct rq *rq = cpu_rq(cpu);
3289 ++
3290 ++ if (ttwu_queue_wakelist(p, cpu, wake_flags))
3291 ++ return;
3292 ++
3293 ++ raw_spin_lock(&rq->lock);
3294 ++ update_rq_clock(rq);
3295 ++ ttwu_do_activate(rq, p, wake_flags);
3296 ++ raw_spin_unlock(&rq->lock);
3297 ++}
3298 ++
3299 ++/*
3300 ++ * Invoked from try_to_wake_up() to check whether the task can be woken up.
3301 ++ *
3302 ++ * The caller holds p::pi_lock if p != current or has preemption
3303 ++ * disabled when p == current.
3304 ++ *
3305 ++ * The rules of PREEMPT_RT saved_state:
3306 ++ *
3307 ++ * The related locking code always holds p::pi_lock when updating
3308 ++ * p::saved_state, which means the code is fully serialized in both cases.
3309 ++ *
3310 ++ * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT, with no other
3311 ++ * bits set. This allows us to distinguish all wakeup scenarios.
3312 ++ */
3313 ++static __always_inline
3314 ++bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
3315 ++{
3316 ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
3317 ++ WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
3318 ++ state != TASK_RTLOCK_WAIT);
3319 ++ }
3320 ++
3321 ++ if (READ_ONCE(p->__state) & state) {
3322 ++ *success = 1;
3323 ++ return true;
3324 ++ }
3325 ++
3326 ++#ifdef CONFIG_PREEMPT_RT
3327 ++ /*
3328 ++ * Saved state preserves the task state across blocking on
3329 ++ * an RT lock. If the state matches, set p::saved_state to
3330 ++ * TASK_RUNNING, but do not wake the task because it waits
3331 ++ * for a lock wakeup. Also indicate success because from
3332 ++ * the regular waker's point of view this has succeeded.
3333 ++ *
3334 ++ * After acquiring the lock the task will restore p::__state
3335 ++ * from p::saved_state which ensures that the regular
3336 ++ * wakeup is not lost. The restore will also set
3337 ++ * p::saved_state to TASK_RUNNING so any further tests will
3338 ++ * not result in false positives vs. @success
3339 ++ */
3340 ++ if (p->saved_state & state) {
3341 ++ p->saved_state = TASK_RUNNING;
3342 ++ *success = 1;
3343 ++ }
3344 ++#endif
3345 ++ return false;
3346 ++}
3347 ++
3348 ++/*
3349 ++ * Notes on Program-Order guarantees on SMP systems.
3350 ++ *
3351 ++ * MIGRATION
3352 ++ *
3353 ++ * The basic program-order guarantee on SMP systems is that when a task [t]
3354 ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent
3355 ++ * execution on its new CPU [c1].
3356 ++ *
3357 ++ * For migration (of runnable tasks) this is provided by the following means:
3358 ++ *
3359 ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t
3360 ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and
3361 ++ * rq(c1)->lock (if not at the same time, then in that order).
3362 ++ * C) LOCK of the rq(c1)->lock scheduling in task
3363 ++ *
3364 ++ * Transitivity guarantees that B happens after A and C after B.
3365 ++ * Note: we only require RCpc transitivity.
3366 ++ * Note: the CPU doing B need not be c0 or c1
3367 ++ *
3368 ++ * Example:
3369 ++ *
3370 ++ * CPU0 CPU1 CPU2
3371 ++ *
3372 ++ * LOCK rq(0)->lock
3373 ++ * sched-out X
3374 ++ * sched-in Y
3375 ++ * UNLOCK rq(0)->lock
3376 ++ *
3377 ++ * LOCK rq(0)->lock // orders against CPU0
3378 ++ * dequeue X
3379 ++ * UNLOCK rq(0)->lock
3380 ++ *
3381 ++ * LOCK rq(1)->lock
3382 ++ * enqueue X
3383 ++ * UNLOCK rq(1)->lock
3384 ++ *
3385 ++ * LOCK rq(1)->lock // orders against CPU2
3386 ++ * sched-out Z
3387 ++ * sched-in X
3388 ++ * UNLOCK rq(1)->lock
3389 ++ *
3390 ++ *
3391 ++ * BLOCKING -- aka. SLEEP + WAKEUP
3392 ++ *
3393 ++ * For blocking we (obviously) need to provide the same guarantee as for
3394 ++ * migration. However the means are completely different as there is no lock
3395 ++ * chain to provide order. Instead we do:
3396 ++ *
3397 ++ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
3398 ++ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
3399 ++ *
3400 ++ * Example:
3401 ++ *
3402 ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
3403 ++ *
3404 ++ * LOCK rq(0)->lock LOCK X->pi_lock
3405 ++ * dequeue X
3406 ++ * sched-out X
3407 ++ * smp_store_release(X->on_cpu, 0);
3408 ++ *
3409 ++ * smp_cond_load_acquire(&X->on_cpu, !VAL);
3410 ++ * X->state = WAKING
3411 ++ * set_task_cpu(X,2)
3412 ++ *
3413 ++ * LOCK rq(2)->lock
3414 ++ * enqueue X
3415 ++ * X->state = RUNNING
3416 ++ * UNLOCK rq(2)->lock
3417 ++ *
3418 ++ * LOCK rq(2)->lock // orders against CPU1
3419 ++ * sched-out Z
3420 ++ * sched-in X
3421 ++ * UNLOCK rq(2)->lock
3422 ++ *
3423 ++ * UNLOCK X->pi_lock
3424 ++ * UNLOCK rq(0)->lock
3425 ++ *
3426 ++ *
3427 ++ * However, for wakeups there is a second guarantee we must provide, namely we
3428 ++ * must observe the state that led to our wakeup. That is, not only must our
3429 ++ * task observe its own prior state, it must also observe the stores prior to
3430 ++ * its wakeup.
3431 ++ *
3432 ++ * This means that any means of doing remote wakeups must order the CPU doing
3433 ++ * the wakeup against the CPU the task is going to end up running on. This,
3434 ++ * however, is already required for the regular Program-Order guarantee above,
3435 ++ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
3436 ++ *
3437 ++ */
3438 ++
3439 ++/**
3440 ++ * try_to_wake_up - wake up a thread
3441 ++ * @p: the thread to be awakened
3442 ++ * @state: the mask of task states that can be woken
3443 ++ * @wake_flags: wake modifier flags (WF_*)
3444 ++ *
3445 ++ * Conceptually does:
3446 ++ *
3447 ++ * If (@state & @p->state) @p->state = TASK_RUNNING.
3448 ++ *
3449 ++ * If the task was not queued/runnable, also place it back on a runqueue.
3450 ++ *
3451 ++ * This function is atomic against schedule() which would dequeue the task.
3452 ++ *
3453 ++ * It issues a full memory barrier before accessing @p->state, see the comment
3454 ++ * with set_current_state().
3455 ++ *
3456 ++ * Uses p->pi_lock to serialize against concurrent wake-ups.
3457 ++ *
3458 ++ * Relies on p->pi_lock stabilizing:
3459 ++ * - p->sched_class
3460 ++ * - p->cpus_ptr
3461 ++ * - p->sched_task_group
3462 ++ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3463 ++ *
3464 ++ * Tries really hard to only take one task_rq(p)->lock for performance.
3465 ++ * Takes rq->lock in:
3466 ++ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
3467 ++ * - ttwu_queue() -- new rq, for enqueue of the task;
3468 ++ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3469 ++ *
3470 ++ * As a consequence we race really badly with just about everything. See the
3471 ++ * many memory barriers and their comments for details.
3472 ++ *
3473 ++ * Return: %true if @p->state changes (an actual wakeup was done),
3474 ++ * %false otherwise.
3475 ++ */
3476 ++static int try_to_wake_up(struct task_struct *p, unsigned int state,
3477 ++ int wake_flags)
3478 ++{
3479 ++ unsigned long flags;
3480 ++ int cpu, success = 0;
3481 ++
3482 ++ preempt_disable();
3483 ++ if (p == current) {
3484 ++ /*
3485 ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3486 ++ * == smp_processor_id()'. Together this means we can special
3487 ++ * case the whole 'p->on_rq && ttwu_runnable()' case below
3488 ++ * without taking any locks.
3489 ++ *
3490 ++ * In particular:
3491 ++ * - we rely on Program-Order guarantees for all the ordering,
3492 ++ * - we're serialized against set_special_state() by virtue of
3493 ++ * it disabling IRQs (this allows not taking ->pi_lock).
3494 ++ */
3495 ++ if (!ttwu_state_match(p, state, &success))
3496 ++ goto out;
3497 ++
3498 ++ trace_sched_waking(p);
3499 ++ WRITE_ONCE(p->__state, TASK_RUNNING);
3500 ++ trace_sched_wakeup(p);
3501 ++ goto out;
3502 ++ }
3503 ++
3504 ++ /*
3505 ++ * If we are going to wake up a thread waiting for CONDITION we
3506 ++ * need to ensure that CONDITION=1 done by the caller can not be
3507 ++ * reordered with p->state check below. This pairs with smp_store_mb()
3508 ++ * in set_current_state() that the waiting thread does.
3509 ++ */
3510 ++ raw_spin_lock_irqsave(&p->pi_lock, flags);
3511 ++ smp_mb__after_spinlock();
3512 ++ if (!ttwu_state_match(p, state, &success))
3513 ++ goto unlock;
3514 ++
3515 ++ trace_sched_waking(p);
3516 ++
3517 ++ /*
3518 ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would
3519 ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck
3520 ++ * in smp_cond_load_acquire() below.
3521 ++ *
3522 ++ * sched_ttwu_pending() try_to_wake_up()
3523 ++ * STORE p->on_rq = 1 LOAD p->state
3524 ++ * UNLOCK rq->lock
3525 ++ *
3526 ++ * __schedule() (switch to task 'p')
3527 ++ * LOCK rq->lock smp_rmb();
3528 ++ * smp_mb__after_spinlock();
3529 ++ * UNLOCK rq->lock
3530 ++ *
3531 ++ * [task p]
3532 ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
3533 ++ *
3534 ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
3535 ++ * __schedule(). See the comment for smp_mb__after_spinlock().
3536 ++ *
3537 ++ * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
3538 ++ */
3539 ++ smp_rmb();
3540 ++ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3541 ++ goto unlock;
3542 ++
3543 ++#ifdef CONFIG_SMP
3544 ++ /*
3545 ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
3546 ++ * possible to, falsely, observe p->on_cpu == 0.
3547 ++ *
3548 ++ * One must be running (->on_cpu == 1) in order to remove oneself
3549 ++ * from the runqueue.
3550 ++ *
3551 ++ * __schedule() (switch to task 'p') try_to_wake_up()
3552 ++ * STORE p->on_cpu = 1 LOAD p->on_rq
3553 ++ * UNLOCK rq->lock
3554 ++ *
3555 ++ * __schedule() (put 'p' to sleep)
3556 ++ * LOCK rq->lock smp_rmb();
3557 ++ * smp_mb__after_spinlock();
3558 ++ * STORE p->on_rq = 0 LOAD p->on_cpu
3559 ++ *
3560 ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
3561 ++ * __schedule(). See the comment for smp_mb__after_spinlock().
3562 ++ *
3563 ++ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3564 ++ * schedule()'s deactivate_task() has 'happened' and p will no longer
3565 ++ * care about its own p->state. See the comment in __schedule().
3566 ++ */
3567 ++ smp_acquire__after_ctrl_dep();
3568 ++
3569 ++ /*
3570 ++ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3571 ++ * == 0), which means we need to do an enqueue, change p->state to
3572 ++ * TASK_WAKING such that we can unlock p->pi_lock before doing the
3573 ++ * enqueue, such as ttwu_queue_wakelist().
3574 ++ */
3575 ++ WRITE_ONCE(p->__state, TASK_WAKING);
3576 ++
3577 ++ /*
3578 ++ * If the owning (remote) CPU is still in the middle of schedule() with
3579 ++ * this task as prev, consider queueing p on the remote CPU's wake_list
3580 ++ * which potentially sends an IPI instead of spinning on p->on_cpu to
3581 ++ * let the waker make forward progress. This is safe because IRQs are
3582 ++ * disabled and the IPI will deliver after on_cpu is cleared.
3583 ++ *
3584 ++ * Ensure we load task_cpu(p) after p->on_cpu:
3585 ++ *
3586 ++ * set_task_cpu(p, cpu);
3587 ++ * STORE p->cpu = @cpu
3588 ++ * __schedule() (switch to task 'p')
3589 ++ * LOCK rq->lock
3590 ++ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
3591 ++ * STORE p->on_cpu = 1 LOAD p->cpu
3592 ++ *
3593 ++ * to ensure we observe the correct CPU on which the task is currently
3594 ++ * scheduling.
3595 ++ */
3596 ++ if (smp_load_acquire(&p->on_cpu) &&
3597 ++ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
3598 ++ goto unlock;
3599 ++
3600 ++ /*
3601 ++ * If the owning (remote) CPU is still in the middle of schedule() with
3602 ++ * this task as prev, wait until it's done referencing the task.
3603 ++ *
3604 ++ * Pairs with the smp_store_release() in finish_task().
3605 ++ *
3606 ++ * This ensures that tasks getting woken will be fully ordered against
3607 ++ * their previous state and preserve Program Order.
3608 ++ */
3609 ++ smp_cond_load_acquire(&p->on_cpu, !VAL);
3610 ++
3611 ++ sched_task_ttwu(p);
3612 ++
3613 ++ cpu = select_task_rq(p);
3614 ++
3615 ++ if (cpu != task_cpu(p)) {
3616 ++ if (p->in_iowait) {
3617 ++ delayacct_blkio_end(p);
3618 ++ atomic_dec(&task_rq(p)->nr_iowait);
3619 ++ }
3620 ++
3621 ++ wake_flags |= WF_MIGRATED;
3622 ++ psi_ttwu_dequeue(p);
3623 ++ set_task_cpu(p, cpu);
3624 ++ }
3625 ++#else
3626 ++ cpu = task_cpu(p);
3627 ++#endif /* CONFIG_SMP */
3628 ++
3629 ++ ttwu_queue(p, cpu, wake_flags);
3630 ++unlock:
3631 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3632 ++out:
3633 ++ if (success)
3634 ++ ttwu_stat(p, task_cpu(p), wake_flags);
3635 ++ preempt_enable();
3636 ++
3637 ++ return success;
3638 ++}
3639 ++
3640 ++static bool __task_needs_rq_lock(struct task_struct *p)
3641 ++{
3642 ++ unsigned int state = READ_ONCE(p->__state);
3643 ++
3644 ++ /*
3645 ++ * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
3646 ++ * the task is blocked. Make sure to check @state since ttwu() can drop
3647 ++ * locks at the end, see ttwu_queue_wakelist().
3648 ++ */
3649 ++ if (state == TASK_RUNNING || state == TASK_WAKING)
3650 ++ return true;
3651 ++
3652 ++ /*
3653 ++ * Ensure we load p->on_rq after p->__state, otherwise it would be
3654 ++ * possible to, falsely, observe p->on_rq == 0.
3655 ++ *
3656 ++ * See try_to_wake_up() for a longer comment.
3657 ++ */
3658 ++ smp_rmb();
3659 ++ if (p->on_rq)
3660 ++ return true;
3661 ++
3662 ++#ifdef CONFIG_SMP
3663 ++ /*
3664 ++ * Ensure the task has finished __schedule() and will not be referenced
3665 ++ * anymore. Again, see try_to_wake_up() for a longer comment.
3666 ++ */
3667 ++ smp_rmb();
3668 ++ smp_cond_load_acquire(&p->on_cpu, !VAL);
3669 ++#endif
3670 ++
3671 ++ return false;
3672 ++}
3673 ++
3674 ++/**
3675 ++ * task_call_func - Invoke a function on task in fixed state
3676 ++ * @p: Process for which the function is to be invoked, can be @current.
3677 ++ * @func: Function to invoke.
3678 ++ * @arg: Argument to function.
3679 ++ *
3680 ++ * Fix the task in its current state by avoiding wakeups and/or rq operations
3681 ++ * and call @func(@arg) on it. This function can use ->on_rq and task_curr()
3682 ++ * to work out what the state is, if required. Given that @func can be invoked
3683 ++ * with a runqueue lock held, it had better be quite lightweight.
3684 ++ *
3685 ++ * Returns:
3686 ++ * Whatever @func returns
3687 ++ */
3688 ++int task_call_func(struct task_struct *p, task_call_f func, void *arg)
3689 ++{
3690 ++ struct rq *rq = NULL;
3691 ++ struct rq_flags rf;
3692 ++ int ret;
3693 ++
3694 ++ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3695 ++
3696 ++ if (__task_needs_rq_lock(p))
3697 ++ rq = __task_rq_lock(p, &rf);
3698 ++
3699 ++ /*
3700 ++ * At this point the task is pinned; either:
3701 ++ * - blocked and we're holding off wakeups (pi->lock)
3702 ++ * - woken, and we're holding off enqueue (rq->lock)
3703 ++ * - queued, and we're holding off schedule (rq->lock)
3704 ++ * - running, and we're holding off de-schedule (rq->lock)
3705 ++ *
3706 ++ * The called function (@func) can use: task_curr(), p->on_rq and
3707 ++ * p->__state to differentiate between these states.
3708 ++ */
3709 ++ ret = func(p, arg);
3710 ++
3711 ++ if (rq)
3712 ++ __task_rq_unlock(rq, &rf);
3713 ++
3714 ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3715 ++ return ret;
3716 ++}
3717 ++
3718 ++/**
3719 ++ * cpu_curr_snapshot - Return a snapshot of the currently running task
3720 ++ * @cpu: The CPU on which to snapshot the task.
3721 ++ *
3722 ++ * Returns the task_struct pointer of the task "currently" running on
3723 ++ * the specified CPU. If the same task is running on that CPU throughout,
3724 ++ * the return value will be a pointer to that task's task_struct structure.
3725 ++ * If the CPU did any context switches even vaguely concurrently with the
3726 ++ * execution of this function, the return value will be a pointer to the
3727 ++ * task_struct structure of a randomly chosen task that was running on
3728 ++ * that CPU somewhere around the time that this function was executing.
3729 ++ *
3730 ++ * If the specified CPU was offline, the return value is whatever it
3731 ++ * is, perhaps a pointer to the task_struct structure of that CPU's idle
3732 ++ * task, but there is no guarantee. Callers wishing a useful return
3733 ++ * value must take some action to ensure that the specified CPU remains
3734 ++ * online throughout.
3735 ++ *
3736 ++ * This function executes full memory barriers before and after fetching
3737 ++ * the pointer, which permits the caller to confine this function's fetch
3738 ++ * with respect to the caller's accesses to other shared variables.
3739 ++ */
3740 ++struct task_struct *cpu_curr_snapshot(int cpu)
3741 ++{
3742 ++ struct task_struct *t;
3743 ++
3744 ++ smp_mb(); /* Pairing determined by caller's synchronization design. */
3745 ++ t = rcu_dereference(cpu_curr(cpu));
3746 ++ smp_mb(); /* Pairing determined by caller's synchronization design. */
3747 ++ return t;
3748 ++}
3749 ++
3750 ++/**
3751 ++ * wake_up_process - Wake up a specific process
3752 ++ * @p: The process to be woken up.
3753 ++ *
3754 ++ * Attempt to wake up the nominated process and move it to the set of runnable
3755 ++ * processes.
3756 ++ *
3757 ++ * Return: 1 if the process was woken up, 0 if it was already running.
3758 ++ *
3759 ++ * This function executes a full memory barrier before accessing the task state.
3760 ++ */
3761 ++int wake_up_process(struct task_struct *p)
3762 ++{
3763 ++ return try_to_wake_up(p, TASK_NORMAL, 0);
3764 ++}
3765 ++EXPORT_SYMBOL(wake_up_process);
3766 ++
3767 ++int wake_up_state(struct task_struct *p, unsigned int state)
3768 ++{
3769 ++ return try_to_wake_up(p, state, 0);
3770 ++}
3771 ++
3772 ++/*
3773 ++ * Perform scheduler related setup for a newly forked process p.
3774 ++ * p is forked by current.
3775 ++ *
3776 ++ * __sched_fork() is basic setup used by init_idle() too:
3777 ++ */
3778 ++static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p)
3779 ++{
3780 ++ p->on_rq = 0;
3781 ++ p->on_cpu = 0;
3782 ++ p->utime = 0;
3783 ++ p->stime = 0;
3784 ++ p->sched_time = 0;
3785 ++
3786 ++#ifdef CONFIG_SCHEDSTATS
3787 ++ /* Even if schedstat is disabled, there should not be garbage */
3788 ++ memset(&p->stats, 0, sizeof(p->stats));
3789 ++#endif
3790 ++
3791 ++#ifdef CONFIG_PREEMPT_NOTIFIERS
3792 ++ INIT_HLIST_HEAD(&p->preempt_notifiers);
3793 ++#endif
3794 ++
3795 ++#ifdef CONFIG_COMPACTION
3796 ++ p->capture_control = NULL;
3797 ++#endif
3798 ++#ifdef CONFIG_SMP
3799 ++ p->wake_entry.u_flags = CSD_TYPE_TTWU;
3800 ++#endif
3801 ++}
3802 ++
3803 ++/*
3804 ++ * fork()/clone()-time setup:
3805 ++ */
3806 ++int sched_fork(unsigned long clone_flags, struct task_struct *p)
3807 ++{
3808 ++ __sched_fork(clone_flags, p);
3809 ++ /*
3810 ++ * We mark the process as NEW here. This guarantees that
3811 ++ * nobody will actually run it, and a signal or other external
3812 ++ * event cannot wake it up and insert it on the runqueue either.
3813 ++ */
3814 ++ p->__state = TASK_NEW;
3815 ++
3816 ++ /*
3817 ++ * Make sure we do not leak PI boosting priority to the child.
3818 ++ */
3819 ++ p->prio = current->normal_prio;
3820 ++
3821 ++ /*
3822 ++ * Revert to default priority/policy on fork if requested.
3823 ++ */
3824 ++ if (unlikely(p->sched_reset_on_fork)) {
3825 ++ if (task_has_rt_policy(p)) {
3826 ++ p->policy = SCHED_NORMAL;
3827 ++ p->static_prio = NICE_TO_PRIO(0);
3828 ++ p->rt_priority = 0;
3829 ++ } else if (PRIO_TO_NICE(p->static_prio) < 0)
3830 ++ p->static_prio = NICE_TO_PRIO(0);
3831 ++
3832 ++ p->prio = p->normal_prio = p->static_prio;
3833 ++
3834 ++ /*
3835 ++ * We don't need the reset flag anymore after the fork. It has
3836 ++ * fulfilled its duty:
3837 ++ */
3838 ++ p->sched_reset_on_fork = 0;
3839 ++ }
3840 ++
3841 ++#ifdef CONFIG_SCHED_INFO
3842 ++ if (unlikely(sched_info_on()))
3843 ++ memset(&p->sched_info, 0, sizeof(p->sched_info));
3844 ++#endif
3845 ++ init_task_preempt_count(p);
3846 ++
3847 ++ return 0;
3848 ++}
3849 ++
3850 ++void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
3851 ++{
3852 ++ unsigned long flags;
3853 ++ struct rq *rq;
3854 ++
3855 ++ /*
3856 ++ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
3857 ++ * required yet, but lockdep gets upset if rules are violated.
3858 ++ */
3859 ++ raw_spin_lock_irqsave(&p->pi_lock, flags);
3860 ++ /*
3861 ++ * Share the timeslice between parent and child, thus the
3862 ++ * total amount of pending timeslices in the system doesn't change,
3863 ++ * resulting in more scheduling fairness.
3864 ++ */
3865 ++ rq = this_rq();
3866 ++ raw_spin_lock(&rq->lock);
3867 ++
3868 ++ rq->curr->time_slice /= 2;
3869 ++ p->time_slice = rq->curr->time_slice;
3870 ++#ifdef CONFIG_SCHED_HRTICK
3871 ++ hrtick_start(rq, rq->curr->time_slice);
3872 ++#endif
3873 ++
3874 ++ if (p->time_slice < RESCHED_NS) {
3875 ++ p->time_slice = sched_timeslice_ns;
3876 ++ resched_curr(rq);
3877 ++ }
3878 ++ sched_task_fork(p, rq);
3879 ++ raw_spin_unlock(&rq->lock);
3880 ++
3881 ++ rseq_migrate(p);
3882 ++ /*
3883 ++ * We're setting the CPU for the first time, we don't migrate,
3884 ++ * so use __set_task_cpu().
3885 ++ */
3886 ++ __set_task_cpu(p, smp_processor_id());
3887 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3888 ++}
3889 ++
3890 ++void sched_post_fork(struct task_struct *p)
3891 ++{
3892 ++}
3893 ++
3894 ++#ifdef CONFIG_SCHEDSTATS
3895 ++
3896 ++DEFINE_STATIC_KEY_FALSE(sched_schedstats);
3897 ++
3898 ++static void set_schedstats(bool enabled)
3899 ++{
3900 ++ if (enabled)
3901 ++ static_branch_enable(&sched_schedstats);
3902 ++ else
3903 ++ static_branch_disable(&sched_schedstats);
3904 ++}
3905 ++
3906 ++void force_schedstat_enabled(void)
3907 ++{
3908 ++ if (!schedstat_enabled()) {
3909 ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
3910 ++ static_branch_enable(&sched_schedstats);
3911 ++ }
3912 ++}
3913 ++
3914 ++static int __init setup_schedstats(char *str)
3915 ++{
3916 ++ int ret = 0;
3917 ++ if (!str)
3918 ++ goto out;
3919 ++
3920 ++ if (!strcmp(str, "enable")) {
3921 ++ set_schedstats(true);
3922 ++ ret = 1;
3923 ++ } else if (!strcmp(str, "disable")) {
3924 ++ set_schedstats(false);
3925 ++ ret = 1;
3926 ++ }
3927 ++out:
3928 ++ if (!ret)
3929 ++ pr_warn("Unable to parse schedstats=\n");
3930 ++
3931 ++ return ret;
3932 ++}
3933 ++__setup("schedstats=", setup_schedstats);
3934 ++
3935 ++#ifdef CONFIG_PROC_SYSCTL
3936 ++static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
3937 ++ size_t *lenp, loff_t *ppos)
3938 ++{
3939 ++ struct ctl_table t;
3940 ++ int err;
3941 ++ int state = static_branch_likely(&sched_schedstats);
3942 ++
3943 ++ if (write && !capable(CAP_SYS_ADMIN))
3944 ++ return -EPERM;
3945 ++
3946 ++ t = *table;
3947 ++ t.data = &state;
3948 ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3949 ++ if (err < 0)
3950 ++ return err;
3951 ++ if (write)
3952 ++ set_schedstats(state);
3953 ++ return err;
3954 ++}
3955 ++
3956 ++static struct ctl_table sched_core_sysctls[] = {
3957 ++ {
3958 ++ .procname = "sched_schedstats",
3959 ++ .data = NULL,
3960 ++ .maxlen = sizeof(unsigned int),
3961 ++ .mode = 0644,
3962 ++ .proc_handler = sysctl_schedstats,
3963 ++ .extra1 = SYSCTL_ZERO,
3964 ++ .extra2 = SYSCTL_ONE,
3965 ++ },
3966 ++ {}
3967 ++};
3968 ++static int __init sched_core_sysctl_init(void)
3969 ++{
3970 ++ register_sysctl_init("kernel", sched_core_sysctls);
3971 ++ return 0;
3972 ++}
3973 ++late_initcall(sched_core_sysctl_init);
3974 ++#endif /* CONFIG_PROC_SYSCTL */
3975 ++#endif /* CONFIG_SCHEDSTATS */
3976 ++
3977 ++/*
3978 ++ * wake_up_new_task - wake up a newly created task for the first time.
3979 ++ *
3980 ++ * This function will do some initial scheduler statistics housekeeping
3981 ++ * that must be done for every newly created context, then puts the task
3982 ++ * on the runqueue and wakes it.
3983 ++ */
3984 ++void wake_up_new_task(struct task_struct *p)
3985 ++{
3986 ++ unsigned long flags;
3987 ++ struct rq *rq;
3988 ++
3989 ++ raw_spin_lock_irqsave(&p->pi_lock, flags);
3990 ++ WRITE_ONCE(p->__state, TASK_RUNNING);
3991 ++ rq = cpu_rq(select_task_rq(p));
3992 ++#ifdef CONFIG_SMP
3993 ++ rseq_migrate(p);
3994 ++ /*
3995 ++ * Fork balancing, do it here and not earlier because:
3996 ++ * - cpus_ptr can change in the fork path
3997 ++ * - any previously selected CPU might disappear through hotplug
3998 ++ *
3999 ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
4000 ++ * as we're not fully set-up yet.
4001 ++ */
4002 ++ __set_task_cpu(p, cpu_of(rq));
4003 ++#endif
4004 ++
4005 ++ raw_spin_lock(&rq->lock);
4006 ++ update_rq_clock(rq);
4007 ++
4008 ++ activate_task(p, rq);
4009 ++ trace_sched_wakeup_new(p);
4010 ++ check_preempt_curr(rq);
4011 ++
4012 ++ raw_spin_unlock(&rq->lock);
4013 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4014 ++}
4015 ++
4016 ++#ifdef CONFIG_PREEMPT_NOTIFIERS
4017 ++
4018 ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
4019 ++
4020 ++void preempt_notifier_inc(void)
4021 ++{
4022 ++ static_branch_inc(&preempt_notifier_key);
4023 ++}
4024 ++EXPORT_SYMBOL_GPL(preempt_notifier_inc);
4025 ++
4026 ++void preempt_notifier_dec(void)
4027 ++{
4028 ++ static_branch_dec(&preempt_notifier_key);
4029 ++}
4030 ++EXPORT_SYMBOL_GPL(preempt_notifier_dec);
4031 ++
4032 ++/**
4033 ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled
4034 ++ * @notifier: notifier struct to register
4035 ++ */
4036 ++void preempt_notifier_register(struct preempt_notifier *notifier)
4037 ++{
4038 ++ if (!static_branch_unlikely(&preempt_notifier_key))
4039 ++ WARN(1, "registering preempt_notifier while notifiers disabled\n");
4040 ++
4041 ++ hlist_add_head(&notifier->link, &current->preempt_notifiers);
4042 ++}
4043 ++EXPORT_SYMBOL_GPL(preempt_notifier_register);
4044 ++
4045 ++/**
4046 ++ * preempt_notifier_unregister - no longer interested in preemption notifications
4047 ++ * @notifier: notifier struct to unregister
4048 ++ *
4049 ++ * This is *not* safe to call from within a preemption notifier.
4050 ++ */
4051 ++void preempt_notifier_unregister(struct preempt_notifier *notifier)
4052 ++{
4053 ++ hlist_del(&notifier->link);
4054 ++}
4055 ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
4056 ++
4057 ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
4058 ++{
4059 ++ struct preempt_notifier *notifier;
4060 ++
4061 ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4062 ++ notifier->ops->sched_in(notifier, raw_smp_processor_id());
4063 ++}
4064 ++
4065 ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4066 ++{
4067 ++ if (static_branch_unlikely(&preempt_notifier_key))
4068 ++ __fire_sched_in_preempt_notifiers(curr);
4069 ++}
4070 ++
4071 ++static void
4072 ++__fire_sched_out_preempt_notifiers(struct task_struct *curr,
4073 ++ struct task_struct *next)
4074 ++{
4075 ++ struct preempt_notifier *notifier;
4076 ++
4077 ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4078 ++ notifier->ops->sched_out(notifier, next);
4079 ++}
4080 ++
4081 ++static __always_inline void
4082 ++fire_sched_out_preempt_notifiers(struct task_struct *curr,
4083 ++ struct task_struct *next)
4084 ++{
4085 ++ if (static_branch_unlikely(&preempt_notifier_key))
4086 ++ __fire_sched_out_preempt_notifiers(curr, next);
4087 ++}
4088 ++
4089 ++#else /* !CONFIG_PREEMPT_NOTIFIERS */
4090 ++
4091 ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4092 ++{
4093 ++}
4094 ++
4095 ++static inline void
4096 ++fire_sched_out_preempt_notifiers(struct task_struct *curr,
4097 ++ struct task_struct *next)
4098 ++{
4099 ++}
4100 ++
4101 ++#endif /* CONFIG_PREEMPT_NOTIFIERS */
4102 ++
4103 ++static inline void prepare_task(struct task_struct *next)
4104 ++{
4105 ++ /*
4106 ++ * Claim the task as running, we do this before switching to it
4107 ++ * such that any running task will have this set.
4108 ++ *
4109 ++ * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
4110 ++ * its ordering comment.
4111 ++ */
4112 ++ WRITE_ONCE(next->on_cpu, 1);
4113 ++}
4114 ++
4115 ++static inline void finish_task(struct task_struct *prev)
4116 ++{
4117 ++#ifdef CONFIG_SMP
4118 ++ /*
4119 ++ * This must be the very last reference to @prev from this CPU. After
4120 ++ * p->on_cpu is cleared, the task can be moved to a different CPU. We
4121 ++ * must ensure this doesn't happen until the switch is completely
4122 ++ * finished.
4123 ++ *
4124 ++ * In particular, the load of prev->state in finish_task_switch() must
4125 ++ * happen before this.
4126 ++ *
4127 ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
4128 ++ */
4129 ++ smp_store_release(&prev->on_cpu, 0);
4130 ++#else
4131 ++ prev->on_cpu = 0;
4132 ++#endif
4133 ++}
4134 ++
4135 ++#ifdef CONFIG_SMP
4136 ++
4137 ++static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
4138 ++{
4139 ++ void (*func)(struct rq *rq);
4140 ++ struct balance_callback *next;
4141 ++
4142 ++ lockdep_assert_held(&rq->lock);
4143 ++
4144 ++ while (head) {
4145 ++ func = (void (*)(struct rq *))head->func;
4146 ++ next = head->next;
4147 ++ head->next = NULL;
4148 ++ head = next;
4149 ++
4150 ++ func(rq);
4151 ++ }
4152 ++}
4153 ++
4154 ++static void balance_push(struct rq *rq);
4155 ++
4156 ++/*
4157 ++ * balance_push_callback is a right abuse of the callback interface and plays
4158 ++ * by significantly different rules.
4159 ++ *
4160 ++ * Where the normal balance_callback's purpose is to be run in the same context
4161 ++ * that queued it (only later, when it's safe to drop rq->lock again),
4162 ++ * balance_push_callback is specifically targeted at __schedule().
4163 ++ *
4164 ++ * This abuse is tolerated because it places all the unlikely/odd cases behind
4165 ++ * a single test, namely: rq->balance_callback == NULL.
4166 ++ */
4167 ++struct balance_callback balance_push_callback = {
4168 ++ .next = NULL,
4169 ++ .func = balance_push,
4170 ++};
4171 ++
4172 ++static inline struct balance_callback *
4173 ++__splice_balance_callbacks(struct rq *rq, bool split)
4174 ++{
4175 ++ struct balance_callback *head = rq->balance_callback;
4176 ++
4177 ++ if (likely(!head))
4178 ++ return NULL;
4179 ++
4180 ++ lockdep_assert_rq_held(rq);
4181 ++ /*
4182 ++ * Must not take balance_push_callback off the list when
4183 ++ * splice_balance_callbacks() and balance_callbacks() are not
4184 ++ * in the same rq->lock section.
4185 ++ *
4186 ++ * In that case it would be possible for __schedule() to interleave
4187 ++ * and observe the list empty.
4188 ++ */
4189 ++ if (split && head == &balance_push_callback)
4190 ++ head = NULL;
4191 ++ else
4192 ++ rq->balance_callback = NULL;
4193 ++
4194 ++ return head;
4195 ++}
4196 ++
4197 ++static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
4198 ++{
4199 ++ return __splice_balance_callbacks(rq, true);
4200 ++}
4201 ++
4202 ++static void __balance_callbacks(struct rq *rq)
4203 ++{
4204 ++ do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
4205 ++}
4206 ++
4207 ++static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
4208 ++{
4209 ++ unsigned long flags;
4210 ++
4211 ++ if (unlikely(head)) {
4212 ++ raw_spin_lock_irqsave(&rq->lock, flags);
4213 ++ do_balance_callbacks(rq, head);
4214 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
4215 ++ }
4216 ++}
4217 ++
4218 ++#else
4219 ++
4220 ++static inline void __balance_callbacks(struct rq *rq)
4221 ++{
4222 ++}
4223 ++
4224 ++static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
4225 ++{
4226 ++ return NULL;
4227 ++}
4228 ++
4229 ++static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
4230 ++{
4231 ++}
4232 ++
4233 ++#endif
4234 ++
4235 ++static inline void
4236 ++prepare_lock_switch(struct rq *rq, struct task_struct *next)
4237 ++{
4238 ++ /*
4239 ++ * The runqueue lock will be released by the next
4240 ++ * task (which is an invalid locking op but in the case
4241 ++ * of the scheduler it's an obvious special-case), so we
4242 ++ * do an early lockdep release here:
4243 ++ */
4244 ++ spin_release(&rq->lock.dep_map, _THIS_IP_);
4245 ++#ifdef CONFIG_DEBUG_SPINLOCK
4246 ++ /* this is a valid case when another task releases the spinlock */
4247 ++ rq->lock.owner = next;
4248 ++#endif
4249 ++}
4250 ++
4251 ++static inline void finish_lock_switch(struct rq *rq)
4252 ++{
4253 ++ /*
4254 ++ * If we are tracking spinlock dependencies then we have to
4255 ++ * fix up the runqueue lock - which gets 'carried over' from
4256 ++ * prev into current:
4257 ++ */
4258 ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
4259 ++ __balance_callbacks(rq);
4260 ++ raw_spin_unlock_irq(&rq->lock);
4261 ++}
4262 ++
4263 ++/*
4264 ++ * NOP if the arch has not defined these:
4265 ++ */
4266 ++
4267 ++#ifndef prepare_arch_switch
4268 ++# define prepare_arch_switch(next) do { } while (0)
4269 ++#endif
4270 ++
4271 ++#ifndef finish_arch_post_lock_switch
4272 ++# define finish_arch_post_lock_switch() do { } while (0)
4273 ++#endif
4274 ++
4275 ++static inline void kmap_local_sched_out(void)
4276 ++{
4277 ++#ifdef CONFIG_KMAP_LOCAL
4278 ++ if (unlikely(current->kmap_ctrl.idx))
4279 ++ __kmap_local_sched_out();
4280 ++#endif
4281 ++}
4282 ++
4283 ++static inline void kmap_local_sched_in(void)
4284 ++{
4285 ++#ifdef CONFIG_KMAP_LOCAL
4286 ++ if (unlikely(current->kmap_ctrl.idx))
4287 ++ __kmap_local_sched_in();
4288 ++#endif
4289 ++}
4290 ++
4291 ++/**
4292 ++ * prepare_task_switch - prepare to switch tasks
4293 ++ * @rq: the runqueue preparing to switch
4294 ++ * @next: the task we are going to switch to.
4295 ++ *
4296 ++ * This is called with the rq lock held and interrupts off. It must
4297 ++ * be paired with a subsequent finish_task_switch after the context
4298 ++ * switch.
4299 ++ *
4300 ++ * prepare_task_switch sets up locking and calls architecture specific
4301 ++ * hooks.
4302 ++ */
4303 ++static inline void
4304 ++prepare_task_switch(struct rq *rq, struct task_struct *prev,
4305 ++ struct task_struct *next)
4306 ++{
4307 ++ kcov_prepare_switch(prev);
4308 ++ sched_info_switch(rq, prev, next);
4309 ++ perf_event_task_sched_out(prev, next);
4310 ++ rseq_preempt(prev);
4311 ++ fire_sched_out_preempt_notifiers(prev, next);
4312 ++ kmap_local_sched_out();
4313 ++ prepare_task(next);
4314 ++ prepare_arch_switch(next);
4315 ++}
4316 ++
4317 ++/**
4318 ++ * finish_task_switch - clean up after a task-switch
4319 ++ * @rq: runqueue associated with task-switch
4320 ++ * @prev: the thread we just switched away from.
4321 ++ *
4322 ++ * finish_task_switch must be called after the context switch, paired
4323 ++ * with a prepare_task_switch call before the context switch.
4324 ++ * finish_task_switch will reconcile locking set up by prepare_task_switch,
4325 ++ * and do any other architecture-specific cleanup actions.
4326 ++ *
4327 ++ * Note that we may have delayed dropping an mm in context_switch(). If
4328 ++ * so, we finish that here outside of the runqueue lock. (Doing it
4329 ++ * with the lock held can cause deadlocks; see schedule() for
4330 ++ * details.)
4331 ++ *
4332 ++ * The context switch has flipped the stack from under us and restored the
4333 ++ * local variables which were saved when this task called schedule() in the
4334 ++ * past. prev == current is still correct but we need to recalculate this_rq
4335 ++ * because prev may have moved to another CPU.
4336 ++ */
4337 ++static struct rq *finish_task_switch(struct task_struct *prev)
4338 ++ __releases(rq->lock)
4339 ++{
4340 ++ struct rq *rq = this_rq();
4341 ++ struct mm_struct *mm = rq->prev_mm;
4342 ++ unsigned int prev_state;
4343 ++
4344 ++ /*
4345 ++ * The previous task will have left us with a preempt_count of 2
4346 ++ * because it left us after:
4347 ++ *
4348 ++ * schedule()
4349 ++ * preempt_disable(); // 1
4350 ++ * __schedule()
4351 ++ * raw_spin_lock_irq(&rq->lock) // 2
4352 ++ *
4353 ++ * Also, see FORK_PREEMPT_COUNT.
4354 ++ */
4355 ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
4356 ++ "corrupted preempt_count: %s/%d/0x%x\n",
4357 ++ current->comm, current->pid, preempt_count()))
4358 ++ preempt_count_set(FORK_PREEMPT_COUNT);
4359 ++
4360 ++ rq->prev_mm = NULL;
4361 ++
4362 ++ /*
4363 ++ * A task struct has one reference for the use as "current".
4364 ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
4365 ++ * schedule one last time. The schedule call will never return, and
4366 ++ * the scheduled task must drop that reference.
4367 ++ *
4368 ++ * We must observe prev->state before clearing prev->on_cpu (in
4369 ++ * finish_task), otherwise a concurrent wakeup can get prev
4370 ++ * running on another CPU and we could race with its RUNNING -> DEAD
4371 ++ * transition, resulting in a double drop.
4372 ++ */
4373 ++ prev_state = READ_ONCE(prev->__state);
4374 ++ vtime_task_switch(prev);
4375 ++ perf_event_task_sched_in(prev, current);
4376 ++ finish_task(prev);
4377 ++ tick_nohz_task_switch();
4378 ++ finish_lock_switch(rq);
4379 ++ finish_arch_post_lock_switch();
4380 ++ kcov_finish_switch(current);
4381 ++ /*
4382 ++ * kmap_local_sched_out() is invoked with rq::lock held and
4383 ++ * interrupts disabled. There is no requirement for that, but the
4384 ++ * sched out code does not have an interrupt enabled section.
4385 ++ * Restoring the maps on sched in does not require interrupts being
4386 ++ * disabled either.
4387 ++ */
4388 ++ kmap_local_sched_in();
4389 ++
4390 ++ fire_sched_in_preempt_notifiers(current);
4391 ++ /*
4392 ++ * When switching through a kernel thread, the loop in
4393 ++ * membarrier_{private,global}_expedited() may have observed that
4394 ++ * kernel thread and not issued an IPI. It is therefore possible to
4395 ++ * schedule between user->kernel->user threads without passing through
4396 ++ * switch_mm(). Membarrier requires a barrier after storing to
4397 ++ * rq->curr, before returning to userspace, so provide them here:
4398 ++ *
4399 ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
4400 ++ * provided by mmdrop(),
4401 ++ * - a sync_core for SYNC_CORE.
4402 ++ */
4403 ++ if (mm) {
4404 ++ membarrier_mm_sync_core_before_usermode(mm);
4405 ++ mmdrop_sched(mm);
4406 ++ }
4407 ++ if (unlikely(prev_state == TASK_DEAD)) {
4408 ++ /* Task is done with its stack. */
4409 ++ put_task_stack(prev);
4410 ++
4411 ++ put_task_struct_rcu_user(prev);
4412 ++ }
4413 ++
4414 ++ return rq;
4415 ++}
4416 ++
4417 ++/**
4418 ++ * schedule_tail - first thing a freshly forked thread must call.
4419 ++ * @prev: the thread we just switched away from.
4420 ++ */
4421 ++asmlinkage __visible void schedule_tail(struct task_struct *prev)
4422 ++ __releases(rq->lock)
4423 ++{
4424 ++ /*
4425 ++ * New tasks start with FORK_PREEMPT_COUNT, see there and
4426 ++ * finish_task_switch() for details.
4427 ++ *
4428 ++ * finish_task_switch() will drop rq->lock() and lower preempt_count
4429 ++ * and the preempt_enable() will end up enabling preemption (on
4430 ++ * PREEMPT_COUNT kernels).
4431 ++ */
4432 ++
4433 ++ finish_task_switch(prev);
4434 ++ preempt_enable();
4435 ++
4436 ++ if (current->set_child_tid)
4437 ++ put_user(task_pid_vnr(current), current->set_child_tid);
4438 ++
4439 ++ calculate_sigpending();
4440 ++}
4441 ++
4442 ++/*
4443 ++ * context_switch - switch to the new MM and the new thread's register state.
4444 ++ */
4445 ++static __always_inline struct rq *
4446 ++context_switch(struct rq *rq, struct task_struct *prev,
4447 ++ struct task_struct *next)
4448 ++{
4449 ++ prepare_task_switch(rq, prev, next);
4450 ++
4451 ++ /*
4452 ++ * For paravirt, this is coupled with an exit in switch_to to
4453 ++ * combine the page table reload and the switch backend into
4454 ++ * one hypercall.
4455 ++ */
4456 ++ arch_start_context_switch(prev);
4457 ++
4458 ++ /*
4459 ++ * kernel -> kernel lazy + transfer active
4460 ++ * user -> kernel lazy + mmgrab() active
4461 ++ *
4462 ++ * kernel -> user switch + mmdrop() active
4463 ++ * user -> user switch
4464 ++ */
4465 ++ if (!next->mm) { // to kernel
4466 ++ enter_lazy_tlb(prev->active_mm, next);
4467 ++
4468 ++ next->active_mm = prev->active_mm;
4469 ++ if (prev->mm) // from user
4470 ++ mmgrab(prev->active_mm);
4471 ++ else
4472 ++ prev->active_mm = NULL;
4473 ++ } else { // to user
4474 ++ membarrier_switch_mm(rq, prev->active_mm, next->mm);
4475 ++ /*
4476 ++ * sys_membarrier() requires an smp_mb() between setting
4477 ++ * rq->curr / membarrier_switch_mm() and returning to userspace.
4478 ++ *
4479 ++ * The below provides this either through switch_mm(), or in
4480 ++ * case 'prev->active_mm == next->mm' through
4481 ++ * finish_task_switch()'s mmdrop().
4482 ++ */
4483 ++ switch_mm_irqs_off(prev->active_mm, next->mm, next);
4484 ++ lru_gen_use_mm(next->mm);
4485 ++
4486 ++ if (!prev->mm) { // from kernel
4487 ++ /* will mmdrop() in finish_task_switch(). */
4488 ++ rq->prev_mm = prev->active_mm;
4489 ++ prev->active_mm = NULL;
4490 ++ }
4491 ++ }
4492 ++
4493 ++ prepare_lock_switch(rq, next);
4494 ++
4495 ++ /* Here we just switch the register state and the stack. */
4496 ++ switch_to(prev, next, prev);
4497 ++ barrier();
4498 ++
4499 ++ return finish_task_switch(prev);
4500 ++}
4501 ++
4502 ++/*
4503 ++ * nr_running, nr_uninterruptible and nr_context_switches:
4504 ++ *
4505 ++ * externally visible scheduler statistics: current number of runnable
4506 ++ * threads, total number of context switches performed since bootup.
4507 ++ */
4508 ++unsigned int nr_running(void)
4509 ++{
4510 ++ unsigned int i, sum = 0;
4511 ++
4512 ++ for_each_online_cpu(i)
4513 ++ sum += cpu_rq(i)->nr_running;
4514 ++
4515 ++ return sum;
4516 ++}
4517 ++
4518 ++/*
4519 ++ * Check if only the current task is running on the CPU.
4520 ++ *
4521 ++ * Caution: this function does not check that the caller has disabled
4522 ++ * preemption, thus the result might have a time-of-check-to-time-of-use
4523 ++ * race. The caller is responsible for using it correctly, for example:
4524 ++ *
4525 ++ * - from a non-preemptible section (of course)
4526 ++ *
4527 ++ * - from a thread that is bound to a single CPU
4528 ++ *
4529 ++ * - in a loop with very short iterations (e.g. a polling loop)
4530 ++ */
4531 ++bool single_task_running(void)
4532 ++{
4533 ++ return raw_rq()->nr_running == 1;
4534 ++}
4535 ++EXPORT_SYMBOL(single_task_running);
4536 ++
4537 ++unsigned long long nr_context_switches(void)
4538 ++{
4539 ++ int i;
4540 ++ unsigned long long sum = 0;
4541 ++
4542 ++ for_each_possible_cpu(i)
4543 ++ sum += cpu_rq(i)->nr_switches;
4544 ++
4545 ++ return sum;
4546 ++}
4547 ++
4548 ++/*
4549 ++ * Consumers of these two interfaces, like for example the cpuidle menu
4550 ++ * governor, are using nonsensical data, preferring shallow idle state selection
4551 ++ * for a CPU that has IO-wait which might not even end up running the task when
4552 ++ * it does become runnable.
4553 ++ */
4554 ++
4555 ++unsigned int nr_iowait_cpu(int cpu)
4556 ++{
4557 ++ return atomic_read(&cpu_rq(cpu)->nr_iowait);
4558 ++}
4559 ++
4560 ++/*
4561 ++ * IO-wait accounting, and how it's mostly bollocks (on SMP).
4562 ++ *
4563 ++ * The idea behind IO-wait accounting is to account the idle time that we could
4564 ++ * have spent running if it were not for IO. That is, if we were to improve the
4565 ++ * storage performance, we'd have a proportional reduction in IO-wait time.
4566 ++ *
4567 ++ * This all works nicely on UP, where, when a task blocks on IO, we account
4568 ++ * idle time as IO-wait, because if the storage were faster, it could've been
4569 ++ * running and we'd not be idle.
4570 ++ *
4571 ++ * This has been extended to SMP, by doing the same for each CPU. This however
4572 ++ * is broken.
4573 ++ *
4574 ++ * Imagine for instance the case where two tasks block on one CPU, only the one
4575 ++ * CPU will have IO-wait accounted, while the other has regular idle. Even
4576 ++ * though, if the storage were faster, both could've run at the same time,
4577 ++ * utilising both CPUs.
4578 ++ *
4579 ++ * This means that, when looking globally, the current IO-wait accounting on
4580 ++ * SMP is a lower bound, by reason of under-accounting.
4581 ++ *
4582 ++ * Worse, since the numbers are provided per CPU, they are sometimes
4583 ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
4584 ++ * associated with any one particular CPU, it can wake to another CPU than it
4585 ++ * blocked on. This means the per CPU IO-wait number is meaningless.
4586 ++ *
4587 ++ * Task CPU affinities can make all that even more 'interesting'.
4588 ++ */
4589 ++
4590 ++unsigned int nr_iowait(void)
4591 ++{
4592 ++ unsigned int i, sum = 0;
4593 ++
4594 ++ for_each_possible_cpu(i)
4595 ++ sum += nr_iowait_cpu(i);
4596 ++
4597 ++ return sum;
4598 ++}
4599 ++
4600 ++#ifdef CONFIG_SMP
4601 ++
4602 ++/*
4603 ++ * sched_exec - execve() is a valuable balancing opportunity, because at
4604 ++ * this point the task has the smallest effective memory and cache
4605 ++ * footprint.
4606 ++ */
4607 ++void sched_exec(void)
4608 ++{
4609 ++}
4610 ++
4611 ++#endif
4612 ++
4613 ++DEFINE_PER_CPU(struct kernel_stat, kstat);
4614 ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
4615 ++
4616 ++EXPORT_PER_CPU_SYMBOL(kstat);
4617 ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
4618 ++
4619 ++static inline void update_curr(struct rq *rq, struct task_struct *p)
4620 ++{
4621 ++ s64 ns = rq->clock_task - p->last_ran;
4622 ++
4623 ++ p->sched_time += ns;
4624 ++ cgroup_account_cputime(p, ns);
4625 ++ account_group_exec_runtime(p, ns);
4626 ++
4627 ++ p->time_slice -= ns;
4628 ++ p->last_ran = rq->clock_task;
4629 ++}
4630 ++
4631 ++/*
4632 ++ * Return accounted runtime for the task.
4633 ++ * Return separately the current's pending runtime that has not been
4634 ++ * accounted yet.
4635 ++ */
4636 ++unsigned long long task_sched_runtime(struct task_struct *p)
4637 ++{
4638 ++ unsigned long flags;
4639 ++ struct rq *rq;
4640 ++ raw_spinlock_t *lock;
4641 ++ u64 ns;
4642 ++
4643 ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
4644 ++ /*
4645 ++ * 64-bit doesn't need locks to atomically read a 64-bit value.
4646 ++ * So we have an optimization chance when the task's delta_exec is 0.
4647 ++ * Reading ->on_cpu is racy, but this is ok.
4648 ++ *
4649 ++ * If we race with it leaving CPU, we'll take a lock. So we're correct.
4650 ++ * If we race with it entering CPU, unaccounted time is 0. This is
4651 ++ * indistinguishable from the read occurring a few cycles earlier.
4652 ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
4653 ++ * been accounted, so we're correct here as well.
4654 ++ */
4655 ++ if (!p->on_cpu || !task_on_rq_queued(p))
4656 ++ return tsk_seruntime(p);
4657 ++#endif
4658 ++
4659 ++ rq = task_access_lock_irqsave(p, &lock, &flags);
4660 ++ /*
4661 ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would
4662 ++ * project cycles that may never be accounted to this
4663 ++ * thread, breaking clock_gettime().
4664 ++ */
4665 ++ if (p == rq->curr && task_on_rq_queued(p)) {
4666 ++ update_rq_clock(rq);
4667 ++ update_curr(rq, p);
4668 ++ }
4669 ++ ns = tsk_seruntime(p);
4670 ++ task_access_unlock_irqrestore(p, lock, &flags);
4671 ++
4672 ++ return ns;
4673 ++}
4674 ++
4675 ++/* This manages tasks that have run out of timeslice during a scheduler_tick */
4676 ++static inline void scheduler_task_tick(struct rq *rq)
4677 ++{
4678 ++ struct task_struct *p = rq->curr;
4679 ++
4680 ++ if (is_idle_task(p))
4681 ++ return;
4682 ++
4683 ++ update_curr(rq, p);
4684 ++ cpufreq_update_util(rq, 0);
4685 ++
4686 ++ /*
4687 ++ * Tasks that have less than RESCHED_NS of time slice left will be
4688 ++ * rescheduled.
4689 ++ */
4690 ++ if (p->time_slice >= RESCHED_NS)
4691 ++ return;
4692 ++ set_tsk_need_resched(p);
4693 ++ set_preempt_need_resched();
4694 ++}
4695 ++
4696 ++#ifdef CONFIG_SCHED_DEBUG
4697 ++static u64 cpu_resched_latency(struct rq *rq)
4698 ++{
4699 ++ int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
4700 ++ u64 resched_latency, now = rq_clock(rq);
4701 ++ static bool warned_once;
4702 ++
4703 ++ if (sysctl_resched_latency_warn_once && warned_once)
4704 ++ return 0;
4705 ++
4706 ++ if (!need_resched() || !latency_warn_ms)
4707 ++ return 0;
4708 ++
4709 ++ if (system_state == SYSTEM_BOOTING)
4710 ++ return 0;
4711 ++
4712 ++ if (!rq->last_seen_need_resched_ns) {
4713 ++ rq->last_seen_need_resched_ns = now;
4714 ++ rq->ticks_without_resched = 0;
4715 ++ return 0;
4716 ++ }
4717 ++
4718 ++ rq->ticks_without_resched++;
4719 ++ resched_latency = now - rq->last_seen_need_resched_ns;
4720 ++ if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
4721 ++ return 0;
4722 ++
4723 ++ warned_once = true;
4724 ++
4725 ++ return resched_latency;
4726 ++}
4727 ++
4728 ++static int __init setup_resched_latency_warn_ms(char *str)
4729 ++{
4730 ++ long val;
4731 ++
4732 ++ if ((kstrtol(str, 0, &val))) {
4733 ++ pr_warn("Unable to set resched_latency_warn_ms\n");
4734 ++ return 1;
4735 ++ }
4736 ++
4737 ++ sysctl_resched_latency_warn_ms = val;
4738 ++ return 1;
4739 ++}
4740 ++__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
4741 ++#else
4742 ++static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
4743 ++#endif /* CONFIG_SCHED_DEBUG */
4744 ++
4745 ++/*
4746 ++ * This function gets called by the timer code, with HZ frequency.
4747 ++ * We call it with interrupts disabled.
4748 ++ */
4749 ++void scheduler_tick(void)
4750 ++{
4751 ++ int cpu __maybe_unused = smp_processor_id();
4752 ++ struct rq *rq = cpu_rq(cpu);
4753 ++ u64 resched_latency;
4754 ++
4755 ++ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
4756 ++ arch_scale_freq_tick();
4757 ++
4758 ++ sched_clock_tick();
4759 ++
4760 ++ raw_spin_lock(&rq->lock);
4761 ++ update_rq_clock(rq);
4762 ++
4763 ++ scheduler_task_tick(rq);
4764 ++ if (sched_feat(LATENCY_WARN))
4765 ++ resched_latency = cpu_resched_latency(rq);
4766 ++ calc_global_load_tick(rq);
4767 ++
4768 ++ rq->last_tick = rq->clock;
4769 ++ raw_spin_unlock(&rq->lock);
4770 ++
4771 ++ if (sched_feat(LATENCY_WARN) && resched_latency)
4772 ++ resched_latency_warn(cpu, resched_latency);
4773 ++
4774 ++ perf_event_task_tick();
4775 ++}
4776 ++
4777 ++#ifdef CONFIG_SCHED_SMT
4778 ++static inline int sg_balance_cpu_stop(void *data)
4779 ++{
4780 ++ struct rq *rq = this_rq();
4781 ++ struct task_struct *p = data;
4782 ++ cpumask_t tmp;
4783 ++ unsigned long flags;
4784 ++
4785 ++ local_irq_save(flags);
4786 ++
4787 ++ raw_spin_lock(&p->pi_lock);
4788 ++ raw_spin_lock(&rq->lock);
4789 ++
4790 ++ rq->active_balance = 0;
4791 ++ /* _something_ may have changed the task, double check again */
4792 ++ if (task_on_rq_queued(p) && task_rq(p) == rq &&
4793 ++ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask) &&
4794 ++ !is_migration_disabled(p)) {
4795 ++ int cpu = cpu_of(rq);
4796 ++ int dcpu = __best_mask_cpu(&tmp, per_cpu(sched_cpu_llc_mask, cpu));
4797 ++ rq = move_queued_task(rq, p, dcpu);
4798 ++ }
4799 ++
4800 ++ raw_spin_unlock(&rq->lock);
4801 ++ raw_spin_unlock(&p->pi_lock);
4802 ++
4803 ++ local_irq_restore(flags);
4804 ++
4805 ++ return 0;
4806 ++}
4807 ++
4808 ++/* sg_balance_trigger - trigger sibling group balance for @cpu */
4809 ++static inline int sg_balance_trigger(const int cpu)
4810 ++{
4811 ++ struct rq *rq = cpu_rq(cpu);
4812 ++ unsigned long flags;
4813 ++ struct task_struct *curr;
4814 ++ int res;
4815 ++
4816 ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags))
4817 ++ return 0;
4818 ++ curr = rq->curr;
4819 ++ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\
4820 ++ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\
4821 ++ !is_migration_disabled(curr) && (!rq->active_balance);
4822 ++
4823 ++ if (res)
4824 ++ rq->active_balance = 1;
4825 ++
4826 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
4827 ++
4828 ++ if (res)
4829 ++ stop_one_cpu_nowait(cpu, sg_balance_cpu_stop, curr,
4830 ++ &rq->active_balance_work);
4831 ++ return res;
4832 ++}
4833 ++
4834 ++/*
4835 ++ * sg_balance - sibling group balance check for run queue @rq
4836 ++ */
4837 ++static inline void sg_balance(struct rq *rq)
4838 ++{
4839 ++ cpumask_t chk;
4840 ++ int cpu = cpu_of(rq);
4841 ++
4842 ++ /* exit when cpu is offline */
4843 ++ if (unlikely(!rq->online))
4844 ++ return;
4845 ++
4846 ++ /*
4847 ++ * Only a cpu in the sibling idle group will do the checking and then
4848 ++ * find potential cpus to which the currently running task can be migrated
4849 ++ */
4850 ++ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) &&
4851 ++ cpumask_andnot(&chk, cpu_online_mask, sched_idle_mask) &&
4852 ++ cpumask_andnot(&chk, &chk, &sched_rq_pending_mask)) {
4853 ++ int i;
4854 ++
4855 ++ for_each_cpu_wrap(i, &chk, cpu) {
4856 ++ if (!cpumask_intersects(cpu_smt_mask(i), sched_idle_mask) &&\
4857 ++ sg_balance_trigger(i))
4858 ++ return;
4859 ++ }
4860 ++ }
4861 ++}
4862 ++#endif /* CONFIG_SCHED_SMT */
4863 ++
4864 ++#ifdef CONFIG_NO_HZ_FULL
4865 ++
4866 ++struct tick_work {
4867 ++ int cpu;
4868 ++ atomic_t state;
4869 ++ struct delayed_work work;
4870 ++};
4871 ++/* Values for ->state, see diagram below. */
4872 ++#define TICK_SCHED_REMOTE_OFFLINE 0
4873 ++#define TICK_SCHED_REMOTE_OFFLINING 1
4874 ++#define TICK_SCHED_REMOTE_RUNNING 2
4875 ++
4876 ++/*
4877 ++ * State diagram for ->state:
4878 ++ *
4879 ++ *
4880 ++ * TICK_SCHED_REMOTE_OFFLINE
4881 ++ * | ^
4882 ++ * | |
4883 ++ * | | sched_tick_remote()
4884 ++ * | |
4885 ++ * | |
4886 ++ * +--TICK_SCHED_REMOTE_OFFLINING
4887 ++ * | ^
4888 ++ * | |
4889 ++ * sched_tick_start() | | sched_tick_stop()
4890 ++ * | |
4891 ++ * V |
4892 ++ * TICK_SCHED_REMOTE_RUNNING
4893 ++ *
4894 ++ *
4895 ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
4896 ++ * and sched_tick_start() are happy to leave the state in RUNNING.
4897 ++ */
4898 ++
4899 ++static struct tick_work __percpu *tick_work_cpu;
4900 ++
4901 ++static void sched_tick_remote(struct work_struct *work)
4902 ++{
4903 ++ struct delayed_work *dwork = to_delayed_work(work);
4904 ++ struct tick_work *twork = container_of(dwork, struct tick_work, work);
4905 ++ int cpu = twork->cpu;
4906 ++ struct rq *rq = cpu_rq(cpu);
4907 ++ struct task_struct *curr;
4908 ++ unsigned long flags;
4909 ++ u64 delta;
4910 ++ int os;
4911 ++
4912 ++ /*
4913 ++ * Handle the tick only if it appears the remote CPU is running in full
4914 ++ * dynticks mode. The check is racy by nature, but missing a tick or
4915 ++ * having one too many is no big deal because the scheduler tick updates
4916 ++ * statistics and checks timeslices in a time-independent way, regardless
4917 ++ * of when exactly it is running.
4918 ++ */
4919 ++ if (!tick_nohz_tick_stopped_cpu(cpu))
4920 ++ goto out_requeue;
4921 ++
4922 ++ raw_spin_lock_irqsave(&rq->lock, flags);
4923 ++ curr = rq->curr;
4924 ++ if (cpu_is_offline(cpu))
4925 ++ goto out_unlock;
4926 ++
4927 ++ update_rq_clock(rq);
4928 ++ if (!is_idle_task(curr)) {
4929 ++ /*
4930 ++ * Make sure the next tick runs within a reasonable
4931 ++ * amount of time.
4932 ++ */
4933 ++ delta = rq_clock_task(rq) - curr->last_ran;
4934 ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4935 ++ }
4936 ++ scheduler_task_tick(rq);
4937 ++
4938 ++ calc_load_nohz_remote(rq);
4939 ++out_unlock:
4940 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
4941 ++
4942 ++out_requeue:
4943 ++ /*
4944 ++ * Run the remote tick once per second (1Hz). This arbitrary
4945 ++ * frequency is low enough to avoid overload but high enough
4946 ++ * to keep scheduler internal stats reasonably up to date. But
4947 ++ * first update state to reflect hotplug activity if required.
4948 ++ */
4949 ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
4950 ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
4951 ++ if (os == TICK_SCHED_REMOTE_RUNNING)
4952 ++ queue_delayed_work(system_unbound_wq, dwork, HZ);
4953 ++}
4954 ++
4955 ++static void sched_tick_start(int cpu)
4956 ++{
4957 ++ int os;
4958 ++ struct tick_work *twork;
4959 ++
4960 ++ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
4961 ++ return;
4962 ++
4963 ++ WARN_ON_ONCE(!tick_work_cpu);
4964 ++
4965 ++ twork = per_cpu_ptr(tick_work_cpu, cpu);
4966 ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
4967 ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
4968 ++ if (os == TICK_SCHED_REMOTE_OFFLINE) {
4969 ++ twork->cpu = cpu;
4970 ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
4971 ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
4972 ++ }
4973 ++}
4974 ++
4975 ++#ifdef CONFIG_HOTPLUG_CPU
4976 ++static void sched_tick_stop(int cpu)
4977 ++{
4978 ++ struct tick_work *twork;
4979 ++ int os;
4980 ++
4981 ++ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
4982 ++ return;
4983 ++
4984 ++ WARN_ON_ONCE(!tick_work_cpu);
4985 ++
4986 ++ twork = per_cpu_ptr(tick_work_cpu, cpu);
4987 ++ /* There cannot be competing actions, but don't rely on stop-machine. */
4988 ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
4989 ++ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
4990 ++ /* Don't cancel, as this would mess up the state machine. */
4991 ++}
4992 ++#endif /* CONFIG_HOTPLUG_CPU */
4993 ++
4994 ++int __init sched_tick_offload_init(void)
4995 ++{
4996 ++ tick_work_cpu = alloc_percpu(struct tick_work);
4997 ++ BUG_ON(!tick_work_cpu);
4998 ++ return 0;
4999 ++}
5000 ++
5001 ++#else /* !CONFIG_NO_HZ_FULL */
5002 ++static inline void sched_tick_start(int cpu) { }
5003 ++static inline void sched_tick_stop(int cpu) { }
5004 ++#endif
5005 ++
5006 ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
5007 ++ defined(CONFIG_PREEMPT_TRACER))
5008 ++/*
5009 ++ * If the value passed in is equal to the current preempt count
5010 ++ * then we just disabled preemption. Start timing the latency.
5011 ++ */
5012 ++static inline void preempt_latency_start(int val)
5013 ++{
5014 ++ if (preempt_count() == val) {
5015 ++ unsigned long ip = get_lock_parent_ip();
5016 ++#ifdef CONFIG_DEBUG_PREEMPT
5017 ++ current->preempt_disable_ip = ip;
5018 ++#endif
5019 ++ trace_preempt_off(CALLER_ADDR0, ip);
5020 ++ }
5021 ++}
5022 ++
5023 ++void preempt_count_add(int val)
5024 ++{
5025 ++#ifdef CONFIG_DEBUG_PREEMPT
5026 ++ /*
5027 ++ * Underflow?
5028 ++ */
5029 ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
5030 ++ return;
5031 ++#endif
5032 ++ __preempt_count_add(val);
5033 ++#ifdef CONFIG_DEBUG_PREEMPT
5034 ++ /*
5035 ++ * Spinlock count overflowing soon?
5036 ++ */
5037 ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
5038 ++ PREEMPT_MASK - 10);
5039 ++#endif
5040 ++ preempt_latency_start(val);
5041 ++}
5042 ++EXPORT_SYMBOL(preempt_count_add);
5043 ++NOKPROBE_SYMBOL(preempt_count_add);
5044 ++
5045 ++/*
5046 ++ * If the value passed in is equal to the current preempt count
5047 ++ * then we just enabled preemption. Stop timing the latency.
5048 ++ */
5049 ++static inline void preempt_latency_stop(int val)
5050 ++{
5051 ++ if (preempt_count() == val)
5052 ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
5053 ++}
5054 ++
5055 ++void preempt_count_sub(int val)
5056 ++{
5057 ++#ifdef CONFIG_DEBUG_PREEMPT
5058 ++ /*
5059 ++ * Underflow?
5060 ++ */
5061 ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
5062 ++ return;
5063 ++ /*
5064 ++ * Is the spinlock portion underflowing?
5065 ++ */
5066 ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
5067 ++ !(preempt_count() & PREEMPT_MASK)))
5068 ++ return;
5069 ++#endif
5070 ++
5071 ++ preempt_latency_stop(val);
5072 ++ __preempt_count_sub(val);
5073 ++}
5074 ++EXPORT_SYMBOL(preempt_count_sub);
5075 ++NOKPROBE_SYMBOL(preempt_count_sub);
5076 ++
5077 ++#else
5078 ++static inline void preempt_latency_start(int val) { }
5079 ++static inline void preempt_latency_stop(int val) { }
5080 ++#endif
5081 ++
5082 ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
5083 ++{
5084 ++#ifdef CONFIG_DEBUG_PREEMPT
5085 ++ return p->preempt_disable_ip;
5086 ++#else
5087 ++ return 0;
5088 ++#endif
5089 ++}
5090 ++
5091 ++/*
5092 ++ * Print scheduling while atomic bug:
5093 ++ */
5094 ++static noinline void __schedule_bug(struct task_struct *prev)
5095 ++{
5096 ++ /* Save this before calling printk(), since that will clobber it */
5097 ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
5098 ++
5099 ++ if (oops_in_progress)
5100 ++ return;
5101 ++
5102 ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
5103 ++ prev->comm, prev->pid, preempt_count());
5104 ++
5105 ++ debug_show_held_locks(prev);
5106 ++ print_modules();
5107 ++ if (irqs_disabled())
5108 ++ print_irqtrace_events(prev);
5109 ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
5110 ++ && in_atomic_preempt_off()) {
5111 ++ pr_err("Preemption disabled at:");
5112 ++ print_ip_sym(KERN_ERR, preempt_disable_ip);
5113 ++ }
5114 ++ check_panic_on_warn("scheduling while atomic");
5115 ++
5116 ++ dump_stack();
5117 ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5118 ++}
5119 ++
5120 ++/*
5121 ++ * Various schedule()-time debugging checks and statistics:
5122 ++ */
5123 ++static inline void schedule_debug(struct task_struct *prev, bool preempt)
5124 ++{
5125 ++#ifdef CONFIG_SCHED_STACK_END_CHECK
5126 ++ if (task_stack_end_corrupted(prev))
5127 ++ panic("corrupted stack end detected inside scheduler\n");
5128 ++
5129 ++ if (task_scs_end_corrupted(prev))
5130 ++ panic("corrupted shadow stack detected inside scheduler\n");
5131 ++#endif
5132 ++
5133 ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
5134 ++ if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
5135 ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
5136 ++ prev->comm, prev->pid, prev->non_block_count);
5137 ++ dump_stack();
5138 ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5139 ++ }
5140 ++#endif
5141 ++
5142 ++ if (unlikely(in_atomic_preempt_off())) {
5143 ++ __schedule_bug(prev);
5144 ++ preempt_count_set(PREEMPT_DISABLED);
5145 ++ }
5146 ++ rcu_sleep_check();
5147 ++ SCHED_WARN_ON(ct_state() == CONTEXT_USER);
5148 ++
5149 ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
5150 ++
5151 ++ schedstat_inc(this_rq()->sched_count);
5152 ++}
5153 ++
5154 ++/*
5155 ++ * Compile time debug macro
5156 ++ * #define ALT_SCHED_DEBUG
5157 ++ */
5158 ++
5159 ++#ifdef ALT_SCHED_DEBUG
5160 ++void alt_sched_debug(void)
5161 ++{
5162 ++ printk(KERN_INFO "sched: pending: 0x%04lx, idle: 0x%04lx, sg_idle: 0x%04lx\n",
5163 ++ sched_rq_pending_mask.bits[0],
5164 ++ sched_idle_mask->bits[0],
5165 ++ sched_sg_idle_mask.bits[0]);
5166 ++}
5167 ++#else
5168 ++inline void alt_sched_debug(void) {}
5169 ++#endif
5170 ++
5171 ++#ifdef CONFIG_SMP
5172 ++
5173 ++#ifdef CONFIG_PREEMPT_RT
5174 ++#define SCHED_NR_MIGRATE_BREAK 8
5175 ++#else
5176 ++#define SCHED_NR_MIGRATE_BREAK 32
5177 ++#endif
5178 ++
5179 ++const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
5180 ++
5181 ++/*
5182 ++ * Migrate pending tasks in @rq to @dest_cpu
5183 ++ */
5184 ++static inline int
5185 ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu)
5186 ++{
5187 ++ struct task_struct *p, *skip = rq->curr;
5188 ++ int nr_migrated = 0;
5189 ++ int nr_tries = min(rq->nr_running / 2, sysctl_sched_nr_migrate);
5190 ++
5191 ++ while (skip != rq->idle && nr_tries &&
5192 ++ (p = sched_rq_next_task(skip, rq)) != rq->idle) {
5193 ++ skip = sched_rq_next_task(p, rq);
5194 ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) {
5195 ++ __SCHED_DEQUEUE_TASK(p, rq, 0);
5196 ++ set_task_cpu(p, dest_cpu);
5197 ++ sched_task_sanity_check(p, dest_rq);
5198 ++ __SCHED_ENQUEUE_TASK(p, dest_rq, 0);
5199 ++ nr_migrated++;
5200 ++ }
5201 ++ nr_tries--;
5202 ++ }
5203 ++
5204 ++ return nr_migrated;
5205 ++}
5206 ++
5207 ++static inline int take_other_rq_tasks(struct rq *rq, int cpu)
5208 ++{
5209 ++ struct cpumask *topo_mask, *end_mask;
5210 ++
5211 ++ if (unlikely(!rq->online))
5212 ++ return 0;
5213 ++
5214 ++ if (cpumask_empty(&sched_rq_pending_mask))
5215 ++ return 0;
5216 ++
5217 ++ topo_mask = per_cpu(sched_cpu_topo_masks, cpu) + 1;
5218 ++ end_mask = per_cpu(sched_cpu_topo_end_mask, cpu);
5219 ++ do {
5220 ++ int i;
5221 ++ for_each_cpu_and(i, &sched_rq_pending_mask, topo_mask) {
5222 ++ int nr_migrated;
5223 ++ struct rq *src_rq;
5224 ++
5225 ++ src_rq = cpu_rq(i);
5226 ++ if (!do_raw_spin_trylock(&src_rq->lock))
5227 ++ continue;
5228 ++ spin_acquire(&src_rq->lock.dep_map,
5229 ++ SINGLE_DEPTH_NESTING, 1, _RET_IP_);
5230 ++
5231 ++ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) {
5232 ++ src_rq->nr_running -= nr_migrated;
5233 ++ if (src_rq->nr_running < 2)
5234 ++ cpumask_clear_cpu(i, &sched_rq_pending_mask);
5235 ++
5236 ++ spin_release(&src_rq->lock.dep_map, _RET_IP_);
5237 ++ do_raw_spin_unlock(&src_rq->lock);
5238 ++
5239 ++ rq->nr_running += nr_migrated;
5240 ++ if (rq->nr_running > 1)
5241 ++ cpumask_set_cpu(cpu, &sched_rq_pending_mask);
5242 ++
5243 ++ cpufreq_update_util(rq, 0);
5244 ++
5245 ++ return 1;
5246 ++ }
5247 ++
5248 ++ spin_release(&src_rq->lock.dep_map, _RET_IP_);
5249 ++ do_raw_spin_unlock(&src_rq->lock);
5250 ++ }
5251 ++ } while (++topo_mask < end_mask);
5252 ++
5253 ++ return 0;
5254 ++}
5255 ++#endif
5256 ++
5257 ++/*
5258 ++ * Timeslices below RESCHED_NS are considered as good as expired as there's no
5259 ++ * point rescheduling when there's so little time left.
5260 ++ */
5261 ++static inline void check_curr(struct task_struct *p, struct rq *rq)
5262 ++{
5263 ++ if (unlikely(rq->idle == p))
5264 ++ return;
5265 ++
5266 ++ update_curr(rq, p);
5267 ++
5268 ++ if (p->time_slice < RESCHED_NS)
5269 ++ time_slice_expired(p, rq);
5270 ++}
5271 ++
5272 ++static inline struct task_struct *
5273 ++choose_next_task(struct rq *rq, int cpu)
5274 ++{
5275 ++ struct task_struct *next;
5276 ++
5277 ++ if (unlikely(rq->skip)) {
5278 ++ next = rq_runnable_task(rq);
5279 ++ if (next == rq->idle) {
5280 ++#ifdef CONFIG_SMP
5281 ++ if (!take_other_rq_tasks(rq, cpu)) {
5282 ++#endif
5283 ++ rq->skip = NULL;
5284 ++ schedstat_inc(rq->sched_goidle);
5285 ++ return next;
5286 ++#ifdef CONFIG_SMP
5287 ++ }
5288 ++ next = rq_runnable_task(rq);
5289 ++#endif
5290 ++ }
5291 ++ rq->skip = NULL;
5292 ++#ifdef CONFIG_HIGH_RES_TIMERS
5293 ++ hrtick_start(rq, next->time_slice);
5294 ++#endif
5295 ++ return next;
5296 ++ }
5297 ++
5298 ++ next = sched_rq_first_task(rq);
5299 ++ if (next == rq->idle) {
5300 ++#ifdef CONFIG_SMP
5301 ++ if (!take_other_rq_tasks(rq, cpu)) {
5302 ++#endif
5303 ++ schedstat_inc(rq->sched_goidle);
5304 ++ /*printk(KERN_INFO "sched: choose_next_task(%d) idle %px\n", cpu, next);*/
5305 ++ return next;
5306 ++#ifdef CONFIG_SMP
5307 ++ }
5308 ++ next = sched_rq_first_task(rq);
5309 ++#endif
5310 ++ }
5311 ++#ifdef CONFIG_HIGH_RES_TIMERS
5312 ++ hrtick_start(rq, next->time_slice);
5313 ++#endif
5314 ++ /*printk(KERN_INFO "sched: choose_next_task(%d) next %px\n", cpu,
5315 ++ * next);*/
5316 ++ return next;
5317 ++}
5318 ++
5319 ++/*
5320 ++ * Constants for the sched_mode argument of __schedule().
5321 ++ *
5322 ++ * The mode argument allows RT enabled kernels to differentiate a
5323 ++ * preemption from blocking on an 'sleeping' spin/rwlock. Note that
5324 ++ * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
5325 ++ * optimize the AND operation out and just check for zero.
5326 ++ */
5327 ++#define SM_NONE 0x0
5328 ++#define SM_PREEMPT 0x1
5329 ++#define SM_RTLOCK_WAIT 0x2
5330 ++
5331 ++#ifndef CONFIG_PREEMPT_RT
5332 ++# define SM_MASK_PREEMPT (~0U)
5333 ++#else
5334 ++# define SM_MASK_PREEMPT SM_PREEMPT
5335 ++#endif
5336 ++
5337 ++/*
5338 ++ * schedule() is the main scheduler function.
5339 ++ *
5340 ++ * The main means of driving the scheduler and thus entering this function are:
5341 ++ *
5342 ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
5343 ++ *
5344 ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
5345 ++ * paths. For example, see arch/x86/entry_64.S.
5346 ++ *
5347 ++ * To drive preemption between tasks, the scheduler sets the flag in timer
5348 ++ * interrupt handler scheduler_tick().
5349 ++ *
5350 ++ * 3. Wakeups don't really cause entry into schedule(). They add a
5351 ++ * task to the run-queue and that's it.
5352 ++ *
5353 ++ * Now, if the new task added to the run-queue preempts the current
5354 ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
5355 ++ * called on the nearest possible occasion:
5356 ++ *
5357 ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
5358 ++ *
5359 ++ * - in syscall or exception context, at the next outmost
5360 ++ * preempt_enable(). (this might be as soon as the wake_up()'s
5361 ++ * spin_unlock()!)
5362 ++ *
5363 ++ * - in IRQ context, return from interrupt-handler to
5364 ++ * preemptible context
5365 ++ *
5366 ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
5367 ++ * then at the next:
5368 ++ *
5369 ++ * - cond_resched() call
5370 ++ * - explicit schedule() call
5371 ++ * - return from syscall or exception to user-space
5372 ++ * - return from interrupt-handler to user-space
5373 ++ *
5374 ++ * WARNING: must be called with preemption disabled!
5375 ++ */
5376 ++static void __sched notrace __schedule(unsigned int sched_mode)
5377 ++{
5378 ++ struct task_struct *prev, *next;
5379 ++ unsigned long *switch_count;
5380 ++ unsigned long prev_state;
5381 ++ struct rq *rq;
5382 ++ int cpu;
5383 ++ int deactivated = 0;
5384 ++
5385 ++ cpu = smp_processor_id();
5386 ++ rq = cpu_rq(cpu);
5387 ++ prev = rq->curr;
5388 ++
5389 ++ schedule_debug(prev, !!sched_mode);
5390 ++
5391 ++	/* bypassing the sched_feat(HRTICK) check, which Alt schedule FW doesn't support */
5392 ++ hrtick_clear(rq);
5393 ++
5394 ++ local_irq_disable();
5395 ++ rcu_note_context_switch(!!sched_mode);
5396 ++
5397 ++ /*
5398 ++ * Make sure that signal_pending_state()->signal_pending() below
5399 ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
5400 ++ * done by the caller to avoid the race with signal_wake_up():
5401 ++ *
5402 ++ * __set_current_state(@state) signal_wake_up()
5403 ++ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
5404 ++ * wake_up_state(p, state)
5405 ++ * LOCK rq->lock LOCK p->pi_state
5406 ++ * smp_mb__after_spinlock() smp_mb__after_spinlock()
5407 ++ * if (signal_pending_state()) if (p->state & @state)
5408 ++ *
5409 ++ * Also, the membarrier system call requires a full memory barrier
5410 ++ * after coming from user-space, before storing to rq->curr.
5411 ++ */
5412 ++ raw_spin_lock(&rq->lock);
5413 ++ smp_mb__after_spinlock();
5414 ++
5415 ++ update_rq_clock(rq);
5416 ++
5417 ++ switch_count = &prev->nivcsw;
5418 ++ /*
5419 ++ * We must load prev->state once (task_struct::state is volatile), such
5420 ++ * that we form a control dependency vs deactivate_task() below.
5421 ++ */
5422 ++ prev_state = READ_ONCE(prev->__state);
5423 ++ if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
5424 ++ if (signal_pending_state(prev_state, prev)) {
5425 ++ WRITE_ONCE(prev->__state, TASK_RUNNING);
5426 ++ } else {
5427 ++ prev->sched_contributes_to_load =
5428 ++ (prev_state & TASK_UNINTERRUPTIBLE) &&
5429 ++ !(prev_state & TASK_NOLOAD) &&
5430 ++ !(prev_state & TASK_FROZEN);
5431 ++
5432 ++ if (prev->sched_contributes_to_load)
5433 ++ rq->nr_uninterruptible++;
5434 ++
5435 ++ /*
5436 ++ * __schedule() ttwu()
5437 ++ * prev_state = prev->state; if (p->on_rq && ...)
5438 ++ * if (prev_state) goto out;
5439 ++ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
5440 ++ * p->state = TASK_WAKING
5441 ++ *
5442 ++ * Where __schedule() and ttwu() have matching control dependencies.
5443 ++ *
5444 ++ * After this, schedule() must not care about p->state any more.
5445 ++ */
5446 ++ sched_task_deactivate(prev, rq);
5447 ++ deactivate_task(prev, rq);
5448 ++ deactivated = 1;
5449 ++
5450 ++ if (prev->in_iowait) {
5451 ++ atomic_inc(&rq->nr_iowait);
5452 ++ delayacct_blkio_start();
5453 ++ }
5454 ++ }
5455 ++ switch_count = &prev->nvcsw;
5456 ++ }
5457 ++
5458 ++ check_curr(prev, rq);
5459 ++
5460 ++ next = choose_next_task(rq, cpu);
5461 ++ clear_tsk_need_resched(prev);
5462 ++ clear_preempt_need_resched();
5463 ++#ifdef CONFIG_SCHED_DEBUG
5464 ++ rq->last_seen_need_resched_ns = 0;
5465 ++#endif
5466 ++
5467 ++ if (likely(prev != next)) {
5468 ++ if (deactivated)
5469 ++ update_sched_preempt_mask(rq);
5470 ++ next->last_ran = rq->clock_task;
5471 ++ rq->last_ts_switch = rq->clock;
5472 ++
5473 ++ rq->nr_switches++;
5474 ++ /*
5475 ++ * RCU users of rcu_dereference(rq->curr) may not see
5476 ++ * changes to task_struct made by pick_next_task().
5477 ++ */
5478 ++ RCU_INIT_POINTER(rq->curr, next);
5479 ++ /*
5480 ++ * The membarrier system call requires each architecture
5481 ++ * to have a full memory barrier after updating
5482 ++ * rq->curr, before returning to user-space.
5483 ++ *
5484 ++ * Here are the schemes providing that barrier on the
5485 ++ * various architectures:
5486 ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
5487 ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
5488 ++ * - finish_lock_switch() for weakly-ordered
5489 ++ * architectures where spin_unlock is a full barrier,
5490 ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock
5491 ++ * is a RELEASE barrier),
5492 ++ */
5493 ++ ++*switch_count;
5494 ++
5495 ++ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
5496 ++
5497 ++ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
5498 ++
5499 ++ /* Also unlocks the rq: */
5500 ++ rq = context_switch(rq, prev, next);
5501 ++ } else {
5502 ++ __balance_callbacks(rq);
5503 ++ raw_spin_unlock_irq(&rq->lock);
5504 ++ }
5505 ++
5506 ++#ifdef CONFIG_SCHED_SMT
5507 ++ sg_balance(rq);
5508 ++#endif
5509 ++}
5510 ++
5511 ++void __noreturn do_task_dead(void)
5512 ++{
5513 ++ /* Causes final put_task_struct in finish_task_switch(): */
5514 ++ set_special_state(TASK_DEAD);
5515 ++
5516 ++ /* Tell freezer to ignore us: */
5517 ++ current->flags |= PF_NOFREEZE;
5518 ++
5519 ++ __schedule(SM_NONE);
5520 ++ BUG();
5521 ++
5522 ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
5523 ++ for (;;)
5524 ++ cpu_relax();
5525 ++}
5526 ++
5527 ++static inline void sched_submit_work(struct task_struct *tsk)
5528 ++{
5529 ++ unsigned int task_flags;
5530 ++
5531 ++ if (task_is_running(tsk))
5532 ++ return;
5533 ++
5534 ++ task_flags = tsk->flags;
5535 ++ /*
5536 ++ * If a worker goes to sleep, notify and ask workqueue whether it
5537 ++ * wants to wake up a task to maintain concurrency.
5538 ++ */
5539 ++ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5540 ++ if (task_flags & PF_WQ_WORKER)
5541 ++ wq_worker_sleeping(tsk);
5542 ++ else
5543 ++ io_wq_worker_sleeping(tsk);
5544 ++ }
5545 ++
5546 ++ /*
5547 ++ * spinlock and rwlock must not flush block requests. This will
5548 ++ * deadlock if the callback attempts to acquire a lock which is
5549 ++ * already acquired.
5550 ++ */
5551 ++ SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
5552 ++
5553 ++ /*
5554 ++ * If we are going to sleep and we have plugged IO queued,
5555 ++ * make sure to submit it to avoid deadlocks.
5556 ++ */
5557 ++ blk_flush_plug(tsk->plug, true);
5558 ++}
5559 ++
5560 ++static void sched_update_worker(struct task_struct *tsk)
5561 ++{
5562 ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5563 ++ if (tsk->flags & PF_WQ_WORKER)
5564 ++ wq_worker_running(tsk);
5565 ++ else
5566 ++ io_wq_worker_running(tsk);
5567 ++ }
5568 ++}
5569 ++
5570 ++asmlinkage __visible void __sched schedule(void)
5571 ++{
5572 ++ struct task_struct *tsk = current;
5573 ++
5574 ++ sched_submit_work(tsk);
5575 ++ do {
5576 ++ preempt_disable();
5577 ++ __schedule(SM_NONE);
5578 ++ sched_preempt_enable_no_resched();
5579 ++ } while (need_resched());
5580 ++ sched_update_worker(tsk);
5581 ++}
5582 ++EXPORT_SYMBOL(schedule);
5583 ++
5584 ++/*
5585 ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
5586 ++ * state (have scheduled out non-voluntarily) by making sure that all
5587 ++ * tasks have either left the run queue or have gone into user space.
5588 ++ * As idle tasks do not do either, they must not ever be preempted
5589 ++ * (schedule out non-voluntarily).
5590 ++ *
5591 ++ * schedule_idle() is similar to schedule_preempt_disabled() except that it
5592 ++ * never enables preemption because it does not call sched_submit_work().
5593 ++ */
5594 ++void __sched schedule_idle(void)
5595 ++{
5596 ++ /*
5597 ++ * As this skips calling sched_submit_work(), which the idle task does
5598 ++ * regardless because that function is a nop when the task is in a
5599 ++ * TASK_RUNNING state, make sure this isn't used someplace that the
5600 ++ * current task can be in any other state. Note, idle is always in the
5601 ++ * TASK_RUNNING state.
5602 ++ */
5603 ++ WARN_ON_ONCE(current->__state);
5604 ++ do {
5605 ++ __schedule(SM_NONE);
5606 ++ } while (need_resched());
5607 ++}
5608 ++
5609 ++#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
5610 ++asmlinkage __visible void __sched schedule_user(void)
5611 ++{
5612 ++ /*
5613 ++ * If we come here after a random call to set_need_resched(),
5614 ++ * or we have been woken up remotely but the IPI has not yet arrived,
5615 ++ * we haven't yet exited the RCU idle mode. Do it here manually until
5616 ++ * we find a better solution.
5617 ++ *
5618 ++ * NB: There are buggy callers of this function. Ideally we
5619 ++ * should warn if prev_state != CONTEXT_USER, but that will trigger
5620 ++ * too frequently to make sense yet.
5621 ++ */
5622 ++ enum ctx_state prev_state = exception_enter();
5623 ++ schedule();
5624 ++ exception_exit(prev_state);
5625 ++}
5626 ++#endif
5627 ++
5628 ++/**
5629 ++ * schedule_preempt_disabled - called with preemption disabled
5630 ++ *
5631 ++ * Returns with preemption disabled. Note: preempt_count must be 1
5632 ++ */
5633 ++void __sched schedule_preempt_disabled(void)
5634 ++{
5635 ++ sched_preempt_enable_no_resched();
5636 ++ schedule();
5637 ++ preempt_disable();
5638 ++}
5639 ++
5640 ++#ifdef CONFIG_PREEMPT_RT
5641 ++void __sched notrace schedule_rtlock(void)
5642 ++{
5643 ++ do {
5644 ++ preempt_disable();
5645 ++ __schedule(SM_RTLOCK_WAIT);
5646 ++ sched_preempt_enable_no_resched();
5647 ++ } while (need_resched());
5648 ++}
5649 ++NOKPROBE_SYMBOL(schedule_rtlock);
5650 ++#endif
5651 ++
5652 ++static void __sched notrace preempt_schedule_common(void)
5653 ++{
5654 ++ do {
5655 ++ /*
5656 ++ * Because the function tracer can trace preempt_count_sub()
5657 ++ * and it also uses preempt_enable/disable_notrace(), if
5658 ++ * NEED_RESCHED is set, the preempt_enable_notrace() called
5659 ++ * by the function tracer will call this function again and
5660 ++ * cause infinite recursion.
5661 ++ *
5662 ++ * Preemption must be disabled here before the function
5663 ++ * tracer can trace. Break up preempt_disable() into two
5664 ++ * calls. One to disable preemption without fear of being
5665 ++ * traced. The other to still record the preemption latency,
5666 ++ * which can also be traced by the function tracer.
5667 ++ */
5668 ++ preempt_disable_notrace();
5669 ++ preempt_latency_start(1);
5670 ++ __schedule(SM_PREEMPT);
5671 ++ preempt_latency_stop(1);
5672 ++ preempt_enable_no_resched_notrace();
5673 ++
5674 ++ /*
5675 ++ * Check again in case we missed a preemption opportunity
5676 ++ * between schedule and now.
5677 ++ */
5678 ++ } while (need_resched());
5679 ++}
5680 ++
5681 ++#ifdef CONFIG_PREEMPTION
5682 ++/*
5683 ++ * This is the entry point to schedule() from in-kernel preemption
5684 ++ * off of preempt_enable.
5685 ++ */
5686 ++asmlinkage __visible void __sched notrace preempt_schedule(void)
5687 ++{
5688 ++ /*
5689 ++ * If there is a non-zero preempt_count or interrupts are disabled,
5690 ++ * we do not want to preempt the current task. Just return..
5691 ++ */
5692 ++ if (likely(!preemptible()))
5693 ++ return;
5694 ++
5695 ++ preempt_schedule_common();
5696 ++}
5697 ++NOKPROBE_SYMBOL(preempt_schedule);
5698 ++EXPORT_SYMBOL(preempt_schedule);
5699 ++
5700 ++#ifdef CONFIG_PREEMPT_DYNAMIC
5701 ++#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
5702 ++#ifndef preempt_schedule_dynamic_enabled
5703 ++#define preempt_schedule_dynamic_enabled preempt_schedule
5704 ++#define preempt_schedule_dynamic_disabled NULL
5705 ++#endif
5706 ++DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
5707 ++EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
5708 ++#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
5709 ++static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
5710 ++void __sched notrace dynamic_preempt_schedule(void)
5711 ++{
5712 ++ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
5713 ++ return;
5714 ++ preempt_schedule();
5715 ++}
5716 ++NOKPROBE_SYMBOL(dynamic_preempt_schedule);
5717 ++EXPORT_SYMBOL(dynamic_preempt_schedule);
5718 ++#endif
5719 ++#endif
5720 ++
5721 ++/**
5722 ++ * preempt_schedule_notrace - preempt_schedule called by tracing
5723 ++ *
5724 ++ * The tracing infrastructure uses preempt_enable_notrace to prevent
5725 ++ * recursion and tracing preempt enabling caused by the tracing
5726 ++ * infrastructure itself. But as tracing can happen in areas coming
5727 ++ * from userspace or just about to enter userspace, a preempt enable
5728 ++ * can occur before user_exit() is called. This will cause the scheduler
5729 ++ * to be called when the system is still in usermode.
5730 ++ *
5731 ++ * To prevent this, the preempt_enable_notrace will use this function
5732 ++ * instead of preempt_schedule() to exit user context if needed before
5733 ++ * calling the scheduler.
5734 ++ */
5735 ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
5736 ++{
5737 ++ enum ctx_state prev_ctx;
5738 ++
5739 ++ if (likely(!preemptible()))
5740 ++ return;
5741 ++
5742 ++ do {
5743 ++ /*
5744 ++ * Because the function tracer can trace preempt_count_sub()
5745 ++ * and it also uses preempt_enable/disable_notrace(), if
5746 ++ * NEED_RESCHED is set, the preempt_enable_notrace() called
5747 ++ * by the function tracer will call this function again and
5748 ++ * cause infinite recursion.
5749 ++ *
5750 ++ * Preemption must be disabled here before the function
5751 ++ * tracer can trace. Break up preempt_disable() into two
5752 ++ * calls. One to disable preemption without fear of being
5753 ++ * traced. The other to still record the preemption latency,
5754 ++ * which can also be traced by the function tracer.
5755 ++ */
5756 ++ preempt_disable_notrace();
5757 ++ preempt_latency_start(1);
5758 ++ /*
5759 ++ * Needs preempt disabled in case user_exit() is traced
5760 ++ * and the tracer calls preempt_enable_notrace() causing
5761 ++ * an infinite recursion.
5762 ++ */
5763 ++ prev_ctx = exception_enter();
5764 ++ __schedule(SM_PREEMPT);
5765 ++ exception_exit(prev_ctx);
5766 ++
5767 ++ preempt_latency_stop(1);
5768 ++ preempt_enable_no_resched_notrace();
5769 ++ } while (need_resched());
5770 ++}
5771 ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
5772 ++
5773 ++#ifdef CONFIG_PREEMPT_DYNAMIC
5774 ++#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
5775 ++#ifndef preempt_schedule_notrace_dynamic_enabled
5776 ++#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
5777 ++#define preempt_schedule_notrace_dynamic_disabled NULL
5778 ++#endif
5779 ++DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
5780 ++EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
5781 ++#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
5782 ++static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
5783 ++void __sched notrace dynamic_preempt_schedule_notrace(void)
5784 ++{
5785 ++ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
5786 ++ return;
5787 ++ preempt_schedule_notrace();
5788 ++}
5789 ++NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
5790 ++EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
5791 ++#endif
5792 ++#endif
5793 ++
5794 ++#endif /* CONFIG_PREEMPTION */
5795 ++
5796 ++/*
5797 ++ * This is the entry point to schedule() from kernel preemption
5798 ++ * off of irq context.
5799 ++ * Note, that this is called and return with irqs disabled. This will
5800 ++ * protect us against recursive calling from irq.
5801 ++ */
5802 ++asmlinkage __visible void __sched preempt_schedule_irq(void)
5803 ++{
5804 ++ enum ctx_state prev_state;
5805 ++
5806 ++ /* Catch callers which need to be fixed */
5807 ++ BUG_ON(preempt_count() || !irqs_disabled());
5808 ++
5809 ++ prev_state = exception_enter();
5810 ++
5811 ++ do {
5812 ++ preempt_disable();
5813 ++ local_irq_enable();
5814 ++ __schedule(SM_PREEMPT);
5815 ++ local_irq_disable();
5816 ++ sched_preempt_enable_no_resched();
5817 ++ } while (need_resched());
5818 ++
5819 ++ exception_exit(prev_state);
5820 ++}
5821 ++
5822 ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
5823 ++ void *key)
5824 ++{
5825 ++ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
5826 ++ return try_to_wake_up(curr->private, mode, wake_flags);
5827 ++}
5828 ++EXPORT_SYMBOL(default_wake_function);
5829 ++
5830 ++static inline void check_task_changed(struct task_struct *p, struct rq *rq)
5831 ++{
5832 ++ int idx;
5833 ++
5834 ++ /* Trigger resched if task sched_prio has been modified. */
5835 ++ if (task_on_rq_queued(p) && (idx = task_sched_prio_idx(p, rq)) != p->sq_idx) {
5836 ++ requeue_task(p, rq, idx);
5837 ++ check_preempt_curr(rq);
5838 ++ }
5839 ++}
5840 ++
5841 ++static void __setscheduler_prio(struct task_struct *p, int prio)
5842 ++{
5843 ++ p->prio = prio;
5844 ++}
5845 ++
5846 ++#ifdef CONFIG_RT_MUTEXES
5847 ++
5848 ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
5849 ++{
5850 ++ if (pi_task)
5851 ++ prio = min(prio, pi_task->prio);
5852 ++
5853 ++ return prio;
5854 ++}
5855 ++
5856 ++static inline int rt_effective_prio(struct task_struct *p, int prio)
5857 ++{
5858 ++ struct task_struct *pi_task = rt_mutex_get_top_task(p);
5859 ++
5860 ++ return __rt_effective_prio(pi_task, prio);
5861 ++}
5862 ++
5863 ++/*
5864 ++ * rt_mutex_setprio - set the current priority of a task
5865 ++ * @p: task to boost
5866 ++ * @pi_task: donor task
5867 ++ *
5868 ++ * This function changes the 'effective' priority of a task. It does
5869 ++ * not touch ->normal_prio like __setscheduler().
5870 ++ *
5871 ++ * Used by the rt_mutex code to implement priority inheritance
5872 ++ * logic. Call site only calls if the priority of the task changed.
5873 ++ */
5874 ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
5875 ++{
5876 ++ int prio;
5877 ++ struct rq *rq;
5878 ++ raw_spinlock_t *lock;
5879 ++
5880 ++ /* XXX used to be waiter->prio, not waiter->task->prio */
5881 ++ prio = __rt_effective_prio(pi_task, p->normal_prio);
5882 ++
5883 ++ /*
5884 ++ * If nothing changed; bail early.
5885 ++ */
5886 ++ if (p->pi_top_task == pi_task && prio == p->prio)
5887 ++ return;
5888 ++
5889 ++ rq = __task_access_lock(p, &lock);
5890 ++ update_rq_clock(rq);
5891 ++ /*
5892 ++ * Set under pi_lock && rq->lock, such that the value can be used under
5893 ++ * either lock.
5894 ++ *
5895 ++ * Note that there is loads of trickery in making this pointer cache work
5896 ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
5897 ++ * ensure a task is de-boosted (pi_task is set to NULL) before the
5898 ++ * task is allowed to run again (and can exit). This ensures the pointer
5899 ++ * points to a blocked task -- which guarantees the task is present.
5900 ++ */
5901 ++ p->pi_top_task = pi_task;
5902 ++
5903 ++ /*
5904 ++ * For FIFO/RR we only need to set prio, if that matches we're done.
5905 ++ */
5906 ++ if (prio == p->prio)
5907 ++ goto out_unlock;
5908 ++
5909 ++ /*
5910 ++ * Idle task boosting is a no-no in general. There is one
5911 ++ * exception, when PREEMPT_RT and NOHZ is active:
5912 ++ *
5913 ++ * The idle task calls get_next_timer_interrupt() and holds
5914 ++ * the timer wheel base->lock on the CPU and another CPU wants
5915 ++ * to access the timer (probably to cancel it). We can safely
5916 ++ * ignore the boosting request, as the idle CPU runs this code
5917 ++ * with interrupts disabled and will complete the lock
5918 ++ * protected section without being interrupted. So there is no
5919 ++ * real need to boost.
5920 ++ */
5921 ++ if (unlikely(p == rq->idle)) {
5922 ++ WARN_ON(p != rq->curr);
5923 ++ WARN_ON(p->pi_blocked_on);
5924 ++ goto out_unlock;
5925 ++ }
5926 ++
5927 ++ trace_sched_pi_setprio(p, pi_task);
5928 ++
5929 ++ __setscheduler_prio(p, prio);
5930 ++
5931 ++ check_task_changed(p, rq);
5932 ++out_unlock:
5933 ++ /* Avoid rq from going away on us: */
5934 ++ preempt_disable();
5935 ++
5936 ++ __balance_callbacks(rq);
5937 ++ __task_access_unlock(p, lock);
5938 ++
5939 ++ preempt_enable();
5940 ++}
5941 ++#else
5942 ++static inline int rt_effective_prio(struct task_struct *p, int prio)
5943 ++{
5944 ++ return prio;
5945 ++}
5946 ++#endif
5947 ++
5948 ++void set_user_nice(struct task_struct *p, long nice)
5949 ++{
5950 ++ unsigned long flags;
5951 ++ struct rq *rq;
5952 ++ raw_spinlock_t *lock;
5953 ++
5954 ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
5955 ++ return;
5956 ++ /*
5957 ++ * We have to be careful, if called from sys_setpriority(),
5958 ++ * the task might be in the middle of scheduling on another CPU.
5959 ++ */
5960 ++ raw_spin_lock_irqsave(&p->pi_lock, flags);
5961 ++ rq = __task_access_lock(p, &lock);
5962 ++
5963 ++ p->static_prio = NICE_TO_PRIO(nice);
5964 ++ /*
5965 ++ * The RT priorities are set via sched_setscheduler(), but we still
5966 ++ * allow the 'normal' nice value to be set - but as expected
5967 ++ * it won't have any effect on scheduling until the task's policy
5968 ++ * is SCHED_NORMAL/SCHED_BATCH again:
5969 ++ */
5970 ++ if (task_has_rt_policy(p))
5971 ++ goto out_unlock;
5972 ++
5973 ++ p->prio = effective_prio(p);
5974 ++
5975 ++ check_task_changed(p, rq);
5976 ++out_unlock:
5977 ++ __task_access_unlock(p, lock);
5978 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5979 ++}
5980 ++EXPORT_SYMBOL(set_user_nice);
5981 ++
5982 ++/*
5983 ++ * is_nice_reduction - check if nice value is an actual reduction
5984 ++ *
5985 ++ * Similar to can_nice() but does not perform a capability check.
5986 ++ *
5987 ++ * @p: task
5988 ++ * @nice: nice value
5989 ++ */
5990 ++static bool is_nice_reduction(const struct task_struct *p, const int nice)
5991 ++{
5992 ++ /* Convert nice value [19,-20] to rlimit style value [1,40]: */
5993 ++ int nice_rlim = nice_to_rlimit(nice);
5994 ++
5995 ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
5996 ++}
5997 ++
5998 ++/*
5999 ++ * can_nice - check if a task can reduce its nice value
6000 ++ * @p: task
6001 ++ * @nice: nice value
6002 ++ */
6003 ++int can_nice(const struct task_struct *p, const int nice)
6004 ++{
6005 ++ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
6006 ++}
6007 ++
6008 ++#ifdef __ARCH_WANT_SYS_NICE
6009 ++
6010 ++/*
6011 ++ * sys_nice - change the priority of the current process.
6012 ++ * @increment: priority increment
6013 ++ *
6014 ++ * sys_setpriority is a more generic, but much slower function that
6015 ++ * does similar things.
6016 ++ */
6017 ++SYSCALL_DEFINE1(nice, int, increment)
6018 ++{
6019 ++ long nice, retval;
6020 ++
6021 ++ /*
6022 ++ * Setpriority might change our priority at the same moment.
6023 ++ * We don't have to worry. Conceptually one call occurs first
6024 ++ * and we have a single winner.
6025 ++ */
6026 ++
6027 ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
6028 ++ nice = task_nice(current) + increment;
6029 ++
6030 ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
6031 ++ if (increment < 0 && !can_nice(current, nice))
6032 ++ return -EPERM;
6033 ++
6034 ++ retval = security_task_setnice(current, nice);
6035 ++ if (retval)
6036 ++ return retval;
6037 ++
6038 ++ set_user_nice(current, nice);
6039 ++ return 0;
6040 ++}
6041 ++
6042 ++#endif
6043 ++
6044 ++/**
6045 ++ * task_prio - return the priority value of a given task.
6046 ++ * @p: the task in question.
6047 ++ *
6048 ++ * Return: The priority value as seen by users in /proc.
6049 ++ *
6050 ++ * sched policy              return value    kernel prio     user prio/nice
6051 ++ *
6052 ++ * (BMQ)normal, batch, idle  [0 ... 53]      [100 ... 139]   0/[-20 ... 19]/[-7 ... 7]
6053 ++ * (PDS)normal, batch, idle  [0 ... 39]      100             0/[-20 ... 19]
6054 ++ * fifo, rr                  [-1 ... -100]   [99 ... 0]      [0 ... 99]
6055 ++ */
6056 ++int task_prio(const struct task_struct *p)
6057 ++{
6058 ++ return (p->prio < MAX_RT_PRIO) ? p->prio - MAX_RT_PRIO :
6059 ++ task_sched_prio_normal(p, task_rq(p));
6060 ++}
6061 ++
6062 ++/**
6063 ++ * idle_cpu - is a given CPU idle currently?
6064 ++ * @cpu: the processor in question.
6065 ++ *
6066 ++ * Return: 1 if the CPU is currently idle. 0 otherwise.
6067 ++ */
6068 ++int idle_cpu(int cpu)
6069 ++{
6070 ++ struct rq *rq = cpu_rq(cpu);
6071 ++
6072 ++ if (rq->curr != rq->idle)
6073 ++ return 0;
6074 ++
6075 ++ if (rq->nr_running)
6076 ++ return 0;
6077 ++
6078 ++#ifdef CONFIG_SMP
6079 ++ if (rq->ttwu_pending)
6080 ++ return 0;
6081 ++#endif
6082 ++
6083 ++ return 1;
6084 ++}
6085 ++
6086 ++/**
6087 ++ * idle_task - return the idle task for a given CPU.
6088 ++ * @cpu: the processor in question.
6089 ++ *
6090 ++ * Return: The idle task for the cpu @cpu.
6091 ++ */
6092 ++struct task_struct *idle_task(int cpu)
6093 ++{
6094 ++ return cpu_rq(cpu)->idle;
6095 ++}
6096 ++
6097 ++/**
6098 ++ * find_process_by_pid - find a process with a matching PID value.
6099 ++ * @pid: the pid in question.
6100 ++ *
6101 ++ * The task of @pid, if found. %NULL otherwise.
6102 ++ */
6103 ++static inline struct task_struct *find_process_by_pid(pid_t pid)
6104 ++{
6105 ++ return pid ? find_task_by_vpid(pid) : current;
6106 ++}
6107 ++
6108 ++/*
6109 ++ * sched_setparam() passes in -1 for its policy, to let the functions
6110 ++ * it calls know not to change it.
6111 ++ */
6112 ++#define SETPARAM_POLICY -1
6113 ++
6114 ++static void __setscheduler_params(struct task_struct *p,
6115 ++ const struct sched_attr *attr)
6116 ++{
6117 ++ int policy = attr->sched_policy;
6118 ++
6119 ++ if (policy == SETPARAM_POLICY)
6120 ++ policy = p->policy;
6121 ++
6122 ++ p->policy = policy;
6123 ++
6124 ++ /*
6125 ++	 * Allow the normal nice value to be set, but it will not have any
6126 ++	 * effect on scheduling until the task's policy is SCHED_NORMAL/
6127 ++	 * SCHED_BATCH again
6128 ++ */
6129 ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
6130 ++
6131 ++ /*
6132 ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when
6133 ++ * !rt_policy. Always setting this ensures that things like
6134 ++ * getparam()/getattr() don't report silly values for !rt tasks.
6135 ++ */
6136 ++ p->rt_priority = attr->sched_priority;
6137 ++ p->normal_prio = normal_prio(p);
6138 ++}
6139 ++
6140 ++/*
6141 ++ * check the target process has a UID that matches the current process's
6142 ++ */
6143 ++static bool check_same_owner(struct task_struct *p)
6144 ++{
6145 ++ const struct cred *cred = current_cred(), *pcred;
6146 ++ bool match;
6147 ++
6148 ++ rcu_read_lock();
6149 ++ pcred = __task_cred(p);
6150 ++ match = (uid_eq(cred->euid, pcred->euid) ||
6151 ++ uid_eq(cred->euid, pcred->uid));
6152 ++ rcu_read_unlock();
6153 ++ return match;
6154 ++}
6155 ++
6156 ++/*
6157 ++ * Allow unprivileged RT tasks to decrease priority.
6158 ++ * Only issue a capable test if needed and only once to avoid an audit
6159 ++ * event on permitted non-privileged operations:
6160 ++ */
6161 ++static int user_check_sched_setscheduler(struct task_struct *p,
6162 ++ const struct sched_attr *attr,
6163 ++ int policy, int reset_on_fork)
6164 ++{
6165 ++ if (rt_policy(policy)) {
6166 ++ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
6167 ++
6168 ++ /* Can't set/change the rt policy: */
6169 ++ if (policy != p->policy && !rlim_rtprio)
6170 ++ goto req_priv;
6171 ++
6172 ++ /* Can't increase priority: */
6173 ++ if (attr->sched_priority > p->rt_priority &&
6174 ++ attr->sched_priority > rlim_rtprio)
6175 ++ goto req_priv;
6176 ++ }
6177 ++
6178 ++ /* Can't change other user's priorities: */
6179 ++ if (!check_same_owner(p))
6180 ++ goto req_priv;
6181 ++
6182 ++ /* Normal users shall not reset the sched_reset_on_fork flag: */
6183 ++ if (p->sched_reset_on_fork && !reset_on_fork)
6184 ++ goto req_priv;
6185 ++
6186 ++ return 0;
6187 ++
6188 ++req_priv:
6189 ++ if (!capable(CAP_SYS_NICE))
6190 ++ return -EPERM;
6191 ++
6192 ++ return 0;
6193 ++}
6194 ++
6195 ++static int __sched_setscheduler(struct task_struct *p,
6196 ++ const struct sched_attr *attr,
6197 ++ bool user, bool pi)
6198 ++{
6199 ++ const struct sched_attr dl_squash_attr = {
6200 ++ .size = sizeof(struct sched_attr),
6201 ++ .sched_policy = SCHED_FIFO,
6202 ++ .sched_nice = 0,
6203 ++ .sched_priority = 99,
6204 ++ };
6205 ++ int oldpolicy = -1, policy = attr->sched_policy;
6206 ++ int retval, newprio;
6207 ++ struct balance_callback *head;
6208 ++ unsigned long flags;
6209 ++ struct rq *rq;
6210 ++ int reset_on_fork;
6211 ++ raw_spinlock_t *lock;
6212 ++
6213 ++ /* The pi code expects interrupts enabled */
6214 ++ BUG_ON(pi && in_interrupt());
6215 ++
6216 ++ /*
6217 ++	 * Alt schedule FW supports SCHED_DEADLINE by squashing it as prio 0 SCHED_FIFO
6218 ++ */
6219 ++ if (unlikely(SCHED_DEADLINE == policy)) {
6220 ++ attr = &dl_squash_attr;
6221 ++ policy = attr->sched_policy;
6222 ++ }
6223 ++recheck:
6224 ++ /* Double check policy once rq lock held */
6225 ++ if (policy < 0) {
6226 ++ reset_on_fork = p->sched_reset_on_fork;
6227 ++ policy = oldpolicy = p->policy;
6228 ++ } else {
6229 ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK);
6230 ++
6231 ++ if (policy > SCHED_IDLE)
6232 ++ return -EINVAL;
6233 ++ }
6234 ++
6235 ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL))
6236 ++ return -EINVAL;
6237 ++
6238 ++ /*
6239 ++ * Valid priorities for SCHED_FIFO and SCHED_RR are
6240 ++ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL and
6241 ++ * SCHED_BATCH and SCHED_IDLE is 0.
6242 ++ */
6243 ++ if (attr->sched_priority < 0 ||
6244 ++ (p->mm && attr->sched_priority > MAX_RT_PRIO - 1) ||
6245 ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1))
6246 ++ return -EINVAL;
6247 ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) !=
6248 ++ (attr->sched_priority != 0))
6249 ++ return -EINVAL;
6250 ++
6251 ++ if (user) {
6252 ++ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
6253 ++ if (retval)
6254 ++ return retval;
6255 ++
6256 ++ retval = security_task_setscheduler(p);
6257 ++ if (retval)
6258 ++ return retval;
6259 ++ }
6260 ++
6261 ++ if (pi)
6262 ++ cpuset_read_lock();
6263 ++
6264 ++ /*
6265 ++ * Make sure no PI-waiters arrive (or leave) while we are
6266 ++ * changing the priority of the task:
6267 ++ */
6268 ++ raw_spin_lock_irqsave(&p->pi_lock, flags);
6269 ++
6270 ++ /*
6271 ++ * To be able to change p->policy safely, task_access_lock()
6272 ++ * must be called.
6273 ++	 * If task_access_lock() is used here:
6274 ++	 * For a task p which is not running, reading rq->stop is
6275 ++	 * racy but acceptable as ->stop doesn't change much.
6276 ++	 * An enhancement could be made to read rq->stop safely.
6277 ++ */
6278 ++ rq = __task_access_lock(p, &lock);
6279 ++
6280 ++ /*
6281 ++	 * Changing the policy of the stop threads is a very bad idea
6282 ++ */
6283 ++ if (p == rq->stop) {
6284 ++ retval = -EINVAL;
6285 ++ goto unlock;
6286 ++ }
6287 ++
6288 ++ /*
6289 ++ * If not changing anything there's no need to proceed further:
6290 ++ */
6291 ++ if (unlikely(policy == p->policy)) {
6292 ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
6293 ++ goto change;
6294 ++ if (!rt_policy(policy) &&
6295 ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio)
6296 ++ goto change;
6297 ++
6298 ++ p->sched_reset_on_fork = reset_on_fork;
6299 ++ retval = 0;
6300 ++ goto unlock;
6301 ++ }
6302 ++change:
6303 ++
6304 ++ /* Re-check policy now with rq lock held */
6305 ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6306 ++ policy = oldpolicy = -1;
6307 ++ __task_access_unlock(p, lock);
6308 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6309 ++ if (pi)
6310 ++ cpuset_read_unlock();
6311 ++ goto recheck;
6312 ++ }
6313 ++
6314 ++ p->sched_reset_on_fork = reset_on_fork;
6315 ++
6316 ++ newprio = __normal_prio(policy, attr->sched_priority, NICE_TO_PRIO(attr->sched_nice));
6317 ++ if (pi) {
6318 ++ /*
6319 ++ * Take priority boosted tasks into account. If the new
6320 ++ * effective priority is unchanged, we just store the new
6321 ++ * normal parameters and do not touch the scheduler class and
6322 ++		 * the runqueue. This will be done when the task deboosts
6323 ++		 * itself.
6324 ++ */
6325 ++ newprio = rt_effective_prio(p, newprio);
6326 ++ }
6327 ++
6328 ++ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
6329 ++ __setscheduler_params(p, attr);
6330 ++ __setscheduler_prio(p, newprio);
6331 ++ }
6332 ++
6333 ++ check_task_changed(p, rq);
6334 ++
6335 ++ /* Avoid rq from going away on us: */
6336 ++ preempt_disable();
6337 ++ head = splice_balance_callbacks(rq);
6338 ++ __task_access_unlock(p, lock);
6339 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6340 ++
6341 ++ if (pi) {
6342 ++ cpuset_read_unlock();
6343 ++ rt_mutex_adjust_pi(p);
6344 ++ }
6345 ++
6346 ++ /* Run balance callbacks after we've adjusted the PI chain: */
6347 ++ balance_callbacks(rq, head);
6348 ++ preempt_enable();
6349 ++
6350 ++ return 0;
6351 ++
6352 ++unlock:
6353 ++ __task_access_unlock(p, lock);
6354 ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6355 ++ if (pi)
6356 ++ cpuset_read_unlock();
6357 ++ return retval;
6358 ++}
6359 ++
6360 ++static int _sched_setscheduler(struct task_struct *p, int policy,
6361 ++ const struct sched_param *param, bool check)
6362 ++{
6363 ++ struct sched_attr attr = {
6364 ++ .sched_policy = policy,
6365 ++ .sched_priority = param->sched_priority,
6366 ++ .sched_nice = PRIO_TO_NICE(p->static_prio),
6367 ++ };
6368 ++
6369 ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
6370 ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
6371 ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
6372 ++ policy &= ~SCHED_RESET_ON_FORK;
6373 ++ attr.sched_policy = policy;
6374 ++ }
6375 ++
6376 ++ return __sched_setscheduler(p, &attr, check, true);
6377 ++}
6378 ++
6379 ++/**
6380 ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
6381 ++ * @p: the task in question.
6382 ++ * @policy: new policy.
6383 ++ * @param: structure containing the new RT priority.
6384 ++ *
6385 ++ * Use sched_set_fifo(), read its comment.
6386 ++ *
6387 ++ * Return: 0 on success. An error code otherwise.
6388 ++ *
6389 ++ * NOTE that the task may be already dead.
6390 ++ */
6391 ++int sched_setscheduler(struct task_struct *p, int policy,
6392 ++ const struct sched_param *param)
6393 ++{
6394 ++ return _sched_setscheduler(p, policy, param, true);
6395 ++}
6396 ++
6397 ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
6398 ++{
6399 ++ return __sched_setscheduler(p, attr, true, true);
6400 ++}
6401 ++
6402 ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
6403 ++{
6404 ++ return __sched_setscheduler(p, attr, false, true);
6405 ++}
6406 ++EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
6407 ++
6408 ++/**
6409 ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
6410 ++ * @p: the task in question.
6411 ++ * @policy: new policy.
6412 ++ * @param: structure containing the new RT priority.
6413 ++ *
6414 ++ * Just like sched_setscheduler, only don't bother checking if the
6415 ++ * current context has permission. For example, this is needed in
6416 ++ * stop_machine(): we create temporary high priority worker threads,
6417 ++ * but our caller might not have that capability.
6418 ++ *
6419 ++ * Return: 0 on success. An error code otherwise.
6420 ++ */
6421 ++int sched_setscheduler_nocheck(struct task_struct *p, int policy,
6422 ++ const struct sched_param *param)
6423 ++{
6424 ++ return _sched_setscheduler(p, policy, param, false);
6425 ++}
6426 ++
6427 ++/*
6428 ++ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
6429 ++ * incapable of resource management, which is the one thing an OS really should
6430 ++ * be doing.
6431 ++ *
6432 ++ * This is of course the reason it is limited to privileged users only.
6433 ++ *
6434 ++ * Worse still; it is fundamentally impossible to compose static priority
6435 ++ * workloads. You cannot take two correctly working static prio workloads
6436 ++ * and smash them together and still expect them to work.
6437 ++ *
6438 ++ * For this reason 'all' FIFO tasks the kernel creates are basically at:
6439 ++ *
6440 ++ * MAX_RT_PRIO / 2
6441 ++ *
6442 ++ * The administrator _MUST_ configure the system, the kernel simply doesn't
6443 ++ * know enough information to make a sensible choice.
6444 ++ */
6445 ++void sched_set_fifo(struct task_struct *p)
6446 ++{
6447 ++ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
6448 ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6449 ++}
6450 ++EXPORT_SYMBOL_GPL(sched_set_fifo);
6451 ++
6452 ++/*
6453 ++ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
6454 ++ */
6455 ++void sched_set_fifo_low(struct task_struct *p)
6456 ++{
6457 ++ struct sched_param sp = { .sched_priority = 1 };
6458 ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6459 ++}
6460 ++EXPORT_SYMBOL_GPL(sched_set_fifo_low);
6461 ++
6462 ++void sched_set_normal(struct task_struct *p, int nice)
6463 ++{
6464 ++ struct sched_attr attr = {
6465 ++ .sched_policy = SCHED_NORMAL,
6466 ++ .sched_nice = nice,
6467 ++ };
6468 ++ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
6469 ++}
6470 ++EXPORT_SYMBOL_GPL(sched_set_normal);
6471 ++
6472 ++static int
6473 ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
6474 ++{
6475 ++ struct sched_param lparam;
6476 ++ struct task_struct *p;
6477 ++ int retval;
6478 ++
6479 ++ if (!param || pid < 0)
6480 ++ return -EINVAL;
6481 ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
6482 ++ return -EFAULT;
6483 ++
6484 ++ rcu_read_lock();
6485 ++ retval = -ESRCH;
6486 ++ p = find_process_by_pid(pid);
6487 ++ if (likely(p))
6488 ++ get_task_struct(p);
6489 ++ rcu_read_unlock();
6490 ++
6491 ++ if (likely(p)) {
6492 ++ retval = sched_setscheduler(p, policy, &lparam);
6493 ++ put_task_struct(p);
6494 ++ }
6495 ++
6496 ++ return retval;
6497 ++}
6498 ++
6499 ++/*
6500 ++ * Mimics kernel/events/core.c perf_copy_attr().
6501 ++ */
6502 ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
6503 ++{
6504 ++ u32 size;
6505 ++ int ret;
6506 ++
6507 ++ /* Zero the full structure, so that a short copy will be nice: */
6508 ++ memset(attr, 0, sizeof(*attr));
6509 ++
6510 ++ ret = get_user(size, &uattr->size);
6511 ++ if (ret)
6512 ++ return ret;
6513 ++
6514 ++ /* ABI compatibility quirk: */
6515 ++ if (!size)
6516 ++ size = SCHED_ATTR_SIZE_VER0;
6517 ++
6518 ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
6519 ++ goto err_size;
6520 ++
6521 ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
6522 ++ if (ret) {
6523 ++ if (ret == -E2BIG)
6524 ++ goto err_size;
6525 ++ return ret;
6526 ++ }
6527 ++
6528 ++ /*
6529 ++ * XXX: Do we want to be lenient like existing syscalls; or do we want
6530 ++ * to be strict and return an error on out-of-bounds values?
6531 ++ */
6532 ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19);
6533 ++
6534 ++ /* sched/core.c uses zero here but we already know ret is zero */
6535 ++ return 0;
6536 ++
6537 ++err_size:
6538 ++ put_user(sizeof(*attr), &uattr->size);
6539 ++ return -E2BIG;
6540 ++}
6541 ++
6542 ++/**
6543 ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
6544 ++ * @pid: the pid in question.
6545 ++ * @policy: new policy.
6546 ++ * @param: structure containing the new RT priority.
6547 ++ *
6548 ++ * Return: 0 on success. An error code otherwise.
6549 ++ */
6550 ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
6551 ++{
6552 ++ if (policy < 0)
6553 ++ return -EINVAL;
6554 ++
6555 ++ return do_sched_setscheduler(pid, policy, param);
6556 ++}
6557 ++
6558 ++/**
6559 ++ * sys_sched_setparam - set/change the RT priority of a thread
6560 ++ * @pid: the pid in question.
6561 ++ * @param: structure containing the new RT priority.
6562 ++ *
6563 ++ * Return: 0 on success. An error code otherwise.
6564 ++ */
6565 ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
6566 ++{
6567 ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
6568 ++}
6569 ++
6570 ++/**
6571 ++ * sys_sched_setattr - same as above, but with extended sched_attr
6572 ++ * @pid: the pid in question.
6573 ++ * @uattr: structure containing the extended parameters.
6574 ++ */
6575 ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
6576 ++ unsigned int, flags)
6577 ++{
6578 ++ struct sched_attr attr;
6579 ++ struct task_struct *p;
6580 ++ int retval;
6581 ++
6582 ++ if (!uattr || pid < 0 || flags)
6583 ++ return -EINVAL;
6584 ++
6585 ++ retval = sched_copy_attr(uattr, &attr);
6586 ++ if (retval)
6587 ++ return retval;
6588 ++
6589 ++ if ((int)attr.sched_policy < 0)
6590 ++ return -EINVAL;
6591 ++
6592 ++ rcu_read_lock();
6593 ++ retval = -ESRCH;
6594 ++ p = find_process_by_pid(pid);
6595 ++ if (likely(p))
6596 ++ get_task_struct(p);
6597 ++ rcu_read_unlock();
6598 ++
6599 ++ if (likely(p)) {
6600 ++ retval = sched_setattr(p, &attr);
6601 ++ put_task_struct(p);
6602 ++ }
6603 ++
6604 ++ return retval;
6605 ++}
6606 ++
6607 ++/**
6608 ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
6609 ++ * @pid: the pid in question.
6610 ++ *
6611 ++ * Return: On success, the policy of the thread. Otherwise, a negative error
6612 ++ * code.
6613 ++ */
6614 ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6615 ++{
6616 ++ struct task_struct *p;
6617 ++ int retval = -EINVAL;
6618 ++
6619 ++ if (pid < 0)
6620 ++ goto out_nounlock;
6621 ++
6622 ++ retval = -ESRCH;
6623 ++ rcu_read_lock();
6624 ++ p = find_process_by_pid(pid);
6625 ++ if (p) {
6626 ++ retval = security_task_getscheduler(p);
6627 ++ if (!retval)
6628 ++ retval = p->policy;
6629 ++ }
6630 ++ rcu_read_unlock();
6631 ++
6632 ++out_nounlock:
6633 ++ return retval;
6634 ++}
6635 ++
6636 ++/**
6637 ++ * sys_sched_getparam - get the RT priority of a thread
6638 ++ * @pid: the pid in question.
6639 ++ * @param: structure containing the RT priority.
6640 ++ *
6641 ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
6642 ++ * code.
6643 ++ */
6644 ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6645 ++{
6646 ++ struct sched_param lp = { .sched_priority = 0 };
6647 ++ struct task_struct *p;
6648 ++ int retval = -EINVAL;
6649 ++
6650 ++ if (!param || pid < 0)
6651 ++ goto out_nounlock;
6652 ++
6653 ++ rcu_read_lock();
6654 ++ p = find_process_by_pid(pid);
6655 ++ retval = -ESRCH;
6656 ++ if (!p)
6657 ++ goto out_unlock;
6658 ++
6659 ++ retval = security_task_getscheduler(p);
6660 ++ if (retval)
6661 ++ goto out_unlock;
6662 ++
6663 ++ if (task_has_rt_policy(p))
6664 ++ lp.sched_priority = p->rt_priority;
6665 ++ rcu_read_unlock();
6666 ++
6667 ++ /*
6668 ++ * This one might sleep, we cannot do it with a spinlock held ...
6669 ++ */
6670 ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
6671 ++
6672 ++out_nounlock:
6673 ++ return retval;
6674 ++
6675 ++out_unlock:
6676 ++ rcu_read_unlock();
6677 ++ return retval;
6678 ++}
6679 ++
6680 ++/*
6681 ++ * Copy the kernel size attribute structure (which might be larger
6682 ++ * than what user-space knows about) to user-space.
6683 ++ *
6684 ++ * Note that all cases are valid: user-space buffer can be larger or
6685 ++ * smaller than the kernel-space buffer. The usual case is that both
6686 ++ * have the same size.
6687 ++ */
6688 ++static int
6689 ++sched_attr_copy_to_user(struct sched_attr __user *uattr,
6690 ++ struct sched_attr *kattr,
6691 ++ unsigned int usize)
6692 ++{
6693 ++ unsigned int ksize = sizeof(*kattr);
6694 ++
6695 ++ if (!access_ok(uattr, usize))
6696 ++ return -EFAULT;
6697 ++
6698 ++ /*
6699 ++ * sched_getattr() ABI forwards and backwards compatibility:
6700 ++ *
6701 ++ * If usize == ksize then we just copy everything to user-space and all is good.
6702 ++ *
6703 ++ * If usize < ksize then we only copy as much as user-space has space for,
6704 ++ * this keeps ABI compatibility as well. We skip the rest.
6705 ++ *
6706 ++ * If usize > ksize then user-space is using a newer version of the ABI,
6707 ++ * which part the kernel doesn't know about. Just ignore it - tooling can
6708 ++ * detect the kernel's knowledge of attributes from the attr->size value
6709 ++ * which is set to ksize in this case.
6710 ++ */
6711 ++ kattr->size = min(usize, ksize);
6712 ++
6713 ++ if (copy_to_user(uattr, kattr, kattr->size))
6714 ++ return -EFAULT;
6715 ++
6716 ++ return 0;
6717 ++}
6718 ++
6719 ++/**
6720 ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
6721 ++ * @pid: the pid in question.
6722 ++ * @uattr: structure containing the extended parameters.
6723 ++ * @usize: sizeof(attr) for fwd/bwd comp.
6724 ++ * @flags: for future extension.
6725 ++ */
6726 ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
6727 ++ unsigned int, usize, unsigned int, flags)
6728 ++{
6729 ++ struct sched_attr kattr = { };
6730 ++ struct task_struct *p;
6731 ++ int retval;
6732 ++
6733 ++ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
6734 ++ usize < SCHED_ATTR_SIZE_VER0 || flags)
6735 ++ return -EINVAL;
6736 ++
6737 ++ rcu_read_lock();
6738 ++ p = find_process_by_pid(pid);
6739 ++ retval = -ESRCH;
6740 ++ if (!p)
6741 ++ goto out_unlock;
6742 ++
6743 ++ retval = security_task_getscheduler(p);
6744 ++ if (retval)
6745 ++ goto out_unlock;
6746 ++
6747 ++ kattr.sched_policy = p->policy;
6748 ++ if (p->sched_reset_on_fork)
6749 ++ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
6750 ++ if (task_has_rt_policy(p))
6751 ++ kattr.sched_priority = p->rt_priority;
6752 ++ else
6753 ++ kattr.sched_nice = task_nice(p);
6754 ++ kattr.sched_flags &= SCHED_FLAG_ALL;
6755 ++
6756 ++#ifdef CONFIG_UCLAMP_TASK
6757 ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
6758 ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
6759 ++#endif
6760 ++
6761 ++ rcu_read_unlock();
6762 ++
6763 ++ return sched_attr_copy_to_user(uattr, &kattr, usize);
6764 ++
6765 ++out_unlock:
6766 ++ rcu_read_unlock();
6767 ++ return retval;
6768 ++}
6769 ++
6770 ++#ifdef CONFIG_SMP
6771 ++int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
6772 ++{
6773 ++ return 0;
6774 ++}
6775 ++#endif
6776 ++
6777 ++static int
6778 ++__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
6779 ++{
6780 ++ int retval;
6781 ++ cpumask_var_t cpus_allowed, new_mask;
6782 ++
6783 ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
6784 ++ return -ENOMEM;
6785 ++
6786 ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
6787 ++ retval = -ENOMEM;
6788 ++ goto out_free_cpus_allowed;
6789 ++ }
6790 ++
6791 ++ cpuset_cpus_allowed(p, cpus_allowed);
6792 ++ cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
6793 ++
6794 ++ ctx->new_mask = new_mask;
6795 ++ ctx->flags |= SCA_CHECK;
6796 ++
6797 ++ retval = __set_cpus_allowed_ptr(p, ctx);
6798 ++ if (retval)
6799 ++ goto out_free_new_mask;
6800 ++
6801 ++ cpuset_cpus_allowed(p, cpus_allowed);
6802 ++ if (!cpumask_subset(new_mask, cpus_allowed)) {
6803 ++ /*
6804 ++ * We must have raced with a concurrent cpuset
6805 ++ * update. Just reset the cpus_allowed to the
6806 ++ * cpuset's cpus_allowed
6807 ++ */
6808 ++ cpumask_copy(new_mask, cpus_allowed);
6809 ++
6810 ++ /*
6811 ++ * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
6812 ++ * will restore the previous user_cpus_ptr value.
6813 ++ *
6814 ++ * In the unlikely event a previous user_cpus_ptr exists,
6815 ++ * we need to further restrict the mask to what is allowed
6816 ++ * by that old user_cpus_ptr.
6817 ++ */
6818 ++ if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
6819 ++ bool empty = !cpumask_and(new_mask, new_mask,
6820 ++ ctx->user_mask);
6821 ++
6822 ++ if (WARN_ON_ONCE(empty))
6823 ++ cpumask_copy(new_mask, cpus_allowed);
6824 ++ }
6825 ++ __set_cpus_allowed_ptr(p, ctx);
6826 ++ retval = -EINVAL;
6827 ++ }
6828 ++
6829 ++out_free_new_mask:
6830 ++ free_cpumask_var(new_mask);
6831 ++out_free_cpus_allowed:
6832 ++ free_cpumask_var(cpus_allowed);
6833 ++ return retval;
6834 ++}
6835 ++
6836 ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6837 ++{
6838 ++ struct affinity_context ac;
6839 ++ struct cpumask *user_mask;
6840 ++ struct task_struct *p;
6841 ++ int retval;
6842 ++
6843 ++ rcu_read_lock();
6844 ++
6845 ++ p = find_process_by_pid(pid);
6846 ++ if (!p) {
6847 ++ rcu_read_unlock();
6848 ++ return -ESRCH;
6849 ++ }
6850 ++
6851 ++ /* Prevent p going away */
6852 ++ get_task_struct(p);
6853 ++ rcu_read_unlock();
6854 ++
6855 ++ if (p->flags & PF_NO_SETAFFINITY) {
6856 ++ retval = -EINVAL;
6857 ++ goto out_put_task;
6858 ++ }
6859 ++
6860 ++ if (!check_same_owner(p)) {
6861 ++ rcu_read_lock();
6862 ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
6863 ++ rcu_read_unlock();
6864 ++ retval = -EPERM;
6865 ++ goto out_put_task;
6866 ++ }
6867 ++ rcu_read_unlock();
6868 ++ }
6869 ++
6870 ++ retval = security_task_setscheduler(p);
6871 ++ if (retval)
6872 ++ goto out_put_task;
6873 ++
6874 ++ /*
6875 ++ * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
6876 ++ * alloc_user_cpus_ptr() returns NULL.
6877 ++ */
6878 ++ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
6879 ++ if (user_mask) {
6880 ++ cpumask_copy(user_mask, in_mask);
6881 ++ } else if (IS_ENABLED(CONFIG_SMP)) {
6882 ++ retval = -ENOMEM;
6883 ++ goto out_put_task;
6884 ++ }
6885 ++
6886 ++ ac = (struct affinity_context){
6887 ++ .new_mask = in_mask,
6888 ++ .user_mask = user_mask,
6889 ++ .flags = SCA_USER,
6890 ++ };
6891 ++
6892 ++ retval = __sched_setaffinity(p, &ac);
6893 ++ kfree(ac.user_mask);
6894 ++
6895 ++out_put_task:
6896 ++ put_task_struct(p);
6897 ++ return retval;
6898 ++}
6899 ++
6900 ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
6901 ++ struct cpumask *new_mask)
6902 ++{
6903 ++ if (len < cpumask_size())
6904 ++ cpumask_clear(new_mask);
6905 ++ else if (len > cpumask_size())
6906 ++ len = cpumask_size();
6907 ++
6908 ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
6909 ++}
6910 ++
6911 ++/**
6912 ++ * sys_sched_setaffinity - set the CPU affinity of a process
6913 ++ * @pid: pid of the process
6914 ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6915 ++ * @user_mask_ptr: user-space pointer to the new CPU mask
6916 ++ *
6917 ++ * Return: 0 on success. An error code otherwise.
6918 ++ */
6919 ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6920 ++ unsigned long __user *, user_mask_ptr)
6921 ++{
6922 ++ cpumask_var_t new_mask;
6923 ++ int retval;
6924 ++
6925 ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
6926 ++ return -ENOMEM;
6927 ++
6928 ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6929 ++ if (retval == 0)
6930 ++ retval = sched_setaffinity(pid, new_mask);
6931 ++ free_cpumask_var(new_mask);
6932 ++ return retval;
6933 ++}
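/*
 * Example (not part of the patch): pinning the calling thread to CPU 0
 * through the glibc sched_setaffinity() wrapper, which enters the
 * syscall above. Error handling is reduced to perror() for brevity.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);				/* allow CPU 0 only */

	if (sched_setaffinity(0, sizeof(set), &set))	/* pid 0 == caller */
		perror("sched_setaffinity");

	return 0;
}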
6934 ++
6935 ++long sched_getaffinity(pid_t pid, cpumask_t *mask)
6936 ++{
6937 ++ struct task_struct *p;
6938 ++ raw_spinlock_t *lock;
6939 ++ unsigned long flags;
6940 ++ int retval;
6941 ++
6942 ++ rcu_read_lock();
6943 ++
6944 ++ retval = -ESRCH;
6945 ++ p = find_process_by_pid(pid);
6946 ++ if (!p)
6947 ++ goto out_unlock;
6948 ++
6949 ++ retval = security_task_getscheduler(p);
6950 ++ if (retval)
6951 ++ goto out_unlock;
6952 ++
6953 ++ task_access_lock_irqsave(p, &lock, &flags);
6954 ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
6955 ++ task_access_unlock_irqrestore(p, lock, &flags);
6956 ++
6957 ++out_unlock:
6958 ++ rcu_read_unlock();
6959 ++
6960 ++ return retval;
6961 ++}
6962 ++
6963 ++/**
6964 ++ * sys_sched_getaffinity - get the CPU affinity of a process
6965 ++ * @pid: pid of the process
6966 ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6967 ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask
6968 ++ *
6969 ++ * Return: size of CPU mask copied to user_mask_ptr on success. An
6970 ++ * error code otherwise.
6971 ++ */
6972 ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6973 ++ unsigned long __user *, user_mask_ptr)
6974 ++{
6975 ++ int ret;
6976 ++ cpumask_var_t mask;
6977 ++
6978 ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
6979 ++ return -EINVAL;
6980 ++ if (len & (sizeof(unsigned long)-1))
6981 ++ return -EINVAL;
6982 ++
6983 ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6984 ++ return -ENOMEM;
6985 ++
6986 ++ ret = sched_getaffinity(pid, mask);
6987 ++ if (ret == 0) {
6988 ++ unsigned int retlen = min_t(size_t, len, cpumask_size());
6989 ++
6990 ++ if (copy_to_user(user_mask_ptr, mask, retlen))
6991 ++ ret = -EFAULT;
6992 ++ else
6993 ++ ret = retlen;
6994 ++ }
6995 ++ free_cpumask_var(mask);
6996 ++
6997 ++ return ret;
6998 ++}
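/*
 * Example (not part of the patch): querying the current affinity mask
 * with the glibc sched_getaffinity() wrapper. Note the wrapper returns
 * 0/-1 and hides the copied-size return value of the raw syscall above.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set)) {
		perror("sched_getaffinity");
		return 1;
	}

	printf("runnable on %d CPU(s)\n", CPU_COUNT(&set));
	return 0;
}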
6999 ++
7000 ++static void do_sched_yield(void)
7001 ++{
7002 ++ struct rq *rq;
7003 ++ struct rq_flags rf;
7004 ++
7005 ++ if (!sched_yield_type)
7006 ++ return;
7007 ++
7008 ++ rq = this_rq_lock_irq(&rf);
7009 ++
7010 ++ schedstat_inc(rq->yld_count);
7011 ++
7012 ++ if (1 == sched_yield_type) {
7013 ++ if (!rt_task(current))
7014 ++ do_sched_yield_type_1(current, rq);
7015 ++ } else if (2 == sched_yield_type) {
7016 ++ if (rq->nr_running > 1)
7017 ++ rq->skip = current;
7018 ++ }
7019 ++
7020 ++ preempt_disable();
7021 ++ raw_spin_unlock_irq(&rq->lock);
7022 ++ sched_preempt_enable_no_resched();
7023 ++
7024 ++ schedule();
7025 ++}
7026 ++
7027 ++/**
7028 ++ * sys_sched_yield - yield the current processor to other threads.
7029 ++ *
7030 ++ * This function yields the current CPU to other tasks. If there are no
7031 ++ * other threads running on this CPU then this function will return.
7032 ++ *
7033 ++ * Return: 0.
7034 ++ */
7035 ++SYSCALL_DEFINE0(sched_yield)
7036 ++{
7037 ++ do_sched_yield();
7038 ++ return 0;
7039 ++}
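/*
 * Example (not part of the patch): a cooperative loop that yields between
 * iterations. With this scheduler the effect depends on sched_yield_type
 * as handled in do_sched_yield() above: 0 returns immediately, 1 calls
 * do_sched_yield_type_1() for non-RT tasks, 2 marks current as the run
 * queue's skip task.
 */
#include <sched.h>

int main(void)
{
	for (int i = 0; i < 1000; i++)
		sched_yield();
	return 0;
}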
7040 ++
7041 ++#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
7042 ++int __sched __cond_resched(void)
7043 ++{
7044 ++ if (should_resched(0)) {
7045 ++ preempt_schedule_common();
7046 ++ return 1;
7047 ++ }
7048 ++ /*
7049 ++ * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
7050 ++ * whether the current CPU is in an RCU read-side critical section,
7051 ++ * so the tick can report quiescent states even for CPUs looping
7052 ++ * in kernel context. In contrast, in non-preemptible kernels,
7053 ++ * RCU readers leave no in-memory hints, which means that CPU-bound
7054 ++ * processes executing in kernel context might never report an
7055 ++ * RCU quiescent state. Therefore, the following code causes
7056 ++ * cond_resched() to report a quiescent state, but only when RCU
7057 ++ * is in urgent need of one.
7058 ++ */
7059 ++#ifndef CONFIG_PREEMPT_RCU
7060 ++ rcu_all_qs();
7061 ++#endif
7062 ++ return 0;
7063 ++}
7064 ++EXPORT_SYMBOL(__cond_resched);
7065 ++#endif
7066 ++
7067 ++#ifdef CONFIG_PREEMPT_DYNAMIC
7068 ++#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
7069 ++#define cond_resched_dynamic_enabled __cond_resched
7070 ++#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
7071 ++DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
7072 ++EXPORT_STATIC_CALL_TRAMP(cond_resched);
7073 ++
7074 ++#define might_resched_dynamic_enabled __cond_resched
7075 ++#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
7076 ++DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
7077 ++EXPORT_STATIC_CALL_TRAMP(might_resched);
7078 ++#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
7079 ++static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
7080 ++int __sched dynamic_cond_resched(void)
7081 ++{
7082 ++ if (!static_branch_unlikely(&sk_dynamic_cond_resched))
7083 ++ return 0;
7084 ++ return __cond_resched();
7085 ++}
7086 ++EXPORT_SYMBOL(dynamic_cond_resched);
7087 ++
7088 ++static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
7089 ++int __sched dynamic_might_resched(void)
7090 ++{
7091 ++ if (!static_branch_unlikely(&sk_dynamic_might_resched))
7092 ++ return 0;
7093 ++ return __cond_resched();
7094 ++}
7095 ++EXPORT_SYMBOL(dynamic_might_resched);
7096 ++#endif
7097 ++#endif
7098 ++
7099 ++/*
7100 ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
7101 ++ * call schedule, and on return reacquire the lock.
7102 ++ *
7103 ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
7104 ++ * operations here to prevent schedule() from being called twice (once via
7105 ++ * spin_unlock(), once by hand).
7106 ++ */
7107 ++int __cond_resched_lock(spinlock_t *lock)
7108 ++{
7109 ++ int resched = should_resched(PREEMPT_LOCK_OFFSET);
7110 ++ int ret = 0;
7111 ++
7112 ++ lockdep_assert_held(lock);
7113 ++
7114 ++ if (spin_needbreak(lock) || resched) {
7115 ++ spin_unlock(lock);
7116 ++ if (!_cond_resched())
7117 ++ cpu_relax();
7118 ++ ret = 1;
7119 ++ spin_lock(lock);
7120 ++ }
7121 ++ return ret;
7122 ++}
7123 ++EXPORT_SYMBOL(__cond_resched_lock);
7124 ++
7125 ++int __cond_resched_rwlock_read(rwlock_t *lock)
7126 ++{
7127 ++ int resched = should_resched(PREEMPT_LOCK_OFFSET);
7128 ++ int ret = 0;
7129 ++
7130 ++ lockdep_assert_held_read(lock);
7131 ++
7132 ++ if (rwlock_needbreak(lock) || resched) {
7133 ++ read_unlock(lock);
7134 ++ if (!_cond_resched())
7135 ++ cpu_relax();
7136 ++ ret = 1;
7137 ++ read_lock(lock);
7138 ++ }
7139 ++ return ret;
7140 ++}
7141 ++EXPORT_SYMBOL(__cond_resched_rwlock_read);
7142 ++
7143 ++int __cond_resched_rwlock_write(rwlock_t *lock)
7144 ++{
7145 ++ int resched = should_resched(PREEMPT_LOCK_OFFSET);
7146 ++ int ret = 0;
7147 ++
7148 ++ lockdep_assert_held_write(lock);
7149 ++
7150 ++ if (rwlock_needbreak(lock) || resched) {
7151 ++ write_unlock(lock);
7152 ++ if (!_cond_resched())
7153 ++ cpu_relax();
7154 ++ ret = 1;
7155 ++ write_lock(lock);
7156 ++ }
7157 ++ return ret;
7158 ++}
7159 ++EXPORT_SYMBOL(__cond_resched_rwlock_write);
7160 ++
7161 ++#ifdef CONFIG_PREEMPT_DYNAMIC
7162 ++
7163 ++#ifdef CONFIG_GENERIC_ENTRY
7164 ++#include <linux/entry-common.h>
7165 ++#endif
7166 ++
7167 ++/*
7168 ++ * SC:cond_resched
7169 ++ * SC:might_resched
7170 ++ * SC:preempt_schedule
7171 ++ * SC:preempt_schedule_notrace
7172 ++ * SC:irqentry_exit_cond_resched
7173 ++ *
7174 ++ *
7175 ++ * NONE:
7176 ++ * cond_resched <- __cond_resched
7177 ++ * might_resched <- RET0
7178 ++ * preempt_schedule <- NOP
7179 ++ * preempt_schedule_notrace <- NOP
7180 ++ * irqentry_exit_cond_resched <- NOP
7181 ++ *
7182 ++ * VOLUNTARY:
7183 ++ * cond_resched <- __cond_resched
7184 ++ * might_resched <- __cond_resched
7185 ++ * preempt_schedule <- NOP
7186 ++ * preempt_schedule_notrace <- NOP
7187 ++ * irqentry_exit_cond_resched <- NOP
7188 ++ *
7189 ++ * FULL:
7190 ++ * cond_resched <- RET0
7191 ++ * might_resched <- RET0
7192 ++ * preempt_schedule <- preempt_schedule
7193 ++ * preempt_schedule_notrace <- preempt_schedule_notrace
7194 ++ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
7195 ++ */
7196 ++
7197 ++enum {
7198 ++ preempt_dynamic_undefined = -1,
7199 ++ preempt_dynamic_none,
7200 ++ preempt_dynamic_voluntary,
7201 ++ preempt_dynamic_full,
7202 ++};
7203 ++
7204 ++int preempt_dynamic_mode = preempt_dynamic_undefined;
7205 ++
7206 ++int sched_dynamic_mode(const char *str)
7207 ++{
7208 ++ if (!strcmp(str, "none"))
7209 ++ return preempt_dynamic_none;
7210 ++
7211 ++ if (!strcmp(str, "voluntary"))
7212 ++ return preempt_dynamic_voluntary;
7213 ++
7214 ++ if (!strcmp(str, "full"))
7215 ++ return preempt_dynamic_full;
7216 ++
7217 ++ return -EINVAL;
7218 ++}
7219 ++
7220 ++#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
7221 ++#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
7222 ++#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
7223 ++#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
7224 ++#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
7225 ++#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
7226 ++#else
7227 ++#error "Unsupported PREEMPT_DYNAMIC mechanism"
7228 ++#endif
7229 ++
7230 ++void sched_dynamic_update(int mode)
7231 ++{
7232 ++ /*
7233 ++ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
7234 ++ * the ZERO state, which is invalid.
7235 ++ */
7236 ++ preempt_dynamic_enable(cond_resched);
7237 ++ preempt_dynamic_enable(might_resched);
7238 ++ preempt_dynamic_enable(preempt_schedule);
7239 ++ preempt_dynamic_enable(preempt_schedule_notrace);
7240 ++ preempt_dynamic_enable(irqentry_exit_cond_resched);
7241 ++
7242 ++ switch (mode) {
7243 ++ case preempt_dynamic_none:
7244 ++ preempt_dynamic_enable(cond_resched);
7245 ++ preempt_dynamic_disable(might_resched);
7246 ++ preempt_dynamic_disable(preempt_schedule);
7247 ++ preempt_dynamic_disable(preempt_schedule_notrace);
7248 ++ preempt_dynamic_disable(irqentry_exit_cond_resched);
7249 ++ pr_info("Dynamic Preempt: none\n");
7250 ++ break;
7251 ++
7252 ++ case preempt_dynamic_voluntary:
7253 ++ preempt_dynamic_enable(cond_resched);
7254 ++ preempt_dynamic_enable(might_resched);
7255 ++ preempt_dynamic_disable(preempt_schedule);
7256 ++ preempt_dynamic_disable(preempt_schedule_notrace);
7257 ++ preempt_dynamic_disable(irqentry_exit_cond_resched);
7258 ++ pr_info("Dynamic Preempt: voluntary\n");
7259 ++ break;
7260 ++
7261 ++ case preempt_dynamic_full:
7262 ++ preempt_dynamic_disable(cond_resched);
7263 ++ preempt_dynamic_disable(might_resched);
7264 ++ preempt_dynamic_enable(preempt_schedule);
7265 ++ preempt_dynamic_enable(preempt_schedule_notrace);
7266 ++ preempt_dynamic_enable(irqentry_exit_cond_resched);
7267 ++ pr_info("Dynamic Preempt: full\n");
7268 ++ break;
7269 ++ }
7270 ++
7271 ++ preempt_dynamic_mode = mode;
7272 ++}
7273 ++
7274 ++static int __init setup_preempt_mode(char *str)
7275 ++{
7276 ++ int mode = sched_dynamic_mode(str);
7277 ++ if (mode < 0) {
7278 ++ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
7279 ++ return 0;
7280 ++ }
7281 ++
7282 ++ sched_dynamic_update(mode);
7283 ++ return 1;
7284 ++}
7285 ++__setup("preempt=", setup_preempt_mode);
7286 ++
7287 ++static void __init preempt_dynamic_init(void)
7288 ++{
7289 ++ if (preempt_dynamic_mode == preempt_dynamic_undefined) {
7290 ++ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
7291 ++ sched_dynamic_update(preempt_dynamic_none);
7292 ++ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
7293 ++ sched_dynamic_update(preempt_dynamic_voluntary);
7294 ++ } else {
7295 ++ /* Default static call setting, nothing to do */
7296 ++ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
7297 ++ preempt_dynamic_mode = preempt_dynamic_full;
7298 ++ pr_info("Dynamic Preempt: full\n");
7299 ++ }
7300 ++ }
7301 ++}
7302 ++
7303 ++#define PREEMPT_MODEL_ACCESSOR(mode) \
7304 ++ bool preempt_model_##mode(void) \
7305 ++ { \
7306 ++ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
7307 ++ return preempt_dynamic_mode == preempt_dynamic_##mode; \
7308 ++ } \
7309 ++ EXPORT_SYMBOL_GPL(preempt_model_##mode)
7310 ++
7311 ++PREEMPT_MODEL_ACCESSOR(none);
7312 ++PREEMPT_MODEL_ACCESSOR(voluntary);
7313 ++PREEMPT_MODEL_ACCESSOR(full);
7314 ++
7315 ++#else /* !CONFIG_PREEMPT_DYNAMIC */
7316 ++
7317 ++static inline void preempt_dynamic_init(void) { }
7318 ++
7319 ++#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
7320 ++
7321 ++/**
7322 ++ * yield - yield the current processor to other threads.
7323 ++ *
7324 ++ * Do not ever use this function, there's a 99% chance you're doing it wrong.
7325 ++ *
7326 ++ * The scheduler is at all times free to pick the calling task as the most
7327 ++ * eligible task to run; if removing the yield() call from your code breaks
7328 ++ * it, it's already broken.
7329 ++ *
7330 ++ * Typical broken usage is:
7331 ++ *
7332 ++ * while (!event)
7333 ++ * yield();
7334 ++ *
7335 ++ * where one assumes that yield() will let 'the other' process run that will
7336 ++ * make event true. If the current task is a SCHED_FIFO task that will never
7337 ++ * happen. Never use yield() as a progress guarantee!!
7338 ++ *
7339 ++ * If you want to use yield() to wait for something, use wait_event().
7340 ++ * If you want to use yield() to be 'nice' for others, use cond_resched().
7341 ++ * If you still want to use yield(), do not!
7342 ++ */
7343 ++void __sched yield(void)
7344 ++{
7345 ++ set_current_state(TASK_RUNNING);
7346 ++ do_sched_yield();
7347 ++}
7348 ++EXPORT_SYMBOL(yield);
7349 ++
7350 ++/**
7351 ++ * yield_to - yield the current processor to another thread in
7352 ++ * your thread group, or accelerate that thread toward the
7353 ++ * processor it's on.
7354 ++ * @p: target task
7355 ++ * @preempt: whether task preemption is allowed or not
7356 ++ *
7357 ++ * It's the caller's job to ensure that the target task struct
7358 ++ * can't go away on us before we can do any checks.
7359 ++ *
7360 ++ * In Alt schedule FW, yield_to is not supported.
7361 ++ *
7362 ++ * Return:
7363 ++ * true (>0) if we indeed boosted the target task.
7364 ++ * false (0) if we failed to boost the target.
7365 ++ * -ESRCH if there's no task to yield to.
7366 ++ */
7367 ++int __sched yield_to(struct task_struct *p, bool preempt)
7368 ++{
7369 ++ return 0;
7370 ++}
7371 ++EXPORT_SYMBOL_GPL(yield_to);
7372 ++
7373 ++int io_schedule_prepare(void)
7374 ++{
7375 ++ int old_iowait = current->in_iowait;
7376 ++
7377 ++ current->in_iowait = 1;
7378 ++ blk_flush_plug(current->plug, true);
7379 ++ return old_iowait;
7380 ++}
7381 ++
7382 ++void io_schedule_finish(int token)
7383 ++{
7384 ++ current->in_iowait = token;
7385 ++}
7386 ++
7387 ++/*
7388 ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
7389 ++ * that process accounting knows that this is a task in IO wait state.
7390 ++ *
7391 ++ * But don't do that if it is a deliberate, throttling IO wait (this task
7392 ++ * has set its backing_dev_info: the queue against which it should throttle)
7393 ++ */
7394 ++
7395 ++long __sched io_schedule_timeout(long timeout)
7396 ++{
7397 ++ int token;
7398 ++ long ret;
7399 ++
7400 ++ token = io_schedule_prepare();
7401 ++ ret = schedule_timeout(timeout);
7402 ++ io_schedule_finish(token);
7403 ++
7404 ++ return ret;
7405 ++}
7406 ++EXPORT_SYMBOL(io_schedule_timeout);
7407 ++
7408 ++void __sched io_schedule(void)
7409 ++{
7410 ++ int token;
7411 ++
7412 ++ token = io_schedule_prepare();
7413 ++ schedule();
7414 ++ io_schedule_finish(token);
7415 ++}
7416 ++EXPORT_SYMBOL(io_schedule);
7417 ++
7418 ++/**
7419 ++ * sys_sched_get_priority_max - return maximum RT priority.
7420 ++ * @policy: scheduling class.
7421 ++ *
7422 ++ * Return: On success, this syscall returns the maximum
7423 ++ * rt_priority that can be used by a given scheduling class.
7424 ++ * On failure, a negative error code is returned.
7425 ++ */
7426 ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
7427 ++{
7428 ++ int ret = -EINVAL;
7429 ++
7430 ++ switch (policy) {
7431 ++ case SCHED_FIFO:
7432 ++ case SCHED_RR:
7433 ++ ret = MAX_RT_PRIO - 1;
7434 ++ break;
7435 ++ case SCHED_NORMAL:
7436 ++ case SCHED_BATCH:
7437 ++ case SCHED_IDLE:
7438 ++ ret = 0;
7439 ++ break;
7440 ++ }
7441 ++ return ret;
7442 ++}
7443 ++
7444 ++/**
7445 ++ * sys_sched_get_priority_min - return minimum RT priority.
7446 ++ * @policy: scheduling class.
7447 ++ *
7448 ++ * Return: On success, this syscall returns the minimum
7449 ++ * rt_priority that can be used by a given scheduling class.
7450 ++ * On failure, a negative error code is returned.
7451 ++ */
7452 ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
7453 ++{
7454 ++ int ret = -EINVAL;
7455 ++
7456 ++ switch (policy) {
7457 ++ case SCHED_FIFO:
7458 ++ case SCHED_RR:
7459 ++ ret = 1;
7460 ++ break;
7461 ++ case SCHED_NORMAL:
7462 ++ case SCHED_BATCH:
7463 ++ case SCHED_IDLE:
7464 ++ ret = 0;
7465 ++ break;
7466 ++ }
7467 ++ return ret;
7468 ++}
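/*
 * Example (not part of the patch): printing the static priority range the
 * two syscalls above report for SCHED_FIFO (1 .. MAX_RT_PRIO-1).
 */
#include <sched.h>
#include <stdio.h>

int main(void)
{
	printf("SCHED_FIFO priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));
	return 0;
}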
7469 ++
7470 ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
7471 ++{
7472 ++ struct task_struct *p;
7473 ++ int retval;
7474 ++
7475 ++ alt_sched_debug();
7476 ++
7477 ++ if (pid < 0)
7478 ++ return -EINVAL;
7479 ++
7480 ++ retval = -ESRCH;
7481 ++ rcu_read_lock();
7482 ++ p = find_process_by_pid(pid);
7483 ++ if (!p)
7484 ++ goto out_unlock;
7485 ++
7486 ++ retval = security_task_getscheduler(p);
7487 ++ if (retval)
7488 ++ goto out_unlock;
7489 ++ rcu_read_unlock();
7490 ++
7491 ++ *t = ns_to_timespec64(sched_timeslice_ns);
7492 ++ return 0;
7493 ++
7494 ++out_unlock:
7495 ++ rcu_read_unlock();
7496 ++ return retval;
7497 ++}
7498 ++
7499 ++/**
7500 ++ * sys_sched_rr_get_interval - return the default timeslice of a process.
7501 ++ * @pid: pid of the process.
7502 ++ * @interval: userspace pointer to the timeslice value.
7503 ++ *
7504 ++ *
7505 ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
7506 ++ * an error code.
7507 ++ */
7508 ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
7509 ++ struct __kernel_timespec __user *, interval)
7510 ++{
7511 ++ struct timespec64 t;
7512 ++ int retval = sched_rr_get_interval(pid, &t);
7513 ++
7514 ++ if (retval == 0)
7515 ++ retval = put_timespec64(&t, interval);
7516 ++
7517 ++ return retval;
7518 ++}
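/*
 * Example (not part of the patch): reading the timeslice reported for the
 * calling process. With this patch the value comes from the global
 * sched_timeslice_ns used above, regardless of the task's policy.
 */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts)) {
		perror("sched_rr_get_interval");
		return 1;
	}

	printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, (long)ts.tv_nsec);
	return 0;
}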
7519 ++
7520 ++#ifdef CONFIG_COMPAT_32BIT_TIME
7521 ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
7522 ++ struct old_timespec32 __user *, interval)
7523 ++{
7524 ++ struct timespec64 t;
7525 ++ int retval = sched_rr_get_interval(pid, &t);
7526 ++
7527 ++ if (retval == 0)
7528 ++ retval = put_old_timespec32(&t, interval);
7529 ++ return retval;
7530 ++}
7531 ++#endif
7532 ++
7533 ++void sched_show_task(struct task_struct *p)
7534 ++{
7535 ++ unsigned long free = 0;
7536 ++ int ppid;
7537 ++
7538 ++ if (!try_get_task_stack(p))
7539 ++ return;
7540 ++
7541 ++ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
7542 ++
7543 ++ if (task_is_running(p))
7544 ++ pr_cont(" running task ");
7545 ++#ifdef CONFIG_DEBUG_STACK_USAGE
7546 ++ free = stack_not_used(p);
7547 ++#endif
7548 ++ ppid = 0;
7549 ++ rcu_read_lock();
7550 ++ if (pid_alive(p))
7551 ++ ppid = task_pid_nr(rcu_dereference(p->real_parent));
7552 ++ rcu_read_unlock();
7553 ++ pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
7554 ++ free, task_pid_nr(p), ppid,
7555 ++ read_task_thread_flags(p));
7556 ++
7557 ++ print_worker_info(KERN_INFO, p);
7558 ++ print_stop_info(KERN_INFO, p);
7559 ++ show_stack(p, NULL, KERN_INFO);
7560 ++ put_task_stack(p);
7561 ++}
7562 ++EXPORT_SYMBOL_GPL(sched_show_task);
7563 ++
7564 ++static inline bool
7565 ++state_filter_match(unsigned long state_filter, struct task_struct *p)
7566 ++{
7567 ++ unsigned int state = READ_ONCE(p->__state);
7568 ++
7569 ++ /* no filter, everything matches */
7570 ++ if (!state_filter)
7571 ++ return true;
7572 ++
7573 ++ /* filter, but doesn't match */
7574 ++ if (!(state & state_filter))
7575 ++ return false;
7576 ++
7577 ++ /*
7578 ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
7579 ++ * TASK_KILLABLE).
7580 ++ */
7581 ++ if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
7582 ++ return false;
7583 ++
7584 ++ return true;
7585 ++}
7586 ++
7587 ++
7588 ++void show_state_filter(unsigned int state_filter)
7589 ++{
7590 ++ struct task_struct *g, *p;
7591 ++
7592 ++ rcu_read_lock();
7593 ++ for_each_process_thread(g, p) {
7594 ++ /*
7595 ++ * reset the NMI-timeout, listing all files on a slow
7596 ++ * console might take a lot of time:
7597 ++ * Also, reset softlockup watchdogs on all CPUs, because
7598 ++ * another CPU might be blocked waiting for us to process
7599 ++ * an IPI.
7600 ++ */
7601 ++ touch_nmi_watchdog();
7602 ++ touch_all_softlockup_watchdogs();
7603 ++ if (state_filter_match(state_filter, p))
7604 ++ sched_show_task(p);
7605 ++ }
7606 ++
7607 ++#ifdef CONFIG_SCHED_DEBUG
7608 ++ /* TODO: Alt schedule FW should support this
7609 ++ if (!state_filter)
7610 ++ sysrq_sched_debug_show();
7611 ++ */
7612 ++#endif
7613 ++ rcu_read_unlock();
7614 ++ /*
7615 ++ * Only show locks if all tasks are dumped:
7616 ++ */
7617 ++ if (!state_filter)
7618 ++ debug_show_all_locks();
7619 ++}
7620 ++
7621 ++void dump_cpu_task(int cpu)
7622 ++{
7623 ++ if (cpu == smp_processor_id() && in_hardirq()) {
7624 ++ struct pt_regs *regs;
7625 ++
7626 ++ regs = get_irq_regs();
7627 ++ if (regs) {
7628 ++ show_regs(regs);
7629 ++ return;
7630 ++ }
7631 ++ }
7632 ++
7633 ++ if (trigger_single_cpu_backtrace(cpu))
7634 ++ return;
7635 ++
7636 ++ pr_info("Task dump for CPU %d:\n", cpu);
7637 ++ sched_show_task(cpu_curr(cpu));
7638 ++}
7639 ++
7640 ++/**
7641 ++ * init_idle - set up an idle thread for a given CPU
7642 ++ * @idle: task in question
7643 ++ * @cpu: CPU the idle task belongs to
7644 ++ *
7645 ++ * NOTE: this function does not set the idle thread's NEED_RESCHED
7646 ++ * flag, to make booting more robust.
7647 ++ */
7648 ++void __init init_idle(struct task_struct *idle, int cpu)
7649 ++{
7650 ++#ifdef CONFIG_SMP
7651 ++ struct affinity_context ac = (struct affinity_context) {
7652 ++ .new_mask = cpumask_of(cpu),
7653 ++ .flags = 0,
7654 ++ };
7655 ++#endif
7656 ++ struct rq *rq = cpu_rq(cpu);
7657 ++ unsigned long flags;
7658 ++
7659 ++ __sched_fork(0, idle);
7660 ++
7661 ++ raw_spin_lock_irqsave(&idle->pi_lock, flags);
7662 ++ raw_spin_lock(&rq->lock);
7663 ++
7664 ++ idle->last_ran = rq->clock_task;
7665 ++ idle->__state = TASK_RUNNING;
7666 ++ /*
7667 ++ * PF_KTHREAD should already be set at this point; regardless, make it
7668 ++ * look like a proper per-CPU kthread.
7669 ++ */
7670 ++ idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
7671 ++ kthread_set_per_cpu(idle, cpu);
7672 ++
7673 ++ sched_queue_init_idle(&rq->queue, idle);
7674 ++
7675 ++#ifdef CONFIG_SMP
7676 ++ /*
7677 ++ * It's possible that init_idle() gets called multiple times on a task,
7678 ++ * in that case do_set_cpus_allowed() will not do the right thing.
7679 ++ *
7680 ++ * And since this is boot we can forgo the serialisation.
7681 ++ */
7682 ++ set_cpus_allowed_common(idle, &ac);
7683 ++#endif
7684 ++
7685 ++ /* Silence PROVE_RCU */
7686 ++ rcu_read_lock();
7687 ++ __set_task_cpu(idle, cpu);
7688 ++ rcu_read_unlock();
7689 ++
7690 ++ rq->idle = idle;
7691 ++ rcu_assign_pointer(rq->curr, idle);
7692 ++ idle->on_cpu = 1;
7693 ++
7694 ++ raw_spin_unlock(&rq->lock);
7695 ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
7696 ++
7697 ++ /* Set the preempt count _outside_ the spinlocks! */
7698 ++ init_idle_preempt_count(idle, cpu);
7699 ++
7700 ++ ftrace_graph_init_idle_task(idle, cpu);
7701 ++ vtime_init_idle(idle, cpu);
7702 ++#ifdef CONFIG_SMP
7703 ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
7704 ++#endif
7705 ++}
7706 ++
7707 ++#ifdef CONFIG_SMP
7708 ++
7709 ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
7710 ++ const struct cpumask __maybe_unused *trial)
7711 ++{
7712 ++ return 1;
7713 ++}
7714 ++
7715 ++int task_can_attach(struct task_struct *p,
7716 ++ const struct cpumask *cs_effective_cpus)
7717 ++{
7718 ++ int ret = 0;
7719 ++
7720 ++ /*
7721 ++ * Kthreads which disallow setaffinity shouldn't be moved
7722 ++ * to a new cpuset; we don't want to change their CPU
7723 ++ * affinity and isolating such threads by their set of
7724 ++ * allowed nodes is unnecessary. Thus, cpusets are not
7725 ++ * applicable for such threads. This prevents checking for
7726 ++ * success of set_cpus_allowed_ptr() on all attached tasks
7727 ++ * before cpus_mask may be changed.
7728 ++ */
7729 ++ if (p->flags & PF_NO_SETAFFINITY)
7730 ++ ret = -EINVAL;
7731 ++
7732 ++ return ret;
7733 ++}
7734 ++
7735 ++bool sched_smp_initialized __read_mostly;
7736 ++
7737 ++#ifdef CONFIG_HOTPLUG_CPU
7738 ++/*
7739 ++ * Ensures that the idle task is using init_mm right before its CPU goes
7740 ++ * offline.
7741 ++ */
7742 ++void idle_task_exit(void)
7743 ++{
7744 ++ struct mm_struct *mm = current->active_mm;
7745 ++
7746 ++ BUG_ON(current != this_rq()->idle);
7747 ++
7748 ++ if (mm != &init_mm) {
7749 ++ switch_mm(mm, &init_mm, current);
7750 ++ finish_arch_post_lock_switch();
7751 ++ }
7752 ++
7753 ++ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
7754 ++}
7755 ++
7756 ++static int __balance_push_cpu_stop(void *arg)
7757 ++{
7758 ++ struct task_struct *p = arg;
7759 ++ struct rq *rq = this_rq();
7760 ++ struct rq_flags rf;
7761 ++ int cpu;
7762 ++
7763 ++ raw_spin_lock_irq(&p->pi_lock);
7764 ++ rq_lock(rq, &rf);
7765 ++
7766 ++ update_rq_clock(rq);
7767 ++
7768 ++ if (task_rq(p) == rq && task_on_rq_queued(p)) {
7769 ++ cpu = select_fallback_rq(rq->cpu, p);
7770 ++ rq = __migrate_task(rq, p, cpu);
7771 ++ }
7772 ++
7773 ++ rq_unlock(rq, &rf);
7774 ++ raw_spin_unlock_irq(&p->pi_lock);
7775 ++
7776 ++ put_task_struct(p);
7777 ++
7778 ++ return 0;
7779 ++}
7780 ++
7781 ++static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
7782 ++
7783 ++/*
7784 ++ * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only
7785 ++ * effective when the hotplug motion is down.
7786 ++ */
7787 ++static void balance_push(struct rq *rq)
7788 ++{
7789 ++ struct task_struct *push_task = rq->curr;
7790 ++
7791 ++ lockdep_assert_held(&rq->lock);
7792 ++
7793 ++ /*
7794 ++ * Ensure the thing is persistent until balance_push_set(.on = false);
7795 ++ */
7796 ++ rq->balance_callback = &balance_push_callback;
7797 ++
7798 ++ /*
7799 ++ * Only active while going offline and when invoked on the outgoing
7800 ++ * CPU.
7801 ++ */
7802 ++ if (!cpu_dying(rq->cpu) || rq != this_rq())
7803 ++ return;
7804 ++
7805 ++ /*
7806 ++ * Both the cpu-hotplug and stop task are in this case and are
7807 ++ * required to complete the hotplug process.
7808 ++ */
7809 ++ if (kthread_is_per_cpu(push_task) ||
7810 ++ is_migration_disabled(push_task)) {
7811 ++
7812 ++ /*
7813 ++ * If this is the idle task on the outgoing CPU try to wake
7814 ++ * up the hotplug control thread which might wait for the
7815 ++ * last task to vanish. The rcuwait_active() check is
7816 ++ * accurate here because the waiter is pinned on this CPU
7817 ++ * and can't obviously be running in parallel.
7818 ++ *
7819 ++ * On RT kernels this also has to check whether there are
7820 ++ * pinned and scheduled out tasks on the runqueue. They
7821 ++ * need to leave the migrate disabled section first.
7822 ++ */
7823 ++ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
7824 ++ rcuwait_active(&rq->hotplug_wait)) {
7825 ++ raw_spin_unlock(&rq->lock);
7826 ++ rcuwait_wake_up(&rq->hotplug_wait);
7827 ++ raw_spin_lock(&rq->lock);
7828 ++ }
7829 ++ return;
7830 ++ }
7831 ++
7832 ++ get_task_struct(push_task);
7833 ++ /*
7834 ++ * Temporarily drop rq->lock such that we can wake-up the stop task.
7835 ++ * Both preemption and IRQs are still disabled.
7836 ++ */
7837 ++ raw_spin_unlock(&rq->lock);
7838 ++ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
7839 ++ this_cpu_ptr(&push_work));
7840 ++ /*
7841 ++ * At this point need_resched() is true and we'll take the loop in
7842 ++ * schedule(). The next pick is obviously going to be the stop task
7843 ++ * which kthread_is_per_cpu() and will push this task away.
7844 ++ */
7845 ++ raw_spin_lock(&rq->lock);
7846 ++}
7847 ++
7848 ++static void balance_push_set(int cpu, bool on)
7849 ++{
7850 ++ struct rq *rq = cpu_rq(cpu);
7851 ++ struct rq_flags rf;
7852 ++
7853 ++ rq_lock_irqsave(rq, &rf);
7854 ++ if (on) {
7855 ++ WARN_ON_ONCE(rq->balance_callback);
7856 ++ rq->balance_callback = &balance_push_callback;
7857 ++ } else if (rq->balance_callback == &balance_push_callback) {
7858 ++ rq->balance_callback = NULL;
7859 ++ }
7860 ++ rq_unlock_irqrestore(rq, &rf);
7861 ++}
7862 ++
7863 ++/*
7864 ++ * Invoked from a CPU's hotplug control thread after the CPU has been marked
7865 ++ * inactive. All tasks which are not per CPU kernel threads are either
7866 ++ * pushed off this CPU now via balance_push() or placed on a different CPU
7867 ++ * during wakeup. Wait until the CPU is quiescent.
7868 ++ */
7869 ++static void balance_hotplug_wait(void)
7870 ++{
7871 ++ struct rq *rq = this_rq();
7872 ++
7873 ++ rcuwait_wait_event(&rq->hotplug_wait,
7874 ++ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
7875 ++ TASK_UNINTERRUPTIBLE);
7876 ++}
7877 ++
7878 ++#else
7879 ++
7880 ++static void balance_push(struct rq *rq)
7881 ++{
7882 ++}
7883 ++
7884 ++static void balance_push_set(int cpu, bool on)
7885 ++{
7886 ++}
7887 ++
7888 ++static inline void balance_hotplug_wait(void)
7889 ++{
7890 ++}
7891 ++#endif /* CONFIG_HOTPLUG_CPU */
7892 ++
7893 ++static void set_rq_offline(struct rq *rq)
7894 ++{
7895 ++ if (rq->online)
7896 ++ rq->online = false;
7897 ++}
7898 ++
7899 ++static void set_rq_online(struct rq *rq)
7900 ++{
7901 ++ if (!rq->online)
7902 ++ rq->online = true;
7903 ++}
7904 ++
7905 ++/*
7906 ++ * used to mark begin/end of suspend/resume:
7907 ++ */
7908 ++static int num_cpus_frozen;
7909 ++
7910 ++/*
7911 ++ * Update cpusets according to cpu_active mask. If cpusets are
7912 ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7913 ++ * around partition_sched_domains().
7914 ++ *
7915 ++ * If we come here as part of a suspend/resume, don't touch cpusets because we
7916 ++ * want to restore it back to its original state upon resume anyway.
7917 ++ */
7918 ++static void cpuset_cpu_active(void)
7919 ++{
7920 ++ if (cpuhp_tasks_frozen) {
7921 ++ /*
7922 ++ * num_cpus_frozen tracks how many CPUs are involved in suspend
7923 ++ * resume sequence. As long as this is not the last online
7924 ++ * operation in the resume sequence, just build a single sched
7925 ++ * domain, ignoring cpusets.
7926 ++ */
7927 ++ partition_sched_domains(1, NULL, NULL);
7928 ++ if (--num_cpus_frozen)
7929 ++ return;
7930 ++ /*
7931 ++ * This is the last CPU online operation. So fall through and
7932 ++ * restore the original sched domains by considering the
7933 ++ * cpuset configurations.
7934 ++ */
7935 ++ cpuset_force_rebuild();
7936 ++ }
7937 ++
7938 ++ cpuset_update_active_cpus();
7939 ++}
7940 ++
7941 ++static int cpuset_cpu_inactive(unsigned int cpu)
7942 ++{
7943 ++ if (!cpuhp_tasks_frozen) {
7944 ++ cpuset_update_active_cpus();
7945 ++ } else {
7946 ++ num_cpus_frozen++;
7947 ++ partition_sched_domains(1, NULL, NULL);
7948 ++ }
7949 ++ return 0;
7950 ++}
7951 ++
7952 ++int sched_cpu_activate(unsigned int cpu)
7953 ++{
7954 ++ struct rq *rq = cpu_rq(cpu);
7955 ++ unsigned long flags;
7956 ++
7957 ++ /*
7958 ++ * Clear the balance_push callback and prepare to schedule
7959 ++ * regular tasks.
7960 ++ */
7961 ++ balance_push_set(cpu, false);
7962 ++
7963 ++#ifdef CONFIG_SCHED_SMT
7964 ++ /*
7965 ++ * When going up, increment the number of cores with SMT present.
7966 ++ */
7967 ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
7968 ++ static_branch_inc_cpuslocked(&sched_smt_present);
7969 ++#endif
7970 ++ set_cpu_active(cpu, true);
7971 ++
7972 ++ if (sched_smp_initialized)
7973 ++ cpuset_cpu_active();
7974 ++
7975 ++ /*
7976 ++ * Put the rq online, if not already. This happens:
7977 ++ *
7978 ++ * 1) In the early boot process, because we build the real domains
7979 ++ * after all cpus have been brought up.
7980 ++ *
7981 ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
7982 ++ * domains.
7983 ++ */
7984 ++ raw_spin_lock_irqsave(&rq->lock, flags);
7985 ++ set_rq_online(rq);
7986 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
7987 ++
7988 ++ return 0;
7989 ++}
7990 ++
7991 ++int sched_cpu_deactivate(unsigned int cpu)
7992 ++{
7993 ++ struct rq *rq = cpu_rq(cpu);
7994 ++ unsigned long flags;
7995 ++ int ret;
7996 ++
7997 ++ set_cpu_active(cpu, false);
7998 ++
7999 ++ /*
8000 ++ * From this point forward, this CPU will refuse to run any task that
8001 ++ * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
8002 ++ * push those tasks away until this gets cleared, see
8003 ++ * sched_cpu_dying().
8004 ++ */
8005 ++ balance_push_set(cpu, true);
8006 ++
8007 ++ /*
8008 ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
8009 ++ * users of this state to go away such that all new such users will
8010 ++ * observe it.
8011 ++ *
8012 ++ * Specifically, we rely on ttwu to no longer target this CPU, see
8013 ++ * ttwu_queue_cond() and is_cpu_allowed().
8014 ++ *
8015 ++ * Do sync before park smpboot threads to take care the rcu boost case.
8016 ++ */
8017 ++ synchronize_rcu();
8018 ++
8019 ++ raw_spin_lock_irqsave(&rq->lock, flags);
8020 ++ update_rq_clock(rq);
8021 ++ set_rq_offline(rq);
8022 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
8023 ++
8024 ++#ifdef CONFIG_SCHED_SMT
8025 ++ /*
8026 ++ * When going down, decrement the number of cores with SMT present.
8027 ++ */
8028 ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
8029 ++ static_branch_dec_cpuslocked(&sched_smt_present);
8030 ++ if (!static_branch_likely(&sched_smt_present))
8031 ++ cpumask_clear(&sched_sg_idle_mask);
8032 ++ }
8033 ++#endif
8034 ++
8035 ++ if (!sched_smp_initialized)
8036 ++ return 0;
8037 ++
8038 ++ ret = cpuset_cpu_inactive(cpu);
8039 ++ if (ret) {
8040 ++ balance_push_set(cpu, false);
8041 ++ set_cpu_active(cpu, true);
8042 ++ return ret;
8043 ++ }
8044 ++
8045 ++ return 0;
8046 ++}
8047 ++
8048 ++static void sched_rq_cpu_starting(unsigned int cpu)
8049 ++{
8050 ++ struct rq *rq = cpu_rq(cpu);
8051 ++
8052 ++ rq->calc_load_update = calc_load_update;
8053 ++}
8054 ++
8055 ++int sched_cpu_starting(unsigned int cpu)
8056 ++{
8057 ++ sched_rq_cpu_starting(cpu);
8058 ++ sched_tick_start(cpu);
8059 ++ return 0;
8060 ++}
8061 ++
8062 ++#ifdef CONFIG_HOTPLUG_CPU
8063 ++
8064 ++/*
8065 ++ * Invoked immediately before the stopper thread is invoked to bring the
8066 ++ * CPU down completely. At this point all per CPU kthreads except the
8067 ++ * hotplug thread (current) and the stopper thread (inactive) have been
8068 ++ * either parked or have been unbound from the outgoing CPU. Ensure that
8069 ++ * any of those which might be on the way out are gone.
8070 ++ *
8071 ++ * If after this point a bound task is being woken on this CPU then the
8072 ++ * responsible hotplug callback has failed to do its job.
8073 ++ * sched_cpu_dying() will catch it with the appropriate fireworks.
8074 ++ */
8075 ++int sched_cpu_wait_empty(unsigned int cpu)
8076 ++{
8077 ++ balance_hotplug_wait();
8078 ++ return 0;
8079 ++}
8080 ++
8081 ++/*
8082 ++ * Since this CPU is going 'away' for a while, fold any nr_active delta we
8083 ++ * might have. Called from the CPU stopper task after ensuring that the
8084 ++ * stopper is the last running task on the CPU, so nr_active count is
8085 ++ * stable. We need to take the teardown thread which is calling this into
8086 ++ * account, so we hand in adjust = 1 to the load calculation.
8087 ++ *
8088 ++ * Also see the comment "Global load-average calculations".
8089 ++ */
8090 ++static void calc_load_migrate(struct rq *rq)
8091 ++{
8092 ++ long delta = calc_load_fold_active(rq, 1);
8093 ++
8094 ++ if (delta)
8095 ++ atomic_long_add(delta, &calc_load_tasks);
8096 ++}
8097 ++
8098 ++static void dump_rq_tasks(struct rq *rq, const char *loglvl)
8099 ++{
8100 ++ struct task_struct *g, *p;
8101 ++ int cpu = cpu_of(rq);
8102 ++
8103 ++ lockdep_assert_held(&rq->lock);
8104 ++
8105 ++ printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
8106 ++ for_each_process_thread(g, p) {
8107 ++ if (task_cpu(p) != cpu)
8108 ++ continue;
8109 ++
8110 ++ if (!task_on_rq_queued(p))
8111 ++ continue;
8112 ++
8113 ++ printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
8114 ++ }
8115 ++}
8116 ++
8117 ++int sched_cpu_dying(unsigned int cpu)
8118 ++{
8119 ++ struct rq *rq = cpu_rq(cpu);
8120 ++ unsigned long flags;
8121 ++
8122 ++ /* Handle pending wakeups and then migrate everything off */
8123 ++ sched_tick_stop(cpu);
8124 ++
8125 ++ raw_spin_lock_irqsave(&rq->lock, flags);
8126 ++ if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
8127 ++ WARN(true, "Dying CPU not properly vacated!");
8128 ++ dump_rq_tasks(rq, KERN_WARNING);
8129 ++ }
8130 ++ raw_spin_unlock_irqrestore(&rq->lock, flags);
8131 ++
8132 ++ calc_load_migrate(rq);
8133 ++ hrtick_clear(rq);
8134 ++ return 0;
8135 ++}
8136 ++#endif
8137 ++
8138 ++#ifdef CONFIG_SMP
8139 ++static void sched_init_topology_cpumask_early(void)
8140 ++{
8141 ++ int cpu;
8142 ++ cpumask_t *tmp;
8143 ++
8144 ++ for_each_possible_cpu(cpu) {
8145 ++ /* init topo masks */
8146 ++ tmp = per_cpu(sched_cpu_topo_masks, cpu);
8147 ++
8148 ++ cpumask_copy(tmp, cpumask_of(cpu));
8149 ++ tmp++;
8150 ++ cpumask_copy(tmp, cpu_possible_mask);
8151 ++ per_cpu(sched_cpu_llc_mask, cpu) = tmp;
8152 ++ per_cpu(sched_cpu_topo_end_mask, cpu) = ++tmp;
8153 ++ /*per_cpu(sd_llc_id, cpu) = cpu;*/
8154 ++ }
8155 ++}
8156 ++
8157 ++#define TOPOLOGY_CPUMASK(name, mask, last)\
8158 ++ if (cpumask_and(topo, topo, mask)) { \
8159 ++ cpumask_copy(topo, mask); \
8160 ++ printk(KERN_INFO "sched: cpu#%02d topo: 0x%08lx - "#name, \
8161 ++ cpu, (topo++)->bits[0]); \
8162 ++ } \
8163 ++ if (!last) \
8164 ++ cpumask_complement(topo, mask)
8165 ++
8166 ++static void sched_init_topology_cpumask(void)
8167 ++{
8168 ++ int cpu;
8169 ++ cpumask_t *topo;
8170 ++
8171 ++ for_each_online_cpu(cpu) {
8172 ++ /* take chance to reset time slice for idle tasks */
8173 ++ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns;
8174 ++
8175 ++ topo = per_cpu(sched_cpu_topo_masks, cpu) + 1;
8176 ++
8177 ++ cpumask_complement(topo, cpumask_of(cpu));
8178 ++#ifdef CONFIG_SCHED_SMT
8179 ++ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false);
8180 ++#endif
8181 ++ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu));
8182 ++ per_cpu(sched_cpu_llc_mask, cpu) = topo;
8183 ++ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false);
8184 ++
8185 ++ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false);
8186 ++
8187 ++ TOPOLOGY_CPUMASK(others, cpu_online_mask, true);
8188 ++
8189 ++ per_cpu(sched_cpu_topo_end_mask, cpu) = topo;
8190 ++ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n",
8191 ++ cpu, per_cpu(sd_llc_id, cpu),
8192 ++ (int) (per_cpu(sched_cpu_llc_mask, cpu) -
8193 ++ per_cpu(sched_cpu_topo_masks, cpu)));
8194 ++ }
8195 ++}
8196 ++#endif
8197 ++
8198 ++void __init sched_init_smp(void)
8199 ++{
8200 ++ /* Move init over to a non-isolated CPU */
8201 ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
8202 ++ BUG();
8203 ++ current->flags &= ~PF_NO_SETAFFINITY;
8204 ++
8205 ++ sched_init_topology_cpumask();
8206 ++
8207 ++ sched_smp_initialized = true;
8208 ++}
8209 ++
8210 ++static int __init migration_init(void)
8211 ++{
8212 ++ sched_cpu_starting(smp_processor_id());
8213 ++ return 0;
8214 ++}
8215 ++early_initcall(migration_init);
8216 ++
8217 ++#else
8218 ++void __init sched_init_smp(void)
8219 ++{
8220 ++ cpu_rq(0)->idle->time_slice = sched_timeslice_ns;
8221 ++}
8222 ++#endif /* CONFIG_SMP */
8223 ++
8224 ++int in_sched_functions(unsigned long addr)
8225 ++{
8226 ++ return in_lock_functions(addr) ||
8227 ++ (addr >= (unsigned long)__sched_text_start
8228 ++ && addr < (unsigned long)__sched_text_end);
8229 ++}
8230 ++
8231 ++#ifdef CONFIG_CGROUP_SCHED
8232 ++/* task group related information */
8233 ++struct task_group {
8234 ++ struct cgroup_subsys_state css;
8235 ++
8236 ++ struct rcu_head rcu;
8237 ++ struct list_head list;
8238 ++
8239 ++ struct task_group *parent;
8240 ++ struct list_head siblings;
8241 ++ struct list_head children;
8242 ++#ifdef CONFIG_FAIR_GROUP_SCHED
8243 ++ unsigned long shares;
8244 ++#endif
8245 ++};
8246 ++
8247 ++/*
8248 ++ * Default task group.
8249 ++ * Every task in the system belongs to this group at bootup.
8250 ++ */
8251 ++struct task_group root_task_group;
8252 ++LIST_HEAD(task_groups);
8253 ++
8254 ++/* Cacheline aligned slab cache for task_group */
8255 ++static struct kmem_cache *task_group_cache __read_mostly;
8256 ++#endif /* CONFIG_CGROUP_SCHED */
8257 ++
8258 ++void __init sched_init(void)
8259 ++{
8260 ++ int i;
8261 ++ struct rq *rq;
8262 ++
8263 ++ printk(KERN_INFO ALT_SCHED_VERSION_MSG);
8264 ++
8265 ++ wait_bit_init();
8266 ++
8267 ++#ifdef CONFIG_SMP
8268 ++ for (i = 0; i < SCHED_QUEUE_BITS; i++)
8269 ++ cpumask_copy(sched_preempt_mask + i, cpu_present_mask);
8270 ++#endif
8271 ++
8272 ++#ifdef CONFIG_CGROUP_SCHED
8273 ++ task_group_cache = KMEM_CACHE(task_group, 0);
8274 ++
8275 ++ list_add(&root_task_group.list, &task_groups);
8276 ++ INIT_LIST_HEAD(&root_task_group.children);
8277 ++ INIT_LIST_HEAD(&root_task_group.siblings);
8278 ++#endif /* CONFIG_CGROUP_SCHED */
8279 ++ for_each_possible_cpu(i) {
8280 ++ rq = cpu_rq(i);
8281 ++
8282 ++ sched_queue_init(&rq->queue);
8283 ++ rq->prio = IDLE_TASK_SCHED_PRIO;
8284 ++ rq->skip = NULL;
8285 ++
8286 ++ raw_spin_lock_init(&rq->lock);
8287 ++ rq->nr_running = rq->nr_uninterruptible = 0;
8288 ++ rq->calc_load_active = 0;
8289 ++ rq->calc_load_update = jiffies + LOAD_FREQ;
8290 ++#ifdef CONFIG_SMP
8291 ++ rq->online = false;
8292 ++ rq->cpu = i;
8293 ++
8294 ++#ifdef CONFIG_SCHED_SMT
8295 ++ rq->active_balance = 0;
8296 ++#endif
8297 ++
8298 ++#ifdef CONFIG_NO_HZ_COMMON
8299 ++ INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
8300 ++#endif
8301 ++ rq->balance_callback = &balance_push_callback;
8302 ++#ifdef CONFIG_HOTPLUG_CPU
8303 ++ rcuwait_init(&rq->hotplug_wait);
8304 ++#endif
8305 ++#endif /* CONFIG_SMP */
8306 ++ rq->nr_switches = 0;
8307 ++
8308 ++ hrtick_rq_init(rq);
8309 ++ atomic_set(&rq->nr_iowait, 0);
8310 ++
8311 ++ zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
8312 ++ }
8313 ++#ifdef CONFIG_SMP
8314 ++ /* Set rq->online for cpu 0 */
8315 ++ cpu_rq(0)->online = true;
8316 ++#endif
8317 ++ /*
8318 ++ * The boot idle thread does lazy MMU switching as well:
8319 ++ */
8320 ++ mmgrab(&init_mm);
8321 ++ enter_lazy_tlb(&init_mm, current);
8322 ++
8323 ++ /*
8324 ++ * The idle task doesn't need the kthread struct to function, but it
8325 ++ * is dressed up as a per-CPU kthread and thus needs to play the part
8326 ++ * if we want to avoid special-casing it in code that deals with per-CPU
8327 ++ * kthreads.
8328 ++ */
8329 ++ WARN_ON(!set_kthread_struct(current));
8330 ++
8331 ++ /*
8332 ++ * Make us the idle thread. Technically, schedule() should not be
8333 ++ * called from this thread, however somewhere below it might be,
8334 ++ * but because we are the idle thread, we just pick up running again
8335 ++ * when this runqueue becomes "idle".
8336 ++ */
8337 ++ init_idle(current, smp_processor_id());
8338 ++
8339 ++ calc_load_update = jiffies + LOAD_FREQ;
8340 ++
8341 ++#ifdef CONFIG_SMP
8342 ++ idle_thread_set_boot_cpu();
8343 ++ balance_push_set(smp_processor_id(), false);
8344 ++
8345 ++ sched_init_topology_cpumask_early();
8346 ++#endif /* SMP */
8347 ++
8348 ++ psi_init();
8349 ++
8350 ++ preempt_dynamic_init();
8351 ++}
8352 ++
8353 ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8354 ++
8355 ++void __might_sleep(const char *file, int line)
8356 ++{
8357 ++ unsigned int state = get_current_state();
8358 ++ /*
8359 ++ * Blocking primitives will set (and therefore destroy) current->state,
8360 ++ * since we will exit with TASK_RUNNING make sure we enter with it,
8361 ++ * otherwise we will destroy state.
8362 ++ */
8363 ++ WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
8364 ++ "do not call blocking ops when !TASK_RUNNING; "
8365 ++ "state=%x set at [<%p>] %pS\n", state,
8366 ++ (void *)current->task_state_change,
8367 ++ (void *)current->task_state_change);
8368 ++
8369 ++ __might_resched(file, line, 0);
8370 ++}
8371 ++EXPORT_SYMBOL(__might_sleep);
8372 ++
8373 ++static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
8374 ++{
8375 ++ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
8376 ++ return;
8377 ++
8378 ++ if (preempt_count() == preempt_offset)
8379 ++ return;
8380 ++
8381 ++ pr_err("Preemption disabled at:");
8382 ++ print_ip_sym(KERN_ERR, ip);
8383 ++}
8384 ++
8385 ++static inline bool resched_offsets_ok(unsigned int offsets)
8386 ++{
8387 ++ unsigned int nested = preempt_count();
8388 ++
8389 ++ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT;
8390 ++
8391 ++ return nested == offsets;
8392 ++}
8393 ++
8394 ++void __might_resched(const char *file, int line, unsigned int offsets)
8395 ++{
8396 ++ /* Ratelimiting timestamp: */
8397 ++ static unsigned long prev_jiffy;
8398 ++
8399 ++ unsigned long preempt_disable_ip;
8400 ++
8401 ++ /* WARN_ON_ONCE() by default, no rate limit required: */
8402 ++ rcu_sleep_check();
8403 ++
8404 ++ if ((resched_offsets_ok(offsets) && !irqs_disabled() &&
8405 ++ !is_idle_task(current) && !current->non_block_count) ||
8406 ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
8407 ++ oops_in_progress)
8408 ++ return;
8409 ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8410 ++ return;
8411 ++ prev_jiffy = jiffies;
8412 ++
8413 ++ /* Save this before calling printk(), since that will clobber it: */
8414 ++ preempt_disable_ip = get_preempt_disable_ip(current);
8415 ++
8416 ++ pr_err("BUG: sleeping function called from invalid context at %s:%d\n",
8417 ++ file, line);
8418 ++ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
8419 ++ in_atomic(), irqs_disabled(), current->non_block_count,
8420 ++ current->pid, current->comm);
8421 ++ pr_err("preempt_count: %x, expected: %x\n", preempt_count(),
8422 ++ offsets & MIGHT_RESCHED_PREEMPT_MASK);
8423 ++
8424 ++ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
8425 ++ pr_err("RCU nest depth: %d, expected: %u\n",
8426 ++ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT);
8427 ++ }
8428 ++
8429 ++ if (task_stack_end_corrupted(current))
8430 ++ pr_emerg("Thread overran stack, or stack corrupted\n");
8431 ++
8432 ++ debug_show_held_locks(current);
8433 ++ if (irqs_disabled())
8434 ++ print_irqtrace_events(current);
8435 ++
8436 ++ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK,
8437 ++ preempt_disable_ip);
8438 ++
8439 ++ dump_stack();
8440 ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8441 ++}
8442 ++EXPORT_SYMBOL(__might_resched);
8443 ++
8444 ++void __cant_sleep(const char *file, int line, int preempt_offset)
8445 ++{
8446 ++ static unsigned long prev_jiffy;
8447 ++
8448 ++ if (irqs_disabled())
8449 ++ return;
8450 ++
8451 ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8452 ++ return;
8453 ++
8454 ++ if (preempt_count() > preempt_offset)
8455 ++ return;
8456 ++
8457 ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8458 ++ return;
8459 ++ prev_jiffy = jiffies;
8460 ++
8461 ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
8462 ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8463 ++ in_atomic(), irqs_disabled(),
8464 ++ current->pid, current->comm);
8465 ++
8466 ++ debug_show_held_locks(current);
8467 ++ dump_stack();
8468 ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8469 ++}
8470 ++EXPORT_SYMBOL_GPL(__cant_sleep);
8471 ++
8472 ++#ifdef CONFIG_SMP
8473 ++void __cant_migrate(const char *file, int line)
8474 ++{
8475 ++ static unsigned long prev_jiffy;
8476 ++
8477 ++ if (irqs_disabled())
8478 ++ return;
8479 ++
8480 ++ if (is_migration_disabled(current))
8481 ++ return;
8482 ++
8483 ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8484 ++ return;
8485 ++
8486 ++ if (preempt_count() > 0)
8487 ++ return;
8488 ++
8489 ++ if (current->migration_flags & MDF_FORCE_ENABLED)
8490 ++ return;
8491 ++
8492 ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8493 ++ return;
8494 ++ prev_jiffy = jiffies;
8495 ++
8496 ++ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
8497 ++ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
8498 ++ in_atomic(), irqs_disabled(), is_migration_disabled(current),
8499 ++ current->pid, current->comm);
8500 ++
8501 ++ debug_show_held_locks(current);
8502 ++ dump_stack();
8503 ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8504 ++}
8505 ++EXPORT_SYMBOL_GPL(__cant_migrate);
8506 ++#endif
8507 ++#endif
8508 ++
8509 ++#ifdef CONFIG_MAGIC_SYSRQ
8510 ++void normalize_rt_tasks(void)
8511 ++{
8512 ++ struct task_struct *g, *p;
8513 ++ struct sched_attr attr = {
8514 ++ .sched_policy = SCHED_NORMAL,
8515 ++ };
8516 ++
8517 ++ read_lock(&tasklist_lock);
8518 ++ for_each_process_thread(g, p) {
8519 ++ /*
8520 ++ * Only normalize user tasks:
8521 ++ */
8522 ++ if (p->flags & PF_KTHREAD)
8523 ++ continue;
8524 ++
8525 ++ schedstat_set(p->stats.wait_start, 0);
8526 ++ schedstat_set(p->stats.sleep_start, 0);
8527 ++ schedstat_set(p->stats.block_start, 0);
8528 ++
8529 ++ if (!rt_task(p)) {
8530 ++ /*
8531 ++ * Renice negative nice level userspace
8532 ++ * tasks back to 0:
8533 ++ */
8534 ++ if (task_nice(p) < 0)
8535 ++ set_user_nice(p, 0);
8536 ++ continue;
8537 ++ }
8538 ++
8539 ++ __sched_setscheduler(p, &attr, false, false);
8540 ++ }
8541 ++ read_unlock(&tasklist_lock);
8542 ++}
8543 ++#endif /* CONFIG_MAGIC_SYSRQ */
8544 ++
8545 ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8546 ++/*
8547 ++ * These functions are only useful for the IA64 MCA handling, or kdb.
8548 ++ *
8549 ++ * They can only be called when the whole system has been
8550 ++ * stopped - every CPU needs to be quiescent, and no scheduling
8551 ++ * activity can take place. Using them for anything else would
8552 ++ * be a serious bug, and as a result, they aren't even visible
8553 ++ * under any other configuration.
8554 ++ */
8555 ++
8556 ++/**
8557 ++ * curr_task - return the current task for a given CPU.
8558 ++ * @cpu: the processor in question.
8559 ++ *
8560 ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8561 ++ *
8562 ++ * Return: The current task for @cpu.
8563 ++ */
8564 ++struct task_struct *curr_task(int cpu)
8565 ++{
8566 ++ return cpu_curr(cpu);
8567 ++}
8568 ++
8569 ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8570 ++
8571 ++#ifdef CONFIG_IA64
8572 ++/**
8573 ++ * ia64_set_curr_task - set the current task for a given CPU.
8574 ++ * @cpu: the processor in question.
8575 ++ * @p: the task pointer to set.
8576 ++ *
8577 ++ * Description: This function must only be used when non-maskable interrupts
8578 ++ * are serviced on a separate stack. It allows the architecture to switch the
8579 ++ * notion of the current task on a CPU in a non-blocking manner. This function
8580 ++ * must be called with all CPUs synchronised, and interrupts disabled, and
8581 ++ * the caller must save the original value of the current task (see
8582 ++ * curr_task() above) and restore that value before reenabling interrupts and
8583 ++ * re-starting the system.
8584 ++ *
8585 ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8586 ++ */
8587 ++void ia64_set_curr_task(int cpu, struct task_struct *p)
8588 ++{
8589 ++ cpu_curr(cpu) = p;
8590 ++}
8591 ++
8592 ++#endif
8593 ++
8594 ++#ifdef CONFIG_CGROUP_SCHED
8595 ++static void sched_free_group(struct task_group *tg)
8596 ++{
8597 ++ kmem_cache_free(task_group_cache, tg);
8598 ++}
8599 ++
8600 ++static void sched_free_group_rcu(struct rcu_head *rhp)
8601 ++{
8602 ++ sched_free_group(container_of(rhp, struct task_group, rcu));
8603 ++}
8604 ++
8605 ++static void sched_unregister_group(struct task_group *tg)
8606 ++{
8607 ++ /*
8608 ++ * We have to wait for yet another RCU grace period to expire, as
8609 ++ * print_cfs_stats() might run concurrently.
8610 ++ */
8611 ++ call_rcu(&tg->rcu, sched_free_group_rcu);
8612 ++}
8613 ++
8614 ++/* allocate runqueue etc for a new task group */
8615 ++struct task_group *sched_create_group(struct task_group *parent)
8616 ++{
8617 ++ struct task_group *tg;
8618 ++
8619 ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
8620 ++ if (!tg)
8621 ++ return ERR_PTR(-ENOMEM);
8622 ++
8623 ++ return tg;
8624 ++}
8625 ++
8626 ++void sched_online_group(struct task_group *tg, struct task_group *parent)
8627 ++{
8628 ++}
8629 ++
8630 ++/* rcu callback to free various structures associated with a task group */
8631 ++static void sched_unregister_group_rcu(struct rcu_head *rhp)
8632 ++{
8633 ++ /* Now it should be safe to free those cfs_rqs: */
8634 ++ sched_unregister_group(container_of(rhp, struct task_group, rcu));
8635 ++}
8636 ++
8637 ++void sched_destroy_group(struct task_group *tg)
8638 ++{
8639 ++ /* Wait for possible concurrent references to cfs_rqs complete: */
8640 ++ call_rcu(&tg->rcu, sched_unregister_group_rcu);
8641 ++}
8642 ++
8643 ++void sched_release_group(struct task_group *tg)
8644 ++{
8645 ++}
8646 ++
8647 ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
8648 ++{
8649 ++ return css ? container_of(css, struct task_group, css) : NULL;
8650 ++}
8651 ++
8652 ++static struct cgroup_subsys_state *
8653 ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8654 ++{
8655 ++ struct task_group *parent = css_tg(parent_css);
8656 ++ struct task_group *tg;
8657 ++
8658 ++ if (!parent) {
8659 ++ /* This is early initialization for the top cgroup */
8660 ++ return &root_task_group.css;
8661 ++ }
8662 ++
8663 ++ tg = sched_create_group(parent);
8664 ++ if (IS_ERR(tg))
8665 ++ return ERR_PTR(-ENOMEM);
8666 ++ return &tg->css;
8667 ++}
8668 ++
8669 ++/* Expose task group only after completing cgroup initialization */
8670 ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
8671 ++{
8672 ++ struct task_group *tg = css_tg(css);
8673 ++ struct task_group *parent = css_tg(css->parent);
8674 ++
8675 ++ if (parent)
8676 ++ sched_online_group(tg, parent);
8677 ++ return 0;
8678 ++}
8679 ++
8680 ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
8681 ++{
8682 ++ struct task_group *tg = css_tg(css);
8683 ++
8684 ++ sched_release_group(tg);
8685 ++}
8686 ++
8687 ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
8688 ++{
8689 ++ struct task_group *tg = css_tg(css);
8690 ++
8691 ++ /*
8692 ++ * Relies on the RCU grace period between css_released() and this.
8693 ++ */
8694 ++ sched_unregister_group(tg);
8695 ++}
8696 ++
8697 ++#ifdef CONFIG_RT_GROUP_SCHED
8698 ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8699 ++{
8700 ++ return 0;
8701 ++}
8702 ++#endif
8703 ++
8704 ++static void cpu_cgroup_attach(struct cgroup_taskset *tset)
8705 ++{
8706 ++}
8707 ++
8708 ++#ifdef CONFIG_FAIR_GROUP_SCHED
8709 ++static DEFINE_MUTEX(shares_mutex);
8710 ++
8711 ++int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8712 ++{
8713 ++ /*
8714 ++ * We can't change the weight of the root cgroup.
8715 ++ */
8716 ++ if (&root_task_group == tg)
8717 ++ return -EINVAL;
8718 ++
8719 ++ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8720 ++
8721 ++ mutex_lock(&shares_mutex);
8722 ++ if (tg->shares == shares)
8723 ++ goto done;
8724 ++
8725 ++ tg->shares = shares;
8726 ++done:
8727 ++ mutex_unlock(&shares_mutex);
8728 ++ return 0;
8729 ++}
8730 ++
8731 ++static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8732 ++ struct cftype *cftype, u64 shareval)
8733 ++{
8734 ++ if (shareval > scale_load_down(ULONG_MAX))
8735 ++ shareval = MAX_SHARES;
8736 ++ return sched_group_set_shares(css_tg(css), scale_load(shareval));
8737 ++}
8738 ++
8739 ++static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
8740 ++ struct cftype *cft)
8741 ++{
8742 ++ struct task_group *tg = css_tg(css);
8743 ++
8744 ++ return (u64) scale_load_down(tg->shares);
8745 ++}
8746 ++#endif
8747 ++
8748 ++static struct cftype cpu_legacy_files[] = {
8749 ++#ifdef CONFIG_FAIR_GROUP_SCHED
8750 ++ {
8751 ++ .name = "shares",
8752 ++ .read_u64 = cpu_shares_read_u64,
8753 ++ .write_u64 = cpu_shares_write_u64,
8754 ++ },
8755 ++#endif
8756 ++ { } /* Terminate */
8757 ++};
8758 ++
8759 ++
8760 ++static struct cftype cpu_files[] = {
8761 ++ { } /* terminate */
8762 ++};
8763 ++
8764 ++static int cpu_extra_stat_show(struct seq_file *sf,
8765 ++ struct cgroup_subsys_state *css)
8766 ++{
8767 ++ return 0;
8768 ++}
8769 ++
8770 ++struct cgroup_subsys cpu_cgrp_subsys = {
8771 ++ .css_alloc = cpu_cgroup_css_alloc,
8772 ++ .css_online = cpu_cgroup_css_online,
8773 ++ .css_released = cpu_cgroup_css_released,
8774 ++ .css_free = cpu_cgroup_css_free,
8775 ++ .css_extra_stat_show = cpu_extra_stat_show,
8776 ++#ifdef CONFIG_RT_GROUP_SCHED
8777 ++ .can_attach = cpu_cgroup_can_attach,
8778 ++#endif
8779 ++ .attach = cpu_cgroup_attach,
8781 ++ .legacy_cftypes = cpu_legacy_files,
8782 ++ .dfl_cftypes = cpu_files,
8783 ++ .early_init = true,
8784 ++ .threaded = true,
8785 ++};
8786 ++#endif /* CONFIG_CGROUP_SCHED */
8787 ++
8788 ++#undef CREATE_TRACE_POINTS
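
The legacy "shares" cftype registered above is what cgroup v1 exposes as the
cpu.shares file. As a rough userspace sketch only, assuming a v1 cpu controller
mounted at /sys/fs/cgroup/cpu and a pre-created group named "example" (both
assumptions, not something this patch provides), bumping a group's weight looks
like:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        /* Path is an assumption: cgroup v1 cpu controller, group "example". */
        const char *path = "/sys/fs/cgroup/cpu/example/cpu.shares";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror("fopen");
                return EXIT_FAILURE;
        }
        /* 2048 asks for twice the default weight of 1024. */
        fprintf(f, "2048\n");
        fclose(f);
        return EXIT_SUCCESS;
}

Whatever value lands in that file is routed through cpu_shares_write_u64() and
clamped by sched_group_set_shares() above.
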
8789 +diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c
8790 +new file mode 100644
8791 +index 000000000000..1212a031700e
8792 +--- /dev/null
8793 ++++ b/kernel/sched/alt_debug.c
8794 +@@ -0,0 +1,31 @@
8795 ++/*
8796 ++ * kernel/sched/alt_debug.c
8797 ++ *
8798 ++ * Print the alt scheduler debugging details
8799 ++ *
8800 ++ * Author: Alfred Chen
8801 ++ * Date : 2020
8802 ++ */
8803 ++#include "sched.h"
8804 ++
8805 ++/*
8806 ++ * This allows printing both to /proc/sched_debug and
8807 ++ * to the console
8808 ++ */
8809 ++#define SEQ_printf(m, x...) \
8810 ++ do { \
8811 ++ if (m) \
8812 ++ seq_printf(m, x); \
8813 ++ else \
8814 ++ pr_cont(x); \
8815 ++ } while (0)
8816 ++
8817 ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
8818 ++ struct seq_file *m)
8819 ++{
8820 ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
8821 ++ get_nr_threads(p));
8822 ++}
8823 ++
8824 ++void proc_sched_set_task(struct task_struct *p)
8825 ++{}
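
The SEQ_printf() macro above sends output either to a seq_file or to the
console, depending on whether a seq_file was passed in. The same dual-sink
pattern, sketched as plain userspace C with FILE pointers standing in for
seq_file and printk (names here are illustrative only):

#include <stdarg.h>
#include <stdio.h>

/* Print to the given stream if one is supplied, else fall back to stderr. */
static void seq_printf_demo(FILE *m, const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        if (m)
                vfprintf(m, fmt, ap);
        else
                vfprintf(stderr, fmt, ap);
        va_end(ap);
}

int main(void)
{
        seq_printf_demo(stdout, "%s (%d, #threads: %d)\n", "demo", 1234, 1);
        seq_printf_demo(NULL, "fallback path\n");
        return 0;
}
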
8826 +diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h
8827 +new file mode 100644
8828 +index 000000000000..0b563999d4c1
8829 +--- /dev/null
8830 ++++ b/kernel/sched/alt_sched.h
8831 +@@ -0,0 +1,671 @@
8832 ++#ifndef ALT_SCHED_H
8833 ++#define ALT_SCHED_H
8834 ++
8835 ++#include <linux/context_tracking.h>
8836 ++#include <linux/profile.h>
8837 ++#include <linux/psi.h>
8838 ++#include <linux/stop_machine.h>
8839 ++#include <linux/syscalls.h>
8840 ++#include <linux/tick.h>
8841 ++
8842 ++#include <trace/events/power.h>
8843 ++#include <trace/events/sched.h>
8844 ++
8845 ++#include "../workqueue_internal.h"
8846 ++
8847 ++#include "cpupri.h"
8848 ++
8849 ++#ifdef CONFIG_SCHED_BMQ
8850 ++/* bits:
8851 ++ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */
8852 ++#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH / 2 + MAX_PRIORITY_ADJ + 1)
8853 ++#endif
8854 ++
8855 ++#ifdef CONFIG_SCHED_PDS
8856 ++/* bits: RT(0-99), reserved(100-127), NORMAL_PRIO_NUM, cpu idle task */
8857 ++#define SCHED_BITS (MIN_NORMAL_PRIO + NORMAL_PRIO_NUM + 1)
8858 ++#endif /* CONFIG_SCHED_PDS */
8859 ++
8860 ++#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1)
8861 ++
8862 ++#ifdef CONFIG_SCHED_DEBUG
8863 ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
8864 ++extern void resched_latency_warn(int cpu, u64 latency);
8865 ++#else
8866 ++# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
8867 ++static inline void resched_latency_warn(int cpu, u64 latency) {}
8868 ++#endif
8869 ++
8870 ++/*
8871 ++ * Increase resolution of nice-level calculations for 64-bit architectures.
8872 ++ * The extra resolution improves shares distribution and load balancing of
8873 ++ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
8874 ++ * hierarchies, especially on larger systems. This is not a user-visible change
8875 ++ * and does not change the user-interface for setting shares/weights.
8876 ++ *
8877 ++ * We increase resolution only if we have enough bits to allow this increased
8878 ++ * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
8879 ++ * are pretty high and the returns do not justify the increased costs.
8880 ++ *
8881 ++ * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
8882 ++ * increase coverage and consistency always enable it on 64-bit platforms.
8883 ++ */
8884 ++#ifdef CONFIG_64BIT
8885 ++# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
8886 ++# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
8887 ++# define scale_load_down(w) \
8888 ++({ \
8889 ++ unsigned long __w = (w); \
8890 ++ if (__w) \
8891 ++ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
8892 ++ __w; \
8893 ++})
8894 ++#else
8895 ++# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
8896 ++# define scale_load(w) (w)
8897 ++# define scale_load_down(w) (w)
8898 ++#endif
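
On 64-bit builds the macros above widen nice-level weights by
SCHED_FIXEDPOINT_SHIFT (10 in mainline, assumed below), and scale_load_down()
floors the shifted-back result at 2 so it can never reach the problematic 0/1
range noted further down. A standalone sketch of that arithmetic:

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT 10       /* mainline value, assumed here */

static unsigned long scale_load(unsigned long w)
{
        return w << SCHED_FIXEDPOINT_SHIFT;
}

static unsigned long scale_load_down(unsigned long w)
{
        unsigned long down = w >> SCHED_FIXEDPOINT_SHIFT;

        if (!w)
                return 0;
        return down > 2UL ? down : 2UL; /* never drop into the 0/1 range */
}

int main(void)
{
        unsigned long nice0 = 1024;             /* nice-0 weight */
        unsigned long scaled = scale_load(nice0);

        printf("scale_load(1024)      = %lu\n", scaled);       /* 1048576 */
        printf("scale_load_down(%lu)  = %lu\n", scaled, scale_load_down(scaled));
        printf("scale_load_down(1)    = %lu\n", scale_load_down(1UL));  /* 2 */
        return 0;
}
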
8899 ++
8900 ++#ifdef CONFIG_FAIR_GROUP_SCHED
8901 ++#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
8902 ++
8903 ++/*
8904 ++ * A weight of 0 or 1 can cause arithmetic problems.
8905 ++ * The weight of a cfs_rq is the sum of the weights of the entities
8906 ++ * queued on that cfs_rq, so the weight of an entity should not be
8907 ++ * too large, nor should the shares value of a task group.
8908 ++ * (The default weight is 1024 - so there's no practical
8909 ++ * limitation from this.)
8910 ++ */
8911 ++#define MIN_SHARES (1UL << 1)
8912 ++#define MAX_SHARES (1UL << 18)
8913 ++#endif
8914 ++
8915 ++/*
8916 ++ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
8917 ++ */
8918 ++#ifdef CONFIG_SCHED_DEBUG
8919 ++# define const_debug __read_mostly
8920 ++#else
8921 ++# define const_debug const
8922 ++#endif
8923 ++
8924 ++/* task_struct::on_rq states: */
8925 ++#define TASK_ON_RQ_QUEUED 1
8926 ++#define TASK_ON_RQ_MIGRATING 2
8927 ++
8928 ++static inline int task_on_rq_queued(struct task_struct *p)
8929 ++{
8930 ++ return p->on_rq == TASK_ON_RQ_QUEUED;
8931 ++}
8932 ++
8933 ++static inline int task_on_rq_migrating(struct task_struct *p)
8934 ++{
8935 ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
8936 ++}
8937 ++
8938 ++/*
8939 ++ * wake flags
8940 ++ */
8941 ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
8942 ++#define WF_FORK 0x02 /* child wakeup after fork */
8943 ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */
8944 ++
8945 ++#define SCHED_QUEUE_BITS (SCHED_BITS - 1)
8946 ++
8947 ++struct sched_queue {
8948 ++ DECLARE_BITMAP(bitmap, SCHED_QUEUE_BITS);
8949 ++ struct list_head heads[SCHED_BITS];
8950 ++};
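
struct sched_queue above is the core BMQ/PDS structure: one list head per
priority level plus a bitmap of non-empty levels, so picking the next task
reduces to a find-first-set-bit lookup. A toy userspace model of that lookup,
using a single 64-bit word and counters instead of DECLARE_BITMAP() and real
list heads (the sizes are illustrative, not SCHED_BITS):

#include <stdint.h>
#include <stdio.h>

#define NR_PRIOS 64

static uint64_t bitmap;                 /* bit n set => queue n is non-empty */
static int queue_len[NR_PRIOS];         /* stand-in for the per-prio list heads */

static void enqueue(int prio)
{
        queue_len[prio]++;
        bitmap |= 1ULL << prio;
}

static void dequeue(int prio)
{
        if (--queue_len[prio] == 0)
                bitmap &= ~(1ULL << prio);
}

/* Lowest set bit == highest priority (smaller index runs first). */
static int pick_next_prio(void)
{
        return bitmap ? __builtin_ctzll(bitmap) : -1;
}

int main(void)
{
        enqueue(20);
        enqueue(5);
        printf("next prio: %d\n", pick_next_prio());    /* 5 */
        dequeue(5);
        printf("next prio: %d\n", pick_next_prio());    /* 20 */
        return 0;
}
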
8951 ++
8952 ++struct rq;
8953 ++struct balance_callback {
8954 ++ struct balance_callback *next;
8955 ++ void (*func)(struct rq *rq);
8956 ++};
8957 ++
8958 ++/*
8959 ++ * This is the main, per-CPU runqueue data structure.
8960 ++ * This data should only be modified by the local cpu.
8961 ++ */
8962 ++struct rq {
8963 ++ /* runqueue lock: */
8964 ++ raw_spinlock_t lock;
8965 ++
8966 ++ struct task_struct __rcu *curr;
8967 ++ struct task_struct *idle, *stop, *skip;
8968 ++ struct mm_struct *prev_mm;
8969 ++
8970 ++ struct sched_queue queue;
8971 ++#ifdef CONFIG_SCHED_PDS
8972 ++ u64 time_edge;
8973 ++#endif
8974 ++ unsigned long prio;
8975 ++
8976 ++ /* switch count */
8977 ++ u64 nr_switches;
8978 ++
8979 ++ atomic_t nr_iowait;
8980 ++
8981 ++#ifdef CONFIG_SCHED_DEBUG
8982 ++ u64 last_seen_need_resched_ns;
8983 ++ int ticks_without_resched;
8984 ++#endif
8985 ++
8986 ++#ifdef CONFIG_MEMBARRIER
8987 ++ int membarrier_state;
8988 ++#endif
8989 ++
8990 ++#ifdef CONFIG_SMP
8991 ++ int cpu; /* cpu of this runqueue */
8992 ++ bool online;
8993 ++
8994 ++ unsigned int ttwu_pending;
8995 ++ unsigned char nohz_idle_balance;
8996 ++ unsigned char idle_balance;
8997 ++
8998 ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
8999 ++ struct sched_avg avg_irq;
9000 ++#endif
9001 ++
9002 ++#ifdef CONFIG_SCHED_SMT
9003 ++ int active_balance;
9004 ++ struct cpu_stop_work active_balance_work;
9005 ++#endif
9006 ++ struct balance_callback *balance_callback;
9007 ++#ifdef CONFIG_HOTPLUG_CPU
9008 ++ struct rcuwait hotplug_wait;
9009 ++#endif
9010 ++ unsigned int nr_pinned;
9011 ++
9012 ++#endif /* CONFIG_SMP */
9013 ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING
9014 ++ u64 prev_irq_time;
9015 ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
9016 ++#ifdef CONFIG_PARAVIRT
9017 ++ u64 prev_steal_time;
9018 ++#endif /* CONFIG_PARAVIRT */
9019 ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
9020 ++ u64 prev_steal_time_rq;
9021 ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
9022 ++
9023 ++ /* For general CPU load util */
9024 ++ s32 load_history;
9025 ++ u64 load_block;
9026 ++ u64 load_stamp;
9027 ++
9028 ++ /* calc_load related fields */
9029 ++ unsigned long calc_load_update;
9030 ++ long calc_load_active;
9031 ++
9032 ++ u64 clock, last_tick;
9033 ++ u64 last_ts_switch;
9034 ++ u64 clock_task;
9035 ++
9036 ++ unsigned int nr_running;
9037 ++ unsigned long nr_uninterruptible;
9038 ++
9039 ++#ifdef CONFIG_SCHED_HRTICK
9040 ++#ifdef CONFIG_SMP
9041 ++ call_single_data_t hrtick_csd;
9042 ++#endif
9043 ++ struct hrtimer hrtick_timer;
9044 ++ ktime_t hrtick_time;
9045 ++#endif
9046 ++
9047 ++#ifdef CONFIG_SCHEDSTATS
9048 ++
9049 ++ /* latency stats */
9050 ++ struct sched_info rq_sched_info;
9051 ++ unsigned long long rq_cpu_time;
9052 ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
9053 ++
9054 ++ /* sys_sched_yield() stats */
9055 ++ unsigned int yld_count;
9056 ++
9057 ++ /* schedule() stats */
9058 ++ unsigned int sched_switch;
9059 ++ unsigned int sched_count;
9060 ++ unsigned int sched_goidle;
9061 ++
9062 ++ /* try_to_wake_up() stats */
9063 ++ unsigned int ttwu_count;
9064 ++ unsigned int ttwu_local;
9065 ++#endif /* CONFIG_SCHEDSTATS */
9066 ++
9067 ++#ifdef CONFIG_CPU_IDLE
9068 ++ /* Must be inspected within a rcu lock section */
9069 ++ struct cpuidle_state *idle_state;
9070 ++#endif
9071 ++
9072 ++#ifdef CONFIG_NO_HZ_COMMON
9073 ++#ifdef CONFIG_SMP
9074 ++ call_single_data_t nohz_csd;
9075 ++#endif
9076 ++ atomic_t nohz_flags;
9077 ++#endif /* CONFIG_NO_HZ_COMMON */
9078 ++
9079 ++ /* Scratch cpumask to be temporarily used under rq_lock */
9080 ++ cpumask_var_t scratch_mask;
9081 ++};
9082 ++
9083 ++extern unsigned long rq_load_util(struct rq *rq, unsigned long max);
9084 ++
9085 ++extern unsigned long calc_load_update;
9086 ++extern atomic_long_t calc_load_tasks;
9087 ++
9088 ++extern void calc_global_load_tick(struct rq *this_rq);
9089 ++extern long calc_load_fold_active(struct rq *this_rq, long adjust);
9090 ++
9091 ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
9092 ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
9093 ++#define this_rq() this_cpu_ptr(&runqueues)
9094 ++#define task_rq(p) cpu_rq(task_cpu(p))
9095 ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
9096 ++#define raw_rq() raw_cpu_ptr(&runqueues)
9097 ++
9098 ++#ifdef CONFIG_SMP
9099 ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
9100 ++void register_sched_domain_sysctl(void);
9101 ++void unregister_sched_domain_sysctl(void);
9102 ++#else
9103 ++static inline void register_sched_domain_sysctl(void)
9104 ++{
9105 ++}
9106 ++static inline void unregister_sched_domain_sysctl(void)
9107 ++{
9108 ++}
9109 ++#endif
9110 ++
9111 ++extern bool sched_smp_initialized;
9112 ++
9113 ++enum {
9114 ++ ITSELF_LEVEL_SPACE_HOLDER,
9115 ++#ifdef CONFIG_SCHED_SMT
9116 ++ SMT_LEVEL_SPACE_HOLDER,
9117 ++#endif
9118 ++ COREGROUP_LEVEL_SPACE_HOLDER,
9119 ++ CORE_LEVEL_SPACE_HOLDER,
9120 ++ OTHER_LEVEL_SPACE_HOLDER,
9121 ++ NR_CPU_AFFINITY_LEVELS
9122 ++};
9123 ++
9124 ++DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_topo_masks);
9125 ++DECLARE_PER_CPU(cpumask_t *, sched_cpu_llc_mask);
9126 ++
9127 ++static inline int
9128 ++__best_mask_cpu(const cpumask_t *cpumask, const cpumask_t *mask)
9129 ++{
9130 ++ int cpu;
9131 ++
9132 ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids)
9133 ++ mask++;
9134 ++
9135 ++ return cpu;
9136 ++}
9137 ++
9138 ++static inline int best_mask_cpu(int cpu, const cpumask_t *mask)
9139 ++{
9140 ++ return __best_mask_cpu(mask, per_cpu(sched_cpu_topo_masks, cpu));
9141 ++}
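
__best_mask_cpu() above walks outward through the per-CPU topology masks (SMT
siblings first, then the core group, and so on) until the allowed set
intersects one of them, and returns a CPU from the first level that hits. A
rough userspace analogue with plain 64-bit masks; the levels and mask values
below are invented for illustration:

#include <stdint.h>
#include <stdio.h>

#define NR_LEVELS 3

/* Topology masks for CPU 0, innermost level first (values are made up). */
static const uint64_t topo_masks[NR_LEVELS] = {
        0x0000000000000003ULL,  /* SMT siblings: CPUs 0-1    */
        0x000000000000000fULL,  /* same core group: CPUs 0-3 */
        0xffffffffffffffffULL,  /* everything else           */
};

static int best_mask_cpu_demo(uint64_t allowed)
{
        for (int level = 0; level < NR_LEVELS; level++) {
                uint64_t hit = allowed & topo_masks[level];

                if (hit)
                        return __builtin_ctzll(hit);
        }
        return -1;
}

int main(void)
{
        printf("%d\n", best_mask_cpu_demo(0x02)); /* 1: shares SMT with CPU 0 */
        printf("%d\n", best_mask_cpu_demo(0x30)); /* 4: only hits the outer level */
        return 0;
}
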
9142 ++
9143 ++extern void flush_smp_call_function_queue(void);
9144 ++
9145 ++#else /* !CONFIG_SMP */
9146 ++static inline void flush_smp_call_function_queue(void) { }
9147 ++#endif
9148 ++
9149 ++#ifndef arch_scale_freq_tick
9150 ++static __always_inline
9151 ++void arch_scale_freq_tick(void)
9152 ++{
9153 ++}
9154 ++#endif
9155 ++
9156 ++#ifndef arch_scale_freq_capacity
9157 ++static __always_inline
9158 ++unsigned long arch_scale_freq_capacity(int cpu)
9159 ++{
9160 ++ return SCHED_CAPACITY_SCALE;
9161 ++}
9162 ++#endif
9163 ++
9164 ++static inline u64 __rq_clock_broken(struct rq *rq)
9165 ++{
9166 ++ return READ_ONCE(rq->clock);
9167 ++}
9168 ++
9169 ++static inline u64 rq_clock(struct rq *rq)
9170 ++{
9171 ++ /*
9172 ++ * Relax lockdep_assert_held() checking as in VRQ; callers of
9173 ++ * sched_info_xxxx() may not hold rq->lock
9174 ++ * lockdep_assert_held(&rq->lock);
9175 ++ */
9176 ++ return rq->clock;
9177 ++}
9178 ++
9179 ++static inline u64 rq_clock_task(struct rq *rq)
9180 ++{
9181 ++ /*
9182 ++ * Relax lockdep_assert_held() checking as in VRQ; callers of
9183 ++ * sched_info_xxxx() may not hold rq->lock
9184 ++ * lockdep_assert_held(&rq->lock);
9185 ++ */
9186 ++ return rq->clock_task;
9187 ++}
9188 ++
9189 ++/*
9190 ++ * {de,en}queue flags:
9191 ++ *
9192 ++ * DEQUEUE_SLEEP - task is no longer runnable
9193 ++ * ENQUEUE_WAKEUP - task just became runnable
9194 ++ *
9195 ++ */
9196 ++
9197 ++#define DEQUEUE_SLEEP 0x01
9198 ++
9199 ++#define ENQUEUE_WAKEUP 0x01
9200 ++
9201 ++
9202 ++/*
9203 ++ * Below are the scheduler APIs used by other kernel code.
9204 ++ * They use the dummy rq_flags.
9205 ++ * TODO: BMQ needs to support these APIs for compatibility with mainline
9206 ++ * scheduler code.
9207 ++ */
9208 ++struct rq_flags {
9209 ++ unsigned long flags;
9210 ++};
9211 ++
9212 ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
9213 ++ __acquires(rq->lock);
9214 ++
9215 ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
9216 ++ __acquires(p->pi_lock)
9217 ++ __acquires(rq->lock);
9218 ++
9219 ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
9220 ++ __releases(rq->lock)
9221 ++{
9222 ++ raw_spin_unlock(&rq->lock);
9223 ++}
9224 ++
9225 ++static inline void
9226 ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
9227 ++ __releases(rq->lock)
9228 ++ __releases(p->pi_lock)
9229 ++{
9230 ++ raw_spin_unlock(&rq->lock);
9231 ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
9232 ++}
9233 ++
9234 ++static inline void
9235 ++rq_lock(struct rq *rq, struct rq_flags *rf)
9236 ++ __acquires(rq->lock)
9237 ++{
9238 ++ raw_spin_lock(&rq->lock);
9239 ++}
9240 ++
9241 ++static inline void
9242 ++rq_unlock(struct rq *rq, struct rq_flags *rf)
9243 ++ __releases(rq->lock)
9244 ++{
9245 ++ raw_spin_unlock(&rq->lock);
9246 ++}
9247 ++
9248 ++static inline void
9249 ++rq_lock_irq(struct rq *rq, struct rq_flags *rf)
9250 ++ __acquires(rq->lock)
9251 ++{
9252 ++ raw_spin_lock_irq(&rq->lock);
9253 ++}
9254 ++
9255 ++static inline void
9256 ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
9257 ++ __releases(rq->lock)
9258 ++{
9259 ++ raw_spin_unlock_irq(&rq->lock);
9260 ++}
9261 ++
9262 ++static inline struct rq *
9263 ++this_rq_lock_irq(struct rq_flags *rf)
9264 ++ __acquires(rq->lock)
9265 ++{
9266 ++ struct rq *rq;
9267 ++
9268 ++ local_irq_disable();
9269 ++ rq = this_rq();
9270 ++ raw_spin_lock(&rq->lock);
9271 ++
9272 ++ return rq;
9273 ++}
9274 ++
9275 ++static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
9276 ++{
9277 ++ return &rq->lock;
9278 ++}
9279 ++
9280 ++static inline raw_spinlock_t *rq_lockp(struct rq *rq)
9281 ++{
9282 ++ return __rq_lockp(rq);
9283 ++}
9284 ++
9285 ++static inline void lockdep_assert_rq_held(struct rq *rq)
9286 ++{
9287 ++ lockdep_assert_held(__rq_lockp(rq));
9288 ++}
9289 ++
9290 ++extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass);
9291 ++extern void raw_spin_rq_unlock(struct rq *rq);
9292 ++
9293 ++static inline void raw_spin_rq_lock(struct rq *rq)
9294 ++{
9295 ++ raw_spin_rq_lock_nested(rq, 0);
9296 ++}
9297 ++
9298 ++static inline void raw_spin_rq_lock_irq(struct rq *rq)
9299 ++{
9300 ++ local_irq_disable();
9301 ++ raw_spin_rq_lock(rq);
9302 ++}
9303 ++
9304 ++static inline void raw_spin_rq_unlock_irq(struct rq *rq)
9305 ++{
9306 ++ raw_spin_rq_unlock(rq);
9307 ++ local_irq_enable();
9308 ++}
9309 ++
9310 ++static inline int task_current(struct rq *rq, struct task_struct *p)
9311 ++{
9312 ++ return rq->curr == p;
9313 ++}
9314 ++
9315 ++static inline bool task_on_cpu(struct task_struct *p)
9316 ++{
9317 ++ return p->on_cpu;
9318 ++}
9319 ++
9320 ++extern int task_running_nice(struct task_struct *p);
9321 ++
9322 ++extern struct static_key_false sched_schedstats;
9323 ++
9324 ++#ifdef CONFIG_CPU_IDLE
9325 ++static inline void idle_set_state(struct rq *rq,
9326 ++ struct cpuidle_state *idle_state)
9327 ++{
9328 ++ rq->idle_state = idle_state;
9329 ++}
9330 ++
9331 ++static inline struct cpuidle_state *idle_get_state(struct rq *rq)
9332 ++{
9333 ++ WARN_ON(!rcu_read_lock_held());
9334 ++ return rq->idle_state;
9335 ++}
9336 ++#else
9337 ++static inline void idle_set_state(struct rq *rq,
9338 ++ struct cpuidle_state *idle_state)
9339 ++{
9340 ++}
9341 ++
9342 ++static inline struct cpuidle_state *idle_get_state(struct rq *rq)
9343 ++{
9344 ++ return NULL;
9345 ++}
9346 ++#endif
9347 ++
9348 ++static inline int cpu_of(const struct rq *rq)
9349 ++{
9350 ++#ifdef CONFIG_SMP
9351 ++ return rq->cpu;
9352 ++#else
9353 ++ return 0;
9354 ++#endif
9355 ++}
9356 ++
9357 ++#include "stats.h"
9358 ++
9359 ++#ifdef CONFIG_NO_HZ_COMMON
9360 ++#define NOHZ_BALANCE_KICK_BIT 0
9361 ++#define NOHZ_STATS_KICK_BIT 1
9362 ++
9363 ++#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
9364 ++#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
9365 ++
9366 ++#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
9367 ++
9368 ++#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
9369 ++
9370 ++/* TODO: needed?
9371 ++extern void nohz_balance_exit_idle(struct rq *rq);
9372 ++#else
9373 ++static inline void nohz_balance_exit_idle(struct rq *rq) { }
9374 ++*/
9375 ++#endif
9376 ++
9377 ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING
9378 ++struct irqtime {
9379 ++ u64 total;
9380 ++ u64 tick_delta;
9381 ++ u64 irq_start_time;
9382 ++ struct u64_stats_sync sync;
9383 ++};
9384 ++
9385 ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
9386 ++
9387 ++/*
9388 ++ * Returns the irqtime minus the softirq time computed by ksoftirqd.
9389 ++ * Otherwise ksoftirqd's sum_exec_runtime would have its own runtime
9390 ++ * subtracted from it and never move forward.
9391 ++ */
9392 ++static inline u64 irq_time_read(int cpu)
9393 ++{
9394 ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
9395 ++ unsigned int seq;
9396 ++ u64 total;
9397 ++
9398 ++ do {
9399 ++ seq = __u64_stats_fetch_begin(&irqtime->sync);
9400 ++ total = irqtime->total;
9401 ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
9402 ++
9403 ++ return total;
9404 ++}
9405 ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
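
irq_time_read() above is a standard seqcount-style reader: sample the sequence
counter, copy the data, and retry if a writer was active in between. A
self-contained sketch of that retry loop using C11 atomics; this shows the
general pattern, not the kernel's u64_stats implementation:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct sample {
        atomic_uint seq;        /* even = stable, odd = write in progress */
        uint64_t total;
};

static void writer_update(struct sample *s, uint64_t delta)
{
        atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);
        s->total += delta;
        atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);
}

static uint64_t reader_read(const struct sample *s)
{
        unsigned int seq;
        uint64_t total;

        do {
                seq = atomic_load_explicit(&s->seq, memory_order_acquire);
                total = s->total;
        } while ((seq & 1) ||
                 seq != atomic_load_explicit(&s->seq, memory_order_acquire));

        return total;
}

int main(void)
{
        struct sample s = { .seq = 0, .total = 0 };

        writer_update(&s, 42);
        printf("total = %llu\n", (unsigned long long)reader_read(&s));
        return 0;
}
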
9406 ++
9407 ++#ifdef CONFIG_CPU_FREQ
9408 ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
9409 ++#endif /* CONFIG_CPU_FREQ */
9410 ++
9411 ++#ifdef CONFIG_NO_HZ_FULL
9412 ++extern int __init sched_tick_offload_init(void);
9413 ++#else
9414 ++static inline int sched_tick_offload_init(void) { return 0; }
9415 ++#endif
9416 ++
9417 ++#ifdef arch_scale_freq_capacity
9418 ++#ifndef arch_scale_freq_invariant
9419 ++#define arch_scale_freq_invariant() (true)
9420 ++#endif
9421 ++#else /* arch_scale_freq_capacity */
9422 ++#define arch_scale_freq_invariant() (false)
9423 ++#endif
9424 ++
9425 ++extern void schedule_idle(void);
9426 ++
9427 ++#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
9428 ++
9429 ++/*
9430 ++ * !! For sched_setattr_nocheck() (kernel) only !!
9431 ++ *
9432 ++ * This is actually gross. :(
9433 ++ *
9434 ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
9435 ++ * tasks, but still be able to sleep. We need this on platforms that cannot
9436 ++ * atomically change clock frequency. Remove once fast switching is
9437 ++ * available on such platforms.
9438 ++ *
9439 ++ * SUGOV stands for SchedUtil GOVernor.
9440 ++ */
9441 ++#define SCHED_FLAG_SUGOV 0x10000000
9442 ++
9443 ++#ifdef CONFIG_MEMBARRIER
9444 ++/*
9445 ++ * The scheduler provides memory barriers required by membarrier between:
9446 ++ * - prior user-space memory accesses and store to rq->membarrier_state,
9447 ++ * - store to rq->membarrier_state and following user-space memory accesses.
9448 ++ * In the same way it provides those guarantees around store to rq->curr.
9449 ++ */
9450 ++static inline void membarrier_switch_mm(struct rq *rq,
9451 ++ struct mm_struct *prev_mm,
9452 ++ struct mm_struct *next_mm)
9453 ++{
9454 ++ int membarrier_state;
9455 ++
9456 ++ if (prev_mm == next_mm)
9457 ++ return;
9458 ++
9459 ++ membarrier_state = atomic_read(&next_mm->membarrier_state);
9460 ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state)
9461 ++ return;
9462 ++
9463 ++ WRITE_ONCE(rq->membarrier_state, membarrier_state);
9464 ++}
9465 ++#else
9466 ++static inline void membarrier_switch_mm(struct rq *rq,
9467 ++ struct mm_struct *prev_mm,
9468 ++ struct mm_struct *next_mm)
9469 ++{
9470 ++}
9471 ++#endif
9472 ++
9473 ++#ifdef CONFIG_NUMA
9474 ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
9475 ++#else
9476 ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
9477 ++{
9478 ++ return nr_cpu_ids;
9479 ++}
9480 ++#endif
9481 ++
9482 ++extern void swake_up_all_locked(struct swait_queue_head *q);
9483 ++extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
9484 ++
9485 ++#ifdef CONFIG_PREEMPT_DYNAMIC
9486 ++extern int preempt_dynamic_mode;
9487 ++extern int sched_dynamic_mode(const char *str);
9488 ++extern void sched_dynamic_update(int mode);
9489 ++#endif
9490 ++
9491 ++static inline void nohz_run_idle_balance(int cpu) { }
9492 ++
9493 ++static inline
9494 ++unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
9495 ++ struct task_struct *p)
9496 ++{
9497 ++ return util;
9498 ++}
9499 ++
9500 ++static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; }
9501 ++
9502 ++#endif /* ALT_SCHED_H */
9503 +diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h
9504 +new file mode 100644
9505 +index 000000000000..66b77291b9d0
9506 +--- /dev/null
9507 ++++ b/kernel/sched/bmq.h
9508 +@@ -0,0 +1,110 @@
9509 ++#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n"
9510 ++
9511 ++/*
9512 ++ * BMQ only routines
9513 ++ */
9514 ++#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch)
9515 ++#define boost_threshold(p) (sched_timeslice_ns >>\
9516 ++ (15 - MAX_PRIORITY_ADJ - (p)->boost_prio))
9517 ++
9518 ++static inline void boost_task(struct task_struct *p)
9519 ++{
9520 ++ int limit;
9521 ++
9522 ++ switch (p->policy) {
9523 ++ case SCHED_NORMAL:
9524 ++ limit = -MAX_PRIORITY_ADJ;
9525 ++ break;
9526 ++ case SCHED_BATCH:
9527 ++ case SCHED_IDLE:
9528 ++ limit = 0;
9529 ++ break;
9530 ++ default:
9531 ++ return;
9532 ++ }
9533 ++
9534 ++ if (p->boost_prio > limit)
9535 ++ p->boost_prio--;
9536 ++}
9537 ++
9538 ++static inline void deboost_task(struct task_struct *p)
9539 ++{
9540 ++ if (p->boost_prio < MAX_PRIORITY_ADJ)
9541 ++ p->boost_prio++;
9542 ++}
9543 ++
9544 ++/*
9545 ++ * Common interfaces
9546 ++ */
9547 ++static inline void sched_timeslice_imp(const int timeslice_ms) {}
9548 ++
9549 ++static inline int
9550 ++task_sched_prio_normal(const struct task_struct *p, const struct rq *rq)
9551 ++{
9552 ++ return p->prio + p->boost_prio - MAX_RT_PRIO;
9553 ++}
9554 ++
9555 ++static inline int task_sched_prio(const struct task_struct *p)
9556 ++{
9557 ++ return (p->prio < MAX_RT_PRIO)? p->prio : MAX_RT_PRIO / 2 + (p->prio + p->boost_prio) / 2;
9558 ++}
9559 ++
9560 ++static inline int
9561 ++task_sched_prio_idx(const struct task_struct *p, const struct rq *rq)
9562 ++{
9563 ++ return task_sched_prio(p);
9564 ++}
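
task_sched_prio() above passes real-time priorities (below MAX_RT_PRIO)
straight through and folds a normal task's prio plus its boost_prio into the
upper half of the index space. A worked example of just that arithmetic, with
the usual mainline constants (MAX_RT_PRIO = 100, DEFAULT_PRIO = 120) copied in
for illustration:

#include <stdio.h>

#define MAX_RT_PRIO     100     /* mainline value */
#define DEFAULT_PRIO    120     /* nice 0 */

static int bmq_sched_prio(int prio, int boost_prio)
{
        return (prio < MAX_RT_PRIO) ? prio
                                    : MAX_RT_PRIO / 2 + (prio + boost_prio) / 2;
}

int main(void)
{
        /* RT prio 10 maps straight through. */
        printf("%d\n", bmq_sched_prio(10, 0));                  /* 10 */
        /* nice 0, no boost: 50 + 120 / 2 = 110 */
        printf("%d\n", bmq_sched_prio(DEFAULT_PRIO, 0));        /* 110 */
        /* nice 0, boost_prio -1: 50 + 119 / 2 = 109, a higher-priority queue */
        printf("%d\n", bmq_sched_prio(DEFAULT_PRIO, -1));       /* 109 */
        return 0;
}
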
9565 ++
9566 ++static inline int sched_prio2idx(int prio, struct rq *rq)
9567 ++{
9568 ++ return prio;
9569 ++}
9570 ++
9571 ++static inline int sched_idx2prio(int idx, struct rq *rq)
9572 ++{
9573 ++ return idx;
9574 ++}
9575 ++
9576 ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq)
9577 ++{
9578 ++ p->time_slice = sched_timeslice_ns;
9579 ++
9580 ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) {
9581 ++ if (SCHED_RR != p->policy)
9582 ++ deboost_task(p);
9583 ++ requeue_task(p, rq, task_sched_prio_idx(p, rq));
9584 ++ }
9585 ++}
9586 ++
9587 ++static inline void sched_task_sanity_check(struct task_struct *p, struct rq *rq) {}
9588 ++
9589 ++inline int task_running_nice(struct task_struct *p)
9590 ++{
9591 ++ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ);
9592 ++}
9593 ++
9594 ++static void sched_task_fork(struct task_struct *p, struct rq *rq)
9595 ++{
9596 ++ p->boost_prio = MAX_PRIORITY_ADJ;
9597 ++}
9598 ++
9599 ++static inline void do_sched_yield_type_1(struct task_struct *p, struct rq *rq)
9600 ++{
9601 ++ p->boost_prio = MAX_PRIORITY_ADJ;
9602 ++}
9603 ++
9604 ++#ifdef CONFIG_SMP
9605 ++static inline void sched_task_ttwu(struct task_struct *p)
9606 ++{
9607 ++ if (this_rq()->clock_task - p->last_ran > sched_timeslice_ns)
9608 ++ boost_task(p);
9609 ++}
9610 ++#endif
9611 ++
9612 ++static inline void sched_task_deactivate(struct task_struct *p, struct rq *rq)
9613 ++{
9614 ++ if (rq_switch_time(rq) < boost_threshold(p))
9615 ++ boost_task(p);
9616 ++}
9617 ++
9618 ++static inline void update_rq_time_edge(struct rq *rq) {}
9619 +diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
9620 +index d9dc9ab3773f..71a25540d65e 100644
9621 +--- a/kernel/sched/build_policy.c
9622 ++++ b/kernel/sched/build_policy.c
9623 +@@ -42,13 +42,19 @@
9624 +
9625 + #include "idle.c"
9626 +
9627 ++#ifndef CONFIG_SCHED_ALT
9628 + #include "rt.c"
9629 ++#endif
9630 +
9631 + #ifdef CONFIG_SMP
9632 ++#ifndef CONFIG_SCHED_ALT
9633 + # include "cpudeadline.c"
9634 ++#endif
9635 + # include "pelt.c"
9636 + #endif
9637 +
9638 + #include "cputime.c"
9639 +-#include "deadline.c"
9640 +
9641 ++#ifndef CONFIG_SCHED_ALT
9642 ++#include "deadline.c"
9643 ++#endif
9644 +diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
9645 +index 99bdd96f454f..23f80a86d2d7 100644
9646 +--- a/kernel/sched/build_utility.c
9647 ++++ b/kernel/sched/build_utility.c
9648 +@@ -85,7 +85,9 @@
9649 +
9650 + #ifdef CONFIG_SMP
9651 + # include "cpupri.c"
9652 ++#ifndef CONFIG_SCHED_ALT
9653 + # include "stop_task.c"
9654 ++#endif
9655 + # include "topology.c"
9656 + #endif
9657 +
9658 +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
9659 +index 1207c78f85c1..68812e0756cb 100644
9660 +--- a/kernel/sched/cpufreq_schedutil.c
9661 ++++ b/kernel/sched/cpufreq_schedutil.c
9662 +@@ -159,9 +159,14 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
9663 + struct rq *rq = cpu_rq(sg_cpu->cpu);
9664 +
9665 + sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
9666 ++#ifndef CONFIG_SCHED_ALT
9667 + sg_cpu->bw_dl = cpu_bw_dl(rq);
9668 + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
9669 + FREQUENCY_UTIL, NULL);
9670 ++#else
9671 ++ sg_cpu->bw_dl = 0;
9672 ++ sg_cpu->util = rq_load_util(rq, sg_cpu->max);
9673 ++#endif /* CONFIG_SCHED_ALT */
9674 + }
9675 +
9676 + /**
9677 +@@ -305,8 +310,10 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
9678 + */
9679 + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
9680 + {
9681 ++#ifndef CONFIG_SCHED_ALT
9682 + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
9683 + sg_cpu->sg_policy->limits_changed = true;
9684 ++#endif
9685 + }
9686 +
9687 + static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
9688 +@@ -606,6 +613,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
9689 + }
9690 +
9691 + ret = sched_setattr_nocheck(thread, &attr);
9692 ++
9693 + if (ret) {
9694 + kthread_stop(thread);
9695 + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
9696 +@@ -838,7 +846,9 @@ cpufreq_governor_init(schedutil_gov);
9697 + #ifdef CONFIG_ENERGY_MODEL
9698 + static void rebuild_sd_workfn(struct work_struct *work)
9699 + {
9700 ++#ifndef CONFIG_SCHED_ALT
9701 + rebuild_sched_domains_energy();
9702 ++#endif /* CONFIG_SCHED_ALT */
9703 + }
9704 + static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
9705 +
9706 +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
9707 +index 95fc77853743..b48b3f9ed47f 100644
9708 +--- a/kernel/sched/cputime.c
9709 ++++ b/kernel/sched/cputime.c
9710 +@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime)
9711 + p->utime += cputime;
9712 + account_group_user_time(p, cputime);
9713 +
9714 +- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
9715 ++ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER;
9716 +
9717 + /* Add user time to cpustat. */
9718 + task_group_account_field(p, index, cputime);
9719 +@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime)
9720 + p->gtime += cputime;
9721 +
9722 + /* Add guest time to cpustat. */
9723 +- if (task_nice(p) > 0) {
9724 ++ if (task_running_nice(p)) {
9725 + task_group_account_field(p, CPUTIME_NICE, cputime);
9726 + cpustat[CPUTIME_GUEST_NICE] += cputime;
9727 + } else {
9728 +@@ -284,7 +284,7 @@ static inline u64 account_other_time(u64 max)
9729 + #ifdef CONFIG_64BIT
9730 + static inline u64 read_sum_exec_runtime(struct task_struct *t)
9731 + {
9732 +- return t->se.sum_exec_runtime;
9733 ++ return tsk_seruntime(t);
9734 + }
9735 + #else
9736 + static u64 read_sum_exec_runtime(struct task_struct *t)
9737 +@@ -294,7 +294,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t)
9738 + struct rq *rq;
9739 +
9740 + rq = task_rq_lock(t, &rf);
9741 +- ns = t->se.sum_exec_runtime;
9742 ++ ns = tsk_seruntime(t);
9743 + task_rq_unlock(rq, t, &rf);
9744 +
9745 + return ns;
9746 +@@ -626,7 +626,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
9747 + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
9748 + {
9749 + struct task_cputime cputime = {
9750 +- .sum_exec_runtime = p->se.sum_exec_runtime,
9751 ++ .sum_exec_runtime = tsk_seruntime(p),
9752 + };
9753 +
9754 + if (task_cputime(p, &cputime.utime, &cputime.stime))
9755 +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
9756 +index 1637b65ba07a..033c6deeb515 100644
9757 +--- a/kernel/sched/debug.c
9758 ++++ b/kernel/sched/debug.c
9759 +@@ -7,6 +7,7 @@
9760 + * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
9761 + */
9762 +
9763 ++#ifndef CONFIG_SCHED_ALT
9764 + /*
9765 + * This allows printing both to /proc/sched_debug and
9766 + * to the console
9767 +@@ -215,6 +216,7 @@ static const struct file_operations sched_scaling_fops = {
9768 + };
9769 +
9770 + #endif /* SMP */
9771 ++#endif /* !CONFIG_SCHED_ALT */
9772 +
9773 + #ifdef CONFIG_PREEMPT_DYNAMIC
9774 +
9775 +@@ -278,6 +280,7 @@ static const struct file_operations sched_dynamic_fops = {
9776 +
9777 + #endif /* CONFIG_PREEMPT_DYNAMIC */
9778 +
9779 ++#ifndef CONFIG_SCHED_ALT
9780 + __read_mostly bool sched_debug_verbose;
9781 +
9782 + static const struct seq_operations sched_debug_sops;
9783 +@@ -293,6 +296,7 @@ static const struct file_operations sched_debug_fops = {
9784 + .llseek = seq_lseek,
9785 + .release = seq_release,
9786 + };
9787 ++#endif /* !CONFIG_SCHED_ALT */
9788 +
9789 + static struct dentry *debugfs_sched;
9790 +
9791 +@@ -302,12 +306,15 @@ static __init int sched_init_debug(void)
9792 +
9793 + debugfs_sched = debugfs_create_dir("sched", NULL);
9794 +
9795 ++#ifndef CONFIG_SCHED_ALT
9796 + debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
9797 + debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose);
9798 ++#endif /* !CONFIG_SCHED_ALT */
9799 + #ifdef CONFIG_PREEMPT_DYNAMIC
9800 + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
9801 + #endif
9802 +
9803 ++#ifndef CONFIG_SCHED_ALT
9804 + debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
9805 + debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
9806 + debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
9807 +@@ -337,11 +344,13 @@ static __init int sched_init_debug(void)
9808 + #endif
9809 +
9810 + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
9811 ++#endif /* !CONFIG_SCHED_ALT */
9812 +
9813 + return 0;
9814 + }
9815 + late_initcall(sched_init_debug);
9816 +
9817 ++#ifndef CONFIG_SCHED_ALT
9818 + #ifdef CONFIG_SMP
9819 +
9820 + static cpumask_var_t sd_sysctl_cpus;
9821 +@@ -1068,6 +1077,7 @@ void proc_sched_set_task(struct task_struct *p)
9822 + memset(&p->stats, 0, sizeof(p->stats));
9823 + #endif
9824 + }
9825 ++#endif /* !CONFIG_SCHED_ALT */
9826 +
9827 + void resched_latency_warn(int cpu, u64 latency)
9828 + {
9829 +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
9830 +index f26ab2675f7d..480d4ad16d45 100644
9831 +--- a/kernel/sched/idle.c
9832 ++++ b/kernel/sched/idle.c
9833 +@@ -400,6 +400,7 @@ void cpu_startup_entry(enum cpuhp_state state)
9834 + do_idle();
9835 + }
9836 +
9837 ++#ifndef CONFIG_SCHED_ALT
9838 + /*
9839 + * idle-task scheduling class.
9840 + */
9841 +@@ -521,3 +522,4 @@ DEFINE_SCHED_CLASS(idle) = {
9842 + .switched_to = switched_to_idle,
9843 + .update_curr = update_curr_idle,
9844 + };
9845 ++#endif
9846 +diff --git a/kernel/sched/pds.h b/kernel/sched/pds.h
9847 +new file mode 100644
9848 +index 000000000000..56a649d02e49
9849 +--- /dev/null
9850 ++++ b/kernel/sched/pds.h
9851 +@@ -0,0 +1,127 @@
9852 ++#define ALT_SCHED_VERSION_MSG "sched/pds: PDS CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n"
9853 ++
9854 ++static int sched_timeslice_shift = 22;
9855 ++
9856 ++#define NORMAL_PRIO_MOD(x) ((x) & (NORMAL_PRIO_NUM - 1))
9857 ++
9858 ++/*
9859 ++ * Common interfaces
9860 ++ */
9861 ++static inline void sched_timeslice_imp(const int timeslice_ms)
9862 ++{
9863 ++ if (2 == timeslice_ms)
9864 ++ sched_timeslice_shift = 21;
9865 ++}
9866 ++
9867 ++static inline int
9868 ++task_sched_prio_normal(const struct task_struct *p, const struct rq *rq)
9869 ++{
9870 ++ s64 delta = p->deadline - rq->time_edge + NORMAL_PRIO_NUM - NICE_WIDTH;
9871 ++
9872 ++ if (WARN_ONCE(delta > NORMAL_PRIO_NUM - 1,
9873 ++ "pds: task_sched_prio_normal() delta %lld\n", delta))
9874 ++ return NORMAL_PRIO_NUM - 1;
9875 ++
9876 ++ return (delta < 0) ? 0 : delta;
9877 ++}
9878 ++
9879 ++static inline int task_sched_prio(const struct task_struct *p)
9880 ++{
9881 ++ return (p->prio < MAX_RT_PRIO) ? p->prio :
9882 ++ MIN_NORMAL_PRIO + task_sched_prio_normal(p, task_rq(p));
9883 ++}
9884 ++
9885 ++static inline int
9886 ++task_sched_prio_idx(const struct task_struct *p, const struct rq *rq)
9887 ++{
9888 ++ return (p->prio < MAX_RT_PRIO) ? p->prio : MIN_NORMAL_PRIO +
9889 ++ NORMAL_PRIO_MOD(task_sched_prio_normal(p, rq) + rq->time_edge);
9890 ++}
9891 ++
9892 ++static inline int sched_prio2idx(int prio, struct rq *rq)
9893 ++{
9894 ++ return (IDLE_TASK_SCHED_PRIO == prio || prio < MAX_RT_PRIO) ? prio :
9895 ++ MIN_NORMAL_PRIO + NORMAL_PRIO_MOD((prio - MIN_NORMAL_PRIO) +
9896 ++ rq->time_edge);
9897 ++}
9898 ++
9899 ++static inline int sched_idx2prio(int idx, struct rq *rq)
9900 ++{
9901 ++ return (idx < MAX_RT_PRIO) ? idx : MIN_NORMAL_PRIO +
9902 ++ NORMAL_PRIO_MOD((idx - MIN_NORMAL_PRIO) + NORMAL_PRIO_NUM -
9903 ++ NORMAL_PRIO_MOD(rq->time_edge));
9904 ++}
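
sched_prio2idx() and sched_idx2prio() above rotate the normal-priority range by
rq->time_edge so the queue array behaves as a ring instead of being reshuffled
as time advances, and the two mappings are inverses for any time_edge. A small
standalone round-trip check; MIN_NORMAL_PRIO = 128 and NORMAL_PRIO_NUM = 64 are
assumed from the SCHED_BITS layout earlier in this patch, and the
IDLE_TASK_SCHED_PRIO special case is left out for brevity:

#include <stdio.h>

#define MAX_RT_PRIO             100
#define MIN_NORMAL_PRIO         128     /* assumed from the bit layout above */
#define NORMAL_PRIO_NUM         64      /* assumed: one 64-bit bitmap word */
#define NORMAL_PRIO_MOD(x)      ((x) & (NORMAL_PRIO_NUM - 1))

static int prio2idx(int prio, unsigned long long time_edge)
{
        return (prio < MAX_RT_PRIO) ? prio :
                MIN_NORMAL_PRIO +
                NORMAL_PRIO_MOD((prio - MIN_NORMAL_PRIO) + time_edge);
}

static int idx2prio(int idx, unsigned long long time_edge)
{
        return (idx < MAX_RT_PRIO) ? idx :
                MIN_NORMAL_PRIO +
                NORMAL_PRIO_MOD((idx - MIN_NORMAL_PRIO) + NORMAL_PRIO_NUM -
                                NORMAL_PRIO_MOD(time_edge));
}

int main(void)
{
        unsigned long long time_edge = 5;

        for (int prio = MIN_NORMAL_PRIO;
             prio < MIN_NORMAL_PRIO + NORMAL_PRIO_NUM; prio++) {
                int idx = prio2idx(prio, time_edge);

                if (idx2prio(idx, time_edge) != prio) {
                        printf("round trip broke at prio %d\n", prio);
                        return 1;
                }
        }
        printf("prio <-> idx round trip holds for time_edge=%llu\n", time_edge);
        return 0;
}
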
9905 ++
9906 ++static inline void sched_renew_deadline(struct task_struct *p, const struct rq *rq)
9907 ++{
9908 ++ if (p->prio >= MAX_RT_PRIO)
9909 ++ p->deadline = (rq->clock >> sched_timeslice_shift) +
9910 ++ p->static_prio - (MAX_PRIO - NICE_WIDTH);
9911 ++}
9912 ++
9913 ++int task_running_nice(struct task_struct *p)
9914 ++{
9915 ++ return (p->prio > DEFAULT_PRIO);
9916 ++}
9917 ++
9918 ++static inline void update_rq_time_edge(struct rq *rq)
9919 ++{
9920 ++ struct list_head head;
9921 ++ u64 old = rq->time_edge;
9922 ++ u64 now = rq->clock >> sched_timeslice_shift;
9923 ++ u64 prio, delta;
9924 ++
9925 ++ if (now == old)
9926 ++ return;
9927 ++
9928 ++ delta = min_t(u64, NORMAL_PRIO_NUM, now - old);
9929 ++ INIT_LIST_HEAD(&head);
9930 ++
9931 ++ for_each_set_bit(prio, &rq->queue.bitmap[2], delta)
9932 ++ list_splice_tail_init(rq->queue.heads + MIN_NORMAL_PRIO +
9933 ++ NORMAL_PRIO_MOD(prio + old), &head);
9934 ++
9935 ++ rq->queue.bitmap[2] = (NORMAL_PRIO_NUM == delta) ? 0UL :
9936 ++ rq->queue.bitmap[2] >> delta;
9937 ++ rq->time_edge = now;
9938 ++ if (!list_empty(&head)) {
9939 ++ u64 idx = MIN_NORMAL_PRIO + NORMAL_PRIO_MOD(now);
9940 ++ struct task_struct *p;
9941 ++
9942 ++ list_for_each_entry(p, &head, sq_node)
9943 ++ p->sq_idx = idx;
9944 ++
9945 ++ list_splice(&head, rq->queue.heads + idx);
9946 ++ rq->queue.bitmap[2] |= 1UL;
9947 ++ }
9948 ++}
9949 ++
9950 ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq)
9951 ++{
9952 ++ p->time_slice = sched_timeslice_ns;
9953 ++ sched_renew_deadline(p, rq);
9954 ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p))
9955 ++ requeue_task(p, rq, task_sched_prio_idx(p, rq));
9956 ++}
9957 ++
9958 ++static inline void sched_task_sanity_check(struct task_struct *p, struct rq *rq)
9959 ++{
9960 ++ u64 max_dl = rq->time_edge + NICE_WIDTH - 1;
9961 ++ if (unlikely(p->deadline > max_dl))
9962 ++ p->deadline = max_dl;
9963 ++}
9964 ++
9965 ++static void sched_task_fork(struct task_struct *p, struct rq *rq)
9966 ++{
9967 ++ sched_renew_deadline(p, rq);
9968 ++}
9969 ++
9970 ++static inline void do_sched_yield_type_1(struct task_struct *p, struct rq *rq)
9971 ++{
9972 ++ time_slice_expired(p, rq);
9973 ++}
9974 ++
9975 ++#ifdef CONFIG_SMP
9976 ++static inline void sched_task_ttwu(struct task_struct *p) {}
9977 ++#endif
9978 ++static inline void sched_task_deactivate(struct task_struct *p, struct rq *rq) {}
9979 +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
9980 +index 0f310768260c..bd38bf738fe9 100644
9981 +--- a/kernel/sched/pelt.c
9982 ++++ b/kernel/sched/pelt.c
9983 +@@ -266,6 +266,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
9984 + WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
9985 + }
9986 +
9987 ++#ifndef CONFIG_SCHED_ALT
9988 + /*
9989 + * sched_entity:
9990 + *
9991 +@@ -383,8 +384,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
9992 +
9993 + return 0;
9994 + }
9995 ++#endif
9996 +
9997 +-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
9998 ++#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT)
9999 + /*
10000 + * thermal:
10001 + *
10002 +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
10003 +index 3a0e0dc28721..e8a7d84aa5a5 100644
10004 +--- a/kernel/sched/pelt.h
10005 ++++ b/kernel/sched/pelt.h
10006 +@@ -1,13 +1,15 @@
10007 + #ifdef CONFIG_SMP
10008 + #include "sched-pelt.h"
10009 +
10010 ++#ifndef CONFIG_SCHED_ALT
10011 + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
10012 + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
10013 + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
10014 + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
10015 + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
10016 ++#endif
10017 +
10018 +-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
10019 ++#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT)
10020 + int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
10021 +
10022 + static inline u64 thermal_load_avg(struct rq *rq)
10023 +@@ -44,6 +46,7 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
10024 + return PELT_MIN_DIVIDER + avg->period_contrib;
10025 + }
10026 +
10027 ++#ifndef CONFIG_SCHED_ALT
10028 + static inline void cfs_se_util_change(struct sched_avg *avg)
10029 + {
10030 + unsigned int enqueued;
10031 +@@ -180,9 +183,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
10032 + return rq_clock_pelt(rq_of(cfs_rq));
10033 + }
10034 + #endif
10035 ++#endif /* CONFIG_SCHED_ALT */
10036 +
10037 + #else
10038 +
10039 ++#ifndef CONFIG_SCHED_ALT
10040 + static inline int
10041 + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
10042 + {
10043 +@@ -200,6 +205,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
10044 + {
10045 + return 0;
10046 + }
10047 ++#endif
10048 +
10049 + static inline int
10050 + update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
10051 +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
10052 +index 771f8ddb7053..787a5069d69a 100644
10053 +--- a/kernel/sched/sched.h
10054 ++++ b/kernel/sched/sched.h
10055 +@@ -5,6 +5,10 @@
10056 + #ifndef _KERNEL_SCHED_SCHED_H
10057 + #define _KERNEL_SCHED_SCHED_H
10058 +
10059 ++#ifdef CONFIG_SCHED_ALT
10060 ++#include "alt_sched.h"
10061 ++#else
10062 ++
10063 + #include <linux/sched/affinity.h>
10064 + #include <linux/sched/autogroup.h>
10065 + #include <linux/sched/cpufreq.h>
10066 +@@ -3261,4 +3265,9 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
10067 + cgroup_account_cputime(curr, delta_exec);
10068 + }
10069 +
10070 ++static inline int task_running_nice(struct task_struct *p)
10071 ++{
10072 ++ return (task_nice(p) > 0);
10073 ++}
10074 ++#endif /* !CONFIG_SCHED_ALT */
10075 + #endif /* _KERNEL_SCHED_SCHED_H */
10076 +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
10077 +index 857f837f52cb..5486c63e4790 100644
10078 +--- a/kernel/sched/stats.c
10079 ++++ b/kernel/sched/stats.c
10080 +@@ -125,8 +125,10 @@ static int show_schedstat(struct seq_file *seq, void *v)
10081 + } else {
10082 + struct rq *rq;
10083 + #ifdef CONFIG_SMP
10084 ++#ifndef CONFIG_SCHED_ALT
10085 + struct sched_domain *sd;
10086 + int dcount = 0;
10087 ++#endif
10088 + #endif
10089 + cpu = (unsigned long)(v - 2);
10090 + rq = cpu_rq(cpu);
10091 +@@ -143,6 +145,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
10092 + seq_printf(seq, "\n");
10093 +
10094 + #ifdef CONFIG_SMP
10095 ++#ifndef CONFIG_SCHED_ALT
10096 + /* domain-specific stats */
10097 + rcu_read_lock();
10098 + for_each_domain(cpu, sd) {
10099 +@@ -171,6 +174,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
10100 + sd->ttwu_move_balance);
10101 + }
10102 + rcu_read_unlock();
10103 ++#endif
10104 + #endif
10105 + }
10106 + return 0;
10107 +diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
10108 +index 38f3698f5e5b..b9d597394316 100644
10109 +--- a/kernel/sched/stats.h
10110 ++++ b/kernel/sched/stats.h
10111 +@@ -89,6 +89,7 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt
10112 +
10113 + #endif /* CONFIG_SCHEDSTATS */
10114 +
10115 ++#ifndef CONFIG_SCHED_ALT
10116 + #ifdef CONFIG_FAIR_GROUP_SCHED
10117 + struct sched_entity_stats {
10118 + struct sched_entity se;
10119 +@@ -105,6 +106,7 @@ __schedstats_from_se(struct sched_entity *se)
10120 + #endif
10121 + return &task_of(se)->stats;
10122 + }
10123 ++#endif /* CONFIG_SCHED_ALT */
10124 +
10125 + #ifdef CONFIG_PSI
10126 + void psi_task_change(struct task_struct *task, int clear, int set);
10127 +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
10128 +index 8739c2a5a54e..d8dd6c15eb47 100644
10129 +--- a/kernel/sched/topology.c
10130 ++++ b/kernel/sched/topology.c
10131 +@@ -3,6 +3,7 @@
10132 + * Scheduler topology setup/handling methods
10133 + */
10134 +
10135 ++#ifndef CONFIG_SCHED_ALT
10136 + DEFINE_MUTEX(sched_domains_mutex);
10137 +
10138 + /* Protected by sched_domains_mutex: */
10139 +@@ -1413,8 +1414,10 @@ static void asym_cpu_capacity_scan(void)
10140 + */
10141 +
10142 + static int default_relax_domain_level = -1;
10143 ++#endif /* CONFIG_SCHED_ALT */
10144 + int sched_domain_level_max;
10145 +
10146 ++#ifndef CONFIG_SCHED_ALT
10147 + static int __init setup_relax_domain_level(char *str)
10148 + {
10149 + if (kstrtoint(str, 0, &default_relax_domain_level))
10150 +@@ -1647,6 +1650,7 @@ sd_init(struct sched_domain_topology_level *tl,
10151 +
10152 + return sd;
10153 + }
10154 ++#endif /* CONFIG_SCHED_ALT */
10155 +
10156 + /*
10157 + * Topology list, bottom-up.
10158 +@@ -1683,6 +1687,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl)
10159 + sched_domain_topology_saved = NULL;
10160 + }
10161 +
10162 ++#ifndef CONFIG_SCHED_ALT
10163 + #ifdef CONFIG_NUMA
10164 +
10165 + static const struct cpumask *sd_numa_mask(int cpu)
10166 +@@ -2645,3 +2650,15 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
10167 + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
10168 + mutex_unlock(&sched_domains_mutex);
10169 + }
10170 ++#else /* CONFIG_SCHED_ALT */
10171 ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
10172 ++ struct sched_domain_attr *dattr_new)
10173 ++{}
10174 ++
10175 ++#ifdef CONFIG_NUMA
10176 ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
10177 ++{
10178 ++ return best_mask_cpu(cpu, cpus);
10179 ++}
10180 ++#endif /* CONFIG_NUMA */
10181 ++#endif
10182 +diff --git a/kernel/sysctl.c b/kernel/sysctl.c
10183 +index 137d4abe3eda..6bada3a6d571 100644
10184 +--- a/kernel/sysctl.c
10185 ++++ b/kernel/sysctl.c
10186 +@@ -93,6 +93,10 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals);
10187 +
10188 + /* Constants used for minimum and maximum */
10189 +
10190 ++#ifdef CONFIG_SCHED_ALT
10191 ++extern int sched_yield_type;
10192 ++#endif
10193 ++
10194 + #ifdef CONFIG_PERF_EVENTS
10195 + static const int six_hundred_forty_kb = 640 * 1024;
10196 + #endif
10197 +@@ -1934,6 +1938,17 @@ static struct ctl_table kern_table[] = {
10198 + .proc_handler = proc_dointvec,
10199 + },
10200 + #endif
10201 ++#ifdef CONFIG_SCHED_ALT
10202 ++ {
10203 ++ .procname = "yield_type",
10204 ++ .data = &sched_yield_type,
10205 ++ .maxlen = sizeof (int),
10206 ++ .mode = 0644,
10207 ++ .proc_handler = &proc_dointvec_minmax,
10208 ++ .extra1 = SYSCTL_ZERO,
10209 ++ .extra2 = SYSCTL_TWO,
10210 ++ },
10211 ++#endif
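
The entry above registers the knob as an integer sysctl clamped to the range
0..2, so with CONFIG_SCHED_ALT enabled it shows up as
/proc/sys/kernel/yield_type. A minimal userspace sketch for flipping it (needs
root; error handling kept to the essentials):

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
        const char *path = "/proc/sys/kernel/yield_type";
        int value = (argc > 1) ? atoi(argv[1]) : 1;
        FILE *f;

        if (value < 0 || value > 2) {   /* mirrors SYSCTL_ZERO..SYSCTL_TWO */
                fprintf(stderr, "yield_type must be 0, 1 or 2\n");
                return EXIT_FAILURE;
        }

        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return EXIT_FAILURE;
        }
        fprintf(f, "%d\n", value);
        fclose(f);
        return EXIT_SUCCESS;
}
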
10212 + #if defined(CONFIG_S390) && defined(CONFIG_SMP)
10213 + {
10214 + .procname = "spin_retry",
10215 +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
10216 +index 3ae661ab6260..35f0176dcdb0 100644
10217 +--- a/kernel/time/hrtimer.c
10218 ++++ b/kernel/time/hrtimer.c
10219 +@@ -2088,8 +2088,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
10220 + int ret = 0;
10221 + u64 slack;
10222 +
10223 ++#ifndef CONFIG_SCHED_ALT
10224 + slack = current->timer_slack_ns;
10225 + if (dl_task(current) || rt_task(current))
10226 ++#endif
10227 + slack = 0;
10228 +
10229 + hrtimer_init_sleeper_on_stack(&t, clockid, mode);
10230 +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
10231 +index cb925e8ef9a8..67d823510f5c 100644
10232 +--- a/kernel/time/posix-cpu-timers.c
10233 ++++ b/kernel/time/posix-cpu-timers.c
10234 +@@ -223,7 +223,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples)
10235 + u64 stime, utime;
10236 +
10237 + task_cputime(p, &utime, &stime);
10238 +- store_samples(samples, stime, utime, p->se.sum_exec_runtime);
10239 ++ store_samples(samples, stime, utime, tsk_seruntime(p));
10240 + }
10241 +
10242 + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
10243 +@@ -866,6 +866,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples,
10244 + }
10245 + }
10246 +
10247 ++#ifndef CONFIG_SCHED_ALT
10248 + static inline void check_dl_overrun(struct task_struct *tsk)
10249 + {
10250 + if (tsk->dl.dl_overrun) {
10251 +@@ -873,6 +874,7 @@ static inline void check_dl_overrun(struct task_struct *tsk)
10252 + send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
10253 + }
10254 + }
10255 ++#endif
10256 +
10257 + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
10258 + {
10259 +@@ -900,8 +902,10 @@ static void check_thread_timers(struct task_struct *tsk,
10260 + u64 samples[CPUCLOCK_MAX];
10261 + unsigned long soft;
10262 +
10263 ++#ifndef CONFIG_SCHED_ALT
10264 + if (dl_task(tsk))
10265 + check_dl_overrun(tsk);
10266 ++#endif
10267 +
10268 + if (expiry_cache_is_inactive(pct))
10269 + return;
10270 +@@ -915,7 +919,7 @@ static void check_thread_timers(struct task_struct *tsk,
10271 + soft = task_rlimit(tsk, RLIMIT_RTTIME);
10272 + if (soft != RLIM_INFINITY) {
10273 + /* Task RT timeout is accounted in jiffies. RTTIME is usec */
10274 +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);
10275 ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ);
10276 + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);
10277 +
10278 + /* At the hard limit, send SIGKILL. No further action. */
10279 +@@ -1151,8 +1155,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
10280 + return true;
10281 + }
10282 +
10283 ++#ifndef CONFIG_SCHED_ALT
10284 + if (dl_task(tsk) && tsk->dl.dl_overrun)
10285 + return true;
10286 ++#endif
10287 +
10288 + return false;
10289 + }
10290 +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
10291 +index ff0536cea968..ce266990006d 100644
10292 +--- a/kernel/trace/trace_selftest.c
10293 ++++ b/kernel/trace/trace_selftest.c
10294 +@@ -1150,10 +1150,15 @@ static int trace_wakeup_test_thread(void *data)
10295 + {
10296 + /* Make this a -deadline thread */
10297 + static const struct sched_attr attr = {
10298 ++#ifdef CONFIG_SCHED_ALT
10299 ++ /* No deadline on BMQ/PDS, use RR */
10300 ++ .sched_policy = SCHED_RR,
10301 ++#else
10302 + .sched_policy = SCHED_DEADLINE,
10303 + .sched_runtime = 100000ULL,
10304 + .sched_deadline = 10000000ULL,
10305 + .sched_period = 10000000ULL
10306 ++#endif
10307 + };
10308 + struct wakeup_test_data *x = data;
10309 +
10310
10311 diff --git a/5021_BMQ-and-PDS-gentoo-defaults.patch b/5021_BMQ-and-PDS-gentoo-defaults.patch
10312 new file mode 100644
10313 index 00000000..6dc48eec
10314 --- /dev/null
10315 +++ b/5021_BMQ-and-PDS-gentoo-defaults.patch
10316 @@ -0,0 +1,13 @@
10317 +--- a/init/Kconfig 2023-02-13 08:16:09.534315265 -0500
10318 ++++ b/init/Kconfig 2023-02-13 08:17:24.130237204 -0500
10319 +@@ -867,8 +867,9 @@ config UCLAMP_BUCKETS_COUNT
10320 + If in doubt, use the default value.
10321 +
10322 + menuconfig SCHED_ALT
10323 ++ depends on X86_64
10324 + bool "Alternative CPU Schedulers"
10325 +- default y
10326 ++ default n
10327 + help
10328 + This feature enable alternative CPU scheduler"
10329 +