Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:5.15 commit in: /
Date: Sun, 21 Nov 2021 20:56:25
Message-Id: 1637528166.3164db0223bf8d829b763411f45449277097f8d8.mpagano@gentoo
1 commit: 3164db0223bf8d829b763411f45449277097f8d8
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Sun Nov 21 20:56:06 2021 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Sun Nov 21 20:56:06 2021 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=3164db02
7
8 Remove old BMQ Patch
9
10 Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>
11
12 5020_BMQ-and-PDS-io-scheduler-v5.15-r0.patch | 9798 --------------------------
13 1 file changed, 9798 deletions(-)
14
15 diff --git a/5020_BMQ-and-PDS-io-scheduler-v5.15-r0.patch b/5020_BMQ-and-PDS-io-scheduler-v5.15-r0.patch
16 deleted file mode 100644
17 index c22cf656..00000000
18 --- a/5020_BMQ-and-PDS-io-scheduler-v5.15-r0.patch
19 +++ /dev/null
20 @@ -1,9798 +0,0 @@
21 -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
22 -index 43dc35fe5bc0..0873e92ca5d0 100644
23 ---- a/Documentation/admin-guide/kernel-parameters.txt
24 -+++ b/Documentation/admin-guide/kernel-parameters.txt
25 -@@ -4985,6 +4985,12 @@
26 - sa1100ir [NET]
27 - See drivers/net/irda/sa1100_ir.c.
28 -
29 -+ sched_timeslice=
30 -+ [KNL] Time slice in ms for Project C BMQ/PDS scheduler.
31 -+ Format: integer 2, 4
32 -+ Default: 4
33 -+ See Documentation/scheduler/sched-BMQ.txt
34 -+
35 - sched_verbose [KNL] Enables verbose scheduler debug messages.
36 -
37 - schedstats= [KNL,X86] Enable or disable scheduled statistics.
38 -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
39 -index 426162009ce9..15ac2d7e47cd 100644
40 ---- a/Documentation/admin-guide/sysctl/kernel.rst
41 -+++ b/Documentation/admin-guide/sysctl/kernel.rst
42 -@@ -1542,3 +1542,13 @@ is 10 seconds.
43 -
44 - The softlockup threshold is (``2 * watchdog_thresh``). Setting this
45 - tunable to zero will disable lockup detection altogether.
46 -+
47 -+yield_type:
48 -+===========
49 -+
50 -+BMQ/PDS CPU scheduler only. This determines the type of yield that calls
51 -+to sched_yield() will perform.
52 -+
53 -+ 0 - No yield.
54 -+ 1 - Deboost and requeue task. (default)
55 -+ 2 - Set run queue skip task.
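
For reference, a minimal userspace sketch of flipping this tunable at run time; it assumes the BMQ/PDS patch is applied so /proc/sys/kernel/yield_type exists (equivalent to `sysctl -w kernel.yield_type=1`):

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/kernel/yield_type", "w");

        if (!f) {
            perror("yield_type sysctl not available");
            return 1;
        }
        fputs("1\n", f);    /* 1 = deboost and requeue on sched_yield() (default) */
        return fclose(f) ? 1 : 0;
    }
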
56 -diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt
57 -new file mode 100644
58 -index 000000000000..05c84eec0f31
59 ---- /dev/null
60 -+++ b/Documentation/scheduler/sched-BMQ.txt
61 -@@ -0,0 +1,110 @@
62 -+ BitMap queue CPU Scheduler
63 -+ --------------------------
64 -+
65 -+CONTENT
66 -+========
67 -+
68 -+ Background
69 -+ Design
70 -+ Overview
71 -+ Task policy
72 -+ Priority management
73 -+ BitMap Queue
74 -+ CPU Assignment and Migration
75 -+
76 -+
77 -+Background
78 -+==========
79 -+
80 -+The BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution
81 -+of the previous Priority and Deadline based Skiplist multiple queue scheduler (PDS)
82 -+and is inspired by the Zircon scheduler. Its goal is to keep the scheduler code
83 -+simple while remaining efficient and scalable for interactive tasks such as
84 -+desktop use, movie playback and gaming.
85 -+
86 -+Design
87 -+======
88 -+
89 -+Overview
90 -+--------
91 -+
92 -+BMQ uses a per-CPU run queue design: each (logical) CPU has its own run queue
93 -+and each CPU is responsible for scheduling the tasks that are put into its
94 -+run queue.
95 -+
96 -+The run queue is a set of priority queues. In terms of data structure these
97 -+queues are FIFO queues for non-rt tasks and priority queues for rt tasks; see
98 -+BitMap Queue below for details. BMQ is optimized for non-rt tasks, since most
99 -+applications are non-rt tasks. Whether a queue is FIFO or priority based, each
100 -+queue is an ordered list of runnable tasks awaiting execution and the data
101 -+structures are the same. When it is time for a new task to run, the scheduler
102 -+simply looks up the lowest numbered queue that contains a task and runs the
103 -+first task from the head of that queue. The per-CPU idle task is also kept in
104 -+the run queue, so the scheduler can always find a task to run from its own
105 -+run queue.
106 -+
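
A minimal userspace sketch of the "lowest numbered non-empty queue wins" pick described above, not the kernel code itself: one FIFO list per priority level plus a bitmap of non-empty levels. All names (struct queue, enqueue, pick_next) are made up for illustration, and __builtin_ctzll is a GCC/Clang builtin.

    #include <stdio.h>

    #define NR_LEVELS 64                     /* fits in one 64-bit bitmap word */

    struct task { int id; struct task *next; };

    struct queue {
            unsigned long long bitmap;       /* bit i set <=> level i is non-empty */
            struct task *head[NR_LEVELS];
            struct task *tail[NR_LEVELS];
    };

    static void enqueue(struct queue *q, struct task *t, int level)
    {
            t->next = NULL;
            if (q->head[level])
                    q->tail[level]->next = t;
            else
                    q->head[level] = t;
            q->tail[level] = t;
            q->bitmap |= 1ULL << level;
    }

    static struct task *pick_next(struct queue *q)
    {
            int level;
            struct task *t;

            if (!q->bitmap)
                    return NULL;             /* in the kernel the idle task prevents this */
            level = __builtin_ctzll(q->bitmap);      /* lowest numbered non-empty level */
            t = q->head[level];
            q->head[level] = t->next;
            if (!q->head[level]) {
                    q->tail[level] = NULL;
                    q->bitmap &= ~(1ULL << level);
            }
            return t;
    }

    int main(void)
    {
            struct queue q = { 0 };
            struct task a = { .id = 1 }, b = { .id = 2 };

            enqueue(&q, &a, 20);             /* e.g. a nice 0 NORMAL task */
            enqueue(&q, &b, 5);              /* a higher-priority task (lower level) */
            printf("first pick: task %d\n", pick_next(&q)->id);      /* task 2 */
            printf("second pick: task %d\n", pick_next(&q)->id);     /* task 1 */
            return 0;
    }
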
107 -+Each task is assigned the same timeslice (default 4 ms) when it is picked to
108 -+start running. A task is reinserted at the end of the appropriate priority
109 -+queue when it uses up its whole timeslice. When the scheduler selects a new task
110 -+from the priority queue it sets the CPU's preemption timer for the remainder of
111 -+the previous timeslice. When that timer fires the scheduler stops execution
112 -+of that task, selects another task and starts over again.
113 -+
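
An illustrative userspace model of the slice bookkeeping described above, simplified to tick-based charging rather than a preemption timer; charge_tick and the task layout are made up for the sketch.

    #include <stdio.h>

    #define SLICE_NS (4ULL << 20)           /* ~4 ms, matching the patch default */

    struct task { long long time_slice; };

    /* returns 1 when the task has to be requeued at the tail of its level */
    static int charge_tick(struct task *t, long long delta_ns)
    {
            t->time_slice -= delta_ns;
            if (t->time_slice <= 0) {
                    t->time_slice = SLICE_NS;       /* refill for the next round */
                    return 1;
            }
            return 0;
    }

    int main(void)
    {
            struct task t = { .time_slice = SLICE_NS };
            long long tick_ns = 1000000;    /* pretend 1 ms scheduler ticks */
            int ticks = 0;

            do {
                    ticks++;
            } while (!charge_tick(&t, tick_ns));
            printf("slice exhausted after %d ticks\n", ticks);   /* 5: 4<<20 ns > 4 ms */
            return 0;
    }
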
114 -+If a task blocks waiting for a shared resource then it's taken out of its
115 -+priority queue and is placed in a wait queue for the shared resource. When it
116 -+is unblocked it will be reinserted in the appropriate priority queue of an
117 -+eligible CPU.
118 -+
119 -+Task policy
120 -+-----------
121 -+
122 -+BMQ supports the DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policies like
123 -+the mainline CFS scheduler, but BMQ is heavily optimized for non-rt tasks, that
124 -+is, NORMAL/BATCH/IDLE policy tasks. Below are the implementation details of
125 -+each policy.
126 -+
127 -+DEADLINE
128 -+ It is squashed into a priority 0 FIFO task.
129 -+
130 -+FIFO/RR
131 -+ All RT tasks share one single priority queue in the BMQ run queue design. The
132 -+complexity of the insert operation is O(n). BMQ is not designed for systems that
133 -+run mainly rt policy tasks.
134 -+
135 -+NORMAL/BATCH/IDLE
136 -+ BATCH and IDLE tasks are treated as the same policy. They compete for CPU with
137 -+NORMAL policy tasks, but they just don't boost. To control the priority of
138 -+NORMAL/BATCH/IDLE tasks, simply use the nice level.
139 -+
140 -+ISO
141 -+ The ISO policy is not supported in BMQ. Please use a nice level -20 NORMAL
142 -+policy task instead.
143 -+
144 -+Priority management
145 -+-------------------
146 -+
147 -+RT tasks have priorities from 0-99. For non-rt tasks, there are three different
148 -+factors used to determine the effective priority of a task. The effective
149 -+priority is what is used to determine which queue the task will be in.
150 -+
151 -+The first factor is simply the task's static priority, which is assigned from
152 -+the task's nice level: within [-20, 19] from userland's point of view and
153 -+[0, 39] internally.
154 -+
155 -+The second factor is the priority boost. This is a value bounded between
156 -+[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] that is used to offset the base priority;
157 -+it is modified in the following cases:
158 -+
159 -+*When a thread has used up its entire timeslice, always deboost it by
160 -+increasing its boost value by one.
161 -+*When a thread gives up cpu control (voluntarily or involuntarily) to
162 -+reschedule, and its switch-in time (time since it was last switched in and run)
163 -+is below the threshold based on its priority boost, boost it by decreasing its
164 -+boost value by one, capped at 0 (it won't go negative).
165 -+
166 -+The intent in this system is to ensure that interactive threads are serviced
167 -+quickly. These are usually the threads that interact directly with the user
168 -+and cause user-perceivable latency. These threads usually do little work and
169 -+spend most of their time blocked awaiting another user event. So they get the
170 -+priority boost from unblocking while background threads that do most of the
171 -+processing receive the priority penalty for using their entire timeslice.
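
A hedged sketch of the effective-priority arithmetic described above, assuming the BMQ value MAX_PRIORITY_ADJ == 7 from this patch; effective_prio is a hypothetical helper, not a kernel function, and the rt range is ignored for brevity.

    #include <stdio.h>

    #define MAX_PRIORITY_ADJ 7              /* BMQ's boost range in this patch */

    /* hypothetical helper, not the kernel's code */
    static int effective_prio(int nice, int boost)
    {
            int static_prio = nice + 20;    /* nice [-20, 19] -> [0, 39] internally */

            if (boost > MAX_PRIORITY_ADJ)   /* keep boost inside the documented range */
                    boost = MAX_PRIORITY_ADJ;
            if (boost < -MAX_PRIORITY_ADJ)
                    boost = -MAX_PRIORITY_ADJ;

            return static_prio + boost;     /* lower value = earlier queue */
    }

    int main(void)
    {
            printf("%d\n", effective_prio(0, 0));    /* 20: nice 0, no boost */
            printf("%d\n", effective_prio(0, 1));    /* 21: deboosted once */
            printf("%d\n", effective_prio(19, 99));  /* 46: boost clamped to +7 */
            return 0;
    }
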
172 -diff --git a/fs/proc/base.c b/fs/proc/base.c
173 -index 533d5836eb9a..5756c51c9b58 100644
174 ---- a/fs/proc/base.c
175 -+++ b/fs/proc/base.c
176 -@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
177 - seq_puts(m, "0 0 0\n");
178 - else
179 - seq_printf(m, "%llu %llu %lu\n",
180 -- (unsigned long long)task->se.sum_exec_runtime,
181 -+ (unsigned long long)tsk_seruntime(task),
182 - (unsigned long long)task->sched_info.run_delay,
183 - task->sched_info.pcount);
184 -
185 -diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h
186 -index 8874f681b056..59eb72bf7d5f 100644
187 ---- a/include/asm-generic/resource.h
188 -+++ b/include/asm-generic/resource.h
189 -@@ -23,7 +23,7 @@
190 - [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \
191 - [RLIMIT_SIGPENDING] = { 0, 0 }, \
192 - [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \
193 -- [RLIMIT_NICE] = { 0, 0 }, \
194 -+ [RLIMIT_NICE] = { 30, 30 }, \
195 - [RLIMIT_RTPRIO] = { 0, 0 }, \
196 - [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \
197 - }
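
The change above raises the default RLIMIT_NICE from 0 to 30. As a rough guide (based on the usual kernel rule that the lowest permitted nice value is 20 minus the rlimit; verify against can_nice() in the tree at hand), that lets unprivileged tasks lower nice to about -10:

    #include <stdio.h>

    int main(void)
    {
            int rlim_cur = 30;              /* the default this patch sets */
            printf("lowest nice allowed: %d\n", 20 - rlim_cur);     /* -10 */

            rlim_cur = 0;                   /* the mainline default */
            printf("lowest nice allowed: %d\n", 20 - rlim_cur);     /* 0, i.e. no lowering */
            return 0;
    }
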
198 -diff --git a/include/linux/sched.h b/include/linux/sched.h
199 -index c1a927ddec64..a7eb91d15442 100644
200 ---- a/include/linux/sched.h
201 -+++ b/include/linux/sched.h
202 -@@ -748,12 +748,18 @@ struct task_struct {
203 - unsigned int ptrace;
204 -
205 - #ifdef CONFIG_SMP
206 -- int on_cpu;
207 - struct __call_single_node wake_entry;
208 -+#endif
209 -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT)
210 -+ int on_cpu;
211 -+#endif
212 -+
213 -+#ifdef CONFIG_SMP
214 - #ifdef CONFIG_THREAD_INFO_IN_TASK
215 - /* Current CPU: */
216 - unsigned int cpu;
217 - #endif
218 -+#ifndef CONFIG_SCHED_ALT
219 - unsigned int wakee_flips;
220 - unsigned long wakee_flip_decay_ts;
221 - struct task_struct *last_wakee;
222 -@@ -767,6 +773,7 @@ struct task_struct {
223 - */
224 - int recent_used_cpu;
225 - int wake_cpu;
226 -+#endif /* !CONFIG_SCHED_ALT */
227 - #endif
228 - int on_rq;
229 -
230 -@@ -775,6 +782,20 @@ struct task_struct {
231 - int normal_prio;
232 - unsigned int rt_priority;
233 -
234 -+#ifdef CONFIG_SCHED_ALT
235 -+ u64 last_ran;
236 -+ s64 time_slice;
237 -+ int sq_idx;
238 -+ struct list_head sq_node;
239 -+#ifdef CONFIG_SCHED_BMQ
240 -+ int boost_prio;
241 -+#endif /* CONFIG_SCHED_BMQ */
242 -+#ifdef CONFIG_SCHED_PDS
243 -+ u64 deadline;
244 -+#endif /* CONFIG_SCHED_PDS */
245 -+ /* sched_clock time spent running */
246 -+ u64 sched_time;
247 -+#else /* !CONFIG_SCHED_ALT */
248 - const struct sched_class *sched_class;
249 - struct sched_entity se;
250 - struct sched_rt_entity rt;
251 -@@ -785,6 +806,7 @@ struct task_struct {
252 - unsigned long core_cookie;
253 - unsigned int core_occupation;
254 - #endif
255 -+#endif /* !CONFIG_SCHED_ALT */
256 -
257 - #ifdef CONFIG_CGROUP_SCHED
258 - struct task_group *sched_task_group;
259 -@@ -1505,6 +1527,15 @@ struct task_struct {
260 - */
261 - };
262 -
263 -+#ifdef CONFIG_SCHED_ALT
264 -+#define tsk_seruntime(t) ((t)->sched_time)
265 -+/* replace the uncertain rt_timeout with 0UL */
266 -+#define tsk_rttimeout(t) (0UL)
267 -+#else /* CFS */
268 -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime)
269 -+#define tsk_rttimeout(t) ((t)->rt.timeout)
270 -+#endif /* !CONFIG_SCHED_ALT */
271 -+
272 - static inline struct pid *task_pid(struct task_struct *task)
273 - {
274 - return task->thread_pid;
275 -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
276 -index 1aff00b65f3c..216fdf2fe90c 100644
277 ---- a/include/linux/sched/deadline.h
278 -+++ b/include/linux/sched/deadline.h
279 -@@ -1,5 +1,24 @@
280 - /* SPDX-License-Identifier: GPL-2.0 */
281 -
282 -+#ifdef CONFIG_SCHED_ALT
283 -+
284 -+static inline int dl_task(struct task_struct *p)
285 -+{
286 -+ return 0;
287 -+}
288 -+
289 -+#ifdef CONFIG_SCHED_BMQ
290 -+#define __tsk_deadline(p) (0UL)
291 -+#endif
292 -+
293 -+#ifdef CONFIG_SCHED_PDS
294 -+#define __tsk_deadline(p) ((((u64) ((p)->prio))<<56) | (p)->deadline)
295 -+#endif
296 -+
297 -+#else
298 -+
299 -+#define __tsk_deadline(p) ((p)->dl.deadline)
300 -+
301 - /*
302 - * SCHED_DEADLINE tasks has negative priorities, reflecting
303 - * the fact that any of them has higher prio than RT and
304 -@@ -19,6 +38,7 @@ static inline int dl_task(struct task_struct *p)
305 - {
306 - return dl_prio(p->prio);
307 - }
308 -+#endif /* CONFIG_SCHED_ALT */
309 -
310 - static inline bool dl_time_before(u64 a, u64 b)
311 - {
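
A small illustration of how the PDS __tsk_deadline() packing above behaves: because prio occupies the top 8 bits, a single u64 comparison orders waiters by prio first and by deadline only within the same prio. The values below are arbitrary and the helper name is made up.

    #include <stdio.h>
    #include <stdint.h>

    /* same shape as the PDS __tsk_deadline() macro above */
    static uint64_t pack(uint64_t prio, uint64_t deadline)
    {
            return (prio << 56) | deadline;
    }

    int main(void)
    {
            uint64_t a = pack(1, 1000);     /* higher prio value, earlier deadline */
            uint64_t b = pack(0, 9000);     /* lower prio value, later deadline */

            /* a single u64 compare: prio dominates, deadline only breaks ties */
            printf("%s\n", b < a ? "b runs first" : "a runs first");
            return 0;
    }
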
312 -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
313 -index ab83d85e1183..6af9ae681116 100644
314 ---- a/include/linux/sched/prio.h
315 -+++ b/include/linux/sched/prio.h
316 -@@ -18,6 +18,32 @@
317 - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
318 - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
319 -
320 -+#ifdef CONFIG_SCHED_ALT
321 -+
322 -+/* Undefine MAX_PRIO and DEFAULT_PRIO */
323 -+#undef MAX_PRIO
324 -+#undef DEFAULT_PRIO
325 -+
326 -+/* +/- priority levels from the base priority */
327 -+#ifdef CONFIG_SCHED_BMQ
328 -+#define MAX_PRIORITY_ADJ (7)
329 -+
330 -+#define MIN_NORMAL_PRIO (MAX_RT_PRIO)
331 -+#define MAX_PRIO (MIN_NORMAL_PRIO + NICE_WIDTH)
332 -+#define DEFAULT_PRIO (MIN_NORMAL_PRIO + NICE_WIDTH / 2)
333 -+#endif
334 -+
335 -+#ifdef CONFIG_SCHED_PDS
336 -+#define MAX_PRIORITY_ADJ (0)
337 -+
338 -+#define MIN_NORMAL_PRIO (128)
339 -+#define NORMAL_PRIO_NUM (64)
340 -+#define MAX_PRIO (MIN_NORMAL_PRIO + NORMAL_PRIO_NUM)
341 -+#define DEFAULT_PRIO (MAX_PRIO - NICE_WIDTH / 2)
342 -+#endif
343 -+
344 -+#endif /* CONFIG_SCHED_ALT */
345 -+
346 - /*
347 - * Convert user-nice values [ -20 ... 0 ... 19 ]
348 - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
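
Worked numbers for the ranges above, assuming the usual mainline constants MAX_RT_PRIO == 100 and NICE_WIDTH == 40 (verify against include/linux/sched/prio.h of the tree being patched before relying on these):

    #include <stdio.h>

    #define MAX_RT_PRIO 100                 /* assumed mainline value */
    #define NICE_WIDTH  40                  /* assumed mainline value */

    int main(void)
    {
            /* BMQ keeps the CFS-style layout */
            printf("BMQ: MAX_PRIO=%d DEFAULT_PRIO=%d\n",
                   MAX_RT_PRIO + NICE_WIDTH,                /* 140 */
                   MAX_RT_PRIO + NICE_WIDTH / 2);           /* 120 */

            /* PDS uses a 64-level normal range starting at 128 */
            printf("PDS: MAX_PRIO=%d DEFAULT_PRIO=%d\n",
                   128 + 64,                                /* 192 */
                   128 + 64 - NICE_WIDTH / 2);              /* 172 */
            return 0;
    }
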
349 -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
350 -index e5af028c08b4..0a7565d0d3cf 100644
351 ---- a/include/linux/sched/rt.h
352 -+++ b/include/linux/sched/rt.h
353 -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
354 -
355 - if (policy == SCHED_FIFO || policy == SCHED_RR)
356 - return true;
357 -+#ifndef CONFIG_SCHED_ALT
358 - if (policy == SCHED_DEADLINE)
359 - return true;
360 -+#endif
361 - return false;
362 - }
363 -
364 -diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
365 -index 8f0f778b7c91..991f2280475b 100644
366 ---- a/include/linux/sched/topology.h
367 -+++ b/include/linux/sched/topology.h
368 -@@ -225,7 +225,8 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
369 -
370 - #endif /* !CONFIG_SMP */
371 -
372 --#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
373 -+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) && \
374 -+ !defined(CONFIG_SCHED_ALT)
375 - extern void rebuild_sched_domains_energy(void);
376 - #else
377 - static inline void rebuild_sched_domains_energy(void)
378 -diff --git a/init/Kconfig b/init/Kconfig
379 -index 11f8a845f259..c8e82fcafb9e 100644
380 ---- a/init/Kconfig
381 -+++ b/init/Kconfig
382 -@@ -814,9 +814,39 @@ config GENERIC_SCHED_CLOCK
383 -
384 - menu "Scheduler features"
385 -
386 -+menuconfig SCHED_ALT
387 -+ bool "Alternative CPU Schedulers"
388 -+ default y
389 -+ help
390 -+	  This feature enables the alternative CPU schedulers.
391 -+
392 -+if SCHED_ALT
393 -+
394 -+choice
395 -+ prompt "Alternative CPU Scheduler"
396 -+ default SCHED_BMQ
397 -+
398 -+config SCHED_BMQ
399 -+ bool "BMQ CPU scheduler"
400 -+ help
401 -+ The BitMap Queue CPU scheduler for excellent interactivity and
402 -+ responsiveness on the desktop and solid scalability on normal
403 -+ hardware and commodity servers.
404 -+
405 -+config SCHED_PDS
406 -+ bool "PDS CPU scheduler"
407 -+ help
408 -+ The Priority and Deadline based Skip list multiple queue CPU
409 -+ Scheduler.
410 -+
411 -+endchoice
412 -+
413 -+endif
414 -+
415 - config UCLAMP_TASK
416 - bool "Enable utilization clamping for RT/FAIR tasks"
417 - depends on CPU_FREQ_GOV_SCHEDUTIL
418 -+ depends on !SCHED_ALT
419 - help
420 - This feature enables the scheduler to track the clamped utilization
421 - of each CPU based on RUNNABLE tasks scheduled on that CPU.
422 -@@ -902,6 +932,7 @@ config NUMA_BALANCING
423 - depends on ARCH_SUPPORTS_NUMA_BALANCING
424 - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
425 - depends on SMP && NUMA && MIGRATION
426 -+ depends on !SCHED_ALT
427 - help
428 - This option adds support for automatic NUMA aware memory/task placement.
429 - The mechanism is quite primitive and is based on migrating memory when
430 -@@ -994,6 +1025,7 @@ config FAIR_GROUP_SCHED
431 - depends on CGROUP_SCHED
432 - default CGROUP_SCHED
433 -
434 -+if !SCHED_ALT
435 - config CFS_BANDWIDTH
436 - bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
437 - depends on FAIR_GROUP_SCHED
438 -@@ -1016,6 +1048,7 @@ config RT_GROUP_SCHED
439 - realtime bandwidth for them.
440 - See Documentation/scheduler/sched-rt-group.rst for more information.
441 -
442 -+endif #!SCHED_ALT
443 - endif #CGROUP_SCHED
444 -
445 - config UCLAMP_TASK_GROUP
446 -@@ -1259,6 +1292,7 @@ config CHECKPOINT_RESTORE
447 -
448 - config SCHED_AUTOGROUP
449 - bool "Automatic process group scheduling"
450 -+ depends on !SCHED_ALT
451 - select CGROUPS
452 - select CGROUP_SCHED
453 - select FAIR_GROUP_SCHED
454 -diff --git a/init/init_task.c b/init/init_task.c
455 -index 2d024066e27b..49f706df0904 100644
456 ---- a/init/init_task.c
457 -+++ b/init/init_task.c
458 -@@ -75,9 +75,15 @@ struct task_struct init_task
459 - .stack = init_stack,
460 - .usage = REFCOUNT_INIT(2),
461 - .flags = PF_KTHREAD,
462 -+#ifdef CONFIG_SCHED_ALT
463 -+ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ,
464 -+ .static_prio = DEFAULT_PRIO,
465 -+ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ,
466 -+#else
467 - .prio = MAX_PRIO - 20,
468 - .static_prio = MAX_PRIO - 20,
469 - .normal_prio = MAX_PRIO - 20,
470 -+#endif
471 - .policy = SCHED_NORMAL,
472 - .cpus_ptr = &init_task.cpus_mask,
473 - .user_cpus_ptr = NULL,
474 -@@ -88,6 +94,17 @@ struct task_struct init_task
475 - .restart_block = {
476 - .fn = do_no_restart_syscall,
477 - },
478 -+#ifdef CONFIG_SCHED_ALT
479 -+ .sq_node = LIST_HEAD_INIT(init_task.sq_node),
480 -+#ifdef CONFIG_SCHED_BMQ
481 -+ .boost_prio = 0,
482 -+ .sq_idx = 15,
483 -+#endif
484 -+#ifdef CONFIG_SCHED_PDS
485 -+ .deadline = 0,
486 -+#endif
487 -+ .time_slice = HZ,
488 -+#else
489 - .se = {
490 - .group_node = LIST_HEAD_INIT(init_task.se.group_node),
491 - },
492 -@@ -95,6 +112,7 @@ struct task_struct init_task
493 - .run_list = LIST_HEAD_INIT(init_task.rt.run_list),
494 - .time_slice = RR_TIMESLICE,
495 - },
496 -+#endif
497 - .tasks = LIST_HEAD_INIT(init_task.tasks),
498 - #ifdef CONFIG_SMP
499 - .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO),
500 -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
501 -index 5876e30c5740..7594d0a31869 100644
502 ---- a/kernel/Kconfig.preempt
503 -+++ b/kernel/Kconfig.preempt
504 -@@ -102,7 +102,7 @@ config PREEMPT_DYNAMIC
505 -
506 - config SCHED_CORE
507 - bool "Core Scheduling for SMT"
508 -- depends on SCHED_SMT
509 -+ depends on SCHED_SMT && !SCHED_ALT
510 - help
511 - This option permits Core Scheduling, a means of coordinated task
512 - selection across SMT siblings. When enabled -- see
513 -diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
514 -index 2a9695ccb65f..292112c267b8 100644
515 ---- a/kernel/cgroup/cpuset.c
516 -+++ b/kernel/cgroup/cpuset.c
517 -@@ -664,7 +664,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
518 - return ret;
519 - }
520 -
521 --#ifdef CONFIG_SMP
522 -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT)
523 - /*
524 - * Helper routine for generate_sched_domains().
525 - * Do cpusets a, b have overlapping effective cpus_allowed masks?
526 -@@ -1060,7 +1060,7 @@ static void rebuild_sched_domains_locked(void)
527 - /* Have scheduler rebuild the domains */
528 - partition_and_rebuild_sched_domains(ndoms, doms, attr);
529 - }
530 --#else /* !CONFIG_SMP */
531 -+#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */
532 - static void rebuild_sched_domains_locked(void)
533 - {
534 - }
535 -diff --git a/kernel/delayacct.c b/kernel/delayacct.c
536 -index 51530d5b15a8..e542d71bb94b 100644
537 ---- a/kernel/delayacct.c
538 -+++ b/kernel/delayacct.c
539 -@@ -139,7 +139,7 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
540 - */
541 - t1 = tsk->sched_info.pcount;
542 - t2 = tsk->sched_info.run_delay;
543 -- t3 = tsk->se.sum_exec_runtime;
544 -+ t3 = tsk_seruntime(tsk);
545 -
546 - d->cpu_count += t1;
547 -
548 -diff --git a/kernel/exit.c b/kernel/exit.c
549 -index 91a43e57a32e..4b157befc10c 100644
550 ---- a/kernel/exit.c
551 -+++ b/kernel/exit.c
552 -@@ -122,7 +122,7 @@ static void __exit_signal(struct task_struct *tsk)
553 - sig->curr_target = next_thread(tsk);
554 - }
555 -
556 -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
557 -+ add_device_randomness((const void*) &tsk_seruntime(tsk),
558 - sizeof(unsigned long long));
559 -
560 - /*
561 -@@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk)
562 - sig->inblock += task_io_get_inblock(tsk);
563 - sig->oublock += task_io_get_oublock(tsk);
564 - task_io_accounting_add(&sig->ioac, &tsk->ioac);
565 -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
566 -+ sig->sum_sched_runtime += tsk_seruntime(tsk);
567 - sig->nr_threads--;
568 - __unhash_process(tsk, group_dead);
569 - write_sequnlock(&sig->stats_lock);
570 -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
571 -index 291b857a6e20..f3480cdb7497 100644
572 ---- a/kernel/livepatch/transition.c
573 -+++ b/kernel/livepatch/transition.c
574 -@@ -307,7 +307,11 @@ static bool klp_try_switch_task(struct task_struct *task)
575 - */
576 - rq = task_rq_lock(task, &flags);
577 -
578 -+#ifdef CONFIG_SCHED_ALT
579 -+ if (task_running(task) && task != current) {
580 -+#else
581 - if (task_running(rq, task) && task != current) {
582 -+#endif
583 - snprintf(err_buf, STACK_ERR_BUF_SIZE,
584 - "%s: %s:%d is running\n", __func__, task->comm,
585 - task->pid);
586 -diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
587 -index 6bb116c559b4..d4c8168a8270 100644
588 ---- a/kernel/locking/rtmutex.c
589 -+++ b/kernel/locking/rtmutex.c
590 -@@ -298,21 +298,25 @@ static __always_inline void
591 - waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
592 - {
593 - waiter->prio = __waiter_prio(task);
594 -- waiter->deadline = task->dl.deadline;
595 -+ waiter->deadline = __tsk_deadline(task);
596 - }
597 -
598 - /*
599 - * Only use with rt_mutex_waiter_{less,equal}()
600 - */
601 - #define task_to_waiter(p) \
602 -- &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
603 -+ &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = __tsk_deadline(p) }
604 -
605 - static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
606 - struct rt_mutex_waiter *right)
607 - {
608 -+#ifdef CONFIG_SCHED_PDS
609 -+ return (left->deadline < right->deadline);
610 -+#else
611 - if (left->prio < right->prio)
612 - return 1;
613 -
614 -+#ifndef CONFIG_SCHED_BMQ
615 - /*
616 - * If both waiters have dl_prio(), we check the deadlines of the
617 - * associated tasks.
618 -@@ -321,16 +325,22 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
619 - */
620 - if (dl_prio(left->prio))
621 - return dl_time_before(left->deadline, right->deadline);
622 -+#endif
623 -
624 - return 0;
625 -+#endif
626 - }
627 -
628 - static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
629 - struct rt_mutex_waiter *right)
630 - {
631 -+#ifdef CONFIG_SCHED_PDS
632 -+ return (left->deadline == right->deadline);
633 -+#else
634 - if (left->prio != right->prio)
635 - return 0;
636 -
637 -+#ifndef CONFIG_SCHED_BMQ
638 - /*
639 - * If both waiters have dl_prio(), we check the deadlines of the
640 - * associated tasks.
641 -@@ -339,8 +349,10 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
642 - */
643 - if (dl_prio(left->prio))
644 - return left->deadline == right->deadline;
645 -+#endif
646 -
647 - return 1;
648 -+#endif
649 - }
650 -
651 - static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
652 -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
653 -index 978fcfca5871..0425ee149b4d 100644
654 ---- a/kernel/sched/Makefile
655 -+++ b/kernel/sched/Makefile
656 -@@ -22,14 +22,21 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
657 - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
658 - endif
659 -
660 --obj-y += core.o loadavg.o clock.o cputime.o
661 --obj-y += idle.o fair.o rt.o deadline.o
662 --obj-y += wait.o wait_bit.o swait.o completion.o
663 --
664 --obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
665 -+ifdef CONFIG_SCHED_ALT
666 -+obj-y += alt_core.o
667 -+obj-$(CONFIG_SCHED_DEBUG) += alt_debug.o
668 -+else
669 -+obj-y += core.o
670 -+obj-y += fair.o rt.o deadline.o
671 -+obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o
672 - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
673 --obj-$(CONFIG_SCHEDSTATS) += stats.o
674 -+endif
675 - obj-$(CONFIG_SCHED_DEBUG) += debug.o
676 -+obj-y += loadavg.o clock.o cputime.o
677 -+obj-y += idle.o
678 -+obj-y += wait.o wait_bit.o swait.o completion.o
679 -+obj-$(CONFIG_SMP) += cpupri.o pelt.o topology.o
680 -+obj-$(CONFIG_SCHEDSTATS) += stats.o
681 - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
682 - obj-$(CONFIG_CPU_FREQ) += cpufreq.o
683 - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
684 -diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c
685 -new file mode 100644
686 -index 000000000000..9576c57f82da
687 ---- /dev/null
688 -+++ b/kernel/sched/alt_core.c
689 -@@ -0,0 +1,7626 @@
690 -+/*
691 -+ * kernel/sched/alt_core.c
692 -+ *
693 -+ * Core alternative kernel scheduler code and related syscalls
694 -+ *
695 -+ * Copyright (C) 1991-2002 Linus Torvalds
696 -+ *
697 -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes
698 -+ * a whole lot of those previous things.
699 -+ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel
700 -+ * scheduler by Alfred Chen.
701 -+ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen.
702 -+ */
703 -+#define CREATE_TRACE_POINTS
704 -+#include <trace/events/sched.h>
705 -+#undef CREATE_TRACE_POINTS
706 -+
707 -+#include "sched.h"
708 -+
709 -+#include <linux/sched/rt.h>
710 -+
711 -+#include <linux/context_tracking.h>
712 -+#include <linux/compat.h>
713 -+#include <linux/blkdev.h>
714 -+#include <linux/delayacct.h>
715 -+#include <linux/freezer.h>
716 -+#include <linux/init_task.h>
717 -+#include <linux/kprobes.h>
718 -+#include <linux/mmu_context.h>
719 -+#include <linux/nmi.h>
720 -+#include <linux/profile.h>
721 -+#include <linux/rcupdate_wait.h>
722 -+#include <linux/security.h>
723 -+#include <linux/syscalls.h>
724 -+#include <linux/wait_bit.h>
725 -+
726 -+#include <linux/kcov.h>
727 -+#include <linux/scs.h>
728 -+
729 -+#include <asm/switch_to.h>
730 -+
731 -+#include "../workqueue_internal.h"
732 -+#include "../../fs/io-wq.h"
733 -+#include "../smpboot.h"
734 -+
735 -+#include "pelt.h"
736 -+#include "smp.h"
737 -+
738 -+/*
739 -+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
740 -+ * associated with them) to allow external modules to probe them.
741 -+ */
742 -+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
743 -+
744 -+#ifdef CONFIG_SCHED_DEBUG
745 -+#define sched_feat(x) (1)
746 -+/*
747 -+ * Print a warning if need_resched is set for the given duration (if
748 -+ * LATENCY_WARN is enabled).
749 -+ *
750 -+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
751 -+ * per boot.
752 -+ */
753 -+__read_mostly int sysctl_resched_latency_warn_ms = 100;
754 -+__read_mostly int sysctl_resched_latency_warn_once = 1;
755 -+#else
756 -+#define sched_feat(x) (0)
757 -+#endif /* CONFIG_SCHED_DEBUG */
758 -+
759 -+#define ALT_SCHED_VERSION "v5.15-r0"
760 -+
761 -+/* rt_prio(prio) defined in include/linux/sched/rt.h */
762 -+#define rt_task(p) rt_prio((p)->prio)
763 -+#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR)
764 -+#define task_has_rt_policy(p) (rt_policy((p)->policy))
765 -+
766 -+#define STOP_PRIO (MAX_RT_PRIO - 1)
767 -+
768 -+/* Default time slice is 4 ms; it can be set via the kernel parameter "sched_timeslice" */
769 -+u64 sched_timeslice_ns __read_mostly = (4 << 20);
770 -+
771 -+static inline void requeue_task(struct task_struct *p, struct rq *rq);
772 -+
773 -+#ifdef CONFIG_SCHED_BMQ
774 -+#include "bmq.h"
775 -+#endif
776 -+#ifdef CONFIG_SCHED_PDS
777 -+#include "pds.h"
778 -+#endif
779 -+
780 -+static int __init sched_timeslice(char *str)
781 -+{
782 -+ int timeslice_ms;
783 -+
784 -+ get_option(&str, &timeslice_ms);
785 -+ if (2 != timeslice_ms)
786 -+ timeslice_ms = 4;
787 -+ sched_timeslice_ns = timeslice_ms << 20;
788 -+ sched_timeslice_imp(timeslice_ms);
789 -+
790 -+ return 0;
791 -+}
792 -+early_param("sched_timeslice", sched_timeslice);
793 -+
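
A userspace sketch of the parameter handling in sched_timeslice() above: only the value 2 is honoured, anything else falls back to the 4 ms default, and milliseconds are converted with the same "<< 20" approximation. The parser name here is hypothetical.

    #include <stdio.h>
    #include <stdlib.h>

    /* hypothetical userspace stand-in for the kernel's sched_timeslice() parser */
    static unsigned long long parse_timeslice_ns(const char *str)
    {
            int ms = atoi(str);

            if (ms != 2)                    /* only 2 is accepted, else default to 4 */
                    ms = 4;
            return (unsigned long long)ms << 20;    /* "ms" -> ns, same approximation */
    }

    int main(void)
    {
            printf("sched_timeslice=2 -> %llu ns\n", parse_timeslice_ns("2"));  /* 2097152 */
            printf("sched_timeslice=7 -> %llu ns\n", parse_timeslice_ns("7"));  /* 4194304 */
            return 0;
    }
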
794 -+/* Reschedule if less than this much time (in ns, ~100 us) is left */
795 -+#define RESCHED_NS (100 << 10)
796 -+
797 -+/**
798 -+ * sched_yield_type - Choose what sort of yield sched_yield will perform.
799 -+ * 0: No yield.
800 -+ * 1: Deboost and requeue task. (default)
801 -+ * 2: Set rq skip task.
802 -+ */
803 -+int sched_yield_type __read_mostly = 1;
804 -+
805 -+#ifdef CONFIG_SMP
806 -+static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp;
807 -+
808 -+DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_topo_masks);
809 -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask);
810 -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_topo_end_mask);
811 -+
812 -+#ifdef CONFIG_SCHED_SMT
813 -+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
814 -+EXPORT_SYMBOL_GPL(sched_smt_present);
815 -+#endif
816 -+
817 -+/*
818 -+ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of
819 -+ * the domain), this allows us to quickly tell if two cpus are in the same cache
820 -+ * domain, see cpus_share_cache().
821 -+ */
822 -+DEFINE_PER_CPU(int, sd_llc_id);
823 -+#endif /* CONFIG_SMP */
824 -+
825 -+static DEFINE_MUTEX(sched_hotcpu_mutex);
826 -+
827 -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
828 -+
829 -+#ifndef prepare_arch_switch
830 -+# define prepare_arch_switch(next) do { } while (0)
831 -+#endif
832 -+#ifndef finish_arch_post_lock_switch
833 -+# define finish_arch_post_lock_switch() do { } while (0)
834 -+#endif
835 -+
836 -+#ifdef CONFIG_SCHED_SMT
837 -+static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp;
838 -+#endif
839 -+static cpumask_t sched_rq_watermark[SCHED_BITS] ____cacheline_aligned_in_smp;
840 -+
841 -+/* sched_queue related functions */
842 -+static inline void sched_queue_init(struct sched_queue *q)
843 -+{
844 -+ int i;
845 -+
846 -+ bitmap_zero(q->bitmap, SCHED_BITS);
847 -+ for(i = 0; i < SCHED_BITS; i++)
848 -+ INIT_LIST_HEAD(&q->heads[i]);
849 -+}
850 -+
851 -+/*
852 -+ * Init the idle task and put it into the queue structure of the rq
853 -+ * IMPORTANT: may be called multiple times for a single cpu
854 -+ */
855 -+static inline void sched_queue_init_idle(struct sched_queue *q,
856 -+ struct task_struct *idle)
857 -+{
858 -+ idle->sq_idx = IDLE_TASK_SCHED_PRIO;
859 -+ INIT_LIST_HEAD(&q->heads[idle->sq_idx]);
860 -+ list_add(&idle->sq_node, &q->heads[idle->sq_idx]);
861 -+}
862 -+
863 -+/* water mark related functions */
864 -+static inline void update_sched_rq_watermark(struct rq *rq)
865 -+{
866 -+ unsigned long watermark = find_first_bit(rq->queue.bitmap, SCHED_QUEUE_BITS);
867 -+ unsigned long last_wm = rq->watermark;
868 -+ unsigned long i;
869 -+ int cpu;
870 -+
871 -+ if (watermark == last_wm)
872 -+ return;
873 -+
874 -+ rq->watermark = watermark;
875 -+ cpu = cpu_of(rq);
876 -+ if (watermark < last_wm) {
877 -+ for (i = last_wm; i > watermark; i--)
878 -+ cpumask_clear_cpu(cpu, sched_rq_watermark + SCHED_BITS - 1 - i);
879 -+#ifdef CONFIG_SCHED_SMT
880 -+ if (static_branch_likely(&sched_smt_present) &&
881 -+ IDLE_TASK_SCHED_PRIO == last_wm)
882 -+ cpumask_andnot(&sched_sg_idle_mask,
883 -+ &sched_sg_idle_mask, cpu_smt_mask(cpu));
884 -+#endif
885 -+ return;
886 -+ }
887 -+ /* last_wm < watermark */
888 -+ for (i = watermark; i > last_wm; i--)
889 -+ cpumask_set_cpu(cpu, sched_rq_watermark + SCHED_BITS - 1 - i);
890 -+#ifdef CONFIG_SCHED_SMT
891 -+ if (static_branch_likely(&sched_smt_present) &&
892 -+ IDLE_TASK_SCHED_PRIO == watermark) {
893 -+ cpumask_t tmp;
894 -+
895 -+ cpumask_and(&tmp, cpu_smt_mask(cpu), sched_rq_watermark);
896 -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu)))
897 -+ cpumask_or(&sched_sg_idle_mask,
898 -+ &sched_sg_idle_mask, cpu_smt_mask(cpu));
899 -+ }
900 -+#endif
901 -+}
902 -+
903 -+/*
904 -+ * This routine assumes that the idle task is always in the queue
905 -+ */
906 -+static inline struct task_struct *sched_rq_first_task(struct rq *rq)
907 -+{
908 -+ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_QUEUE_BITS);
909 -+ const struct list_head *head = &rq->queue.heads[sched_prio2idx(idx, rq)];
910 -+
911 -+ return list_first_entry(head, struct task_struct, sq_node);
912 -+}
913 -+
914 -+static inline struct task_struct *
915 -+sched_rq_next_task(struct task_struct *p, struct rq *rq)
916 -+{
917 -+ unsigned long idx = p->sq_idx;
918 -+ struct list_head *head = &rq->queue.heads[idx];
919 -+
920 -+ if (list_is_last(&p->sq_node, head)) {
921 -+ idx = find_next_bit(rq->queue.bitmap, SCHED_QUEUE_BITS,
922 -+ sched_idx2prio(idx, rq) + 1);
923 -+ head = &rq->queue.heads[sched_prio2idx(idx, rq)];
924 -+
925 -+ return list_first_entry(head, struct task_struct, sq_node);
926 -+ }
927 -+
928 -+ return list_next_entry(p, sq_node);
929 -+}
930 -+
931 -+static inline struct task_struct *rq_runnable_task(struct rq *rq)
932 -+{
933 -+ struct task_struct *next = sched_rq_first_task(rq);
934 -+
935 -+ if (unlikely(next == rq->skip))
936 -+ next = sched_rq_next_task(next, rq);
937 -+
938 -+ return next;
939 -+}
940 -+
941 -+/*
942 -+ * Serialization rules:
943 -+ *
944 -+ * Lock order:
945 -+ *
946 -+ * p->pi_lock
947 -+ * rq->lock
948 -+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
949 -+ *
950 -+ * rq1->lock
951 -+ * rq2->lock where: rq1 < rq2
952 -+ *
953 -+ * Regular state:
954 -+ *
955 -+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
956 -+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
957 -+ * always looks at the local rq data structures to find the most eligible task
958 -+ * to run next.
959 -+ *
960 -+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
961 -+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
962 -+ * the local CPU to avoid bouncing the runqueue state around [ see
963 -+ * ttwu_queue_wakelist() ]
964 -+ *
965 -+ * Task wakeup, specifically wakeups that involve migration, are horribly
966 -+ * complicated to avoid having to take two rq->locks.
967 -+ *
968 -+ * Special state:
969 -+ *
970 -+ * System-calls and anything external will use task_rq_lock() which acquires
971 -+ * both p->pi_lock and rq->lock. As a consequence the state they change is
972 -+ * stable while holding either lock:
973 -+ *
974 -+ * - sched_setaffinity()/
975 -+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
976 -+ * - set_user_nice(): p->se.load, p->*prio
977 -+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
978 -+ * p->se.load, p->rt_priority,
979 -+ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
980 -+ * - sched_setnuma(): p->numa_preferred_nid
981 -+ * - sched_move_task()/
982 -+ * cpu_cgroup_fork(): p->sched_task_group
983 -+ * - uclamp_update_active() p->uclamp*
984 -+ *
985 -+ * p->state <- TASK_*:
986 -+ *
987 -+ * is changed locklessly using set_current_state(), __set_current_state() or
988 -+ * set_special_state(), see their respective comments, or by
989 -+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
990 -+ * concurrent self.
991 -+ *
992 -+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
993 -+ *
994 -+ * is set by activate_task() and cleared by deactivate_task(), under
995 -+ * rq->lock. Non-zero indicates the task is runnable, the special
996 -+ * ON_RQ_MIGRATING state is used for migration without holding both
997 -+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
998 -+ *
999 -+ * p->on_cpu <- { 0, 1 }:
1000 -+ *
1001 -+ * is set by prepare_task() and cleared by finish_task() such that it will be
1002 -+ * set before p is scheduled-in and cleared after p is scheduled-out, both
1003 -+ * under rq->lock. Non-zero indicates the task is running on its CPU.
1004 -+ *
1005 -+ * [ The astute reader will observe that it is possible for two tasks on one
1006 -+ * CPU to have ->on_cpu = 1 at the same time. ]
1007 -+ *
1008 -+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
1009 -+ *
1010 -+ * - Don't call set_task_cpu() on a blocked task:
1011 -+ *
1012 -+ * We don't care what CPU we're not running on, this simplifies hotplug,
1013 -+ * the CPU assignment of blocked tasks isn't required to be valid.
1014 -+ *
1015 -+ * - for try_to_wake_up(), called under p->pi_lock:
1016 -+ *
1017 -+ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
1018 -+ *
1019 -+ * - for migration called under rq->lock:
1020 -+ * [ see task_on_rq_migrating() in task_rq_lock() ]
1021 -+ *
1022 -+ * o move_queued_task()
1023 -+ * o detach_task()
1024 -+ *
1025 -+ * - for migration called under double_rq_lock():
1026 -+ *
1027 -+ * o __migrate_swap_task()
1028 -+ * o push_rt_task() / pull_rt_task()
1029 -+ * o push_dl_task() / pull_dl_task()
1030 -+ * o dl_task_offline_migration()
1031 -+ *
1032 -+ */
1033 -+
1034 -+/*
1035 -+ * Context: p->pi_lock
1036 -+ */
1037 -+static inline struct rq
1038 -+*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock)
1039 -+{
1040 -+ struct rq *rq;
1041 -+ for (;;) {
1042 -+ rq = task_rq(p);
1043 -+ if (p->on_cpu || task_on_rq_queued(p)) {
1044 -+ raw_spin_lock(&rq->lock);
1045 -+ if (likely((p->on_cpu || task_on_rq_queued(p))
1046 -+ && rq == task_rq(p))) {
1047 -+ *plock = &rq->lock;
1048 -+ return rq;
1049 -+ }
1050 -+ raw_spin_unlock(&rq->lock);
1051 -+ } else if (task_on_rq_migrating(p)) {
1052 -+ do {
1053 -+ cpu_relax();
1054 -+ } while (unlikely(task_on_rq_migrating(p)));
1055 -+ } else {
1056 -+ *plock = NULL;
1057 -+ return rq;
1058 -+ }
1059 -+ }
1060 -+}
1061 -+
1062 -+static inline void
1063 -+__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock)
1064 -+{
1065 -+ if (NULL != lock)
1066 -+ raw_spin_unlock(lock);
1067 -+}
1068 -+
1069 -+static inline struct rq
1070 -+*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock,
1071 -+ unsigned long *flags)
1072 -+{
1073 -+ struct rq *rq;
1074 -+ for (;;) {
1075 -+ rq = task_rq(p);
1076 -+ if (p->on_cpu || task_on_rq_queued(p)) {
1077 -+ raw_spin_lock_irqsave(&rq->lock, *flags);
1078 -+ if (likely((p->on_cpu || task_on_rq_queued(p))
1079 -+ && rq == task_rq(p))) {
1080 -+ *plock = &rq->lock;
1081 -+ return rq;
1082 -+ }
1083 -+ raw_spin_unlock_irqrestore(&rq->lock, *flags);
1084 -+ } else if (task_on_rq_migrating(p)) {
1085 -+ do {
1086 -+ cpu_relax();
1087 -+ } while (unlikely(task_on_rq_migrating(p)));
1088 -+ } else {
1089 -+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
1090 -+ if (likely(!p->on_cpu && !p->on_rq &&
1091 -+ rq == task_rq(p))) {
1092 -+ *plock = &p->pi_lock;
1093 -+ return rq;
1094 -+ }
1095 -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1096 -+ }
1097 -+ }
1098 -+}
1099 -+
1100 -+static inline void
1101 -+task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock,
1102 -+ unsigned long *flags)
1103 -+{
1104 -+ raw_spin_unlock_irqrestore(lock, *flags);
1105 -+}
1106 -+
1107 -+/*
1108 -+ * __task_rq_lock - lock the rq @p resides on.
1109 -+ */
1110 -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1111 -+ __acquires(rq->lock)
1112 -+{
1113 -+ struct rq *rq;
1114 -+
1115 -+ lockdep_assert_held(&p->pi_lock);
1116 -+
1117 -+ for (;;) {
1118 -+ rq = task_rq(p);
1119 -+ raw_spin_lock(&rq->lock);
1120 -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
1121 -+ return rq;
1122 -+ raw_spin_unlock(&rq->lock);
1123 -+
1124 -+ while (unlikely(task_on_rq_migrating(p)))
1125 -+ cpu_relax();
1126 -+ }
1127 -+}
1128 -+
1129 -+/*
1130 -+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
1131 -+ */
1132 -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1133 -+ __acquires(p->pi_lock)
1134 -+ __acquires(rq->lock)
1135 -+{
1136 -+ struct rq *rq;
1137 -+
1138 -+ for (;;) {
1139 -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
1140 -+ rq = task_rq(p);
1141 -+ raw_spin_lock(&rq->lock);
1142 -+ /*
1143 -+ * move_queued_task() task_rq_lock()
1144 -+ *
1145 -+ * ACQUIRE (rq->lock)
1146 -+ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
1147 -+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
1148 -+ * [S] ->cpu = new_cpu [L] task_rq()
1149 -+ * [L] ->on_rq
1150 -+ * RELEASE (rq->lock)
1151 -+ *
1152 -+ * If we observe the old CPU in task_rq_lock(), the acquire of
1153 -+ * the old rq->lock will fully serialize against the stores.
1154 -+ *
1155 -+ * If we observe the new CPU in task_rq_lock(), the address
1156 -+ * dependency headed by '[L] rq = task_rq()' and the acquire
1157 -+ * will pair with the WMB to ensure we then also see migrating.
1158 -+ */
1159 -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
1160 -+ return rq;
1161 -+ }
1162 -+ raw_spin_unlock(&rq->lock);
1163 -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
1164 -+
1165 -+ while (unlikely(task_on_rq_migrating(p)))
1166 -+ cpu_relax();
1167 -+ }
1168 -+}
1169 -+
1170 -+static inline void
1171 -+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
1172 -+ __acquires(rq->lock)
1173 -+{
1174 -+ raw_spin_lock_irqsave(&rq->lock, rf->flags);
1175 -+}
1176 -+
1177 -+static inline void
1178 -+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
1179 -+ __releases(rq->lock)
1180 -+{
1181 -+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
1182 -+}
1183 -+
1184 -+void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
1185 -+{
1186 -+ raw_spinlock_t *lock;
1187 -+
1188 -+ /* Matches synchronize_rcu() in __sched_core_enable() */
1189 -+ preempt_disable();
1190 -+
1191 -+ for (;;) {
1192 -+ lock = __rq_lockp(rq);
1193 -+ raw_spin_lock_nested(lock, subclass);
1194 -+ if (likely(lock == __rq_lockp(rq))) {
1195 -+ /* preempt_count *MUST* be > 1 */
1196 -+ preempt_enable_no_resched();
1197 -+ return;
1198 -+ }
1199 -+ raw_spin_unlock(lock);
1200 -+ }
1201 -+}
1202 -+
1203 -+void raw_spin_rq_unlock(struct rq *rq)
1204 -+{
1205 -+ raw_spin_unlock(rq_lockp(rq));
1206 -+}
1207 -+
1208 -+/*
1209 -+ * RQ-clock updating methods:
1210 -+ */
1211 -+
1212 -+static void update_rq_clock_task(struct rq *rq, s64 delta)
1213 -+{
1214 -+/*
1215 -+ * In theory, the compile should just see 0 here, and optimize out the call
1216 -+ * to sched_rt_avg_update. But I don't trust it...
1217 -+ */
1218 -+ s64 __maybe_unused steal = 0, irq_delta = 0;
1219 -+
1220 -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1221 -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1222 -+
1223 -+ /*
1224 -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
1225 -+ * this case when a previous update_rq_clock() happened inside a
1226 -+ * {soft,}irq region.
1227 -+ *
1228 -+ * When this happens, we stop ->clock_task and only update the
1229 -+ * prev_irq_time stamp to account for the part that fit, so that a next
1230 -+ * update will consume the rest. This ensures ->clock_task is
1231 -+ * monotonic.
1232 -+ *
1233 -+ * It does however cause some slight miss-attribution of {soft,}irq
1234 -+ * time, a more accurate solution would be to update the irq_time using
1235 -+ * the current rq->clock timestamp, except that would require using
1236 -+ * atomic ops.
1237 -+ */
1238 -+ if (irq_delta > delta)
1239 -+ irq_delta = delta;
1240 -+
1241 -+ rq->prev_irq_time += irq_delta;
1242 -+ delta -= irq_delta;
1243 -+#endif
1244 -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1245 -+ if (static_key_false((&paravirt_steal_rq_enabled))) {
1246 -+ steal = paravirt_steal_clock(cpu_of(rq));
1247 -+ steal -= rq->prev_steal_time_rq;
1248 -+
1249 -+ if (unlikely(steal > delta))
1250 -+ steal = delta;
1251 -+
1252 -+ rq->prev_steal_time_rq += steal;
1253 -+ delta -= steal;
1254 -+ }
1255 -+#endif
1256 -+
1257 -+ rq->clock_task += delta;
1258 -+
1259 -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
1260 -+ if ((irq_delta + steal))
1261 -+ update_irq_load_avg(rq, irq_delta + steal);
1262 -+#endif
1263 -+}
1264 -+
1265 -+static inline void update_rq_clock(struct rq *rq)
1266 -+{
1267 -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
1268 -+
1269 -+ if (unlikely(delta <= 0))
1270 -+ return;
1271 -+ rq->clock += delta;
1272 -+ update_rq_time_edge(rq);
1273 -+ update_rq_clock_task(rq, delta);
1274 -+}
1275 -+
1276 -+/*
1277 -+ * RQ Load update routine
1278 -+ */
1279 -+#define RQ_LOAD_HISTORY_BITS (sizeof(s32) * 8ULL)
1280 -+#define RQ_UTIL_SHIFT (8)
1281 -+#define RQ_LOAD_HISTORY_TO_UTIL(l) (((l) >> (RQ_LOAD_HISTORY_BITS - 1 - RQ_UTIL_SHIFT)) & 0xff)
1282 -+
1283 -+#define LOAD_BLOCK(t) ((t) >> 17)
1284 -+#define LOAD_HALF_BLOCK(t) ((t) >> 16)
1285 -+#define BLOCK_MASK(t) ((t) & ((0x01 << 18) - 1))
1286 -+#define LOAD_BLOCK_BIT(b) (1UL << (RQ_LOAD_HISTORY_BITS - 1 - (b)))
1287 -+#define CURRENT_LOAD_BIT LOAD_BLOCK_BIT(0)
1288 -+
1289 -+static inline void rq_load_update(struct rq *rq)
1290 -+{
1291 -+ u64 time = rq->clock;
1292 -+ u64 delta = min(LOAD_BLOCK(time) - LOAD_BLOCK(rq->load_stamp),
1293 -+ RQ_LOAD_HISTORY_BITS - 1);
1294 -+ u64 prev = !!(rq->load_history & CURRENT_LOAD_BIT);
1295 -+ u64 curr = !!rq->nr_running;
1296 -+
1297 -+ if (delta) {
1298 -+ rq->load_history = rq->load_history >> delta;
1299 -+
1300 -+ if (delta < RQ_UTIL_SHIFT) {
1301 -+ rq->load_block += (~BLOCK_MASK(rq->load_stamp)) * prev;
1302 -+ if (!!LOAD_HALF_BLOCK(rq->load_block) ^ curr)
1303 -+ rq->load_history ^= LOAD_BLOCK_BIT(delta);
1304 -+ }
1305 -+
1306 -+ rq->load_block = BLOCK_MASK(time) * prev;
1307 -+ } else {
1308 -+ rq->load_block += (time - rq->load_stamp) * prev;
1309 -+ }
1310 -+ if (prev ^ curr)
1311 -+ rq->load_history ^= CURRENT_LOAD_BIT;
1312 -+ rq->load_stamp = time;
1313 -+}
1314 -+
1315 -+unsigned long rq_load_util(struct rq *rq, unsigned long max)
1316 -+{
1317 -+ return RQ_LOAD_HISTORY_TO_UTIL(rq->load_history) * (max >> RQ_UTIL_SHIFT);
1318 -+}
1319 -+
1320 -+#ifdef CONFIG_SMP
1321 -+unsigned long sched_cpu_util(int cpu, unsigned long max)
1322 -+{
1323 -+ return rq_load_util(cpu_rq(cpu), max);
1324 -+}
1325 -+#endif /* CONFIG_SMP */
1326 -+
1327 -+#ifdef CONFIG_CPU_FREQ
1328 -+/**
1329 -+ * cpufreq_update_util - Take a note about CPU utilization changes.
1330 -+ * @rq: Runqueue to carry out the update for.
1331 -+ * @flags: Update reason flags.
1332 -+ *
1333 -+ * This function is called by the scheduler on the CPU whose utilization is
1334 -+ * being updated.
1335 -+ *
1336 -+ * It can only be called from RCU-sched read-side critical sections.
1337 -+ *
1338 -+ * The way cpufreq is currently arranged requires it to evaluate the CPU
1339 -+ * performance state (frequency/voltage) on a regular basis to prevent it from
1340 -+ * being stuck in a completely inadequate performance level for too long.
1341 -+ * That is not guaranteed to happen if the updates are only triggered from CFS
1342 -+ * and DL, though, because they may not be coming in if only RT tasks are
1343 -+ * active all the time (or there are RT tasks only).
1344 -+ *
1345 -+ * As a workaround for that issue, this function is called periodically by the
1346 -+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
1347 -+ * but that really is a band-aid. Going forward it should be replaced with
1348 -+ * solutions targeted more specifically at RT tasks.
1349 -+ */
1350 -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
1351 -+{
1352 -+ struct update_util_data *data;
1353 -+
1354 -+#ifdef CONFIG_SMP
1355 -+ rq_load_update(rq);
1356 -+#endif
1357 -+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
1358 -+ cpu_of(rq)));
1359 -+ if (data)
1360 -+ data->func(data, rq_clock(rq), flags);
1361 -+}
1362 -+#else
1363 -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
1364 -+{
1365 -+#ifdef CONFIG_SMP
1366 -+ rq_load_update(rq);
1367 -+#endif
1368 -+}
1369 -+#endif /* CONFIG_CPU_FREQ */
1370 -+
1371 -+#ifdef CONFIG_NO_HZ_FULL
1372 -+/*
1373 -+ * Tick may be needed by tasks in the runqueue depending on their policy and
1374 -+ * requirements. If tick is needed, lets send the target an IPI to kick it out
1375 -+ * of nohz mode if necessary.
1376 -+ */
1377 -+static inline void sched_update_tick_dependency(struct rq *rq)
1378 -+{
1379 -+ int cpu = cpu_of(rq);
1380 -+
1381 -+ if (!tick_nohz_full_cpu(cpu))
1382 -+ return;
1383 -+
1384 -+ if (rq->nr_running < 2)
1385 -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
1386 -+ else
1387 -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
1388 -+}
1389 -+#else /* !CONFIG_NO_HZ_FULL */
1390 -+static inline void sched_update_tick_dependency(struct rq *rq) { }
1391 -+#endif
1392 -+
1393 -+bool sched_task_on_rq(struct task_struct *p)
1394 -+{
1395 -+ return task_on_rq_queued(p);
1396 -+}
1397 -+
1398 -+/*
1399 -+ * Add/Remove/Requeue task to/from the runqueue routines
1400 -+ * Context: rq->lock
1401 -+ */
1402 -+#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \
1403 -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \
1404 -+ sched_info_dequeue(rq, p); \
1405 -+ \
1406 -+ list_del(&p->sq_node); \
1407 -+ if (list_empty(&rq->queue.heads[p->sq_idx])) { \
1408 -+ clear_bit(sched_idx2prio(p->sq_idx, rq), \
1409 -+ rq->queue.bitmap); \
1410 -+ func; \
1411 -+ }
1412 -+
1413 -+#define __SCHED_ENQUEUE_TASK(p, rq, flags) \
1414 -+ sched_info_enqueue(rq, p); \
1415 -+ psi_enqueue(p, flags); \
1416 -+ \
1417 -+ p->sq_idx = task_sched_prio_idx(p, rq); \
1418 -+ list_add_tail(&p->sq_node, &rq->queue.heads[p->sq_idx]); \
1419 -+ set_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap);
1420 -+
1421 -+static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags)
1422 -+{
1423 -+ lockdep_assert_held(&rq->lock);
1424 -+
1425 -+ /*printk(KERN_INFO "sched: dequeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/
1426 -+ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n",
1427 -+ task_cpu(p), cpu_of(rq));
1428 -+
1429 -+ __SCHED_DEQUEUE_TASK(p, rq, flags, update_sched_rq_watermark(rq));
1430 -+ --rq->nr_running;
1431 -+#ifdef CONFIG_SMP
1432 -+ if (1 == rq->nr_running)
1433 -+ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask);
1434 -+#endif
1435 -+
1436 -+ sched_update_tick_dependency(rq);
1437 -+}
1438 -+
1439 -+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags)
1440 -+{
1441 -+ lockdep_assert_held(&rq->lock);
1442 -+
1443 -+ /*printk(KERN_INFO "sched: enqueue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/
1444 -+ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n",
1445 -+ task_cpu(p), cpu_of(rq));
1446 -+
1447 -+ __SCHED_ENQUEUE_TASK(p, rq, flags);
1448 -+ update_sched_rq_watermark(rq);
1449 -+ ++rq->nr_running;
1450 -+#ifdef CONFIG_SMP
1451 -+ if (2 == rq->nr_running)
1452 -+ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask);
1453 -+#endif
1454 -+
1455 -+ sched_update_tick_dependency(rq);
1456 -+}
1457 -+
1458 -+static inline void requeue_task(struct task_struct *p, struct rq *rq)
1459 -+{
1460 -+ int idx;
1461 -+
1462 -+ lockdep_assert_held(&rq->lock);
1463 -+ /*printk(KERN_INFO "sched: requeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/
1464 -+ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n",
1465 -+ cpu_of(rq), task_cpu(p));
1466 -+
1467 -+ idx = task_sched_prio_idx(p, rq);
1468 -+
1469 -+ list_del(&p->sq_node);
1470 -+ list_add_tail(&p->sq_node, &rq->queue.heads[idx]);
1471 -+ if (idx != p->sq_idx) {
1472 -+ if (list_empty(&rq->queue.heads[p->sq_idx]))
1473 -+ clear_bit(sched_idx2prio(p->sq_idx, rq),
1474 -+ rq->queue.bitmap);
1475 -+ p->sq_idx = idx;
1476 -+ set_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap);
1477 -+ update_sched_rq_watermark(rq);
1478 -+ }
1479 -+}
1480 -+
1481 -+/*
1482 -+ * cmpxchg based fetch_or, macro so it works for different integer types
1483 -+ */
1484 -+#define fetch_or(ptr, mask) \
1485 -+ ({ \
1486 -+ typeof(ptr) _ptr = (ptr); \
1487 -+ typeof(mask) _mask = (mask); \
1488 -+ typeof(*_ptr) _old, _val = *_ptr; \
1489 -+ \
1490 -+ for (;;) { \
1491 -+ _old = cmpxchg(_ptr, _val, _val | _mask); \
1492 -+ if (_old == _val) \
1493 -+ break; \
1494 -+ _val = _old; \
1495 -+ } \
1496 -+ _old; \
1497 -+})
1498 -+
1499 -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
1500 -+/*
1501 -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
1502 -+ * this avoids any races wrt polling state changes and thereby avoids
1503 -+ * spurious IPIs.
1504 -+ */
1505 -+static bool set_nr_and_not_polling(struct task_struct *p)
1506 -+{
1507 -+ struct thread_info *ti = task_thread_info(p);
1508 -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
1509 -+}
1510 -+
1511 -+/*
1512 -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
1513 -+ *
1514 -+ * If this returns true, then the idle task promises to call
1515 -+ * sched_ttwu_pending() and reschedule soon.
1516 -+ */
1517 -+static bool set_nr_if_polling(struct task_struct *p)
1518 -+{
1519 -+ struct thread_info *ti = task_thread_info(p);
1520 -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
1521 -+
1522 -+ for (;;) {
1523 -+ if (!(val & _TIF_POLLING_NRFLAG))
1524 -+ return false;
1525 -+ if (val & _TIF_NEED_RESCHED)
1526 -+ return true;
1527 -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
1528 -+ if (old == val)
1529 -+ break;
1530 -+ val = old;
1531 -+ }
1532 -+ return true;
1533 -+}
1534 -+
1535 -+#else
1536 -+static bool set_nr_and_not_polling(struct task_struct *p)
1537 -+{
1538 -+ set_tsk_need_resched(p);
1539 -+ return true;
1540 -+}
1541 -+
1542 -+#ifdef CONFIG_SMP
1543 -+static bool set_nr_if_polling(struct task_struct *p)
1544 -+{
1545 -+ return false;
1546 -+}
1547 -+#endif
1548 -+#endif
1549 -+
1550 -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
1551 -+{
1552 -+ struct wake_q_node *node = &task->wake_q;
1553 -+
1554 -+ /*
1555 -+ * Atomically grab the task, if ->wake_q is !nil already it means
1556 -+ * it's already queued (either by us or someone else) and will get the
1557 -+ * wakeup due to that.
1558 -+ *
1559 -+ * In order to ensure that a pending wakeup will observe our pending
1560 -+ * state, even in the failed case, an explicit smp_mb() must be used.
1561 -+ */
1562 -+ smp_mb__before_atomic();
1563 -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
1564 -+ return false;
1565 -+
1566 -+ /*
1567 -+ * The head is context local, there can be no concurrency.
1568 -+ */
1569 -+ *head->lastp = node;
1570 -+ head->lastp = &node->next;
1571 -+ return true;
1572 -+}
1573 -+
1574 -+/**
1575 -+ * wake_q_add() - queue a wakeup for 'later' waking.
1576 -+ * @head: the wake_q_head to add @task to
1577 -+ * @task: the task to queue for 'later' wakeup
1578 -+ *
1579 -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
1580 -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
1581 -+ * instantly.
1582 -+ *
1583 -+ * This function must be used as-if it were wake_up_process(); IOW the task
1584 -+ * must be ready to be woken at this location.
1585 -+ */
1586 -+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
1587 -+{
1588 -+ if (__wake_q_add(head, task))
1589 -+ get_task_struct(task);
1590 -+}
1591 -+
1592 -+/**
1593 -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
1594 -+ * @head: the wake_q_head to add @task to
1595 -+ * @task: the task to queue for 'later' wakeup
1596 -+ *
1597 -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
1598 -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
1599 -+ * instantly.
1600 -+ *
1601 -+ * This function must be used as-if it were wake_up_process(); IOW the task
1602 -+ * must be ready to be woken at this location.
1603 -+ *
1604 -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
1605 -+ * that already hold reference to @task can call the 'safe' version and trust
1606 -+ * wake_q to do the right thing depending whether or not the @task is already
1607 -+ * queued for wakeup.
1608 -+ */
1609 -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
1610 -+{
1611 -+ if (!__wake_q_add(head, task))
1612 -+ put_task_struct(task);
1613 -+}
1614 -+
1615 -+void wake_up_q(struct wake_q_head *head)
1616 -+{
1617 -+ struct wake_q_node *node = head->first;
1618 -+
1619 -+ while (node != WAKE_Q_TAIL) {
1620 -+ struct task_struct *task;
1621 -+
1622 -+ task = container_of(node, struct task_struct, wake_q);
1623 -+ /* task can safely be re-inserted now: */
1624 -+ node = node->next;
1625 -+ task->wake_q.next = NULL;
1626 -+
1627 -+ /*
1628 -+ * wake_up_process() executes a full barrier, which pairs with
1629 -+ * the queueing in wake_q_add() so as not to miss wakeups.
1630 -+ */
1631 -+ wake_up_process(task);
1632 -+ put_task_struct(task);
1633 -+ }
1634 -+}
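
The wake_q helpers above implement deferred wakeups: wake_q_add() pins the task with a reference and links it while a lock may still be held, and wake_up_q() later performs the real wake_up_process() calls and drops the references. A minimal illustrative sketch of that pattern, using hypothetical example_* names and the standard <linux/sched/wake_q.h> API:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

struct example_waiter {
	struct list_head	node;
	struct task_struct	*task;
};

/* Collect wakeups while holding the lock, issue them after dropping it. */
static void example_release_waiters(spinlock_t *lock, struct list_head *waiters)
{
	DEFINE_WAKE_Q(wake_q);
	struct example_waiter *w;

	spin_lock(lock);
	list_for_each_entry(w, waiters, node)
		wake_q_add(&wake_q, w->task);	/* takes a task reference */
	spin_unlock(lock);

	wake_up_q(&wake_q);	/* wake_up_process() + put_task_struct() per entry */
}

Woken tasks therefore never contend on the lock the caller still held when they were queued.
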
1635 -+
1636 -+/*
1637 -+ * resched_curr - mark rq's current task 'to be rescheduled now'.
1638 -+ *
1639 -+ * On UP this means the setting of the need_resched flag, on SMP it
1640 -+ * might also involve a cross-CPU call to trigger the scheduler on
1641 -+ * the target CPU.
1642 -+ */
1643 -+void resched_curr(struct rq *rq)
1644 -+{
1645 -+ struct task_struct *curr = rq->curr;
1646 -+ int cpu;
1647 -+
1648 -+ lockdep_assert_held(&rq->lock);
1649 -+
1650 -+ if (test_tsk_need_resched(curr))
1651 -+ return;
1652 -+
1653 -+ cpu = cpu_of(rq);
1654 -+ if (cpu == smp_processor_id()) {
1655 -+ set_tsk_need_resched(curr);
1656 -+ set_preempt_need_resched();
1657 -+ return;
1658 -+ }
1659 -+
1660 -+ if (set_nr_and_not_polling(curr))
1661 -+ smp_send_reschedule(cpu);
1662 -+ else
1663 -+ trace_sched_wake_idle_without_ipi(cpu);
1664 -+}
1665 -+
1666 -+void resched_cpu(int cpu)
1667 -+{
1668 -+ struct rq *rq = cpu_rq(cpu);
1669 -+ unsigned long flags;
1670 -+
1671 -+ raw_spin_lock_irqsave(&rq->lock, flags);
1672 -+ if (cpu_online(cpu) || cpu == smp_processor_id())
1673 -+ resched_curr(cpu_rq(cpu));
1674 -+ raw_spin_unlock_irqrestore(&rq->lock, flags);
1675 -+}
1676 -+
1677 -+#ifdef CONFIG_SMP
1678 -+#ifdef CONFIG_NO_HZ_COMMON
1679 -+void nohz_balance_enter_idle(int cpu) {}
1680 -+
1681 -+void select_nohz_load_balancer(int stop_tick) {}
1682 -+
1683 -+void set_cpu_sd_state_idle(void) {}
1684 -+
1685 -+/*
1686 -+ * In the semi idle case, use the nearest busy CPU for migrating timers
1687 -+ * from an idle CPU. This is good for power-savings.
1688 -+ *
1689 -+ * We don't do a similar optimization for a completely idle system, as
1690 -+ * selecting an idle CPU will add more delays to the timers than intended
1691 -+ * (as that CPU's timer base may not be up to date wrt jiffies etc).
1692 -+ */
1693 -+int get_nohz_timer_target(void)
1694 -+{
1695 -+ int i, cpu = smp_processor_id(), default_cpu = -1;
1696 -+ struct cpumask *mask;
1697 -+ const struct cpumask *hk_mask;
1698 -+
1699 -+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
1700 -+ if (!idle_cpu(cpu))
1701 -+ return cpu;
1702 -+ default_cpu = cpu;
1703 -+ }
1704 -+
1705 -+ hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
1706 -+
1707 -+ for (mask = per_cpu(sched_cpu_topo_masks, cpu) + 1;
1708 -+ mask < per_cpu(sched_cpu_topo_end_mask, cpu); mask++)
1709 -+ for_each_cpu_and(i, mask, hk_mask)
1710 -+ if (!idle_cpu(i))
1711 -+ return i;
1712 -+
1713 -+ if (default_cpu == -1)
1714 -+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
1715 -+ cpu = default_cpu;
1716 -+
1717 -+ return cpu;
1718 -+}
1719 -+
1720 -+/*
1721 -+ * When add_timer_on() enqueues a timer into the timer wheel of an
1722 -+ * idle CPU then this timer might expire before the next timer event
1723 -+ * which is scheduled to wake up that CPU. In case of a completely
1724 -+ * idle system the next event might even be infinite time into the
1725 -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1726 -+ * leaves the inner idle loop so the newly added timer is taken into
1727 -+ * account when the CPU goes back to idle and evaluates the timer
1728 -+ * wheel for the next timer event.
1729 -+ */
1730 -+static inline void wake_up_idle_cpu(int cpu)
1731 -+{
1732 -+ struct rq *rq = cpu_rq(cpu);
1733 -+
1734 -+ if (cpu == smp_processor_id())
1735 -+ return;
1736 -+
1737 -+ if (set_nr_and_not_polling(rq->idle))
1738 -+ smp_send_reschedule(cpu);
1739 -+ else
1740 -+ trace_sched_wake_idle_without_ipi(cpu);
1741 -+}
1742 -+
1743 -+static inline bool wake_up_full_nohz_cpu(int cpu)
1744 -+{
1745 -+ /*
1746 -+ * We just need the target to call irq_exit() and re-evaluate
1747 -+ * the next tick. The nohz full kick at least implies that.
1748 -+ * If needed we can still optimize that later with an
1749 -+ * empty IRQ.
1750 -+ */
1751 -+ if (cpu_is_offline(cpu))
1752 -+ return true; /* Don't try to wake offline CPUs. */
1753 -+ if (tick_nohz_full_cpu(cpu)) {
1754 -+ if (cpu != smp_processor_id() ||
1755 -+ tick_nohz_tick_stopped())
1756 -+ tick_nohz_full_kick_cpu(cpu);
1757 -+ return true;
1758 -+ }
1759 -+
1760 -+ return false;
1761 -+}
1762 -+
1763 -+void wake_up_nohz_cpu(int cpu)
1764 -+{
1765 -+ if (!wake_up_full_nohz_cpu(cpu))
1766 -+ wake_up_idle_cpu(cpu);
1767 -+}
1768 -+
1769 -+static void nohz_csd_func(void *info)
1770 -+{
1771 -+ struct rq *rq = info;
1772 -+ int cpu = cpu_of(rq);
1773 -+ unsigned int flags;
1774 -+
1775 -+ /*
1776 -+ * Release the rq::nohz_csd.
1777 -+ */
1778 -+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
1779 -+ WARN_ON(!(flags & NOHZ_KICK_MASK));
1780 -+
1781 -+ rq->idle_balance = idle_cpu(cpu);
1782 -+ if (rq->idle_balance && !need_resched()) {
1783 -+ rq->nohz_idle_balance = flags;
1784 -+ raise_softirq_irqoff(SCHED_SOFTIRQ);
1785 -+ }
1786 -+}
1787 -+
1788 -+#endif /* CONFIG_NO_HZ_COMMON */
1789 -+#endif /* CONFIG_SMP */
1790 -+
1791 -+static inline void check_preempt_curr(struct rq *rq)
1792 -+{
1793 -+ if (sched_rq_first_task(rq) != rq->curr)
1794 -+ resched_curr(rq);
1795 -+}
1796 -+
1797 -+#ifdef CONFIG_SCHED_HRTICK
1798 -+/*
1799 -+ * Use HR-timers to deliver accurate preemption points.
1800 -+ */
1801 -+
1802 -+static void hrtick_clear(struct rq *rq)
1803 -+{
1804 -+ if (hrtimer_active(&rq->hrtick_timer))
1805 -+ hrtimer_cancel(&rq->hrtick_timer);
1806 -+}
1807 -+
1808 -+/*
1809 -+ * High-resolution timer tick.
1810 -+ * Runs from hardirq context with interrupts disabled.
1811 -+ */
1812 -+static enum hrtimer_restart hrtick(struct hrtimer *timer)
1813 -+{
1814 -+ struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1815 -+
1816 -+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1817 -+
1818 -+ raw_spin_lock(&rq->lock);
1819 -+ resched_curr(rq);
1820 -+ raw_spin_unlock(&rq->lock);
1821 -+
1822 -+ return HRTIMER_NORESTART;
1823 -+}
1824 -+
1825 -+/*
1826 -+ * Use hrtick when:
1827 -+ * - enabled by features
1828 -+ * - hrtimer is actually high res
1829 -+ */
1830 -+static inline int hrtick_enabled(struct rq *rq)
1831 -+{
1832 -+ /**
1833 -+ * Alt schedule FW doesn't support sched_feat yet
1834 -+ if (!sched_feat(HRTICK))
1835 -+ return 0;
1836 -+ */
1837 -+ if (!cpu_active(cpu_of(rq)))
1838 -+ return 0;
1839 -+ return hrtimer_is_hres_active(&rq->hrtick_timer);
1840 -+}
1841 -+
1842 -+#ifdef CONFIG_SMP
1843 -+
1844 -+static void __hrtick_restart(struct rq *rq)
1845 -+{
1846 -+ struct hrtimer *timer = &rq->hrtick_timer;
1847 -+ ktime_t time = rq->hrtick_time;
1848 -+
1849 -+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
1850 -+}
1851 -+
1852 -+/*
1853 -+ * called from hardirq (IPI) context
1854 -+ */
1855 -+static void __hrtick_start(void *arg)
1856 -+{
1857 -+ struct rq *rq = arg;
1858 -+
1859 -+ raw_spin_lock(&rq->lock);
1860 -+ __hrtick_restart(rq);
1861 -+ raw_spin_unlock(&rq->lock);
1862 -+}
1863 -+
1864 -+/*
1865 -+ * Called to set the hrtick timer state.
1866 -+ *
1867 -+ * called with rq->lock held and irqs disabled
1868 -+ */
1869 -+void hrtick_start(struct rq *rq, u64 delay)
1870 -+{
1871 -+ struct hrtimer *timer = &rq->hrtick_timer;
1872 -+ s64 delta;
1873 -+
1874 -+ /*
1875 -+ * Don't schedule slices shorter than 10000ns, that just
1876 -+ * doesn't make sense and can cause timer DoS.
1877 -+ */
1878 -+ delta = max_t(s64, delay, 10000LL);
1879 -+
1880 -+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
1881 -+
1882 -+ if (rq == this_rq())
1883 -+ __hrtick_restart(rq);
1884 -+ else
1885 -+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
1886 -+}
1887 -+
1888 -+#else
1889 -+/*
1890 -+ * Called to set the hrtick timer state.
1891 -+ *
1892 -+ * called with rq->lock held and irqs disabled
1893 -+ */
1894 -+void hrtick_start(struct rq *rq, u64 delay)
1895 -+{
1896 -+ /*
1897 -+ * Don't schedule slices shorter than 10000ns, that just
1898 -+ * doesn't make sense. Rely on vruntime for fairness.
1899 -+ */
1900 -+ delay = max_t(u64, delay, 10000LL);
1901 -+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
1902 -+ HRTIMER_MODE_REL_PINNED_HARD);
1903 -+}
1904 -+#endif /* CONFIG_SMP */
1905 -+
1906 -+static void hrtick_rq_init(struct rq *rq)
1907 -+{
1908 -+#ifdef CONFIG_SMP
1909 -+ INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
1910 -+#endif
1911 -+
1912 -+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
1913 -+ rq->hrtick_timer.function = hrtick;
1914 -+}
1915 -+#else /* CONFIG_SCHED_HRTICK */
1916 -+static inline int hrtick_enabled(struct rq *rq)
1917 -+{
1918 -+ return 0;
1919 -+}
1920 -+
1921 -+static inline void hrtick_clear(struct rq *rq)
1922 -+{
1923 -+}
1924 -+
1925 -+static inline void hrtick_rq_init(struct rq *rq)
1926 -+{
1927 -+}
1928 -+#endif /* CONFIG_SCHED_HRTICK */
1929 -+
1930 -+static inline int __normal_prio(int policy, int rt_prio, int static_prio)
1931 -+{
1932 -+ return rt_policy(policy) ? (MAX_RT_PRIO - 1 - rt_prio) :
1933 -+ static_prio + MAX_PRIORITY_ADJ;
1934 -+}
1935 -+
1936 -+/*
1937 -+ * Calculate the expected normal priority: i.e. priority
1938 -+ * without taking RT-inheritance into account. Might be
1939 -+ * boosted by interactivity modifiers. Changes upon fork,
1940 -+ * setprio syscalls, and whenever the interactivity
1941 -+ * estimator recalculates.
1942 -+ */
1943 -+static inline int normal_prio(struct task_struct *p)
1944 -+{
1945 -+ return __normal_prio(p->policy, p->rt_priority, p->static_prio);
1946 -+}
1947 -+
1948 -+/*
1949 -+ * Calculate the current priority, i.e. the priority
1950 -+ * taken into account by the scheduler. This value might
1951 -+ * be boosted by RT tasks as it will be RT if the task got
1952 -+ * RT-boosted. If not then it returns p->normal_prio.
1953 -+ */
1954 -+static int effective_prio(struct task_struct *p)
1955 -+{
1956 -+ p->normal_prio = normal_prio(p);
1957 -+ /*
1958 -+ * If we are RT tasks or we were boosted to RT priority,
1959 -+ * keep the priority unchanged. Otherwise, update priority
1960 -+ * to the normal priority:
1961 -+ */
1962 -+ if (!rt_prio(p->prio))
1963 -+ return p->normal_prio;
1964 -+ return p->prio;
1965 -+}
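
As a worked example of the mapping above, assuming mainline's MAX_RT_PRIO == 100 and DEFAULT_PRIO == 120 (the exact value of the BMQ-specific MAX_PRIORITY_ADJ is not shown in this hunk): a SCHED_FIFO task with rt_priority 50 gets __normal_prio() == MAX_RT_PRIO - 1 - 50 == 49, while a SCHED_NORMAL task at nice 0 (static_prio == 120) gets 120 + MAX_PRIORITY_ADJ; effective_prio() then only keeps p->prio in place of this for RT-boosted tasks.
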
1966 -+
1967 -+/*
1968 -+ * activate_task - move a task to the runqueue.
1969 -+ *
1970 -+ * Context: rq->lock
1971 -+ */
1972 -+static void activate_task(struct task_struct *p, struct rq *rq)
1973 -+{
1974 -+ enqueue_task(p, rq, ENQUEUE_WAKEUP);
1975 -+ p->on_rq = TASK_ON_RQ_QUEUED;
1976 -+
1977 -+ /*
1978 -+ * If in_iowait is set, the code below may not trigger any cpufreq
1979 -+ * utilization updates, so do it here explicitly with the IOWAIT flag
1980 -+ * passed.
1981 -+ */
1982 -+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT * p->in_iowait);
1983 -+}
1984 -+
1985 -+/*
1986 -+ * deactivate_task - remove a task from the runqueue.
1987 -+ *
1988 -+ * Context: rq->lock
1989 -+ */
1990 -+static inline void deactivate_task(struct task_struct *p, struct rq *rq)
1991 -+{
1992 -+ dequeue_task(p, rq, DEQUEUE_SLEEP);
1993 -+ p->on_rq = 0;
1994 -+ cpufreq_update_util(rq, 0);
1995 -+}
1996 -+
1997 -+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1998 -+{
1999 -+#ifdef CONFIG_SMP
2000 -+ /*
2001 -+ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be
2002 -+ * successfully executed on another CPU. We must ensure that updates of
2003 -+ * per-task data have been completed by this moment.
2004 -+ */
2005 -+ smp_wmb();
2006 -+
2007 -+#ifdef CONFIG_THREAD_INFO_IN_TASK
2008 -+ WRITE_ONCE(p->cpu, cpu);
2009 -+#else
2010 -+ WRITE_ONCE(task_thread_info(p)->cpu, cpu);
2011 -+#endif
2012 -+#endif
2013 -+}
2014 -+
2015 -+static inline bool is_migration_disabled(struct task_struct *p)
2016 -+{
2017 -+#ifdef CONFIG_SMP
2018 -+ return p->migration_disabled;
2019 -+#else
2020 -+ return false;
2021 -+#endif
2022 -+}
2023 -+
2024 -+#define SCA_CHECK 0x01
2025 -+#define SCA_USER 0x08
2026 -+
2027 -+#ifdef CONFIG_SMP
2028 -+
2029 -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2030 -+{
2031 -+#ifdef CONFIG_SCHED_DEBUG
2032 -+ unsigned int state = READ_ONCE(p->__state);
2033 -+
2034 -+ /*
2035 -+ * We should never call set_task_cpu() on a blocked task,
2036 -+ * ttwu() will sort out the placement.
2037 -+ */
2038 -+ WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
2039 -+
2040 -+#ifdef CONFIG_LOCKDEP
2041 -+ /*
2042 -+ * The caller should hold either p->pi_lock or rq->lock, when changing
2043 -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2044 -+ *
2045 -+ * sched_move_task() holds both and thus holding either pins the cgroup,
2046 -+ * see task_group().
2047 -+ */
2048 -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2049 -+ lockdep_is_held(&task_rq(p)->lock)));
2050 -+#endif
2051 -+ /*
2052 -+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
2053 -+ */
2054 -+ WARN_ON_ONCE(!cpu_online(new_cpu));
2055 -+
2056 -+ WARN_ON_ONCE(is_migration_disabled(p));
2057 -+#endif
2058 -+ if (task_cpu(p) == new_cpu)
2059 -+ return;
2060 -+ trace_sched_migrate_task(p, new_cpu);
2061 -+ rseq_migrate(p);
2062 -+ perf_event_task_migrate(p);
2063 -+
2064 -+ __set_task_cpu(p, new_cpu);
2065 -+}
2066 -+
2067 -+#define MDF_FORCE_ENABLED 0x80
2068 -+
2069 -+static void
2070 -+__do_set_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask)
2071 -+{
2072 -+ /*
2073 -+ * This here violates the locking rules for affinity, since we're only
2074 -+ * supposed to change these variables while holding both rq->lock and
2075 -+ * p->pi_lock.
2076 -+ *
2077 -+ * HOWEVER, it magically works, because ttwu() is the only code that
2078 -+ * accesses these variables under p->pi_lock and only does so after
2079 -+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
2080 -+ * before finish_task().
2081 -+ *
2082 -+ * XXX do further audits, this smells like something putrid.
2083 -+ */
2084 -+ SCHED_WARN_ON(!p->on_cpu);
2085 -+ p->cpus_ptr = new_mask;
2086 -+}
2087 -+
2088 -+void migrate_disable(void)
2089 -+{
2090 -+ struct task_struct *p = current;
2091 -+ int cpu;
2092 -+
2093 -+ if (p->migration_disabled) {
2094 -+ p->migration_disabled++;
2095 -+ return;
2096 -+ }
2097 -+
2098 -+ preempt_disable();
2099 -+ cpu = smp_processor_id();
2100 -+ if (cpumask_test_cpu(cpu, &p->cpus_mask)) {
2101 -+ cpu_rq(cpu)->nr_pinned++;
2102 -+ p->migration_disabled = 1;
2103 -+ p->migration_flags &= ~MDF_FORCE_ENABLED;
2104 -+
2105 -+ /*
2106 -+ * Violates locking rules! see comment in __do_set_cpus_ptr().
2107 -+ */
2108 -+ if (p->cpus_ptr == &p->cpus_mask)
2109 -+ __do_set_cpus_ptr(p, cpumask_of(cpu));
2110 -+ }
2111 -+ preempt_enable();
2112 -+}
2113 -+EXPORT_SYMBOL_GPL(migrate_disable);
2114 -+
2115 -+void migrate_enable(void)
2116 -+{
2117 -+ struct task_struct *p = current;
2118 -+
2119 -+ if (0 == p->migration_disabled)
2120 -+ return;
2121 -+
2122 -+ if (p->migration_disabled > 1) {
2123 -+ p->migration_disabled--;
2124 -+ return;
2125 -+ }
2126 -+
2127 -+ /*
2128 -+ * Ensure stop_task runs either before or after this, and that
2129 -+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
2130 -+ */
2131 -+ preempt_disable();
2132 -+ /*
2133 -+ * Assumption: current should be running on allowed cpu
2134 -+ */
2135 -+ WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &p->cpus_mask));
2136 -+ if (p->cpus_ptr != &p->cpus_mask)
2137 -+ __do_set_cpus_ptr(p, &p->cpus_mask);
2138 -+ /*
2139 -+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
2140 -+ * regular cpus_mask, otherwise things that race (eg.
2141 -+ * select_fallback_rq) get confused.
2142 -+ */
2143 -+ barrier();
2144 -+ p->migration_disabled = 0;
2145 -+ this_rq()->nr_pinned--;
2146 -+ preempt_enable();
2147 -+}
2148 -+EXPORT_SYMBOL_GPL(migrate_enable);
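
migrate_disable()/migrate_enable() pin the current task to its CPU while leaving the section fully preemptible, which keeps latency low compared to disabling preemption across the whole section. A minimal illustrative sketch with a hypothetical per-CPU variable:

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/types.h>

static DEFINE_PER_CPU(u64, example_stat);

/* Stay on one CPU across a preemptible section that reads per-CPU data. */
static u64 example_read_local_stat(void)
{
	u64 val;

	migrate_disable();	/* still preemptible, but pinned to this CPU */
	val = per_cpu(example_stat, smp_processor_id());
	migrate_enable();

	return val;
}
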
2149 -+
2150 -+static inline bool rq_has_pinned_tasks(struct rq *rq)
2151 -+{
2152 -+ return rq->nr_pinned;
2153 -+}
2154 -+
2155 -+/*
2156 -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
2157 -+ * __set_cpus_allowed_ptr() and select_fallback_rq().
2158 -+ */
2159 -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
2160 -+{
2161 -+ /* When not in the task's cpumask, no point in looking further. */
2162 -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
2163 -+ return false;
2164 -+
2165 -+ /* migrate_disabled() must be allowed to finish. */
2166 -+ if (is_migration_disabled(p))
2167 -+ return cpu_online(cpu);
2168 -+
2169 -+ /* Non-kernel threads are not allowed during either online or offline. */
2170 -+ if (!(p->flags & PF_KTHREAD))
2171 -+ return cpu_active(cpu) && task_cpu_possible(cpu, p);
2172 -+
2173 -+ /* KTHREAD_IS_PER_CPU is always allowed. */
2174 -+ if (kthread_is_per_cpu(p))
2175 -+ return cpu_online(cpu);
2176 -+
2177 -+ /* Regular kernel threads don't get to stay during offline. */
2178 -+ if (cpu_dying(cpu))
2179 -+ return false;
2180 -+
2181 -+ /* But are allowed during online. */
2182 -+ return cpu_online(cpu);
2183 -+}
2184 -+
2185 -+/*
2186 -+ * This is how migration works:
2187 -+ *
2188 -+ * 1) we invoke migration_cpu_stop() on the target CPU using
2189 -+ * stop_one_cpu().
2190 -+ * 2) stopper starts to run (implicitly forcing the migrated thread
2191 -+ * off the CPU)
2192 -+ * 3) it checks whether the migrated task is still in the wrong runqueue.
2193 -+ * 4) if it's in the wrong runqueue then the migration thread removes
2194 -+ * it and puts it into the right queue.
2195 -+ * 5) stopper completes and stop_one_cpu() returns and the migration
2196 -+ * is done.
2197 -+ */
2198 -+
2199 -+/*
2200 -+ * move_queued_task - move a queued task to new rq.
2201 -+ *
2202 -+ * Returns (locked) new rq. Old rq's lock is released.
2203 -+ */
2204 -+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int
2205 -+ new_cpu)
2206 -+{
2207 -+ lockdep_assert_held(&rq->lock);
2208 -+
2209 -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
2210 -+ dequeue_task(p, rq, 0);
2211 -+ set_task_cpu(p, new_cpu);
2212 -+ raw_spin_unlock(&rq->lock);
2213 -+
2214 -+ rq = cpu_rq(new_cpu);
2215 -+
2216 -+ raw_spin_lock(&rq->lock);
2217 -+ BUG_ON(task_cpu(p) != new_cpu);
2218 -+ sched_task_sanity_check(p, rq);
2219 -+ enqueue_task(p, rq, 0);
2220 -+ p->on_rq = TASK_ON_RQ_QUEUED;
2221 -+ check_preempt_curr(rq);
2222 -+
2223 -+ return rq;
2224 -+}
2225 -+
2226 -+struct migration_arg {
2227 -+ struct task_struct *task;
2228 -+ int dest_cpu;
2229 -+};
2230 -+
2231 -+/*
2232 -+ * Move (not current) task off this CPU, onto the destination CPU. We're doing
2233 -+ * this because either it can't run here any more (set_cpus_allowed()
2234 -+ * away from this CPU, or CPU going down), or because we're
2235 -+ * attempting to rebalance this task on exec (sched_exec).
2236 -+ *
2237 -+ * So we race with normal scheduler movements, but that's OK, as long
2238 -+ * as the task is no longer on this CPU.
2239 -+ */
2240 -+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int
2241 -+ dest_cpu)
2242 -+{
2243 -+ /* Affinity changed (again). */
2244 -+ if (!is_cpu_allowed(p, dest_cpu))
2245 -+ return rq;
2246 -+
2247 -+ update_rq_clock(rq);
2248 -+ return move_queued_task(rq, p, dest_cpu);
2249 -+}
2250 -+
2251 -+/*
2252 -+ * migration_cpu_stop - this will be executed by a highprio stopper thread
2253 -+ * and performs thread migration by bumping thread off CPU then
2254 -+ * 'pushing' onto another runqueue.
2255 -+ */
2256 -+static int migration_cpu_stop(void *data)
2257 -+{
2258 -+ struct migration_arg *arg = data;
2259 -+ struct task_struct *p = arg->task;
2260 -+ struct rq *rq = this_rq();
2261 -+ unsigned long flags;
2262 -+
2263 -+ /*
2264 -+ * The original target CPU might have gone down and we might
2265 -+ * be on another CPU but it doesn't matter.
2266 -+ */
2267 -+ local_irq_save(flags);
2268 -+ /*
2269 -+ * We need to explicitly wake pending tasks before running
2270 -+ * __migrate_task() such that we will not miss enforcing cpus_ptr
2271 -+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
2272 -+ */
2273 -+ flush_smp_call_function_from_idle();
2274 -+
2275 -+ raw_spin_lock(&p->pi_lock);
2276 -+ raw_spin_lock(&rq->lock);
2277 -+ /*
2278 -+ * If task_rq(p) != rq, it cannot be migrated here, because we're
2279 -+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
2280 -+ * we're holding p->pi_lock.
2281 -+ */
2282 -+ if (task_rq(p) == rq && task_on_rq_queued(p))
2283 -+ rq = __migrate_task(rq, p, arg->dest_cpu);
2284 -+ raw_spin_unlock(&rq->lock);
2285 -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2286 -+
2287 -+ return 0;
2288 -+}
2289 -+
2290 -+static inline void
2291 -+set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
2292 -+{
2293 -+ cpumask_copy(&p->cpus_mask, new_mask);
2294 -+ p->nr_cpus_allowed = cpumask_weight(new_mask);
2295 -+}
2296 -+
2297 -+static void
2298 -+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2299 -+{
2300 -+ lockdep_assert_held(&p->pi_lock);
2301 -+ set_cpus_allowed_common(p, new_mask);
2302 -+}
2303 -+
2304 -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2305 -+{
2306 -+ __do_set_cpus_allowed(p, new_mask);
2307 -+}
2308 -+
2309 -+int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
2310 -+ int node)
2311 -+{
2312 -+ if (!src->user_cpus_ptr)
2313 -+ return 0;
2314 -+
2315 -+ dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
2316 -+ if (!dst->user_cpus_ptr)
2317 -+ return -ENOMEM;
2318 -+
2319 -+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
2320 -+ return 0;
2321 -+}
2322 -+
2323 -+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
2324 -+{
2325 -+ struct cpumask *user_mask = NULL;
2326 -+
2327 -+ swap(p->user_cpus_ptr, user_mask);
2328 -+
2329 -+ return user_mask;
2330 -+}
2331 -+
2332 -+void release_user_cpus_ptr(struct task_struct *p)
2333 -+{
2334 -+ kfree(clear_user_cpus_ptr(p));
2335 -+}
2336 -+
2337 -+#endif
2338 -+
2339 -+/**
2340 -+ * task_curr - is this task currently executing on a CPU?
2341 -+ * @p: the task in question.
2342 -+ *
2343 -+ * Return: 1 if the task is currently executing. 0 otherwise.
2344 -+ */
2345 -+inline int task_curr(const struct task_struct *p)
2346 -+{
2347 -+ return cpu_curr(task_cpu(p)) == p;
2348 -+}
2349 -+
2350 -+#ifdef CONFIG_SMP
2351 -+/*
2352 -+ * wait_task_inactive - wait for a thread to unschedule.
2353 -+ *
2354 -+ * If @match_state is nonzero, it's the @p->state value just checked and
2355 -+ * not expected to change. If it changes, i.e. @p might have woken up,
2356 -+ * then return zero. When we succeed in waiting for @p to be off its CPU,
2357 -+ * we return a positive number (its total switch count). If a second call
2358 -+ * a short while later returns the same number, the caller can be sure that
2359 -+ * @p has remained unscheduled the whole time.
2360 -+ *
2361 -+ * The caller must ensure that the task *will* unschedule sometime soon,
2362 -+ * else this function might spin for a *long* time. This function can't
2363 -+ * be called with interrupts off, or it may introduce deadlock with
2364 -+ * smp_call_function() if an IPI is sent by the same process we are
2365 -+ * waiting to become inactive.
2366 -+ */
2367 -+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
2368 -+{
2369 -+ unsigned long flags;
2370 -+ bool running, on_rq;
2371 -+ unsigned long ncsw;
2372 -+ struct rq *rq;
2373 -+ raw_spinlock_t *lock;
2374 -+
2375 -+ for (;;) {
2376 -+ rq = task_rq(p);
2377 -+
2378 -+ /*
2379 -+ * If the task is actively running on another CPU
2380 -+ * still, just relax and busy-wait without holding
2381 -+ * any locks.
2382 -+ *
2383 -+ * NOTE! Since we don't hold any locks, it's not
2384 -+ * even sure that "rq" stays as the right runqueue!
2385 -+ * But we don't care, since this will return false
2386 -+ * if the runqueue has changed and p is actually now
2387 -+ * running somewhere else!
2388 -+ */
2389 -+ while (task_running(p) && p == rq->curr) {
2390 -+ if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
2391 -+ return 0;
2392 -+ cpu_relax();
2393 -+ }
2394 -+
2395 -+ /*
2396 -+ * Ok, time to look more closely! We need the rq
2397 -+ * lock now, to be *sure*. If we're wrong, we'll
2398 -+ * just go back and repeat.
2399 -+ */
2400 -+ task_access_lock_irqsave(p, &lock, &flags);
2401 -+ trace_sched_wait_task(p);
2402 -+ running = task_running(p);
2403 -+ on_rq = p->on_rq;
2404 -+ ncsw = 0;
2405 -+ if (!match_state || READ_ONCE(p->__state) == match_state)
2406 -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2407 -+ task_access_unlock_irqrestore(p, lock, &flags);
2408 -+
2409 -+ /*
2410 -+ * If it changed from the expected state, bail out now.
2411 -+ */
2412 -+ if (unlikely(!ncsw))
2413 -+ break;
2414 -+
2415 -+ /*
2416 -+ * Was it really running after all now that we
2417 -+ * checked with the proper locks actually held?
2418 -+ *
2419 -+ * Oops. Go back and try again..
2420 -+ */
2421 -+ if (unlikely(running)) {
2422 -+ cpu_relax();
2423 -+ continue;
2424 -+ }
2425 -+
2426 -+ /*
2427 -+ * It's not enough that it's not actively running,
2428 -+ * it must be off the runqueue _entirely_, and not
2429 -+ * preempted!
2430 -+ *
2431 -+ * So if it was still runnable (but just not actively
2432 -+ * running right now), it's preempted, and we should
2433 -+ * yield - it could be a while.
2434 -+ */
2435 -+ if (unlikely(on_rq)) {
2436 -+ ktime_t to = NSEC_PER_SEC / HZ;
2437 -+
2438 -+ set_current_state(TASK_UNINTERRUPTIBLE);
2439 -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2440 -+ continue;
2441 -+ }
2442 -+
2443 -+ /*
2444 -+ * Ahh, all good. It wasn't running, and it wasn't
2445 -+ * runnable, which means that it will never become
2446 -+ * running in the future either. We're all done!
2447 -+ */
2448 -+ break;
2449 -+ }
2450 -+
2451 -+ return ncsw;
2452 -+}
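
The ncsw value documented above lets a caller verify that a task stayed off the CPU between two points. A hedged sketch of that usage pattern, with a hypothetical helper:

#include <linux/sched.h>

/* True if @p remained unscheduled across the critical section. */
static bool example_task_stayed_inactive(struct task_struct *p)
{
	unsigned long ncsw = wait_task_inactive(p, TASK_UNINTERRUPTIBLE);

	if (!ncsw)
		return false;	/* state changed; @p may have woken up */

	/* ... operate on @p while it must not be running ... */

	return wait_task_inactive(p, TASK_UNINTERRUPTIBLE) == ncsw;
}
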
2453 -+
2454 -+/***
2455 -+ * kick_process - kick a running thread to enter/exit the kernel
2456 -+ * @p: the to-be-kicked thread
2457 -+ *
2458 -+ * Cause a process which is running on another CPU to enter
2459 -+ * kernel-mode, without any delay. (to get signals handled.)
2460 -+ *
2461 -+ * NOTE: this function doesn't have to take the runqueue lock,
2462 -+ * because all it wants to ensure is that the remote task enters
2463 -+ * the kernel. If the IPI races and the task has been migrated
2464 -+ * to another CPU then no harm is done and the purpose has been
2465 -+ * achieved as well.
2466 -+ */
2467 -+void kick_process(struct task_struct *p)
2468 -+{
2469 -+ int cpu;
2470 -+
2471 -+ preempt_disable();
2472 -+ cpu = task_cpu(p);
2473 -+ if ((cpu != smp_processor_id()) && task_curr(p))
2474 -+ smp_send_reschedule(cpu);
2475 -+ preempt_enable();
2476 -+}
2477 -+EXPORT_SYMBOL_GPL(kick_process);
2478 -+
2479 -+/*
2480 -+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
2481 -+ *
2482 -+ * A few notes on cpu_active vs cpu_online:
2483 -+ *
2484 -+ * - cpu_active must be a subset of cpu_online
2485 -+ *
2486 -+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
2487 -+ * see __set_cpus_allowed_ptr(). At this point the newly online
2488 -+ * CPU isn't yet part of the sched domains, and balancing will not
2489 -+ * see it.
2490 -+ *
2491 -+ * - on cpu-down we clear cpu_active() to mask the sched domains and
2492 -+ * prevent the load balancer from placing new tasks on the to-be-removed
2493 -+ * CPU. Existing tasks will remain running there and will be taken
2494 -+ * off.
2495 -+ *
2496 -+ * This means that fallback selection must not select !active CPUs.
2497 -+ * And can assume that any active CPU must be online. Conversely
2498 -+ * select_task_rq() below may allow selection of !active CPUs in order
2499 -+ * to satisfy the above rules.
2500 -+ */
2501 -+static int select_fallback_rq(int cpu, struct task_struct *p)
2502 -+{
2503 -+ int nid = cpu_to_node(cpu);
2504 -+ const struct cpumask *nodemask = NULL;
2505 -+ enum { cpuset, possible, fail } state = cpuset;
2506 -+ int dest_cpu;
2507 -+
2508 -+ /*
2509 -+ * If the node that the CPU is on has been offlined, cpu_to_node()
2510 -+ * will return -1. There is no CPU on the node, and we should
2511 -+ * select a CPU on another node.
2512 -+ */
2513 -+ if (nid != -1) {
2514 -+ nodemask = cpumask_of_node(nid);
2515 -+
2516 -+ /* Look for allowed, online CPU in same node. */
2517 -+ for_each_cpu(dest_cpu, nodemask) {
2518 -+ if (is_cpu_allowed(p, dest_cpu))
2519 -+ return dest_cpu;
2520 -+ }
2521 -+ }
2522 -+
2523 -+ for (;;) {
2524 -+ /* Any allowed, online CPU? */
2525 -+ for_each_cpu(dest_cpu, p->cpus_ptr) {
2526 -+ if (!is_cpu_allowed(p, dest_cpu))
2527 -+ continue;
2528 -+ goto out;
2529 -+ }
2530 -+
2531 -+ /* No more Mr. Nice Guy. */
2532 -+ switch (state) {
2533 -+ case cpuset:
2534 -+ if (cpuset_cpus_allowed_fallback(p)) {
2535 -+ state = possible;
2536 -+ break;
2537 -+ }
2538 -+ fallthrough;
2539 -+ case possible:
2540 -+ /*
2541 -+ * XXX When called from select_task_rq() we only
2542 -+ * hold p->pi_lock and again violate locking order.
2543 -+ *
2544 -+ * More yuck to audit.
2545 -+ */
2546 -+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
2547 -+ state = fail;
2548 -+ break;
2549 -+
2550 -+ case fail:
2551 -+ BUG();
2552 -+ break;
2553 -+ }
2554 -+ }
2555 -+
2556 -+out:
2557 -+ if (state != cpuset) {
2558 -+ /*
2559 -+ * Don't tell them about moving exiting tasks or
2560 -+ * kernel threads (both mm NULL), since they never
2561 -+ * leave kernel.
2562 -+ */
2563 -+ if (p->mm && printk_ratelimit()) {
2564 -+ printk_deferred("process %d (%s) no longer affine to cpu%d\n",
2565 -+ task_pid_nr(p), p->comm, cpu);
2566 -+ }
2567 -+ }
2568 -+
2569 -+ return dest_cpu;
2570 -+}
2571 -+
2572 -+static inline int select_task_rq(struct task_struct *p)
2573 -+{
2574 -+ cpumask_t chk_mask, tmp;
2575 -+
2576 -+ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_active_mask)))
2577 -+ return select_fallback_rq(task_cpu(p), p);
2578 -+
2579 -+ if (
2580 -+#ifdef CONFIG_SCHED_SMT
2581 -+ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) ||
2582 -+#endif
2583 -+ cpumask_and(&tmp, &chk_mask, sched_rq_watermark) ||
2584 -+ cpumask_and(&tmp, &chk_mask,
2585 -+ sched_rq_watermark + SCHED_BITS - task_sched_prio(p)))
2586 -+ return best_mask_cpu(task_cpu(p), &tmp);
2587 -+
2588 -+ return best_mask_cpu(task_cpu(p), &chk_mask);
2589 -+}
2590 -+
2591 -+void sched_set_stop_task(int cpu, struct task_struct *stop)
2592 -+{
2593 -+ static struct lock_class_key stop_pi_lock;
2594 -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO };
2595 -+ struct sched_param start_param = { .sched_priority = 0 };
2596 -+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
2597 -+
2598 -+ if (stop) {
2599 -+ /*
2600 -+ * Make it appear like a SCHED_FIFO task; it's something
2601 -+ * userspace knows about and won't get confused about.
2602 -+ *
2603 -+ * Also, it will make PI more or less work without too
2604 -+ * much confusion -- but then, stop work should not
2605 -+ * rely on PI working anyway.
2606 -+ */
2607 -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
2608 -+
2609 -+ /*
2610 -+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to
2611 -+ * adjust the effective priority of a task. As a result,
2612 -+ * rt_mutex_setprio() can trigger (RT) balancing operations,
2613 -+ * which can then trigger wakeups of the stop thread to push
2614 -+ * around the current task.
2615 -+ *
2616 -+ * The stop task itself will never be part of the PI-chain, it
2617 -+ * never blocks, therefore that ->pi_lock recursion is safe.
2618 -+ * Tell lockdep about this by placing the stop->pi_lock in its
2619 -+ * own class.
2620 -+ */
2621 -+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
2622 -+ }
2623 -+
2624 -+ cpu_rq(cpu)->stop = stop;
2625 -+
2626 -+ if (old_stop) {
2627 -+ /*
2628 -+ * Reset it back to a normal scheduling policy so that
2629 -+ * it can die in pieces.
2630 -+ */
2631 -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
2632 -+ }
2633 -+}
2634 -+
2635 -+static int affine_move_task(struct rq *rq, struct task_struct *p, int dest_cpu,
2636 -+ raw_spinlock_t *lock, unsigned long irq_flags)
2637 -+{
2638 -+ /* Can the task run on the task's current CPU? If so, we're done */
2639 -+ if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2640 -+ if (p->migration_disabled) {
2641 -+ if (likely(p->cpus_ptr != &p->cpus_mask))
2642 -+ __do_set_cpus_ptr(p, &p->cpus_mask);
2643 -+ p->migration_disabled = 0;
2644 -+ p->migration_flags |= MDF_FORCE_ENABLED;
2645 -+ /* When p is migrate_disabled, rq->lock should be held */
2646 -+ rq->nr_pinned--;
2647 -+ }
2648 -+
2649 -+ if (task_running(p) || READ_ONCE(p->__state) == TASK_WAKING) {
2650 -+ struct migration_arg arg = { p, dest_cpu };
2651 -+
2652 -+ /* Need help from migration thread: drop lock and wait. */
2653 -+ __task_access_unlock(p, lock);
2654 -+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
2655 -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
2656 -+ return 0;
2657 -+ }
2658 -+ if (task_on_rq_queued(p)) {
2659 -+ /*
2660 -+ * OK, since we're going to drop the lock immediately
2661 -+ * afterwards anyway.
2662 -+ */
2663 -+ update_rq_clock(rq);
2664 -+ rq = move_queued_task(rq, p, dest_cpu);
2665 -+ lock = &rq->lock;
2666 -+ }
2667 -+ }
2668 -+ __task_access_unlock(p, lock);
2669 -+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
2670 -+ return 0;
2671 -+}
2672 -+
2673 -+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
2674 -+ const struct cpumask *new_mask,
2675 -+ u32 flags,
2676 -+ struct rq *rq,
2677 -+ raw_spinlock_t *lock,
2678 -+ unsigned long irq_flags)
2679 -+{
2680 -+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
2681 -+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
2682 -+ bool kthread = p->flags & PF_KTHREAD;
2683 -+ struct cpumask *user_mask = NULL;
2684 -+ int dest_cpu;
2685 -+ int ret = 0;
2686 -+
2687 -+ if (kthread || is_migration_disabled(p)) {
2688 -+ /*
2689 -+ * Kernel threads are allowed on online && !active CPUs,
2690 -+ * however, during cpu-hot-unplug, even these might get pushed
2691 -+ * away if not KTHREAD_IS_PER_CPU.
2692 -+ *
2693 -+ * Specifically, migration_disabled() tasks must not fail the
2694 -+ * cpumask_any_and_distribute() pick below, esp. so on
2695 -+ * SCA_MIGRATE_ENABLE, otherwise we'll not call
2696 -+ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
2697 -+ */
2698 -+ cpu_valid_mask = cpu_online_mask;
2699 -+ }
2700 -+
2701 -+ if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
2702 -+ ret = -EINVAL;
2703 -+ goto out;
2704 -+ }
2705 -+
2706 -+ /*
2707 -+ * Must re-check here, to close a race against __kthread_bind(),
2708 -+ * sched_setaffinity() is not guaranteed to observe the flag.
2709 -+ */
2710 -+ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2711 -+ ret = -EINVAL;
2712 -+ goto out;
2713 -+ }
2714 -+
2715 -+ if (cpumask_equal(&p->cpus_mask, new_mask))
2716 -+ goto out;
2717 -+
2718 -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
2719 -+ if (dest_cpu >= nr_cpu_ids) {
2720 -+ ret = -EINVAL;
2721 -+ goto out;
2722 -+ }
2723 -+
2724 -+ __do_set_cpus_allowed(p, new_mask);
2725 -+
2726 -+ if (flags & SCA_USER)
2727 -+ user_mask = clear_user_cpus_ptr(p);
2728 -+
2729 -+ ret = affine_move_task(rq, p, dest_cpu, lock, irq_flags);
2730 -+
2731 -+ kfree(user_mask);
2732 -+
2733 -+ return ret;
2734 -+
2735 -+out:
2736 -+ __task_access_unlock(p, lock);
2737 -+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
2738 -+
2739 -+ return ret;
2740 -+}
2741 -+
2742 -+/*
2743 -+ * Change a given task's CPU affinity. Migrate the thread to a
2744 -+ * proper CPU and schedule it away if the CPU it's executing on
2745 -+ * is removed from the allowed bitmask.
2746 -+ *
2747 -+ * NOTE: the caller must have a valid reference to the task, the
2748 -+ * task must not exit() & deallocate itself prematurely. The
2749 -+ * call is not atomic; no spinlocks may be held.
2750 -+ */
2751 -+static int __set_cpus_allowed_ptr(struct task_struct *p,
2752 -+ const struct cpumask *new_mask, u32 flags)
2753 -+{
2754 -+ unsigned long irq_flags;
2755 -+ struct rq *rq;
2756 -+ raw_spinlock_t *lock;
2757 -+
2758 -+ raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
2759 -+ rq = __task_access_lock(p, &lock);
2760 -+
2761 -+ return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, lock, irq_flags);
2762 -+}
2763 -+
2764 -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2765 -+{
2766 -+ return __set_cpus_allowed_ptr(p, new_mask, 0);
2767 -+}
2768 -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
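
set_cpus_allowed_ptr() is the exported entry point into the affinity machinery above; per the NOTE above, the caller must hold a reference on the task and may not hold spinlocks. A minimal sketch with a hypothetical helper:

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/sched.h>

/* Pin an already-referenced task to a single online CPU. */
static int example_pin_task(struct task_struct *tsk, int cpu)
{
	if (!cpu_online(cpu))
		return -EINVAL;

	return set_cpus_allowed_ptr(tsk, cpumask_of(cpu));
}
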
2769 -+
2770 -+/*
2771 -+ * Change a given task's CPU affinity to the intersection of its current
2772 -+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask
2773 -+ * and pointing @p->user_cpus_ptr to a copy of the old mask.
2774 -+ * If the resulting mask is empty, leave the affinity unchanged and return
2775 -+ * -EINVAL.
2776 -+ */
2777 -+static int restrict_cpus_allowed_ptr(struct task_struct *p,
2778 -+ struct cpumask *new_mask,
2779 -+ const struct cpumask *subset_mask)
2780 -+{
2781 -+ struct cpumask *user_mask = NULL;
2782 -+ unsigned long irq_flags;
2783 -+ raw_spinlock_t *lock;
2784 -+ struct rq *rq;
2785 -+ int err;
2786 -+
2787 -+ if (!p->user_cpus_ptr) {
2788 -+ user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
2789 -+ if (!user_mask)
2790 -+ return -ENOMEM;
2791 -+ }
2792 -+
2793 -+ raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
2794 -+ rq = __task_access_lock(p, &lock);
2795 -+
2796 -+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
2797 -+ err = -EINVAL;
2798 -+ goto err_unlock;
2799 -+ }
2800 -+
2801 -+ /*
2802 -+ * We're about to butcher the task affinity, so keep track of what
2803 -+ * the user asked for in case we're able to restore it later on.
2804 -+ */
2805 -+ if (user_mask) {
2806 -+ cpumask_copy(user_mask, p->cpus_ptr);
2807 -+ p->user_cpus_ptr = user_mask;
2808 -+ }
2809 -+
2810 -+ /*return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);*/
2811 -+ return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, lock, irq_flags);
2812 -+
2813 -+err_unlock:
2814 -+ __task_access_unlock(p, lock);
2815 -+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
2816 -+ kfree(user_mask);
2817 -+ return err;
2818 -+}
2819 -+
2820 -+/*
2821 -+ * Restrict the CPU affinity of task @p so that it is a subset of
2822 -+ * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
2823 -+ * old affinity mask. If the resulting mask is empty, we warn and walk
2824 -+ * up the cpuset hierarchy until we find a suitable mask.
2825 -+ */
2826 -+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
2827 -+{
2828 -+ cpumask_var_t new_mask;
2829 -+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
2830 -+
2831 -+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
2832 -+
2833 -+ /*
2834 -+ * __migrate_task() can fail silently in the face of concurrent
2835 -+ * offlining of the chosen destination CPU, so take the hotplug
2836 -+ * lock to ensure that the migration succeeds.
2837 -+ */
2838 -+ cpus_read_lock();
2839 -+ if (!cpumask_available(new_mask))
2840 -+ goto out_set_mask;
2841 -+
2842 -+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
2843 -+ goto out_free_mask;
2844 -+
2845 -+ /*
2846 -+ * We failed to find a valid subset of the affinity mask for the
2847 -+ * task, so override it based on its cpuset hierarchy.
2848 -+ */
2849 -+ cpuset_cpus_allowed(p, new_mask);
2850 -+ override_mask = new_mask;
2851 -+
2852 -+out_set_mask:
2853 -+ if (printk_ratelimit()) {
2854 -+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
2855 -+ task_pid_nr(p), p->comm,
2856 -+ cpumask_pr_args(override_mask));
2857 -+ }
2858 -+
2859 -+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
2860 -+out_free_mask:
2861 -+ cpus_read_unlock();
2862 -+ free_cpumask_var(new_mask);
2863 -+}
2864 -+
2865 -+static int
2866 -+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
2867 -+
2868 -+/*
2869 -+ * Restore the affinity of a task @p which was previously restricted by a
2870 -+ * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
2871 -+ * @p->user_cpus_ptr.
2872 -+ *
2873 -+ * It is the caller's responsibility to serialise this with any calls to
2874 -+ * force_compatible_cpus_allowed_ptr(@p).
2875 -+ */
2876 -+void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
2877 -+{
2878 -+ struct cpumask *user_mask = p->user_cpus_ptr;
2879 -+ unsigned long flags;
2880 -+
2881 -+ /*
2882 -+ * Try to restore the old affinity mask. If this fails, then
2883 -+ * we free the mask explicitly to avoid it being inherited across
2884 -+ * a subsequent fork().
2885 -+ */
2886 -+ if (!user_mask || !__sched_setaffinity(p, user_mask))
2887 -+ return;
2888 -+
2889 -+ raw_spin_lock_irqsave(&p->pi_lock, flags);
2890 -+ user_mask = clear_user_cpus_ptr(p);
2891 -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2892 -+
2893 -+ kfree(user_mask);
2894 -+}
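
force_compatible_cpus_allowed_ptr() and relax_compatible_cpus_allowed_ptr() are intended to be used as a pair, e.g. around an execution mode that only a subset of CPUs supports. A hedged sketch with hypothetical wrappers:

#include <linux/sched.h>

/* Narrow affinity to task_cpu_possible_mask(), stashing the user's mask. */
static void example_enter_restricted_mode(struct task_struct *tsk)
{
	force_compatible_cpus_allowed_ptr(tsk);
}

/* Restore (and free) the stashed user mask once the restriction ends. */
static void example_leave_restricted_mode(struct task_struct *tsk)
{
	relax_compatible_cpus_allowed_ptr(tsk);
}
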
2895 -+
2896 -+#else /* CONFIG_SMP */
2897 -+
2898 -+static inline int select_task_rq(struct task_struct *p)
2899 -+{
2900 -+ return 0;
2901 -+}
2902 -+
2903 -+static inline int
2904 -+__set_cpus_allowed_ptr(struct task_struct *p,
2905 -+ const struct cpumask *new_mask, u32 flags)
2906 -+{
2907 -+ return set_cpus_allowed_ptr(p, new_mask);
2908 -+}
2909 -+
2910 -+static inline bool rq_has_pinned_tasks(struct rq *rq)
2911 -+{
2912 -+ return false;
2913 -+}
2914 -+
2915 -+#endif /* !CONFIG_SMP */
2916 -+
2917 -+static void
2918 -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2919 -+{
2920 -+ struct rq *rq;
2921 -+
2922 -+ if (!schedstat_enabled())
2923 -+ return;
2924 -+
2925 -+ rq = this_rq();
2926 -+
2927 -+#ifdef CONFIG_SMP
2928 -+ if (cpu == rq->cpu)
2929 -+ __schedstat_inc(rq->ttwu_local);
2930 -+ else {
2931 -+ /** Alt schedule FW ToDo:
2932 -+ * How to do ttwu_wake_remote
2933 -+ */
2934 -+ }
2935 -+#endif /* CONFIG_SMP */
2936 -+
2937 -+ __schedstat_inc(rq->ttwu_count);
2938 -+}
2939 -+
2940 -+/*
2941 -+ * Mark the task runnable and perform wakeup-preemption.
2942 -+ */
2943 -+static inline void
2944 -+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2945 -+{
2946 -+ check_preempt_curr(rq);
2947 -+ WRITE_ONCE(p->__state, TASK_RUNNING);
2948 -+ trace_sched_wakeup(p);
2949 -+}
2950 -+
2951 -+static inline void
2952 -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2953 -+{
2954 -+ if (p->sched_contributes_to_load)
2955 -+ rq->nr_uninterruptible--;
2956 -+
2957 -+ if (
2958 -+#ifdef CONFIG_SMP
2959 -+ !(wake_flags & WF_MIGRATED) &&
2960 -+#endif
2961 -+ p->in_iowait) {
2962 -+ delayacct_blkio_end(p);
2963 -+ atomic_dec(&task_rq(p)->nr_iowait);
2964 -+ }
2965 -+
2966 -+ activate_task(p, rq);
2967 -+ ttwu_do_wakeup(rq, p, 0);
2968 -+}
2969 -+
2970 -+/*
2971 -+ * Consider @p being inside a wait loop:
2972 -+ *
2973 -+ * for (;;) {
2974 -+ * set_current_state(TASK_UNINTERRUPTIBLE);
2975 -+ *
2976 -+ * if (CONDITION)
2977 -+ * break;
2978 -+ *
2979 -+ * schedule();
2980 -+ * }
2981 -+ * __set_current_state(TASK_RUNNING);
2982 -+ *
2983 -+ * between set_current_state() and schedule(). In this case @p is still
2984 -+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
2985 -+ * an atomic manner.
2986 -+ *
2987 -+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
2988 -+ * then schedule() must still happen and p->state can be changed to
2989 -+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
2990 -+ * need to do a full wakeup with enqueue.
2991 -+ *
2992 -+ * Returns: %true when the wakeup is done,
2993 -+ * %false otherwise.
2994 -+ */
2995 -+static int ttwu_runnable(struct task_struct *p, int wake_flags)
2996 -+{
2997 -+ struct rq *rq;
2998 -+ raw_spinlock_t *lock;
2999 -+ int ret = 0;
3000 -+
3001 -+ rq = __task_access_lock(p, &lock);
3002 -+ if (task_on_rq_queued(p)) {
3003 -+ /* check_preempt_curr() may use rq clock */
3004 -+ update_rq_clock(rq);
3005 -+ ttwu_do_wakeup(rq, p, wake_flags);
3006 -+ ret = 1;
3007 -+ }
3008 -+ __task_access_unlock(p, lock);
3009 -+
3010 -+ return ret;
3011 -+}
3012 -+
3013 -+#ifdef CONFIG_SMP
3014 -+void sched_ttwu_pending(void *arg)
3015 -+{
3016 -+ struct llist_node *llist = arg;
3017 -+ struct rq *rq = this_rq();
3018 -+ struct task_struct *p, *t;
3019 -+ struct rq_flags rf;
3020 -+
3021 -+ if (!llist)
3022 -+ return;
3023 -+
3024 -+ /*
3025 -+ * rq::ttwu_pending is a racy indication of outstanding wakeups.
3026 -+ * Races are such that false-negatives are possible, since they
3027 -+ * are shorter lived than false-positives would be.
3028 -+ */
3029 -+ WRITE_ONCE(rq->ttwu_pending, 0);
3030 -+
3031 -+ rq_lock_irqsave(rq, &rf);
3032 -+ update_rq_clock(rq);
3033 -+
3034 -+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3035 -+ if (WARN_ON_ONCE(p->on_cpu))
3036 -+ smp_cond_load_acquire(&p->on_cpu, !VAL);
3037 -+
3038 -+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3039 -+ set_task_cpu(p, cpu_of(rq));
3040 -+
3041 -+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0);
3042 -+ }
3043 -+
3044 -+ rq_unlock_irqrestore(rq, &rf);
3045 -+}
3046 -+
3047 -+void send_call_function_single_ipi(int cpu)
3048 -+{
3049 -+ struct rq *rq = cpu_rq(cpu);
3050 -+
3051 -+ if (!set_nr_if_polling(rq->idle))
3052 -+ arch_send_call_function_single_ipi(cpu);
3053 -+ else
3054 -+ trace_sched_wake_idle_without_ipi(cpu);
3055 -+}
3056 -+
3057 -+/*
3058 -+ * Queue a task on the target CPU's wake_list and wake the CPU via IPI if
3059 -+ * necessary. The wakee CPU on receipt of the IPI will queue the task
3060 -+ * via sched_ttwu_pending() for activation so the wakee incurs the cost
3061 -+ * of the wakeup instead of the waker.
3062 -+ */
3063 -+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3064 -+{
3065 -+ struct rq *rq = cpu_rq(cpu);
3066 -+
3067 -+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
3068 -+
3069 -+ WRITE_ONCE(rq->ttwu_pending, 1);
3070 -+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
3071 -+}
3072 -+
3073 -+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
3074 -+{
3075 -+ /*
3076 -+ * Do not complicate things with the async wake_list while the CPU is
3077 -+ * in hotplug state.
3078 -+ */
3079 -+ if (!cpu_active(cpu))
3080 -+ return false;
3081 -+
3082 -+ /*
3083 -+ * If the CPU does not share cache, then queue the task on the
3084 -+ * remote rq's wakelist to avoid accessing remote data.
3085 -+ */
3086 -+ if (!cpus_share_cache(smp_processor_id(), cpu))
3087 -+ return true;
3088 -+
3089 -+ /*
3090 -+ * If the task is descheduling and the only running task on the
3091 -+ * CPU then use the wakelist to offload the task activation to
3092 -+ * the soon-to-be-idle CPU as the current CPU is likely busy.
3093 -+ * nr_running is checked to avoid unnecessary task stacking.
3094 -+ */
3095 -+ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
3096 -+ return true;
3097 -+
3098 -+ return false;
3099 -+}
3100 -+
3101 -+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3102 -+{
3103 -+ if (__is_defined(ALT_SCHED_TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
3104 -+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
3105 -+ return false;
3106 -+
3107 -+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
3108 -+ __ttwu_queue_wakelist(p, cpu, wake_flags);
3109 -+ return true;
3110 -+ }
3111 -+
3112 -+ return false;
3113 -+}
3114 -+
3115 -+void wake_up_if_idle(int cpu)
3116 -+{
3117 -+ struct rq *rq = cpu_rq(cpu);
3118 -+ unsigned long flags;
3119 -+
3120 -+ rcu_read_lock();
3121 -+
3122 -+ if (!is_idle_task(rcu_dereference(rq->curr)))
3123 -+ goto out;
3124 -+
3125 -+ if (set_nr_if_polling(rq->idle)) {
3126 -+ trace_sched_wake_idle_without_ipi(cpu);
3127 -+ } else {
3128 -+ raw_spin_lock_irqsave(&rq->lock, flags);
3129 -+ if (is_idle_task(rq->curr))
3130 -+ smp_send_reschedule(cpu);
3131 -+ /* Else CPU is not idle, do nothing here */
3132 -+ raw_spin_unlock_irqrestore(&rq->lock, flags);
3133 -+ }
3134 -+
3135 -+out:
3136 -+ rcu_read_unlock();
3137 -+}
3138 -+
3139 -+bool cpus_share_cache(int this_cpu, int that_cpu)
3140 -+{
3141 -+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
3142 -+}
3143 -+#else /* !CONFIG_SMP */
3144 -+
3145 -+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3146 -+{
3147 -+ return false;
3148 -+}
3149 -+
3150 -+#endif /* CONFIG_SMP */
3151 -+
3152 -+static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
3153 -+{
3154 -+ struct rq *rq = cpu_rq(cpu);
3155 -+
3156 -+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
3157 -+ return;
3158 -+
3159 -+ raw_spin_lock(&rq->lock);
3160 -+ update_rq_clock(rq);
3161 -+ ttwu_do_activate(rq, p, wake_flags);
3162 -+ raw_spin_unlock(&rq->lock);
3163 -+}
3164 -+
3165 -+/*
3166 -+ * Invoked from try_to_wake_up() to check whether the task can be woken up.
3167 -+ *
3168 -+ * The caller holds p::pi_lock if p != current or has preemption
3169 -+ * disabled when p == current.
3170 -+ *
3171 -+ * The rules of PREEMPT_RT saved_state:
3172 -+ *
3173 -+ * The related locking code always holds p::pi_lock when updating
3174 -+ * p::saved_state, which means the code is fully serialized in both cases.
3175 -+ *
3176 -+ * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
3177 -+ * bits set. This allows to distinguish all wakeup scenarios.
3178 -+ */
3179 -+static __always_inline
3180 -+bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
3181 -+{
3182 -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
3183 -+ WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
3184 -+ state != TASK_RTLOCK_WAIT);
3185 -+ }
3186 -+
3187 -+ if (READ_ONCE(p->__state) & state) {
3188 -+ *success = 1;
3189 -+ return true;
3190 -+ }
3191 -+
3192 -+#ifdef CONFIG_PREEMPT_RT
3193 -+ /*
3194 -+ * Saved state preserves the task state across blocking on
3195 -+ * an RT lock. If the state matches, set p::saved_state to
3196 -+ * TASK_RUNNING, but do not wake the task because it waits
3197 -+ * for a lock wakeup. Also indicate success because from
3198 -+ * the regular waker's point of view this has succeeded.
3199 -+ *
3200 -+ * After acquiring the lock the task will restore p::__state
3201 -+ * from p::saved_state which ensures that the regular
3202 -+ * wakeup is not lost. The restore will also set
3203 -+ * p::saved_state to TASK_RUNNING so any further tests will
3204 -+ * not result in false positives vs. @success
3205 -+ */
3206 -+ if (p->saved_state & state) {
3207 -+ p->saved_state = TASK_RUNNING;
3208 -+ *success = 1;
3209 -+ }
3210 -+#endif
3211 -+ return false;
3212 -+}
3213 -+
3214 -+/*
3215 -+ * Notes on Program-Order guarantees on SMP systems.
3216 -+ *
3217 -+ * MIGRATION
3218 -+ *
3219 -+ * The basic program-order guarantee on SMP systems is that when a task [t]
3220 -+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent
3221 -+ * execution on its new CPU [c1].
3222 -+ *
3223 -+ * For migration (of runnable tasks) this is provided by the following means:
3224 -+ *
3225 -+ * A) UNLOCK of the rq(c0)->lock scheduling out task t
3226 -+ * B) migration for t is required to synchronize *both* rq(c0)->lock and
3227 -+ * rq(c1)->lock (if not at the same time, then in that order).
3228 -+ * C) LOCK of the rq(c1)->lock scheduling in task
3229 -+ *
3230 -+ * Transitivity guarantees that B happens after A and C after B.
3231 -+ * Note: we only require RCpc transitivity.
3232 -+ * Note: the CPU doing B need not be c0 or c1
3233 -+ *
3234 -+ * Example:
3235 -+ *
3236 -+ * CPU0 CPU1 CPU2
3237 -+ *
3238 -+ * LOCK rq(0)->lock
3239 -+ * sched-out X
3240 -+ * sched-in Y
3241 -+ * UNLOCK rq(0)->lock
3242 -+ *
3243 -+ * LOCK rq(0)->lock // orders against CPU0
3244 -+ * dequeue X
3245 -+ * UNLOCK rq(0)->lock
3246 -+ *
3247 -+ * LOCK rq(1)->lock
3248 -+ * enqueue X
3249 -+ * UNLOCK rq(1)->lock
3250 -+ *
3251 -+ * LOCK rq(1)->lock // orders against CPU2
3252 -+ * sched-out Z
3253 -+ * sched-in X
3254 -+ * UNLOCK rq(1)->lock
3255 -+ *
3256 -+ *
3257 -+ * BLOCKING -- aka. SLEEP + WAKEUP
3258 -+ *
3259 -+ * For blocking we (obviously) need to provide the same guarantee as for
3260 -+ * migration. However the means are completely different as there is no lock
3261 -+ * chain to provide order. Instead we do:
3262 -+ *
3263 -+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
3264 -+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
3265 -+ *
3266 -+ * Example:
3267 -+ *
3268 -+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
3269 -+ *
3270 -+ * LOCK rq(0)->lock LOCK X->pi_lock
3271 -+ * dequeue X
3272 -+ * sched-out X
3273 -+ * smp_store_release(X->on_cpu, 0);
3274 -+ *
3275 -+ * smp_cond_load_acquire(&X->on_cpu, !VAL);
3276 -+ * X->state = WAKING
3277 -+ * set_task_cpu(X,2)
3278 -+ *
3279 -+ * LOCK rq(2)->lock
3280 -+ * enqueue X
3281 -+ * X->state = RUNNING
3282 -+ * UNLOCK rq(2)->lock
3283 -+ *
3284 -+ * LOCK rq(2)->lock // orders against CPU1
3285 -+ * sched-out Z
3286 -+ * sched-in X
3287 -+ * UNLOCK rq(2)->lock
3288 -+ *
3289 -+ * UNLOCK X->pi_lock
3290 -+ * UNLOCK rq(0)->lock
3291 -+ *
3292 -+ *
3293 -+ * However; for wakeups there is a second guarantee we must provide, namely we
3294 -+ * must observe the state that led to our wakeup. That is, not only must our
3295 -+ * task observe its own prior state, it must also observe the stores prior to
3296 -+ * its wakeup.
3297 -+ *
3298 -+ * This means that any means of doing remote wakeups must order the CPU doing
3299 -+ * the wakeup against the CPU the task is going to end up running on. This,
3300 -+ * however, is already required for the regular Program-Order guarantee above,
3301 -+ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
3302 -+ *
3303 -+ */
3304 -+
3305 -+/**
3306 -+ * try_to_wake_up - wake up a thread
3307 -+ * @p: the thread to be awakened
3308 -+ * @state: the mask of task states that can be woken
3309 -+ * @wake_flags: wake modifier flags (WF_*)
3310 -+ *
3311 -+ * Conceptually does:
3312 -+ *
3313 -+ * If (@state & @p->state) @p->state = TASK_RUNNING.
3314 -+ *
3315 -+ * If the task was not queued/runnable, also place it back on a runqueue.
3316 -+ *
3317 -+ * This function is atomic against schedule() which would dequeue the task.
3318 -+ *
3319 -+ * It issues a full memory barrier before accessing @p->state, see the comment
3320 -+ * with set_current_state().
3321 -+ *
3322 -+ * Uses p->pi_lock to serialize against concurrent wake-ups.
3323 -+ *
3324 -+ * Relies on p->pi_lock stabilizing:
3325 -+ * - p->sched_class
3326 -+ * - p->cpus_ptr
3327 -+ * - p->sched_task_group
3328 -+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3329 -+ *
3330 -+ * Tries really hard to only take one task_rq(p)->lock for performance.
3331 -+ * Takes rq->lock in:
3332 -+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
3333 -+ * - ttwu_queue() -- new rq, for enqueue of the task;
3334 -+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3335 -+ *
3336 -+ * As a consequence we race really badly with just about everything. See the
3337 -+ * many memory barriers and their comments for details.
3338 -+ *
3339 -+ * Return: %true if @p->state changes (an actual wakeup was done),
3340 -+ * %false otherwise.
3341 -+ */
3342 -+static int try_to_wake_up(struct task_struct *p, unsigned int state,
3343 -+ int wake_flags)
3344 -+{
3345 -+ unsigned long flags;
3346 -+ int cpu, success = 0;
3347 -+
3348 -+ preempt_disable();
3349 -+ if (p == current) {
3350 -+ /*
3351 -+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3352 -+ * == smp_processor_id()'. Together this means we can special
3353 -+ * case the whole 'p->on_rq && ttwu_runnable()' case below
3354 -+ * without taking any locks.
3355 -+ *
3356 -+ * In particular:
3357 -+ * - we rely on Program-Order guarantees for all the ordering,
3358 -+ * - we're serialized against set_special_state() by virtue of
3359 -+ * it disabling IRQs (this allows not taking ->pi_lock).
3360 -+ */
3361 -+ if (!ttwu_state_match(p, state, &success))
3362 -+ goto out;
3363 -+
3364 -+ trace_sched_waking(p);
3365 -+ WRITE_ONCE(p->__state, TASK_RUNNING);
3366 -+ trace_sched_wakeup(p);
3367 -+ goto out;
3368 -+ }
3369 -+
3370 -+ /*
3371 -+ * If we are going to wake up a thread waiting for CONDITION we
3372 -+ * need to ensure that CONDITION=1 done by the caller can not be
3373 -+ * reordered with p->state check below. This pairs with smp_store_mb()
3374 -+ * in set_current_state() that the waiting thread does.
3375 -+ */
3376 -+ raw_spin_lock_irqsave(&p->pi_lock, flags);
3377 -+ smp_mb__after_spinlock();
3378 -+ if (!ttwu_state_match(p, state, &success))
3379 -+ goto unlock;
3380 -+
3381 -+ trace_sched_waking(p);
3382 -+
3383 -+ /*
3384 -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
3385 -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
3386 -+ * in smp_cond_load_acquire() below.
3387 -+ *
3388 -+ * sched_ttwu_pending() try_to_wake_up()
3389 -+ * STORE p->on_rq = 1 LOAD p->state
3390 -+ * UNLOCK rq->lock
3391 -+ *
3392 -+ * __schedule() (switch to task 'p')
3393 -+ * LOCK rq->lock smp_rmb();
3394 -+ * smp_mb__after_spinlock();
3395 -+ * UNLOCK rq->lock
3396 -+ *
3397 -+ * [task p]
3398 -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
3399 -+ *
3400 -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
3401 -+ * __schedule(). See the comment for smp_mb__after_spinlock().
3402 -+ *
3403 -+ * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
3404 -+ */
3405 -+ smp_rmb();
3406 -+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3407 -+ goto unlock;
3408 -+
3409 -+#ifdef CONFIG_SMP
3410 -+ /*
3411 -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
3412 -+ * possible to, falsely, observe p->on_cpu == 0.
3413 -+ *
3414 -+ * One must be running (->on_cpu == 1) in order to remove oneself
3415 -+ * from the runqueue.
3416 -+ *
3417 -+ * __schedule() (switch to task 'p') try_to_wake_up()
3418 -+ * STORE p->on_cpu = 1 LOAD p->on_rq
3419 -+ * UNLOCK rq->lock
3420 -+ *
3421 -+ * __schedule() (put 'p' to sleep)
3422 -+ * LOCK rq->lock smp_rmb();
3423 -+ * smp_mb__after_spinlock();
3424 -+ * STORE p->on_rq = 0 LOAD p->on_cpu
3425 -+ *
3426 -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
3427 -+ * __schedule(). See the comment for smp_mb__after_spinlock().
3428 -+ *
3429 -+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3430 -+ * schedule()'s deactivate_task() has 'happened' and p will no longer
3431 -+ * care about its own p->state. See the comment in __schedule().
3432 -+ */
3433 -+ smp_acquire__after_ctrl_dep();
3434 -+
3435 -+ /*
3436 -+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3437 -+ * == 0), which means we need to do an enqueue, change p->state to
3438 -+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
3439 -+ * enqueue, such as ttwu_queue_wakelist().
3440 -+ */
3441 -+ WRITE_ONCE(p->__state, TASK_WAKING);
3442 -+
3443 -+ /*
3444 -+ * If the owning (remote) CPU is still in the middle of schedule() with
3445 -+ * this task as prev, consider queueing p on the remote CPU's wake_list,
3446 -+ * which potentially sends an IPI instead of spinning on p->on_cpu to
3447 -+ * let the waker make forward progress. This is safe because IRQs are
3448 -+ * disabled and the IPI will deliver after on_cpu is cleared.
3449 -+ *
3450 -+ * Ensure we load task_cpu(p) after p->on_cpu:
3451 -+ *
3452 -+ * set_task_cpu(p, cpu);
3453 -+ * STORE p->cpu = @cpu
3454 -+ * __schedule() (switch to task 'p')
3455 -+ * LOCK rq->lock
3456 -+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
3457 -+ * STORE p->on_cpu = 1 LOAD p->cpu
3458 -+ *
3459 -+ * to ensure we observe the correct CPU on which the task is currently
3460 -+ * scheduling.
3461 -+ */
3462 -+ if (smp_load_acquire(&p->on_cpu) &&
3463 -+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3464 -+ goto unlock;
3465 -+
3466 -+ /*
3467 -+ * If the owning (remote) CPU is still in the middle of schedule() with
3468 -+ * this task as prev, wait until it's done referencing the task.
3469 -+ *
3470 -+ * Pairs with the smp_store_release() in finish_task().
3471 -+ *
3472 -+ * This ensures that tasks getting woken will be fully ordered against
3473 -+ * their previous state and preserve Program Order.
3474 -+ */
3475 -+ smp_cond_load_acquire(&p->on_cpu, !VAL);
3476 -+
3477 -+ sched_task_ttwu(p);
3478 -+
3479 -+ cpu = select_task_rq(p);
3480 -+
3481 -+ if (cpu != task_cpu(p)) {
3482 -+ if (p->in_iowait) {
3483 -+ delayacct_blkio_end(p);
3484 -+ atomic_dec(&task_rq(p)->nr_iowait);
3485 -+ }
3486 -+
3487 -+ wake_flags |= WF_MIGRATED;
3488 -+ psi_ttwu_dequeue(p);
3489 -+ set_task_cpu(p, cpu);
3490 -+ }
3491 -+#else
3492 -+ cpu = task_cpu(p);
3493 -+#endif /* CONFIG_SMP */
3494 -+
3495 -+ ttwu_queue(p, cpu, wake_flags);
3496 -+unlock:
3497 -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3498 -+out:
3499 -+ if (success)
3500 -+ ttwu_stat(p, task_cpu(p), wake_flags);
3501 -+ preempt_enable();
3502 -+
3503 -+ return success;
3504 -+}
3505 -+
3506 -+/**
3507 -+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3508 -+ * @p: Process for which the function is to be invoked, can be @current.
3509 -+ * @func: Function to invoke.
3510 -+ * @arg: Argument to function.
3511 -+ *
3512 -+ * If the specified task can be quickly locked into a definite state
3513 -+ * (either sleeping or on a given runqueue), arrange to keep it in that
3514 -+ * state while invoking @func(@arg). This function can use ->on_rq and
3515 -+ * task_curr() to work out what the state is, if required. Given that
3516 -+ * @func can be invoked with a runqueue lock held, it had better be quite
3517 -+ * lightweight.
3518 -+ *
3519 -+ * Returns:
3520 -+ * @false if the task slipped out from under the locks.
3521 -+ * @true if the task was locked onto a runqueue or is sleeping.
3522 -+ * However, @func can override this by returning @false.
3523 -+ */
3524 -+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
3525 -+{
3526 -+ struct rq_flags rf;
3527 -+ bool ret = false;
3528 -+ struct rq *rq;
3529 -+
3530 -+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3531 -+ if (p->on_rq) {
3532 -+ rq = __task_rq_lock(p, &rf);
3533 -+ if (task_rq(p) == rq)
3534 -+ ret = func(p, arg);
3535 -+ __task_rq_unlock(rq, &rf);
3536 -+ } else {
3537 -+ switch (READ_ONCE(p->__state)) {
3538 -+ case TASK_RUNNING:
3539 -+ case TASK_WAKING:
3540 -+ break;
3541 -+ default:
3542 -+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3543 -+ if (!p->on_rq)
3544 -+ ret = func(p, arg);
3545 -+ }
3546 -+ }
3547 -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3548 -+ return ret;
3549 -+}
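A hypothetical caller sketch for the helper above (illustrative only; check_queued()
and report_task() are made-up names, not part of this patch):

	static bool check_queued(struct task_struct *t, void *arg)
	{
		int *queued = arg;

		*queued = task_on_rq_queued(t);
		return true;	/* accept the "locked down" snapshot */
	}

	static void report_task(struct task_struct *p)
	{
		int queued = 0;

		if (try_invoke_on_locked_down_task(p, check_queued, &queued))
			pr_info("%s/%d queued=%d\n", p->comm, p->pid, queued);
	}

The callback runs either under the task's runqueue lock or under p->pi_lock with
the task known not to be on a runqueue, so it must stay lightweight and must not
sleep.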
3550 -+
3551 -+/**
3552 -+ * wake_up_process - Wake up a specific process
3553 -+ * @p: The process to be woken up.
3554 -+ *
3555 -+ * Attempt to wake up the nominated process and move it to the set of runnable
3556 -+ * processes.
3557 -+ *
3558 -+ * Return: 1 if the process was woken up, 0 if it was already running.
3559 -+ *
3560 -+ * This function executes a full memory barrier before accessing the task state.
3561 -+ */
3562 -+int wake_up_process(struct task_struct *p)
3563 -+{
3564 -+ return try_to_wake_up(p, TASK_NORMAL, 0);
3565 -+}
3566 -+EXPORT_SYMBOL(wake_up_process);
3567 -+
3568 -+int wake_up_state(struct task_struct *p, unsigned int state)
3569 -+{
3570 -+ return try_to_wake_up(p, state, 0);
3571 -+}
3572 -+
3573 -+/*
3574 -+ * Perform scheduler related setup for a newly forked process p.
3575 -+ * p is forked by current.
3576 -+ *
3577 -+ * __sched_fork() is basic setup used by init_idle() too:
3578 -+ */
3579 -+static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p)
3580 -+{
3581 -+ p->on_rq = 0;
3582 -+ p->on_cpu = 0;
3583 -+ p->utime = 0;
3584 -+ p->stime = 0;
3585 -+ p->sched_time = 0;
3586 -+
3587 -+#ifdef CONFIG_PREEMPT_NOTIFIERS
3588 -+ INIT_HLIST_HEAD(&p->preempt_notifiers);
3589 -+#endif
3590 -+
3591 -+#ifdef CONFIG_COMPACTION
3592 -+ p->capture_control = NULL;
3593 -+#endif
3594 -+#ifdef CONFIG_SMP
3595 -+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
3596 -+#endif
3597 -+}
3598 -+
3599 -+/*
3600 -+ * fork()/clone()-time setup:
3601 -+ */
3602 -+int sched_fork(unsigned long clone_flags, struct task_struct *p)
3603 -+{
3604 -+ unsigned long flags;
3605 -+ struct rq *rq;
3606 -+
3607 -+ __sched_fork(clone_flags, p);
3608 -+ /*
3609 -+ * We mark the process as NEW here. This guarantees that
3610 -+ * nobody will actually run it, and a signal or other external
3611 -+ * event cannot wake it up and insert it on the runqueue either.
3612 -+ */
3613 -+ p->__state = TASK_NEW;
3614 -+
3615 -+ /*
3616 -+ * Make sure we do not leak PI boosting priority to the child.
3617 -+ */
3618 -+ p->prio = current->normal_prio;
3619 -+
3620 -+ /*
3621 -+ * Revert to default priority/policy on fork if requested.
3622 -+ */
3623 -+ if (unlikely(p->sched_reset_on_fork)) {
3624 -+ if (task_has_rt_policy(p)) {
3625 -+ p->policy = SCHED_NORMAL;
3626 -+ p->static_prio = NICE_TO_PRIO(0);
3627 -+ p->rt_priority = 0;
3628 -+ } else if (PRIO_TO_NICE(p->static_prio) < 0)
3629 -+ p->static_prio = NICE_TO_PRIO(0);
3630 -+
3631 -+ p->prio = p->normal_prio = p->static_prio;
3632 -+
3633 -+ /*
3634 -+ * We don't need the reset flag anymore after the fork. It has
3635 -+ * fulfilled its duty:
3636 -+ */
3637 -+ p->sched_reset_on_fork = 0;
3638 -+ }
3639 -+
3640 -+ /*
3641 -+ * The child is not yet in the pid-hash so no cgroup attach races,
3642 -+ * and the cgroup is pinned to this child because cgroup_fork()
3643 -+ * runs before sched_fork().
3644 -+ *
3645 -+ * Silence PROVE_RCU.
3646 -+ */
3647 -+ raw_spin_lock_irqsave(&p->pi_lock, flags);
3648 -+ /*
3649 -+ * Share the timeslice between parent and child, thus the
3650 -+ * total amount of pending timeslices in the system doesn't change,
3651 -+ * resulting in more scheduling fairness.
3652 -+ */
3653 -+ rq = this_rq();
3654 -+ raw_spin_lock(&rq->lock);
3655 -+
3656 -+ rq->curr->time_slice /= 2;
3657 -+ p->time_slice = rq->curr->time_slice;
3658 -+#ifdef CONFIG_SCHED_HRTICK
3659 -+ hrtick_start(rq, rq->curr->time_slice);
3660 -+#endif
3661 -+
3662 -+ if (p->time_slice < RESCHED_NS) {
3663 -+ p->time_slice = sched_timeslice_ns;
3664 -+ resched_curr(rq);
3665 -+ }
3666 -+ sched_task_fork(p, rq);
3667 -+ raw_spin_unlock(&rq->lock);
3668 -+
3669 -+ rseq_migrate(p);
3670 -+ /*
3671 -+ * We're setting the CPU for the first time, we don't migrate,
3672 -+ * so use __set_task_cpu().
3673 -+ */
3674 -+ __set_task_cpu(p, cpu_of(rq));
3675 -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3676 -+
3677 -+#ifdef CONFIG_SCHED_INFO
3678 -+ if (unlikely(sched_info_on()))
3679 -+ memset(&p->sched_info, 0, sizeof(p->sched_info));
3680 -+#endif
3681 -+ init_task_preempt_count(p);
3682 -+
3683 -+ return 0;
3684 -+}
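Illustrative arithmetic for the timeslice split above (the numbers are assumptions
chosen for the example, not taken from this patch):

	/* parent has 3,000,000 ns left at fork:
	 *   rq->curr->time_slice = 3,000,000 / 2 = 1,500,000 ns
	 *   p->time_slice        = 1,500,000 ns
	 * If the halved value ends up below RESCHED_NS, the child is instead
	 * given a full sched_timeslice_ns and the parent is marked for
	 * rescheduling, so the budget is never split into unusably small pieces.
	 */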
3685 -+
3686 -+void sched_post_fork(struct task_struct *p) {}
3687 -+
3688 -+#ifdef CONFIG_SCHEDSTATS
3689 -+
3690 -+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
3691 -+
3692 -+static void set_schedstats(bool enabled)
3693 -+{
3694 -+ if (enabled)
3695 -+ static_branch_enable(&sched_schedstats);
3696 -+ else
3697 -+ static_branch_disable(&sched_schedstats);
3698 -+}
3699 -+
3700 -+void force_schedstat_enabled(void)
3701 -+{
3702 -+ if (!schedstat_enabled()) {
3703 -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
3704 -+ static_branch_enable(&sched_schedstats);
3705 -+ }
3706 -+}
3707 -+
3708 -+static int __init setup_schedstats(char *str)
3709 -+{
3710 -+ int ret = 0;
3711 -+ if (!str)
3712 -+ goto out;
3713 -+
3714 -+ if (!strcmp(str, "enable")) {
3715 -+ set_schedstats(true);
3716 -+ ret = 1;
3717 -+ } else if (!strcmp(str, "disable")) {
3718 -+ set_schedstats(false);
3719 -+ ret = 1;
3720 -+ }
3721 -+out:
3722 -+ if (!ret)
3723 -+ pr_warn("Unable to parse schedstats=\n");
3724 -+
3725 -+ return ret;
3726 -+}
3727 -+__setup("schedstats=", setup_schedstats);
3728 -+
3729 -+#ifdef CONFIG_PROC_SYSCTL
3730 -+int sysctl_schedstats(struct ctl_table *table, int write,
3731 -+ void __user *buffer, size_t *lenp, loff_t *ppos)
3732 -+{
3733 -+ struct ctl_table t;
3734 -+ int err;
3735 -+ int state = static_branch_likely(&sched_schedstats);
3736 -+
3737 -+ if (write && !capable(CAP_SYS_ADMIN))
3738 -+ return -EPERM;
3739 -+
3740 -+ t = *table;
3741 -+ t.data = &state;
3742 -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3743 -+ if (err < 0)
3744 -+ return err;
3745 -+ if (write)
3746 -+ set_schedstats(state);
3747 -+ return err;
3748 -+}
3749 -+#endif /* CONFIG_PROC_SYSCTL */
3750 -+#endif /* CONFIG_SCHEDSTATS */
3751 -+
3752 -+/*
3753 -+ * wake_up_new_task - wake up a newly created task for the first time.
3754 -+ *
3755 -+ * This function will do some initial scheduler statistics housekeeping
3756 -+ * that must be done for every newly created context, then puts the task
3757 -+ * on the runqueue and wakes it.
3758 -+ */
3759 -+void wake_up_new_task(struct task_struct *p)
3760 -+{
3761 -+ unsigned long flags;
3762 -+ struct rq *rq;
3763 -+
3764 -+ raw_spin_lock_irqsave(&p->pi_lock, flags);
3765 -+ WRITE_ONCE(p->__state, TASK_RUNNING);
3766 -+ rq = cpu_rq(select_task_rq(p));
3767 -+#ifdef CONFIG_SMP
3768 -+ rseq_migrate(p);
3769 -+ /*
3770 -+ * Fork balancing, do it here and not earlier because:
3771 -+ * - cpus_ptr can change in the fork path
3772 -+ * - any previously selected CPU might disappear through hotplug
3773 -+ *
3774 -+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
3775 -+ * as we're not fully set-up yet.
3776 -+ */
3777 -+ __set_task_cpu(p, cpu_of(rq));
3778 -+#endif
3779 -+
3780 -+ raw_spin_lock(&rq->lock);
3781 -+ update_rq_clock(rq);
3782 -+
3783 -+ activate_task(p, rq);
3784 -+ trace_sched_wakeup_new(p);
3785 -+ check_preempt_curr(rq);
3786 -+
3787 -+ raw_spin_unlock(&rq->lock);
3788 -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3789 -+}
3790 -+
3791 -+#ifdef CONFIG_PREEMPT_NOTIFIERS
3792 -+
3793 -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
3794 -+
3795 -+void preempt_notifier_inc(void)
3796 -+{
3797 -+ static_branch_inc(&preempt_notifier_key);
3798 -+}
3799 -+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
3800 -+
3801 -+void preempt_notifier_dec(void)
3802 -+{
3803 -+ static_branch_dec(&preempt_notifier_key);
3804 -+}
3805 -+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
3806 -+
3807 -+/**
3808 -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
3809 -+ * @notifier: notifier struct to register
3810 -+ */
3811 -+void preempt_notifier_register(struct preempt_notifier *notifier)
3812 -+{
3813 -+ if (!static_branch_unlikely(&preempt_notifier_key))
3814 -+ WARN(1, "registering preempt_notifier while notifiers disabled\n");
3815 -+
3816 -+ hlist_add_head(&notifier->link, &current->preempt_notifiers);
3817 -+}
3818 -+EXPORT_SYMBOL_GPL(preempt_notifier_register);
3819 -+
3820 -+/**
3821 -+ * preempt_notifier_unregister - no longer interested in preemption notifications
3822 -+ * @notifier: notifier struct to unregister
3823 -+ *
3824 -+ * This is *not* safe to call from within a preemption notifier.
3825 -+ */
3826 -+void preempt_notifier_unregister(struct preempt_notifier *notifier)
3827 -+{
3828 -+ hlist_del(&notifier->link);
3829 -+}
3830 -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
3831 -+
3832 -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
3833 -+{
3834 -+ struct preempt_notifier *notifier;
3835 -+
3836 -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
3837 -+ notifier->ops->sched_in(notifier, raw_smp_processor_id());
3838 -+}
3839 -+
3840 -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3841 -+{
3842 -+ if (static_branch_unlikely(&preempt_notifier_key))
3843 -+ __fire_sched_in_preempt_notifiers(curr);
3844 -+}
3845 -+
3846 -+static void
3847 -+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
3848 -+ struct task_struct *next)
3849 -+{
3850 -+ struct preempt_notifier *notifier;
3851 -+
3852 -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
3853 -+ notifier->ops->sched_out(notifier, next);
3854 -+}
3855 -+
3856 -+static __always_inline void
3857 -+fire_sched_out_preempt_notifiers(struct task_struct *curr,
3858 -+ struct task_struct *next)
3859 -+{
3860 -+ if (static_branch_unlikely(&preempt_notifier_key))
3861 -+ __fire_sched_out_preempt_notifiers(curr, next);
3862 -+}
3863 -+
3864 -+#else /* !CONFIG_PREEMPT_NOTIFIERS */
3865 -+
3866 -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3867 -+{
3868 -+}
3869 -+
3870 -+static inline void
3871 -+fire_sched_out_preempt_notifiers(struct task_struct *curr,
3872 -+ struct task_struct *next)
3873 -+{
3874 -+}
3875 -+
3876 -+#endif /* CONFIG_PREEMPT_NOTIFIERS */
3877 -+
3878 -+static inline void prepare_task(struct task_struct *next)
3879 -+{
3880 -+ /*
3881 -+ * Claim the task as running, we do this before switching to it
3882 -+ * such that any running task will have this set.
3883 -+ *
3884 -+ * See the ttwu() WF_ON_CPU case and its ordering comment.
3885 -+ */
3886 -+ WRITE_ONCE(next->on_cpu, 1);
3887 -+}
3888 -+
3889 -+static inline void finish_task(struct task_struct *prev)
3890 -+{
3891 -+#ifdef CONFIG_SMP
3892 -+ /*
3893 -+ * This must be the very last reference to @prev from this CPU. After
3894 -+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
3895 -+ * must ensure this doesn't happen until the switch is completely
3896 -+ * finished.
3897 -+ *
3898 -+ * In particular, the load of prev->state in finish_task_switch() must
3899 -+ * happen before this.
3900 -+ *
3901 -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
3902 -+ */
3903 -+ smp_store_release(&prev->on_cpu, 0);
3904 -+#else
3905 -+ prev->on_cpu = 0;
3906 -+#endif
3907 -+}
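A stand-alone user-space illustration of the release/acquire pairing described
above, written with C11 atomics in place of smp_store_release() and
smp_cond_load_acquire() (a sketch for illustration only, not part of this patch):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int on_cpu = 1;	/* stands in for prev->on_cpu */
	static int prev_state;		/* state the waker must observe */

	static void *prev_cpu(void *arg)
	{
		prev_state = 42;	/* everything before the release store ... */
		atomic_store_explicit(&on_cpu, 0, memory_order_release);
		return NULL;		/* ... is visible once on_cpu reads as 0 */
	}

	static void *waking_cpu(void *arg)
	{
		/* poor man's smp_cond_load_acquire(&p->on_cpu, !VAL) */
		while (atomic_load_explicit(&on_cpu, memory_order_acquire))
			;
		printf("observed prev_state=%d\n", prev_state);	/* always 42 */
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, waking_cpu, NULL);
		pthread_create(&b, NULL, prev_cpu, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		return 0;
	}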
3908 -+
3909 -+#ifdef CONFIG_SMP
3910 -+
3911 -+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
3912 -+{
3913 -+ void (*func)(struct rq *rq);
3914 -+ struct callback_head *next;
3915 -+
3916 -+ lockdep_assert_held(&rq->lock);
3917 -+
3918 -+ while (head) {
3919 -+ func = (void (*)(struct rq *))head->func;
3920 -+ next = head->next;
3921 -+ head->next = NULL;
3922 -+ head = next;
3923 -+
3924 -+ func(rq);
3925 -+ }
3926 -+}
3927 -+
3928 -+static void balance_push(struct rq *rq);
3929 -+
3930 -+struct callback_head balance_push_callback = {
3931 -+ .next = NULL,
3932 -+ .func = (void (*)(struct callback_head *))balance_push,
3933 -+};
3934 -+
3935 -+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
3936 -+{
3937 -+ struct callback_head *head = rq->balance_callback;
3938 -+
3939 -+ if (head) {
3940 -+ lockdep_assert_held(&rq->lock);
3941 -+ rq->balance_callback = NULL;
3942 -+ }
3943 -+
3944 -+ return head;
3945 -+}
3946 -+
3947 -+static void __balance_callbacks(struct rq *rq)
3948 -+{
3949 -+ do_balance_callbacks(rq, splice_balance_callbacks(rq));
3950 -+}
3951 -+
3952 -+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
3953 -+{
3954 -+ unsigned long flags;
3955 -+
3956 -+ if (unlikely(head)) {
3957 -+ raw_spin_lock_irqsave(&rq->lock, flags);
3958 -+ do_balance_callbacks(rq, head);
3959 -+ raw_spin_unlock_irqrestore(&rq->lock, flags);
3960 -+ }
3961 -+}
3962 -+
3963 -+#else
3964 -+
3965 -+static inline void __balance_callbacks(struct rq *rq)
3966 -+{
3967 -+}
3968 -+
3969 -+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
3970 -+{
3971 -+ return NULL;
3972 -+}
3973 -+
3974 -+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
3975 -+{
3976 -+}
3977 -+
3978 -+#endif
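A stand-alone illustration (not kernel code) of the splice-then-run pattern used by
splice_balance_callbacks()/do_balance_callbacks() above: detach the whole list,
then walk it, clearing ->next before each call so a callback may safely re-queue
itself:

	#include <stdio.h>
	#include <stddef.h>

	struct cb {
		struct cb *next;
		void (*func)(void);
	};

	static struct cb *pending;	/* plays the role of rq->balance_callback */

	static void say_hello(void) { puts("hello"); }
	static void say_world(void) { puts("world"); }

	static void queue_cb(struct cb *cb, void (*func)(void))
	{
		cb->func = func;
		cb->next = pending;	/* push onto the singly linked list */
		pending = cb;
	}

	static void run_cbs(void)
	{
		struct cb *head = pending;	/* "splice": steal the whole list ... */

		pending = NULL;			/* ... so new callbacks can queue up */

		while (head) {
			struct cb *next = head->next;

			head->next = NULL;	/* allow the callback to re-queue itself */
			head->func();
			head = next;
		}
	}

	int main(void)
	{
		struct cb a, b;

		queue_cb(&a, say_hello);
		queue_cb(&b, say_world);
		run_cbs();		/* prints "world" then "hello" (LIFO order) */
		return 0;
	}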
3979 -+
3980 -+static inline void
3981 -+prepare_lock_switch(struct rq *rq, struct task_struct *next)
3982 -+{
3983 -+ /*
3984 -+ * Since the runqueue lock will be released by the next
3985 -+ * task (which is an invalid locking op but in the case
3986 -+ * of the scheduler it's an obvious special-case), we
3987 -+ * do an early lockdep release here:
3988 -+ */
3989 -+ spin_release(&rq->lock.dep_map, _THIS_IP_);
3990 -+#ifdef CONFIG_DEBUG_SPINLOCK
3991 -+ /* this is a valid case when another task releases the spinlock */
3992 -+ rq->lock.owner = next;
3993 -+#endif
3994 -+}
3995 -+
3996 -+static inline void finish_lock_switch(struct rq *rq)
3997 -+{
3998 -+ /*
3999 -+ * If we are tracking spinlock dependencies then we have to
4000 -+ * fix up the runqueue lock - which gets 'carried over' from
4001 -+ * prev into current:
4002 -+ */
4003 -+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
4004 -+ __balance_callbacks(rq);
4005 -+ raw_spin_unlock_irq(&rq->lock);
4006 -+}
4007 -+
4008 -+/*
4009 -+ * NOP if the arch has not defined these:
4010 -+ */
4011 -+
4012 -+#ifndef prepare_arch_switch
4013 -+# define prepare_arch_switch(next) do { } while (0)
4014 -+#endif
4015 -+
4016 -+#ifndef finish_arch_post_lock_switch
4017 -+# define finish_arch_post_lock_switch() do { } while (0)
4018 -+#endif
4019 -+
4020 -+static inline void kmap_local_sched_out(void)
4021 -+{
4022 -+#ifdef CONFIG_KMAP_LOCAL
4023 -+ if (unlikely(current->kmap_ctrl.idx))
4024 -+ __kmap_local_sched_out();
4025 -+#endif
4026 -+}
4027 -+
4028 -+static inline void kmap_local_sched_in(void)
4029 -+{
4030 -+#ifdef CONFIG_KMAP_LOCAL
4031 -+ if (unlikely(current->kmap_ctrl.idx))
4032 -+ __kmap_local_sched_in();
4033 -+#endif
4034 -+}
4035 -+
4036 -+/**
4037 -+ * prepare_task_switch - prepare to switch tasks
4038 -+ * @rq: the runqueue preparing to switch
4039 -+ * @next: the task we are going to switch to.
4040 -+ *
4041 -+ * This is called with the rq lock held and interrupts off. It must
4042 -+ * be paired with a subsequent finish_task_switch after the context
4043 -+ * switch.
4044 -+ *
4045 -+ * prepare_task_switch sets up locking and calls architecture specific
4046 -+ * hooks.
4047 -+ */
4048 -+static inline void
4049 -+prepare_task_switch(struct rq *rq, struct task_struct *prev,
4050 -+ struct task_struct *next)
4051 -+{
4052 -+ kcov_prepare_switch(prev);
4053 -+ sched_info_switch(rq, prev, next);
4054 -+ perf_event_task_sched_out(prev, next);
4055 -+ rseq_preempt(prev);
4056 -+ fire_sched_out_preempt_notifiers(prev, next);
4057 -+ kmap_local_sched_out();
4058 -+ prepare_task(next);
4059 -+ prepare_arch_switch(next);
4060 -+}
4061 -+
4062 -+/**
4063 -+ * finish_task_switch - clean up after a task-switch
4064 -+ * @rq: runqueue associated with task-switch
4065 -+ * @prev: the thread we just switched away from.
4066 -+ *
4067 -+ * finish_task_switch must be called after the context switch, paired
4068 -+ * with a prepare_task_switch call before the context switch.
4069 -+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
4070 -+ * and do any other architecture-specific cleanup actions.
4071 -+ *
4072 -+ * Note that we may have delayed dropping an mm in context_switch(). If
4073 -+ * so, we finish that here outside of the runqueue lock. (Doing it
4074 -+ * with the lock held can cause deadlocks; see schedule() for
4075 -+ * details.)
4076 -+ *
4077 -+ * The context switch has flipped the stack from under us and restored the
4078 -+ * local variables which were saved when this task called schedule() in the
4079 -+ * past. prev == current is still correct but we need to recalculate this_rq
4080 -+ * because prev may have moved to another CPU.
4081 -+ */
4082 -+static struct rq *finish_task_switch(struct task_struct *prev)
4083 -+ __releases(rq->lock)
4084 -+{
4085 -+ struct rq *rq = this_rq();
4086 -+ struct mm_struct *mm = rq->prev_mm;
4087 -+ long prev_state;
4088 -+
4089 -+ /*
4090 -+ * The previous task will have left us with a preempt_count of 2
4091 -+ * because it left us after:
4092 -+ *
4093 -+ * schedule()
4094 -+ * preempt_disable(); // 1
4095 -+ * __schedule()
4096 -+ * raw_spin_lock_irq(&rq->lock) // 2
4097 -+ *
4098 -+ * Also, see FORK_PREEMPT_COUNT.
4099 -+ */
4100 -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
4101 -+ "corrupted preempt_count: %s/%d/0x%x\n",
4102 -+ current->comm, current->pid, preempt_count()))
4103 -+ preempt_count_set(FORK_PREEMPT_COUNT);
4104 -+
4105 -+ rq->prev_mm = NULL;
4106 -+
4107 -+ /*
4108 -+ * A task struct has one reference for the use as "current".
4109 -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
4110 -+ * schedule one last time. The schedule call will never return, and
4111 -+ * the scheduled task must drop that reference.
4112 -+ *
4113 -+ * We must observe prev->state before clearing prev->on_cpu (in
4114 -+ * finish_task), otherwise a concurrent wakeup can get prev
4115 -+ * running on another CPU and we could race with its RUNNING -> DEAD
4116 -+ * transition, resulting in a double drop.
4117 -+ */
4118 -+ prev_state = READ_ONCE(prev->__state);
4119 -+ vtime_task_switch(prev);
4120 -+ perf_event_task_sched_in(prev, current);
4121 -+ finish_task(prev);
4122 -+ tick_nohz_task_switch();
4123 -+ finish_lock_switch(rq);
4124 -+ finish_arch_post_lock_switch();
4125 -+ kcov_finish_switch(current);
4126 -+ /*
4127 -+ * kmap_local_sched_out() is invoked with rq::lock held and
4128 -+ * interrupts disabled. There is no requirement for that, but the
4129 -+ * sched out code does not have an interrupt enabled section.
4130 -+ * Restoring the maps on sched in does not require interrupts being
4131 -+ * disabled either.
4132 -+ */
4133 -+ kmap_local_sched_in();
4134 -+
4135 -+ fire_sched_in_preempt_notifiers(current);
4136 -+ /*
4137 -+ * When switching through a kernel thread, the loop in
4138 -+ * membarrier_{private,global}_expedited() may have observed that
4139 -+ * kernel thread and not issued an IPI. It is therefore possible to
4140 -+ * schedule between user->kernel->user threads without passing through
4141 -+ * switch_mm(). Membarrier requires a barrier after storing to
4142 -+ * rq->curr, before returning to userspace, so provide them here:
4143 -+ *
4144 -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
4145 -+ * provided by mmdrop(),
4146 -+ * - a sync_core for SYNC_CORE.
4147 -+ */
4148 -+ if (mm) {
4149 -+ membarrier_mm_sync_core_before_usermode(mm);
4150 -+ mmdrop(mm);
4151 -+ }
4152 -+ if (unlikely(prev_state == TASK_DEAD)) {
4153 -+ /*
4154 -+ * Remove function-return probe instances associated with this
4155 -+ * task and put them back on the free list.
4156 -+ */
4157 -+ kprobe_flush_task(prev);
4158 -+
4159 -+ /* Task is done with its stack. */
4160 -+ put_task_stack(prev);
4161 -+
4162 -+ put_task_struct_rcu_user(prev);
4163 -+ }
4164 -+
4165 -+ return rq;
4166 -+}
4167 -+
4168 -+/**
4169 -+ * schedule_tail - first thing a freshly forked thread must call.
4170 -+ * @prev: the thread we just switched away from.
4171 -+ */
4172 -+asmlinkage __visible void schedule_tail(struct task_struct *prev)
4173 -+ __releases(rq->lock)
4174 -+{
4175 -+ /*
4176 -+ * New tasks start with FORK_PREEMPT_COUNT, see there and
4177 -+ * finish_task_switch() for details.
4178 -+ *
4179 -+ * finish_task_switch() will drop rq->lock() and lower preempt_count
4180 -+ * and the preempt_enable() will end up enabling preemption (on
4181 -+ * PREEMPT_COUNT kernels).
4182 -+ */
4183 -+
4184 -+ finish_task_switch(prev);
4185 -+ preempt_enable();
4186 -+
4187 -+ if (current->set_child_tid)
4188 -+ put_user(task_pid_vnr(current), current->set_child_tid);
4189 -+
4190 -+ calculate_sigpending();
4191 -+}
4192 -+
4193 -+/*
4194 -+ * context_switch - switch to the new MM and the new thread's register state.
4195 -+ */
4196 -+static __always_inline struct rq *
4197 -+context_switch(struct rq *rq, struct task_struct *prev,
4198 -+ struct task_struct *next)
4199 -+{
4200 -+ prepare_task_switch(rq, prev, next);
4201 -+
4202 -+ /*
4203 -+ * For paravirt, this is coupled with an exit in switch_to to
4204 -+ * combine the page table reload and the switch backend into
4205 -+ * one hypercall.
4206 -+ */
4207 -+ arch_start_context_switch(prev);
4208 -+
4209 -+ /*
4210 -+ * kernel -> kernel lazy + transfer active
4211 -+ * user -> kernel lazy + mmgrab() active
4212 -+ *
4213 -+ * kernel -> user switch + mmdrop() active
4214 -+ * user -> user switch
4215 -+ */
4216 -+ if (!next->mm) { // to kernel
4217 -+ enter_lazy_tlb(prev->active_mm, next);
4218 -+
4219 -+ next->active_mm = prev->active_mm;
4220 -+ if (prev->mm) // from user
4221 -+ mmgrab(prev->active_mm);
4222 -+ else
4223 -+ prev->active_mm = NULL;
4224 -+ } else { // to user
4225 -+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
4226 -+ /*
4227 -+ * sys_membarrier() requires an smp_mb() between setting
4228 -+ * rq->curr / membarrier_switch_mm() and returning to userspace.
4229 -+ *
4230 -+ * The below provides this either through switch_mm(), or in
4231 -+ * case 'prev->active_mm == next->mm' through
4232 -+ * finish_task_switch()'s mmdrop().
4233 -+ */
4234 -+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
4235 -+
4236 -+ if (!prev->mm) { // from kernel
4237 -+ /* will mmdrop() in finish_task_switch(). */
4238 -+ rq->prev_mm = prev->active_mm;
4239 -+ prev->active_mm = NULL;
4240 -+ }
4241 -+ }
4242 -+
4243 -+ prepare_lock_switch(rq, next);
4244 -+
4245 -+ /* Here we just switch the register state and the stack. */
4246 -+ switch_to(prev, next, prev);
4247 -+ barrier();
4248 -+
4249 -+ return finish_task_switch(prev);
4250 -+}
4251 -+
4252 -+/*
4253 -+ * nr_running, nr_uninterruptible and nr_context_switches:
4254 -+ *
4255 -+ * externally visible scheduler statistics: current number of runnable
4256 -+ * threads, total number of context switches performed since bootup.
4257 -+ */
4258 -+unsigned int nr_running(void)
4259 -+{
4260 -+ unsigned int i, sum = 0;
4261 -+
4262 -+ for_each_online_cpu(i)
4263 -+ sum += cpu_rq(i)->nr_running;
4264 -+
4265 -+ return sum;
4266 -+}
4267 -+
4268 -+/*
4269 -+ * Check if only the current task is running on the CPU.
4270 -+ *
4271 -+ * Caution: this function does not check that the caller has disabled
4272 -+ * preemption, thus the result might have a time-of-check-to-time-of-use
4273 -+ * race. The caller is responsible to use it correctly, for example:
4274 -+ *
4275 -+ * - from a non-preemptible section (of course)
4276 -+ *
4277 -+ * - from a thread that is bound to a single CPU
4278 -+ *
4279 -+ * - in a loop with very short iterations (e.g. a polling loop)
4280 -+ */
4281 -+bool single_task_running(void)
4282 -+{
4283 -+ return raw_rq()->nr_running == 1;
4284 -+}
4285 -+EXPORT_SYMBOL(single_task_running);
4286 -+
4287 -+unsigned long long nr_context_switches(void)
4288 -+{
4289 -+ int i;
4290 -+ unsigned long long sum = 0;
4291 -+
4292 -+ for_each_possible_cpu(i)
4293 -+ sum += cpu_rq(i)->nr_switches;
4294 -+
4295 -+ return sum;
4296 -+}
4297 -+
4298 -+/*
4299 -+ * Consumers of these two interfaces, like for example the cpuidle menu
4300 -+ * governor, are using nonsensical data. They prefer shallow idle state selection
4301 -+ * for a CPU that has IO-wait, which might not even end up running the task when
4302 -+ * it does become runnable.
4303 -+ */
4304 -+
4305 -+unsigned int nr_iowait_cpu(int cpu)
4306 -+{
4307 -+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
4308 -+}
4309 -+
4310 -+/*
4311 -+ * IO-wait accounting, and how it's mostly bollocks (on SMP).
4312 -+ *
4313 -+ * The idea behind IO-wait accounting is to account the idle time that we could
4314 -+ * have spent running if it were not for IO. That is, if we were to improve the
4315 -+ * storage performance, we'd have a proportional reduction in IO-wait time.
4316 -+ *
4317 -+ * This all works nicely on UP, where, when a task blocks on IO, we account
4318 -+ * idle time as IO-wait, because if the storage were faster, it could've been
4319 -+ * running and we'd not be idle.
4320 -+ *
4321 -+ * This has been extended to SMP, by doing the same for each CPU. This however
4322 -+ * is broken.
4323 -+ *
4324 -+ * Imagine for instance the case where two tasks block on one CPU, only the one
4325 -+ * CPU will have IO-wait accounted, while the other has regular idle. Even
4326 -+ * though, if the storage were faster, both could've run at the same time,
4327 -+ * utilising both CPUs.
4328 -+ *
4329 -+ * This means that, when looking globally, the current IO-wait accounting on
4330 -+ * SMP is a lower bound due to under-accounting.
4331 -+ *
4332 -+ * Worse, since the numbers are provided per CPU, they are sometimes
4333 -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
4334 -+ * associated with any one particular CPU, it can wake to another CPU than it
4335 -+ * blocked on. This means the per CPU IO-wait number is meaningless.
4336 -+ *
4337 -+ * Task CPU affinities can make all that even more 'interesting'.
4338 -+ */
4339 -+
4340 -+unsigned int nr_iowait(void)
4341 -+{
4342 -+ unsigned int i, sum = 0;
4343 -+
4344 -+ for_each_possible_cpu(i)
4345 -+ sum += nr_iowait_cpu(i);
4346 -+
4347 -+ return sum;
4348 -+}
4349 -+
4350 -+#ifdef CONFIG_SMP
4351 -+
4352 -+/*
4353 -+ * sched_exec - execve() is a valuable balancing opportunity, because at
4354 -+ * this point the task has the smallest effective memory and cache
4355 -+ * footprint.
4356 -+ */
4357 -+void sched_exec(void)
4358 -+{
4359 -+ struct task_struct *p = current;
4360 -+ unsigned long flags;
4361 -+ int dest_cpu;
4362 -+
4363 -+ raw_spin_lock_irqsave(&p->pi_lock, flags);
4364 -+ dest_cpu = cpumask_any(p->cpus_ptr);
4365 -+ if (dest_cpu == smp_processor_id())
4366 -+ goto unlock;
4367 -+
4368 -+ if (likely(cpu_active(dest_cpu))) {
4369 -+ struct migration_arg arg = { p, dest_cpu };
4370 -+
4371 -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4372 -+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
4373 -+ return;
4374 -+ }
4375 -+unlock:
4376 -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4377 -+}
4378 -+
4379 -+#endif
4380 -+
4381 -+DEFINE_PER_CPU(struct kernel_stat, kstat);
4382 -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
4383 -+
4384 -+EXPORT_PER_CPU_SYMBOL(kstat);
4385 -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
4386 -+
4387 -+static inline void update_curr(struct rq *rq, struct task_struct *p)
4388 -+{
4389 -+ s64 ns = rq->clock_task - p->last_ran;
4390 -+
4391 -+ p->sched_time += ns;
4392 -+ cgroup_account_cputime(p, ns);
4393 -+ account_group_exec_runtime(p, ns);
4394 -+
4395 -+ p->time_slice -= ns;
4396 -+ p->last_ran = rq->clock_task;
4397 -+}
4398 -+
4399 -+/*
4400 -+ * Return accounted runtime for the task.
4401 -+ * If the task is currently running, also include its pending runtime that
4402 -+ * has not been accounted yet.
4403 -+ */
4404 -+unsigned long long task_sched_runtime(struct task_struct *p)
4405 -+{
4406 -+ unsigned long flags;
4407 -+ struct rq *rq;
4408 -+ raw_spinlock_t *lock;
4409 -+ u64 ns;
4410 -+
4411 -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
4412 -+ /*
4413 -+ * 64-bit doesn't need locks to atomically read a 64-bit value.
4414 -+ * So we have an optimization chance when the task's delta_exec is 0.
4415 -+ * Reading ->on_cpu is racy, but this is ok.
4416 -+ *
4417 -+ * If we race with it leaving CPU, we'll take a lock. So we're correct.
4418 -+ * If we race with it entering CPU, unaccounted time is 0. This is
4419 -+ * indistinguishable from the read occurring a few cycles earlier.
4420 -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
4421 -+ * been accounted, so we're correct here as well.
4422 -+ */
4423 -+ if (!p->on_cpu || !task_on_rq_queued(p))
4424 -+ return tsk_seruntime(p);
4425 -+#endif
4426 -+
4427 -+ rq = task_access_lock_irqsave(p, &lock, &flags);
4428 -+ /*
4429 -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
4430 -+ * project cycles that may never be accounted to this
4431 -+ * thread, breaking clock_gettime().
4432 -+ */
4433 -+ if (p == rq->curr && task_on_rq_queued(p)) {
4434 -+ update_rq_clock(rq);
4435 -+ update_curr(rq, p);
4436 -+ }
4437 -+ ns = tsk_seruntime(p);
4438 -+ task_access_unlock_irqrestore(p, lock, &flags);
4439 -+
4440 -+ return ns;
4441 -+}
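What this accounting ultimately feeds can be observed from user space. The sketch
below assumes (as in the mainline POSIX CPU-timer code) that the per-thread CPU
clock is served from this runtime accounting; it is illustrative only and not part
of this patch:

	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct timespec ts;

		/* burn a little CPU so there is something to report */
		for (volatile unsigned long i = 0; i < 50000000UL; i++)
			;

		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
		printf("this thread has run for %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}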
4442 -+
4443 -+/* This manages tasks that have run out of timeslice during a scheduler_tick */
4444 -+static inline void scheduler_task_tick(struct rq *rq)
4445 -+{
4446 -+ struct task_struct *p = rq->curr;
4447 -+
4448 -+ if (is_idle_task(p))
4449 -+ return;
4450 -+
4451 -+ update_curr(rq, p);
4452 -+ cpufreq_update_util(rq, 0);
4453 -+
4454 -+ /*
4455 -+ * Tasks that have less than RESCHED_NS of time slice left will be
4456 -+ * rescheduled.
4457 -+ */
4458 -+ if (p->time_slice >= RESCHED_NS)
4459 -+ return;
4460 -+ set_tsk_need_resched(p);
4461 -+ set_preempt_need_resched();
4462 -+}
4463 -+
4464 -+#ifdef CONFIG_SCHED_DEBUG
4465 -+static u64 cpu_resched_latency(struct rq *rq)
4466 -+{
4467 -+ int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
4468 -+ u64 resched_latency, now = rq_clock(rq);
4469 -+ static bool warned_once;
4470 -+
4471 -+ if (sysctl_resched_latency_warn_once && warned_once)
4472 -+ return 0;
4473 -+
4474 -+ if (!need_resched() || !latency_warn_ms)
4475 -+ return 0;
4476 -+
4477 -+ if (system_state == SYSTEM_BOOTING)
4478 -+ return 0;
4479 -+
4480 -+ if (!rq->last_seen_need_resched_ns) {
4481 -+ rq->last_seen_need_resched_ns = now;
4482 -+ rq->ticks_without_resched = 0;
4483 -+ return 0;
4484 -+ }
4485 -+
4486 -+ rq->ticks_without_resched++;
4487 -+ resched_latency = now - rq->last_seen_need_resched_ns;
4488 -+ if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
4489 -+ return 0;
4490 -+
4491 -+ warned_once = true;
4492 -+
4493 -+ return resched_latency;
4494 -+}
4495 -+
4496 -+static int __init setup_resched_latency_warn_ms(char *str)
4497 -+{
4498 -+ long val;
4499 -+
4500 -+ if ((kstrtol(str, 0, &val))) {
4501 -+ pr_warn("Unable to set resched_latency_warn_ms\n");
4502 -+ return 1;
4503 -+ }
4504 -+
4505 -+ sysctl_resched_latency_warn_ms = val;
4506 -+ return 1;
4507 -+}
4508 -+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
4509 -+#else
4510 -+static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
4511 -+#endif /* CONFIG_SCHED_DEBUG */
4512 -+
4513 -+/*
4514 -+ * This function gets called by the timer code, with HZ frequency.
4515 -+ * We call it with interrupts disabled.
4516 -+ */
4517 -+void scheduler_tick(void)
4518 -+{
4519 -+ int cpu __maybe_unused = smp_processor_id();
4520 -+ struct rq *rq = cpu_rq(cpu);
4521 -+ u64 resched_latency;
4522 -+
4523 -+ arch_scale_freq_tick();
4524 -+ sched_clock_tick();
4525 -+
4526 -+ raw_spin_lock(&rq->lock);
4527 -+ update_rq_clock(rq);
4528 -+
4529 -+ scheduler_task_tick(rq);
4530 -+ if (sched_feat(LATENCY_WARN))
4531 -+ resched_latency = cpu_resched_latency(rq);
4532 -+ calc_global_load_tick(rq);
4533 -+
4534 -+ rq->last_tick = rq->clock;
4535 -+ raw_spin_unlock(&rq->lock);
4536 -+
4537 -+ if (sched_feat(LATENCY_WARN) && resched_latency)
4538 -+ resched_latency_warn(cpu, resched_latency);
4539 -+
4540 -+ perf_event_task_tick();
4541 -+}
4542 -+
4543 -+#ifdef CONFIG_SCHED_SMT
4544 -+static inline int active_load_balance_cpu_stop(void *data)
4545 -+{
4546 -+ struct rq *rq = this_rq();
4547 -+ struct task_struct *p = data;
4548 -+ cpumask_t tmp;
4549 -+ unsigned long flags;
4550 -+
4551 -+ local_irq_save(flags);
4552 -+
4553 -+ raw_spin_lock(&p->pi_lock);
4554 -+ raw_spin_lock(&rq->lock);
4555 -+
4556 -+ rq->active_balance = 0;
4557 -+ /* _something_ may have changed the task, double check again */
4558 -+ if (task_on_rq_queued(p) && task_rq(p) == rq &&
4559 -+ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask) &&
4560 -+ !is_migration_disabled(p)) {
4561 -+ int cpu = cpu_of(rq);
4562 -+ int dcpu = __best_mask_cpu(&tmp, per_cpu(sched_cpu_llc_mask, cpu));
4563 -+ rq = move_queued_task(rq, p, dcpu);
4564 -+ }
4565 -+
4566 -+ raw_spin_unlock(&rq->lock);
4567 -+ raw_spin_unlock(&p->pi_lock);
4568 -+
4569 -+ local_irq_restore(flags);
4570 -+
4571 -+ return 0;
4572 -+}
4573 -+
4574 -+/* sg_balance_trigger - trigger sibling group balance for @cpu */
4575 -+static inline int sg_balance_trigger(const int cpu)
4576 -+{
4577 -+ struct rq *rq= cpu_rq(cpu);
4578 -+ unsigned long flags;
4579 -+ struct task_struct *curr;
4580 -+ int res;
4581 -+
4582 -+ if (!raw_spin_trylock_irqsave(&rq->lock, flags))
4583 -+ return 0;
4584 -+ curr = rq->curr;
4585 -+ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\
4586 -+ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\
4587 -+ !is_migration_disabled(curr) && (!rq->active_balance);
4588 -+
4589 -+ if (res)
4590 -+ rq->active_balance = 1;
4591 -+
4592 -+ raw_spin_unlock_irqrestore(&rq->lock, flags);
4593 -+
4594 -+ if (res)
4595 -+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop,
4596 -+ curr, &rq->active_balance_work);
4597 -+ return res;
4598 -+}
4599 -+
4600 -+/*
4601 -+ * sg_balance_check - sibling group balance check for run queue @rq
4602 -+ */
4603 -+static inline void sg_balance_check(struct rq *rq)
4604 -+{
4605 -+ cpumask_t chk;
4606 -+ int cpu = cpu_of(rq);
4607 -+
4608 -+ /* exit when cpu is offline */
4609 -+ if (unlikely(!rq->online))
4610 -+ return;
4611 -+
4612 -+ /*
4613 -+ * Only a cpu in the sibling idle group will do the checking, and then
4614 -+ * find potential cpus to which the currently running task can migrate
4615 -+ */
4616 -+ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) &&
4617 -+ cpumask_andnot(&chk, cpu_online_mask, sched_rq_watermark) &&
4618 -+ cpumask_andnot(&chk, &chk, &sched_rq_pending_mask)) {
4619 -+ int i;
4620 -+
4621 -+ for_each_cpu_wrap(i, &chk, cpu) {
4622 -+ if (cpumask_subset(cpu_smt_mask(i), &chk) &&
4623 -+ sg_balance_trigger(i))
4624 -+ return;
4625 -+ }
4626 -+ }
4627 -+}
4628 -+#endif /* CONFIG_SCHED_SMT */
4629 -+
4630 -+#ifdef CONFIG_NO_HZ_FULL
4631 -+
4632 -+struct tick_work {
4633 -+ int cpu;
4634 -+ atomic_t state;
4635 -+ struct delayed_work work;
4636 -+};
4637 -+/* Values for ->state, see diagram below. */
4638 -+#define TICK_SCHED_REMOTE_OFFLINE 0
4639 -+#define TICK_SCHED_REMOTE_OFFLINING 1
4640 -+#define TICK_SCHED_REMOTE_RUNNING 2
4641 -+
4642 -+/*
4643 -+ * State diagram for ->state:
4644 -+ *
4645 -+ *
4646 -+ * TICK_SCHED_REMOTE_OFFLINE
4647 -+ * | ^
4648 -+ * | |
4649 -+ * | | sched_tick_remote()
4650 -+ * | |
4651 -+ * | |
4652 -+ * +--TICK_SCHED_REMOTE_OFFLINING
4653 -+ * | ^
4654 -+ * | |
4655 -+ * sched_tick_start() | | sched_tick_stop()
4656 -+ * | |
4657 -+ * V |
4658 -+ * TICK_SCHED_REMOTE_RUNNING
4659 -+ *
4660 -+ *
4661 -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
4662 -+ * and sched_tick_start() are happy to leave the state in RUNNING.
4663 -+ */
4664 -+
4665 -+static struct tick_work __percpu *tick_work_cpu;
4666 -+
4667 -+static void sched_tick_remote(struct work_struct *work)
4668 -+{
4669 -+ struct delayed_work *dwork = to_delayed_work(work);
4670 -+ struct tick_work *twork = container_of(dwork, struct tick_work, work);
4671 -+ int cpu = twork->cpu;
4672 -+ struct rq *rq = cpu_rq(cpu);
4673 -+ struct task_struct *curr;
4674 -+ unsigned long flags;
4675 -+ u64 delta;
4676 -+ int os;
4677 -+
4678 -+ /*
4679 -+ * Handle the tick only if it appears the remote CPU is running in full
4680 -+ * dynticks mode. The check is racy by nature, but missing a tick or
4681 -+ * having one too much is no big deal because the scheduler tick updates
4682 -+ * statistics and checks timeslices in a time-independent way, regardless
4683 -+ * of when exactly it is running.
4684 -+ */
4685 -+ if (!tick_nohz_tick_stopped_cpu(cpu))
4686 -+ goto out_requeue;
4687 -+
4688 -+ raw_spin_lock_irqsave(&rq->lock, flags);
4689 -+ curr = rq->curr;
4690 -+ if (cpu_is_offline(cpu))
4691 -+ goto out_unlock;
4692 -+
4693 -+ update_rq_clock(rq);
4694 -+ if (!is_idle_task(curr)) {
4695 -+ /*
4696 -+ * Make sure the next tick runs within a reasonable
4697 -+ * amount of time.
4698 -+ */
4699 -+ delta = rq_clock_task(rq) - curr->last_ran;
4700 -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4701 -+ }
4702 -+ scheduler_task_tick(rq);
4703 -+
4704 -+ calc_load_nohz_remote(rq);
4705 -+out_unlock:
4706 -+ raw_spin_unlock_irqrestore(&rq->lock, flags);
4707 -+
4708 -+out_requeue:
4709 -+ /*
4710 -+ * Run the remote tick once per second (1Hz). This arbitrary
4711 -+ * frequency is large enough to avoid overload but short enough
4712 -+ * to keep scheduler internal stats reasonably up to date. But
4713 -+ * first update state to reflect hotplug activity if required.
4714 -+ */
4715 -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
4716 -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
4717 -+ if (os == TICK_SCHED_REMOTE_RUNNING)
4718 -+ queue_delayed_work(system_unbound_wq, dwork, HZ);
4719 -+}
4720 -+
4721 -+static void sched_tick_start(int cpu)
4722 -+{
4723 -+ int os;
4724 -+ struct tick_work *twork;
4725 -+
4726 -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK))
4727 -+ return;
4728 -+
4729 -+ WARN_ON_ONCE(!tick_work_cpu);
4730 -+
4731 -+ twork = per_cpu_ptr(tick_work_cpu, cpu);
4732 -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
4733 -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
4734 -+ if (os == TICK_SCHED_REMOTE_OFFLINE) {
4735 -+ twork->cpu = cpu;
4736 -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
4737 -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
4738 -+ }
4739 -+}
4740 -+
4741 -+#ifdef CONFIG_HOTPLUG_CPU
4742 -+static void sched_tick_stop(int cpu)
4743 -+{
4744 -+ struct tick_work *twork;
4745 -+
4746 -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK))
4747 -+ return;
4748 -+
4749 -+ WARN_ON_ONCE(!tick_work_cpu);
4750 -+
4751 -+ twork = per_cpu_ptr(tick_work_cpu, cpu);
4752 -+ cancel_delayed_work_sync(&twork->work);
4753 -+}
4754 -+#endif /* CONFIG_HOTPLUG_CPU */
4755 -+
4756 -+int __init sched_tick_offload_init(void)
4757 -+{
4758 -+ tick_work_cpu = alloc_percpu(struct tick_work);
4759 -+ BUG_ON(!tick_work_cpu);
4760 -+ return 0;
4761 -+}
4762 -+
4763 -+#else /* !CONFIG_NO_HZ_FULL */
4764 -+static inline void sched_tick_start(int cpu) { }
4765 -+static inline void sched_tick_stop(int cpu) { }
4766 -+#endif
4767 -+
4768 -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
4769 -+ defined(CONFIG_PREEMPT_TRACER))
4770 -+/*
4771 -+ * If the value passed in is equal to the current preempt count
4772 -+ * then we just disabled preemption. Start timing the latency.
4773 -+ */
4774 -+static inline void preempt_latency_start(int val)
4775 -+{
4776 -+ if (preempt_count() == val) {
4777 -+ unsigned long ip = get_lock_parent_ip();
4778 -+#ifdef CONFIG_DEBUG_PREEMPT
4779 -+ current->preempt_disable_ip = ip;
4780 -+#endif
4781 -+ trace_preempt_off(CALLER_ADDR0, ip);
4782 -+ }
4783 -+}
4784 -+
4785 -+void preempt_count_add(int val)
4786 -+{
4787 -+#ifdef CONFIG_DEBUG_PREEMPT
4788 -+ /*
4789 -+ * Underflow?
4790 -+ */
4791 -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4792 -+ return;
4793 -+#endif
4794 -+ __preempt_count_add(val);
4795 -+#ifdef CONFIG_DEBUG_PREEMPT
4796 -+ /*
4797 -+ * Spinlock count overflowing soon?
4798 -+ */
4799 -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4800 -+ PREEMPT_MASK - 10);
4801 -+#endif
4802 -+ preempt_latency_start(val);
4803 -+}
4804 -+EXPORT_SYMBOL(preempt_count_add);
4805 -+NOKPROBE_SYMBOL(preempt_count_add);
4806 -+
4807 -+/*
4808 -+ * If the value passed in equals the current preempt count
4809 -+ * then we just enabled preemption. Stop timing the latency.
4810 -+ */
4811 -+static inline void preempt_latency_stop(int val)
4812 -+{
4813 -+ if (preempt_count() == val)
4814 -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
4815 -+}
4816 -+
4817 -+void preempt_count_sub(int val)
4818 -+{
4819 -+#ifdef CONFIG_DEBUG_PREEMPT
4820 -+ /*
4821 -+ * Underflow?
4822 -+ */
4823 -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4824 -+ return;
4825 -+ /*
4826 -+ * Is the spinlock portion underflowing?
4827 -+ */
4828 -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4829 -+ !(preempt_count() & PREEMPT_MASK)))
4830 -+ return;
4831 -+#endif
4832 -+
4833 -+ preempt_latency_stop(val);
4834 -+ __preempt_count_sub(val);
4835 -+}
4836 -+EXPORT_SYMBOL(preempt_count_sub);
4837 -+NOKPROBE_SYMBOL(preempt_count_sub);
4838 -+
4839 -+#else
4840 -+static inline void preempt_latency_start(int val) { }
4841 -+static inline void preempt_latency_stop(int val) { }
4842 -+#endif
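A kernel-style sketch of the callers these hooks serve (illustrative only): every
preempt_disable()/preempt_enable() pair on a PREEMPT_COUNT kernel funnels through
preempt_count_add(1)/preempt_count_sub(1) above, so the latency tracepoints fire
only at the outermost disable and enable of a nested region:

	static void touch_per_cpu_state(void)
	{
		preempt_disable();	/* outermost: count 0 -> 1, latency timing starts */
		preempt_disable();	/* nested:    count 1 -> 2, no tracepoint         */

		/* ... manipulate per-CPU data that must not migrate ... */

		preempt_enable();	/* count 2 -> 1, still inside the traced region   */
		preempt_enable();	/* outermost: count 1 -> 0, latency timing stops,
					 * and a pending reschedule may run here          */
	}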
4843 -+
4844 -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
4845 -+{
4846 -+#ifdef CONFIG_DEBUG_PREEMPT
4847 -+ return p->preempt_disable_ip;
4848 -+#else
4849 -+ return 0;
4850 -+#endif
4851 -+}
4852 -+
4853 -+/*
4854 -+ * Print scheduling while atomic bug:
4855 -+ */
4856 -+static noinline void __schedule_bug(struct task_struct *prev)
4857 -+{
4858 -+ /* Save this before calling printk(), since that will clobber it */
4859 -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
4860 -+
4861 -+ if (oops_in_progress)
4862 -+ return;
4863 -+
4864 -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4865 -+ prev->comm, prev->pid, preempt_count());
4866 -+
4867 -+ debug_show_held_locks(prev);
4868 -+ print_modules();
4869 -+ if (irqs_disabled())
4870 -+ print_irqtrace_events(prev);
4871 -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
4872 -+ && in_atomic_preempt_off()) {
4873 -+ pr_err("Preemption disabled at:");
4874 -+ print_ip_sym(KERN_ERR, preempt_disable_ip);
4875 -+ }
4876 -+ if (panic_on_warn)
4877 -+ panic("scheduling while atomic\n");
4878 -+
4879 -+ dump_stack();
4880 -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4881 -+}
4882 -+
4883 -+/*
4884 -+ * Various schedule()-time debugging checks and statistics:
4885 -+ */
4886 -+static inline void schedule_debug(struct task_struct *prev, bool preempt)
4887 -+{
4888 -+#ifdef CONFIG_SCHED_STACK_END_CHECK
4889 -+ if (task_stack_end_corrupted(prev))
4890 -+ panic("corrupted stack end detected inside scheduler\n");
4891 -+
4892 -+ if (task_scs_end_corrupted(prev))
4893 -+ panic("corrupted shadow stack detected inside scheduler\n");
4894 -+#endif
4895 -+
4896 -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
4897 -+ if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
4898 -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
4899 -+ prev->comm, prev->pid, prev->non_block_count);
4900 -+ dump_stack();
4901 -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4902 -+ }
4903 -+#endif
4904 -+
4905 -+ if (unlikely(in_atomic_preempt_off())) {
4906 -+ __schedule_bug(prev);
4907 -+ preempt_count_set(PREEMPT_DISABLED);
4908 -+ }
4909 -+ rcu_sleep_check();
4910 -+ SCHED_WARN_ON(ct_state() == CONTEXT_USER);
4911 -+
4912 -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4913 -+
4914 -+ schedstat_inc(this_rq()->sched_count);
4915 -+}
4916 -+
4917 -+/*
4918 -+ * Compile time debug macro
4919 -+ * #define ALT_SCHED_DEBUG
4920 -+ */
4921 -+
4922 -+#ifdef ALT_SCHED_DEBUG
4923 -+void alt_sched_debug(void)
4924 -+{
4925 -+ printk(KERN_INFO "sched: pending: 0x%04lx, idle: 0x%04lx, sg_idle: 0x%04lx\n",
4926 -+ sched_rq_pending_mask.bits[0],
4927 -+ sched_rq_watermark[0].bits[0],
4928 -+ sched_sg_idle_mask.bits[0]);
4929 -+}
4930 -+#else
4931 -+inline void alt_sched_debug(void) {}
4932 -+#endif
4933 -+
4934 -+#ifdef CONFIG_SMP
4935 -+
4936 -+#define SCHED_RQ_NR_MIGRATION (32U)
4937 -+/*
4938 -+ * Migrate pending tasks in @rq to @dest_cpu
4939 -+ * Will try to migrate at most min(@rq nr_running / 2, SCHED_RQ_NR_MIGRATION)
4940 -+ * tasks to @dest_cpu
4941 -+ */
4942 -+static inline int
4943 -+migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu)
4944 -+{
4945 -+ struct task_struct *p, *skip = rq->curr;
4946 -+ int nr_migrated = 0;
4947 -+ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION);
4948 -+
4949 -+ while (skip != rq->idle && nr_tries &&
4950 -+ (p = sched_rq_next_task(skip, rq)) != rq->idle) {
4951 -+ skip = sched_rq_next_task(p, rq);
4952 -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) {
4953 -+ __SCHED_DEQUEUE_TASK(p, rq, 0, );
4954 -+ set_task_cpu(p, dest_cpu);
4955 -+ sched_task_sanity_check(p, dest_rq);
4956 -+ __SCHED_ENQUEUE_TASK(p, dest_rq, 0);
4957 -+ nr_migrated++;
4958 -+ }
4959 -+ nr_tries--;
4960 -+ }
4961 -+
4962 -+ return nr_migrated;
4963 -+}
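Worked examples of the migration budget computed above (illustrative numbers only):

	/* rq->nr_running = 10  -> nr_tries = min(10 / 2, 32)  = 5
	 * rq->nr_running = 100 -> nr_tries = min(100 / 2, 32) = 32, capped by
	 *                         SCHED_RQ_NR_MIGRATION
	 */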
4964 -+
4965 -+static inline int take_other_rq_tasks(struct rq *rq, int cpu)
4966 -+{
4967 -+ struct cpumask *topo_mask, *end_mask;
4968 -+
4969 -+ if (unlikely(!rq->online))
4970 -+ return 0;
4971 -+
4972 -+ if (cpumask_empty(&sched_rq_pending_mask))
4973 -+ return 0;
4974 -+
4975 -+ topo_mask = per_cpu(sched_cpu_topo_masks, cpu) + 1;
4976 -+ end_mask = per_cpu(sched_cpu_topo_end_mask, cpu);
4977 -+ do {
4978 -+ int i;
4979 -+ for_each_cpu_and(i, &sched_rq_pending_mask, topo_mask) {
4980 -+ int nr_migrated;
4981 -+ struct rq *src_rq;
4982 -+
4983 -+ src_rq = cpu_rq(i);
4984 -+ if (!do_raw_spin_trylock(&src_rq->lock))
4985 -+ continue;
4986 -+ spin_acquire(&src_rq->lock.dep_map,
4987 -+ SINGLE_DEPTH_NESTING, 1, _RET_IP_);
4988 -+
4989 -+ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) {
4990 -+ src_rq->nr_running -= nr_migrated;
4991 -+ if (src_rq->nr_running < 2)
4992 -+ cpumask_clear_cpu(i, &sched_rq_pending_mask);
4993 -+
4994 -+ rq->nr_running += nr_migrated;
4995 -+ if (rq->nr_running > 1)
4996 -+ cpumask_set_cpu(cpu, &sched_rq_pending_mask);
4997 -+
4998 -+ update_sched_rq_watermark(rq);
4999 -+ cpufreq_update_util(rq, 0);
5000 -+
5001 -+ spin_release(&src_rq->lock.dep_map, _RET_IP_);
5002 -+ do_raw_spin_unlock(&src_rq->lock);
5003 -+
5004 -+ return 1;
5005 -+ }
5006 -+
5007 -+ spin_release(&src_rq->lock.dep_map, _RET_IP_);
5008 -+ do_raw_spin_unlock(&src_rq->lock);
5009 -+ }
5010 -+ } while (++topo_mask < end_mask);
5011 -+
5012 -+ return 0;
5013 -+}
5014 -+#endif
5015 -+
5016 -+/*
5017 -+ * Timeslices below RESCHED_NS are considered as good as expired as there's no
5018 -+ * point rescheduling when there's so little time left.
5019 -+ */
5020 -+static inline void check_curr(struct task_struct *p, struct rq *rq)
5021 -+{
5022 -+ if (unlikely(rq->idle == p))
5023 -+ return;
5024 -+
5025 -+ update_curr(rq, p);
5026 -+
5027 -+ if (p->time_slice < RESCHED_NS)
5028 -+ time_slice_expired(p, rq);
5029 -+}
5030 -+
5031 -+static inline struct task_struct *
5032 -+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev)
5033 -+{
5034 -+ struct task_struct *next;
5035 -+
5036 -+ if (unlikely(rq->skip)) {
5037 -+ next = rq_runnable_task(rq);
5038 -+ if (next == rq->idle) {
5039 -+#ifdef CONFIG_SMP
5040 -+ if (!take_other_rq_tasks(rq, cpu)) {
5041 -+#endif
5042 -+ rq->skip = NULL;
5043 -+ schedstat_inc(rq->sched_goidle);
5044 -+ return next;
5045 -+#ifdef CONFIG_SMP
5046 -+ }
5047 -+ next = rq_runnable_task(rq);
5048 -+#endif
5049 -+ }
5050 -+ rq->skip = NULL;
5051 -+#ifdef CONFIG_HIGH_RES_TIMERS
5052 -+ hrtick_start(rq, next->time_slice);
5053 -+#endif
5054 -+ return next;
5055 -+ }
5056 -+
5057 -+ next = sched_rq_first_task(rq);
5058 -+ if (next == rq->idle) {
5059 -+#ifdef CONFIG_SMP
5060 -+ if (!take_other_rq_tasks(rq, cpu)) {
5061 -+#endif
5062 -+ schedstat_inc(rq->sched_goidle);
5063 -+ /*printk(KERN_INFO "sched: choose_next_task(%d) idle %px\n", cpu, next);*/
5064 -+ return next;
5065 -+#ifdef CONFIG_SMP
5066 -+ }
5067 -+ next = sched_rq_first_task(rq);
5068 -+#endif
5069 -+ }
5070 -+#ifdef CONFIG_HIGH_RES_TIMERS
5071 -+ hrtick_start(rq, next->time_slice);
5072 -+#endif
5073 -+ /*printk(KERN_INFO "sched: choose_next_task(%d) next %px\n", cpu,
5074 -+ * next);*/
5075 -+ return next;
5076 -+}
5077 -+
5078 -+/*
5079 -+ * Constants for the sched_mode argument of __schedule().
5080 -+ *
5081 -+ * The mode argument allows RT enabled kernels to differentiate a
5082 -+ * preemption from blocking on an 'sleeping' spin/rwlock. Note that
5083 -+ * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
5084 -+ * optimize the AND operation out and just check for zero.
5085 -+ */
5086 -+#define SM_NONE 0x0
5087 -+#define SM_PREEMPT 0x1
5088 -+#define SM_RTLOCK_WAIT 0x2
5089 -+
5090 -+#ifndef CONFIG_PREEMPT_RT
5091 -+# define SM_MASK_PREEMPT (~0U)
5092 -+#else
5093 -+# define SM_MASK_PREEMPT SM_PREEMPT
5094 -+#endif
5095 -+
5096 -+/*
5097 -+ * schedule() is the main scheduler function.
5098 -+ *
5099 -+ * The main means of driving the scheduler and thus entering this function are:
5100 -+ *
5101 -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
5102 -+ *
5103 -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
5104 -+ * paths. For example, see arch/x86/entry_64.S.
5105 -+ *
5106 -+ * To drive preemption between tasks, the scheduler sets the flag in timer
5107 -+ * interrupt handler scheduler_tick().
5108 -+ *
5109 -+ * 3. Wakeups don't really cause entry into schedule(). They add a
5110 -+ * task to the run-queue and that's it.
5111 -+ *
5112 -+ * Now, if the new task added to the run-queue preempts the current
5113 -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
5114 -+ * called on the nearest possible occasion:
5115 -+ *
5116 -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
5117 -+ *
5118 -+ * - in syscall or exception context, at the next outermost
5119 -+ * preempt_enable(). (this might be as soon as the wake_up()'s
5120 -+ * spin_unlock()!)
5121 -+ *
5122 -+ * - in IRQ context, return from interrupt-handler to
5123 -+ * preemptible context
5124 -+ *
5125 -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
5126 -+ * then at the next:
5127 -+ *
5128 -+ * - cond_resched() call
5129 -+ * - explicit schedule() call
5130 -+ * - return from syscall or exception to user-space
5131 -+ * - return from interrupt-handler to user-space
5132 -+ *
5133 -+ * WARNING: must be called with preemption disabled!
5134 -+ */
5135 -+static void __sched notrace __schedule(unsigned int sched_mode)
5136 -+{
5137 -+ struct task_struct *prev, *next;
5138 -+ unsigned long *switch_count;
5139 -+ unsigned long prev_state;
5140 -+ struct rq *rq;
5141 -+ int cpu;
5142 -+
5143 -+ cpu = smp_processor_id();
5144 -+ rq = cpu_rq(cpu);
5145 -+ prev = rq->curr;
5146 -+
5147 -+ schedule_debug(prev, !!sched_mode);
5148 -+
5149 -+ /* bypass the sched_feat(HRTICK) check, which Alt schedule FW doesn't support */
5150 -+ hrtick_clear(rq);
5151 -+
5152 -+ local_irq_disable();
5153 -+ rcu_note_context_switch(!!sched_mode);
5154 -+
5155 -+ /*
5156 -+ * Make sure that signal_pending_state()->signal_pending() below
5157 -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
5158 -+ * done by the caller to avoid the race with signal_wake_up():
5159 -+ *
5160 -+ * __set_current_state(@state)            signal_wake_up()
5161 -+ * schedule()                                set_tsk_thread_flag(p, TIF_SIGPENDING)
5162 -+ *                                           wake_up_state(p, state)
5163 -+ *   LOCK rq->lock                             LOCK p->pi_state
5164 -+ *   smp_mb__after_spinlock()                  smp_mb__after_spinlock()
5165 -+ *     if (signal_pending_state())             if (p->state & @state)
5166 -+ *
5167 -+ * Also, the membarrier system call requires a full memory barrier
5168 -+ * after coming from user-space, before storing to rq->curr.
5169 -+ */
5170 -+ raw_spin_lock(&rq->lock);
5171 -+ smp_mb__after_spinlock();
5172 -+
5173 -+ update_rq_clock(rq);
5174 -+
5175 -+ switch_count = &prev->nivcsw;
5176 -+ /*
5177 -+ * We must load prev->state once (task_struct::state is volatile), such
5178 -+ * that:
5179 -+ *
5180 -+ * - we form a control dependency vs deactivate_task() below.
5181 -+ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
5182 -+ */
5183 -+ prev_state = READ_ONCE(prev->__state);
5184 -+ if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
5185 -+ if (signal_pending_state(prev_state, prev)) {
5186 -+ WRITE_ONCE(prev->__state, TASK_RUNNING);
5187 -+ } else {
5188 -+ prev->sched_contributes_to_load =
5189 -+ (prev_state & TASK_UNINTERRUPTIBLE) &&
5190 -+ !(prev_state & TASK_NOLOAD) &&
5191 -+ !(prev->flags & PF_FROZEN);
5192 -+
5193 -+ if (prev->sched_contributes_to_load)
5194 -+ rq->nr_uninterruptible++;
5195 -+
5196 -+ /*
5197 -+ *   __schedule()                       ttwu()
5198 -+ *     prev_state = prev->state;          if (p->on_rq && ...)
5199 -+ *     if (prev_state)                       goto out;
5200 -+ *     p->on_rq = 0;                       smp_acquire__after_ctrl_dep();
5201 -+ *                                         p->state = TASK_WAKING
5202 -+ *
5203 -+ * Where __schedule() and ttwu() have matching control dependencies.
5204 -+ *
5205 -+ * After this, schedule() must not care about p->state any more.
5206 -+ */
5207 -+ sched_task_deactivate(prev, rq);
5208 -+ deactivate_task(prev, rq);
5209 -+
5210 -+ if (prev->in_iowait) {
5211 -+ atomic_inc(&rq->nr_iowait);
5212 -+ delayacct_blkio_start();
5213 -+ }
5214 -+ }
5215 -+ switch_count = &prev->nvcsw;
5216 -+ }
5217 -+
5218 -+ check_curr(prev, rq);
5219 -+
5220 -+ next = choose_next_task(rq, cpu, prev);
5221 -+ clear_tsk_need_resched(prev);
5222 -+ clear_preempt_need_resched();
5223 -+#ifdef CONFIG_SCHED_DEBUG
5224 -+ rq->last_seen_need_resched_ns = 0;
5225 -+#endif
5226 -+
5227 -+ if (likely(prev != next)) {
5228 -+ next->last_ran = rq->clock_task;
5229 -+ rq->last_ts_switch = rq->clock;
5230 -+
5231 -+ rq->nr_switches++;
5232 -+ /*
5233 -+ * RCU users of rcu_dereference(rq->curr) may not see
5234 -+ * changes to task_struct made by pick_next_task().
5235 -+ */
5236 -+ RCU_INIT_POINTER(rq->curr, next);
5237 -+ /*
5238 -+ * The membarrier system call requires each architecture
5239 -+ * to have a full memory barrier after updating
5240 -+ * rq->curr, before returning to user-space.
5241 -+ *
5242 -+ * Here are the schemes providing that barrier on the
5243 -+ * various architectures:
5244 -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
5245 -+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
5246 -+ * - finish_lock_switch() for weakly-ordered
5247 -+ * architectures where spin_unlock is a full barrier,
5248 -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock
5249 -+ * is a RELEASE barrier),
5250 -+ */
5251 -+ ++*switch_count;
5252 -+
5253 -+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
5254 -+
5255 -+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
5256 -+
5257 -+ /* Also unlocks the rq: */
5258 -+ rq = context_switch(rq, prev, next);
5259 -+ } else {
5260 -+ __balance_callbacks(rq);
5261 -+ raw_spin_unlock_irq(&rq->lock);
5262 -+ }
5263 -+
5264 -+#ifdef CONFIG_SCHED_SMT
5265 -+ sg_balance_check(rq);
5266 -+#endif
5267 -+}
5268 -+
5269 -+void __noreturn do_task_dead(void)
5270 -+{
5271 -+ /* Causes final put_task_struct in finish_task_switch(): */
5272 -+ set_special_state(TASK_DEAD);
5273 -+
5274 -+ /* Tell freezer to ignore us: */
5275 -+ current->flags |= PF_NOFREEZE;
5276 -+
5277 -+ __schedule(SM_NONE);
5278 -+ BUG();
5279 -+
5280 -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
5281 -+ for (;;)
5282 -+ cpu_relax();
5283 -+}
5284 -+
5285 -+static inline void sched_submit_work(struct task_struct *tsk)
5286 -+{
5287 -+ unsigned int task_flags;
5288 -+
5289 -+ if (task_is_running(tsk))
5290 -+ return;
5291 -+
5292 -+ task_flags = tsk->flags;
5293 -+ /*
5294 -+ * If a worker went to sleep, notify and ask workqueue whether
5295 -+ * it wants to wake up a task to maintain concurrency.
5296 -+ * As this function is called inside the schedule() context,
5297 -+ * we disable preemption to avoid it calling schedule() again
5298 -+ * in the possible wakeup of a kworker and because wq_worker_sleeping()
5299 -+ * requires it.
5300 -+ */
5301 -+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5302 -+ preempt_disable();
5303 -+ if (task_flags & PF_WQ_WORKER)
5304 -+ wq_worker_sleeping(tsk);
5305 -+ else
5306 -+ io_wq_worker_sleeping(tsk);
5307 -+ preempt_enable_no_resched();
5308 -+ }
5309 -+
5310 -+ if (tsk_is_pi_blocked(tsk))
5311 -+ return;
5312 -+
5313 -+ /*
5314 -+ * If we are going to sleep and we have plugged IO queued,
5315 -+ * make sure to submit it to avoid deadlocks.
5316 -+ */
5317 -+ if (blk_needs_flush_plug(tsk))
5318 -+ blk_schedule_flush_plug(tsk);
5319 -+}
5320 -+
5321 -+static void sched_update_worker(struct task_struct *tsk)
5322 -+{
5323 -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5324 -+ if (tsk->flags & PF_WQ_WORKER)
5325 -+ wq_worker_running(tsk);
5326 -+ else
5327 -+ io_wq_worker_running(tsk);
5328 -+ }
5329 -+}
5330 -+
5331 -+asmlinkage __visible void __sched schedule(void)
5332 -+{
5333 -+ struct task_struct *tsk = current;
5334 -+
5335 -+ sched_submit_work(tsk);
5336 -+ do {
5337 -+ preempt_disable();
5338 -+ __schedule(SM_NONE);
5339 -+ sched_preempt_enable_no_resched();
5340 -+ } while (need_resched());
5341 -+ sched_update_worker(tsk);
5342 -+}
5343 -+EXPORT_SYMBOL(schedule);
5344 -+
5345 -+/*
5346 -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
5347 -+ * state (have scheduled out non-voluntarily) by making sure that all
5348 -+ * tasks have either left the run queue or have gone into user space.
5349 -+ * As idle tasks do not do either, they must not ever be preempted
5350 -+ * (schedule out non-voluntarily).
5351 -+ *
5352 -+ * schedule_idle() is similar to schedule_preempt_disable() except that it
5353 -+ * never enables preemption because it does not call sched_submit_work().
5354 -+ */
5355 -+void __sched schedule_idle(void)
5356 -+{
5357 -+ /*
5358 -+ * As this skips calling sched_submit_work(), which the idle task does
5359 -+ * regardless because that function is a nop when the task is in a
5360 -+ * TASK_RUNNING state, make sure this isn't used someplace that the
5361 -+ * current task can be in any other state. Note, idle is always in the
5362 -+ * TASK_RUNNING state.
5363 -+ */
5364 -+ WARN_ON_ONCE(current->__state);
5365 -+ do {
5366 -+ __schedule(SM_NONE);
5367 -+ } while (need_resched());
5368 -+}
5369 -+
5370 -+#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
5371 -+asmlinkage __visible void __sched schedule_user(void)
5372 -+{
5373 -+ /*
5374 -+ * If we come here after a random call to set_need_resched(),
5375 -+ * or we have been woken up remotely but the IPI has not yet arrived,
5376 -+ * we haven't yet exited the RCU idle mode. Do it here manually until
5377 -+ * we find a better solution.
5378 -+ *
5379 -+ * NB: There are buggy callers of this function. Ideally we
5380 -+ * should warn if prev_state != CONTEXT_USER, but that will trigger
5381 -+ * too frequently to make sense yet.
5382 -+ */
5383 -+ enum ctx_state prev_state = exception_enter();
5384 -+ schedule();
5385 -+ exception_exit(prev_state);
5386 -+}
5387 -+#endif
5388 -+
5389 -+/**
5390 -+ * schedule_preempt_disabled - called with preemption disabled
5391 -+ *
5392 -+ * Returns with preemption disabled. Note: preempt_count must be 1
5393 -+ */
5394 -+void __sched schedule_preempt_disabled(void)
5395 -+{
5396 -+ sched_preempt_enable_no_resched();
5397 -+ schedule();
5398 -+ preempt_disable();
5399 -+}
5400 -+
5401 -+#ifdef CONFIG_PREEMPT_RT
5402 -+void __sched notrace schedule_rtlock(void)
5403 -+{
5404 -+ do {
5405 -+ preempt_disable();
5406 -+ __schedule(SM_RTLOCK_WAIT);
5407 -+ sched_preempt_enable_no_resched();
5408 -+ } while (need_resched());
5409 -+}
5410 -+NOKPROBE_SYMBOL(schedule_rtlock);
5411 -+#endif
5412 -+
5413 -+static void __sched notrace preempt_schedule_common(void)
5414 -+{
5415 -+ do {
5416 -+ /*
5417 -+ * Because the function tracer can trace preempt_count_sub()
5418 -+ * and it also uses preempt_enable/disable_notrace(), if
5419 -+ * NEED_RESCHED is set, the preempt_enable_notrace() called
5420 -+ * by the function tracer will call this function again and
5421 -+ * cause infinite recursion.
5422 -+ *
5423 -+ * Preemption must be disabled here before the function
5424 -+ * tracer can trace. Break up preempt_disable() into two
5425 -+ * calls. One to disable preemption without fear of being
5426 -+ * traced. The other to still record the preemption latency,
5427 -+ * which can also be traced by the function tracer.
5428 -+ */
5429 -+ preempt_disable_notrace();
5430 -+ preempt_latency_start(1);
5431 -+ __schedule(SM_PREEMPT);
5432 -+ preempt_latency_stop(1);
5433 -+ preempt_enable_no_resched_notrace();
5434 -+
5435 -+ /*
5436 -+ * Check again in case we missed a preemption opportunity
5437 -+ * between schedule and now.
5438 -+ */
5439 -+ } while (need_resched());
5440 -+}
5441 -+
5442 -+#ifdef CONFIG_PREEMPTION
5443 -+/*
5444 -+ * This is the entry point to schedule() from in-kernel preemption
5445 -+ * off of preempt_enable.
5446 -+ */
5447 -+asmlinkage __visible void __sched notrace preempt_schedule(void)
5448 -+{
5449 -+ /*
5450 -+ * If there is a non-zero preempt_count or interrupts are disabled,
5451 -+ * we do not want to preempt the current task. Just return..
5452 -+ */
5453 -+ if (likely(!preemptible()))
5454 -+ return;
5455 -+
5456 -+ preempt_schedule_common();
5457 -+}
5458 -+NOKPROBE_SYMBOL(preempt_schedule);
5459 -+EXPORT_SYMBOL(preempt_schedule);
5460 -+
5461 -+#ifdef CONFIG_PREEMPT_DYNAMIC
5462 -+DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
5463 -+EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
5464 -+#endif
5465 -+
5466 -+
5467 -+/**
5468 -+ * preempt_schedule_notrace - preempt_schedule called by tracing
5469 -+ *
5470 -+ * The tracing infrastructure uses preempt_enable_notrace to prevent
5471 -+ * recursion and tracing preempt enabling caused by the tracing
5472 -+ * infrastructure itself. But as tracing can happen in areas coming
5473 -+ * from userspace or just about to enter userspace, a preempt enable
5474 -+ * can occur before user_exit() is called. This will cause the scheduler
5475 -+ * to be called when the system is still in usermode.
5476 -+ *
5477 -+ * To prevent this, the preempt_enable_notrace will use this function
5478 -+ * instead of preempt_schedule() to exit user context if needed before
5479 -+ * calling the scheduler.
5480 -+ */
5481 -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
5482 -+{
5483 -+ enum ctx_state prev_ctx;
5484 -+
5485 -+ if (likely(!preemptible()))
5486 -+ return;
5487 -+
5488 -+ do {
5489 -+ /*
5490 -+ * Because the function tracer can trace preempt_count_sub()
5491 -+ * and it also uses preempt_enable/disable_notrace(), if
5492 -+ * NEED_RESCHED is set, the preempt_enable_notrace() called
5493 -+ * by the function tracer will call this function again and
5494 -+ * cause infinite recursion.
5495 -+ *
5496 -+ * Preemption must be disabled here before the function
5497 -+ * tracer can trace. Break up preempt_disable() into two
5498 -+ * calls. One to disable preemption without fear of being
5499 -+ * traced. The other to still record the preemption latency,
5500 -+ * which can also be traced by the function tracer.
5501 -+ */
5502 -+ preempt_disable_notrace();
5503 -+ preempt_latency_start(1);
5504 -+ /*
5505 -+ * Needs preempt disabled in case user_exit() is traced
5506 -+ * and the tracer calls preempt_enable_notrace() causing
5507 -+ * an infinite recursion.
5508 -+ */
5509 -+ prev_ctx = exception_enter();
5510 -+ __schedule(SM_PREEMPT);
5511 -+ exception_exit(prev_ctx);
5512 -+
5513 -+ preempt_latency_stop(1);
5514 -+ preempt_enable_no_resched_notrace();
5515 -+ } while (need_resched());
5516 -+}
5517 -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
5518 -+
5519 -+#ifdef CONFIG_PREEMPT_DYNAMIC
5520 -+DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
5521 -+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
5522 -+#endif
5523 -+
5524 -+#endif /* CONFIG_PREEMPTION */
5525 -+
5526 -+#ifdef CONFIG_PREEMPT_DYNAMIC
5527 -+
5528 -+#include <linux/entry-common.h>
5529 -+
5530 -+/*
5531 -+ * SC:cond_resched
5532 -+ * SC:might_resched
5533 -+ * SC:preempt_schedule
5534 -+ * SC:preempt_schedule_notrace
5535 -+ * SC:irqentry_exit_cond_resched
5536 -+ *
5537 -+ *
5538 -+ * NONE:
5539 -+ * cond_resched <- __cond_resched
5540 -+ * might_resched <- RET0
5541 -+ * preempt_schedule <- NOP
5542 -+ * preempt_schedule_notrace <- NOP
5543 -+ * irqentry_exit_cond_resched <- NOP
5544 -+ *
5545 -+ * VOLUNTARY:
5546 -+ * cond_resched <- __cond_resched
5547 -+ * might_resched <- __cond_resched
5548 -+ * preempt_schedule <- NOP
5549 -+ * preempt_schedule_notrace <- NOP
5550 -+ * irqentry_exit_cond_resched <- NOP
5551 -+ *
5552 -+ * FULL:
5553 -+ * cond_resched <- RET0
5554 -+ * might_resched <- RET0
5555 -+ * preempt_schedule <- preempt_schedule
5556 -+ * preempt_schedule_notrace <- preempt_schedule_notrace
5557 -+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
5558 -+ */
5559 -+
5560 -+enum {
5561 -+ preempt_dynamic_none = 0,
5562 -+ preempt_dynamic_voluntary,
5563 -+ preempt_dynamic_full,
5564 -+};
5565 -+
5566 -+int preempt_dynamic_mode = preempt_dynamic_full;
5567 -+
5568 -+int sched_dynamic_mode(const char *str)
5569 -+{
5570 -+ if (!strcmp(str, "none"))
5571 -+ return preempt_dynamic_none;
5572 -+
5573 -+ if (!strcmp(str, "voluntary"))
5574 -+ return preempt_dynamic_voluntary;
5575 -+
5576 -+ if (!strcmp(str, "full"))
5577 -+ return preempt_dynamic_full;
5578 -+
5579 -+ return -EINVAL;
5580 -+}
5581 -+
5582 -+void sched_dynamic_update(int mode)
5583 -+{
5584 -+ /*
5585 -+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
5586 -+ * the ZERO state, which is invalid.
5587 -+ */
5588 -+ static_call_update(cond_resched, __cond_resched);
5589 -+ static_call_update(might_resched, __cond_resched);
5590 -+ static_call_update(preempt_schedule, __preempt_schedule_func);
5591 -+ static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
5592 -+ static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
5593 -+
5594 -+ switch (mode) {
5595 -+ case preempt_dynamic_none:
5596 -+ static_call_update(cond_resched, __cond_resched);
5597 -+ static_call_update(might_resched, (void *)&__static_call_return0);
5598 -+ static_call_update(preempt_schedule, NULL);
5599 -+ static_call_update(preempt_schedule_notrace, NULL);
5600 -+ static_call_update(irqentry_exit_cond_resched, NULL);
5601 -+ pr_info("Dynamic Preempt: none\n");
5602 -+ break;
5603 -+
5604 -+ case preempt_dynamic_voluntary:
5605 -+ static_call_update(cond_resched, __cond_resched);
5606 -+ static_call_update(might_resched, __cond_resched);
5607 -+ static_call_update(preempt_schedule, NULL);
5608 -+ static_call_update(preempt_schedule_notrace, NULL);
5609 -+ static_call_update(irqentry_exit_cond_resched, NULL);
5610 -+ pr_info("Dynamic Preempt: voluntary\n");
5611 -+ break;
5612 -+
5613 -+ case preempt_dynamic_full:
5614 -+ static_call_update(cond_resched, (void *)&__static_call_return0);
5615 -+ static_call_update(might_resched, (void *)&__static_call_return0);
5616 -+ static_call_update(preempt_schedule, __preempt_schedule_func);
5617 -+ static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
5618 -+ static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
5619 -+ pr_info("Dynamic Preempt: full\n");
5620 -+ break;
5621 -+ }
5622 -+
5623 -+ preempt_dynamic_mode = mode;
5624 -+}
5625 -+
5626 -+static int __init setup_preempt_mode(char *str)
5627 -+{
5628 -+ int mode = sched_dynamic_mode(str);
5629 -+ if (mode < 0) {
5630 -+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
5631 -+ return 1;
5632 -+ }
5633 -+
5634 -+ sched_dynamic_update(mode);
5635 -+ return 0;
5636 -+}
5637 -+__setup("preempt=", setup_preempt_mode);
5638 -+
5639 -+#endif /* CONFIG_PREEMPT_DYNAMIC */
5640 -+
5641 -+/*
5642 -+ * This is the entry point to schedule() from kernel preemption
5643 -+ * off of irq context.
5644 -+ * Note that this is called and returns with irqs disabled. This
5645 -+ * protects us against recursive calls from irq context.
5646 -+ */
5647 -+asmlinkage __visible void __sched preempt_schedule_irq(void)
5648 -+{
5649 -+ enum ctx_state prev_state;
5650 -+
5651 -+ /* Catch callers which need to be fixed */
5652 -+ BUG_ON(preempt_count() || !irqs_disabled());
5653 -+
5654 -+ prev_state = exception_enter();
5655 -+
5656 -+ do {
5657 -+ preempt_disable();
5658 -+ local_irq_enable();
5659 -+ __schedule(SM_PREEMPT);
5660 -+ local_irq_disable();
5661 -+ sched_preempt_enable_no_resched();
5662 -+ } while (need_resched());
5663 -+
5664 -+ exception_exit(prev_state);
5665 -+}
5666 -+
5667 -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
5668 -+ void *key)
5669 -+{
5670 -+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
5671 -+ return try_to_wake_up(curr->private, mode, wake_flags);
5672 -+}
5673 -+EXPORT_SYMBOL(default_wake_function);
5674 -+
5675 -+static inline void check_task_changed(struct task_struct *p, struct rq *rq)
5676 -+{
5677 -+ /* Trigger resched if task sched_prio has been modified. */
5678 -+ if (task_on_rq_queued(p) && task_sched_prio_idx(p, rq) != p->sq_idx) {
5679 -+ requeue_task(p, rq);
5680 -+ check_preempt_curr(rq);
5681 -+ }
5682 -+}
5683 -+
5684 -+static void __setscheduler_prio(struct task_struct *p, int prio)
5685 -+{
5686 -+ p->prio = prio;
5687 -+}
5688 -+
5689 -+#ifdef CONFIG_RT_MUTEXES
5690 -+
5691 -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
5692 -+{
5693 -+ if (pi_task)
5694 -+ prio = min(prio, pi_task->prio);
5695 -+
5696 -+ return prio;
5697 -+}
5698 -+
5699 -+static inline int rt_effective_prio(struct task_struct *p, int prio)
5700 -+{
5701 -+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
5702 -+
5703 -+ return __rt_effective_prio(pi_task, prio);
5704 -+}
5705 -+
5706 -+/*
5707 -+ * rt_mutex_setprio - set the current priority of a task
5708 -+ * @p: task to boost
5709 -+ * @pi_task: donor task
5710 -+ *
5711 -+ * This function changes the 'effective' priority of a task. It does
5712 -+ * not touch ->normal_prio like __setscheduler().
5713 -+ *
5714 -+ * Used by the rt_mutex code to implement priority inheritance
5715 -+ * logic. Call site only calls if the priority of the task changed.
5716 -+ */
5717 -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
5718 -+{
5719 -+ int prio;
5720 -+ struct rq *rq;
5721 -+ raw_spinlock_t *lock;
5722 -+
5723 -+ /* XXX used to be waiter->prio, not waiter->task->prio */
5724 -+ prio = __rt_effective_prio(pi_task, p->normal_prio);
5725 -+
5726 -+ /*
5727 -+ * If nothing changed; bail early.
5728 -+ */
5729 -+ if (p->pi_top_task == pi_task && prio == p->prio)
5730 -+ return;
5731 -+
5732 -+ rq = __task_access_lock(p, &lock);
5733 -+ /*
5734 -+ * Set under pi_lock && rq->lock, such that the value can be used under
5735 -+ * either lock.
5736 -+ *
5737 -+ * Note that a load of trickery is needed to make this pointer cache work
5738 -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
5739 -+ * ensure a task is de-boosted (pi_task is set to NULL) before the
5740 -+ * task is allowed to run again (and can exit). This ensures the pointer
5741 -+ * points to a blocked task -- which guarantees the task is present.
5742 -+ */
5743 -+ p->pi_top_task = pi_task;
5744 -+
5745 -+ /*
5746 -+ * For FIFO/RR we only need to set prio, if that matches we're done.
5747 -+ */
5748 -+ if (prio == p->prio)
5749 -+ goto out_unlock;
5750 -+
5751 -+ /*
5752 -+ * Idle task boosting is a no-no in general. There is one
5753 -+ * exception, when PREEMPT_RT and NOHZ is active:
5754 -+ *
5755 -+ * The idle task calls get_next_timer_interrupt() and holds
5756 -+ * the timer wheel base->lock on the CPU and another CPU wants
5757 -+ * to access the timer (probably to cancel it). We can safely
5758 -+ * ignore the boosting request, as the idle CPU runs this code
5759 -+ * with interrupts disabled and will complete the lock
5760 -+ * protected section without being interrupted. So there is no
5761 -+ * real need to boost.
5762 -+ */
5763 -+ if (unlikely(p == rq->idle)) {
5764 -+ WARN_ON(p != rq->curr);
5765 -+ WARN_ON(p->pi_blocked_on);
5766 -+ goto out_unlock;
5767 -+ }
5768 -+
5769 -+ trace_sched_pi_setprio(p, pi_task);
5770 -+
5771 -+ __setscheduler_prio(p, prio);
5772 -+
5773 -+ check_task_changed(p, rq);
5774 -+out_unlock:
5775 -+ /* Avoid rq from going away on us: */
5776 -+ preempt_disable();
5777 -+
5778 -+ __balance_callbacks(rq);
5779 -+ __task_access_unlock(p, lock);
5780 -+
5781 -+ preempt_enable();
5782 -+}
5783 -+#else
5784 -+static inline int rt_effective_prio(struct task_struct *p, int prio)
5785 -+{
5786 -+ return prio;
5787 -+}
5788 -+#endif
5789 -+
5790 -+void set_user_nice(struct task_struct *p, long nice)
5791 -+{
5792 -+ unsigned long flags;
5793 -+ struct rq *rq;
5794 -+ raw_spinlock_t *lock;
5795 -+
5796 -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
5797 -+ return;
5798 -+ /*
5799 -+ * We have to be careful, if called from sys_setpriority(),
5800 -+ * the task might be in the middle of scheduling on another CPU.
5801 -+ */
5802 -+ raw_spin_lock_irqsave(&p->pi_lock, flags);
5803 -+ rq = __task_access_lock(p, &lock);
5804 -+
5805 -+ p->static_prio = NICE_TO_PRIO(nice);
5806 -+ /*
5807 -+ * The RT priorities are set via sched_setscheduler(), but we still
5808 -+ * allow the 'normal' nice value to be set - but as expected
5809 -+ * it won't have any effect on scheduling while the task's policy
5810 -+ * is not SCHED_NORMAL/SCHED_BATCH:
5811 -+ */
5812 -+ if (task_has_rt_policy(p))
5813 -+ goto out_unlock;
5814 -+
5815 -+ p->prio = effective_prio(p);
5816 -+
5817 -+ check_task_changed(p, rq);
5818 -+out_unlock:
5819 -+ __task_access_unlock(p, lock);
5820 -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5821 -+}
5822 -+EXPORT_SYMBOL(set_user_nice);
5823 -+
5824 -+/*
5825 -+ * can_nice - check if a task can reduce its nice value
5826 -+ * @p: task
5827 -+ * @nice: nice value
5828 -+ */
5829 -+int can_nice(const struct task_struct *p, const int nice)
5830 -+{
5831 -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */
5832 -+ int nice_rlim = nice_to_rlimit(nice);
5833 -+
5834 -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
5835 -+ capable(CAP_SYS_NICE));
5836 -+}
5837 -+
5838 -+#ifdef __ARCH_WANT_SYS_NICE
5839 -+
5840 -+/*
5841 -+ * sys_nice - change the priority of the current process.
5842 -+ * @increment: priority increment
5843 -+ *
5844 -+ * sys_setpriority is a more generic, but much slower function that
5845 -+ * does similar things.
5846 -+ */
5847 -+SYSCALL_DEFINE1(nice, int, increment)
5848 -+{
5849 -+ long nice, retval;
5850 -+
5851 -+ /*
5852 -+ * Setpriority might change our priority at the same moment.
5853 -+ * We don't have to worry. Conceptually one call occurs first
5854 -+ * and we have a single winner.
5855 -+ */
5856 -+
5857 -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
5858 -+ nice = task_nice(current) + increment;
5859 -+
5860 -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
5861 -+ if (increment < 0 && !can_nice(current, nice))
5862 -+ return -EPERM;
5863 -+
5864 -+ retval = security_task_setnice(current, nice);
5865 -+ if (retval)
5866 -+ return retval;
5867 -+
5868 -+ set_user_nice(current, nice);
5869 -+ return 0;
5870 -+}
5871 -+
5872 -+#endif
5873 -+
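
For context, the SYSCALL_DEFINE1(nice, ...) path above is what glibc's nice(3)
ends up in. A minimal user-space sketch (standard glibc only; nothing here is
specific to the removed patch):

/* Hedged sketch: lower our own priority by five nice levels and read the
 * result back.  nice(2) is serviced by SYSCALL_DEFINE1(nice) above, which
 * in turn calls set_user_nice(). */
#include <errno.h>
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
        int new_nice;

        errno = 0;
        new_nice = nice(5);
        if (new_nice == -1 && errno != 0) {
                perror("nice");
                return 1;
        }
        printf("nice() returned %d\n", new_nice);
        printf("getpriority() reports %d\n", getpriority(PRIO_PROCESS, 0));
        return 0;
}

Run as an unprivileged user this should print the new nice level twice;
raising the priority back up (a negative increment) would need CAP_SYS_NICE
or a suitable RLIMIT_NICE, as enforced by can_nice() above.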
5874 -+/**
5875 -+ * task_prio - return the priority value of a given task.
5876 -+ * @p: the task in question.
5877 -+ *
5878 -+ * Return: The priority value as seen by users in /proc.
5879 -+ *
5880 -+ * sched policy              return value    kernel prio     user prio/nice
5881 -+ *
5882 -+ * (BMQ)normal, batch, idle  [0 ... 53]      [100 ... 139]   0/[-20 ... 19]/[-7 ... 7]
5883 -+ * (PDS)normal, batch, idle  [0 ... 39]      100             0/[-20 ... 19]
5884 -+ * fifo, rr                  [-1 ... -100]   [99 ... 0]      [0 ... 99]
5885 -+ */
5886 -+int task_prio(const struct task_struct *p)
5887 -+{
5888 -+ return (p->prio < MAX_RT_PRIO) ? p->prio - MAX_RT_PRIO :
5889 -+ task_sched_prio_normal(p, task_rq(p));
5890 -+}
5891 -+
5892 -+/**
5893 -+ * idle_cpu - is a given CPU idle currently?
5894 -+ * @cpu: the processor in question.
5895 -+ *
5896 -+ * Return: 1 if the CPU is currently idle. 0 otherwise.
5897 -+ */
5898 -+int idle_cpu(int cpu)
5899 -+{
5900 -+ struct rq *rq = cpu_rq(cpu);
5901 -+
5902 -+ if (rq->curr != rq->idle)
5903 -+ return 0;
5904 -+
5905 -+ if (rq->nr_running)
5906 -+ return 0;
5907 -+
5908 -+#ifdef CONFIG_SMP
5909 -+ if (rq->ttwu_pending)
5910 -+ return 0;
5911 -+#endif
5912 -+
5913 -+ return 1;
5914 -+}
5915 -+
5916 -+/**
5917 -+ * idle_task - return the idle task for a given CPU.
5918 -+ * @cpu: the processor in question.
5919 -+ *
5920 -+ * Return: The idle task for the cpu @cpu.
5921 -+ */
5922 -+struct task_struct *idle_task(int cpu)
5923 -+{
5924 -+ return cpu_rq(cpu)->idle;
5925 -+}
5926 -+
5927 -+/**
5928 -+ * find_process_by_pid - find a process with a matching PID value.
5929 -+ * @pid: the pid in question.
5930 -+ *
5931 -+ * The task of @pid, if found. %NULL otherwise.
5932 -+ */
5933 -+static inline struct task_struct *find_process_by_pid(pid_t pid)
5934 -+{
5935 -+ return pid ? find_task_by_vpid(pid) : current;
5936 -+}
5937 -+
5938 -+/*
5939 -+ * sched_setparam() passes in -1 for its policy, to let the functions
5940 -+ * it calls know not to change it.
5941 -+ */
5942 -+#define SETPARAM_POLICY -1
5943 -+
5944 -+static void __setscheduler_params(struct task_struct *p,
5945 -+ const struct sched_attr *attr)
5946 -+{
5947 -+ int policy = attr->sched_policy;
5948 -+
5949 -+ if (policy == SETPARAM_POLICY)
5950 -+ policy = p->policy;
5951 -+
5952 -+ p->policy = policy;
5953 -+
5954 -+ /*
5955 -+ * allow the normal nice value to be set, but it will not have any
5956 -+ * effect on scheduling while the task's policy is not SCHED_NORMAL/
5957 -+ * SCHED_BATCH
5958 -+ */
5959 -+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
5960 -+
5961 -+ /*
5962 -+ * __sched_setscheduler() ensures attr->sched_priority == 0 when
5963 -+ * !rt_policy. Always setting this ensures that things like
5964 -+ * getparam()/getattr() don't report silly values for !rt tasks.
5965 -+ */
5966 -+ p->rt_priority = attr->sched_priority;
5967 -+ p->normal_prio = normal_prio(p);
5968 -+}
5969 -+
5970 -+/*
5971 -+ * check the target process has a UID that matches the current process's
5972 -+ */
5973 -+static bool check_same_owner(struct task_struct *p)
5974 -+{
5975 -+ const struct cred *cred = current_cred(), *pcred;
5976 -+ bool match;
5977 -+
5978 -+ rcu_read_lock();
5979 -+ pcred = __task_cred(p);
5980 -+ match = (uid_eq(cred->euid, pcred->euid) ||
5981 -+ uid_eq(cred->euid, pcred->uid));
5982 -+ rcu_read_unlock();
5983 -+ return match;
5984 -+}
5985 -+
5986 -+static int __sched_setscheduler(struct task_struct *p,
5987 -+ const struct sched_attr *attr,
5988 -+ bool user, bool pi)
5989 -+{
5990 -+ const struct sched_attr dl_squash_attr = {
5991 -+ .size = sizeof(struct sched_attr),
5992 -+ .sched_policy = SCHED_FIFO,
5993 -+ .sched_nice = 0,
5994 -+ .sched_priority = 99,
5995 -+ };
5996 -+ int oldpolicy = -1, policy = attr->sched_policy;
5997 -+ int retval, newprio;
5998 -+ struct callback_head *head;
5999 -+ unsigned long flags;
6000 -+ struct rq *rq;
6001 -+ int reset_on_fork;
6002 -+ raw_spinlock_t *lock;
6003 -+
6004 -+ /* The pi code expects interrupts enabled */
6005 -+ BUG_ON(pi && in_interrupt());
6006 -+
6007 -+ /*
6008 -+ * Alt schedule FW supports SCHED_DEADLINE by squashing it into prio 0 SCHED_FIFO
6009 -+ */
6010 -+ if (unlikely(SCHED_DEADLINE == policy)) {
6011 -+ attr = &dl_squash_attr;
6012 -+ policy = attr->sched_policy;
6013 -+ }
6014 -+recheck:
6015 -+ /* Double check policy once rq lock held */
6016 -+ if (policy < 0) {
6017 -+ reset_on_fork = p->sched_reset_on_fork;
6018 -+ policy = oldpolicy = p->policy;
6019 -+ } else {
6020 -+ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK);
6021 -+
6022 -+ if (policy > SCHED_IDLE)
6023 -+ return -EINVAL;
6024 -+ }
6025 -+
6026 -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL))
6027 -+ return -EINVAL;
6028 -+
6029 -+ /*
6030 -+ * Valid priorities for SCHED_FIFO and SCHED_RR are
6031 -+ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL and
6032 -+ * SCHED_BATCH and SCHED_IDLE is 0.
6033 -+ */
6034 -+ if (attr->sched_priority < 0 ||
6035 -+ (p->mm && attr->sched_priority > MAX_RT_PRIO - 1) ||
6036 -+ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1))
6037 -+ return -EINVAL;
6038 -+ if ((SCHED_RR == policy || SCHED_FIFO == policy) !=
6039 -+ (attr->sched_priority != 0))
6040 -+ return -EINVAL;
6041 -+
6042 -+ /*
6043 -+ * Allow unprivileged RT tasks to decrease priority:
6044 -+ */
6045 -+ if (user && !capable(CAP_SYS_NICE)) {
6046 -+ if (SCHED_FIFO == policy || SCHED_RR == policy) {
6047 -+ unsigned long rlim_rtprio =
6048 -+ task_rlimit(p, RLIMIT_RTPRIO);
6049 -+
6050 -+ /* Can't set/change the rt policy */
6051 -+ if (policy != p->policy && !rlim_rtprio)
6052 -+ return -EPERM;
6053 -+
6054 -+ /* Can't increase priority */
6055 -+ if (attr->sched_priority > p->rt_priority &&
6056 -+ attr->sched_priority > rlim_rtprio)
6057 -+ return -EPERM;
6058 -+ }
6059 -+
6060 -+ /* Can't change other user's priorities */
6061 -+ if (!check_same_owner(p))
6062 -+ return -EPERM;
6063 -+
6064 -+ /* Normal users shall not reset the sched_reset_on_fork flag */
6065 -+ if (p->sched_reset_on_fork && !reset_on_fork)
6066 -+ return -EPERM;
6067 -+ }
6068 -+
6069 -+ if (user) {
6070 -+ retval = security_task_setscheduler(p);
6071 -+ if (retval)
6072 -+ return retval;
6073 -+ }
6074 -+
6075 -+ if (pi)
6076 -+ cpuset_read_lock();
6077 -+
6078 -+ /*
6079 -+ * Make sure no PI-waiters arrive (or leave) while we are
6080 -+ * changing the priority of the task:
6081 -+ */
6082 -+ raw_spin_lock_irqsave(&p->pi_lock, flags);
6083 -+
6084 -+ /*
6085 -+ * To be able to change p->policy safely, task_access_lock()
6086 -+ * must be called.
6087 -+ * IF use task_access_lock() here:
6088 -+ * For the task p which is not running, reading rq->stop is
6089 -+ * racy but acceptable as ->stop doesn't change much.
6090 -+ * An enhancement can be made to read rq->stop safely.
6091 -+ */
6092 -+ rq = __task_access_lock(p, &lock);
6093 -+
6094 -+ /*
6095 -+ * Changing the policy of the stop thread is a very bad idea
6096 -+ */
6097 -+ if (p == rq->stop) {
6098 -+ retval = -EINVAL;
6099 -+ goto unlock;
6100 -+ }
6101 -+
6102 -+ /*
6103 -+ * If not changing anything there's no need to proceed further:
6104 -+ */
6105 -+ if (unlikely(policy == p->policy)) {
6106 -+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
6107 -+ goto change;
6108 -+ if (!rt_policy(policy) &&
6109 -+ NICE_TO_PRIO(attr->sched_nice) != p->static_prio)
6110 -+ goto change;
6111 -+
6112 -+ p->sched_reset_on_fork = reset_on_fork;
6113 -+ retval = 0;
6114 -+ goto unlock;
6115 -+ }
6116 -+change:
6117 -+
6118 -+ /* Re-check policy now with rq lock held */
6119 -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6120 -+ policy = oldpolicy = -1;
6121 -+ __task_access_unlock(p, lock);
6122 -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6123 -+ if (pi)
6124 -+ cpuset_read_unlock();
6125 -+ goto recheck;
6126 -+ }
6127 -+
6128 -+ p->sched_reset_on_fork = reset_on_fork;
6129 -+
6130 -+ newprio = __normal_prio(policy, attr->sched_priority, NICE_TO_PRIO(attr->sched_nice));
6131 -+ if (pi) {
6132 -+ /*
6133 -+ * Take priority boosted tasks into account. If the new
6134 -+ * effective priority is unchanged, we just store the new
6135 -+ * normal parameters and do not touch the scheduler class and
6136 -+ * the runqueue. This will be done when the task deboosts
6137 -+ * itself.
6138 -+ */
6139 -+ newprio = rt_effective_prio(p, newprio);
6140 -+ }
6141 -+
6142 -+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
6143 -+ __setscheduler_params(p, attr);
6144 -+ __setscheduler_prio(p, newprio);
6145 -+ }
6146 -+
6147 -+ check_task_changed(p, rq);
6148 -+
6149 -+ /* Avoid rq from going away on us: */
6150 -+ preempt_disable();
6151 -+ head = splice_balance_callbacks(rq);
6152 -+ __task_access_unlock(p, lock);
6153 -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6154 -+
6155 -+ if (pi) {
6156 -+ cpuset_read_unlock();
6157 -+ rt_mutex_adjust_pi(p);
6158 -+ }
6159 -+
6160 -+ /* Run balance callbacks after we've adjusted the PI chain: */
6161 -+ balance_callbacks(rq, head);
6162 -+ preempt_enable();
6163 -+
6164 -+ return 0;
6165 -+
6166 -+unlock:
6167 -+ __task_access_unlock(p, lock);
6168 -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6169 -+ if (pi)
6170 -+ cpuset_read_unlock();
6171 -+ return retval;
6172 -+}
6173 -+
6174 -+static int _sched_setscheduler(struct task_struct *p, int policy,
6175 -+ const struct sched_param *param, bool check)
6176 -+{
6177 -+ struct sched_attr attr = {
6178 -+ .sched_policy = policy,
6179 -+ .sched_priority = param->sched_priority,
6180 -+ .sched_nice = PRIO_TO_NICE(p->static_prio),
6181 -+ };
6182 -+
6183 -+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
6184 -+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
6185 -+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
6186 -+ policy &= ~SCHED_RESET_ON_FORK;
6187 -+ attr.sched_policy = policy;
6188 -+ }
6189 -+
6190 -+ return __sched_setscheduler(p, &attr, check, true);
6191 -+}
6192 -+
6193 -+/**
6194 -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
6195 -+ * @p: the task in question.
6196 -+ * @policy: new policy.
6197 -+ * @param: structure containing the new RT priority.
6198 -+ *
6199 -+ * Use sched_set_fifo(), read its comment.
6200 -+ *
6201 -+ * Return: 0 on success. An error code otherwise.
6202 -+ *
6203 -+ * NOTE that the task may be already dead.
6204 -+ */
6205 -+int sched_setscheduler(struct task_struct *p, int policy,
6206 -+ const struct sched_param *param)
6207 -+{
6208 -+ return _sched_setscheduler(p, policy, param, true);
6209 -+}
6210 -+
6211 -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
6212 -+{
6213 -+ return __sched_setscheduler(p, attr, true, true);
6214 -+}
6215 -+
6216 -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
6217 -+{
6218 -+ return __sched_setscheduler(p, attr, false, true);
6219 -+}
6220 -+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
6221 -+
6222 -+/**
6223 -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
6224 -+ * @p: the task in question.
6225 -+ * @policy: new policy.
6226 -+ * @param: structure containing the new RT priority.
6227 -+ *
6228 -+ * Just like sched_setscheduler, only don't bother checking if the
6229 -+ * current context has permission. For example, this is needed in
6230 -+ * stop_machine(): we create temporary high priority worker threads,
6231 -+ * but our caller might not have that capability.
6232 -+ *
6233 -+ * Return: 0 on success. An error code otherwise.
6234 -+ */
6235 -+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
6236 -+ const struct sched_param *param)
6237 -+{
6238 -+ return _sched_setscheduler(p, policy, param, false);
6239 -+}
6240 -+
6241 -+/*
6242 -+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
6243 -+ * incapable of resource management, which is the one thing an OS really should
6244 -+ * be doing.
6245 -+ *
6246 -+ * This is of course the reason it is limited to privileged users only.
6247 -+ *
6248 -+ * Worse still; it is fundamentally impossible to compose static priority
6249 -+ * workloads. You cannot take two correctly working static prio workloads
6250 -+ * and smash them together and still expect them to work.
6251 -+ *
6252 -+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
6253 -+ *
6254 -+ * MAX_RT_PRIO / 2
6255 -+ *
6256 -+ * The administrator _MUST_ configure the system, the kernel simply doesn't
6257 -+ * know enough information to make a sensible choice.
6258 -+ */
6259 -+void sched_set_fifo(struct task_struct *p)
6260 -+{
6261 -+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
6262 -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6263 -+}
6264 -+EXPORT_SYMBOL_GPL(sched_set_fifo);
6265 -+
6266 -+/*
6267 -+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
6268 -+ */
6269 -+void sched_set_fifo_low(struct task_struct *p)
6270 -+{
6271 -+ struct sched_param sp = { .sched_priority = 1 };
6272 -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6273 -+}
6274 -+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
6275 -+
6276 -+void sched_set_normal(struct task_struct *p, int nice)
6277 -+{
6278 -+ struct sched_attr attr = {
6279 -+ .sched_policy = SCHED_NORMAL,
6280 -+ .sched_nice = nice,
6281 -+ };
6282 -+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
6283 -+}
6284 -+EXPORT_SYMBOL_GPL(sched_set_normal);
6285 -+
6286 -+static int
6287 -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
6288 -+{
6289 -+ struct sched_param lparam;
6290 -+ struct task_struct *p;
6291 -+ int retval;
6292 -+
6293 -+ if (!param || pid < 0)
6294 -+ return -EINVAL;
6295 -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
6296 -+ return -EFAULT;
6297 -+
6298 -+ rcu_read_lock();
6299 -+ retval = -ESRCH;
6300 -+ p = find_process_by_pid(pid);
6301 -+ if (likely(p))
6302 -+ get_task_struct(p);
6303 -+ rcu_read_unlock();
6304 -+
6305 -+ if (likely(p)) {
6306 -+ retval = sched_setscheduler(p, policy, &lparam);
6307 -+ put_task_struct(p);
6308 -+ }
6309 -+
6310 -+ return retval;
6311 -+}
6312 -+
6313 -+/*
6314 -+ * Mimics kernel/events/core.c perf_copy_attr().
6315 -+ */
6316 -+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
6317 -+{
6318 -+ u32 size;
6319 -+ int ret;
6320 -+
6321 -+ /* Zero the full structure, so that a short copy will be nice: */
6322 -+ memset(attr, 0, sizeof(*attr));
6323 -+
6324 -+ ret = get_user(size, &uattr->size);
6325 -+ if (ret)
6326 -+ return ret;
6327 -+
6328 -+ /* ABI compatibility quirk: */
6329 -+ if (!size)
6330 -+ size = SCHED_ATTR_SIZE_VER0;
6331 -+
6332 -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
6333 -+ goto err_size;
6334 -+
6335 -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
6336 -+ if (ret) {
6337 -+ if (ret == -E2BIG)
6338 -+ goto err_size;
6339 -+ return ret;
6340 -+ }
6341 -+
6342 -+ /*
6343 -+ * XXX: Do we want to be lenient like existing syscalls; or do we want
6344 -+ * to be strict and return an error on out-of-bounds values?
6345 -+ */
6346 -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19);
6347 -+
6348 -+ /* sched/core.c uses zero here but we already know ret is zero */
6349 -+ return 0;
6350 -+
6351 -+err_size:
6352 -+ put_user(sizeof(*attr), &uattr->size);
6353 -+ return -E2BIG;
6354 -+}
6355 -+
6356 -+/**
6357 -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
6358 -+ * @pid: the pid in question.
6359 -+ * @policy: new policy.
6360 -+ * @param: structure containing the new RT priority.
6361 -+ *
6362 -+ * Return: 0 on success. An error code otherwise.
6363 -+ */
6364 -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
6365 -+{
6366 -+ if (policy < 0)
6367 -+ return -EINVAL;
6368 -+
6369 -+ return do_sched_setscheduler(pid, policy, param);
6370 -+}
6371 -+
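
The syscall above is normally reached through glibc's sched_setscheduler()
wrapper. A hedged sketch that requests SCHED_FIFO priority 1 for the calling
process (assumes root or CAP_SYS_NICE, per the unprivileged-RT checks in
__sched_setscheduler() above):

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 1 };

        /* Handled by sys_sched_setscheduler() -> __sched_setscheduler(). */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("policy is now %d (SCHED_FIFO == %d)\n",
               sched_getscheduler(0), SCHED_FIFO);
        return 0;
}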
6372 -+/**
6373 -+ * sys_sched_setparam - set/change the RT priority of a thread
6374 -+ * @pid: the pid in question.
6375 -+ * @param: structure containing the new RT priority.
6376 -+ *
6377 -+ * Return: 0 on success. An error code otherwise.
6378 -+ */
6379 -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
6380 -+{
6381 -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
6382 -+}
6383 -+
6384 -+/**
6385 -+ * sys_sched_setattr - same as above, but with extended sched_attr
6386 -+ * @pid: the pid in question.
6387 -+ * @uattr: structure containing the extended parameters.
6388 -+ */
6389 -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
6390 -+ unsigned int, flags)
6391 -+{
6392 -+ struct sched_attr attr;
6393 -+ struct task_struct *p;
6394 -+ int retval;
6395 -+
6396 -+ if (!uattr || pid < 0 || flags)
6397 -+ return -EINVAL;
6398 -+
6399 -+ retval = sched_copy_attr(uattr, &attr);
6400 -+ if (retval)
6401 -+ return retval;
6402 -+
6403 -+ if ((int)attr.sched_policy < 0)
6404 -+ return -EINVAL;
6405 -+
6406 -+ rcu_read_lock();
6407 -+ retval = -ESRCH;
6408 -+ p = find_process_by_pid(pid);
6409 -+ if (likely(p))
6410 -+ get_task_struct(p);
6411 -+ rcu_read_unlock();
6412 -+
6413 -+ if (likely(p)) {
6414 -+ retval = sched_setattr(p, &attr);
6415 -+ put_task_struct(p);
6416 -+ }
6417 -+
6418 -+ return retval;
6419 -+}
6420 -+
6421 -+/**
6422 -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
6423 -+ * @pid: the pid in question.
6424 -+ *
6425 -+ * Return: On success, the policy of the thread. Otherwise, a negative error
6426 -+ * code.
6427 -+ */
6428 -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6429 -+{
6430 -+ struct task_struct *p;
6431 -+ int retval = -EINVAL;
6432 -+
6433 -+ if (pid < 0)
6434 -+ goto out_nounlock;
6435 -+
6436 -+ retval = -ESRCH;
6437 -+ rcu_read_lock();
6438 -+ p = find_process_by_pid(pid);
6439 -+ if (p) {
6440 -+ retval = security_task_getscheduler(p);
6441 -+ if (!retval)
6442 -+ retval = p->policy;
6443 -+ }
6444 -+ rcu_read_unlock();
6445 -+
6446 -+out_nounlock:
6447 -+ return retval;
6448 -+}
6449 -+
6450 -+/**
6451 -+ * sys_sched_getparam - get the RT priority of a thread
6452 -+ * @pid: the pid in question.
6453 -+ * @param: structure containing the RT priority.
6454 -+ *
6455 -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
6456 -+ * code.
6457 -+ */
6458 -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6459 -+{
6460 -+ struct sched_param lp = { .sched_priority = 0 };
6461 -+ struct task_struct *p;
6462 -+ int retval = -EINVAL;
6463 -+
6464 -+ if (!param || pid < 0)
6465 -+ goto out_nounlock;
6466 -+
6467 -+ rcu_read_lock();
6468 -+ p = find_process_by_pid(pid);
6469 -+ retval = -ESRCH;
6470 -+ if (!p)
6471 -+ goto out_unlock;
6472 -+
6473 -+ retval = security_task_getscheduler(p);
6474 -+ if (retval)
6475 -+ goto out_unlock;
6476 -+
6477 -+ if (task_has_rt_policy(p))
6478 -+ lp.sched_priority = p->rt_priority;
6479 -+ rcu_read_unlock();
6480 -+
6481 -+ /*
6482 -+ * This one might sleep, we cannot do it with a spinlock held ...
6483 -+ */
6484 -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
6485 -+
6486 -+out_nounlock:
6487 -+ return retval;
6488 -+
6489 -+out_unlock:
6490 -+ rcu_read_unlock();
6491 -+ return retval;
6492 -+}
6493 -+
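
A small read-only counterpart using the glibc wrappers for the two getters
above (no special privileges needed):

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp;
        int policy = sched_getscheduler(0);     /* sys_sched_getscheduler() */

        if (policy == -1 || sched_getparam(0, &sp) == -1) {
                perror("sched_get*");
                return 1;
        }
        printf("policy=%d rt_priority=%d\n", policy, sp.sched_priority);
        return 0;
}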
6494 -+/*
6495 -+ * Copy the kernel size attribute structure (which might be larger
6496 -+ * than what user-space knows about) to user-space.
6497 -+ *
6498 -+ * Note that all cases are valid: user-space buffer can be larger or
6499 -+ * smaller than the kernel-space buffer. The usual case is that both
6500 -+ * have the same size.
6501 -+ */
6502 -+static int
6503 -+sched_attr_copy_to_user(struct sched_attr __user *uattr,
6504 -+ struct sched_attr *kattr,
6505 -+ unsigned int usize)
6506 -+{
6507 -+ unsigned int ksize = sizeof(*kattr);
6508 -+
6509 -+ if (!access_ok(uattr, usize))
6510 -+ return -EFAULT;
6511 -+
6512 -+ /*
6513 -+ * sched_getattr() ABI forwards and backwards compatibility:
6514 -+ *
6515 -+ * If usize == ksize then we just copy everything to user-space and all is good.
6516 -+ *
6517 -+ * If usize < ksize then we only copy as much as user-space has space for,
6518 -+ * this keeps ABI compatibility as well. We skip the rest.
6519 -+ *
6520 -+ * If usize > ksize then user-space is using a newer version of the ABI,
6521 -+ * which part the kernel doesn't know about. Just ignore it - tooling can
6522 -+ * detect the kernel's knowledge of attributes from the attr->size value
6523 -+ * which is set to ksize in this case.
6524 -+ */
6525 -+ kattr->size = min(usize, ksize);
6526 -+
6527 -+ if (copy_to_user(uattr, kattr, kattr->size))
6528 -+ return -EFAULT;
6529 -+
6530 -+ return 0;
6531 -+}
6532 -+
6533 -+/**
6534 -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
6535 -+ * @pid: the pid in question.
6536 -+ * @uattr: structure containing the extended parameters.
6537 -+ * @usize: sizeof(attr) for fwd/bwd comp.
6538 -+ * @flags: for future extension.
6539 -+ */
6540 -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
6541 -+ unsigned int, usize, unsigned int, flags)
6542 -+{
6543 -+ struct sched_attr kattr = { };
6544 -+ struct task_struct *p;
6545 -+ int retval;
6546 -+
6547 -+ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
6548 -+ usize < SCHED_ATTR_SIZE_VER0 || flags)
6549 -+ return -EINVAL;
6550 -+
6551 -+ rcu_read_lock();
6552 -+ p = find_process_by_pid(pid);
6553 -+ retval = -ESRCH;
6554 -+ if (!p)
6555 -+ goto out_unlock;
6556 -+
6557 -+ retval = security_task_getscheduler(p);
6558 -+ if (retval)
6559 -+ goto out_unlock;
6560 -+
6561 -+ kattr.sched_policy = p->policy;
6562 -+ if (p->sched_reset_on_fork)
6563 -+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
6564 -+ if (task_has_rt_policy(p))
6565 -+ kattr.sched_priority = p->rt_priority;
6566 -+ else
6567 -+ kattr.sched_nice = task_nice(p);
6568 -+ kattr.sched_flags &= SCHED_FLAG_ALL;
6569 -+
6570 -+#ifdef CONFIG_UCLAMP_TASK
6571 -+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
6572 -+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
6573 -+#endif
6574 -+
6575 -+ rcu_read_unlock();
6576 -+
6577 -+ return sched_attr_copy_to_user(uattr, &kattr, usize);
6578 -+
6579 -+out_unlock:
6580 -+ rcu_read_unlock();
6581 -+ return retval;
6582 -+}
6583 -+
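
sched_setattr()/sched_getattr() have no glibc wrappers, so user space normally
goes through syscall(2). The sketch below is an assumption-laden illustration:
the local struct mirrors what the VER0 ABI layout is believed to be (48 bytes,
matching the SCHED_ATTR_SIZE_VER0 handling in sched_copy_attr() above), and
SYS_sched_setattr/SYS_sched_getattr are assumed to be provided by
<sys/syscall.h>:

#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Assumed local mirror of the VER0 UAPI sched_attr layout (48 bytes);
 * not taken from the patch itself. */
struct local_sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
};

int main(void)
{
        struct local_sched_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.sched_policy = SCHED_OTHER;
        attr.sched_nice = 5;

        /* Serviced by sys_sched_setattr() -> __sched_setscheduler() above. */
        if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1) {
                perror("sched_setattr");
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0) == -1) {
                perror("sched_getattr");
                return 1;
        }
        printf("policy=%u nice=%d size=%u\n",
               attr.sched_policy, attr.sched_nice, attr.size);
        return 0;
}

Because the kernel clamps the copied size to min(usize, ksize) in
sched_attr_copy_to_user() above, an older or newer user-space structure size
degrades gracefully rather than failing.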
6584 -+static int
6585 -+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
6586 -+{
6587 -+ int retval;
6588 -+ cpumask_var_t cpus_allowed, new_mask;
6589 -+
6590 -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
6591 -+ return -ENOMEM;
6592 -+
6593 -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
6594 -+ retval = -ENOMEM;
6595 -+ goto out_free_cpus_allowed;
6596 -+ }
6597 -+
6598 -+ cpuset_cpus_allowed(p, cpus_allowed);
6599 -+ cpumask_and(new_mask, mask, cpus_allowed);
6600 -+again:
6601 -+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
6602 -+ if (retval)
6603 -+ goto out_free_new_mask;
6604 -+
6605 -+ cpuset_cpus_allowed(p, cpus_allowed);
6606 -+ if (!cpumask_subset(new_mask, cpus_allowed)) {
6607 -+ /*
6608 -+ * We must have raced with a concurrent cpuset
6609 -+ * update. Just reset the cpus_allowed to the
6610 -+ * cpuset's cpus_allowed
6611 -+ */
6612 -+ cpumask_copy(new_mask, cpus_allowed);
6613 -+ goto again;
6614 -+ }
6615 -+
6616 -+out_free_new_mask:
6617 -+ free_cpumask_var(new_mask);
6618 -+out_free_cpus_allowed:
6619 -+ free_cpumask_var(cpus_allowed);
6620 -+ return retval;
6621 -+}
6622 -+
6623 -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6624 -+{
6625 -+ struct task_struct *p;
6626 -+ int retval;
6627 -+
6628 -+ rcu_read_lock();
6629 -+
6630 -+ p = find_process_by_pid(pid);
6631 -+ if (!p) {
6632 -+ rcu_read_unlock();
6633 -+ return -ESRCH;
6634 -+ }
6635 -+
6636 -+ /* Prevent p going away */
6637 -+ get_task_struct(p);
6638 -+ rcu_read_unlock();
6639 -+
6640 -+ if (p->flags & PF_NO_SETAFFINITY) {
6641 -+ retval = -EINVAL;
6642 -+ goto out_put_task;
6643 -+ }
6644 -+
6645 -+ if (!check_same_owner(p)) {
6646 -+ rcu_read_lock();
6647 -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
6648 -+ rcu_read_unlock();
6649 -+ retval = -EPERM;
6650 -+ goto out_put_task;
6651 -+ }
6652 -+ rcu_read_unlock();
6653 -+ }
6654 -+
6655 -+ retval = security_task_setscheduler(p);
6656 -+ if (retval)
6657 -+ goto out_put_task;
6658 -+
6659 -+ retval = __sched_setaffinity(p, in_mask);
6660 -+out_put_task:
6661 -+ put_task_struct(p);
6662 -+ return retval;
6663 -+}
6664 -+
6665 -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
6666 -+ struct cpumask *new_mask)
6667 -+{
6668 -+ if (len < cpumask_size())
6669 -+ cpumask_clear(new_mask);
6670 -+ else if (len > cpumask_size())
6671 -+ len = cpumask_size();
6672 -+
6673 -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
6674 -+}
6675 -+
6676 -+/**
6677 -+ * sys_sched_setaffinity - set the CPU affinity of a process
6678 -+ * @pid: pid of the process
6679 -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6680 -+ * @user_mask_ptr: user-space pointer to the new CPU mask
6681 -+ *
6682 -+ * Return: 0 on success. An error code otherwise.
6683 -+ */
6684 -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6685 -+ unsigned long __user *, user_mask_ptr)
6686 -+{
6687 -+ cpumask_var_t new_mask;
6688 -+ int retval;
6689 -+
6690 -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
6691 -+ return -ENOMEM;
6692 -+
6693 -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6694 -+ if (retval == 0)
6695 -+ retval = sched_setaffinity(pid, new_mask);
6696 -+ free_cpumask_var(new_mask);
6697 -+ return retval;
6698 -+}
6699 -+
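
User space reaches the affinity path above via glibc's sched_setaffinity()
wrapper; a minimal sketch that pins the caller to CPU 0 (assumes _GNU_SOURCE
for the CPU_* macros and that CPU 0 is in the caller's cpuset):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);       /* restrict ourselves to CPU 0 */

        /* Ends up in sched_setaffinity() -> __sched_setaffinity() above,
         * after the cpuset intersection. */
        if (sched_setaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_setaffinity");
                return 1;
        }
        printf("now running on CPU %d\n", sched_getcpu());
        return 0;
}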
6700 -+long sched_getaffinity(pid_t pid, cpumask_t *mask)
6701 -+{
6702 -+ struct task_struct *p;
6703 -+ raw_spinlock_t *lock;
6704 -+ unsigned long flags;
6705 -+ int retval;
6706 -+
6707 -+ rcu_read_lock();
6708 -+
6709 -+ retval = -ESRCH;
6710 -+ p = find_process_by_pid(pid);
6711 -+ if (!p)
6712 -+ goto out_unlock;
6713 -+
6714 -+ retval = security_task_getscheduler(p);
6715 -+ if (retval)
6716 -+ goto out_unlock;
6717 -+
6718 -+ task_access_lock_irqsave(p, &lock, &flags);
6719 -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
6720 -+ task_access_unlock_irqrestore(p, lock, &flags);
6721 -+
6722 -+out_unlock:
6723 -+ rcu_read_unlock();
6724 -+
6725 -+ return retval;
6726 -+}
6727 -+
6728 -+/**
6729 -+ * sys_sched_getaffinity - get the CPU affinity of a process
6730 -+ * @pid: pid of the process
6731 -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6732 -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask
6733 -+ *
6734 -+ * Return: size of CPU mask copied to user_mask_ptr on success. An
6735 -+ * error code otherwise.
6736 -+ */
6737 -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6738 -+ unsigned long __user *, user_mask_ptr)
6739 -+{
6740 -+ int ret;
6741 -+ cpumask_var_t mask;
6742 -+
6743 -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
6744 -+ return -EINVAL;
6745 -+ if (len & (sizeof(unsigned long)-1))
6746 -+ return -EINVAL;
6747 -+
6748 -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6749 -+ return -ENOMEM;
6750 -+
6751 -+ ret = sched_getaffinity(pid, mask);
6752 -+ if (ret == 0) {
6753 -+ unsigned int retlen = min_t(size_t, len, cpumask_size());
6754 -+
6755 -+ if (copy_to_user(user_mask_ptr, mask, retlen))
6756 -+ ret = -EFAULT;
6757 -+ else
6758 -+ ret = retlen;
6759 -+ }
6760 -+ free_cpumask_var(mask);
6761 -+
6762 -+ return ret;
6763 -+}
6764 -+
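
And the matching read side, again through the glibc wrapper (the kernel
returns the intersection of p->cpus_mask and cpu_active_mask, as shown in
sched_getaffinity() above):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        cpu_set_t set;
        long cpu, ncpus = sysconf(_SC_NPROCESSORS_CONF);

        if (sched_getaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_getaffinity");
                return 1;
        }
        for (cpu = 0; cpu < ncpus && cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &set))
                        printf("allowed on CPU %ld\n", cpu);
        return 0;
}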
6765 -+static void do_sched_yield(void)
6766 -+{
6767 -+ struct rq *rq;
6768 -+ struct rq_flags rf;
6769 -+
6770 -+ if (!sched_yield_type)
6771 -+ return;
6772 -+
6773 -+ rq = this_rq_lock_irq(&rf);
6774 -+
6775 -+ schedstat_inc(rq->yld_count);
6776 -+
6777 -+ if (1 == sched_yield_type) {
6778 -+ if (!rt_task(current))
6779 -+ do_sched_yield_type_1(current, rq);
6780 -+ } else if (2 == sched_yield_type) {
6781 -+ if (rq->nr_running > 1)
6782 -+ rq->skip = current;
6783 -+ }
6784 -+
6785 -+ preempt_disable();
6786 -+ raw_spin_unlock_irq(&rq->lock);
6787 -+ sched_preempt_enable_no_resched();
6788 -+
6789 -+ schedule();
6790 -+}
6791 -+
6792 -+/**
6793 -+ * sys_sched_yield - yield the current processor to other threads.
6794 -+ *
6795 -+ * This function yields the current CPU to other tasks. If there are no
6796 -+ * other threads running on this CPU then this function will return.
6797 -+ *
6798 -+ * Return: 0.
6799 -+ */
6800 -+SYSCALL_DEFINE0(sched_yield)
6801 -+{
6802 -+ do_sched_yield();
6803 -+ return 0;
6804 -+}
6805 -+
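
A trivial caller of the syscall above; under this scheduler what the yield
actually does depends on the sched_yield_type value tested in do_sched_yield()
(deboost/requeue for non-RT tasks when it is 1, mark the run queue's skip task
when it is 2, otherwise nothing):

#include <sched.h>
#include <stdio.h>

int main(void)
{
        int i;

        for (i = 0; i < 5; i++) {
                printf("iteration %d, yielding\n", i);
                sched_yield();          /* enters SYSCALL_DEFINE0(sched_yield) */
        }
        return 0;
}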
6806 -+#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
6807 -+int __sched __cond_resched(void)
6808 -+{
6809 -+ if (should_resched(0)) {
6810 -+ preempt_schedule_common();
6811 -+ return 1;
6812 -+ }
6813 -+ /*
6814 -+ * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
6815 -+ * whether the current CPU is in an RCU read-side critical section,
6816 -+ * so the tick can report quiescent states even for CPUs looping
6817 -+ * in kernel context. In contrast, in non-preemptible kernels,
6818 -+ * RCU readers leave no in-memory hints, which means that CPU-bound
6819 -+ * processes executing in kernel context might never report an
6820 -+ * RCU quiescent state. Therefore, the following code causes
6821 -+ * cond_resched() to report a quiescent state, but only when RCU
6822 -+ * is in urgent need of one.
6823 -+ */
6824 -+#ifndef CONFIG_PREEMPT_RCU
6825 -+ rcu_all_qs();
6826 -+#endif
6827 -+ return 0;
6828 -+}
6829 -+EXPORT_SYMBOL(__cond_resched);
6830 -+#endif
6831 -+
6832 -+#ifdef CONFIG_PREEMPT_DYNAMIC
6833 -+DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
6834 -+EXPORT_STATIC_CALL_TRAMP(cond_resched);
6835 -+
6836 -+DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
6837 -+EXPORT_STATIC_CALL_TRAMP(might_resched);
6838 -+#endif
6839 -+
6840 -+/*
6841 -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6842 -+ * call schedule, and on return reacquire the lock.
6843 -+ *
6844 -+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
6845 -+ * operations here to prevent schedule() from being called twice (once via
6846 -+ * spin_unlock(), once by hand).
6847 -+ */
6848 -+int __cond_resched_lock(spinlock_t *lock)
6849 -+{
6850 -+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
6851 -+ int ret = 0;
6852 -+
6853 -+ lockdep_assert_held(lock);
6854 -+
6855 -+ if (spin_needbreak(lock) || resched) {
6856 -+ spin_unlock(lock);
6857 -+ if (resched)
6858 -+ preempt_schedule_common();
6859 -+ else
6860 -+ cpu_relax();
6861 -+ ret = 1;
6862 -+ spin_lock(lock);
6863 -+ }
6864 -+ return ret;
6865 -+}
6866 -+EXPORT_SYMBOL(__cond_resched_lock);
6867 -+
6868 -+int __cond_resched_rwlock_read(rwlock_t *lock)
6869 -+{
6870 -+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
6871 -+ int ret = 0;
6872 -+
6873 -+ lockdep_assert_held_read(lock);
6874 -+
6875 -+ if (rwlock_needbreak(lock) || resched) {
6876 -+ read_unlock(lock);
6877 -+ if (resched)
6878 -+ preempt_schedule_common();
6879 -+ else
6880 -+ cpu_relax();
6881 -+ ret = 1;
6882 -+ read_lock(lock);
6883 -+ }
6884 -+ return ret;
6885 -+}
6886 -+EXPORT_SYMBOL(__cond_resched_rwlock_read);
6887 -+
6888 -+int __cond_resched_rwlock_write(rwlock_t *lock)
6889 -+{
6890 -+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
6891 -+ int ret = 0;
6892 -+
6893 -+ lockdep_assert_held_write(lock);
6894 -+
6895 -+ if (rwlock_needbreak(lock) || resched) {
6896 -+ write_unlock(lock);
6897 -+ if (resched)
6898 -+ preempt_schedule_common();
6899 -+ else
6900 -+ cpu_relax();
6901 -+ ret = 1;
6902 -+ write_lock(lock);
6903 -+ }
6904 -+ return ret;
6905 -+}
6906 -+EXPORT_SYMBOL(__cond_resched_rwlock_write);
6907 -+
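The three __cond_resched_*lock() helpers above share one pattern: if a reschedule is pending or the lock is contended, drop the lock, let the scheduler (or cpu_relax()) run, and re-take it. A hedged kernel-style sketch of a typical call site follows; the structure and field names are made up for illustration.

#include <linux/spinlock.h>
#include <linux/sched.h>

struct my_table {			/* hypothetical data structure */
	spinlock_t	lock;
	unsigned int	nr_buckets;
	unsigned long	*buckets;
};

static void my_table_clear(struct my_table *tbl)
{
	unsigned int i;

	spin_lock(&tbl->lock);
	for (i = 0; i < tbl->nr_buckets; i++) {
		tbl->buckets[i] = 0;
		/* Drops and re-takes tbl->lock when a resched or a waiter is pending. */
		cond_resched_lock(&tbl->lock);
	}
	spin_unlock(&tbl->lock);
}
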
6908 -+/**
6909 -+ * yield - yield the current processor to other threads.
6910 -+ *
6911 -+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
6912 -+ *
6913 -+ * The scheduler is at all times free to pick the calling task as the most
6914 -+ * eligible task to run, if removing the yield() call from your code breaks
6915 -+ * it, it's already broken.
6916 -+ *
6917 -+ * Typical broken usage is:
6918 -+ *
6919 -+ * while (!event)
6920 -+ * yield();
6921 -+ *
6922 -+ * where one assumes that yield() will let 'the other' process run that will
6923 -+ * make event true. If the current task is a SCHED_FIFO task that will never
6924 -+ * happen. Never use yield() as a progress guarantee!!
6925 -+ *
6926 -+ * If you want to use yield() to wait for something, use wait_event().
6927 -+ * If you want to use yield() to be 'nice' for others, use cond_resched().
6928 -+ * If you still want to use yield(), do not!
6929 -+ */
6930 -+void __sched yield(void)
6931 -+{
6932 -+ set_current_state(TASK_RUNNING);
6933 -+ do_sched_yield();
6934 -+}
6935 -+EXPORT_SYMBOL(yield);
6936 -+
6937 -+/**
6938 -+ * yield_to - yield the current processor to another thread in
6939 -+ * your thread group, or accelerate that thread toward the
6940 -+ * processor it's on.
6941 -+ * @p: target task
6942 -+ * @preempt: whether task preemption is allowed or not
6943 -+ *
6944 -+ * It's the caller's job to ensure that the target task struct
6945 -+ * can't go away on us before we can do any checks.
6946 -+ *
6947 -+ * In Alt schedule FW, yield_to is not supported.
6948 -+ *
6949 -+ * Return:
6950 -+ * true (>0) if we indeed boosted the target task.
6951 -+ * false (0) if we failed to boost the target.
6952 -+ * -ESRCH if there's no task to yield to.
6953 -+ */
6954 -+int __sched yield_to(struct task_struct *p, bool preempt)
6955 -+{
6956 -+ return 0;
6957 -+}
6958 -+EXPORT_SYMBOL_GPL(yield_to);
6959 -+
6960 -+int io_schedule_prepare(void)
6961 -+{
6962 -+ int old_iowait = current->in_iowait;
6963 -+
6964 -+ current->in_iowait = 1;
6965 -+ blk_schedule_flush_plug(current);
6966 -+
6967 -+ return old_iowait;
6968 -+}
6969 -+
6970 -+void io_schedule_finish(int token)
6971 -+{
6972 -+ current->in_iowait = token;
6973 -+}
6974 -+
6975 -+/*
6976 -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6977 -+ * that process accounting knows that this is a task in IO wait state.
6978 -+ *
6979 -+ * But don't do that if it is a deliberate, throttling IO wait (this task
6980 -+ * has set its backing_dev_info: the queue against which it should throttle)
6981 -+ */
6982 -+
6983 -+long __sched io_schedule_timeout(long timeout)
6984 -+{
6985 -+ int token;
6986 -+ long ret;
6987 -+
6988 -+ token = io_schedule_prepare();
6989 -+ ret = schedule_timeout(timeout);
6990 -+ io_schedule_finish(token);
6991 -+
6992 -+ return ret;
6993 -+}
6994 -+EXPORT_SYMBOL(io_schedule_timeout);
6995 -+
6996 -+void __sched io_schedule(void)
6997 -+{
6998 -+ int token;
6999 -+
7000 -+ token = io_schedule_prepare();
7001 -+ schedule();
7002 -+ io_schedule_finish(token);
7003 -+}
7004 -+EXPORT_SYMBOL(io_schedule);
7005 -+
7006 -+/**
7007 -+ * sys_sched_get_priority_max - return maximum RT priority.
7008 -+ * @policy: scheduling class.
7009 -+ *
7010 -+ * Return: On success, this syscall returns the maximum
7011 -+ * rt_priority that can be used by a given scheduling class.
7012 -+ * On failure, a negative error code is returned.
7013 -+ */
7014 -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
7015 -+{
7016 -+ int ret = -EINVAL;
7017 -+
7018 -+ switch (policy) {
7019 -+ case SCHED_FIFO:
7020 -+ case SCHED_RR:
7021 -+ ret = MAX_RT_PRIO - 1;
7022 -+ break;
7023 -+ case SCHED_NORMAL:
7024 -+ case SCHED_BATCH:
7025 -+ case SCHED_IDLE:
7026 -+ ret = 0;
7027 -+ break;
7028 -+ }
7029 -+ return ret;
7030 -+}
7031 -+
7032 -+/**
7033 -+ * sys_sched_get_priority_min - return minimum RT priority.
7034 -+ * @policy: scheduling class.
7035 -+ *
7036 -+ * Return: On success, this syscall returns the minimum
7037 -+ * rt_priority that can be used by a given scheduling class.
7038 -+ * On failure, a negative error code is returned.
7039 -+ */
7040 -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
7041 -+{
7042 -+ int ret = -EINVAL;
7043 -+
7044 -+ switch (policy) {
7045 -+ case SCHED_FIFO:
7046 -+ case SCHED_RR:
7047 -+ ret = 1;
7048 -+ break;
7049 -+ case SCHED_NORMAL:
7050 -+ case SCHED_BATCH:
7051 -+ case SCHED_IDLE:
7052 -+ ret = 0;
7053 -+ break;
7054 -+ }
7055 -+ return ret;
7056 -+}
7057 -+
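For reference, the two syscalls above are reachable from userspace through the standard wrappers in <sched.h>; a small sketch that prints the range reported for SCHED_FIFO:

#include <sched.h>
#include <stdio.h>

int main(void)
{
	int max = sched_get_priority_max(SCHED_FIFO);
	int min = sched_get_priority_min(SCHED_FIFO);

	if (max < 0 || min < 0) {
		perror("sched_get_priority_max/min");
		return 1;
	}
	printf("SCHED_FIFO priority range: %d..%d\n", min, max);
	return 0;
}
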
7058 -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
7059 -+{
7060 -+ struct task_struct *p;
7061 -+ int retval;
7062 -+
7063 -+ alt_sched_debug();
7064 -+
7065 -+ if (pid < 0)
7066 -+ return -EINVAL;
7067 -+
7068 -+ retval = -ESRCH;
7069 -+ rcu_read_lock();
7070 -+ p = find_process_by_pid(pid);
7071 -+ if (!p)
7072 -+ goto out_unlock;
7073 -+
7074 -+ retval = security_task_getscheduler(p);
7075 -+ if (retval)
7076 -+ goto out_unlock;
7077 -+ rcu_read_unlock();
7078 -+
7079 -+ *t = ns_to_timespec64(sched_timeslice_ns);
7080 -+ return 0;
7081 -+
7082 -+out_unlock:
7083 -+ rcu_read_unlock();
7084 -+ return retval;
7085 -+}
7086 -+
7087 -+/**
7088 -+ * sys_sched_rr_get_interval - return the default timeslice of a process.
7089 -+ * @pid: pid of the process.
7090 -+ * @interval: userspace pointer to the timeslice value.
7091 -+ *
7093 -+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
7094 -+ * an error code.
7095 -+ */
7096 -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
7097 -+ struct __kernel_timespec __user *, interval)
7098 -+{
7099 -+ struct timespec64 t;
7100 -+ int retval = sched_rr_get_interval(pid, &t);
7101 -+
7102 -+ if (retval == 0)
7103 -+ retval = put_timespec64(&t, interval);
7104 -+
7105 -+ return retval;
7106 -+}
7107 -+
7108 -+#ifdef CONFIG_COMPAT_32BIT_TIME
7109 -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
7110 -+ struct old_timespec32 __user *, interval)
7111 -+{
7112 -+ struct timespec64 t;
7113 -+ int retval = sched_rr_get_interval(pid, &t);
7114 -+
7115 -+ if (retval == 0)
7116 -+ retval = put_old_timespec32(&t, interval);
7117 -+ return retval;
7118 -+}
7119 -+#endif
7120 -+
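Note that the helper above reports sched_timeslice_ns for any pid it finds, independent of policy. From userspace the value can be observed with the standard wrapper, as in this sketch:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) != 0) {	/* 0 = current task */
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("reported timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
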
7121 -+void sched_show_task(struct task_struct *p)
7122 -+{
7123 -+ unsigned long free = 0;
7124 -+ int ppid;
7125 -+
7126 -+ if (!try_get_task_stack(p))
7127 -+ return;
7128 -+
7129 -+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
7130 -+
7131 -+ if (task_is_running(p))
7132 -+ pr_cont(" running task ");
7133 -+#ifdef CONFIG_DEBUG_STACK_USAGE
7134 -+ free = stack_not_used(p);
7135 -+#endif
7136 -+ ppid = 0;
7137 -+ rcu_read_lock();
7138 -+ if (pid_alive(p))
7139 -+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
7140 -+ rcu_read_unlock();
7141 -+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
7142 -+ free, task_pid_nr(p), ppid,
7143 -+ (unsigned long)task_thread_info(p)->flags);
7144 -+
7145 -+ print_worker_info(KERN_INFO, p);
7146 -+ print_stop_info(KERN_INFO, p);
7147 -+ show_stack(p, NULL, KERN_INFO);
7148 -+ put_task_stack(p);
7149 -+}
7150 -+EXPORT_SYMBOL_GPL(sched_show_task);
7151 -+
7152 -+static inline bool
7153 -+state_filter_match(unsigned long state_filter, struct task_struct *p)
7154 -+{
7155 -+ unsigned int state = READ_ONCE(p->__state);
7156 -+
7157 -+ /* no filter, everything matches */
7158 -+ if (!state_filter)
7159 -+ return true;
7160 -+
7161 -+ /* filter, but doesn't match */
7162 -+ if (!(state & state_filter))
7163 -+ return false;
7164 -+
7165 -+ /*
7166 -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
7167 -+ * TASK_KILLABLE).
7168 -+ */
7169 -+ if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
7170 -+ return false;
7171 -+
7172 -+ return true;
7173 -+}
7174 -+
7175 -+
7176 -+void show_state_filter(unsigned int state_filter)
7177 -+{
7178 -+ struct task_struct *g, *p;
7179 -+
7180 -+ rcu_read_lock();
7181 -+ for_each_process_thread(g, p) {
7182 -+ /*
7183 -+ * reset the NMI-timeout, listing all files on a slow
7184 -+ * console might take a lot of time:
7185 -+ * Also, reset softlockup watchdogs on all CPUs, because
7186 -+ * another CPU might be blocked waiting for us to process
7187 -+ * an IPI.
7188 -+ */
7189 -+ touch_nmi_watchdog();
7190 -+ touch_all_softlockup_watchdogs();
7191 -+ if (state_filter_match(state_filter, p))
7192 -+ sched_show_task(p);
7193 -+ }
7194 -+
7195 -+#ifdef CONFIG_SCHED_DEBUG
7196 -+ /* TODO: Alt schedule FW should support this
7197 -+ if (!state_filter)
7198 -+ sysrq_sched_debug_show();
7199 -+ */
7200 -+#endif
7201 -+ rcu_read_unlock();
7202 -+ /*
7203 -+ * Only show locks if all tasks are dumped:
7204 -+ */
7205 -+ if (!state_filter)
7206 -+ debug_show_all_locks();
7207 -+}
7208 -+
7209 -+void dump_cpu_task(int cpu)
7210 -+{
7211 -+ pr_info("Task dump for CPU %d:\n", cpu);
7212 -+ sched_show_task(cpu_curr(cpu));
7213 -+}
7214 -+
7215 -+/**
7216 -+ * init_idle - set up an idle thread for a given CPU
7217 -+ * @idle: task in question
7218 -+ * @cpu: CPU the idle task belongs to
7219 -+ *
7220 -+ * NOTE: this function does not set the idle thread's NEED_RESCHED
7221 -+ * flag, to make booting more robust.
7222 -+ */
7223 -+void __init init_idle(struct task_struct *idle, int cpu)
7224 -+{
7225 -+ struct rq *rq = cpu_rq(cpu);
7226 -+ unsigned long flags;
7227 -+
7228 -+ __sched_fork(0, idle);
7229 -+
7230 -+ /*
7231 -+ * The idle task doesn't need the kthread struct to function, but it
7232 -+ * is dressed up as a per-CPU kthread and thus needs to play the part
7233 -+ * if we want to avoid special-casing it in code that deals with per-CPU
7234 -+ * kthreads.
7235 -+ */
7236 -+ set_kthread_struct(idle);
7237 -+
7238 -+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
7239 -+ raw_spin_lock(&rq->lock);
7240 -+ update_rq_clock(rq);
7241 -+
7242 -+ idle->last_ran = rq->clock_task;
7243 -+ idle->__state = TASK_RUNNING;
7244 -+ /*
7245 -+ * PF_KTHREAD should already be set at this point; regardless, make it
7246 -+ * look like a proper per-CPU kthread.
7247 -+ */
7248 -+ idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
7249 -+ kthread_set_per_cpu(idle, cpu);
7250 -+
7251 -+ sched_queue_init_idle(&rq->queue, idle);
7252 -+
7253 -+ scs_task_reset(idle);
7254 -+ kasan_unpoison_task_stack(idle);
7255 -+
7256 -+#ifdef CONFIG_SMP
7257 -+ /*
7258 -+ * It's possible that init_idle() gets called multiple times on a task,
7259 -+ * in that case do_set_cpus_allowed() will not do the right thing.
7260 -+ *
7261 -+ * And since this is boot we can forgo the serialisation.
7262 -+ */
7263 -+ set_cpus_allowed_common(idle, cpumask_of(cpu));
7264 -+#endif
7265 -+
7266 -+ /* Silence PROVE_RCU */
7267 -+ rcu_read_lock();
7268 -+ __set_task_cpu(idle, cpu);
7269 -+ rcu_read_unlock();
7270 -+
7271 -+ rq->idle = idle;
7272 -+ rcu_assign_pointer(rq->curr, idle);
7273 -+ idle->on_cpu = 1;
7274 -+
7275 -+ raw_spin_unlock(&rq->lock);
7276 -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
7277 -+
7278 -+ /* Set the preempt count _outside_ the spinlocks! */
7279 -+ init_idle_preempt_count(idle, cpu);
7280 -+
7281 -+ ftrace_graph_init_idle_task(idle, cpu);
7282 -+ vtime_init_idle(idle, cpu);
7283 -+#ifdef CONFIG_SMP
7284 -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
7285 -+#endif
7286 -+}
7287 -+
7288 -+#ifdef CONFIG_SMP
7289 -+
7290 -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
7291 -+ const struct cpumask __maybe_unused *trial)
7292 -+{
7293 -+ return 1;
7294 -+}
7295 -+
7296 -+int task_can_attach(struct task_struct *p,
7297 -+ const struct cpumask *cs_cpus_allowed)
7298 -+{
7299 -+ int ret = 0;
7300 -+
7301 -+ /*
7302 -+ * Kthreads which disallow setaffinity shouldn't be moved
7303 -+ * to a new cpuset; we don't want to change their CPU
7304 -+ * affinity and isolating such threads by their set of
7305 -+ * allowed nodes is unnecessary. Thus, cpusets are not
7306 -+ * applicable for such threads. This prevents checking for
7307 -+ * success of set_cpus_allowed_ptr() on all attached tasks
7308 -+ * before cpus_mask may be changed.
7309 -+ */
7310 -+ if (p->flags & PF_NO_SETAFFINITY)
7311 -+ ret = -EINVAL;
7312 -+
7313 -+ return ret;
7314 -+}
7315 -+
7316 -+bool sched_smp_initialized __read_mostly;
7317 -+
7318 -+#ifdef CONFIG_HOTPLUG_CPU
7319 -+/*
7320 -+ * Ensures that the idle task is using init_mm right before its CPU goes
7321 -+ * offline.
7322 -+ */
7323 -+void idle_task_exit(void)
7324 -+{
7325 -+ struct mm_struct *mm = current->active_mm;
7326 -+
7327 -+ BUG_ON(current != this_rq()->idle);
7328 -+
7329 -+ if (mm != &init_mm) {
7330 -+ switch_mm(mm, &init_mm, current);
7331 -+ finish_arch_post_lock_switch();
7332 -+ }
7333 -+
7334 -+ scs_task_reset(current);
7335 -+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
7336 -+}
7337 -+
7338 -+static int __balance_push_cpu_stop(void *arg)
7339 -+{
7340 -+ struct task_struct *p = arg;
7341 -+ struct rq *rq = this_rq();
7342 -+ struct rq_flags rf;
7343 -+ int cpu;
7344 -+
7345 -+ raw_spin_lock_irq(&p->pi_lock);
7346 -+ rq_lock(rq, &rf);
7347 -+
7348 -+ update_rq_clock(rq);
7349 -+
7350 -+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
7351 -+ cpu = select_fallback_rq(rq->cpu, p);
7352 -+ rq = __migrate_task(rq, p, cpu);
7353 -+ }
7354 -+
7355 -+ rq_unlock(rq, &rf);
7356 -+ raw_spin_unlock_irq(&p->pi_lock);
7357 -+
7358 -+ put_task_struct(p);
7359 -+
7360 -+ return 0;
7361 -+}
7362 -+
7363 -+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
7364 -+
7365 -+/*
7366 -+ * This is enabled below SCHED_AP_ACTIVE, i.e. when !cpu_active(), but it
7367 -+ * only takes effect while the hotplug direction is down (CPU going offline).
7368 -+ */
7369 -+static void balance_push(struct rq *rq)
7370 -+{
7371 -+ struct task_struct *push_task = rq->curr;
7372 -+
7373 -+ lockdep_assert_held(&rq->lock);
7374 -+
7375 -+ /*
7376 -+ * Ensure the thing is persistent until balance_push_set(.on = false);
7377 -+ */
7378 -+ rq->balance_callback = &balance_push_callback;
7379 -+
7380 -+ /*
7381 -+ * Only active while going offline and when invoked on the outgoing
7382 -+ * CPU.
7383 -+ */
7384 -+ if (!cpu_dying(rq->cpu) || rq != this_rq())
7385 -+ return;
7386 -+
7387 -+ /*
7388 -+ * Both the cpu-hotplug and stop task are in this case and are
7389 -+ * required to complete the hotplug process.
7390 -+ */
7391 -+ if (kthread_is_per_cpu(push_task) ||
7392 -+ is_migration_disabled(push_task)) {
7393 -+
7394 -+ /*
7395 -+ * If this is the idle task on the outgoing CPU try to wake
7396 -+ * up the hotplug control thread which might wait for the
7397 -+ * last task to vanish. The rcuwait_active() check is
7398 -+ * accurate here because the waiter is pinned on this CPU
7399 -+ * and can't obviously be running in parallel.
7400 -+ *
7401 -+ * On RT kernels this also has to check whether there are
7402 -+ * pinned and scheduled out tasks on the runqueue. They
7403 -+ * need to leave the migrate disabled section first.
7404 -+ */
7405 -+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
7406 -+ rcuwait_active(&rq->hotplug_wait)) {
7407 -+ raw_spin_unlock(&rq->lock);
7408 -+ rcuwait_wake_up(&rq->hotplug_wait);
7409 -+ raw_spin_lock(&rq->lock);
7410 -+ }
7411 -+ return;
7412 -+ }
7413 -+
7414 -+ get_task_struct(push_task);
7415 -+ /*
7416 -+ * Temporarily drop rq->lock such that we can wake-up the stop task.
7417 -+ * Both preemption and IRQs are still disabled.
7418 -+ */
7419 -+ raw_spin_unlock(&rq->lock);
7420 -+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
7421 -+ this_cpu_ptr(&push_work));
7422 -+ /*
7423 -+ * At this point need_resched() is true and we'll take the loop in
7424 -+ * schedule(). The next pick is obviously going to be the stop task
7425 -+ * which kthread_is_per_cpu() and will push this task away.
7426 -+ */
7427 -+ raw_spin_lock(&rq->lock);
7428 -+}
7429 -+
7430 -+static void balance_push_set(int cpu, bool on)
7431 -+{
7432 -+ struct rq *rq = cpu_rq(cpu);
7433 -+ struct rq_flags rf;
7434 -+
7435 -+ rq_lock_irqsave(rq, &rf);
7436 -+ if (on) {
7437 -+ WARN_ON_ONCE(rq->balance_callback);
7438 -+ rq->balance_callback = &balance_push_callback;
7439 -+ } else if (rq->balance_callback == &balance_push_callback) {
7440 -+ rq->balance_callback = NULL;
7441 -+ }
7442 -+ rq_unlock_irqrestore(rq, &rf);
7443 -+}
7444 -+
7445 -+/*
7446 -+ * Invoked from a CPUs hotplug control thread after the CPU has been marked
7447 -+ * inactive. All tasks which are not per CPU kernel threads are either
7448 -+ * pushed off this CPU now via balance_push() or placed on a different CPU
7449 -+ * during wakeup. Wait until the CPU is quiescent.
7450 -+ */
7451 -+static void balance_hotplug_wait(void)
7452 -+{
7453 -+ struct rq *rq = this_rq();
7454 -+
7455 -+ rcuwait_wait_event(&rq->hotplug_wait,
7456 -+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
7457 -+ TASK_UNINTERRUPTIBLE);
7458 -+}
7459 -+
7460 -+#else
7461 -+
7462 -+static void balance_push(struct rq *rq)
7463 -+{
7464 -+}
7465 -+
7466 -+static void balance_push_set(int cpu, bool on)
7467 -+{
7468 -+}
7469 -+
7470 -+static inline void balance_hotplug_wait(void)
7471 -+{
7472 -+}
7473 -+#endif /* CONFIG_HOTPLUG_CPU */
7474 -+
7475 -+static void set_rq_offline(struct rq *rq)
7476 -+{
7477 -+ if (rq->online)
7478 -+ rq->online = false;
7479 -+}
7480 -+
7481 -+static void set_rq_online(struct rq *rq)
7482 -+{
7483 -+ if (!rq->online)
7484 -+ rq->online = true;
7485 -+}
7486 -+
7487 -+/*
7488 -+ * used to mark begin/end of suspend/resume:
7489 -+ */
7490 -+static int num_cpus_frozen;
7491 -+
7492 -+/*
7493 -+ * Update cpusets according to cpu_active mask. If cpusets are
7494 -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7495 -+ * around partition_sched_domains().
7496 -+ *
7497 -+ * If we come here as part of a suspend/resume, don't touch cpusets because we
7498 -+ * want to restore it back to its original state upon resume anyway.
7499 -+ */
7500 -+static void cpuset_cpu_active(void)
7501 -+{
7502 -+ if (cpuhp_tasks_frozen) {
7503 -+ /*
7504 -+ * num_cpus_frozen tracks how many CPUs are involved in suspend
7505 -+ * resume sequence. As long as this is not the last online
7506 -+ * operation in the resume sequence, just build a single sched
7507 -+ * domain, ignoring cpusets.
7508 -+ */
7509 -+ partition_sched_domains(1, NULL, NULL);
7510 -+ if (--num_cpus_frozen)
7511 -+ return;
7512 -+ /*
7513 -+ * This is the last CPU online operation. So fall through and
7514 -+ * restore the original sched domains by considering the
7515 -+ * cpuset configurations.
7516 -+ */
7517 -+ cpuset_force_rebuild();
7518 -+ }
7519 -+
7520 -+ cpuset_update_active_cpus();
7521 -+}
7522 -+
7523 -+static int cpuset_cpu_inactive(unsigned int cpu)
7524 -+{
7525 -+ if (!cpuhp_tasks_frozen) {
7526 -+ cpuset_update_active_cpus();
7527 -+ } else {
7528 -+ num_cpus_frozen++;
7529 -+ partition_sched_domains(1, NULL, NULL);
7530 -+ }
7531 -+ return 0;
7532 -+}
7533 -+
7534 -+int sched_cpu_activate(unsigned int cpu)
7535 -+{
7536 -+ struct rq *rq = cpu_rq(cpu);
7537 -+ unsigned long flags;
7538 -+
7539 -+ /*
7540 -+ * Clear the balance_push callback and prepare to schedule
7541 -+ * regular tasks.
7542 -+ */
7543 -+ balance_push_set(cpu, false);
7544 -+
7545 -+#ifdef CONFIG_SCHED_SMT
7546 -+ /*
7547 -+ * When going up, increment the number of cores with SMT present.
7548 -+ */
7549 -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
7550 -+ static_branch_inc_cpuslocked(&sched_smt_present);
7551 -+#endif
7552 -+ set_cpu_active(cpu, true);
7553 -+
7554 -+ if (sched_smp_initialized)
7555 -+ cpuset_cpu_active();
7556 -+
7557 -+ /*
7558 -+ * Put the rq online, if not already. This happens:
7559 -+ *
7560 -+ * 1) In the early boot process, because we build the real domains
7561 -+ * after all cpus have been brought up.
7562 -+ *
7563 -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
7564 -+ * domains.
7565 -+ */
7566 -+ raw_spin_lock_irqsave(&rq->lock, flags);
7567 -+ set_rq_online(rq);
7568 -+ raw_spin_unlock_irqrestore(&rq->lock, flags);
7569 -+
7570 -+ return 0;
7571 -+}
7572 -+
7573 -+int sched_cpu_deactivate(unsigned int cpu)
7574 -+{
7575 -+ struct rq *rq = cpu_rq(cpu);
7576 -+ unsigned long flags;
7577 -+ int ret;
7578 -+
7579 -+ set_cpu_active(cpu, false);
7580 -+
7581 -+ /*
7582 -+ * From this point forward, this CPU will refuse to run any task that
7583 -+ * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
7584 -+ * push those tasks away until this gets cleared, see
7585 -+ * sched_cpu_dying().
7586 -+ */
7587 -+ balance_push_set(cpu, true);
7588 -+
7589 -+ /*
7590 -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
7591 -+ * users of this state to go away such that all new such users will
7592 -+ * observe it.
7593 -+ *
7594 -+ * Specifically, we rely on ttwu to no longer target this CPU, see
7595 -+ * ttwu_queue_cond() and is_cpu_allowed().
7596 -+ *
7597 -+	 * Do the sync before parking smpboot threads to take care of the RCU boost case.
7598 -+ */
7599 -+ synchronize_rcu();
7600 -+
7601 -+ raw_spin_lock_irqsave(&rq->lock, flags);
7602 -+ update_rq_clock(rq);
7603 -+ set_rq_offline(rq);
7604 -+ raw_spin_unlock_irqrestore(&rq->lock, flags);
7605 -+
7606 -+#ifdef CONFIG_SCHED_SMT
7607 -+ /*
7608 -+ * When going down, decrement the number of cores with SMT present.
7609 -+ */
7610 -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
7611 -+ static_branch_dec_cpuslocked(&sched_smt_present);
7612 -+ if (!static_branch_likely(&sched_smt_present))
7613 -+ cpumask_clear(&sched_sg_idle_mask);
7614 -+ }
7615 -+#endif
7616 -+
7617 -+ if (!sched_smp_initialized)
7618 -+ return 0;
7619 -+
7620 -+ ret = cpuset_cpu_inactive(cpu);
7621 -+ if (ret) {
7622 -+ balance_push_set(cpu, false);
7623 -+ set_cpu_active(cpu, true);
7624 -+ return ret;
7625 -+ }
7626 -+
7627 -+ return 0;
7628 -+}
7629 -+
7630 -+static void sched_rq_cpu_starting(unsigned int cpu)
7631 -+{
7632 -+ struct rq *rq = cpu_rq(cpu);
7633 -+
7634 -+ rq->calc_load_update = calc_load_update;
7635 -+}
7636 -+
7637 -+int sched_cpu_starting(unsigned int cpu)
7638 -+{
7639 -+ sched_rq_cpu_starting(cpu);
7640 -+ sched_tick_start(cpu);
7641 -+ return 0;
7642 -+}
7643 -+
7644 -+#ifdef CONFIG_HOTPLUG_CPU
7645 -+
7646 -+/*
7647 -+ * Invoked immediately before the stopper thread is invoked to bring the
7648 -+ * CPU down completely. At this point all per CPU kthreads except the
7649 -+ * hotplug thread (current) and the stopper thread (inactive) have been
7650 -+ * either parked or have been unbound from the outgoing CPU. Ensure that
7651 -+ * any of those which might be on the way out are gone.
7652 -+ *
7653 -+ * If after this point a bound task is being woken on this CPU then the
7654 -+ * responsible hotplug callback has failed to do its job.
7655 -+ * sched_cpu_dying() will catch it with the appropriate fireworks.
7656 -+ */
7657 -+int sched_cpu_wait_empty(unsigned int cpu)
7658 -+{
7659 -+ balance_hotplug_wait();
7660 -+ return 0;
7661 -+}
7662 -+
7663 -+/*
7664 -+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
7665 -+ * might have. Called from the CPU stopper task after ensuring that the
7666 -+ * stopper is the last running task on the CPU, so nr_active count is
7667 -+ * stable. We need to take the teardown thread which is calling this into
7668 -+ * account, so we hand in adjust = 1 to the load calculation.
7669 -+ *
7670 -+ * Also see the comment "Global load-average calculations".
7671 -+ */
7672 -+static void calc_load_migrate(struct rq *rq)
7673 -+{
7674 -+ long delta = calc_load_fold_active(rq, 1);
7675 -+
7676 -+ if (delta)
7677 -+ atomic_long_add(delta, &calc_load_tasks);
7678 -+}
7679 -+
7680 -+static void dump_rq_tasks(struct rq *rq, const char *loglvl)
7681 -+{
7682 -+ struct task_struct *g, *p;
7683 -+ int cpu = cpu_of(rq);
7684 -+
7685 -+ lockdep_assert_held(&rq->lock);
7686 -+
7687 -+ printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
7688 -+ for_each_process_thread(g, p) {
7689 -+ if (task_cpu(p) != cpu)
7690 -+ continue;
7691 -+
7692 -+ if (!task_on_rq_queued(p))
7693 -+ continue;
7694 -+
7695 -+ printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
7696 -+ }
7697 -+}
7698 -+
7699 -+int sched_cpu_dying(unsigned int cpu)
7700 -+{
7701 -+ struct rq *rq = cpu_rq(cpu);
7702 -+ unsigned long flags;
7703 -+
7704 -+ /* Handle pending wakeups and then migrate everything off */
7705 -+ sched_tick_stop(cpu);
7706 -+
7707 -+ raw_spin_lock_irqsave(&rq->lock, flags);
7708 -+ if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
7709 -+ WARN(true, "Dying CPU not properly vacated!");
7710 -+ dump_rq_tasks(rq, KERN_WARNING);
7711 -+ }
7712 -+ raw_spin_unlock_irqrestore(&rq->lock, flags);
7713 -+
7714 -+ calc_load_migrate(rq);
7715 -+ hrtick_clear(rq);
7716 -+ return 0;
7717 -+}
7718 -+#endif
7719 -+
7720 -+#ifdef CONFIG_SMP
7721 -+static void sched_init_topology_cpumask_early(void)
7722 -+{
7723 -+ int cpu;
7724 -+ cpumask_t *tmp;
7725 -+
7726 -+ for_each_possible_cpu(cpu) {
7727 -+ /* init topo masks */
7728 -+ tmp = per_cpu(sched_cpu_topo_masks, cpu);
7729 -+
7730 -+ cpumask_copy(tmp, cpumask_of(cpu));
7731 -+ tmp++;
7732 -+ cpumask_copy(tmp, cpu_possible_mask);
7733 -+ per_cpu(sched_cpu_llc_mask, cpu) = tmp;
7734 -+ per_cpu(sched_cpu_topo_end_mask, cpu) = ++tmp;
7735 -+ /*per_cpu(sd_llc_id, cpu) = cpu;*/
7736 -+ }
7737 -+}
7738 -+
7739 -+#define TOPOLOGY_CPUMASK(name, mask, last)\
7740 -+ if (cpumask_and(topo, topo, mask)) { \
7741 -+ cpumask_copy(topo, mask); \
7742 -+ printk(KERN_INFO "sched: cpu#%02d topo: 0x%08lx - "#name, \
7743 -+ cpu, (topo++)->bits[0]); \
7744 -+ } \
7745 -+ if (!last) \
7746 -+ cpumask_complement(topo, mask)
7747 -+
7748 -+static void sched_init_topology_cpumask(void)
7749 -+{
7750 -+ int cpu;
7751 -+ cpumask_t *topo;
7752 -+
7753 -+ for_each_online_cpu(cpu) {
7754 -+ /* take chance to reset time slice for idle tasks */
7755 -+ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns;
7756 -+
7757 -+ topo = per_cpu(sched_cpu_topo_masks, cpu) + 1;
7758 -+
7759 -+ cpumask_complement(topo, cpumask_of(cpu));
7760 -+#ifdef CONFIG_SCHED_SMT
7761 -+ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false);
7762 -+#endif
7763 -+ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu));
7764 -+ per_cpu(sched_cpu_llc_mask, cpu) = topo;
7765 -+ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false);
7766 -+
7767 -+ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false);
7768 -+
7769 -+ TOPOLOGY_CPUMASK(others, cpu_online_mask, true);
7770 -+
7771 -+ per_cpu(sched_cpu_topo_end_mask, cpu) = topo;
7772 -+ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n",
7773 -+ cpu, per_cpu(sd_llc_id, cpu),
7774 -+ (int) (per_cpu(sched_cpu_llc_mask, cpu) -
7775 -+ per_cpu(sched_cpu_topo_masks, cpu)));
7776 -+ }
7777 -+}
7778 -+#endif
7779 -+
7780 -+void __init sched_init_smp(void)
7781 -+{
7782 -+ /* Move init over to a non-isolated CPU */
7783 -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
7784 -+ BUG();
7785 -+ current->flags &= ~PF_NO_SETAFFINITY;
7786 -+
7787 -+ sched_init_topology_cpumask();
7788 -+
7789 -+ sched_smp_initialized = true;
7790 -+}
7791 -+#else
7792 -+void __init sched_init_smp(void)
7793 -+{
7794 -+ cpu_rq(0)->idle->time_slice = sched_timeslice_ns;
7795 -+}
7796 -+#endif /* CONFIG_SMP */
7797 -+
7798 -+int in_sched_functions(unsigned long addr)
7799 -+{
7800 -+ return in_lock_functions(addr) ||
7801 -+ (addr >= (unsigned long)__sched_text_start
7802 -+ && addr < (unsigned long)__sched_text_end);
7803 -+}
7804 -+
7805 -+#ifdef CONFIG_CGROUP_SCHED
7806 -+/* task group related information */
7807 -+struct task_group {
7808 -+ struct cgroup_subsys_state css;
7809 -+
7810 -+ struct rcu_head rcu;
7811 -+ struct list_head list;
7812 -+
7813 -+ struct task_group *parent;
7814 -+ struct list_head siblings;
7815 -+ struct list_head children;
7816 -+#ifdef CONFIG_FAIR_GROUP_SCHED
7817 -+ unsigned long shares;
7818 -+#endif
7819 -+};
7820 -+
7821 -+/*
7822 -+ * Default task group.
7823 -+ * Every task in system belongs to this group at bootup.
7824 -+ */
7825 -+struct task_group root_task_group;
7826 -+LIST_HEAD(task_groups);
7827 -+
7828 -+/* Cacheline aligned slab cache for task_group */
7829 -+static struct kmem_cache *task_group_cache __read_mostly;
7830 -+#endif /* CONFIG_CGROUP_SCHED */
7831 -+
7832 -+void __init sched_init(void)
7833 -+{
7834 -+ int i;
7835 -+ struct rq *rq;
7836 -+
7837 -+ printk(KERN_INFO ALT_SCHED_VERSION_MSG);
7838 -+
7839 -+ wait_bit_init();
7840 -+
7841 -+#ifdef CONFIG_SMP
7842 -+ for (i = 0; i < SCHED_BITS; i++)
7843 -+ cpumask_copy(sched_rq_watermark + i, cpu_present_mask);
7844 -+#endif
7845 -+
7846 -+#ifdef CONFIG_CGROUP_SCHED
7847 -+ task_group_cache = KMEM_CACHE(task_group, 0);
7848 -+
7849 -+ list_add(&root_task_group.list, &task_groups);
7850 -+ INIT_LIST_HEAD(&root_task_group.children);
7851 -+ INIT_LIST_HEAD(&root_task_group.siblings);
7852 -+#endif /* CONFIG_CGROUP_SCHED */
7853 -+ for_each_possible_cpu(i) {
7854 -+ rq = cpu_rq(i);
7855 -+
7856 -+ sched_queue_init(&rq->queue);
7857 -+ rq->watermark = IDLE_TASK_SCHED_PRIO;
7858 -+ rq->skip = NULL;
7859 -+
7860 -+ raw_spin_lock_init(&rq->lock);
7861 -+ rq->nr_running = rq->nr_uninterruptible = 0;
7862 -+ rq->calc_load_active = 0;
7863 -+ rq->calc_load_update = jiffies + LOAD_FREQ;
7864 -+#ifdef CONFIG_SMP
7865 -+ rq->online = false;
7866 -+ rq->cpu = i;
7867 -+
7868 -+#ifdef CONFIG_SCHED_SMT
7869 -+ rq->active_balance = 0;
7870 -+#endif
7871 -+
7872 -+#ifdef CONFIG_NO_HZ_COMMON
7873 -+ INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
7874 -+#endif
7875 -+ rq->balance_callback = &balance_push_callback;
7876 -+#ifdef CONFIG_HOTPLUG_CPU
7877 -+ rcuwait_init(&rq->hotplug_wait);
7878 -+#endif
7879 -+#endif /* CONFIG_SMP */
7880 -+ rq->nr_switches = 0;
7881 -+
7882 -+ hrtick_rq_init(rq);
7883 -+ atomic_set(&rq->nr_iowait, 0);
7884 -+ }
7885 -+#ifdef CONFIG_SMP
7886 -+ /* Set rq->online for cpu 0 */
7887 -+ cpu_rq(0)->online = true;
7888 -+#endif
7889 -+ /*
7890 -+ * The boot idle thread does lazy MMU switching as well:
7891 -+ */
7892 -+ mmgrab(&init_mm);
7893 -+ enter_lazy_tlb(&init_mm, current);
7894 -+
7895 -+ /*
7896 -+ * Make us the idle thread. Technically, schedule() should not be
7897 -+ * called from this thread, however somewhere below it might be,
7898 -+ * but because we are the idle thread, we just pick up running again
7899 -+ * when this runqueue becomes "idle".
7900 -+ */
7901 -+ init_idle(current, smp_processor_id());
7902 -+
7903 -+ calc_load_update = jiffies + LOAD_FREQ;
7904 -+
7905 -+#ifdef CONFIG_SMP
7906 -+ idle_thread_set_boot_cpu();
7907 -+ balance_push_set(smp_processor_id(), false);
7908 -+
7909 -+ sched_init_topology_cpumask_early();
7910 -+#endif /* SMP */
7911 -+
7912 -+ psi_init();
7913 -+}
7914 -+
7915 -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7916 -+static inline int preempt_count_equals(int preempt_offset)
7917 -+{
7918 -+ int nested = preempt_count() + rcu_preempt_depth();
7919 -+
7920 -+ return (nested == preempt_offset);
7921 -+}
7922 -+
7923 -+void __might_sleep(const char *file, int line, int preempt_offset)
7924 -+{
7925 -+ unsigned int state = get_current_state();
7926 -+ /*
7927 -+ * Blocking primitives will set (and therefore destroy) current->state,
7928 -+ * since we will exit with TASK_RUNNING make sure we enter with it,
7929 -+ * otherwise we will destroy state.
7930 -+ */
7931 -+ WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
7932 -+ "do not call blocking ops when !TASK_RUNNING; "
7933 -+ "state=%x set at [<%p>] %pS\n", state,
7934 -+ (void *)current->task_state_change,
7935 -+ (void *)current->task_state_change);
7936 -+
7937 -+ ___might_sleep(file, line, preempt_offset);
7938 -+}
7939 -+EXPORT_SYMBOL(__might_sleep);
7940 -+
7941 -+void ___might_sleep(const char *file, int line, int preempt_offset)
7942 -+{
7943 -+ /* Ratelimiting timestamp: */
7944 -+ static unsigned long prev_jiffy;
7945 -+
7946 -+ unsigned long preempt_disable_ip;
7947 -+
7948 -+ /* WARN_ON_ONCE() by default, no rate limit required: */
7949 -+ rcu_sleep_check();
7950 -+
7951 -+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
7952 -+ !is_idle_task(current) && !current->non_block_count) ||
7953 -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
7954 -+ oops_in_progress)
7955 -+ return;
7956 -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7957 -+ return;
7958 -+ prev_jiffy = jiffies;
7959 -+
7960 -+ /* Save this before calling printk(), since that will clobber it: */
7961 -+ preempt_disable_ip = get_preempt_disable_ip(current);
7962 -+
7963 -+ printk(KERN_ERR
7964 -+ "BUG: sleeping function called from invalid context at %s:%d\n",
7965 -+ file, line);
7966 -+ printk(KERN_ERR
7967 -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
7968 -+ in_atomic(), irqs_disabled(), current->non_block_count,
7969 -+ current->pid, current->comm);
7970 -+
7971 -+ if (task_stack_end_corrupted(current))
7972 -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
7973 -+
7974 -+ debug_show_held_locks(current);
7975 -+ if (irqs_disabled())
7976 -+ print_irqtrace_events(current);
7977 -+#ifdef CONFIG_DEBUG_PREEMPT
7978 -+ if (!preempt_count_equals(preempt_offset)) {
7979 -+ pr_err("Preemption disabled at:");
7980 -+ print_ip_sym(KERN_ERR, preempt_disable_ip);
7981 -+ }
7982 -+#endif
7983 -+ dump_stack();
7984 -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7985 -+}
7986 -+EXPORT_SYMBOL(___might_sleep);
7987 -+
7988 -+void __cant_sleep(const char *file, int line, int preempt_offset)
7989 -+{
7990 -+ static unsigned long prev_jiffy;
7991 -+
7992 -+ if (irqs_disabled())
7993 -+ return;
7994 -+
7995 -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
7996 -+ return;
7997 -+
7998 -+ if (preempt_count() > preempt_offset)
7999 -+ return;
8000 -+
8001 -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8002 -+ return;
8003 -+ prev_jiffy = jiffies;
8004 -+
8005 -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
8006 -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8007 -+ in_atomic(), irqs_disabled(),
8008 -+ current->pid, current->comm);
8009 -+
8010 -+ debug_show_held_locks(current);
8011 -+ dump_stack();
8012 -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8013 -+}
8014 -+EXPORT_SYMBOL_GPL(__cant_sleep);
8015 -+
8016 -+#ifdef CONFIG_SMP
8017 -+void __cant_migrate(const char *file, int line)
8018 -+{
8019 -+ static unsigned long prev_jiffy;
8020 -+
8021 -+ if (irqs_disabled())
8022 -+ return;
8023 -+
8024 -+ if (is_migration_disabled(current))
8025 -+ return;
8026 -+
8027 -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8028 -+ return;
8029 -+
8030 -+ if (preempt_count() > 0)
8031 -+ return;
8032 -+
8033 -+ if (current->migration_flags & MDF_FORCE_ENABLED)
8034 -+ return;
8035 -+
8036 -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8037 -+ return;
8038 -+ prev_jiffy = jiffies;
8039 -+
8040 -+ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
8041 -+ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
8042 -+ in_atomic(), irqs_disabled(), is_migration_disabled(current),
8043 -+ current->pid, current->comm);
8044 -+
8045 -+ debug_show_held_locks(current);
8046 -+ dump_stack();
8047 -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8048 -+}
8049 -+EXPORT_SYMBOL_GPL(__cant_migrate);
8050 -+#endif
8051 -+#endif
8052 -+
8053 -+#ifdef CONFIG_MAGIC_SYSRQ
8054 -+void normalize_rt_tasks(void)
8055 -+{
8056 -+ struct task_struct *g, *p;
8057 -+ struct sched_attr attr = {
8058 -+ .sched_policy = SCHED_NORMAL,
8059 -+ };
8060 -+
8061 -+ read_lock(&tasklist_lock);
8062 -+ for_each_process_thread(g, p) {
8063 -+ /*
8064 -+ * Only normalize user tasks:
8065 -+ */
8066 -+ if (p->flags & PF_KTHREAD)
8067 -+ continue;
8068 -+
8069 -+ if (!rt_task(p)) {
8070 -+ /*
8071 -+ * Renice negative nice level userspace
8072 -+ * tasks back to 0:
8073 -+ */
8074 -+ if (task_nice(p) < 0)
8075 -+ set_user_nice(p, 0);
8076 -+ continue;
8077 -+ }
8078 -+
8079 -+ __sched_setscheduler(p, &attr, false, false);
8080 -+ }
8081 -+ read_unlock(&tasklist_lock);
8082 -+}
8083 -+#endif /* CONFIG_MAGIC_SYSRQ */
8084 -+
8085 -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8086 -+/*
8087 -+ * These functions are only useful for the IA64 MCA handling, or kdb.
8088 -+ *
8089 -+ * They can only be called when the whole system has been
8090 -+ * stopped - every CPU needs to be quiescent, and no scheduling
8091 -+ * activity can take place. Using them for anything else would
8092 -+ * be a serious bug, and as a result, they aren't even visible
8093 -+ * under any other configuration.
8094 -+ */
8095 -+
8096 -+/**
8097 -+ * curr_task - return the current task for a given CPU.
8098 -+ * @cpu: the processor in question.
8099 -+ *
8100 -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8101 -+ *
8102 -+ * Return: The current task for @cpu.
8103 -+ */
8104 -+struct task_struct *curr_task(int cpu)
8105 -+{
8106 -+ return cpu_curr(cpu);
8107 -+}
8108 -+
8109 -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8110 -+
8111 -+#ifdef CONFIG_IA64
8112 -+/**
8113 -+ * ia64_set_curr_task - set the current task for a given CPU.
8114 -+ * @cpu: the processor in question.
8115 -+ * @p: the task pointer to set.
8116 -+ *
8117 -+ * Description: This function must only be used when non-maskable interrupts
8118 -+ * are serviced on a separate stack. It allows the architecture to switch the
8119 -+ * notion of the current task on a CPU in a non-blocking manner. This function
8120 -+ * must be called with all CPU's synchronised, and interrupts disabled, the
8121 -+ * and caller must save the original value of the current task (see
8122 -+ * curr_task() above) and restore that value before reenabling interrupts and
8123 -+ * re-starting the system.
8124 -+ *
8125 -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8126 -+ */
8127 -+void ia64_set_curr_task(int cpu, struct task_struct *p)
8128 -+{
8129 -+ cpu_curr(cpu) = p;
8130 -+}
8131 -+
8132 -+#endif
8133 -+
8134 -+#ifdef CONFIG_CGROUP_SCHED
8135 -+static void sched_free_group(struct task_group *tg)
8136 -+{
8137 -+ kmem_cache_free(task_group_cache, tg);
8138 -+}
8139 -+
8140 -+/* allocate runqueue etc for a new task group */
8141 -+struct task_group *sched_create_group(struct task_group *parent)
8142 -+{
8143 -+ struct task_group *tg;
8144 -+
8145 -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
8146 -+ if (!tg)
8147 -+ return ERR_PTR(-ENOMEM);
8148 -+
8149 -+ return tg;
8150 -+}
8151 -+
8152 -+void sched_online_group(struct task_group *tg, struct task_group *parent)
8153 -+{
8154 -+}
8155 -+
8156 -+/* rcu callback to free various structures associated with a task group */
8157 -+static void sched_free_group_rcu(struct rcu_head *rhp)
8158 -+{
8159 -+ /* Now it should be safe to free those cfs_rqs */
8160 -+ sched_free_group(container_of(rhp, struct task_group, rcu));
8161 -+}
8162 -+
8163 -+void sched_destroy_group(struct task_group *tg)
8164 -+{
8165 -+ /* Wait for possible concurrent references to cfs_rqs complete */
8166 -+ call_rcu(&tg->rcu, sched_free_group_rcu);
8167 -+}
8168 -+
8169 -+void sched_offline_group(struct task_group *tg)
8170 -+{
8171 -+}
8172 -+
8173 -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
8174 -+{
8175 -+ return css ? container_of(css, struct task_group, css) : NULL;
8176 -+}
8177 -+
8178 -+static struct cgroup_subsys_state *
8179 -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8180 -+{
8181 -+ struct task_group *parent = css_tg(parent_css);
8182 -+ struct task_group *tg;
8183 -+
8184 -+ if (!parent) {
8185 -+ /* This is early initialization for the top cgroup */
8186 -+ return &root_task_group.css;
8187 -+ }
8188 -+
8189 -+ tg = sched_create_group(parent);
8190 -+ if (IS_ERR(tg))
8191 -+ return ERR_PTR(-ENOMEM);
8192 -+ return &tg->css;
8193 -+}
8194 -+
8195 -+/* Expose task group only after completing cgroup initialization */
8196 -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
8197 -+{
8198 -+ struct task_group *tg = css_tg(css);
8199 -+ struct task_group *parent = css_tg(css->parent);
8200 -+
8201 -+ if (parent)
8202 -+ sched_online_group(tg, parent);
8203 -+ return 0;
8204 -+}
8205 -+
8206 -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
8207 -+{
8208 -+ struct task_group *tg = css_tg(css);
8209 -+
8210 -+ sched_offline_group(tg);
8211 -+}
8212 -+
8213 -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
8214 -+{
8215 -+ struct task_group *tg = css_tg(css);
8216 -+
8217 -+ /*
8218 -+ * Relies on the RCU grace period between css_released() and this.
8219 -+ */
8220 -+ sched_free_group(tg);
8221 -+}
8222 -+
8223 -+static void cpu_cgroup_fork(struct task_struct *task)
8224 -+{
8225 -+}
8226 -+
8227 -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8228 -+{
8229 -+ return 0;
8230 -+}
8231 -+
8232 -+static void cpu_cgroup_attach(struct cgroup_taskset *tset)
8233 -+{
8234 -+}
8235 -+
8236 -+#ifdef CONFIG_FAIR_GROUP_SCHED
8237 -+static DEFINE_MUTEX(shares_mutex);
8238 -+
8239 -+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8240 -+{
8241 -+ /*
8242 -+ * We can't change the weight of the root cgroup.
8243 -+ */
8244 -+ if (&root_task_group == tg)
8245 -+ return -EINVAL;
8246 -+
8247 -+ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8248 -+
8249 -+ mutex_lock(&shares_mutex);
8250 -+ if (tg->shares == shares)
8251 -+ goto done;
8252 -+
8253 -+ tg->shares = shares;
8254 -+done:
8255 -+ mutex_unlock(&shares_mutex);
8256 -+ return 0;
8257 -+}
8258 -+
8259 -+static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8260 -+ struct cftype *cftype, u64 shareval)
8261 -+{
8262 -+ if (shareval > scale_load_down(ULONG_MAX))
8263 -+ shareval = MAX_SHARES;
8264 -+ return sched_group_set_shares(css_tg(css), scale_load(shareval));
8265 -+}
8266 -+
8267 -+static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
8268 -+ struct cftype *cft)
8269 -+{
8270 -+ struct task_group *tg = css_tg(css);
8271 -+
8272 -+ return (u64) scale_load_down(tg->shares);
8273 -+}
8274 -+#endif
8275 -+
8276 -+static struct cftype cpu_legacy_files[] = {
8277 -+#ifdef CONFIG_FAIR_GROUP_SCHED
8278 -+ {
8279 -+ .name = "shares",
8280 -+ .read_u64 = cpu_shares_read_u64,
8281 -+ .write_u64 = cpu_shares_write_u64,
8282 -+ },
8283 -+#endif
8284 -+ { } /* Terminate */
8285 -+};
8286 -+
8287 -+
8288 -+static struct cftype cpu_files[] = {
8289 -+ { } /* terminate */
8290 -+};
8291 -+
8292 -+static int cpu_extra_stat_show(struct seq_file *sf,
8293 -+ struct cgroup_subsys_state *css)
8294 -+{
8295 -+ return 0;
8296 -+}
8297 -+
8298 -+struct cgroup_subsys cpu_cgrp_subsys = {
8299 -+ .css_alloc = cpu_cgroup_css_alloc,
8300 -+ .css_online = cpu_cgroup_css_online,
8301 -+ .css_released = cpu_cgroup_css_released,
8302 -+ .css_free = cpu_cgroup_css_free,
8303 -+ .css_extra_stat_show = cpu_extra_stat_show,
8304 -+ .fork = cpu_cgroup_fork,
8305 -+ .can_attach = cpu_cgroup_can_attach,
8306 -+ .attach = cpu_cgroup_attach,
8308 -+	.legacy_cftypes	= cpu_legacy_files,
8309 -+ .dfl_cftypes = cpu_files,
8310 -+ .early_init = true,
8311 -+ .threaded = true,
8312 -+};
8313 -+#endif /* CONFIG_CGROUP_SCHED */
8314 -+
8315 -+#undef CREATE_TRACE_POINTS
8316 -diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c
8317 -new file mode 100644
8318 -index 000000000000..1212a031700e
8319 ---- /dev/null
8320 -+++ b/kernel/sched/alt_debug.c
8321 -@@ -0,0 +1,31 @@
8322 -+/*
8323 -+ * kernel/sched/alt_debug.c
8324 -+ *
8325 -+ * Print the alt scheduler debugging details
8326 -+ *
8327 -+ * Author: Alfred Chen
8328 -+ * Date : 2020
8329 -+ */
8330 -+#include "sched.h"
8331 -+
8332 -+/*
8333 -+ * This allows printing both to /proc/sched_debug and
8334 -+ * to the console
8335 -+ */
8336 -+#define SEQ_printf(m, x...) \
8337 -+ do { \
8338 -+ if (m) \
8339 -+ seq_printf(m, x); \
8340 -+ else \
8341 -+ pr_cont(x); \
8342 -+ } while (0)
8343 -+
8344 -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
8345 -+ struct seq_file *m)
8346 -+{
8347 -+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
8348 -+ get_nr_threads(p));
8349 -+}
8350 -+
8351 -+void proc_sched_set_task(struct task_struct *p)
8352 -+{}
8353 -diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h
8354 -new file mode 100644
8355 -index 000000000000..289058a09bd5
8356 ---- /dev/null
8357 -+++ b/kernel/sched/alt_sched.h
8358 -@@ -0,0 +1,666 @@
8359 -+#ifndef ALT_SCHED_H
8360 -+#define ALT_SCHED_H
8361 -+
8362 -+#include <linux/sched.h>
8363 -+
8364 -+#include <linux/sched/clock.h>
8365 -+#include <linux/sched/cpufreq.h>
8366 -+#include <linux/sched/cputime.h>
8367 -+#include <linux/sched/debug.h>
8368 -+#include <linux/sched/init.h>
8369 -+#include <linux/sched/isolation.h>
8370 -+#include <linux/sched/loadavg.h>
8371 -+#include <linux/sched/mm.h>
8372 -+#include <linux/sched/nohz.h>
8373 -+#include <linux/sched/signal.h>
8374 -+#include <linux/sched/stat.h>
8375 -+#include <linux/sched/sysctl.h>
8376 -+#include <linux/sched/task.h>
8377 -+#include <linux/sched/topology.h>
8378 -+#include <linux/sched/wake_q.h>
8379 -+
8380 -+#include <uapi/linux/sched/types.h>
8381 -+
8382 -+#include <linux/cgroup.h>
8383 -+#include <linux/cpufreq.h>
8384 -+#include <linux/cpuidle.h>
8385 -+#include <linux/cpuset.h>
8386 -+#include <linux/ctype.h>
8387 -+#include <linux/debugfs.h>
8388 -+#include <linux/kthread.h>
8389 -+#include <linux/livepatch.h>
8390 -+#include <linux/membarrier.h>
8391 -+#include <linux/proc_fs.h>
8392 -+#include <linux/psi.h>
8393 -+#include <linux/slab.h>
8394 -+#include <linux/stop_machine.h>
8395 -+#include <linux/suspend.h>
8396 -+#include <linux/swait.h>
8397 -+#include <linux/syscalls.h>
8398 -+#include <linux/tsacct_kern.h>
8399 -+
8400 -+#include <asm/tlb.h>
8401 -+
8402 -+#ifdef CONFIG_PARAVIRT
8403 -+# include <asm/paravirt.h>
8404 -+#endif
8405 -+
8406 -+#include "cpupri.h"
8407 -+
8408 -+#include <trace/events/sched.h>
8409 -+
8410 -+#ifdef CONFIG_SCHED_BMQ
8411 -+/* bits:
8412 -+ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */
8413 -+#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH / 2 + MAX_PRIORITY_ADJ + 1)
8414 -+#endif
8415 -+
8416 -+#ifdef CONFIG_SCHED_PDS
8417 -+/* bits: RT(0-99), reserved(100-127), NORMAL_PRIO_NUM, cpu idle task */
8418 -+#define SCHED_BITS (MIN_NORMAL_PRIO + NORMAL_PRIO_NUM + 1)
8419 -+#endif /* CONFIG_SCHED_PDS */
8420 -+
8421 -+#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1)
8422 -+
8423 -+#ifdef CONFIG_SCHED_DEBUG
8424 -+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
8425 -+extern void resched_latency_warn(int cpu, u64 latency);
8426 -+#else
8427 -+# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
8428 -+static inline void resched_latency_warn(int cpu, u64 latency) {}
8429 -+#endif
8430 -+
8431 -+/*
8432 -+ * Increase resolution of nice-level calculations for 64-bit architectures.
8433 -+ * The extra resolution improves shares distribution and load balancing of
8434 -+ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
8435 -+ * hierarchies, especially on larger systems. This is not a user-visible change
8436 -+ * and does not change the user-interface for setting shares/weights.
8437 -+ *
8438 -+ * We increase resolution only if we have enough bits to allow this increased
8439 -+ * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
8440 -+ * are pretty high and the returns do not justify the increased costs.
8441 -+ *
8442 -+ * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
8443 -+ * increase coverage and consistency always enable it on 64-bit platforms.
8444 -+ */
8445 -+#ifdef CONFIG_64BIT
8446 -+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
8447 -+# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
8448 -+# define scale_load_down(w) \
8449 -+({ \
8450 -+ unsigned long __w = (w); \
8451 -+ if (__w) \
8452 -+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
8453 -+ __w; \
8454 -+})
8455 -+#else
8456 -+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
8457 -+# define scale_load(w) (w)
8458 -+# define scale_load_down(w) (w)
8459 -+#endif
8460 -+
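A standalone sketch of the fixed-point weight scaling above; the SCHED_FIXEDPOINT_SHIFT value of 10 is assumed here (the common kernel setting) rather than taken from this header.

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT 10	/* assumed, matches the usual kernel value */

static unsigned long scale_load(unsigned long w)
{
	return w << SCHED_FIXEDPOINT_SHIFT;
}

static unsigned long scale_load_down(unsigned long w)
{
	unsigned long s = w >> SCHED_FIXEDPOINT_SHIFT;

	return w ? (s > 2 ? s : 2) : 0;	/* never round a non-zero weight down to 0 */
}

int main(void)
{
	unsigned long w = scale_load(1024);

	printf("scale_load(1024)        = %lu\n", w);			/* 1048576 */
	printf("scale_load_down(%lu) = %lu\n", w, scale_load_down(w));	/* 1024    */
	printf("scale_load_down(1)      = %lu\n", scale_load_down(1));	/* 2       */
	return 0;
}
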
8461 -+#ifdef CONFIG_FAIR_GROUP_SCHED
8462 -+#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
8463 -+
8464 -+/*
8465 -+ * A weight of 0 or 1 can cause arithmetic problems.
8466 -+ * The weight of a cfs_rq is the sum of the weights of the entities
8467 -+ * queued on it, so the weight of an entity should not be too large,
8468 -+ * and neither should the shares value of a task group.
8469 -+ * (The default weight is 1024 - so there's no practical
8470 -+ * limitation from this.)
8471 -+ */
8472 -+#define MIN_SHARES (1UL << 1)
8473 -+#define MAX_SHARES (1UL << 18)
8474 -+#endif
8475 -+
8476 -+/* task_struct::on_rq states: */
8477 -+#define TASK_ON_RQ_QUEUED 1
8478 -+#define TASK_ON_RQ_MIGRATING 2
8479 -+
8480 -+static inline int task_on_rq_queued(struct task_struct *p)
8481 -+{
8482 -+ return p->on_rq == TASK_ON_RQ_QUEUED;
8483 -+}
8484 -+
8485 -+static inline int task_on_rq_migrating(struct task_struct *p)
8486 -+{
8487 -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
8488 -+}
8489 -+
8490 -+/*
8491 -+ * wake flags
8492 -+ */
8493 -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
8494 -+#define WF_FORK 0x02 /* child wakeup after fork */
8495 -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */
8496 -+#define WF_ON_CPU 0x08 /* Wakee is on_rq */
8497 -+
8498 -+#define SCHED_QUEUE_BITS (SCHED_BITS - 1)
8499 -+
8500 -+struct sched_queue {
8501 -+ DECLARE_BITMAP(bitmap, SCHED_QUEUE_BITS);
8502 -+ struct list_head heads[SCHED_BITS];
8503 -+};
8504 -+
8505 -+/*
8506 -+ * This is the main, per-CPU runqueue data structure.
8507 -+ * This data should only be modified by the local cpu.
8508 -+ */
8509 -+struct rq {
8510 -+ /* runqueue lock: */
8511 -+ raw_spinlock_t lock;
8512 -+
8513 -+ struct task_struct __rcu *curr;
8514 -+ struct task_struct *idle, *stop, *skip;
8515 -+ struct mm_struct *prev_mm;
8516 -+
8517 -+ struct sched_queue queue;
8518 -+#ifdef CONFIG_SCHED_PDS
8519 -+ u64 time_edge;
8520 -+#endif
8521 -+ unsigned long watermark;
8522 -+
8523 -+ /* switch count */
8524 -+ u64 nr_switches;
8525 -+
8526 -+ atomic_t nr_iowait;
8527 -+
8528 -+#ifdef CONFIG_SCHED_DEBUG
8529 -+ u64 last_seen_need_resched_ns;
8530 -+ int ticks_without_resched;
8531 -+#endif
8532 -+
8533 -+#ifdef CONFIG_MEMBARRIER
8534 -+ int membarrier_state;
8535 -+#endif
8536 -+
8537 -+#ifdef CONFIG_SMP
8538 -+ int cpu; /* cpu of this runqueue */
8539 -+ bool online;
8540 -+
8541 -+ unsigned int ttwu_pending;
8542 -+ unsigned char nohz_idle_balance;
8543 -+ unsigned char idle_balance;
8544 -+
8545 -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
8546 -+ struct sched_avg avg_irq;
8547 -+#endif
8548 -+
8549 -+#ifdef CONFIG_SCHED_SMT
8550 -+ int active_balance;
8551 -+ struct cpu_stop_work active_balance_work;
8552 -+#endif
8553 -+ struct callback_head *balance_callback;
8554 -+#ifdef CONFIG_HOTPLUG_CPU
8555 -+ struct rcuwait hotplug_wait;
8556 -+#endif
8557 -+ unsigned int nr_pinned;
8558 -+
8559 -+#endif /* CONFIG_SMP */
8560 -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
8561 -+ u64 prev_irq_time;
8562 -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
8563 -+#ifdef CONFIG_PARAVIRT
8564 -+ u64 prev_steal_time;
8565 -+#endif /* CONFIG_PARAVIRT */
8566 -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
8567 -+ u64 prev_steal_time_rq;
8568 -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
8569 -+
8570 -+	/* For general cpu load util */
8571 -+ s32 load_history;
8572 -+ u64 load_block;
8573 -+ u64 load_stamp;
8574 -+
8575 -+ /* calc_load related fields */
8576 -+ unsigned long calc_load_update;
8577 -+ long calc_load_active;
8578 -+
8579 -+ u64 clock, last_tick;
8580 -+ u64 last_ts_switch;
8581 -+ u64 clock_task;
8582 -+
8583 -+ unsigned int nr_running;
8584 -+ unsigned long nr_uninterruptible;
8585 -+
8586 -+#ifdef CONFIG_SCHED_HRTICK
8587 -+#ifdef CONFIG_SMP
8588 -+ call_single_data_t hrtick_csd;
8589 -+#endif
8590 -+ struct hrtimer hrtick_timer;
8591 -+ ktime_t hrtick_time;
8592 -+#endif
8593 -+
8594 -+#ifdef CONFIG_SCHEDSTATS
8595 -+
8596 -+ /* latency stats */
8597 -+ struct sched_info rq_sched_info;
8598 -+ unsigned long long rq_cpu_time;
8599 -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
8600 -+
8601 -+ /* sys_sched_yield() stats */
8602 -+ unsigned int yld_count;
8603 -+
8604 -+ /* schedule() stats */
8605 -+ unsigned int sched_switch;
8606 -+ unsigned int sched_count;
8607 -+ unsigned int sched_goidle;
8608 -+
8609 -+ /* try_to_wake_up() stats */
8610 -+ unsigned int ttwu_count;
8611 -+ unsigned int ttwu_local;
8612 -+#endif /* CONFIG_SCHEDSTATS */
8613 -+
8614 -+#ifdef CONFIG_CPU_IDLE
8615 -+ /* Must be inspected within a rcu lock section */
8616 -+ struct cpuidle_state *idle_state;
8617 -+#endif
8618 -+
8619 -+#ifdef CONFIG_NO_HZ_COMMON
8620 -+#ifdef CONFIG_SMP
8621 -+ call_single_data_t nohz_csd;
8622 -+#endif
8623 -+ atomic_t nohz_flags;
8624 -+#endif /* CONFIG_NO_HZ_COMMON */
8625 -+};
8626 -+
8627 -+extern unsigned long rq_load_util(struct rq *rq, unsigned long max);
8628 -+
8629 -+extern unsigned long calc_load_update;
8630 -+extern atomic_long_t calc_load_tasks;
8631 -+
8632 -+extern void calc_global_load_tick(struct rq *this_rq);
8633 -+extern long calc_load_fold_active(struct rq *this_rq, long adjust);
8634 -+
8635 -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
8636 -+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
8637 -+#define this_rq() this_cpu_ptr(&runqueues)
8638 -+#define task_rq(p) cpu_rq(task_cpu(p))
8639 -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
8640 -+#define raw_rq() raw_cpu_ptr(&runqueues)
8641 -+
8642 -+#ifdef CONFIG_SMP
8643 -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
8644 -+void register_sched_domain_sysctl(void);
8645 -+void unregister_sched_domain_sysctl(void);
8646 -+#else
8647 -+static inline void register_sched_domain_sysctl(void)
8648 -+{
8649 -+}
8650 -+static inline void unregister_sched_domain_sysctl(void)
8651 -+{
8652 -+}
8653 -+#endif
8654 -+
8655 -+extern bool sched_smp_initialized;
8656 -+
8657 -+enum {
8658 -+ ITSELF_LEVEL_SPACE_HOLDER,
8659 -+#ifdef CONFIG_SCHED_SMT
8660 -+ SMT_LEVEL_SPACE_HOLDER,
8661 -+#endif
8662 -+ COREGROUP_LEVEL_SPACE_HOLDER,
8663 -+ CORE_LEVEL_SPACE_HOLDER,
8664 -+ OTHER_LEVEL_SPACE_HOLDER,
8665 -+ NR_CPU_AFFINITY_LEVELS
8666 -+};
8667 -+
8668 -+DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_topo_masks);
8669 -+DECLARE_PER_CPU(cpumask_t *, sched_cpu_llc_mask);
8670 -+
8671 -+static inline int
8672 -+__best_mask_cpu(const cpumask_t *cpumask, const cpumask_t *mask)
8673 -+{
8674 -+ int cpu;
8675 -+
8676 -+ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids)
8677 -+ mask++;
8678 -+
8679 -+ return cpu;
8680 -+}
8681 -+
8682 -+static inline int best_mask_cpu(int cpu, const cpumask_t *mask)
8683 -+{
8684 -+ return __best_mask_cpu(mask, per_cpu(sched_cpu_topo_masks, cpu));
8685 -+}
8686 -+
8687 -+extern void flush_smp_call_function_from_idle(void);
8688 -+
8689 -+#else /* !CONFIG_SMP */
8690 -+static inline void flush_smp_call_function_from_idle(void) { }
8691 -+#endif
8692 -+
8693 -+#ifndef arch_scale_freq_tick
8694 -+static __always_inline
8695 -+void arch_scale_freq_tick(void)
8696 -+{
8697 -+}
8698 -+#endif
8699 -+
8700 -+#ifndef arch_scale_freq_capacity
8701 -+static __always_inline
8702 -+unsigned long arch_scale_freq_capacity(int cpu)
8703 -+{
8704 -+ return SCHED_CAPACITY_SCALE;
8705 -+}
8706 -+#endif
8707 -+
8708 -+static inline u64 __rq_clock_broken(struct rq *rq)
8709 -+{
8710 -+ return READ_ONCE(rq->clock);
8711 -+}
8712 -+
8713 -+static inline u64 rq_clock(struct rq *rq)
8714 -+{
8715 -+ /*
8716 -+	 * Relax lockdep_assert_held() checking as in VRQ; calls to
8717 -+	 * sched_info_xxxx() may not hold rq->lock
8718 -+ * lockdep_assert_held(&rq->lock);
8719 -+ */
8720 -+ return rq->clock;
8721 -+}
8722 -+
8723 -+static inline u64 rq_clock_task(struct rq *rq)
8724 -+{
8725 -+ /*
8726 -+	 * Relax lockdep_assert_held() checking as in VRQ; calls to
8727 -+	 * sched_info_xxxx() may not hold rq->lock
8728 -+ * lockdep_assert_held(&rq->lock);
8729 -+ */
8730 -+ return rq->clock_task;
8731 -+}
8732 -+
8733 -+/*
8734 -+ * {de,en}queue flags:
8735 -+ *
8736 -+ * DEQUEUE_SLEEP - task is no longer runnable
8737 -+ * ENQUEUE_WAKEUP - task just became runnable
8738 -+ *
8739 -+ */
8740 -+
8741 -+#define DEQUEUE_SLEEP 0x01
8742 -+
8743 -+#define ENQUEUE_WAKEUP 0x01
8744 -+
8745 -+
8746 -+/*
8747 -+ * Below are scheduler APIs which are used in other kernel code.
8748 -+ * They use a dummy rq_flags.
8749 -+ * ToDo: BMQ needs to support these APIs for compatibility with the
8750 -+ * mainline scheduler code.
8751 -+ */
8752 -+struct rq_flags {
8753 -+ unsigned long flags;
8754 -+};
8755 -+
8756 -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
8757 -+ __acquires(rq->lock);
8758 -+
8759 -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
8760 -+ __acquires(p->pi_lock)
8761 -+ __acquires(rq->lock);
8762 -+
8763 -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
8764 -+ __releases(rq->lock)
8765 -+{
8766 -+ raw_spin_unlock(&rq->lock);
8767 -+}
8768 -+
8769 -+static inline void
8770 -+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
8771 -+ __releases(rq->lock)
8772 -+ __releases(p->pi_lock)
8773 -+{
8774 -+ raw_spin_unlock(&rq->lock);
8775 -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
8776 -+}
8777 -+
8778 -+static inline void
8779 -+rq_lock(struct rq *rq, struct rq_flags *rf)
8780 -+ __acquires(rq->lock)
8781 -+{
8782 -+ raw_spin_lock(&rq->lock);
8783 -+}
8784 -+
8785 -+static inline void
8786 -+rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
8787 -+ __releases(rq->lock)
8788 -+{
8789 -+ raw_spin_unlock_irq(&rq->lock);
8790 -+}
8791 -+
8792 -+static inline void
8793 -+rq_unlock(struct rq *rq, struct rq_flags *rf)
8794 -+ __releases(rq->lock)
8795 -+{
8796 -+ raw_spin_unlock(&rq->lock);
8797 -+}
8798 -+
8799 -+static inline struct rq *
8800 -+this_rq_lock_irq(struct rq_flags *rf)
8801 -+ __acquires(rq->lock)
8802 -+{
8803 -+ struct rq *rq;
8804 -+
8805 -+ local_irq_disable();
8806 -+ rq = this_rq();
8807 -+ raw_spin_lock(&rq->lock);
8808 -+
8809 -+ return rq;
8810 -+}
8811 -+
8812 -+extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass);
8813 -+extern void raw_spin_rq_unlock(struct rq *rq);
8814 -+
8815 -+static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
8816 -+{
8817 -+ return &rq->lock;
8818 -+}
8819 -+
8820 -+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
8821 -+{
8822 -+ return __rq_lockp(rq);
8823 -+}
8824 -+
8825 -+static inline void raw_spin_rq_lock(struct rq *rq)
8826 -+{
8827 -+ raw_spin_rq_lock_nested(rq, 0);
8828 -+}
8829 -+
8830 -+static inline void raw_spin_rq_lock_irq(struct rq *rq)
8831 -+{
8832 -+ local_irq_disable();
8833 -+ raw_spin_rq_lock(rq);
8834 -+}
8835 -+
8836 -+static inline void raw_spin_rq_unlock_irq(struct rq *rq)
8837 -+{
8838 -+ raw_spin_rq_unlock(rq);
8839 -+ local_irq_enable();
8840 -+}
8841 -+
8842 -+static inline int task_current(struct rq *rq, struct task_struct *p)
8843 -+{
8844 -+ return rq->curr == p;
8845 -+}
8846 -+
8847 -+static inline bool task_running(struct task_struct *p)
8848 -+{
8849 -+ return p->on_cpu;
8850 -+}
8851 -+
8852 -+extern int task_running_nice(struct task_struct *p);
8853 -+
8854 -+extern struct static_key_false sched_schedstats;
8855 -+
8856 -+#ifdef CONFIG_CPU_IDLE
8857 -+static inline void idle_set_state(struct rq *rq,
8858 -+ struct cpuidle_state *idle_state)
8859 -+{
8860 -+ rq->idle_state = idle_state;
8861 -+}
8862 -+
8863 -+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
8864 -+{
8865 -+ WARN_ON(!rcu_read_lock_held());
8866 -+ return rq->idle_state;
8867 -+}
8868 -+#else
8869 -+static inline void idle_set_state(struct rq *rq,
8870 -+ struct cpuidle_state *idle_state)
8871 -+{
8872 -+}
8873 -+
8874 -+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
8875 -+{
8876 -+ return NULL;
8877 -+}
8878 -+#endif
8879 -+
8880 -+static inline int cpu_of(const struct rq *rq)
8881 -+{
8882 -+#ifdef CONFIG_SMP
8883 -+ return rq->cpu;
8884 -+#else
8885 -+ return 0;
8886 -+#endif
8887 -+}
8888 -+
8889 -+#include "stats.h"
8890 -+
8891 -+#ifdef CONFIG_NO_HZ_COMMON
8892 -+#define NOHZ_BALANCE_KICK_BIT 0
8893 -+#define NOHZ_STATS_KICK_BIT 1
8894 -+
8895 -+#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
8896 -+#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
8897 -+
8898 -+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
8899 -+
8900 -+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
8901 -+
8902 -+/* TODO: needed?
8903 -+extern void nohz_balance_exit_idle(struct rq *rq);
8904 -+#else
8905 -+static inline void nohz_balance_exit_idle(struct rq *rq) { }
8906 -+*/
8907 -+#endif
8908 -+
8909 -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
8910 -+struct irqtime {
8911 -+ u64 total;
8912 -+ u64 tick_delta;
8913 -+ u64 irq_start_time;
8914 -+ struct u64_stats_sync sync;
8915 -+};
8916 -+
8917 -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
8918 -+
8919 -+/*
8920 -+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
8921 -+ * Otherwise ksoftirqd's sum_exec_runtime has its own runtime subtracted
8922 -+ * and never moves forward.
8923 -+ */
8924 -+static inline u64 irq_time_read(int cpu)
8925 -+{
8926 -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
8927 -+ unsigned int seq;
8928 -+ u64 total;
8929 -+
8930 -+ do {
8931 -+ seq = __u64_stats_fetch_begin(&irqtime->sync);
8932 -+ total = irqtime->total;
8933 -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
8934 -+
8935 -+ return total;
8936 -+}
8937 -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
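[Illustration, not part of the patch] The retry loop in irq_time_read() above is a sequence-counter read: sample the counter, copy the value, and retry if the counter changed (or a writer was mid-update) in between. A rough user-space analogue of the shape of that loop — single-threaded here, with hypothetical names; the kernel's u64_stats_sync provides the real synchronization:

#include <stdio.h>
#include <stdint.h>

struct sample {
	volatile unsigned int seq;	/* even = stable, odd = writer in progress */
	uint64_t              total;
};

static uint64_t read_total(const struct sample *s)
{
	unsigned int seq;
	uint64_t total;

	do {
		seq = s->seq;
		total = s->total;
	} while (s->seq != seq || (seq & 1));	/* retry on concurrent update */

	return total;
}

int main(void)
{
	struct sample s = { .seq = 0, .total = 123456789ULL };

	printf("total: %llu\n", (unsigned long long)read_total(&s));
	return 0;
}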
8938 -+
8939 -+#ifdef CONFIG_CPU_FREQ
8940 -+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
8941 -+#endif /* CONFIG_CPU_FREQ */
8942 -+
8943 -+#ifdef CONFIG_NO_HZ_FULL
8944 -+extern int __init sched_tick_offload_init(void);
8945 -+#else
8946 -+static inline int sched_tick_offload_init(void) { return 0; }
8947 -+#endif
8948 -+
8949 -+#ifdef arch_scale_freq_capacity
8950 -+#ifndef arch_scale_freq_invariant
8951 -+#define arch_scale_freq_invariant() (true)
8952 -+#endif
8953 -+#else /* arch_scale_freq_capacity */
8954 -+#define arch_scale_freq_invariant() (false)
8955 -+#endif
8956 -+
8957 -+extern void schedule_idle(void);
8958 -+
8959 -+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
8960 -+
8961 -+/*
8962 -+ * !! For sched_setattr_nocheck() (kernel) only !!
8963 -+ *
8964 -+ * This is actually gross. :(
8965 -+ *
8966 -+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
8967 -+ * tasks, but still be able to sleep. We need this on platforms that cannot
8968 -+ * atomically change clock frequency. Remove once fast switching is
8969 -+ * available on such platforms.
8970 -+ *
8971 -+ * SUGOV stands for SchedUtil GOVernor.
8972 -+ */
8973 -+#define SCHED_FLAG_SUGOV 0x10000000
8974 -+
8975 -+#ifdef CONFIG_MEMBARRIER
8976 -+/*
8977 -+ * The scheduler provides memory barriers required by membarrier between:
8978 -+ * - prior user-space memory accesses and store to rq->membarrier_state,
8979 -+ * - store to rq->membarrier_state and following user-space memory accesses.
8980 -+ * In the same way it provides those guarantees around store to rq->curr.
8981 -+ */
8982 -+static inline void membarrier_switch_mm(struct rq *rq,
8983 -+ struct mm_struct *prev_mm,
8984 -+ struct mm_struct *next_mm)
8985 -+{
8986 -+ int membarrier_state;
8987 -+
8988 -+ if (prev_mm == next_mm)
8989 -+ return;
8990 -+
8991 -+ membarrier_state = atomic_read(&next_mm->membarrier_state);
8992 -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state)
8993 -+ return;
8994 -+
8995 -+ WRITE_ONCE(rq->membarrier_state, membarrier_state);
8996 -+}
8997 -+#else
8998 -+static inline void membarrier_switch_mm(struct rq *rq,
8999 -+ struct mm_struct *prev_mm,
9000 -+ struct mm_struct *next_mm)
9001 -+{
9002 -+}
9003 -+#endif
9004 -+
9005 -+#ifdef CONFIG_NUMA
9006 -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
9007 -+#else
9008 -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
9009 -+{
9010 -+ return nr_cpu_ids;
9011 -+}
9012 -+#endif
9013 -+
9014 -+extern void swake_up_all_locked(struct swait_queue_head *q);
9015 -+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
9016 -+
9017 -+#ifdef CONFIG_PREEMPT_DYNAMIC
9018 -+extern int preempt_dynamic_mode;
9019 -+extern int sched_dynamic_mode(const char *str);
9020 -+extern void sched_dynamic_update(int mode);
9021 -+#endif
9022 -+
9023 -+static inline void nohz_run_idle_balance(int cpu) { }
9024 -+#endif /* ALT_SCHED_H */
9025 -diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h
9026 -new file mode 100644
9027 -index 000000000000..be3ee4a553ca
9028 ---- /dev/null
9029 -+++ b/kernel/sched/bmq.h
9030 -@@ -0,0 +1,111 @@
9031 -+#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n"
9032 -+
9033 -+/*
9034 -+ * BMQ only routines
9035 -+ */
9036 -+#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch)
9037 -+#define boost_threshold(p) (sched_timeslice_ns >>\
9038 -+ (15 - MAX_PRIORITY_ADJ - (p)->boost_prio))
9039 -+
9040 -+static inline void boost_task(struct task_struct *p)
9041 -+{
9042 -+ int limit;
9043 -+
9044 -+ switch (p->policy) {
9045 -+ case SCHED_NORMAL:
9046 -+ limit = -MAX_PRIORITY_ADJ;
9047 -+ break;
9048 -+ case SCHED_BATCH:
9049 -+ case SCHED_IDLE:
9050 -+ limit = 0;
9051 -+ break;
9052 -+ default:
9053 -+ return;
9054 -+ }
9055 -+
9056 -+ if (p->boost_prio > limit)
9057 -+ p->boost_prio--;
9058 -+}
9059 -+
9060 -+static inline void deboost_task(struct task_struct *p)
9061 -+{
9062 -+ if (p->boost_prio < MAX_PRIORITY_ADJ)
9063 -+ p->boost_prio++;
9064 -+}
9065 -+
9066 -+/*
9067 -+ * Common interfaces
9068 -+ */
9069 -+static inline void sched_timeslice_imp(const int timeslice_ms) {}
9070 -+
9071 -+static inline int
9072 -+task_sched_prio_normal(const struct task_struct *p, const struct rq *rq)
9073 -+{
9074 -+ return p->prio + p->boost_prio - MAX_RT_PRIO;
9075 -+}
9076 -+
9077 -+static inline int task_sched_prio(const struct task_struct *p)
9078 -+{
9079 -+	return (p->prio < MAX_RT_PRIO) ? p->prio : MAX_RT_PRIO / 2 + (p->prio + p->boost_prio) / 2;
9080 -+}
9081 -+
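[Illustration, not part of the patch] To make the mapping above concrete, a stand-alone sketch of how BMQ folds prio and boost_prio into a queue index. MAX_RT_PRIO is the mainline value (100); MAX_PRIORITY_ADJ and the _demo name are illustrative only, not taken from the patch:

#include <stdio.h>

#define MAX_RT_PRIO      100	/* mainline value */
#define MAX_PRIORITY_ADJ 4	/* illustrative only */

static int task_sched_prio_demo(int prio, int boost_prio)
{
	/* RT tasks map directly; normal tasks are averaged with their boost. */
	return (prio < MAX_RT_PRIO) ? prio
				    : MAX_RT_PRIO / 2 + (prio + boost_prio) / 2;
}

int main(void)
{
	/* A nice-0 task (prio 120) at full boost vs. full deboost. */
	printf("boosted:   %d\n", task_sched_prio_demo(120, -MAX_PRIORITY_ADJ));
	printf("deboosted: %d\n", task_sched_prio_demo(120, MAX_PRIORITY_ADJ));
	return 0;
}

With these numbers the boosted task lands in a lower (higher-priority) bitmap slot than the deboosted one, which is how interactive wakeups jump ahead in the queue.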
9082 -+static inline int
9083 -+task_sched_prio_idx(const struct task_struct *p, const struct rq *rq)
9084 -+{
9085 -+ return task_sched_prio(p);
9086 -+}
9087 -+
9088 -+static inline int sched_prio2idx(int prio, struct rq *rq)
9089 -+{
9090 -+ return prio;
9091 -+}
9092 -+
9093 -+static inline int sched_idx2prio(int idx, struct rq *rq)
9094 -+{
9095 -+ return idx;
9096 -+}
9097 -+
9098 -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq)
9099 -+{
9100 -+ p->time_slice = sched_timeslice_ns;
9101 -+
9102 -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) {
9103 -+ if (SCHED_RR != p->policy)
9104 -+ deboost_task(p);
9105 -+ requeue_task(p, rq);
9106 -+ }
9107 -+}
9108 -+
9109 -+static inline void sched_task_sanity_check(struct task_struct *p, struct rq *rq) {}
9110 -+
9111 -+inline int task_running_nice(struct task_struct *p)
9112 -+{
9113 -+ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ);
9114 -+}
9115 -+
9116 -+static void sched_task_fork(struct task_struct *p, struct rq *rq)
9117 -+{
9118 -+ p->boost_prio = (p->boost_prio < 0) ?
9119 -+ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ;
9120 -+}
9121 -+
9122 -+static inline void do_sched_yield_type_1(struct task_struct *p, struct rq *rq)
9123 -+{
9124 -+ p->boost_prio = MAX_PRIORITY_ADJ;
9125 -+}
9126 -+
9127 -+#ifdef CONFIG_SMP
9128 -+static inline void sched_task_ttwu(struct task_struct *p)
9129 -+{
9130 -+	if (this_rq()->clock_task - p->last_ran > sched_timeslice_ns)
9131 -+ boost_task(p);
9132 -+}
9133 -+#endif
9134 -+
9135 -+static inline void sched_task_deactivate(struct task_struct *p, struct rq *rq)
9136 -+{
9137 -+ if (rq_switch_time(rq) < boost_threshold(p))
9138 -+ boost_task(p);
9139 -+}
9140 -+
9141 -+static inline void update_rq_time_edge(struct rq *rq) {}
9142 -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
9143 -index e7af18857371..3e38816b736e 100644
9144 ---- a/kernel/sched/cpufreq_schedutil.c
9145 -+++ b/kernel/sched/cpufreq_schedutil.c
9146 -@@ -167,9 +167,14 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
9147 - unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
9148 -
9149 - sg_cpu->max = max;
9150 -+#ifndef CONFIG_SCHED_ALT
9151 - sg_cpu->bw_dl = cpu_bw_dl(rq);
9152 - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
9153 - FREQUENCY_UTIL, NULL);
9154 -+#else
9155 -+ sg_cpu->bw_dl = 0;
9156 -+ sg_cpu->util = rq_load_util(rq, max);
9157 -+#endif /* CONFIG_SCHED_ALT */
9158 - }
9159 -
9160 - /**
9161 -@@ -312,8 +317,10 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
9162 - */
9163 - static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
9164 - {
9165 -+#ifndef CONFIG_SCHED_ALT
9166 - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
9167 - sg_cpu->sg_policy->limits_changed = true;
9168 -+#endif
9169 - }
9170 -
9171 - static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
9172 -@@ -607,6 +614,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
9173 - }
9174 -
9175 - ret = sched_setattr_nocheck(thread, &attr);
9176 -+
9177 - if (ret) {
9178 - kthread_stop(thread);
9179 - pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
9180 -@@ -839,7 +847,9 @@ cpufreq_governor_init(schedutil_gov);
9181 - #ifdef CONFIG_ENERGY_MODEL
9182 - static void rebuild_sd_workfn(struct work_struct *work)
9183 - {
9184 -+#ifndef CONFIG_SCHED_ALT
9185 - rebuild_sched_domains_energy();
9186 -+#endif /* CONFIG_SCHED_ALT */
9187 - }
9188 - static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
9189 -
9190 -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
9191 -index 872e481d5098..f920c8b48ec1 100644
9192 ---- a/kernel/sched/cputime.c
9193 -+++ b/kernel/sched/cputime.c
9194 -@@ -123,7 +123,7 @@ void account_user_time(struct task_struct *p, u64 cputime)
9195 - p->utime += cputime;
9196 - account_group_user_time(p, cputime);
9197 -
9198 -- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
9199 -+ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER;
9200 -
9201 - /* Add user time to cpustat. */
9202 - task_group_account_field(p, index, cputime);
9203 -@@ -147,7 +147,7 @@ void account_guest_time(struct task_struct *p, u64 cputime)
9204 - p->gtime += cputime;
9205 -
9206 - /* Add guest time to cpustat. */
9207 -- if (task_nice(p) > 0) {
9208 -+ if (task_running_nice(p)) {
9209 - cpustat[CPUTIME_NICE] += cputime;
9210 - cpustat[CPUTIME_GUEST_NICE] += cputime;
9211 - } else {
9212 -@@ -270,7 +270,7 @@ static inline u64 account_other_time(u64 max)
9213 - #ifdef CONFIG_64BIT
9214 - static inline u64 read_sum_exec_runtime(struct task_struct *t)
9215 - {
9216 -- return t->se.sum_exec_runtime;
9217 -+ return tsk_seruntime(t);
9218 - }
9219 - #else
9220 - static u64 read_sum_exec_runtime(struct task_struct *t)
9221 -@@ -280,7 +280,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t)
9222 - struct rq *rq;
9223 -
9224 - rq = task_rq_lock(t, &rf);
9225 -- ns = t->se.sum_exec_runtime;
9226 -+ ns = tsk_seruntime(t);
9227 - task_rq_unlock(rq, t, &rf);
9228 -
9229 - return ns;
9230 -@@ -612,7 +612,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
9231 - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
9232 - {
9233 - struct task_cputime cputime = {
9234 -- .sum_exec_runtime = p->se.sum_exec_runtime,
9235 -+ .sum_exec_runtime = tsk_seruntime(p),
9236 - };
9237 -
9238 - task_cputime(p, &cputime.utime, &cputime.stime);
9239 -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
9240 -index 17a653b67006..17ab2fe34d7a 100644
9241 ---- a/kernel/sched/debug.c
9242 -+++ b/kernel/sched/debug.c
9243 -@@ -8,6 +8,7 @@
9244 - */
9245 - #include "sched.h"
9246 -
9247 -+#ifndef CONFIG_SCHED_ALT
9248 - /*
9249 - * This allows printing both to /proc/sched_debug and
9250 - * to the console
9251 -@@ -216,6 +217,7 @@ static const struct file_operations sched_scaling_fops = {
9252 - };
9253 -
9254 - #endif /* SMP */
9255 -+#endif /* !CONFIG_SCHED_ALT */
9256 -
9257 - #ifdef CONFIG_PREEMPT_DYNAMIC
9258 -
9259 -@@ -279,6 +281,7 @@ static const struct file_operations sched_dynamic_fops = {
9260 -
9261 - #endif /* CONFIG_PREEMPT_DYNAMIC */
9262 -
9263 -+#ifndef CONFIG_SCHED_ALT
9264 - __read_mostly bool sched_debug_verbose;
9265 -
9266 - static const struct seq_operations sched_debug_sops;
9267 -@@ -294,6 +297,7 @@ static const struct file_operations sched_debug_fops = {
9268 - .llseek = seq_lseek,
9269 - .release = seq_release,
9270 - };
9271 -+#endif /* !CONFIG_SCHED_ALT */
9272 -
9273 - static struct dentry *debugfs_sched;
9274 -
9275 -@@ -303,12 +307,15 @@ static __init int sched_init_debug(void)
9276 -
9277 - debugfs_sched = debugfs_create_dir("sched", NULL);
9278 -
9279 -+#ifndef CONFIG_SCHED_ALT
9280 - debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
9281 - debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose);
9282 -+#endif /* !CONFIG_SCHED_ALT */
9283 - #ifdef CONFIG_PREEMPT_DYNAMIC
9284 - debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
9285 - #endif
9286 -
9287 -+#ifndef CONFIG_SCHED_ALT
9288 - debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
9289 - debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
9290 - debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
9291 -@@ -336,11 +343,13 @@ static __init int sched_init_debug(void)
9292 - #endif
9293 -
9294 - debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
9295 -+#endif /* !CONFIG_SCHED_ALT */
9296 -
9297 - return 0;
9298 - }
9299 - late_initcall(sched_init_debug);
9300 -
9301 -+#ifndef CONFIG_SCHED_ALT
9302 - #ifdef CONFIG_SMP
9303 -
9304 - static cpumask_var_t sd_sysctl_cpus;
9305 -@@ -1063,6 +1072,7 @@ void proc_sched_set_task(struct task_struct *p)
9306 - memset(&p->se.statistics, 0, sizeof(p->se.statistics));
9307 - #endif
9308 - }
9309 -+#endif /* !CONFIG_SCHED_ALT */
9310 -
9311 - void resched_latency_warn(int cpu, u64 latency)
9312 - {
9313 -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
9314 -index d17b0a5ce6ac..6ff77fc6b73a 100644
9315 ---- a/kernel/sched/idle.c
9316 -+++ b/kernel/sched/idle.c
9317 -@@ -403,6 +403,7 @@ void cpu_startup_entry(enum cpuhp_state state)
9318 - do_idle();
9319 - }
9320 -
9321 -+#ifndef CONFIG_SCHED_ALT
9322 - /*
9323 - * idle-task scheduling class.
9324 - */
9325 -@@ -525,3 +526,4 @@ DEFINE_SCHED_CLASS(idle) = {
9326 - .switched_to = switched_to_idle,
9327 - .update_curr = update_curr_idle,
9328 - };
9329 -+#endif
9330 -diff --git a/kernel/sched/pds.h b/kernel/sched/pds.h
9331 -new file mode 100644
9332 -index 000000000000..0f1f0d708b77
9333 ---- /dev/null
9334 -+++ b/kernel/sched/pds.h
9335 -@@ -0,0 +1,127 @@
9336 -+#define ALT_SCHED_VERSION_MSG "sched/pds: PDS CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n"
9337 -+
9338 -+static int sched_timeslice_shift = 22;
9339 -+
9340 -+#define NORMAL_PRIO_MOD(x) ((x) & (NORMAL_PRIO_NUM - 1))
9341 -+
9342 -+/*
9343 -+ * Common interfaces
9344 -+ */
9345 -+static inline void sched_timeslice_imp(const int timeslice_ms)
9346 -+{
9347 -+ if (2 == timeslice_ms)
9348 -+ sched_timeslice_shift = 21;
9349 -+}
9350 -+
9351 -+static inline int
9352 -+task_sched_prio_normal(const struct task_struct *p, const struct rq *rq)
9353 -+{
9354 -+ s64 delta = p->deadline - rq->time_edge + NORMAL_PRIO_NUM - NICE_WIDTH;
9355 -+
9356 -+ if (WARN_ONCE(delta > NORMAL_PRIO_NUM - 1,
9357 -+ "pds: task_sched_prio_normal() delta %lld\n", delta))
9358 -+ return NORMAL_PRIO_NUM - 1;
9359 -+
9360 -+ return (delta < 0) ? 0 : delta;
9361 -+}
9362 -+
9363 -+static inline int task_sched_prio(const struct task_struct *p)
9364 -+{
9365 -+ return (p->prio < MAX_RT_PRIO) ? p->prio :
9366 -+ MIN_NORMAL_PRIO + task_sched_prio_normal(p, task_rq(p));
9367 -+}
9368 -+
9369 -+static inline int
9370 -+task_sched_prio_idx(const struct task_struct *p, const struct rq *rq)
9371 -+{
9372 -+ return (p->prio < MAX_RT_PRIO) ? p->prio : MIN_NORMAL_PRIO +
9373 -+ NORMAL_PRIO_MOD(task_sched_prio_normal(p, rq) + rq->time_edge);
9374 -+}
9375 -+
9376 -+static inline int sched_prio2idx(int prio, struct rq *rq)
9377 -+{
9378 -+ return (IDLE_TASK_SCHED_PRIO == prio || prio < MAX_RT_PRIO) ? prio :
9379 -+ MIN_NORMAL_PRIO + NORMAL_PRIO_MOD((prio - MIN_NORMAL_PRIO) +
9380 -+ rq->time_edge);
9381 -+}
9382 -+
9383 -+static inline int sched_idx2prio(int idx, struct rq *rq)
9384 -+{
9385 -+ return (idx < MAX_RT_PRIO) ? idx : MIN_NORMAL_PRIO +
9386 -+ NORMAL_PRIO_MOD((idx - MIN_NORMAL_PRIO) + NORMAL_PRIO_NUM -
9387 -+ NORMAL_PRIO_MOD(rq->time_edge));
9388 -+}
9389 -+
9390 -+static inline void sched_renew_deadline(struct task_struct *p, const struct rq *rq)
9391 -+{
9392 -+ if (p->prio >= MAX_RT_PRIO)
9393 -+ p->deadline = (rq->clock >> sched_timeslice_shift) +
9394 -+ p->static_prio - (MAX_PRIO - NICE_WIDTH);
9395 -+}
9396 -+
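[Illustration, not part of the patch] The virtual deadline computed above is the rq clock in coarse ticks (clock >> sched_timeslice_shift, roughly 4 ms per tick at the default shift of 22) plus an offset derived from static_prio. A stand-alone sketch using the mainline values MAX_PRIO == 140 and NICE_WIDTH == 40; the clock value and the _demo name are illustrative:

#include <stdio.h>
#include <stdint.h>

#define MAX_PRIO   140	/* mainline value */
#define NICE_WIDTH  40	/* mainline value */

/* deadline = rq clock in ~4 ms ticks + (static_prio - (MAX_PRIO - NICE_WIDTH)) */
static uint64_t pds_deadline_demo(uint64_t clock_ns, int static_prio, int shift)
{
	return (clock_ns >> shift) + static_prio - (MAX_PRIO - NICE_WIDTH);
}

int main(void)
{
	uint64_t now = 1000000000ULL;	/* 1 s of rq clock, in ns */

	/* nice -10 (static_prio 110) is due 10 ticks before nice 0 (120). */
	printf("nice -10: %llu\n", (unsigned long long)pds_deadline_demo(now, 110, 22));
	printf("nice   0: %llu\n", (unsigned long long)pds_deadline_demo(now, 120, 22));
	return 0;
}

Tasks with a lower static_prio therefore expire into an earlier slot relative to rq->time_edge, which is what gives PDS its deadline ordering.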
9397 -+int task_running_nice(struct task_struct *p)
9398 -+{
9399 -+ return (p->prio > DEFAULT_PRIO);
9400 -+}
9401 -+
9402 -+static inline void update_rq_time_edge(struct rq *rq)
9403 -+{
9404 -+ struct list_head head;
9405 -+ u64 old = rq->time_edge;
9406 -+ u64 now = rq->clock >> sched_timeslice_shift;
9407 -+ u64 prio, delta;
9408 -+
9409 -+ if (now == old)
9410 -+ return;
9411 -+
9412 -+ delta = min_t(u64, NORMAL_PRIO_NUM, now - old);
9413 -+ INIT_LIST_HEAD(&head);
9414 -+
9415 -+ for_each_set_bit(prio, &rq->queue.bitmap[2], delta)
9416 -+ list_splice_tail_init(rq->queue.heads + MIN_NORMAL_PRIO +
9417 -+ NORMAL_PRIO_MOD(prio + old), &head);
9418 -+
9419 -+ rq->queue.bitmap[2] = (NORMAL_PRIO_NUM == delta) ? 0UL :
9420 -+ rq->queue.bitmap[2] >> delta;
9421 -+ rq->time_edge = now;
9422 -+ if (!list_empty(&head)) {
9423 -+ u64 idx = MIN_NORMAL_PRIO + NORMAL_PRIO_MOD(now);
9424 -+ struct task_struct *p;
9425 -+
9426 -+ list_for_each_entry(p, &head, sq_node)
9427 -+ p->sq_idx = idx;
9428 -+
9429 -+ list_splice(&head, rq->queue.heads + idx);
9430 -+ rq->queue.bitmap[2] |= 1UL;
9431 -+ }
9432 -+}
9433 -+
9434 -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq)
9435 -+{
9436 -+ p->time_slice = sched_timeslice_ns;
9437 -+ sched_renew_deadline(p, rq);
9438 -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p))
9439 -+ requeue_task(p, rq);
9440 -+}
9441 -+
9442 -+static inline void sched_task_sanity_check(struct task_struct *p, struct rq *rq)
9443 -+{
9444 -+ u64 max_dl = rq->time_edge + NICE_WIDTH - 1;
9445 -+ if (unlikely(p->deadline > max_dl))
9446 -+ p->deadline = max_dl;
9447 -+}
9448 -+
9449 -+static void sched_task_fork(struct task_struct *p, struct rq *rq)
9450 -+{
9451 -+ sched_renew_deadline(p, rq);
9452 -+}
9453 -+
9454 -+static inline void do_sched_yield_type_1(struct task_struct *p, struct rq *rq)
9455 -+{
9456 -+ time_slice_expired(p, rq);
9457 -+}
9458 -+
9459 -+#ifdef CONFIG_SMP
9460 -+static inline void sched_task_ttwu(struct task_struct *p) {}
9461 -+#endif
9462 -+static inline void sched_task_deactivate(struct task_struct *p, struct rq *rq) {}
9463 -diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
9464 -index a554e3bbab2b..3e56f5e6ff5c 100644
9465 ---- a/kernel/sched/pelt.c
9466 -+++ b/kernel/sched/pelt.c
9467 -@@ -270,6 +270,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
9468 - WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
9469 - }
9470 -
9471 -+#ifndef CONFIG_SCHED_ALT
9472 - /*
9473 - * sched_entity:
9474 - *
9475 -@@ -387,8 +388,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
9476 -
9477 - return 0;
9478 - }
9479 -+#endif
9480 -
9481 --#ifdef CONFIG_SCHED_THERMAL_PRESSURE
9482 -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT)
9483 - /*
9484 - * thermal:
9485 - *
9486 -diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
9487 -index e06071bf3472..adf567df34d4 100644
9488 ---- a/kernel/sched/pelt.h
9489 -+++ b/kernel/sched/pelt.h
9490 -@@ -1,13 +1,15 @@
9491 - #ifdef CONFIG_SMP
9492 - #include "sched-pelt.h"
9493 -
9494 -+#ifndef CONFIG_SCHED_ALT
9495 - int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
9496 - int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
9497 - int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
9498 - int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
9499 - int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
9500 -+#endif
9501 -
9502 --#ifdef CONFIG_SCHED_THERMAL_PRESSURE
9503 -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT)
9504 - int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
9505 -
9506 - static inline u64 thermal_load_avg(struct rq *rq)
9507 -@@ -42,6 +44,7 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
9508 - return LOAD_AVG_MAX - 1024 + avg->period_contrib;
9509 - }
9510 -
9511 -+#ifndef CONFIG_SCHED_ALT
9512 - static inline void cfs_se_util_change(struct sched_avg *avg)
9513 - {
9514 - unsigned int enqueued;
9515 -@@ -153,9 +156,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
9516 - return rq_clock_pelt(rq_of(cfs_rq));
9517 - }
9518 - #endif
9519 -+#endif /* CONFIG_SCHED_ALT */
9520 -
9521 - #else
9522 -
9523 -+#ifndef CONFIG_SCHED_ALT
9524 - static inline int
9525 - update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
9526 - {
9527 -@@ -173,6 +178,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
9528 - {
9529 - return 0;
9530 - }
9531 -+#endif
9532 -
9533 - static inline int
9534 - update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
9535 -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
9536 -index 3d3e5793e117..c1d976ef623f 100644
9537 ---- a/kernel/sched/sched.h
9538 -+++ b/kernel/sched/sched.h
9539 -@@ -2,6 +2,10 @@
9540 - /*
9541 - * Scheduler internal types and methods:
9542 - */
9543 -+#ifdef CONFIG_SCHED_ALT
9544 -+#include "alt_sched.h"
9545 -+#else
9546 -+
9547 - #include <linux/sched.h>
9548 -
9549 - #include <linux/sched/autogroup.h>
9550 -@@ -3064,3 +3068,8 @@ extern int sched_dynamic_mode(const char *str);
9551 - extern void sched_dynamic_update(int mode);
9552 - #endif
9553 -
9554 -+static inline int task_running_nice(struct task_struct *p)
9555 -+{
9556 -+ return (task_nice(p) > 0);
9557 -+}
9558 -+#endif /* !CONFIG_SCHED_ALT */
9559 -diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
9560 -index 3f93fc3b5648..528b71e144e9 100644
9561 ---- a/kernel/sched/stats.c
9562 -+++ b/kernel/sched/stats.c
9563 -@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v)
9564 - } else {
9565 - struct rq *rq;
9566 - #ifdef CONFIG_SMP
9567 -+#ifndef CONFIG_SCHED_ALT
9568 - struct sched_domain *sd;
9569 - int dcount = 0;
9570 -+#endif
9571 - #endif
9572 - cpu = (unsigned long)(v - 2);
9573 - rq = cpu_rq(cpu);
9574 -@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
9575 - seq_printf(seq, "\n");
9576 -
9577 - #ifdef CONFIG_SMP
9578 -+#ifndef CONFIG_SCHED_ALT
9579 - /* domain-specific stats */
9580 - rcu_read_lock();
9581 - for_each_domain(cpu, sd) {
9582 -@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
9583 - sd->ttwu_move_balance);
9584 - }
9585 - rcu_read_unlock();
9586 -+#endif
9587 - #endif
9588 - }
9589 - return 0;
9590 -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
9591 -index 4e8698e62f07..36c61551252e 100644
9592 ---- a/kernel/sched/topology.c
9593 -+++ b/kernel/sched/topology.c
9594 -@@ -4,6 +4,7 @@
9595 - */
9596 - #include "sched.h"
9597 -
9598 -+#ifndef CONFIG_SCHED_ALT
9599 - DEFINE_MUTEX(sched_domains_mutex);
9600 -
9601 - /* Protected by sched_domains_mutex: */
9602 -@@ -1382,8 +1383,10 @@ static void asym_cpu_capacity_scan(void)
9603 - */
9604 -
9605 - static int default_relax_domain_level = -1;
9606 -+#endif /* CONFIG_SCHED_ALT */
9607 - int sched_domain_level_max;
9608 -
9609 -+#ifndef CONFIG_SCHED_ALT
9610 - static int __init setup_relax_domain_level(char *str)
9611 - {
9612 - if (kstrtoint(str, 0, &default_relax_domain_level))
9613 -@@ -1619,6 +1622,7 @@ sd_init(struct sched_domain_topology_level *tl,
9614 -
9615 - return sd;
9616 - }
9617 -+#endif /* CONFIG_SCHED_ALT */
9618 -
9619 - /*
9620 - * Topology list, bottom-up.
9621 -@@ -1648,6 +1652,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl)
9622 - sched_domain_topology = tl;
9623 - }
9624 -
9625 -+#ifndef CONFIG_SCHED_ALT
9626 - #ifdef CONFIG_NUMA
9627 -
9628 - static const struct cpumask *sd_numa_mask(int cpu)
9629 -@@ -2516,3 +2521,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
9630 - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
9631 - mutex_unlock(&sched_domains_mutex);
9632 - }
9633 -+#else /* CONFIG_SCHED_ALT */
9634 -+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
9635 -+ struct sched_domain_attr *dattr_new)
9636 -+{}
9637 -+
9638 -+#ifdef CONFIG_NUMA
9639 -+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
9640 -+
9641 -+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
9642 -+{
9643 -+ return best_mask_cpu(cpu, cpus);
9644 -+}
9645 -+#endif /* CONFIG_NUMA */
9646 -+#endif
9647 -diff --git a/kernel/sysctl.c b/kernel/sysctl.c
9648 -index 083be6af29d7..09fc6281d488 100644
9649 ---- a/kernel/sysctl.c
9650 -+++ b/kernel/sysctl.c
9651 -@@ -122,6 +122,10 @@ static unsigned long long_max = LONG_MAX;
9652 - static int one_hundred = 100;
9653 - static int two_hundred = 200;
9654 - static int one_thousand = 1000;
9655 -+#ifdef CONFIG_SCHED_ALT
9656 -+static int __maybe_unused zero = 0;
9657 -+extern int sched_yield_type;
9658 -+#endif
9659 - #ifdef CONFIG_PRINTK
9660 - static int ten_thousand = 10000;
9661 - #endif
9662 -@@ -1771,6 +1775,24 @@ int proc_do_static_key(struct ctl_table *table, int write,
9663 - }
9664 -
9665 - static struct ctl_table kern_table[] = {
9666 -+#ifdef CONFIG_SCHED_ALT
9667 -+/* In ALT, only "sched_schedstats" is supported */
9668 -+#ifdef CONFIG_SCHED_DEBUG
9669 -+#ifdef CONFIG_SMP
9670 -+#ifdef CONFIG_SCHEDSTATS
9671 -+ {
9672 -+ .procname = "sched_schedstats",
9673 -+ .data = NULL,
9674 -+ .maxlen = sizeof(unsigned int),
9675 -+ .mode = 0644,
9676 -+ .proc_handler = sysctl_schedstats,
9677 -+ .extra1 = SYSCTL_ZERO,
9678 -+ .extra2 = SYSCTL_ONE,
9679 -+ },
9680 -+#endif /* CONFIG_SCHEDSTATS */
9681 -+#endif /* CONFIG_SMP */
9682 -+#endif /* CONFIG_SCHED_DEBUG */
9683 -+#else /* !CONFIG_SCHED_ALT */
9684 - {
9685 - .procname = "sched_child_runs_first",
9686 - .data = &sysctl_sched_child_runs_first,
9687 -@@ -1901,6 +1923,7 @@ static struct ctl_table kern_table[] = {
9688 - .extra2 = SYSCTL_ONE,
9689 - },
9690 - #endif
9691 -+#endif /* !CONFIG_SCHED_ALT */
9692 - #ifdef CONFIG_PROVE_LOCKING
9693 - {
9694 - .procname = "prove_locking",
9695 -@@ -2477,6 +2500,17 @@ static struct ctl_table kern_table[] = {
9696 - .proc_handler = proc_dointvec,
9697 - },
9698 - #endif
9699 -+#ifdef CONFIG_SCHED_ALT
9700 -+ {
9701 -+ .procname = "yield_type",
9702 -+ .data = &sched_yield_type,
9703 -+ .maxlen = sizeof (int),
9704 -+ .mode = 0644,
9705 -+ .proc_handler = &proc_dointvec_minmax,
9706 -+ .extra1 = &zero,
9707 -+ .extra2 = &two,
9708 -+ },
9709 -+#endif
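[Illustration, not part of the patch] The kern_table entry above registers sched_yield_type as an integer bounded to 0-2, which — assuming the usual /proc/sys/kernel/<procname> mapping for kern_table entries — shows up as /proc/sys/kernel/yield_type on kernels built with CONFIG_SCHED_ALT. A minimal user-space sketch of reading it:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/yield_type", "r");
	int val;

	if (!f) {
		perror("yield_type (kernel built without CONFIG_SCHED_ALT?)");
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("current yield_type: %d\n", val);
	fclose(f);

	/* Writing works the same way (as root), with a value in the 0-2
	 * range enforced by the extra1/extra2 bounds above. */
	return 0;
}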
9710 - #if defined(CONFIG_S390) && defined(CONFIG_SMP)
9711 - {
9712 - .procname = "spin_retry",
9713 -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
9714 -index 0ea8702eb516..a27a0f3a654d 100644
9715 ---- a/kernel/time/hrtimer.c
9716 -+++ b/kernel/time/hrtimer.c
9717 -@@ -2088,8 +2088,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
9718 - int ret = 0;
9719 - u64 slack;
9720 -
9721 -+#ifndef CONFIG_SCHED_ALT
9722 - slack = current->timer_slack_ns;
9723 - if (dl_task(current) || rt_task(current))
9724 -+#endif
9725 - slack = 0;
9726 -
9727 - hrtimer_init_sleeper_on_stack(&t, clockid, mode);
9728 -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
9729 -index 643d412ac623..6bf27565242f 100644
9730 ---- a/kernel/time/posix-cpu-timers.c
9731 -+++ b/kernel/time/posix-cpu-timers.c
9732 -@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples)
9733 - u64 stime, utime;
9734 -
9735 - task_cputime(p, &utime, &stime);
9736 -- store_samples(samples, stime, utime, p->se.sum_exec_runtime);
9737 -+ store_samples(samples, stime, utime, tsk_seruntime(p));
9738 - }
9739 -
9740 - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
9741 -@@ -859,6 +859,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples,
9742 - }
9743 - }
9744 -
9745 -+#ifndef CONFIG_SCHED_ALT
9746 - static inline void check_dl_overrun(struct task_struct *tsk)
9747 - {
9748 - if (tsk->dl.dl_overrun) {
9749 -@@ -866,6 +867,7 @@ static inline void check_dl_overrun(struct task_struct *tsk)
9750 - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
9751 - }
9752 - }
9753 -+#endif
9754 -
9755 - static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
9756 - {
9757 -@@ -893,8 +895,10 @@ static void check_thread_timers(struct task_struct *tsk,
9758 - u64 samples[CPUCLOCK_MAX];
9759 - unsigned long soft;
9760 -
9761 -+#ifndef CONFIG_SCHED_ALT
9762 - if (dl_task(tsk))
9763 - check_dl_overrun(tsk);
9764 -+#endif
9765 -
9766 - if (expiry_cache_is_inactive(pct))
9767 - return;
9768 -@@ -908,7 +912,7 @@ static void check_thread_timers(struct task_struct *tsk,
9769 - soft = task_rlimit(tsk, RLIMIT_RTTIME);
9770 - if (soft != RLIM_INFINITY) {
9771 - /* Task RT timeout is accounted in jiffies. RTTIME is usec */
9772 -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);
9773 -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ);
9774 - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);
9775 -
9776 - /* At the hard limit, send SIGKILL. No further action. */
9777 -@@ -1144,8 +1148,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
9778 - return true;
9779 - }
9780 -
9781 -+#ifndef CONFIG_SCHED_ALT
9782 - if (dl_task(tsk) && tsk->dl.dl_overrun)
9783 - return true;
9784 -+#endif
9785 -
9786 - return false;
9787 - }
9788 -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
9789 -index adf7ef194005..11c8f36e281b 100644
9790 ---- a/kernel/trace/trace_selftest.c
9791 -+++ b/kernel/trace/trace_selftest.c
9792 -@@ -1052,10 +1052,15 @@ static int trace_wakeup_test_thread(void *data)
9793 - {
9794 - /* Make this a -deadline thread */
9795 - static const struct sched_attr attr = {
9796 -+#ifdef CONFIG_SCHED_ALT
9797 -+ /* No deadline on BMQ/PDS, use RR */
9798 -+ .sched_policy = SCHED_RR,
9799 -+#else
9800 - .sched_policy = SCHED_DEADLINE,
9801 - .sched_runtime = 100000ULL,
9802 - .sched_deadline = 10000000ULL,
9803 - .sched_period = 10000000ULL
9804 -+#endif
9805 - };
9806 - struct wakeup_test_data *x = data;
9807 -
9808 ---- a/kernel/sched/alt_core.c 2021-11-19 09:43:18.978378235 -0500
9809 -+++ b/kernel/sched/alt_core.c 2021-11-19 09:44:22.692033290 -0500
9810 -@@ -2994,7 +2994,7 @@ int sched_fork(unsigned long clone_flags
9811 - return 0;
9812 - }
9813 -
9814 --void sched_post_fork(struct task_struct *p) {}
9815 -+void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {}
9816 -
9817 - #ifdef CONFIG_SCHEDSTATS
9818 -