[gentoo-commits] proj/linux-patches:4.4 commit in: / - gentoo-commits

From:	Mike Pagano <mpagano@g.o>
To:	gentoo-commits@l.g.o
Subject:	[gentoo-commits] proj/linux-patches:4.4 commit in: /
Date:	Fri, 19 Feb 2016 23:34:07
Message-Id:	`1455924861.e405a92e97b137da0b286e072041325cb48713e0.mpagano@gentoo`

1

commit:     e405a92e97b137da0b286e072041325cb48713e0

2

Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>

3

AuthorDate: Fri Feb 19 23:34:21 2016 +0000

4

Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>

5

CommitDate: Fri Feb 19 23:34:21 2016 +0000

6

URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=e405a92e

7

8

BFQ Patchset v7r11 for 4.4

9

10

 0000_README                                        |   12 +

11

 ...oups-kconfig-build-bits-for-BFQ-v7r11-4.4.patch |  103 +

12

 ...ntroduce-the-BFQ-v7r11-I-O-sched-for-4.4.patch1 | 7097 ++++++++++++++++++++

13

 ...arly-Queue-Merge-EQM-to-BFQ-v7r11-for-4.4.patch | 1101 +++

14

 4 files changed, 8313 insertions(+)

15

16

diff --git a/0000_README b/0000_README

17

index de28467..d2dfbc9 100644

18

--- a/0000_README

19

+++ b/0000_README

20

@@ -79,6 +79,18 @@ Patch:  5000_enable-additional-cpu-optimizations-for-gcc.patch

21

 From:   https://github.com/graysky2/kernel_gcc_patch/

22

 Desc:   Kernel patch enables gcc < v4.9 optimizations for additional CPUs.

23

24

+Patch:  5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.4.patch

25

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

26

+Desc:   BFQ v7r11 patch 1 for 4.4: Build, cgroups and kconfig bits

27

+

28

+Patch:  5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.4.patch1

29

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

30

+Desc:   BFQ v7r11 patch 2 for 4.4: BFQ Scheduler

31

+

32

+Patch:  5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.4.patch

33

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

34

+Desc:   BFQ v7r11 patch 3 for 4.4: Early Queue Merge (EQM)

35

+

36

 Patch:  5010_enable-additional-cpu-optimizations-for-gcc-4.9.patch

37

 From:   https://github.com/graysky2/kernel_gcc_patch/

38

 Desc:   Kernel patch enables gcc >= v4.9 optimizations for additional CPUs.

39

40

diff --git a/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.4.patch b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.4.patch

41

new file mode 100644

42

index 0000000..a5bf7cf

43

--- /dev/null

44

+++ b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.4.patch

45

@@ -0,0 +1,103 @@

46

+From f54f3003586bf00ba0ee5974a92b732477b834e3 Mon Sep 17 00:00:00 2001

47

+From: Paolo Valente <paolo.valente@×××××××.it>

48

+Date: Tue, 7 Apr 2015 13:39:12 +0200

49

+Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r11-4.4.0

50

+

51

+Update Kconfig.iosched and do the related Makefile changes to include

52

+kernel configuration options for BFQ. Also increase the number of

53

+policies supported by the blkio controller so that BFQ can add its

54

+own.

55

+

56

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

57

+Signed-off-by: Arianna Avanzini <avanzini@××××××.com>

58

+---

59

+ block/Kconfig.iosched  | 32 ++++++++++++++++++++++++++++++++

60

+ block/Makefile         |  1 +

61

+ include/linux/blkdev.h |  2 +-

62

+ 3 files changed, 34 insertions(+), 1 deletion(-)

63

+

64

+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched

65

+index 421bef9..0ee5f0f 100644

66

+--- a/block/Kconfig.iosched

67

++++ b/block/Kconfig.iosched

68

+@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED

69

+ 	---help---

70

+ 	  Enable group IO scheduling in CFQ.

71

+

72

++config IOSCHED_BFQ

73

++	tristate "BFQ I/O scheduler"

74

++	default n

75

++	---help---

76

++	  The BFQ I/O scheduler tries to distribute bandwidth among

77

++	  all processes according to their weights.

78

++	  It aims at distributing the bandwidth as desired, independently of

79

++	  the disk parameters and with any workload. It also tries to

80

++	  guarantee low latency to interactive and soft real-time

81

++	  applications. If compiled built-in (saying Y here), BFQ can

82

++	  be configured to support hierarchical scheduling.

83

++

84

++config CGROUP_BFQIO

85

++	bool "BFQ hierarchical scheduling support"

86

++	depends on CGROUPS && IOSCHED_BFQ=y

87

++	default n

88

++	---help---

89

++	  Enable hierarchical scheduling in BFQ, using the cgroups

90

++	  filesystem interface.  The name of the subsystem will be

91

++	  bfqio.

92

++

93

+ choice

94

+ 	prompt "Default I/O scheduler"

95

+ 	default DEFAULT_CFQ

96

+@@ -52,6 +73,16 @@ choice

97

+ 	config DEFAULT_CFQ

98

+ 		bool "CFQ" if IOSCHED_CFQ=y

99

+

100

++	config DEFAULT_BFQ

101

++		bool "BFQ" if IOSCHED_BFQ=y

102

++		help

103

++		  Selects BFQ as the default I/O scheduler which will be

104

++		  used by default for all block devices.

105

++		  The BFQ I/O scheduler aims at distributing the bandwidth

106

++		  as desired, independently of the disk parameters and with

107

++		  any workload. It also tries to guarantee low latency to

108

++		  interactive and soft real-time applications.

109

++

110

+ 	config DEFAULT_NOOP

111

+ 		bool "No-op"

112

+

113

+@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED

114

+ 	string

115

+ 	default "deadline" if DEFAULT_DEADLINE

116

+ 	default "cfq" if DEFAULT_CFQ

117

++	default "bfq" if DEFAULT_BFQ

118

+ 	default "noop" if DEFAULT_NOOP

119

+

120

+ endmenu

121

+diff --git a/block/Makefile b/block/Makefile

122

+index 00ecc97..1ed86d5 100644

123

+--- a/block/Makefile

124

++++ b/block/Makefile

125

+@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o

126

+ obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o

127

+ obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o

128

+ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o

129

++obj-$(CONFIG_IOSCHED_BFQ)	+= bfq-iosched.o

130

+

131

+ obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o

132

+ obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o

133

+diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h

134

+index c70e358..ae43492 100644

135

+--- a/include/linux/blkdev.h

136

++++ b/include/linux/blkdev.h

137

+@@ -44,7 +44,7 @@ struct pr_ops;

138

+  * Maximum number of blkcg policies allowed to be registered concurrently.

139

+  * Defined here to simplify include dependency.

140

+  */

141

+-#define BLKCG_MAX_POLS		2

142

++#define BLKCG_MAX_POLS		3

143

+

144

+ struct request;

145

+ typedef void (rq_end_io_fn)(struct request *, int);

146

+--

147

+1.9.1

148

+

149

150

diff --git a/5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.4.patch1 b/5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.4.patch1

151

new file mode 100644

152

index 0000000..6ed6973

153

--- /dev/null

154

+++ b/5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.4.patch1

155

@@ -0,0 +1,7097 @@

156

+From 03d30cc06a5436c05ee338bd21903802181bafe9 Mon Sep 17 00:00:00 2001

157

+From: Paolo Valente <paolo.valente@×××××××.it>

158

+Date: Thu, 9 May 2013 19:10:02 +0200

159

+Subject: [PATCH 2/3] block: introduce the BFQ-v7r11 I/O sched for 4.4.0

160

+

161

+The general structure is borrowed from CFQ, as is much of the code for

162

+handling I/O contexts. Over time, several useful features have been

163

+ported from CFQ as well (details in the changelog in README.BFQ). A

164

+(bfq_)queue is associated with each task doing I/O on a device, and each

165

+time a scheduling decision has to be made a queue is selected and served

166

+until it expires.

167

+

168

+    - Slices are given in the service domain: tasks are assigned

169

+      budgets, measured in number of sectors. Once it gets the disk, a task

170

+      must however consume its assigned budget within a configurable

171

+      maximum time (by default, the maximum possible value of the

172

+      budgets is automatically computed to comply with this timeout).

173

+      This allows the desired latency vs "throughput boosting" tradeoff

174

+      to be set.

175

+

176

+    - Budgets are scheduled according to a variant of WF2Q+, implemented

177

+      using an augmented rb-tree to take eligibility into account while

178

+      preserving an O(log N) overall complexity.

179

+

180

+    - A low-latency tunable is provided; if enabled, both interactive

181

+      and soft real-time applications are guaranteed a very low latency.

182

+

183

+    - Latency guarantees are preserved also in the presence of NCQ.

184

+

185

+    - Also with flash-based devices, a high throughput is achieved

186

+      while still preserving latency guarantees.

187

+

188

+    - BFQ features Early Queue Merge (EQM), a sort of fusion of the

189

+      cooperating-queue-merging and the preemption mechanisms present

190

+      in CFQ. EQM is in fact a unified mechanism that tries to get a

191

+      sequential read pattern, and hence a high throughput, with any

192

+      set of processes performing interleaved I/O over a contiguous

193

+      sequence of sectors.

194

+

195

+    - BFQ supports full hierarchical scheduling, exporting a cgroups

196

+      interface.  Since each node has a full scheduler, each group can

197

+      be assigned its own weight.

198

+

199

+    - If the cgroups interface is not used, only I/O priorities can be

200

+      assigned to processes, with ioprio values mapped to weights

201

+      with the relation weight = IOPRIO_BE_NR - ioprio.

202

+

203

+    - ioprio classes are served in strict priority order, i.e., lower

204

+      priority queues are not served as long as there are higher

205

+      priority queues.  Among queues in the same class the bandwidth is

206

+      distributed in proportion to the weight of each queue. A very

207

+      thin extra bandwidth is however guaranteed to the Idle class, to

208

+      prevent it from starving.

209

+

210

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

211

+Signed-off-by: Arianna Avanzini <avanzini@××××××.com>

212

+---

213

+ block/Kconfig.iosched |    6 +-

214

+ block/bfq-cgroup.c    | 1182 ++++++++++++++++

215

+ block/bfq-ioc.c       |   36 +

216

+ block/bfq-iosched.c   | 3754 +++++++++++++++++++++++++++++++++++++++++++++++++

217

+ block/bfq-sched.c     | 1200 ++++++++++++++++

218

+ block/bfq.h           |  801 +++++++++++

219

+ 6 files changed, 6975 insertions(+), 4 deletions(-)

220

+ create mode 100644 block/bfq-cgroup.c

221

+ create mode 100644 block/bfq-ioc.c

222

+ create mode 100644 block/bfq-iosched.c

223

+ create mode 100644 block/bfq-sched.c

224

+ create mode 100644 block/bfq.h

225

+

226

+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched

227

+index 0ee5f0f..f78cd1a 100644

228

+--- a/block/Kconfig.iosched

229

++++ b/block/Kconfig.iosched

230

+@@ -51,14 +51,12 @@ config IOSCHED_BFQ

231

+ 	  applications. If compiled built-in (saying Y here), BFQ can

232

+ 	  be configured to support hierarchical scheduling.

233

+

234

+-config CGROUP_BFQIO

235

++config BFQ_GROUP_IOSCHED

236

+ 	bool "BFQ hierarchical scheduling support"

237

+ 	depends on CGROUPS && IOSCHED_BFQ=y

238

+ 	default n

239

+ 	---help---

240

+-	  Enable hierarchical scheduling in BFQ, using the cgroups

241

+-	  filesystem interface.  The name of the subsystem will be

242

+-	  bfqio.

243

++	  Enable hierarchical scheduling in BFQ, using the blkio controller.

244

+

245

+ choice

246

+ 	prompt "Default I/O scheduler"

247

+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c

248

+new file mode 100644

249

+index 0000000..8610cd6

250

+--- /dev/null

251

++++ b/block/bfq-cgroup.c

252

+@@ -0,0 +1,1182 @@

253

++/*

254

++ * BFQ: CGROUPS support.

255

++ *

256

++ * Based on ideas and code from CFQ:

257

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

258

++ *

259

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

260

++ *		      Paolo Valente <paolo.valente@×××××××.it>

261

++ *

262

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

263

++ *

264

++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ

265

++ * file.

266

++ */

267

++

268

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

269

++

270

++/* bfqg stats flags */

271

++enum bfqg_stats_flags {

272

++	BFQG_stats_waiting = 0,

273

++	BFQG_stats_idling,

274

++	BFQG_stats_empty,

275

++};

276

++

277

++#define BFQG_FLAG_FNS(name)						\

278

++static void bfqg_stats_mark_##name(struct bfqg_stats *stats)	\

279

++{									\

280

++	stats->flags |= (1 << BFQG_stats_##name);			\

281

++}									\

282

++static void bfqg_stats_clear_##name(struct bfqg_stats *stats)	\

283

++{									\

284

++	stats->flags &= ~(1 << BFQG_stats_##name);			\

285

++}									\

286

++static int bfqg_stats_##name(struct bfqg_stats *stats)		\

287

++{									\

288

++	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\

289

++}									\

290

++

291

++BFQG_FLAG_FNS(waiting)

292

++BFQG_FLAG_FNS(idling)

293

++BFQG_FLAG_FNS(empty)

294

++#undef BFQG_FLAG_FNS

295

++

296

++/* This should be called with the queue_lock held. */

297

++static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)

298

++{

299

++	unsigned long long now;

300

++

301

++	if (!bfqg_stats_waiting(stats))

302

++		return;

303

++

304

++	now = sched_clock();

305

++	if (time_after64(now, stats->start_group_wait_time))

306

++		blkg_stat_add(&stats->group_wait_time,

307

++			      now - stats->start_group_wait_time);

308

++	bfqg_stats_clear_waiting(stats);

309

++}

310

++

311

++/* This should be called with the queue_lock held. */

312

++static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,

313

++						 struct bfq_group *curr_bfqg)

314

++{

315

++	struct bfqg_stats *stats = &bfqg->stats;

316

++

317

++	if (bfqg_stats_waiting(stats))

318

++		return;

319

++	if (bfqg == curr_bfqg)

320

++		return;

321

++	stats->start_group_wait_time = sched_clock();

322

++	bfqg_stats_mark_waiting(stats);

323

++}

324

++

325

++/* This should be called with the queue_lock held. */

326

++static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)

327

++{

328

++	unsigned long long now;

329

++

330

++	if (!bfqg_stats_empty(stats))

331

++		return;

332

++

333

++	now = sched_clock();

334

++	if (time_after64(now, stats->start_empty_time))

335

++		blkg_stat_add(&stats->empty_time,

336

++			      now - stats->start_empty_time);

337

++	bfqg_stats_clear_empty(stats);

338

++}

339

++

340

++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)

341

++{

342

++	blkg_stat_add(&bfqg->stats.dequeue, 1);

343

++}

344

++

345

++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)

346

++{

347

++	struct bfqg_stats *stats = &bfqg->stats;

348

++

349

++	if (blkg_rwstat_total(&stats->queued))

350

++		return;

351

++

352

++	/*

353

++	 * The group is already marked empty. This can happen if bfqq got a new

354

++	 * request in parent group and moved to this group while being added

355

++	 * to service tree. Just ignore the event and move on.

356

++	 */

357

++	if (bfqg_stats_empty(stats))

358

++		return;

359

++

360

++	stats->start_empty_time = sched_clock();

361

++	bfqg_stats_mark_empty(stats);

362

++}

363

++

364

++static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)

365

++{

366

++	struct bfqg_stats *stats = &bfqg->stats;

367

++

368

++	if (bfqg_stats_idling(stats)) {

369

++		unsigned long long now = sched_clock();

370

++

371

++		if (time_after64(now, stats->start_idle_time))

372

++			blkg_stat_add(&stats->idle_time,

373

++				      now - stats->start_idle_time);

374

++		bfqg_stats_clear_idling(stats);

375

++	}

376

++}

377

++

378

++static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)

379

++{

380

++	struct bfqg_stats *stats = &bfqg->stats;

381

++

382

++	stats->start_idle_time = sched_clock();

383

++	bfqg_stats_mark_idling(stats);

384

++}

385

++

386

++static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)

387

++{

388

++	struct bfqg_stats *stats = &bfqg->stats;

389

++

390

++	blkg_stat_add(&stats->avg_queue_size_sum,

391

++		      blkg_rwstat_total(&stats->queued));

392

++	blkg_stat_add(&stats->avg_queue_size_samples, 1);

393

++	bfqg_stats_update_group_wait_time(stats);

394

++}

395

++

396

++static struct blkcg_policy blkcg_policy_bfq;

397

++

398

++/*

399

++ * blk-cgroup policy-related handlers

400

++ * The following functions help in converting between blk-cgroup

401

++ * internal structures and BFQ-specific structures.

402

++ */

403

++

404

++static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)

405

++{

406

++	return pd ? container_of(pd, struct bfq_group, pd) : NULL;

407

++}

408

++

409

++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)

410

++{

411

++	return pd_to_blkg(&bfqg->pd);

412

++}

413

++

414

++static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)

415

++{

416

++	struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);

417

++	BUG_ON(!pd);

418

++	return pd_to_bfqg(pd);

419

++}

420

++

421

++/*

422

++ * bfq_group handlers

423

++ * The following functions help in navigating the bfq_group hierarchy

424

++ * by allowing one to find the parent of a bfq_group or the bfq_group

424

++ * associated with a bfq_queue.

426

++ */

427

++

428

++static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)

429

++{

430

++	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;

431

++

432

++	return pblkg ? blkg_to_bfqg(pblkg) : NULL;

433

++}

434

++

435

++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)

436

++{

437

++	struct bfq_entity *group_entity = bfqq->entity.parent;

438

++

439

++	return group_entity ? container_of(group_entity, struct bfq_group,

440

++					   entity) :

441

++			      bfqq->bfqd->root_group;

442

++}

443

++

444

++/*

445

++ * The following two functions handle get and put of a bfq_group by

446

++ * wrapping the related blk-cgroup hooks.

447

++ */

448

++

449

++static void bfqg_get(struct bfq_group *bfqg)

450

++{

451

++	return blkg_get(bfqg_to_blkg(bfqg));

452

++}

453

++

454

++static void bfqg_put(struct bfq_group *bfqg)

455

++{

456

++	return blkg_put(bfqg_to_blkg(bfqg));

457

++}

458

++

459

++static void bfqg_stats_update_io_add(struct bfq_group *bfqg,

460

++				     struct bfq_queue *bfqq,

461

++				     int rw)

462

++{

463

++	blkg_rwstat_add(&bfqg->stats.queued, rw, 1);

464

++	bfqg_stats_end_empty_time(&bfqg->stats);

465

++	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))

466

++		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));

467

++}

468

++

469

++static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw)

470

++{

471

++	blkg_rwstat_add(&bfqg->stats.queued, rw, -1);

472

++}

473

++

474

++static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw)

475

++{

476

++	blkg_rwstat_add(&bfqg->stats.merged, rw, 1);

477

++}

478

++

479

++static void bfqg_stats_update_dispatch(struct bfq_group *bfqg,

480

++					      uint64_t bytes, int rw)

481

++{

482

++	blkg_stat_add(&bfqg->stats.sectors, bytes >> 9);

483

++	blkg_rwstat_add(&bfqg->stats.serviced, rw, 1);

484

++	blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes);

485

++}

486

++

487

++static void bfqg_stats_update_completion(struct bfq_group *bfqg,

488

++			uint64_t start_time, uint64_t io_start_time, int rw)

489

++{

490

++	struct bfqg_stats *stats = &bfqg->stats;

491

++	unsigned long long now = sched_clock();

492

++

493

++	if (time_after64(now, io_start_time))

494

++		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);

495

++	if (time_after64(io_start_time, start_time))

496

++		blkg_rwstat_add(&stats->wait_time, rw,

497

++				io_start_time - start_time);

498

++}

499

++

500

++/* @stats = 0 */

501

++static void bfqg_stats_reset(struct bfqg_stats *stats)

502

++{

503

++	if (!stats)

504

++		return;

505

++

506

++	/* queued stats shouldn't be cleared */

507

++	blkg_rwstat_reset(&stats->service_bytes);

508

++	blkg_rwstat_reset(&stats->serviced);

509

++	blkg_rwstat_reset(&stats->merged);

510

++	blkg_rwstat_reset(&stats->service_time);

511

++	blkg_rwstat_reset(&stats->wait_time);

512

++	blkg_stat_reset(&stats->time);

513

++	blkg_stat_reset(&stats->unaccounted_time);

514

++	blkg_stat_reset(&stats->avg_queue_size_sum);

515

++	blkg_stat_reset(&stats->avg_queue_size_samples);

516

++	blkg_stat_reset(&stats->dequeue);

517

++	blkg_stat_reset(&stats->group_wait_time);

518

++	blkg_stat_reset(&stats->idle_time);

519

++	blkg_stat_reset(&stats->empty_time);

520

++}

521

++

522

++/* @to += @from */

523

++static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from)

524

++{

525

++	if (!to || !from)

526

++		return;

527

++

528

++	/* queued isn't merged; FIXME: ->time below adds @from to itself */

529

++	blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes);

530

++	blkg_rwstat_add_aux(&to->serviced, &from->serviced);

531

++	blkg_rwstat_add_aux(&to->merged, &from->merged);

532

++	blkg_rwstat_add_aux(&to->service_time, &from->service_time);

533

++	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);

534

++	blkg_stat_add_aux(&from->time, &from->time);

535

++	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);

536

++	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);

537

++	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);

538

++	blkg_stat_add_aux(&to->dequeue, &from->dequeue);

539

++	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);

540

++	blkg_stat_add_aux(&to->idle_time, &from->idle_time);

541

++	blkg_stat_add_aux(&to->empty_time, &from->empty_time);

542

++}

543

++

544

++/*

545

++ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'

546

++ * recursive stats can still account for the amount used by this bfqg after

547

++ * it's gone.

548

++ */

549

++static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)

550

++{

551

++	struct bfq_group *parent;

552

++

553

++	if (!bfqg) /* root_group */

554

++		return;

555

++

556

++	parent = bfqg_parent(bfqg);

557

++

558

++	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);

559

++

560

++	if (unlikely(!parent))

561

++		return;

562

++

563

++	bfqg_stats_merge(&parent->dead_stats, &bfqg->stats);

564

++	bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats);

565

++	bfqg_stats_reset(&bfqg->stats);

566

++	bfqg_stats_reset(&bfqg->dead_stats);

567

++}

568

++

569

++static void bfq_init_entity(struct bfq_entity *entity,

570

++			    struct bfq_group *bfqg)

571

++{

572

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

573

++

574

++	entity->weight = entity->new_weight;

575

++	entity->orig_weight = entity->new_weight;

576

++	if (bfqq) {

577

++		bfqq->ioprio = bfqq->new_ioprio;

578

++		bfqq->ioprio_class = bfqq->new_ioprio_class;

579

++		bfqg_get(bfqg);

580

++	}

581

++	entity->parent = bfqg->my_entity;

582

++	entity->sched_data = &bfqg->sched_data;

583

++}

584

++

585

++static void bfqg_stats_exit(struct bfqg_stats *stats)

586

++{

587

++	blkg_rwstat_exit(&stats->service_bytes);

588

++	blkg_rwstat_exit(&stats->serviced);

589

++	blkg_rwstat_exit(&stats->merged);

590

++	blkg_rwstat_exit(&stats->service_time);

591

++	blkg_rwstat_exit(&stats->wait_time);

592

++	blkg_rwstat_exit(&stats->queued);

593

++	blkg_stat_exit(&stats->sectors);

594

++	blkg_stat_exit(&stats->time);

595

++	blkg_stat_exit(&stats->unaccounted_time);

596

++	blkg_stat_exit(&stats->avg_queue_size_sum);

597

++	blkg_stat_exit(&stats->avg_queue_size_samples);

598

++	blkg_stat_exit(&stats->dequeue);

599

++	blkg_stat_exit(&stats->group_wait_time);

600

++	blkg_stat_exit(&stats->idle_time);

601

++	blkg_stat_exit(&stats->empty_time);

602

++}

603

++

604

++static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)

605

++{

606

++	if (blkg_rwstat_init(&stats->service_bytes, gfp) ||

607

++	    blkg_rwstat_init(&stats->serviced, gfp) ||

608

++	    blkg_rwstat_init(&stats->merged, gfp) ||

609

++	    blkg_rwstat_init(&stats->service_time, gfp) ||

610

++	    blkg_rwstat_init(&stats->wait_time, gfp) ||

611

++	    blkg_rwstat_init(&stats->queued, gfp) ||

612

++	    blkg_stat_init(&stats->sectors, gfp) ||

613

++	    blkg_stat_init(&stats->time, gfp) ||

614

++	    blkg_stat_init(&stats->unaccounted_time, gfp) ||

615

++	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||

616

++	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||

617

++	    blkg_stat_init(&stats->dequeue, gfp) ||

618

++	    blkg_stat_init(&stats->group_wait_time, gfp) ||

619

++	    blkg_stat_init(&stats->idle_time, gfp) ||

620

++	    blkg_stat_init(&stats->empty_time, gfp)) {

621

++		bfqg_stats_exit(stats);

622

++		return -ENOMEM;

623

++	}

624

++

625

++	return 0;

626

++}

627

++

628

++static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)

629

++ {

630

++	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;

631

++ }

632

++

633

++static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)

634

++{

635

++	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));

636

++}

637

++

638

++static void bfq_cpd_init(struct blkcg_policy_data *cpd)

639

++{

640

++	struct bfq_group_data *d = cpd_to_bfqgd(cpd);

641

++

642

++	d->weight = BFQ_DEFAULT_GRP_WEIGHT;

643

++}

644

++

645

++static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)

646

++{

647

++	struct bfq_group *bfqg;

648

++

649

++	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);

650

++	if (!bfqg)

651

++		return NULL;

652

++

653

++	if (bfqg_stats_init(&bfqg->stats, gfp) ||

654

++	    bfqg_stats_init(&bfqg->dead_stats, gfp)) {

655

++		kfree(bfqg);

656

++		return NULL;

657

++	}

658

++

659

++	return &bfqg->pd;

660

++}

661

++

662

++static void bfq_group_set_parent(struct bfq_group *bfqg,

663

++					struct bfq_group *parent)

664

++{

665

++	struct bfq_entity *entity;

666

++

667

++	BUG_ON(!parent);

668

++	BUG_ON(!bfqg);

669

++	BUG_ON(bfqg == parent);

670

++

671

++	entity = &bfqg->entity;

672

++	entity->parent = parent->my_entity;

673

++	entity->sched_data = &parent->sched_data;

674

++}

675

++

676

++static void bfq_pd_init(struct blkg_policy_data *pd)

677

++{

678

++	struct blkcg_gq *blkg = pd_to_blkg(pd);

679

++	struct bfq_group *bfqg = blkg_to_bfqg(blkg);

680

++	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;

681

++	struct bfq_entity *entity = &bfqg->entity;

682

++	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);

683

++

684

++	entity->orig_weight = entity->weight = entity->new_weight = d->weight;

685

++	entity->my_sched_data = &bfqg->sched_data;

686

++	bfqg->my_entity = entity; /*

687

++				   * the root_group's will be set to NULL

688

++				   * in bfq_init_queue()

689

++				   */

690

++	bfqg->bfqd = bfqd;

691

++	bfqg->active_entities = 0;

692

++}

693

++

694

++static void bfq_pd_free(struct blkg_policy_data *pd)

695

++{

696

++	struct bfq_group *bfqg = pd_to_bfqg(pd);

697

++

698

++	bfqg_stats_exit(&bfqg->stats);

699

++	bfqg_stats_exit(&bfqg->dead_stats);

700

++

701

++	return kfree(bfqg);

702

++}

703

++

704

++/* offset delta from bfqg->stats to bfqg->dead_stats */

705

++static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) -

706

++					offsetof(struct bfq_group, stats);

707

++

708

++/* to be used by recursive prfill, sums live and dead stats recursively */

709

++static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)

710

++{

711

++	u64 sum = 0;

712

++

713

++	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);

714

++	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,

715

++				       off + dead_stats_off_delta);

716

++	return sum;

717

++}

718

++

719

++/* to be used by recursive prfill, sums live and dead rwstats recursively */

720

++static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,

721

++						       int off)

722

++{

723

++	struct blkg_rwstat a, b;

724

++

725

++	a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);

726

++	b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,

727

++				      off + dead_stats_off_delta);

728

++	blkg_rwstat_add_aux(&a, &b);

729

++	return a;

730

++}

731

++

732

++static void bfq_pd_reset_stats(struct blkg_policy_data *pd)

733

++{

734

++	struct bfq_group *bfqg = pd_to_bfqg(pd);

735

++

736

++	bfqg_stats_reset(&bfqg->stats);

737

++	bfqg_stats_reset(&bfqg->dead_stats);

738

++}

739

++

740

++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,

741

++					      struct blkcg *blkcg)

742

++{

743

++	struct request_queue *q = bfqd->queue;

744

++	struct bfq_group *bfqg = NULL, *parent;

745

++	struct bfq_entity *entity = NULL;

746

++

747

++	assert_spin_locked(bfqd->queue->queue_lock);

748

++

749

++	/* avoid lookup for the common case where there's no blkcg */

750

++	if (blkcg == &blkcg_root) {

751

++		bfqg = bfqd->root_group;

752

++	} else {

753

++		struct blkcg_gq *blkg;

754

++

755

++		blkg = blkg_lookup_create(blkcg, q);

756

++		if (!IS_ERR(blkg))

757

++			bfqg = blkg_to_bfqg(blkg);

758

++		else /* fallback to root_group */

759

++			bfqg = bfqd->root_group;

760

++	}

761

++

762

++	BUG_ON(!bfqg);

763

++

764

++	/*

765

++	 * Update chain of bfq_groups as we might be handling a leaf group

766

++	 * which, along with some of its relatives, has not been hooked yet

767

++	 * to the private hierarchy of BFQ.

768

++	 */

769

++	entity = &bfqg->entity;

770

++	for_each_entity(entity) {

771

++		bfqg = container_of(entity, struct bfq_group, entity);

772

++		BUG_ON(!bfqg);

773

++		if (bfqg != bfqd->root_group) {

774

++			parent = bfqg_parent(bfqg);

775

++			if (!parent)

776

++				parent = bfqd->root_group;

777

++			BUG_ON(!parent);

778

++			bfq_group_set_parent(bfqg, parent);

779

++		}

780

++	}

781

++

782

++	return bfqg;

783

++}

784

++

785

++/**

786

++ * bfq_bfqq_move - migrate @bfqq to @bfqg.

787

++ * @bfqd: queue descriptor.

788

++ * @bfqq: the queue to move.

789

++ * @entity: @bfqq's entity.

790

++ * @bfqg: the group to move to.

791

++ *

792

++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating

793

++ * it on the new one.  Avoid putting the entity on the old group idle tree.

794

++ *

795

++ * Must be called under the queue lock; the cgroup owning @bfqg must

796

++ * not disappear (for now this just means that we are called under

797

++ * rcu_read_lock()).

798

++ */

799

++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,

800

++			  struct bfq_entity *entity, struct bfq_group *bfqg)

801

++{

802

++	int busy, resume;

803

++

804

++	busy = bfq_bfqq_busy(bfqq);

805

++	resume = !RB_EMPTY_ROOT(&bfqq->sort_list);

806

++

807

++	BUG_ON(resume && !entity->on_st);

808

++	BUG_ON(busy && !resume && entity->on_st &&

809

++	       bfqq != bfqd->in_service_queue);

810

++

811

++	if (busy) {

812

++		BUG_ON(atomic_read(&bfqq->ref) < 2);

813

++

814

++		if (!resume)

815

++			bfq_del_bfqq_busy(bfqd, bfqq, 0);

816

++		else

817

++			bfq_deactivate_bfqq(bfqd, bfqq, 0);

818

++	} else if (entity->on_st)

819

++		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);

820

++	bfqg_put(bfqq_group(bfqq));

821

++

822

++	/*

823

++	 * Here we use a reference to bfqg.  We don't need a refcounter

824

++	 * as the cgroup reference will not be dropped, so that its

825

++	 * destroy() callback will not be invoked.

826

++	 */

827

++	entity->parent = bfqg->my_entity;

828

++	entity->sched_data = &bfqg->sched_data;

829

++	bfqg_get(bfqg);

830

++

831

++	if (busy) {

832

++		if (resume)

833

++			bfq_activate_bfqq(bfqd, bfqq);

834

++	}

835

++

836

++	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)

837

++		bfq_schedule_dispatch(bfqd);

838

++}

839

++

840

++/**

841

++ * __bfq_bic_change_cgroup - move @bic to @blkcg.

842

++ * @bfqd: the queue descriptor.

843

++ * @bic: the bic to move.

844

++ * @blkcg: the blk-cgroup to move to.

845

++ *

846

++ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller

847

++ * has to make sure that the reference to cgroup is valid across the call.

848

++ *

849

++ * NOTE: an alternative approach might have been to store the current

850

++ * cgroup in bfqq and getting a reference to it, reducing the lookup

851

++ * time here, at the price of slightly more complex code.

852

++ */

853

++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,

854

++						struct bfq_io_cq *bic,

855

++						struct blkcg *blkcg)

856

++{

857

++	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);

858

++	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);

859

++	struct bfq_group *bfqg;

860

++	struct bfq_entity *entity;

861

++

862

++	lockdep_assert_held(bfqd->queue->queue_lock);

863

++

864

++	bfqg = bfq_find_alloc_group(bfqd, blkcg);

865

++	if (async_bfqq) {

866

++		entity = &async_bfqq->entity;

867

++

868

++		if (entity->sched_data != &bfqg->sched_data) {

869

++			bic_set_bfqq(bic, NULL, 0);

870

++			bfq_log_bfqq(bfqd, async_bfqq,

871

++				     "bic_change_group: %p %d",

872

++				     async_bfqq, atomic_read(&async_bfqq->ref));

873

++			bfq_put_queue(async_bfqq);

874

++		}

875

++	}

876

++

877

++	if (sync_bfqq) {

878

++		entity = &sync_bfqq->entity;

879

++		if (entity->sched_data != &bfqg->sched_data)

880

++			bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);

881

++	}

882

++

883

++	return bfqg;

884

++}

885

++

886

++static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)

887

++{

888

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

889

++	struct blkcg *blkcg;

890

++	struct bfq_group *bfqg = NULL;

891

++	uint64_t id;

892

++

893

++	rcu_read_lock();

894

++	blkcg = bio_blkcg(bio);

895

++	id = blkcg->css.serial_nr;

896

++	rcu_read_unlock();

897

++

898

++	/*

899

++	 * Check whether blkcg has changed.  The condition may trigger

900

++	 * spuriously on a newly created bic but there's no harm.

901

++	 */

902

++	if (unlikely(!bfqd) || likely(bic->blkcg_id == id))

903

++		return;

904

++

905

++	bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg);

906

++	BUG_ON(!bfqg);

907

++	bic->blkcg_id = id;

908

++}

909

++

910

++/**

911

++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.

912

++ * @st: the service tree being flushed.

913

++ */

914

++static void bfq_flush_idle_tree(struct bfq_service_tree *st)

915

++{

916

++	struct bfq_entity *entity = st->first_idle;

917

++

918

++	for (; entity ; entity = st->first_idle)

919

++		__bfq_deactivate_entity(entity, 0);

920

++}

921

++

922

++/**

923

++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.

924

++ * @bfqd: the device data structure with the root group.

925

++ * @entity: the entity to move.

926

++ */

927

++static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,

928

++				     struct bfq_entity *entity)

929

++{

930

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

931

++

932

++	BUG_ON(!bfqq);

933

++	bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);

934

++	return;

935

++}

936

++

937

++/**

938

++ * bfq_reparent_active_entities - move to the root group all active

939

++ *                                entities.

940

++ * @bfqd: the device data structure with the root group.

941

++ * @bfqg: the group to move from.

942

++ * @st: the service tree with the entities.

943

++ *

944

++ * Needs queue_lock to be taken and reference to be valid over the call.

945

++ */

946

++static void bfq_reparent_active_entities(struct bfq_data *bfqd,

947

++					 struct bfq_group *bfqg,

948

++					 struct bfq_service_tree *st)

949

++{

950

++	struct rb_root *active = &st->active;

951

++	struct bfq_entity *entity = NULL;

952

++

953

++	if (!RB_EMPTY_ROOT(&st->active))

954

++		entity = bfq_entity_of(rb_first(active));

955

++

956

++	for (; entity ; entity = bfq_entity_of(rb_first(active)))

957

++		bfq_reparent_leaf_entity(bfqd, entity);

958

++

959

++	if (bfqg->sched_data.in_service_entity)

960

++		bfq_reparent_leaf_entity(bfqd,

961

++			bfqg->sched_data.in_service_entity);

962

++

963

++	return;

964

++}

965

++

966

++/**

967

++ * bfq_pd_offline - destroy the bfq_group associated with @pd.

968

++ * @pd: policy data of the group being destroyed.

969

++ *

970

++ * Destroy @bfqg, making sure that it is not referenced from its parent.

971

++ * blkio already grabs the queue_lock for us, so no need to use RCU-based magic

972

++ */

973

++static void bfq_pd_offline(struct blkg_policy_data *pd)

974

++{

975

++	struct bfq_service_tree *st;

976

++	struct bfq_group *bfqg;

977

++	struct bfq_data *bfqd;

978

++	struct bfq_entity *entity;

979

++	int i;

980

++

981

++	BUG_ON(!pd);

982

++	bfqg = pd_to_bfqg(pd);

983

++	BUG_ON(!bfqg);

984

++	bfqd = bfqg->bfqd;

985

++	BUG_ON(bfqd && !bfqd->root_group);

986

++

987

++	entity = bfqg->my_entity;

988

++

989

++	if (!entity) /* root group */

990

++		return;

991

++

992

++	/*

993

++	 * Empty all service_trees belonging to this group before

994

++	 * deactivating the group itself.

995

++	 */

996

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {

997

++		BUG_ON(!bfqg->sched_data.service_tree);

998

++		st = bfqg->sched_data.service_tree + i;

999

++		/*

1000

++		 * The idle tree may still contain bfq_queues belonging

1001

++		 * to exited tasks because they never migrated to a different

1002

++		 * cgroup from the one being destroyed now.  No one else

1003

++		 * can access them so it's safe to act without any lock.

1004

++		 */

1005

++		bfq_flush_idle_tree(st);

1006

++

1007

++		/*

1008

++		 * It may happen that some queues are still active

1009

++		 * (busy) upon group destruction (if the corresponding

1010

++		 * processes have been forced to terminate). We move

1011

++		 * all the leaf entities corresponding to these queues

1012

++		 * to the root_group.

1013

++		 * Also, it may happen that the group has an entity

1014

++		 * in service, which is disconnected from the active

1015

++		 * tree: it must be moved, too.

1016

++		 * There is no need to put the sync queues, as the

1017

++		 * scheduler has taken no reference.

1018

++		 */

1019

++		bfq_reparent_active_entities(bfqd, bfqg, st);

1020

++		BUG_ON(!RB_EMPTY_ROOT(&st->active));

1021

++		BUG_ON(!RB_EMPTY_ROOT(&st->idle));

1022

++	}

1023

++	BUG_ON(bfqg->sched_data.next_in_service);

1024

++	BUG_ON(bfqg->sched_data.in_service_entity);

1025

++

1026

++	__bfq_deactivate_entity(entity, 0);

1027

++	bfq_put_async_queues(bfqd, bfqg);

1028

++	BUG_ON(entity->tree);

1029

++

1030

++	bfqg_stats_xfer_dead(bfqg);

1031

++}

1032

++

1033

++static void bfq_end_wr_async(struct bfq_data *bfqd)

1034

++{

1035

++	struct blkcg_gq *blkg;

1036

++

1037

++	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {

1038

++		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

1039

++

1040

++		bfq_end_wr_async_queues(bfqd, bfqg);

1041

++	}

1042

++	bfq_end_wr_async_queues(bfqd, bfqd->root_group);

1043

++}

1044

++

1045

++static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css,

1046

++				       struct cftype *cftype)

1047

++{

1048

++	struct blkcg *blkcg = css_to_blkcg(css);

1049

++	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);

1050

++	int ret = -EINVAL;

1051

++

1052

++	spin_lock_irq(&blkcg->lock);

1053

++	ret = bfqgd->weight;

1054

++	spin_unlock_irq(&blkcg->lock);

1055

++

1056

++	return ret;

1057

++}

1058

++

1059

++static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v)

1060

++{

1061

++	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));

1062

++	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);

1063

++

1064

++	spin_lock_irq(&blkcg->lock);

1065

++	seq_printf(sf, "%u\n", bfqgd->weight);

1066

++	spin_unlock_irq(&blkcg->lock);

1067

++

1068

++	return 0;

1069

++}

1070

++

1071

++static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,

1072

++					struct cftype *cftype,

1073

++					u64 val)

1074

++{

1075

++	struct blkcg *blkcg = css_to_blkcg(css);

1076

++	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);

1077

++	struct blkcg_gq *blkg;

1078

++	int ret = -EINVAL;

1079

++

1080

++	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)

1081

++		return ret;

1082

++

1083

++	ret = 0;

1084

++	spin_lock_irq(&blkcg->lock);

1085

++	bfqgd->weight = (unsigned short)val;

1086

++	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {

1087

++		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

1088

++		if (!bfqg)

1089

++			continue;

1090

++		/*

1091

++		 * Setting the prio_changed flag of the entity

1092

++		 * to 1 with new_weight == weight would re-set

1093

++		 * the value of the weight to its ioprio mapping.

1094

++		 * Set the flag only if necessary.

1095

++		 */

1096

++		if ((unsigned short)val != bfqg->entity.new_weight) {

1097

++			bfqg->entity.new_weight = (unsigned short)val;

1098

++			/*

1099

++			 * Make sure that the above new value has been

1100

++			 * stored in bfqg->entity.new_weight before

1101

++			 * setting the prio_changed flag. In fact,

1102

++			 * this flag may be read asynchronously (in

1103

++			 * critical sections protected by a different

1104

++			 * lock than that held here), and finding this

1105

++			 * flag set may cause the execution of the code

1106

++			 * for updating parameters whose value may

1107

++			 * depend also on bfqg->entity.new_weight (in

1108

++			 * __bfq_entity_update_weight_prio).

1109

++			 * This barrier makes sure that the new value

1110

++			 * of bfqg->entity.new_weight is correctly

1111

++			 * seen in that code.

1112

++			 */

1113

++			smp_wmb();

1114

++			bfqg->entity.prio_changed = 1;

1115

++		}

1116

++	}

1117

++	spin_unlock_irq(&blkcg->lock);

1118

++

1119

++	return ret;

1120

++}

1121

++

1122

++static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of,

1123

++					     char *buf, size_t nbytes,

1124

++					     loff_t off)

1125

++{

1126

++	/* First unsigned long long found in the file is used */

1127

++	return bfqio_cgroup_weight_write(of_css(of), NULL,

1128

++					 simple_strtoull(strim(buf), NULL, 0));

1129

++}

1130

++

1131

++static int bfqg_print_stat(struct seq_file *sf, void *v)

1132

++{

1133

++	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,

1134

++			  &blkcg_policy_bfq, seq_cft(sf)->private, false);

1135

++	return 0;

1136

++}

1137

++

1138

++static int bfqg_print_rwstat(struct seq_file *sf, void *v)

1139

++{

1140

++	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,

1141

++			  &blkcg_policy_bfq, seq_cft(sf)->private, true);

1142

++	return 0;

1143

++}

1144

++

1145

++static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,

1146

++				      struct blkg_policy_data *pd, int off)

1147

++{

1148

++	u64 sum = bfqg_stat_pd_recursive_sum(pd, off);

1149

++

1150

++	return __blkg_prfill_u64(sf, pd, sum);

1151

++}

1152

++

1153

++static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,

1154

++					struct blkg_policy_data *pd, int off)

1155

++{

1156

++	struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off);

1157

++

1158

++	return __blkg_prfill_rwstat(sf, pd, &sum);

1159

++}

1160

++

1161

++static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)

1162

++{

1163

++	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),

1164

++			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,

1165

++			  seq_cft(sf)->private, false);

1166

++	return 0;

1167

++}

1168

++

1169

++static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)

1170

++{

1171

++	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),

1172

++			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,

1173

++			  seq_cft(sf)->private, true);

1174

++	return 0;

1175

++}

1176

++

1177

++static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,

1178

++				      struct blkg_policy_data *pd, int off)

1179

++{

1180

++	struct bfq_group *bfqg = pd_to_bfqg(pd);

1181

++	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);

1182

++	u64 v = 0;

1183

++

1184

++	if (samples) {

1185

++		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);

1186

++		v = div64_u64(v, samples);

1187

++	}

1188

++	__blkg_prfill_u64(sf, pd, v);

1189

++	return 0;

1190

++}

1191

++

1192

++/* print avg_queue_size */

1193

++static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)

1194

++{

1195

++	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),

1196

++			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,

1197

++			  0, false);

1198

++	return 0;

1199

++}

1200

++

1201

++static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)

1202

++{

1203

++	int ret;

1204

++

1205

++	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);

1206

++	if (ret)

1207

++		return NULL;

1208

++

1209

++        return blkg_to_bfqg(bfqd->queue->root_blkg);

1210

++}

1211

++

1212

++static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)

1213

++{

1214

++        struct bfq_group_data *bgd;

1215

++

1216

++        bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);

1217

++        if (!bgd)

1218

++                return NULL;

1219

++        return &bgd->pd;

1220

++}

1221

++

1222

++static void bfq_cpd_free(struct blkcg_policy_data *cpd)

1223

++{

1224

++        kfree(cpd_to_bfqgd(cpd));

1225

++}

1226

++

1227

++static struct cftype bfqio_files_dfl[] = {

1228

++	{

1229

++		.name = "weight",

1230

++		.flags = CFTYPE_NOT_ON_ROOT,

1231

++		.seq_show = bfqio_cgroup_weight_read_dfl,

1232

++		.write = bfqio_cgroup_weight_write_dfl,

1233

++	},

1234

++	{} /* terminate */

1235

++};

1236

++

1237

++static struct cftype bfqio_files[] = {

1238

++	{

1239

++		.name = "bfq.weight",

1240

++		.read_u64 = bfqio_cgroup_weight_read,

1241

++		.write_u64 = bfqio_cgroup_weight_write,

1242

++	},

1243

++	/* statistics, cover only the tasks in the bfqg */

1244

++	{

1245

++		.name = "bfq.time",

1246

++		.private = offsetof(struct bfq_group, stats.time),

1247

++		.seq_show = bfqg_print_stat,

1248

++	},

1249

++	{

1250

++		.name = "bfq.sectors",

1251

++		.private = offsetof(struct bfq_group, stats.sectors),

1252

++		.seq_show = bfqg_print_stat,

1253

++	},

1254

++	{

1255

++		.name = "bfq.io_service_bytes",

1256

++		.private = offsetof(struct bfq_group, stats.service_bytes),

1257

++		.seq_show = bfqg_print_rwstat,

1258

++	},

1259

++	{

1260

++		.name = "bfq.io_serviced",

1261

++		.private = offsetof(struct bfq_group, stats.serviced),

1262

++		.seq_show = bfqg_print_rwstat,

1263

++	},

1264

++	{

1265

++		.name = "bfq.io_service_time",

1266

++		.private = offsetof(struct bfq_group, stats.service_time),

1267

++		.seq_show = bfqg_print_rwstat,

1268

++	},

1269

++	{

1270

++		.name = "bfq.io_wait_time",

1271

++		.private = offsetof(struct bfq_group, stats.wait_time),

1272

++		.seq_show = bfqg_print_rwstat,

1273

++	},

1274

++	{

1275

++		.name = "bfq.io_merged",

1276

++		.private = offsetof(struct bfq_group, stats.merged),

1277

++		.seq_show = bfqg_print_rwstat,

1278

++	},

1279

++	{

1280

++		.name = "bfq.io_queued",

1281

++		.private = offsetof(struct bfq_group, stats.queued),

1282

++		.seq_show = bfqg_print_rwstat,

1283

++	},

1284

++

1285

++	/* the same statistics which cover the bfqg and its descendants */

1286

++	{

1287

++		.name = "bfq.time_recursive",

1288

++		.private = offsetof(struct bfq_group, stats.time),

1289

++		.seq_show = bfqg_print_stat_recursive,

1290

++	},

1291

++	{

1292

++		.name = "bfq.sectors_recursive",

1293

++		.private = offsetof(struct bfq_group, stats.sectors),

1294

++		.seq_show = bfqg_print_stat_recursive,

1295

++	},

1296

++	{

1297

++		.name = "bfq.io_service_bytes_recursive",

1298

++		.private = offsetof(struct bfq_group, stats.service_bytes),

1299

++		.seq_show = bfqg_print_rwstat_recursive,

1300

++	},

1301

++	{

1302

++		.name = "bfq.io_serviced_recursive",

1303

++		.private = offsetof(struct bfq_group, stats.serviced),

1304

++		.seq_show = bfqg_print_rwstat_recursive,

1305

++	},

1306

++	{

1307

++		.name = "bfq.io_service_time_recursive",

1308

++		.private = offsetof(struct bfq_group, stats.service_time),

1309

++		.seq_show = bfqg_print_rwstat_recursive,

1310

++	},

1311

++	{

1312

++		.name = "bfq.io_wait_time_recursive",

1313

++		.private = offsetof(struct bfq_group, stats.wait_time),

1314

++		.seq_show = bfqg_print_rwstat_recursive,

1315

++	},

1316

++	{

1317

++		.name = "bfq.io_merged_recursive",

1318

++		.private = offsetof(struct bfq_group, stats.merged),

1319

++		.seq_show = bfqg_print_rwstat_recursive,

1320

++	},

1321

++	{

1322

++		.name = "bfq.io_queued_recursive",

1323

++		.private = offsetof(struct bfq_group, stats.queued),

1324

++		.seq_show = bfqg_print_rwstat_recursive,

1325

++	},

1326

++	{

1327

++		.name = "bfq.avg_queue_size",

1328

++		.seq_show = bfqg_print_avg_queue_size,

1329

++	},

1330

++	{

1331

++		.name = "bfq.group_wait_time",

1332

++		.private = offsetof(struct bfq_group, stats.group_wait_time),

1333

++		.seq_show = bfqg_print_stat,

1334

++	},

1335

++	{

1336

++		.name = "bfq.idle_time",

1337

++		.private = offsetof(struct bfq_group, stats.idle_time),

1338

++		.seq_show = bfqg_print_stat,

1339

++	},

1340

++	{

1341

++		.name = "bfq.empty_time",

1342

++		.private = offsetof(struct bfq_group, stats.empty_time),

1343

++		.seq_show = bfqg_print_stat,

1344

++	},

1345

++	{

1346

++		.name = "bfq.dequeue",

1347

++		.private = offsetof(struct bfq_group, stats.dequeue),

1348

++		.seq_show = bfqg_print_stat,

1349

++	},

1350

++	{

1351

++		.name = "bfq.unaccounted_time",

1352

++		.private = offsetof(struct bfq_group, stats.unaccounted_time),

1353

++		.seq_show = bfqg_print_stat,

1354

++	},

1355

++	{ }	/* terminate */

1356

++};

1357

++

1358

++static struct blkcg_policy blkcg_policy_bfq = {

1359

++       .dfl_cftypes            = bfqio_files_dfl,

1360

++       .legacy_cftypes         = bfqio_files,

1361

++

1362

++       .pd_alloc_fn            = bfq_pd_alloc,

1363

++       .pd_init_fn             = bfq_pd_init,

1364

++       .pd_offline_fn          = bfq_pd_offline,

1365

++       .pd_free_fn             = bfq_pd_free,

1366

++       .pd_reset_stats_fn      = bfq_pd_reset_stats,

1367

++

1368

++       .cpd_alloc_fn           = bfq_cpd_alloc,

1369

++       .cpd_init_fn            = bfq_cpd_init,

1370

++       .cpd_bind_fn	       = bfq_cpd_init,

1371

++       .cpd_free_fn            = bfq_cpd_free,

1372

++

1373

++};

1374

++

1375

++#else

1376

++

1377

++static void bfq_init_entity(struct bfq_entity *entity,

1378

++			    struct bfq_group *bfqg)

1379

++{

1380

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

1381

++	entity->weight = entity->new_weight;

1382

++	entity->orig_weight = entity->new_weight;

1383

++	if (bfqq) {

1384

++		bfqq->ioprio = bfqq->new_ioprio;

1385

++		bfqq->ioprio_class = bfqq->new_ioprio_class;

1386

++	}

1387

++	entity->sched_data = &bfqg->sched_data;

1388

++}

1389

++

1390

++static struct bfq_group *

1391

++bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)

1392

++{

1393

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

1394

++	return bfqd->root_group;

1395

++}

1396

++

1397

++static void bfq_bfqq_move(struct bfq_data *bfqd,

1398

++			  struct bfq_queue *bfqq,

1399

++			  struct bfq_entity *entity,

1400

++			  struct bfq_group *bfqg)

1401

++{

1402

++}

1403

++

1404

++static void bfq_end_wr_async(struct bfq_data *bfqd)

1405

++{

1406

++	bfq_end_wr_async_queues(bfqd, bfqd->root_group);

1407

++}

1408

++

1409

++static void bfq_disconnect_groups(struct bfq_data *bfqd)

1410

++{

1411

++	bfq_put_async_queues(bfqd, bfqd->root_group);

1412

++}

1413

++

1414

++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,

1415

++                                              struct blkcg *blkcg)

1416

++{

1417

++	return bfqd->root_group;

1418

++}

1419

++

1420

++static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)

1421

++{

1422

++	struct bfq_group *bfqg;

1423

++	int i;

1424

++

1425

++	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);

1426

++	if (!bfqg)

1427

++		return NULL;

1428

++

1429

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)

1430

++		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

1431

++

1432

++	return bfqg;

1433

++}

1434

++#endif

1435

+diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c

1436

+new file mode 100644

1437

+index 0000000..fb7bb8f

1438

+--- /dev/null

1439

++++ b/block/bfq-ioc.c

1440

+@@ -0,0 +1,36 @@

1441

++/*

1442

++ * BFQ: I/O context handling.

1443

++ *

1444

++ * Based on ideas and code from CFQ:

1445

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

1446

++ *

1447

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

1448

++ *		      Paolo Valente <paolo.valente@×××××××.it>

1449

++ *

1450

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

1451

++ */

1452

++

1453

++/**

1454

++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.

1455

++ * @icq: the iocontext queue.

1456

++ */

1457

++static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)

1458

++{

1459

++	/* bic->icq is the first member, %NULL will convert to %NULL */

1460

++	return container_of(icq, struct bfq_io_cq, icq);

1461

++}

1462

++

1463

++/**

1464

++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.

1465

++ * @bfqd: the lookup key.

1466

++ * @ioc: the io_context of the process doing I/O.

1467

++ *

1468

++ * Queue lock must be held.

1469

++ */

1470

++static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,

1471

++					struct io_context *ioc)

1472

++{

1473

++	if (ioc)

1474

++		return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));

1475

++	return NULL;

1476

++}

1477

+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

1478

+new file mode 100644

1479

+index 0000000..f9787a6

1480

+--- /dev/null

1481

++++ b/block/bfq-iosched.c

1482

+@@ -0,0 +1,3754 @@

1483

++/*

1484

++ * Budget Fair Queueing (BFQ) disk scheduler.

1485

++ *

1486

++ * Based on ideas and code from CFQ:

1487

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

1488

++ *

1489

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

1490

++ *		      Paolo Valente <paolo.valente@×××××××.it>

1491

++ *

1492

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

1493

++ *

1494

++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ

1495

++ * file.

1496

++ *

1497

++ * BFQ is a proportional-share storage-I/O scheduling algorithm based on

1498

++ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,

1499

++ * measured in number of sectors, to processes instead of time slices. The

1500

++ * device is not granted to the in-service process for a given time slice,

1501

++ * but until it has exhausted its assigned budget. This change from the time

1502

++ * to the service domain allows BFQ to distribute the device throughput

1503

++ * among processes as desired, without any distortion due to ZBR, workload

1504

++ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,

1505

++ * called B-WF2Q+, to schedule processes according to their budgets. More

1506

++ * precisely, BFQ schedules queues associated to processes. Thanks to the

1507

++ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to

1508

++ * I/O-bound processes issuing sequential requests (to boost the

1509

++ * throughput), and yet guarantee a low latency to interactive and soft

1510

++ * real-time applications.

1511

++ *

1512

++ * BFQ is described in [1], where also a reference to the initial, more

1513

++ * theoretical paper on BFQ can be found. The interested reader can find

1514

++ * in the latter paper full details on the main algorithm, as well as

1515

++ * formulas of the guarantees and formal proofs of all the properties.

1516

++ * With respect to the version of BFQ presented in these papers, this

1517

++ * implementation adds a few more heuristics, such as the one that

1518

++ * guarantees a low latency to soft real-time applications, and a

1519

++ * hierarchical extension based on H-WF2Q+.

1520

++ *

1521

++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with

1522

++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)

1523

++ * complexity derives from the one introduced with EEVDF in [3].

1524

++ *

1525

++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness

1526

++ *     with the BFQ Disk I/O Scheduler'',

1527

++ *     Proceedings of the 5th Annual International Systems and Storage

1528

++ *     Conference (SYSTOR '12), June 2012.

1529

++ *

1530

++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf

1531

++ *

1532

++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing

1533

++ *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,

1534

++ *     Oct 1997.

1535

++ *

1536

++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz

1537

++ *

1538

++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline

1539

++ *     First: A Flexible and Accurate Mechanism for Proportional Share

1540

++ *     Resource Allocation,'' technical report.

1541

++ *

1542

++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf

1543

++ */

1544

++#include <linux/module.h>

1545

++#include <linux/slab.h>

1546

++#include <linux/blkdev.h>

1547

++#include <linux/cgroup.h>

1548

++#include <linux/elevator.h>

1549

++#include <linux/jiffies.h>

1550

++#include <linux/rbtree.h>

1551

++#include <linux/ioprio.h>

1552

++#include "bfq.h"

1553

++#include "blk.h"

1554

++

1555

++/* Expiration time of sync (0) and async (1) requests, in jiffies. */

1556

++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };

1557

++

1558

++/* Maximum backwards seek, in KiB. */

1559

++static const int bfq_back_max = 16 * 1024;

1560

++

1561

++/* Penalty of a backwards seek, in number of sectors. */

1562

++static const int bfq_back_penalty = 2;

1563

++

1564

++/* Idling period duration, in jiffies. */

1565

++static int bfq_slice_idle = HZ / 125;

1566

++

1567

++/* Minimum number of assigned budgets for which stats are safe to compute. */

1568

++static const int bfq_stats_min_budgets = 194;

1569

++

1570

++/* Default maximum budget values, in sectors and number of requests. */

1571

++static const int bfq_default_max_budget = 16 * 1024;

1572

++static const int bfq_max_budget_async_rq = 4;

1573

++

1574

++/*

1575

++ * Async to sync throughput distribution is controlled as follows:

1576

++ * when an async request is served, the entity is charged the number

1577

++ * of sectors of the request, multiplied by the factor below

1578

++ */

1579

++static const int bfq_async_charge_factor = 10;

1580

++

1581

++/* Default timeout values, in jiffies, approximating CFQ defaults. */

1582

++static const int bfq_timeout_sync = HZ / 8;

1583

++static int bfq_timeout_async = HZ / 25;

1584

++

1585

++struct kmem_cache *bfq_pool;

1586

++

1587

++/* Below this threshold (in ms), we consider thinktime immediate. */

1588

++#define BFQ_MIN_TT		2

1589

++

1590

++/* hw_tag detection: parallel requests threshold and min samples needed. */

1591

++#define BFQ_HW_QUEUE_THRESHOLD	4

1592

++#define BFQ_HW_QUEUE_SAMPLES	32

1593

++

1594

++#define BFQQ_SEEK_THR	 (sector_t)(8 * 1024)

1595

++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)

1596

++

1597

++/* Min samples used for peak rate estimation (for autotuning). */

1598

++#define BFQ_PEAK_RATE_SAMPLES	32

1599

++

1600

++/* Shift used for peak rate fixed precision calculations. */

1601

++#define BFQ_RATE_SHIFT		16

1602

++

1603

++/*

1604

++ * By default, BFQ computes the duration of the weight raising for

1605

++ * interactive applications automatically, using the following formula:

1606

++ * duration = (R / r) * T, where r is the peak rate of the device, and

1607

++ * R and T are two reference parameters.

1608

++ * In particular, R is the peak rate of the reference device (see below),

1609

++ * and T is a reference time: given the systems that are likely to be

1610

++ * installed on the reference device according to its speed class, T is

1611

++ * about the maximum time needed, under BFQ and while reading two files in

1612

++ * parallel, to load typical large applications on these systems.

1613

++ * In practice, the slower/faster the device at hand is, the more/less it

1614

++ * takes to load applications with respect to the reference device.

1615

++ * Accordingly, the longer/shorter BFQ grants weight raising to interactive

1616

++ * applications.

1617

++ *

1618

++ * BFQ uses four different reference pairs (R, T), depending on:

1619

++ * . whether the device is rotational or non-rotational;

1620

++ * . whether the device is slow, such as old or portable HDDs, as well as

1621

++ *   SD cards, or fast, such as newer HDDs and SSDs.

1622

++ *

1623

++ * The device's speed class is dynamically (re)detected in

1624

++ * bfq_update_peak_rate() every time the estimated peak rate is updated.

1625

++ *

1626

++ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]

1627

++ * are the reference values for a slow/fast rotational device, whereas

1628

++ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for

1629

++ * a slow/fast non-rotational device. Finally, device_speed_thresh are the

1630

++ * thresholds used to switch between speed classes.

1631

++ * Both the reference peak rates and the thresholds are measured in

1632

++ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.

1633

++ */

1634

++static int R_slow[2] = {1536, 10752};

1635

++static int R_fast[2] = {17415, 34791};

1636

++/*

1637

++ * To improve readability, a conversion function is used to initialize the

1638

++ * following arrays, which entails that they can be initialized only in a

1639

++ * function.

1640

++ */

1641

++static int T_slow[2];

1642

++static int T_fast[2];

1643

++static int device_speed_thresh[2];

1644

++

1645

++#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\

1646

++				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

1647

++

1648

++#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])

1649

++#define RQ_BFQQ(rq)		((rq)->elv.priv[1])

1650

++

1651

++static void bfq_schedule_dispatch(struct bfq_data *bfqd);

1652

++

1653

++#include "bfq-ioc.c"

1654

++#include "bfq-sched.c"

1655

++#include "bfq-cgroup.c"

1656

++

1657

++#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)

1658

++#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)

1659

++

1660

++#define bfq_sample_valid(samples)	((samples) > 80)

1661

++

1662

++/*

1663

++ * We regard a request as SYNC, if either it's a read or has the SYNC bit

1664

++ * set (in which case it could also be a direct WRITE).

1665

++ */

1666

++static int bfq_bio_sync(struct bio *bio)

1667

++{

1668

++	if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))

1669

++		return 1;

1670

++

1671

++	return 0;

1672

++}

1673

++

1674

++/*

1675

++ * Scheduler run of queue, if there are requests pending and no one in the

1676

++ * driver that will restart queueing.

1677

++ */

1678

++static void bfq_schedule_dispatch(struct bfq_data *bfqd)

1679

++{

1680

++	if (bfqd->queued != 0) {

1681

++		bfq_log(bfqd, "schedule dispatch");

1682

++		kblockd_schedule_work(&bfqd->unplug_work);

1683

++	}

1684

++}

1685

++

1686

++/*

1687

++ * Lifted from AS - choose which of rq1 and rq2 is best served now.

1688

++ * We choose the request that is closest to the head right now.  Distance

1689

++ * behind the head is penalized and only allowed to a certain extent.

1690

++ */

1691

++static struct request *bfq_choose_req(struct bfq_data *bfqd,

1692

++				      struct request *rq1,

1693

++				      struct request *rq2,

1694

++				      sector_t last)

1695

++{

1696

++	sector_t s1, s2, d1 = 0, d2 = 0;

1697

++	unsigned long back_max;

1698

++#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */

1699

++#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */

1700

++	unsigned wrap = 0; /* bit mask: requests behind the disk head? */

1701

++

1702

++	if (!rq1 || rq1 == rq2)

1703

++		return rq2;

1704

++	if (!rq2)

1705

++		return rq1;

1706

++

1707

++	if (rq_is_sync(rq1) && !rq_is_sync(rq2))

1708

++		return rq1;

1709

++	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))

1710

++		return rq2;

1711

++	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))

1712

++		return rq1;

1713

++	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))

1714

++		return rq2;

1715

++

1716

++	s1 = blk_rq_pos(rq1);

1717

++	s2 = blk_rq_pos(rq2);

1718

++

1719

++	/*

1720

++	 * By definition, 1KiB is 2 sectors.

1721

++	 */

1722

++	back_max = bfqd->bfq_back_max * 2;

1723

++

1724

++	/*

1725

++	 * Strict one way elevator _except_ in the case where we allow

1726

++	 * short backward seeks which are biased as twice the cost of a

1727

++	 * similar forward seek.

1728

++	 */

1729

++	if (s1 >= last)

1730

++		d1 = s1 - last;

1731

++	else if (s1 + back_max >= last)

1732

++		d1 = (last - s1) * bfqd->bfq_back_penalty;

1733

++	else

1734

++		wrap |= BFQ_RQ1_WRAP;

1735

++

1736

++	if (s2 >= last)

1737

++		d2 = s2 - last;

1738

++	else if (s2 + back_max >= last)

1739

++		d2 = (last - s2) * bfqd->bfq_back_penalty;

1740

++	else

1741

++		wrap |= BFQ_RQ2_WRAP;

1742

++

1743

++	/* Found required data */

1744

++

1745

++	/*

1746

++	 * By doing switch() on the bit mask "wrap" we avoid having to

1747

++	 * check two variables for all permutations: --> faster!

1748

++	 */

1749

++	switch (wrap) {

1750

++	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */

1751

++		if (d1 < d2)

1752

++			return rq1;

1753

++		else if (d2 < d1)

1754

++			return rq2;

1755

++		else {

1756

++			if (s1 >= s2)

1757

++				return rq1;

1758

++			else

1759

++				return rq2;

1760

++		}

1761

++

1762

++	case BFQ_RQ2_WRAP:

1763

++		return rq1;

1764

++	case BFQ_RQ1_WRAP:

1765

++		return rq2;

1766

++	case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */

1767

++	default:

1768

++		/*

1769

++		 * Since both rqs are wrapped,

1770

++		 * start with the one that's further behind head

1771

++		 * (--> only *one* back seek required),

1772

++		 * since back seek takes more time than forward.

1773

++		 */

1774

++		if (s1 <= s2)

1775

++			return rq1;

1776

++		else

1777

++			return rq2;

1778

++	}

1779

++}

1780

++

1781

++/*

1782

++ * Tell whether there are active queues or groups with differentiated weights.

1783

++ */

1784

++static bool bfq_differentiated_weights(struct bfq_data *bfqd)

1785

++{

1786

++	/*

1787

++	 * For weights to differ, at least one of the trees must contain

1788

++	 * at least two nodes.

1789

++	 */

1790

++	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&

1791

++		(bfqd->queue_weights_tree.rb_node->rb_left ||

1792

++		 bfqd->queue_weights_tree.rb_node->rb_right)

1793

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

1794

++	       ) ||

1795

++	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&

1796

++		(bfqd->group_weights_tree.rb_node->rb_left ||

1797

++		 bfqd->group_weights_tree.rb_node->rb_right)

1798

++#endif

1799

++	       );

1800

++}

1801

++

1802

++/*

1803

++ * The following function returns true if every queue must receive the

1804

++ * same share of the throughput (this condition is used when deciding

1805

++ * whether idling may be disabled, see the comments in the function

1806

++ * bfq_bfqq_may_idle()).

1807

++ *

1808

++ * Such a scenario occurs when:

1809

++ * 1) all active queues have the same weight,

1810

++ * 2) all active groups at the same level in the groups tree have the same

1811

++ *    weight,

1812

++ * 3) all active groups at the same level in the groups tree have the same

1813

++ *    number of children.

1814

++ *

1815

++ * Unfortunately, keeping the necessary state for evaluating exactly the

1816

++ * above symmetry conditions would be quite complex and time-consuming.

1817

++ * Therefore this function evaluates, instead, the following stronger

1818

++ * sub-conditions, for which it is much easier to maintain the needed

1819

++ * state:

1820

++ * 1) all active queues have the same weight,

1821

++ * 2) all active groups have the same weight,

1822

++ * 3) all active groups have at most one active child each.

1823

++ * In particular, the last two conditions are always true if hierarchical

1824

++ * support and the cgroups interface are not enabled, thus no state needs

1825

++ * to be maintained in this case.

1826

++ */

1827

++static bool bfq_symmetric_scenario(struct bfq_data *bfqd)

1828

++{

1829

++	return

1830

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

1831

++		!bfqd->active_numerous_groups &&

1832

++#endif

1833

++		!bfq_differentiated_weights(bfqd);

1834

++}

1835

++

1836

++/*

1837

++ * If the weight-counter tree passed as input contains no counter for

1838

++ * the weight of the input entity, then add that counter; otherwise just

1839

++ * increment the existing counter.

1840

++ *

1841

++ * Note that weight-counter trees contain few nodes in mostly symmetric

1842

++ * scenarios. For example, if all queues have the same weight, then the

1843

++ * weight-counter tree for the queues may contain at most one node.

1844

++ * This holds even if low_latency is on, because weight-raised queues

1845

++ * are not inserted in the tree.

1846

++ * In most scenarios, the rate at which nodes are created/destroyed

1847

++ * should be low too.

1848

++ */

1849

++static void bfq_weights_tree_add(struct bfq_data *bfqd,

1850

++				 struct bfq_entity *entity,

1851

++				 struct rb_root *root)

1852

++{

1853

++	struct rb_node **new = &(root->rb_node), *parent = NULL;

1854

++

1855

++	/*

1856

++	 * Do not insert if the entity is already associated with a

1857

++	 * counter, which happens if:

1858

++	 *   1) the entity is associated with a queue,

1859

++	 *   2) a request arrival has caused the queue to become both

1860

++	 *      non-weight-raised, and hence change its weight, and

1861

++	 *      backlogged; in this respect, each of the two events

1862

++	 *      causes an invocation of this function,

1863

++	 *   3) this is the invocation of this function caused by the

1864

++	 *      second event. This second invocation is actually useless,

1865

++	 *      and we handle this fact by exiting immediately. More

1866

++	 *      efficient or clearer solutions might possibly be adopted.

1867

++	 */

1868

++	if (entity->weight_counter)

1869

++		return;

1870

++

1871

++	while (*new) {

1872

++		struct bfq_weight_counter *__counter = container_of(*new,

1873

++						struct bfq_weight_counter,

1874

++						weights_node);

1875

++		parent = *new;

1876

++

1877

++		if (entity->weight == __counter->weight) {

1878

++			entity->weight_counter = __counter;

1879

++			goto inc_counter;

1880

++		}

1881

++		if (entity->weight < __counter->weight)

1882

++			new = &((*new)->rb_left);

1883

++		else

1884

++			new = &((*new)->rb_right);

1885

++	}

1886

++

1887

++	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),

1888

++					 GFP_ATOMIC);

1889

++	entity->weight_counter->weight = entity->weight;

1890

++	rb_link_node(&entity->weight_counter->weights_node, parent, new);

1891

++	rb_insert_color(&entity->weight_counter->weights_node, root);

1892

++

1893

++inc_counter:

1894

++	entity->weight_counter->num_active++;

1895

++}

1896

++

1897

++/*

1898

++ * Decrement the weight counter associated with the entity, and, if the

1899

++ * counter reaches 0, remove the counter from the tree.

1900

++ * See the comments to the function bfq_weights_tree_add() for considerations

1901

++ * about overhead.

1902

++ */

1903

++static void bfq_weights_tree_remove(struct bfq_data *bfqd,

1904

++				    struct bfq_entity *entity,

1905

++				    struct rb_root *root)

1906

++{

1907

++	if (!entity->weight_counter)

1908

++		return;

1909

++

1910

++	BUG_ON(RB_EMPTY_ROOT(root));

1911

++	BUG_ON(entity->weight_counter->weight != entity->weight);

1912

++

1913

++	BUG_ON(!entity->weight_counter->num_active);

1914

++	entity->weight_counter->num_active--;

1915

++	if (entity->weight_counter->num_active > 0)

1916

++		goto reset_entity_pointer;

1917

++

1918

++	rb_erase(&entity->weight_counter->weights_node, root);

1919

++	kfree(entity->weight_counter);

1920

++

1921

++reset_entity_pointer:

1922

++	entity->weight_counter = NULL;

1923

++}

1924

++

1925

++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,

1926

++					struct bfq_queue *bfqq,

1927

++					struct request *last)

1928

++{

1929

++	struct rb_node *rbnext = rb_next(&last->rb_node);

1930

++	struct rb_node *rbprev = rb_prev(&last->rb_node);

1931

++	struct request *next = NULL, *prev = NULL;

1932

++

1933

++	BUG_ON(RB_EMPTY_NODE(&last->rb_node));

1934

++

1935

++	if (rbprev)

1936

++		prev = rb_entry_rq(rbprev);

1937

++

1938

++	if (rbnext)

1939

++		next = rb_entry_rq(rbnext);

1940

++	else {

1941

++		rbnext = rb_first(&bfqq->sort_list);

1942

++		if (rbnext && rbnext != &last->rb_node)

1943

++			next = rb_entry_rq(rbnext);

1944

++	}

1945

++

1946

++	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));

1947

++}

1948

++

1949

++/* see the definition of bfq_async_charge_factor for details */

1950

++static unsigned long bfq_serv_to_charge(struct request *rq,

1951

++					struct bfq_queue *bfqq)

1952

++{

1953

++	return blk_rq_sectors(rq) *

1954

++		(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *

1955

++		bfq_async_charge_factor));

1956

++}

1957

++

1958

++/**

1959

++ * bfq_updated_next_req - update the queue after a new next_rq selection.

1960

++ * @bfqd: the device data the queue belongs to.

1961

++ * @bfqq: the queue to update.

1962

++ *

1963

++ * If the first request of a queue changes we make sure that the queue

1964

++ * has enough budget to serve at least its first request (if the

1965

++ * request has grown).  We do this because if the queue has not enough

1966

++ * budget for its first request, it has to go through two dispatch

1967

++ * rounds to actually get it dispatched.

1968

++ */

1969

++static void bfq_updated_next_req(struct bfq_data *bfqd,

1970

++				 struct bfq_queue *bfqq)

1971

++{

1972

++	struct bfq_entity *entity = &bfqq->entity;

1973

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

1974

++	struct request *next_rq = bfqq->next_rq;

1975

++	unsigned long new_budget;

1976

++

1977

++	if (!next_rq)

1978

++		return;

1979

++

1980

++	if (bfqq == bfqd->in_service_queue)

1981

++		/*

1982

++		 * In order not to break guarantees, budgets cannot be

1983

++		 * changed after an entity has been selected.

1984

++		 */

1985

++		return;

1986

++

1987

++	BUG_ON(entity->tree != &st->active);

1988

++	BUG_ON(entity == entity->sched_data->in_service_entity);

1989

++

1990

++	new_budget = max_t(unsigned long, bfqq->max_budget,

1991

++			   bfq_serv_to_charge(next_rq, bfqq));

1992

++	if (entity->budget != new_budget) {

1993

++		entity->budget = new_budget;

1994

++		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",

1995

++					 new_budget);

1996

++		bfq_activate_bfqq(bfqd, bfqq);

1997

++	}

1998

++}

1999

++

2000

++static unsigned int bfq_wr_duration(struct bfq_data *bfqd)

2001

++{

2002

++	u64 dur;

2003

++

2004

++	if (bfqd->bfq_wr_max_time > 0)

2005

++		return bfqd->bfq_wr_max_time;

2006

++

2007

++	dur = bfqd->RT_prod;

2008

++	do_div(dur, bfqd->peak_rate);

2009

++

2010

++	return dur;

2011

++}

2012

++

2013

++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */

2014

++static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)

2015

++{

2016

++	struct bfq_queue *item;

2017

++	struct hlist_node *n;

2018

++

2019

++	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)

2020

++		hlist_del_init(&item->burst_list_node);

2021

++	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);

2022

++	bfqd->burst_size = 1;

2023

++}

2024

++

2025

++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */

2026

++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)

2027

++{

2028

++	/* Increment burst size to take into account also bfqq */

2029

++	bfqd->burst_size++;

2030

++

2031

++	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {

2032

++		struct bfq_queue *pos, *bfqq_item;

2033

++		struct hlist_node *n;

2034

++

2035

++		/*

2036

++		 * Enough queues have been activated shortly after each

2037

++		 * other to consider this burst as large.

2038

++		 */

2039

++		bfqd->large_burst = true;

2040

++

2041

++		/*

2042

++		 * We can now mark all queues in the burst list as

2043

++		 * belonging to a large burst.

2044

++		 */

2045

++		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,

2046

++				     burst_list_node)

2047

++		        bfq_mark_bfqq_in_large_burst(bfqq_item);

2048

++		bfq_mark_bfqq_in_large_burst(bfqq);

2049

++

2050

++		/*

2051

++		 * From now on, and until the current burst finishes, any

2052

++		 * new queue being activated shortly after the last queue

2053

++		 * was inserted in the burst can be immediately marked as

2054

++		 * belonging to a large burst. So the burst list is not

2055

++		 * needed any more. Remove it.

2056

++		 */

2057

++		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,

2058

++					  burst_list_node)

2059

++			hlist_del_init(&pos->burst_list_node);

2060

++	} else /* burst not yet large: add bfqq to the burst list */

2061

++		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);

2062

++}

2063

++

2064

++/*

2065

++ * If many queues happen to become active shortly after each other, then,

2066

++ * to help the processes associated to these queues get their job done as

2067

++ * soon as possible, it is usually better to not grant either weight-raising

2068

++ * or device idling to these queues. In this comment we describe, firstly,

2069

++ * the reasons why this fact holds, and, secondly, the next function, which

2070

++ * implements the main steps needed to properly mark these queues so that

2071

++ * they can then be treated in a different way.

2072

++ *

2073

++ * As for the terminology, we say that a queue becomes active, i.e.,

2074

++ * switches from idle to backlogged, either when it is created (as a

2075

++ * consequence of the arrival of an I/O request), or, if already existing,

2076

++ * when a new request for the queue arrives while the queue is idle.

2077

++ * Bursts of activations, i.e., activations of different queues occurring

2078

++ * shortly after each other, are typically caused by services or applications

2079

++ * that spawn or reactivate many parallel threads/processes. Examples are

2080

++ * systemd during boot or git grep.

2081

++ *

2082

++ * These services or applications benefit mostly from a high throughput:

2083

++ * the quicker the requests of the activated queues are cumulatively served,

2084

++ * the sooner the target job of these queues gets completed. As a consequence,

2085

++ * weight-raising any of these queues, which also implies idling the device

2086

++ * for it, is almost always counterproductive: in most cases it just lowers

2087

++ * throughput.

2088

++ *

2089

++ * On the other hand, a burst of activations may be also caused by the start

2090

++ * of an application that does not consist of a lot of parallel I/O-bound

2091

++ * threads. In fact, with a complex application, the burst may be just a

2092

++ * consequence of the fact that several processes need to be executed to

2093

++ * start up the application. To start an application as quickly as possible,

2094

++ * the best thing to do is to privilege the I/O related to the application

2095

++ * with respect to all other I/O. Therefore, the best strategy to start as

2096

++ * quickly as possible an application that causes a burst of activations is

2097

++ * to weight-raise all the queues activated during the burst. This is the

2098

++ * exact opposite of the best strategy for the other type of bursts.

2099

++ *

2100

++ * In the end, to take the best action for each of the two cases, the two

2101

++ * types of bursts need to be distinguished. Fortunately, this seems

2102

++ * relatively easy to do, by looking at the sizes of the bursts. In

2103

++ * particular, we found a threshold such that bursts with a larger size

2104

++ * than that threshold are apparently caused only by services or commands

2105

++ * such as systemd or git grep. For brevity, hereafter we call just 'large'

2106

++ * these bursts. BFQ *does not* weight-raise queues whose activations occur

2107

++ * in a large burst. In addition, for each of these queues BFQ performs or

2108

++ * does not perform idling depending on which choice boosts the throughput

2109

++ * most. The exact choice depends on the device and request pattern at

2110

++ * hand.

2111

++ *

2112

++ * Turning back to the next function, it implements all the steps needed

2113

++ * to detect the occurrence of a large burst and to properly mark all the

2114

++ * queues belonging to it (so that they can then be treated in a different

2115

++ * way). This goal is achieved by maintaining a special "burst list" that

2116

++ * holds, temporarily, the queues that belong to the burst in progress. The

2117

++ * list is then used to mark these queues as belonging to a large burst if

2118

++ * the burst does become large. The main steps are the following.

2119

++ *

2120

++ * . when the very first queue is activated, the queue is inserted into the

2121

++ *   list (as it could be the first queue in a possible burst)

2122

++ *

2123

++ * . if the current burst has not yet become large, and a queue Q that does

2124

++ *   not yet belong to the burst is activated shortly after the last time

2125

++ *   at which a new queue entered the burst list, then the function appends

2126

++ *   Q to the burst list

2127

++ *

2128

++ * . if, as a consequence of the previous step, the burst size reaches

2129

++ *   the large-burst threshold, then

2130

++ *

2131

++ *     . all the queues in the burst list are marked as belonging to a

2132

++ *       large burst

2133

++ *

2134

++ *     . the burst list is deleted; in fact, the burst list already served

2135

++ *       its purpose (keeping temporarily track of the queues in a burst,

2136

++ *       so as to be able to mark them as belonging to a large burst in the

2137

++ *       previous sub-step), and now is not needed any more

2138

++ *

2139

++ *     . the device enters a large-burst mode

2140

++ *

2141

++ * . if a queue Q that does not belong to the burst is activated while

2142

++ *   the device is in large-burst mode and shortly after the last time

2143

++ *   at which a queue either entered the burst list or was marked as

2144

++ *   belonging to the current large burst, then Q is immediately marked

2145

++ *   as belonging to a large burst.

2146

++ *

2147

++ * . if a queue Q that does not belong to the burst is activated a while

2148

++ *   later, i.e., not shortly after, than the last time at which a queue

2149

++ *   either entered the burst list or was marked as belonging to the

2150

++ *   current large burst, then the current burst is deemed as finished and:

2151

++ *

2152

++ *        . the large-burst mode is reset if set

2153

++ *

2154

++ *        . the burst list is emptied

2155

++ *

2156

++ *        . Q is inserted in the burst list, as Q may be the first queue

2157

++ *          in a possible new burst (then the burst list contains just Q

2158

++ *          after this step).

2159

++ */

2160

++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,

2161

++			     bool idle_for_long_time)

2162

++{

2163

++	/*

2164

++	 * If bfqq happened to be activated in a burst, but has been idle

2165

++	 * for at least as long as an interactive queue, then we assume

2166

++	 * that, in the overall I/O initiated in the burst, the I/O

2167

++	 * associated to bfqq is finished. So bfqq does not need to be

2168

++	 * treated as a queue belonging to a burst anymore. Accordingly,

2169

++	 * we reset bfqq's in_large_burst flag if set, and remove bfqq

2170

++	 * from the burst list if it's there. However, we do not decrement

2171

++	 * burst_size, because the fact that bfqq does not need to belong

2172

++	 * to the burst list any more does not invalidate the fact that

2173

++	 * bfqq may have been activated during the current burst.

2174

++	 */

2175

++	if (idle_for_long_time) {

2176

++		hlist_del_init(&bfqq->burst_list_node);

2177

++		bfq_clear_bfqq_in_large_burst(bfqq);

2178

++	}

2179

++

2180

++	/*

2181

++	 * If bfqq is already in the burst list or is part of a large

2182

++	 * burst, then there is nothing else to do.

2183

++	 */

2184

++	if (!hlist_unhashed(&bfqq->burst_list_node) ||

2185

++	    bfq_bfqq_in_large_burst(bfqq))

2186

++		return;

2187

++

2188

++	/*

2189

++	 * If bfqq's activation happens late enough, then the current

2190

++	 * burst is finished, and related data structures must be reset.

2191

++	 *

2192

++	 * In this respect, consider the special case where bfqq is the very

2193

++	 * first queue being activated. In this case, last_ins_in_burst is

2194

++	 * not yet significant when we get here. But it is easy to verify

2195

++	 * that, whether or not the following condition is true, bfqq will

2196

++	 * end up being inserted into the burst list. In particular the

2197

++	 * list will happen to contain only bfqq. And this is exactly what

2198

++	 * has to happen, as bfqq may be the first queue in a possible

2199

++	 * burst.

2200

++	 */

2201

++	if (time_is_before_jiffies(bfqd->last_ins_in_burst +

2202

++	    bfqd->bfq_burst_interval)) {

2203

++		bfqd->large_burst = false;

2204

++		bfq_reset_burst_list(bfqd, bfqq);

2205

++		return;

2206

++	}

2207

++

2208

++	/*

2209

++	 * If we get here, then bfqq is being activated shortly after the

2210

++	 * last queue. So, if the current burst is also large, we can mark

2211

++	 * bfqq as belonging to this large burst immediately.

2212

++	 */

2213

++	if (bfqd->large_burst) {

2214

++		bfq_mark_bfqq_in_large_burst(bfqq);

2215

++		return;

2216

++	}

2217

++

2218

++	/*

2219

++	 * If we get here, then a large-burst state has not yet been

2220

++	 * reached, but bfqq is being activated shortly after the last

2221

++	 * queue. Then we add bfqq to the burst.

2222

++	 */

2223

++	bfq_add_to_burst(bfqd, bfqq);

2224

++}

2225

++

2226

++static void bfq_add_request(struct request *rq)

2227

++{

2228

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2229

++	struct bfq_entity *entity = &bfqq->entity;

2230

++	struct bfq_data *bfqd = bfqq->bfqd;

2231

++	struct request *next_rq, *prev;

2232

++	unsigned long old_wr_coeff = bfqq->wr_coeff;

2233

++	bool interactive = false;

2234

++

2235

++	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));

2236

++	bfqq->queued[rq_is_sync(rq)]++;

2237

++	bfqd->queued++;

2238

++

2239

++	elv_rb_add(&bfqq->sort_list, rq);

2240

++

2241

++	/*

2242

++	 * Check if this request is a better next-serve candidate.

2243

++	 */

2244

++	prev = bfqq->next_rq;

2245

++	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);

2246

++	BUG_ON(!next_rq);

2247

++	bfqq->next_rq = next_rq;

2248

++

2249

++	if (!bfq_bfqq_busy(bfqq)) {

2250

++		bool soft_rt, in_burst,

2251

++		     idle_for_long_time = time_is_before_jiffies(

2252

++						bfqq->budget_timeout +

2253

++						bfqd->bfq_wr_min_idle_time);

2254

++

2255

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2256

++		bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,

2257

++					 rq->cmd_flags);

2258

++#endif

2259

++		if (bfq_bfqq_sync(bfqq)) {

2260

++			bool already_in_burst =

2261

++			   !hlist_unhashed(&bfqq->burst_list_node) ||

2262

++			   bfq_bfqq_in_large_burst(bfqq);

2263

++			bfq_handle_burst(bfqd, bfqq, idle_for_long_time);

2264

++			/*

2265

++			 * If bfqq was not already in the current burst,

2266

++			 * then, at this point, bfqq either has been

2267

++			 * added to the current burst or has caused the

2268

++			 * current burst to terminate. In particular, in

2269

++			 * the second case, bfqq has become the first

2270

++			 * queue in a possible new burst.

2271

++			 * In both cases last_ins_in_burst needs to be

2272

++			 * moved forward.

2273

++			 */

2274

++			if (!already_in_burst)

2275

++				bfqd->last_ins_in_burst = jiffies;

2276

++		}

2277

++

2278

++		in_burst = bfq_bfqq_in_large_burst(bfqq);

2279

++		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&

2280

++			!in_burst &&

2281

++			time_is_before_jiffies(bfqq->soft_rt_next_start);

2282

++		interactive = !in_burst && idle_for_long_time;

2283

++		entity->budget = max_t(unsigned long, bfqq->max_budget,

2284

++				       bfq_serv_to_charge(next_rq, bfqq));

2285

++

2286

++		if (!bfq_bfqq_IO_bound(bfqq)) {

2287

++			if (time_before(jiffies,

2288

++					RQ_BIC(rq)->ttime.last_end_request +

2289

++					bfqd->bfq_slice_idle)) {

2290

++				bfqq->requests_within_timer++;

2291

++				if (bfqq->requests_within_timer >=

2292

++				    bfqd->bfq_requests_within_timer)

2293

++					bfq_mark_bfqq_IO_bound(bfqq);

2294

++			} else

2295

++				bfqq->requests_within_timer = 0;

2296

++		}

2297

++

2298

++		if (!bfqd->low_latency)

2299

++			goto add_bfqq_busy;

2300

++

2301

++		/*

2302

++		 * If the queue:

2303

++		 * - is not being boosted,

2304

++		 * - has been idle for enough time,

2305

++		 * - is not a sync queue or is linked to a bfq_io_cq (it is

2306

++		 *   shared "for its nature" or it is not shared and its

2307

++		 *   requests have not been redirected to a shared queue)

2308

++		 * start a weight-raising period.

2309

++		 */

2310

++		if (old_wr_coeff == 1 && (interactive || soft_rt) &&

2311

++		    (!bfq_bfqq_sync(bfqq) || bfqq->bic)) {

2312

++			bfqq->wr_coeff = bfqd->bfq_wr_coeff;

2313

++			if (interactive)

2314

++				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

2315

++			else

2316

++				bfqq->wr_cur_max_time =

2317

++					bfqd->bfq_wr_rt_max_time;

2318

++			bfq_log_bfqq(bfqd, bfqq,

2319

++				     "wrais starting at %lu, rais_max_time %u",

2320

++				     jiffies,

2321

++				     jiffies_to_msecs(bfqq->wr_cur_max_time));

2322

++		} else if (old_wr_coeff > 1) {

2323

++			if (interactive)

2324

++				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

2325

++			else if (in_burst ||

2326

++				 (bfqq->wr_cur_max_time ==

2327

++				  bfqd->bfq_wr_rt_max_time &&

2328

++				  !soft_rt)) {

2329

++				bfqq->wr_coeff = 1;

2330

++				bfq_log_bfqq(bfqd, bfqq,

2331

++					"wrais ending at %lu, rais_max_time %u",

2332

++					jiffies,

2333

++					jiffies_to_msecs(bfqq->

2334

++						wr_cur_max_time));

2335

++			} else if (time_before(

2336

++					bfqq->last_wr_start_finish +

2337

++					bfqq->wr_cur_max_time,

2338

++					jiffies +

2339

++					bfqd->bfq_wr_rt_max_time) &&

2340

++				   soft_rt) {

2341

++				/*

2342

++				 *

2343

++				 * The remaining weight-raising time is lower

2344

++				 * than bfqd->bfq_wr_rt_max_time, which means

2345

++				 * that the application is enjoying weight

2346

++				 * raising either because deemed soft-rt in

2347

++				 * the near past, or because deemed interactive

2348

++				 * long ago.

2349

++				 * In both cases, resetting now the current

2350

++				 * remaining weight-raising time for the

2351

++				 * application to the weight-raising duration

2352

++				 * for soft rt applications would not cause any

2353

++				 * latency increase for the application (as the

2354

++				 * new duration would be higher than the

2355

++				 * remaining time).

2356

++				 *

2357

++				 * In addition, the application is now meeting

2358

++				 * the requirements for being deemed soft rt.

2359

++				 * In the end we can correctly and safely

2360

++				 * (re)charge the weight-raising duration for

2361

++				 * the application with the weight-raising

2362

++				 * duration for soft rt applications.

2363

++				 *

2364

++				 * In particular, doing this recharge now, i.e.,

2365

++				 * before the weight-raising period for the

2366

++				 * application finishes, reduces the probability

2367

++				 * of the following negative scenario:

2368

++				 * 1) the weight of a soft rt application is

2369

++				 *    raised at startup (as for any newly

2370

++				 *    created application),

2371

++				 * 2) since the application is not interactive,

2372

++				 *    at a certain time weight-raising is

2373

++				 *    stopped for the application,

2374

++				 * 3) at that time the application happens to

2375

++				 *    still have pending requests, and hence

2376

++				 *    is destined to not have a chance to be

2377

++				 *    deemed soft rt before these requests are

2378

++				 *    completed (see the comments to the

2379

++				 *    function bfq_bfqq_softrt_next_start()

2380

++				 *    for details on soft rt detection),

2381

++				 * 4) these pending requests experience a high

2382

++				 *    latency because the application is not

2383

++				 *    weight-raised while they are pending.

2384

++				 */

2385

++				bfqq->last_wr_start_finish = jiffies;

2386

++				bfqq->wr_cur_max_time =

2387

++					bfqd->bfq_wr_rt_max_time;

2388

++			}

2389

++		}

2390

++		if (old_wr_coeff != bfqq->wr_coeff)

2391

++			entity->prio_changed = 1;

2392

++add_bfqq_busy:

2393

++		bfqq->last_idle_bklogged = jiffies;

2394

++		bfqq->service_from_backlogged = 0;

2395

++		bfq_clear_bfqq_softrt_update(bfqq);

2396

++		bfq_add_bfqq_busy(bfqd, bfqq);

2397

++	} else {

2398

++		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&

2399

++		    time_is_before_jiffies(

2400

++				bfqq->last_wr_start_finish +

2401

++				bfqd->bfq_wr_min_inter_arr_async)) {

2402

++			bfqq->wr_coeff = bfqd->bfq_wr_coeff;

2403

++			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

2404

++

2405

++			bfqd->wr_busy_queues++;

2406

++			entity->prio_changed = 1;

2407

++			bfq_log_bfqq(bfqd, bfqq,

2408

++			    "non-idle wrais starting at %lu, rais_max_time %u",

2409

++			    jiffies,

2410

++			    jiffies_to_msecs(bfqq->wr_cur_max_time));

2411

++		}

2412

++		if (prev != bfqq->next_rq)

2413

++			bfq_updated_next_req(bfqd, bfqq);

2414

++	}

2415

++

2416

++	if (bfqd->low_latency &&

2417

++		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))

2418

++		bfqq->last_wr_start_finish = jiffies;

2419

++}

2420

++

2421

++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,

2422

++					  struct bio *bio)

2423

++{

2424

++	struct task_struct *tsk = current;

2425

++	struct bfq_io_cq *bic;

2426

++	struct bfq_queue *bfqq;

2427

++

2428

++	bic = bfq_bic_lookup(bfqd, tsk->io_context);

2429

++	if (!bic)

2430

++		return NULL;

2431

++

2432

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

2433

++	if (bfqq)

2434

++		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));

2435

++

2436

++	return NULL;

2437

++}

2438

++

2439

++static void bfq_activate_request(struct request_queue *q, struct request *rq)

2440

++{

2441

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2442

++

2443

++	bfqd->rq_in_driver++;

2444

++	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);

2445

++	bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",

2446

++		(long long unsigned)bfqd->last_position);

2447

++}

2448

++

2449

++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)

2450

++{

2451

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2452

++

2453

++	BUG_ON(bfqd->rq_in_driver == 0);

2454

++	bfqd->rq_in_driver--;

2455

++}

2456

++

2457

++static void bfq_remove_request(struct request *rq)

2458

++{

2459

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2460

++	struct bfq_data *bfqd = bfqq->bfqd;

2461

++	const int sync = rq_is_sync(rq);

2462

++

2463

++	if (bfqq->next_rq == rq) {

2464

++		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);

2465

++		bfq_updated_next_req(bfqd, bfqq);

2466

++	}

2467

++

2468

++	if (rq->queuelist.prev != &rq->queuelist)

2469

++		list_del_init(&rq->queuelist);

2470

++	BUG_ON(bfqq->queued[sync] == 0);

2471

++	bfqq->queued[sync]--;

2472

++	bfqd->queued--;

2473

++	elv_rb_del(&bfqq->sort_list, rq);

2474

++

2475

++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

2476

++		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)

2477

++			bfq_del_bfqq_busy(bfqd, bfqq, 1);

2478

++		/*

2479

++		 * Remove queue from request-position tree as it is empty.

2480

++		 */

2481

++		if (bfqq->pos_root) {

2482

++			rb_erase(&bfqq->pos_node, bfqq->pos_root);

2483

++			bfqq->pos_root = NULL;

2484

++		}

2485

++	}

2486

++

2487

++	if (rq->cmd_flags & REQ_META) {

2488

++		BUG_ON(bfqq->meta_pending == 0);

2489

++		bfqq->meta_pending--;

2490

++	}

2491

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2492

++	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);

2493

++#endif

2494

++}

2495

++

2496

++static int bfq_merge(struct request_queue *q, struct request **req,

2497

++		     struct bio *bio)

2498

++{

2499

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2500

++	struct request *__rq;

2501

++

2502

++	__rq = bfq_find_rq_fmerge(bfqd, bio);

2503

++	if (__rq && elv_rq_merge_ok(__rq, bio)) {

2504

++		*req = __rq;

2505

++		return ELEVATOR_FRONT_MERGE;

2506

++	}

2507

++

2508

++	return ELEVATOR_NO_MERGE;

2509

++}

2510

++

2511

++static void bfq_merged_request(struct request_queue *q, struct request *req,

2512

++			       int type)

2513

++{

2514

++	if (type == ELEVATOR_FRONT_MERGE &&

2515

++	    rb_prev(&req->rb_node) &&

2516

++	    blk_rq_pos(req) <

2517

++	    blk_rq_pos(container_of(rb_prev(&req->rb_node),

2518

++				    struct request, rb_node))) {

2519

++		struct bfq_queue *bfqq = RQ_BFQQ(req);

2520

++		struct bfq_data *bfqd = bfqq->bfqd;

2521

++		struct request *prev, *next_rq;

2522

++

2523

++		/* Reposition request in its sort_list */

2524

++		elv_rb_del(&bfqq->sort_list, req);

2525

++		elv_rb_add(&bfqq->sort_list, req);

2526

++		/* Choose next request to be served for bfqq */

2527

++		prev = bfqq->next_rq;

2528

++		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,

2529

++					 bfqd->last_position);

2530

++		BUG_ON(!next_rq);

2531

++		bfqq->next_rq = next_rq;

2532

++	}

2533

++}

2534

++

2535

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2536

++static void bfq_bio_merged(struct request_queue *q, struct request *req,

2537

++			   struct bio *bio)

2538

++{

2539

++	bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw);

2540

++}

2541

++#endif

2542

++

2543

++static void bfq_merged_requests(struct request_queue *q, struct request *rq,

2544

++				struct request *next)

2545

++{

2546

++	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);

2547

++

2548

++	/*

2549

++	 * If next and rq belong to the same bfq_queue and next is older

2550

++	 * than rq, then reposition rq in the fifo (by substituting next

2551

++	 * with rq). Otherwise, if next and rq belong to different

2552

++	 * bfq_queues, never reposition rq: in fact, we would have to

2553

++	 * reposition it with respect to next's position in its own fifo,

2554

++	 * which would most certainly be too expensive with respect to

2555

++	 * the benefits.

2556

++	 */

2557

++	if (bfqq == next_bfqq &&

2558

++	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&

2559

++	    time_before(next->fifo_time, rq->fifo_time)) {

2560

++		list_del_init(&rq->queuelist);

2561

++		list_replace_init(&next->queuelist, &rq->queuelist);

2562

++		rq->fifo_time = next->fifo_time;

2563

++	}

2564

++

2565

++	if (bfqq->next_rq == next)

2566

++		bfqq->next_rq = rq;

2567

++

2568

++	bfq_remove_request(next);

2569

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2570

++	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);

2571

++#endif

2572

++}

2573

++

2574

++/* Must be called with bfqq != NULL */

2575

++static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)

2576

++{

2577

++	BUG_ON(!bfqq);

2578

++	if (bfq_bfqq_busy(bfqq))

2579

++		bfqq->bfqd->wr_busy_queues--;

2580

++	bfqq->wr_coeff = 1;

2581

++	bfqq->wr_cur_max_time = 0;

2582

++	/* Trigger a weight change on the next activation of the queue */

2583

++	bfqq->entity.prio_changed = 1;

2584

++}

2585

++

2586

++static void bfq_end_wr_async_queues(struct bfq_data *bfqd,

2587

++				    struct bfq_group *bfqg)

2588

++{

2589

++	int i, j;

2590

++

2591

++	for (i = 0; i < 2; i++)

2592

++		for (j = 0; j < IOPRIO_BE_NR; j++)

2593

++			if (bfqg->async_bfqq[i][j])

2594

++				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);

2595

++	if (bfqg->async_idle_bfqq)

2596

++		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);

2597

++}

2598

++

2599

++static void bfq_end_wr(struct bfq_data *bfqd)

2600

++{

2601

++	struct bfq_queue *bfqq;

2602

++

2603

++	spin_lock_irq(bfqd->queue->queue_lock);

2604

++

2605

++	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)

2606

++		bfq_bfqq_end_wr(bfqq);

2607

++	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)

2608

++		bfq_bfqq_end_wr(bfqq);

2609

++	bfq_end_wr_async(bfqd);

2610

++

2611

++	spin_unlock_irq(bfqd->queue->queue_lock);

2612

++}

2613

++

2614

++static int bfq_allow_merge(struct request_queue *q, struct request *rq,

2615

++			   struct bio *bio)

2616

++{

2617

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2618

++	struct bfq_io_cq *bic;

2619

++

2620

++	/*

2621

++	 * Disallow merge of a sync bio into an async request.

2622

++	 */

2623

++	if (bfq_bio_sync(bio) && !rq_is_sync(rq))

2624

++		return 0;

2625

++

2626

++	/*

2627

++	 * Lookup the bfqq that this bio will be queued with. Allow

2628

++	 * merge only if rq is queued there.

2629

++	 * Queue lock is held here.

2630

++	 */

2631

++	bic = bfq_bic_lookup(bfqd, current->io_context);

2632

++	if (!bic)

2633

++		return 0;

2634

++

2635

++	return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq);

2636

++}

2637

++

2638

++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

2639

++				       struct bfq_queue *bfqq)

2640

++{

2641

++	if (bfqq) {

2642

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2643

++		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));

2644

++#endif

2645

++		bfq_mark_bfqq_must_alloc(bfqq);

2646

++		bfq_mark_bfqq_budget_new(bfqq);

2647

++		bfq_clear_bfqq_fifo_expire(bfqq);

2648

++

2649

++		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

2650

++

2651

++		bfq_log_bfqq(bfqd, bfqq,

2652

++			     "set_in_service_queue, cur-budget = %d",

2653

++			     bfqq->entity.budget);

2654

++	}

2655

++

2656

++	bfqd->in_service_queue = bfqq;

2657

++}

2658

++

2659

++/*

2660

++ * Get and set a new queue for service.

2661

++ */

2662

++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)

2663

++{

2664

++	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);

2665

++

2666

++	__bfq_set_in_service_queue(bfqd, bfqq);

2667

++	return bfqq;

2668

++}

2669

++

2670

++/*

2671

++ * If enough samples have been computed, return the current max budget

2672

++ * stored in bfqd, which is dynamically updated according to the

2673

++ * estimated disk peak rate; otherwise return the default max budget

2674

++ */

2675

++static int bfq_max_budget(struct bfq_data *bfqd)

2676

++{

2677

++	if (bfqd->budgets_assigned < bfq_stats_min_budgets)

2678

++		return bfq_default_max_budget;

2679

++	else

2680

++		return bfqd->bfq_max_budget;

2681

++}

2682

++

2683

++/*

2684

++ * Return min budget, which is a fraction of the current or default

2685

++ * max budget (trying with 1/32)

2686

++ */

2687

++static int bfq_min_budget(struct bfq_data *bfqd)

2688

++{

2689

++	if (bfqd->budgets_assigned < bfq_stats_min_budgets)

2690

++		return bfq_default_max_budget / 32;

2691

++	else

2692

++		return bfqd->bfq_max_budget / 32;

2693

++}

2694

++

2695

++static void bfq_arm_slice_timer(struct bfq_data *bfqd)

2696

++{

2697

++	struct bfq_queue *bfqq = bfqd->in_service_queue;

2698

++	struct bfq_io_cq *bic;

2699

++	unsigned long sl;

2700

++

2701

++	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

2702

++

2703

++	/* Processes have exited, don't wait. */

2704

++	bic = bfqd->in_service_bic;

2705

++	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)

2706

++		return;

2707

++

2708

++	bfq_mark_bfqq_wait_request(bfqq);

2709

++

2710

++	/*

2711

++	 * We don't want to idle for seeks, but we do want to allow

2712

++	 * fair distribution of slice time for a process doing back-to-back

2713

++	 * seeks. So allow a little bit of time for it to submit a new rq.

2714

++	 *

2715

++	 * To prevent processes with (partly) seeky workloads from

2716

++	 * being too ill-treated, grant them a small fraction of the

2717

++	 * assigned budget before reducing the waiting time to

2718

++	 * BFQ_MIN_TT. This happened to help reduce latency.

2719

++	 */

2720

++	sl = bfqd->bfq_slice_idle;

2721

++	/*

2722

++	 * Unless the queue is being weight-raised or the scenario is

2723

++	 * asymmetric, grant only minimum idle time if the queue either

2724

++	 * has been seeky for long enough or has already proved to be

2725

++	 * constantly seeky.

2726

++	 */

2727

++	if (bfq_sample_valid(bfqq->seek_samples) &&

2728

++	    ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >

2729

++				  bfq_max_budget(bfqq->bfqd) / 8) ||

2730

++	      bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&

2731

++	    bfq_symmetric_scenario(bfqd))

2732

++		sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));

2733

++	else if (bfqq->wr_coeff > 1)

2734

++		sl = sl * 3;

2735

++	bfqd->last_idling_start = ktime_get();

2736

++	mod_timer(&bfqd->idle_slice_timer, jiffies + sl);

2737

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2738

++	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));

2739

++#endif

2740

++	bfq_log(bfqd, "arm idle: %u/%u ms",

2741

++		jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));

2742

++}

2743

++

2744

++/*

2745

++ * Set the maximum time for the in-service queue to consume its

2746

++ * budget. This prevents seeky processes from lowering the disk

2747

++ * throughput (always guaranteed with a time slice scheme as in CFQ).

2748

++ */

2749

++static void bfq_set_budget_timeout(struct bfq_data *bfqd)

2750

++{

2751

++	struct bfq_queue *bfqq = bfqd->in_service_queue;

2752

++	unsigned int timeout_coeff;

2753

++	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)

2754

++		timeout_coeff = 1;

2755

++	else

2756

++		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;

2757

++

2758

++	bfqd->last_budget_start = ktime_get();

2759

++

2760

++	bfq_clear_bfqq_budget_new(bfqq);

2761

++	bfqq->budget_timeout = jiffies +

2762

++		bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;

2763

++

2764

++	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",

2765

++		jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *

2766

++		timeout_coeff));

2767

++}

2768

++

2769

++/*

2770

++ * Move request from internal lists to the request queue dispatch list.

2771

++ */

2772

++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)

2773

++{

2774

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2775

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2776

++

2777

++	/*

2778

++	 * For consistency, the next instruction should have been executed

2779

++	 * after removing the request from the queue and dispatching it.

2780

++	 * We instead execute this instruction before bfq_remove_request()

2781

++	 * (and hence introduce a temporary inconsistency), for efficiency.

2782

++	 * In fact, in a forced_dispatch, this prevents two counters related

2783

++	 * to bfqq->dispatched from being uselessly decremented if bfqq

2784

++	 * is not in service, and then to be incremented again after

2785

++	 * incrementing bfqq->dispatched.

2786

++	 */

2787

++	bfqq->dispatched++;

2788

++	bfq_remove_request(rq);

2789

++	elv_dispatch_sort(q, rq);

2790

++

2791

++	if (bfq_bfqq_sync(bfqq))

2792

++		bfqd->sync_flight++;

2793

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

2794

++	bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq),

2795

++				   rq->cmd_flags);

2796

++#endif

2797

++}

2798

++

2799

++/*

2800

++ * Return expired entry, or NULL to just start from scratch in rbtree.

2801

++ */

2802

++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)

2803

++{

2804

++	struct request *rq = NULL;

2805

++

2806

++	if (bfq_bfqq_fifo_expire(bfqq))

2807

++		return NULL;

2808

++

2809

++	bfq_mark_bfqq_fifo_expire(bfqq);

2810

++

2811

++	if (list_empty(&bfqq->fifo))

2812

++		return NULL;

2813

++

2814

++	rq = rq_entry_fifo(bfqq->fifo.next);

2815

++

2816

++	if (time_before(jiffies, rq->fifo_time))

2817

++		return NULL;

2818

++

2819

++	return rq;

2820

++}

2821

++

2822

++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)

2823

++{

2824

++	struct bfq_entity *entity = &bfqq->entity;

2825

++	return entity->budget - entity->service;

2826

++}

2827

++

2828

++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)

2829

++{

2830

++	BUG_ON(bfqq != bfqd->in_service_queue);

2831

++

2832

++	__bfq_bfqd_reset_in_service(bfqd);

2833

++

2834

++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

2835

++		/*

2836

++		 * Overloading budget_timeout field to store the time

2837

++		 * at which the queue remains with no backlog; used by

2838

++		 * the weight-raising mechanism.

2839

++		 */

2840

++		bfqq->budget_timeout = jiffies;

2841

++		bfq_del_bfqq_busy(bfqd, bfqq, 1);

2842

++	} else

2843

++		bfq_activate_bfqq(bfqd, bfqq);

2844

++}

2845

++

2846

++/**

2847

++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.

2848

++ * @bfqd: device data.

2849

++ * @bfqq: queue to update.

2850

++ * @reason: reason for expiration.

2851

++ *

2852

++ * Handle the feedback on @bfqq budget at queue expiration.

2853

++ * See the body for detailed comments.

2854

++ */

2855

++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,

2856

++				     struct bfq_queue *bfqq,

2857

++				     enum bfqq_expiration reason)

2858

++{

2859

++	struct request *next_rq;

2860

++	int budget, min_budget;

2861

++

2862

++	budget = bfqq->max_budget;

2863

++	min_budget = bfq_min_budget(bfqd);

2864

++

2865

++	BUG_ON(bfqq != bfqd->in_service_queue);

2866

++

2867

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",

2868

++		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));

2869

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",

2870

++		budget, bfq_min_budget(bfqd));

2871

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",

2872

++		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

2873

++

2874

++	if (bfq_bfqq_sync(bfqq)) {

2875

++		switch (reason) {

2876

++		/*

2877

++		 * Caveat: in all the following cases we trade latency

2878

++		 * for throughput.

2879

++		 */

2880

++		case BFQ_BFQQ_TOO_IDLE:

2881

++			/*

2882

++			 * This is the only case where we may reduce

2883

++			 * the budget: if there is no request of the

2884

++			 * process still waiting for completion, then

2885

++			 * we assume (tentatively) that the timer has

2886

++			 * expired because the batch of requests of

2887

++			 * the process could have been served with a

2888

++			 * smaller budget.  Hence, betting that

2889

++			 * process will behave in the same way when it

2890

++			 * becomes backlogged again, we reduce its

2891

++			 * next budget.  As long as we guess right,

2892

++			 * this budget cut reduces the latency

2893

++			 * experienced by the process.

2894

++			 *

2895

++			 * However, if there are still outstanding

2896

++			 * requests, then the process may have not yet

2897

++			 * issued its next request just because it is

2898

++			 * still waiting for the completion of some of

2899

++			 * the still outstanding ones.  So in this

2900

++			 * subcase we do not reduce its budget, on the

2901

++			 * contrary we increase it to possibly boost

2902

++			 * the throughput, as discussed in the

2903

++			 * comments to the BUDGET_TIMEOUT case.

2904

++			 */

2905

++			if (bfqq->dispatched > 0) /* still outstanding reqs */

2906

++				budget = min(budget * 2, bfqd->bfq_max_budget);

2907

++			else {

2908

++				if (budget > 5 * min_budget)

2909

++					budget -= 4 * min_budget;

2910

++				else

2911

++					budget = min_budget;

2912

++			}

2913

++			break;

2914

++		case BFQ_BFQQ_BUDGET_TIMEOUT:

2915

++			/*

2916

++			 * We double the budget here because: 1) it

2917

++			 * gives the chance to boost the throughput if

2918

++			 * this is not a seeky process (which may have

2919

++			 * bumped into this timeout because of, e.g.,

2920

++			 * ZBR), 2) together with charge_full_budget

2921

++			 * it helps give seeky processes higher

2922

++			 * timestamps, and hence be served less

2923

++			 * frequently.

2924

++			 */

2925

++			budget = min(budget * 2, bfqd->bfq_max_budget);

2926

++			break;

2927

++		case BFQ_BFQQ_BUDGET_EXHAUSTED:

2928

++			/*

2929

++			 * The process still has backlog, and did not

2930

++			 * let either the budget timeout or the disk

2931

++			 * idling timeout expire. Hence it is not

2932

++			 * seeky, has a short thinktime and may be

2933

++			 * happy with a higher budget too. So

2934

++			 * definitely increase the budget of this good

2935

++			 * candidate to boost the disk throughput.

2936

++			 */

2937

++			budget = min(budget * 4, bfqd->bfq_max_budget);

2938

++			break;

2939

++		case BFQ_BFQQ_NO_MORE_REQUESTS:

2940

++		       /*

2941

++			* Leave the budget unchanged.

2942

++			*/

2943

++		default:

2944

++			return;

2945

++		}

2946

++	} else

2947

++		/*

2948

++		 * Async queues always get the maximum possible budget

2949

++		 * (their ability to dispatch is limited by

2950

++		 * @bfqd->bfq_max_budget_async_rq).

2951

++		 */

2952

++		budget = bfqd->bfq_max_budget;

2953

++

2954

++	bfqq->max_budget = budget;

2955

++

2956

++	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&

2957

++	    !bfqd->bfq_user_max_budget)

2958

++		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);

2959

++

2960

++	/*

2961

++	 * Make sure that we have enough budget for the next request.

2962

++	 * Since the finish time of the bfqq must be kept in sync with

2963

++	 * the budget, be sure to call __bfq_bfqq_expire() after the

2964

++	 * update.

2965

++	 */

2966

++	next_rq = bfqq->next_rq;

2967

++	if (next_rq)

2968

++		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,

2969

++					    bfq_serv_to_charge(next_rq, bfqq));

2970

++	else

2971

++		bfqq->entity.budget = bfqq->max_budget;

2972

++

2973

++	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",

2974

++			next_rq ? blk_rq_sectors(next_rq) : 0,

2975

++			bfqq->entity.budget);

2976

++}

2977

++

2978

++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)

2979

++{

2980

++	unsigned long max_budget;

2981

++

2982

++	/*

2983

++	 * The max_budget calculated when autotuning is equal to the

2984

++	 * amount of sectors transferred in timeout_sync at the

2985

++	 * estimated peak rate.

2986

++	 */

2987

++	max_budget = (unsigned long)(peak_rate * 1000 *

2988

++				     timeout >> BFQ_RATE_SHIFT);

2989

++

2990

++	return max_budget;

2991

++}

2992

++

2993

++/*

2994

++ * In addition to updating the peak rate, checks whether the process

2995

++ * is "slow", and returns true if so. This slow flag is used, in addition

2996

++ * to the budget timeout, to reduce the amount of service provided to

2997

++ * seeky processes, and hence reduce their chances to lower the

2998

++ * throughput. See the code for more details.

2999

++ */

3000

++static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,

3001

++				 bool compensate, enum bfqq_expiration reason)

3002

++{

3003

++	u64 bw, usecs, expected, timeout;

3004

++	ktime_t delta;

3005

++	int update = 0;

3006

++

3007

++	if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))

3008

++		return false;

3009

++

3010

++	if (compensate)

3011

++		delta = bfqd->last_idling_start;

3012

++	else

3013

++		delta = ktime_get();

3014

++	delta = ktime_sub(delta, bfqd->last_budget_start);

3015

++	usecs = ktime_to_us(delta);

3016

++

3017

++	/* Don't trust short/unrealistic values. */

3018

++	if (usecs < 100 || usecs >= LONG_MAX)

3019

++		return false;

3020

++

3021

++	/*

3022

++	 * Calculate the bandwidth for the last slice.  We use a 64 bit

3023

++	 * value to store the peak rate, in sectors per usec in fixed

3024

++	 * point math.  We do so to have enough precision in the estimate

3025

++	 * and to avoid overflows.

3026

++	 */

3027

++	bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;

3028

++	do_div(bw, (unsigned long)usecs);

3029

++

3030

++	timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

3031

++

3032

++	/*

3033

++	 * Use only long (> 20ms) intervals to filter out spikes for

3034

++	 * the peak rate estimation.

3035

++	 */

3036

++	if (usecs > 20000) {

3037

++		if (bw > bfqd->peak_rate ||

3038

++		   (!BFQQ_SEEKY(bfqq) &&

3039

++		    reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {

3040

++			bfq_log(bfqd, "measured bw =%llu", bw);

3041

++			/*

3042

++			 * To smooth oscillations use a low-pass filter with

3043

++			 * alpha=7/8, i.e.,

3044

++			 * new_rate = (7/8) * old_rate + (1/8) * bw

3045

++			 */

3046

++			do_div(bw, 8);

3047

++			if (bw == 0)

3048

++				return 0;

3049

++			bfqd->peak_rate *= 7;

3050

++			do_div(bfqd->peak_rate, 8);

3051

++			bfqd->peak_rate += bw;

3052

++			update = 1;

3053

++			bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);

3054

++		}

3055

++

3056

++		update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;

3057

++

3058

++		if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)

3059

++			bfqd->peak_rate_samples++;

3060

++

3061

++		if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&

3062

++		    update) {

3063

++			int dev_type = blk_queue_nonrot(bfqd->queue);

3064

++			if (bfqd->bfq_user_max_budget == 0) {

3065

++				bfqd->bfq_max_budget =

3066

++					bfq_calc_max_budget(bfqd->peak_rate,

3067

++							    timeout);

3068

++				bfq_log(bfqd, "new max_budget=%d",

3069

++					bfqd->bfq_max_budget);

3070

++			}

3071

++			if (bfqd->device_speed == BFQ_BFQD_FAST &&

3072

++			    bfqd->peak_rate < device_speed_thresh[dev_type]) {

3073

++				bfqd->device_speed = BFQ_BFQD_SLOW;

3074

++				bfqd->RT_prod = R_slow[dev_type] *

3075

++						T_slow[dev_type];

3076

++			} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&

3077

++			    bfqd->peak_rate > device_speed_thresh[dev_type]) {

3078

++				bfqd->device_speed = BFQ_BFQD_FAST;

3079

++				bfqd->RT_prod = R_fast[dev_type] *

3080

++						T_fast[dev_type];

3081

++			}

3082

++		}

3083

++	}

3084

++

3085

++	/*

3086

++	 * If the process has been served for too short a time

3087

++	 * interval to let its possible sequential accesses prevail on

3088

++	 * the initial seek time needed to move the disk head on the

3089

++	 * first sector it requested, then give the process a chance

3090

++	 * and for the moment return false.

3091

++	 */

3092

++	if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)

3093

++		return false;

3094

++

3095

++	/*

3096

++	 * A process is considered ``slow'' (i.e., seeky, so that we

3097

++	 * cannot treat it fairly in the service domain, as it would

3098

++	 * slow down the other processes too much) if, when a slice

3099

++	 * ends for whatever reason, it has received service at a

3100

++	 * rate that would not be high enough to complete the budget

3101

++	 * before the budget timeout expiration.

3102

++	 */

3103

++	expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;

3104

++

3105

++	/*

3106

++	 * Caveat: processes doing IO in the slower disk zones will

3107

++	 * tend to be slow(er) even if not seeky. And the estimated

3108

++	 * peak rate will actually be an average over the disk

3109

++	 * surface. Hence, to not be too harsh with unlucky processes,

3110

++	 * we keep a budget/3 margin of safety before declaring a

3111

++	 * process slow.

3112

++	 */

3113

++	return expected > (4 * bfqq->entity.budget) / 3;

3114

++}

3115

++

3116

++/*

3117

++ * To be deemed as soft real-time, an application must meet two

3118

++ * requirements. First, the application must not require an average

3119

++ * bandwidth higher than the approximate bandwidth required to playback or

3120

++ * record a compressed high-definition video.

3121

++ * The next function is invoked on the completion of the last request of a

3122

++ * batch, to compute the next-start time instant, soft_rt_next_start, such

3123

++ * that, if the next request of the application does not arrive before

3124

++ * soft_rt_next_start, then the above requirement on the bandwidth is met.

3125

++ *

3126

++ * The second requirement is that the request pattern of the application is

3127

++ * isochronous, i.e., that, after issuing a request or a batch of requests,

3128

++ * the application stops issuing new requests until all its pending requests

3129

++ * have been completed. After that, the application may issue a new batch,

3130

++ * and so on.

3131

++ * For this reason the next function is invoked to compute

3132

++ * soft_rt_next_start only for applications that meet this requirement,

3133

++ * whereas soft_rt_next_start is set to infinity for applications that do

3134

++ * not.

3135

++ *

3136

++ * Unfortunately, even a greedy application may happen to behave in an

3137

++ * isochronous way if the CPU load is high. In fact, the application may

3138

++ * stop issuing requests while the CPUs are busy serving other processes,

3139

++ * then restart, then stop again for a while, and so on. In addition, if

3140

++ * the disk achieves a low enough throughput with the request pattern

3141

++ * issued by the application (e.g., because the request pattern is random

3142

++ * and/or the device is slow), then the application may meet the above

3143

++ * bandwidth requirement too. To prevent such a greedy application from being

3144

++ * deemed as soft real-time, a further rule is used in the computation of

3145

++ * soft_rt_next_start: soft_rt_next_start must be higher than the current

3146

++ * time plus the maximum time for which the arrival of a request is waited

3147

++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.

3148

++ * This filters out greedy applications, as the latter instead issue their

3149

++ * next request as soon as possible after the last one has been completed

3150

++ * (in contrast, when a batch of requests is completed, a soft real-time

3151

++ * application spends some time processing data).

3152

++ *

3153

++ * Unfortunately, the last filter may easily generate false positives if

3154

++ * only bfqd->bfq_slice_idle is used as a reference time interval and one

3155

++ * or both the following cases occur:

3156

++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher

3157

++ *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with

3158

++ *    HZ=100.

3159

++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing

3160

++ *    for a while, then suddenly 'jump' by several units to recover the lost

3161

++ *    increments. This seems to happen, e.g., inside virtual machines.

3162

++ * To address this issue, we do not use as a reference time interval just

3163

++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In

3164

++ * particular we add the minimum number of jiffies for which the filter

3165

++ * seems to be quite precise also in embedded systems and KVM/QEMU virtual

3166

++ * machines.

3167

++ */

3168

++static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,

3169

++						struct bfq_queue *bfqq)

3170

++{

3171

++	return max(bfqq->last_idle_bklogged +

3172

++		   HZ * bfqq->service_from_backlogged /

3173

++		   bfqd->bfq_wr_max_softrt_rate,

3174

++		   jiffies + bfqq->bfqd->bfq_slice_idle + 4);

3175

++}

3176

++

3177

++/*

3178

++ * Return the largest-possible time instant such that, for as long as possible,

3179

++ * the current time will be lower than this time instant according to the macro

3180

++ * time_is_before_jiffies().

3181

++ */

3182

++static unsigned long bfq_infinity_from_now(unsigned long now)

3183

++{

3184

++	return now + ULONG_MAX / 2;

3185

++}

3186

++

3187

++/**

3188

++ * bfq_bfqq_expire - expire a queue.

3189

++ * @bfqd: device owning the queue.

3190

++ * @bfqq: the queue to expire.

3191

++ * @compensate: if true, compensate for the time spent idling.

3192

++ * @reason: the reason causing the expiration.

3193

++ *

3194

++ *

3195

++ * If the process associated to the queue is slow (i.e., seeky), or in

3196

++ * case of budget timeout, or, finally, if it is async, we

3197

++ * artificially charge it an entire budget (independently of the

3198

++ * actual service it received). As a consequence, the queue will get

3199

++ * higher timestamps than the correct ones upon reactivation, and

3200

++ * hence it will be rescheduled as if it had received more service

3201

++ * than what it actually received. In the end, this class of processes

3202

++ * will receive less service in proportion to how slowly they consume

3203

++ * their budgets (and hence how seriously they tend to lower the

3204

++ * throughput).

3205

++ *

3206

++ * In contrast, when a queue expires because it has been idling for

3207

++ * too much or because it exhausted its budget, we do not touch the

3208

++ * amount of service it has received. Hence when the queue will be

3209

++ * reactivated and its timestamps updated, the latter will be in sync

3210

++ * with the actual service received by the queue until expiration.

3211

++ *

3212

++ * Charging a full budget to the first type of queues and the exact

3213

++ * service to the others has the effect of using the WF2Q+ policy to

3214

++ * schedule the former on a timeslice basis, without violating the

3215

++ * service domain guarantees of the latter.

3216

++ */

3217

++static void bfq_bfqq_expire(struct bfq_data *bfqd,

3218

++			    struct bfq_queue *bfqq,

3219

++			    bool compensate,

3220

++			    enum bfqq_expiration reason)

3221

++{

3222

++	bool slow;

3223

++	BUG_ON(bfqq != bfqd->in_service_queue);

3224

++

3225

++	/*

3226

++	 * Update disk peak rate for autotuning and check whether the

3227

++	 * process is slow (see bfq_update_peak_rate).

3228

++	 */

3229

++	slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);

3230

++

3231

++	/*

3232

++	 * As explained above, 'punish' slow (i.e., seeky), timed-out

3233

++	 * and async queues, to favor sequential sync workloads.

3234

++	 *

3235

++	 * Processes doing I/O in the slower disk zones will tend to be

3236

++	 * slow(er) even if not seeky. Hence, since the estimated peak

3237

++	 * rate is actually an average over the disk surface, these

3238

++	 * processes may timeout just for bad luck. To avoid punishing

3239

++	 * them we do not charge a full budget to a process that

3240

++	 * succeeded in consuming at least 2/3 of its budget.

3241

++	 */

3242

++	if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&

3243

++		     bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3))

3244

++		bfq_bfqq_charge_full_budget(bfqq);

3245

++

3246

++	bfqq->service_from_backlogged += bfqq->entity.service;

3247

++

3248

++	if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&

3249

++	    !bfq_bfqq_constantly_seeky(bfqq)) {

3250

++		bfq_mark_bfqq_constantly_seeky(bfqq);

3251

++		if (!blk_queue_nonrot(bfqd->queue))

3252

++			bfqd->const_seeky_busy_in_flight_queues++;

3253

++	}

3254

++

3255

++	if (reason == BFQ_BFQQ_TOO_IDLE &&

3256

++	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10 )

3257

++		bfq_clear_bfqq_IO_bound(bfqq);

3258

++

3259

++	if (bfqd->low_latency && bfqq->wr_coeff == 1)

3260

++		bfqq->last_wr_start_finish = jiffies;

3261

++

3262

++	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&

3263

++	    RB_EMPTY_ROOT(&bfqq->sort_list)) {

3264

++		/*

3265

++		 * If we get here, and there are no outstanding requests,

3266

++		 * then the request pattern is isochronous (see the comments

3267

++		 * to the function bfq_bfqq_softrt_next_start()). Hence we

3268

++		 * can compute soft_rt_next_start. If, instead, the queue

3269

++		 * still has outstanding requests, then we have to wait

3270

++		 * for the completion of all the outstanding requests to

3271

++		 * discover whether the request pattern is actually

3272

++		 * isochronous.

3273

++		 */

3274

++		if (bfqq->dispatched == 0)

3275

++			bfqq->soft_rt_next_start =

3276

++				bfq_bfqq_softrt_next_start(bfqd, bfqq);

3277

++		else {

3278

++			/*

3279

++			 * The application is still waiting for the

3280

++			 * completion of one or more requests:

3281

++			 * prevent it from possibly being incorrectly

3282

++			 * deemed as soft real-time by setting its

3283

++			 * soft_rt_next_start to infinity. In fact,

3284

++			 * without this assignment, the application

3285

++			 * would be incorrectly deemed as soft

3286

++			 * real-time if:

3287

++			 * 1) it issued a new request before the

3288

++			 *    completion of all its in-flight

3289

++			 *    requests, and

3290

++			 * 2) at that time, its soft_rt_next_start

3291

++			 *    happened to be in the past.

3292

++			 */

3293

++			bfqq->soft_rt_next_start =

3294

++				bfq_infinity_from_now(jiffies);

3295

++			/*

3296

++			 * Schedule an update of soft_rt_next_start to when

3297

++			 * the task may be discovered to be isochronous.

3298

++			 */

3299

++			bfq_mark_bfqq_softrt_update(bfqq);

3300

++		}

3301

++	}

3302

++

3303

++	bfq_log_bfqq(bfqd, bfqq,

3304

++		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,

3305

++		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

3306

++

3307

++	/*

3308

++	 * Increase, decrease or leave budget unchanged according to

3309

++	 * reason.

3310

++	 */

3311

++	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);

3312

++	__bfq_bfqq_expire(bfqd, bfqq);

3313

++}

3314

++

3315

++/*

3316

++ * Budget timeout is not implemented through a dedicated timer, but

3317

++ * just checked on request arrivals and completions, as well as on

3318

++ * idle timer expirations.

3319

++ */

3320

++static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)

3321

++{

3322

++	if (bfq_bfqq_budget_new(bfqq) ||

3323

++	    time_before(jiffies, bfqq->budget_timeout))

3324

++		return false;

3325

++	return true;

3326

++}

3327

++

3328

++/*

3329

++ * If we expire a queue that is waiting for the arrival of a new

3330

++ * request, we may prevent the fictitious timestamp back-shifting that

3331

++ * allows the guarantees of the queue to be preserved (see [1] for

3332

++ * this tricky aspect). Hence we return true only if this condition

3333

++ * does not hold, or if the queue is slow enough to deserve only to be

3334

++ * kicked off for preserving a high throughput.

3335

++*/

3336

++static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)

3337

++{

3338

++	bfq_log_bfqq(bfqq->bfqd, bfqq,

3339

++		"may_budget_timeout: wait_request %d left %d timeout %d",

3340

++		bfq_bfqq_wait_request(bfqq),

3341

++			bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3,

3342

++		bfq_bfqq_budget_timeout(bfqq));

3343

++

3344

++	return (!bfq_bfqq_wait_request(bfqq) ||

3345

++		bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3)

3346

++		&&

3347

++		bfq_bfqq_budget_timeout(bfqq);

3348

++}

3349

++

3350

++/*

3351

++ * For a queue that becomes empty, device idling is allowed only if

3352

++ * this function returns true for that queue. As a consequence, since

3353

++ * device idling plays a critical role for both throughput boosting

3354

++ * and service guarantees, the return value of this function plays a

3355

++ * critical role as well.

3356

++ *

3357

++ * In a nutshell, this function returns true only if idling is

3358

++ * beneficial for throughput or, even if detrimental for throughput,

3359

++ * idling is however necessary to preserve service guarantees (low

3360

++ * latency, desired throughput distribution, ...). In particular, on

3361

++ * NCQ-capable devices, this function tries to return false, so as to

3362

++ * help keep the drives' internal queues full, whenever this helps the

3363

++ * device boost the throughput without causing any service-guarantee

3364

++ * issue.

3365

++ *

3366

++ * In more detail, the return value of this function is obtained by,

3367

++ * first, computing a number of boolean variables that take into

3368

++ * account throughput and service-guarantee issues, and, then,

3369

++ * combining these variables in a logical expression. Most of the

3370

++ * issues taken into account are not trivial. We discuss these issues

3371

++ * while introducing the variables.

3372

++ */

3373

++static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)

3374

++{

3375

++	struct bfq_data *bfqd = bfqq->bfqd;

3376

++	bool idling_boosts_thr, idling_boosts_thr_without_issues,

3377

++		all_queues_seeky, on_hdd_and_not_all_queues_seeky,

3378

++		idling_needed_for_service_guarantees,

3379

++		asymmetric_scenario;

3380

++

3381

++	/*

3382

++	 * The next variable takes into account the cases where idling

3383

++	 * boosts the throughput.

3384

++	 *

3385

++	 * The value of the variable is computed considering, first, that

3386

++	 * idling is virtually always beneficial for the throughput if:

3387

++	 * (a) the device is not NCQ-capable, or

3388

++	 * (b) regardless of the presence of NCQ, the device is rotational

3389

++	 *     and the request pattern for bfqq is I/O-bound and sequential.

3390

++	 *

3391

++	 * Secondly, and in contrast to the above item (b), idling an

3392

++	 * NCQ-capable flash-based device would not boost the

3393

++	 * throughput even with sequential I/O; rather it would lower

3394

++	 * the throughput in proportion to how fast the device

3395

++	 * is. Accordingly, the next variable is true if any of the

3396

++	 * above conditions (a) and (b) is true, and, in particular,

3397

++	 * happens to be false if bfqd is an NCQ-capable flash-based

3398

++	 * device.

3399

++	 */

3400

++	idling_boosts_thr = !bfqd->hw_tag ||

3401

++		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&

3402

++		 bfq_bfqq_idle_window(bfqq)) ;

3403

++

3404

++	/*

3405

++	 * The value of the next variable,

3406

++	 * idling_boosts_thr_without_issues, is equal to that of

3407

++	 * idling_boosts_thr, unless a special case holds. In this

3408

++	 * special case, described below, idling may cause problems to

3409

++	 * weight-raised queues.

3410

++	 *

3411

++	 * When the request pool is saturated (e.g., in the presence

3412

++	 * of write hogs), if the processes associated with

3413

++	 * non-weight-raised queues ask for requests at a lower rate,

3414

++	 * then processes associated with weight-raised queues have a

3415

++	 * higher probability to get a request from the pool

3416

++	 * immediately (or at least soon) when they need one. Thus

3417

++	 * they have a higher probability to actually get a fraction

3418

++	 * of the device throughput proportional to their high

3419

++	 * weight. This is especially true with NCQ-capable drives,

3420

++	 * which enqueue several requests in advance, and further

3421

++	 * reorder internally-queued requests.

3422

++	 *

3423

++	 * For this reason, we force to false the value of

3424

++	 * idling_boosts_thr_without_issues if there are weight-raised

3425

++	 * busy queues. In this case, and if bfqq is not weight-raised,

3426

++	 * this guarantees that the device is not idled for bfqq (if,

3427

++	 * instead, bfqq is weight-raised, then idling will be

3428

++	 * guaranteed by another variable, see below). Combined with

3429

++	 * the timestamping rules of BFQ (see [1] for details), this

3430

++	 * behavior causes bfqq, and hence any sync non-weight-raised

3431

++	 * queue, to get a lower number of requests served, and thus

3432

++	 * to ask for a lower number of requests from the request

3433

++	 * pool, before the busy weight-raised queues get served

3434

++	 * again. This often mitigates starvation problems in the

3435

++	 * presence of heavy write workloads and NCQ, thereby

3436

++	 * guaranteeing a higher application and system responsiveness

3437

++	 * in these hostile scenarios.

3438

++	 */

3439

++	idling_boosts_thr_without_issues = idling_boosts_thr &&

3440

++		bfqd->wr_busy_queues == 0;

3441

++

3442

++	/*

3443

++	 * There are then two cases where idling must be performed not

3444

++	 * for throughput concerns, but to preserve service

3445

++	 * guarantees. In the description of these cases, we say, for

3446

++	 * short, that a queue is sequential/random if the process

3447

++	 * associated to the queue issues sequential/random requests

3448

++	 * (in the second case the queue may be tagged as seeky or

3449

++	 * even constantly_seeky).

3450

++	 *

3451

++	 * To introduce the first case, we note that, since

3452

++	 * bfq_bfqq_idle_window(bfqq) is false if the device is

3453

++	 * NCQ-capable and bfqq is random (see

3454

++	 * bfq_update_idle_window()), then, from the above two

3455

++	 * assignments it follows that

3456

++	 * idling_boosts_thr_without_issues is false if the device is

3457

++	 * NCQ-capable and bfqq is random. Therefore, for this case,

3458

++	 * device idling would never be allowed if we used just

3459

++	 * idling_boosts_thr_without_issues to decide whether to allow

3460

++	 * it. And, beneficially, this would imply that throughput

3461

++	 * would always be boosted also with random I/O on NCQ-capable

3462

++	 * HDDs.

3463

++	 *

3464

++	 * But we must be careful on this point, to avoid an unfair

3465

++	 * treatment for bfqq. In fact, because of the same above

3466

++	 * assignments, idling_boosts_thr_without_issues is, on the

3467

++	 * other hand, true if 1) the device is an HDD and bfqq is

3468

++	 * sequential, and 2) there are no busy weight-raised

3469

++	 * queues. As a consequence, if we used just

3470

++	 * idling_boosts_thr_without_issues to decide whether to idle

3471

++	 * the device, then with an HDD we might easily bump into a

3472

++	 * scenario where queues that are sequential and I/O-bound

3473

++	 * would enjoy idling, whereas random queues would not. The

3474

++	 * latter might then get a low share of the device throughput,

3475

++	 * simply because the former would get many requests served

3476

++	 * after being set as in service, while the latter would not.

3477

++	 *

3478

++	 * To address this issue, we start by setting to true a

3479

++	 * sentinel variable, on_hdd_and_not_all_queues_seeky, if the

3480

++	 * device is rotational and not all queues with pending or

3481

++	 * in-flight requests are constantly seeky (i.e., there are

3482

++	 * active sequential queues, and bfqq might then be mistreated

3483

++	 * if it does not enjoy idling because it is random).

3484

++	 */

3485

++	all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) &&

3486

++			   bfqd->busy_in_flight_queues ==

3487

++			   bfqd->const_seeky_busy_in_flight_queues;

3488

++

3489

++	on_hdd_and_not_all_queues_seeky =

3490

++		!blk_queue_nonrot(bfqd->queue) && !all_queues_seeky;

3491

++

3492

++	/*

3493

++	 * To introduce the second case where idling needs to be

3494

++	 * performed to preserve service guarantees, we can note that

3495

++	 * allowing the drive to enqueue more than one request at a

3496

++	 * time, and hence delegating de facto final scheduling

3497

++	 * decisions to the drive's internal scheduler, causes loss of

3498

++	 * control on the actual request service order. In particular,

3499

++	 * the critical situation is when requests from different

3500

++	 * processes happen to be present, at the same time, in the

3501

++	 * internal queue(s) of the drive. In such a situation, the

3502

++	 * drive, by deciding the service order of the

3503

++	 * internally-queued requests, does determine also the actual

3504

++	 * throughput distribution among these processes. But the

3505

++	 * drive typically has no notion or concern about per-process

3506

++	 * throughput distribution, and makes its decisions only on a

3507

++	 * per-request basis. Therefore, the service distribution

3508

++	 * enforced by the drive's internal scheduler is likely to

3509

++	 * coincide with the desired device-throughput distribution

3510

++	 * only in a completely symmetric scenario where:

3511

++	 * (i)  each of these processes must get the same throughput as

3512

++	 *      the others;

3513

++	 * (ii) all these processes have the same I/O pattern

3514

++	 *      (either sequential or random).

3515

++	 * In fact, in such a scenario, the drive will tend to treat

3516

++	 * the requests of each of these processes in about the same

3517

++	 * way as the requests of the others, and thus to provide

3518

++	 * each of these processes with about the same throughput

3519

++	 * (which is exactly the desired throughput distribution). In

3520

++	 * contrast, in any asymmetric scenario, device idling is

3521

++	 * certainly needed to guarantee that bfqq receives its

3522

++	 * assigned fraction of the device throughput (see [1] for

3523

++	 * details).

3524

++	 *

3525

++	 * We address this issue by controlling, actually, only the

3526

++	 * symmetry sub-condition (i), i.e., provided that

3527

++	 * sub-condition (i) holds, idling is not performed,

3528

++	 * regardless of whether sub-condition (ii) holds. In other

3529

++	 * words, only if sub-condition (i) does not hold, then idling is

3530

++	 * allowed, and the device tends to be prevented from queueing

3531

++	 * many requests, possibly of several processes. The reason

3532

++	 * for not controlling also sub-condition (ii) is that, first,

3533

++	 * in the case of an HDD, the asymmetry in terms of types of

3534

++	 * I/O patterns is already taken into account in the above

3535

++	 * sentinel variable

3536

++	 * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a

3537

++	 * flash-based device, we prefer however to privilege

3538

++	 * throughput (and idling lowers throughput for this type of

3539

++	 * devices), for the following reasons:

3540

++	 * 1) differently from HDDs, the service time of random

3541

++	 *    requests is not orders of magnitude lower than the service

3542

++	 *    time of sequential requests; thus, even if processes doing

3543

++	 *    sequential I/O get a preferential treatment with respect to

3544

++	 *    others doing random I/O, the consequences are not as

3545

++	 *    dramatic as with HDDs;

3546

++	 * 2) if a process doing random I/O does need strong

3547

++	 *    throughput guarantees, it is hopefully already being

3548

++	 *    weight-raised, or the user is likely to have assigned it a

3549

++	 *    higher weight than the other processes (and thus

3550

++	 *    sub-condition (i) is likely to be false, which triggers

3551

++	 *    idling).

3552

++	 *

3553

++	 * According to the above considerations, the next variable is

3554

++	 * true (only) if sub-condition (i) does not hold. To compute the

3555

++	 * value of this variable, we not only use the return value of

3556

++	 * the function bfq_symmetric_scenario(), but also check

3557

++	 * whether bfqq is being weight-raised, because

3558

++	 * bfq_symmetric_scenario() does not take into account also

3559

++	 * weight-raised queues (see comments to

3560

++	 * bfq_weights_tree_add()).

3561

++	 *

3562

++	 * As a side note, it is worth considering that the above

3563

++	 * device-idling countermeasures may however fail in the

3564

++	 * following unlucky scenario: if idling is (correctly)

3565

++	 * disabled in a time period during which all symmetry

3566

++	 * sub-conditions hold, and hence the device is allowed to

3567

++	 * enqueue many requests, but at some later point in time some

3568

++	 * sub-condition ceases to hold, then it may become impossible

3569

++	 * to let requests be served in the desired order until all

3570

++	 * the requests already queued in the device have been served.

3571

++	 */

3572

++	asymmetric_scenario = bfqq->wr_coeff > 1 ||

3573

++		!bfq_symmetric_scenario(bfqd);

3574

++

3575

++	/*

3576

++	 * Finally, there is a case where maximizing throughput is the

3577

++	 * best choice even if it may cause unfairness toward

3578

++	 * bfqq. Such a case is when bfqq became active in a burst of

3579

++	 * queue activations. Queues that became active during a large

3580

++	 * burst benefit only from throughput, as discussed in the

3581

++	 * comments to bfq_handle_burst. Thus, if bfqq became active

3582

++	 * in a burst and not idling the device maximizes throughput,

3583

++	 * then the device must not be idled, because not idling the

3584

++	 * device provides bfqq and all other queues in the burst with

3585

++	 * maximum benefit. Combining this and the two cases above, we

3586

++	 * can now establish when idling is actually needed to

3587

++	 * preserve service guarantees.

3588

++	 */

3589

++	idling_needed_for_service_guarantees =

3590

++		(on_hdd_and_not_all_queues_seeky || asymmetric_scenario) &&

3591

++		!bfq_bfqq_in_large_burst(bfqq);

3592

++

3593

++	/*

3594

++	 * We have now all the components we need to compute the return

3595

++	 * value of the function, which is true only if both the following

3596

++	 * conditions hold:

3597

++	 * 1) bfqq is sync, because idling makes sense only for sync queues;

3598

++	 * 2) idling either boosts the throughput (without issues), or

3599

++	 *    is necessary to preserve service guarantees.

3600

++	 */

3601

++	return bfq_bfqq_sync(bfqq) &&

3602

++		(idling_boosts_thr_without_issues ||

3603

++		 idling_needed_for_service_guarantees);

3604

++}

3605

++

3606

++/*

3607

++ * If the in-service queue is empty but the function bfq_bfqq_may_idle

3608

++ * returns true, then:

3609

++ * 1) the queue must remain in service and cannot be expired, and

3610

++ * 2) the device must be idled to wait for the possible arrival of a new

3611

++ *    request for the queue.

3612

++ * See the comments to the function bfq_bfqq_may_idle for the reasons

3613

++ * why performing device idling is the best choice to boost the throughput

3614

++ * and preserve service guarantees when bfq_bfqq_may_idle itself

3615

++ * returns true.

3616

++ */

3617

++static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)

3618

++{

3619

++	struct bfq_data *bfqd = bfqq->bfqd;

3620

++

3621

++	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&

3622

++	       bfq_bfqq_may_idle(bfqq);

3623

++}

3624

++

3625

++/*

3626

++ * Select a queue for service.  If we have a current queue in service,

3627

++ * check whether to continue servicing it, or retrieve and set a new one.

3628

++ */

3629

++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

3630

++{

3631

++	struct bfq_queue *bfqq;

3632

++	struct request *next_rq;

3633

++	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;

3634

++

3635

++	bfqq = bfqd->in_service_queue;

3636

++	if (!bfqq)

3637

++		goto new_queue;

3638

++

3639

++	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

3640

++

3641

++	if (bfq_may_expire_for_budg_timeout(bfqq) &&

3642

++	    !timer_pending(&bfqd->idle_slice_timer) &&

3643

++	    !bfq_bfqq_must_idle(bfqq))

3644

++		goto expire;

3645

++

3646

++	next_rq = bfqq->next_rq;

3647

++	/*

3648

++	 * If bfqq has requests queued and it has enough budget left to

3649

++	 * serve them, keep the queue, otherwise expire it.

3650

++	 */

3651

++	if (next_rq) {

3652

++		if (bfq_serv_to_charge(next_rq, bfqq) >

3653

++			bfq_bfqq_budget_left(bfqq)) {

3654

++			reason = BFQ_BFQQ_BUDGET_EXHAUSTED;

3655

++			goto expire;

3656

++		} else {

3657

++			/*

3658

++			 * The idle timer may be pending because we may

3659

++			 * not disable disk idling even when a new request

3660

++			 * arrives.

3661

++			 */

3662

++			if (timer_pending(&bfqd->idle_slice_timer)) {

3663

++				/*

3664

++				 * If we get here: 1) at least a new request

3665

++				 * has arrived but we have not disabled the

3666

++				 * timer because the request was too small,

3667

++				 * 2) then the block layer has unplugged

3668

++				 * the device, causing the dispatch to be

3669

++				 * invoked.

3670

++				 *

3671

++				 * Since the device is unplugged, now the

3672

++				 * requests are probably large enough to

3673

++				 * provide a reasonable throughput.

3674

++				 * So we disable idling.

3675

++				 */

3676

++				bfq_clear_bfqq_wait_request(bfqq);

3677

++				del_timer(&bfqd->idle_slice_timer);

3678

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

3679

++				bfqg_stats_update_idle_time(bfqq_group(bfqq));

3680

++#endif

3681

++			}

3682

++			goto keep_queue;

3683

++		}

3684

++	}

3685

++

3686

++	/*

3687

++	 * No requests pending. However, if the in-service queue is idling

3688

++	 * for a new request, or has requests waiting for a completion and

3689

++	 * may idle after their completion, then keep it anyway.

3690

++	 */

3691

++	if (timer_pending(&bfqd->idle_slice_timer) ||

3692

++	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {

3693

++		bfqq = NULL;

3694

++		goto keep_queue;

3695

++	}

3696

++

3697

++	reason = BFQ_BFQQ_NO_MORE_REQUESTS;

3698

++expire:

3699

++	bfq_bfqq_expire(bfqd, bfqq, false, reason);

3700

++new_queue:

3701

++	bfqq = bfq_set_in_service_queue(bfqd);

3702

++	bfq_log(bfqd, "select_queue: new queue %d returned",

3703

++		bfqq ? bfqq->pid : 0);

3704

++keep_queue:

3705

++	return bfqq;

3706

++}

3707

++

3708

++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)

3709

++{

3710

++	struct bfq_entity *entity = &bfqq->entity;

3711

++	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */

3712

++		bfq_log_bfqq(bfqd, bfqq,

3713

++			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",

3714

++			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),

3715

++			jiffies_to_msecs(bfqq->wr_cur_max_time),

3716

++			bfqq->wr_coeff,

3717

++			bfqq->entity.weight, bfqq->entity.orig_weight);

3718

++

3719

++		BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=

3720

++		       entity->orig_weight * bfqq->wr_coeff);

3721

++		if (entity->prio_changed)

3722

++			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

3723

++

3724

++		/*

3725

++		 * If the queue was activated in a burst, or

3726

++		 * too much time has elapsed from the beginning

3727

++		 * of this weight-raising period, then end weight

3728

++		 * raising.

3729

++		 */

3730

++		if (bfq_bfqq_in_large_burst(bfqq) ||

3731

++		    time_is_before_jiffies(bfqq->last_wr_start_finish +

3732

++					   bfqq->wr_cur_max_time)) {

3733

++			bfqq->last_wr_start_finish = jiffies;

3734

++			bfq_log_bfqq(bfqd, bfqq,

3735

++				     "wrais ending at %lu, rais_max_time %u",

3736

++				     bfqq->last_wr_start_finish,

3737

++				     jiffies_to_msecs(bfqq->wr_cur_max_time));

3738

++			bfq_bfqq_end_wr(bfqq);

3739

++		}

3740

++	}

3741

++	/* Update weight both if it must be raised and if it must be lowered */

3742

++	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))

3743

++		__bfq_entity_update_weight_prio(

3744

++			bfq_entity_service_tree(entity),

3745

++			entity);

3746

++}

3747

++

3748

++/*

3749

++ * Dispatch one request from bfqq, moving it to the request queue

3750

++ * dispatch list.

3751

++ */

3752

++static int bfq_dispatch_request(struct bfq_data *bfqd,

3753

++				struct bfq_queue *bfqq)

3754

++{

3755

++	int dispatched = 0;

3756

++	struct request *rq;

3757

++	unsigned long service_to_charge;

3758

++

3759

++	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));

3760

++

3761

++	/* Follow expired path, else get first next available. */

3762

++	rq = bfq_check_fifo(bfqq);

3763

++	if (!rq)

3764

++		rq = bfqq->next_rq;

3765

++	service_to_charge = bfq_serv_to_charge(rq, bfqq);

3766

++

3767

++	if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {

3768

++		/*

3769

++		 * This may happen if the next rq is chosen in fifo order

3770

++		 * instead of sector order. The budget is properly

3771

++		 * dimensioned to be always sufficient to serve the next

3772

++		 * request only if it is chosen in sector order. The reason

3773

++		 * is that it would be quite inefficient and of little use

3774

++		 * to always make sure that the budget is large enough to

3775

++		 * serve even the possible next rq in fifo order.

3776

++		 * In fact, requests are seldom served in fifo order.

3777

++		 *

3778

++		 * Expire the queue for budget exhaustion, and make sure

3779

++		 * that the next act_budget is enough to serve the next

3780

++		 * request, even if it comes from the fifo expired path.

3781

++		 */

3782

++		bfqq->next_rq = rq;

3783

++		/*

3784

++		 * Since this dispatch has failed, make sure that

3785

++		 * a new one will be performed

3786

++		 */

3787

++		if (!bfqd->rq_in_driver)

3788

++			bfq_schedule_dispatch(bfqd);

3789

++		goto expire;

3790

++	}

3791

++

3792

++	/* Finally, insert request into driver dispatch list. */

3793

++	bfq_bfqq_served(bfqq, service_to_charge);

3794

++	bfq_dispatch_insert(bfqd->queue, rq);

3795

++

3796

++	bfq_update_wr_data(bfqd, bfqq);

3797

++

3798

++	bfq_log_bfqq(bfqd, bfqq,

3799

++			"dispatched %u sec req (%llu), budg left %d",

3800

++			blk_rq_sectors(rq),

3801

++			(long long unsigned)blk_rq_pos(rq),

3802

++			bfq_bfqq_budget_left(bfqq));

3803

++

3804

++	dispatched++;

3805

++

3806

++	if (!bfqd->in_service_bic) {

3807

++		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);

3808

++		bfqd->in_service_bic = RQ_BIC(rq);

3809

++	}

3810

++

3811

++	if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&

3812

++	    dispatched >= bfqd->bfq_max_budget_async_rq) ||

3813

++	    bfq_class_idle(bfqq)))

3814

++		goto expire;

3815

++

3816

++	return dispatched;

3817

++

3818

++expire:

3819

++	bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);

3820

++	return dispatched;

3821

++}

3822

++

3823

++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)

3824

++{

3825

++	int dispatched = 0;

3826

++

3827

++	while (bfqq->next_rq) {

3828

++		bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);

3829

++		dispatched++;

3830

++	}

3831

++

3832

++	BUG_ON(!list_empty(&bfqq->fifo));

3833

++	return dispatched;

3834

++}

3835

++

3836

++/*

3837

++ * Drain our current requests.

3838

++ * Used for barriers and when switching io schedulers on-the-fly.

3839

++ */

3840

++static int bfq_forced_dispatch(struct bfq_data *bfqd)

3841

++{

3842

++	struct bfq_queue *bfqq, *n;

3843

++	struct bfq_service_tree *st;

3844

++	int dispatched = 0;

3845

++

3846

++	bfqq = bfqd->in_service_queue;

3847

++	if (bfqq)

3848

++		__bfq_bfqq_expire(bfqd, bfqq);

3849

++

3850

++	/*

3851

++	 * Loop through classes, and be careful to leave the scheduler

3852

++	 * in a consistent state, as feedback mechanisms and vtime

3853

++	 * updates cannot be disabled during the process.

3854

++	 */

3855

++	list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {

3856

++		st = bfq_entity_service_tree(&bfqq->entity);

3857

++

3858

++		dispatched += __bfq_forced_dispatch_bfqq(bfqq);

3859

++		bfqq->max_budget = bfq_max_budget(bfqd);

3860

++

3861

++		bfq_forget_idle(st);

3862

++	}

3863

++

3864

++	BUG_ON(bfqd->busy_queues != 0);

3865

++

3866

++	return dispatched;

3867

++}

3868

++

3869

++static int bfq_dispatch_requests(struct request_queue *q, int force)

3870

++{

3871

++	struct bfq_data *bfqd = q->elevator->elevator_data;

3872

++	struct bfq_queue *bfqq;

3873

++	int max_dispatch;

3874

++

3875

++	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

3876

++	if (bfqd->busy_queues == 0)

3877

++		return 0;

3878

++

3879

++	if (unlikely(force))

3880

++		return bfq_forced_dispatch(bfqd);

3881

++

3882

++	bfqq = bfq_select_queue(bfqd);

3883

++	if (!bfqq)

3884

++		return 0;

3885

++

3886

++	if (bfq_class_idle(bfqq))

3887

++		max_dispatch = 1;

3888

++

3889

++	if (!bfq_bfqq_sync(bfqq))

3890

++		max_dispatch = bfqd->bfq_max_budget_async_rq;

3891

++

3892

++	if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) {

3893

++		if (bfqd->busy_queues > 1)

3894

++			return 0;

3895

++		if (bfqq->dispatched >= 4 * max_dispatch)

3896

++			return 0;

3897

++	}

3898

++

3899

++	if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))

3900

++		return 0;

3901

++

3902

++	bfq_clear_bfqq_wait_request(bfqq);

3903

++	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

3904

++

3905

++	if (!bfq_dispatch_request(bfqd, bfqq))

3906

++		return 0;

3907

++

3908

++	bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",

3909

++			bfq_bfqq_sync(bfqq) ? "sync" : "async");

3910

++

3911

++	return 1;

3912

++}

3913

++

3914

++/*

3915

++ * Task holds one reference to the queue, dropped when task exits.  Each rq

3916

++ * in-flight on this queue also holds a reference, dropped when rq is freed.

3917

++ *

3918

++ * Queue lock must be held here.

3919

++ */

3920

++static void bfq_put_queue(struct bfq_queue *bfqq)

3921

++{

3922

++	struct bfq_data *bfqd = bfqq->bfqd;

3923

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

3924

++	struct bfq_group *bfqg = bfqq_group(bfqq);

3925

++#endif

3926

++

3927

++	BUG_ON(atomic_read(&bfqq->ref) <= 0);

3928

++

3929

++	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,

3930

++		     atomic_read(&bfqq->ref));

3931

++	if (!atomic_dec_and_test(&bfqq->ref))

3932

++		return;

3933

++

3934

++	BUG_ON(rb_first(&bfqq->sort_list));

3935

++	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);

3936

++	BUG_ON(bfqq->entity.tree);

3937

++	BUG_ON(bfq_bfqq_busy(bfqq));

3938

++	BUG_ON(bfqd->in_service_queue == bfqq);

3939

++

3940

++	if (bfq_bfqq_sync(bfqq))

3941

++		/*

3942

++		 * The fact that this queue is being destroyed does not

3943

++		 * invalidate the fact that this queue may have been

3944

++		 * activated during the current burst. As a consequence,

3945

++		 * although the queue does not exist anymore, and hence

3946

++		 * needs to be removed from the burst list if there,

3947

++		 * the burst size has not to be decremented.

3948

++		 */

3949

++		hlist_del_init(&bfqq->burst_list_node);

3950

++

3951

++	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);

3952

++

3953

++	kmem_cache_free(bfq_pool, bfqq);

3954

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

3955

++	bfqg_put(bfqg);

3956

++#endif

3957

++}

3958

++

3959

++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

3960

++{

3961

++	if (bfqq == bfqd->in_service_queue) {

3962

++		__bfq_bfqq_expire(bfqd, bfqq);

3963

++		bfq_schedule_dispatch(bfqd);

3964

++	}

3965

++

3966

++	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,

3967

++		     atomic_read(&bfqq->ref));

3968

++

3969

++	bfq_put_queue(bfqq);

3970

++}

3971

++

3972

++static void bfq_init_icq(struct io_cq *icq)

3973

++{

3974

++	struct bfq_io_cq *bic = icq_to_bic(icq);

3975

++

3976

++	bic->ttime.last_end_request = jiffies;

3977

++}

3978

++

3979

++static void bfq_exit_icq(struct io_cq *icq)

3980

++{

3981

++	static const int order[] = { BLK_RW_ASYNC, BLK_RW_SYNC };

3982

++	struct bfq_io_cq *bic = icq_to_bic(icq);

3983

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

3984

++	int i;

3985

++

3986

++	/* Exit the bic's async queue first, then its sync one. */

3987

++	for (i = 0; i < 2; i++) {

3988

++		if (!bic->bfqq[order[i]])

3989

++			continue;

3990

++		bfq_exit_bfqq(bfqd, bic->bfqq[order[i]]);

3991

++		bic->bfqq[order[i]] = NULL;

3992

++	}

3993

++}

3994

++

3995

++/*

3996

++ * Update the entity prio values; note that the new values will not

3997

++ * be used until the next (re)activation.

3998

++ */

3999

++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)

4000

++{

4001

++	struct task_struct *tsk = current;

4002

++	int ioprio_class;

4003

++

4004

++	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);

4005

++	switch (ioprio_class) {

4006

++	default:

4007

++		dev_err(bfqq->bfqd->queue->backing_dev_info.dev,

4008

++			"bfq: bad prio class %d\n", ioprio_class);

4009

++	case IOPRIO_CLASS_NONE:

4010

++		/*

4011

++		 * No prio set, inherit CPU scheduling settings.

4012

++		 */

4013

++		bfqq->new_ioprio = task_nice_ioprio(tsk);

4014

++		bfqq->new_ioprio_class = task_nice_ioclass(tsk);

4015

++		break;

4016

++	case IOPRIO_CLASS_RT:

4017

++		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

4018

++		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;

4019

++		break;

4020

++	case IOPRIO_CLASS_BE:

4021

++		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

4022

++		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;

4023

++		break;

4024

++	case IOPRIO_CLASS_IDLE:

4025

++		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;

4026

++		bfqq->new_ioprio = 7;

4027

++		bfq_clear_bfqq_idle_window(bfqq);

4028

++		break;

4029

++	}

4030

++

4031

++	if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) {

4032

++		printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n",

4033

++				 bfqq->new_ioprio);

4034

++		BUG();

4035

++	}

4036

++

4037

++	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);

4038

++	bfqq->entity.prio_changed = 1;

4039

++}

4040

++

4041

++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)

4042

++{

4043

++	struct bfq_data *bfqd;

4044

++	struct bfq_queue *bfqq, *new_bfqq;

4045

++	unsigned long uninitialized_var(flags);

4046

++	int ioprio = bic->icq.ioc->ioprio;

4047

++

4048

++	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),

4049

++				   &flags);

4050

++	/*

4051

++	 * May trigger on a newly created bic; the lock is dropped at out:.

4052

++	 * NOTE(review): out: also runs with bfqd == NULL -- confirm it copes.

4053

++	 */

4054

++	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))

4055

++		goto out;

4056

++

4057

++	bic->ioprio = ioprio;

4058

++

4059

++	bfqq = bic->bfqq[BLK_RW_ASYNC];

4060

++	if (bfqq) {

4061

++		new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic,

4062

++					 GFP_ATOMIC);

4063

++		if (new_bfqq) {

4064

++			bic->bfqq[BLK_RW_ASYNC] = new_bfqq;

4065

++			bfq_log_bfqq(bfqd, bfqq,

4066

++				     "check_ioprio_change: bfqq %p %d",

4067

++				     bfqq, atomic_read(&bfqq->ref));

4068

++			bfq_put_queue(bfqq);

4069

++		}

4070

++	}

4071

++

4072

++	bfqq = bic->bfqq[BLK_RW_SYNC];

4073

++	if (bfqq)

4074

++		bfq_set_next_ioprio_data(bfqq, bic);

4075

++

4076

++out:

4077

++	bfq_put_bfqd_unlock(bfqd, &flags);

4078

++}

4079

++

4080

++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

4081

++			  struct bfq_io_cq *bic, pid_t pid, int is_sync)

4082

++{

4083

++	RB_CLEAR_NODE(&bfqq->entity.rb_node);

4084

++	INIT_LIST_HEAD(&bfqq->fifo);

4085

++	INIT_HLIST_NODE(&bfqq->burst_list_node);

4086

++

4087

++	atomic_set(&bfqq->ref, 0);

4088

++	bfqq->bfqd = bfqd;

4089

++

4090

++	if (bic)

4091

++		bfq_set_next_ioprio_data(bfqq, bic);

4092

++

4093

++	if (is_sync) {

4094

++		if (!bfq_class_idle(bfqq))

4095

++			bfq_mark_bfqq_idle_window(bfqq);

4096

++		bfq_mark_bfqq_sync(bfqq);

4097

++	} else

4098

++		bfq_clear_bfqq_sync(bfqq);

4099

++	bfq_mark_bfqq_IO_bound(bfqq);

4100

++

4101

++	/* Tentative initial value to trade off between thr and lat */

4102

++	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;

4103

++	bfqq->pid = pid;

4104

++

4105

++	bfqq->wr_coeff = 1;

4106

++	bfqq->last_wr_start_finish = 0;

4107

++	/*

4108

++	 * Set to the value for which bfqq will not be deemed as

4109

++	 * soft rt when it becomes backlogged.

4110

++	 */

4111

++	bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);

4112

++}

4113

++

4114

++/*

4115

++ * Return the bfq_queue of bic for is_sync I/O, allocating a new one when

4116

++ * bic has none or only the OOM queue; use the OOM queue if allocation fails.

4117

++ */

4118

++static struct bfq_queue *

4119

++bfq_find_alloc_queue(struct bfq_data *bfqd, struct bio *bio, int is_sync,

4120

++		     struct bfq_io_cq *bic, gfp_t gfp_mask)

4121

++{

4122

++	struct bfq_group *bfqg;

4123

++	struct bfq_queue *bfqq, *new_bfqq = NULL;

4124

++	struct blkcg *blkcg;

4125

++

4126

++retry:

4127

++	rcu_read_lock();

4128

++	blkcg = bio_blkcg(bio);

4129

++	bfqg = bfq_find_alloc_group(bfqd, blkcg);

4130

++	/* bic always exists here */

4131

++	bfqq = bic_to_bfqq(bic, is_sync);

4132

++

4133

++	/*

4134

++	 * Always try a new alloc if we fall back to the OOM bfqq

4135

++	 * originally, since it should just be a temporary situation.

4136

++	 */

4137

++	if (!bfqq || bfqq == &bfqd->oom_bfqq) {

4138

++		bfqq = NULL;

4139

++		if (new_bfqq) {

4140

++			bfqq = new_bfqq;

4141

++			new_bfqq = NULL;

4142

++		} else if (gfpflags_allow_blocking(gfp_mask)) {

4143

++			rcu_read_unlock();

4144

++			spin_unlock_irq(bfqd->queue->queue_lock);

4145

++			new_bfqq = kmem_cache_alloc_node(bfq_pool,

4146

++				gfp_mask | __GFP_ZERO, bfqd->queue->node);

4147

++			spin_lock_irq(bfqd->queue->queue_lock);

4148

++			if (new_bfqq)

4149

++				goto retry;

4150

++			/* Failed: re-take the RCU lock dropped above. */

4151

++			rcu_read_lock();

4152

++		} else {

4153

++			bfqq = kmem_cache_alloc_node(bfq_pool,

4154

++				gfp_mask | __GFP_ZERO, bfqd->queue->node);

4155

++		}

4156

++

4157

++		if (bfqq) {

4158

++			bfq_init_bfqq(bfqd, bfqq, bic, current->pid, is_sync);

4159

++			bfq_init_entity(&bfqq->entity, bfqg);

4160

++			bfq_log_bfqq(bfqd, bfqq, "allocated");

4161

++		} else {

4162

++			bfqq = &bfqd->oom_bfqq;

4163

++			bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");

4164

++		}

4165

++	}

4166

++

4167

++	if (new_bfqq)

4168

++		kmem_cache_free(bfq_pool, new_bfqq);

4169

++

4170

++	rcu_read_unlock();

4171

++	return bfqq;

4172

++}

4173

++

4174

++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,

4175

++					       struct bfq_group *bfqg,

4176

++					       int ioprio_class, int ioprio)

4177

++{

4178

++	switch (ioprio_class) {

4179

++	case IOPRIO_CLASS_RT:

4180

++		return &bfqg->async_bfqq[0][ioprio];

4181

++	case IOPRIO_CLASS_NONE:

4182

++		ioprio = IOPRIO_NORM;

4183

++		/* fall through */

4184

++	case IOPRIO_CLASS_BE:

4185

++		return &bfqg->async_bfqq[1][ioprio];

4186

++	case IOPRIO_CLASS_IDLE:

4187

++		return &bfqg->async_idle_bfqq;

4188

++	default:

4189

++		BUG();

4190

++	}

4191

++}

4192

++

4193

++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,

4194

++				       struct bio *bio, int is_sync,

4195

++				       struct bfq_io_cq *bic, gfp_t gfp_mask)

4196

++{

4197

++	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

4198

++	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);

4199

++	struct bfq_queue **async_bfqq = NULL;

4200

++	struct bfq_queue *bfqq = NULL;

4201

++

4202

++	if (!is_sync) {

4203

++		struct blkcg *blkcg;

4204

++		struct bfq_group *bfqg;

4205

++

4206

++		rcu_read_lock();

4207

++		blkcg = bio_blkcg(bio);

4208

++		rcu_read_unlock();

4209

++		bfqg = bfq_find_alloc_group(bfqd, blkcg);

4210

++		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,

4211

++						  ioprio);

4212

++		bfqq = *async_bfqq;

4213

++	}

4214

++

4215

++	if (!bfqq)

4216

++		bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask);

4217

++

4218

++	/*

4219

++	 * Pin the queue now that it's allocated, scheduler exit will

4220

++	 * prune it.

4221

++	 */

4222

++	if (!is_sync && !(*async_bfqq)) {

4223

++		atomic_inc(&bfqq->ref);

4224

++		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",

4225

++			     bfqq, atomic_read(&bfqq->ref));

4226

++		*async_bfqq = bfqq;

4227

++	}

4228

++

4229

++	atomic_inc(&bfqq->ref);

4230

++	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,

4231

++		     atomic_read(&bfqq->ref));

4232

++	return bfqq;

4233

++}

4234

++

4235

++static void bfq_update_io_thinktime(struct bfq_data *bfqd,

4236

++				    struct bfq_io_cq *bic)

4237

++{

4238

++	unsigned long elapsed = jiffies - bic->ttime.last_end_request;

4239

++	unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);

4240

++

4241

++	bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;

4242

++	bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;

4243

++	bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /

4244

++				bic->ttime.ttime_samples;

4245

++}

4246

++

4247

++static void bfq_update_io_seektime(struct bfq_data *bfqd,

4248

++				   struct bfq_queue *bfqq,

4249

++				   struct request *rq)

4250

++{

4251

++	sector_t sdist;

4252

++	u64 total;

4253

++

4254

++	if (bfqq->last_request_pos < blk_rq_pos(rq))

4255

++		sdist = blk_rq_pos(rq) - bfqq->last_request_pos;

4256

++	else

4257

++		sdist = bfqq->last_request_pos - blk_rq_pos(rq);

4258

++

4259

++	/*

4260

++	 * Don't allow the seek distance to get too large from the

4261

++	 * odd fragment, pagein, etc.

4262

++	 */

4263

++	if (bfqq->seek_samples == 0) /* first request, not really a seek */

4264

++		sdist = 0;

4265

++	else if (bfqq->seek_samples <= 60) /* second & third seek */

4266

++		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);

4267

++	else

4268

++		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);

4269

++

4270

++	bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;

4271

++	bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;

4272

++	total = bfqq->seek_total + (bfqq->seek_samples/2);

4273

++	do_div(total, bfqq->seek_samples);

4274

++	bfqq->seek_mean = (sector_t)total;

4275

++

4276

++	bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,

4277

++			(u64)bfqq->seek_mean);

4278

++}

4279

++

4280

++/*

4281

++ * Disable idle window if the process thinks too long or seeks so much that

4282

++ * it doesn't matter.

4283

++ */

4284

++static void bfq_update_idle_window(struct bfq_data *bfqd,

4285

++				   struct bfq_queue *bfqq,

4286

++				   struct bfq_io_cq *bic)

4287

++{

4288

++	int enable_idle;

4289

++

4290

++	/* Don't idle for async or idle io prio class. */

4291

++	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))

4292

++		return;

4293

++

4294

++	enable_idle = bfq_bfqq_idle_window(bfqq);

4295

++

4296

++	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||

4297

++	    bfqd->bfq_slice_idle == 0 ||

4298

++		(bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&

4299

++			bfqq->wr_coeff == 1))

4300

++		enable_idle = 0;

4301

++	else if (bfq_sample_valid(bic->ttime.ttime_samples)) {

4302

++		if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&

4303

++			bfqq->wr_coeff == 1)

4304

++			enable_idle = 0;

4305

++		else

4306

++			enable_idle = 1;

4307

++	}

4308

++	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",

4309

++		enable_idle);

4310

++

4311

++	if (enable_idle)

4312

++		bfq_mark_bfqq_idle_window(bfqq);

4313

++	else

4314

++		bfq_clear_bfqq_idle_window(bfqq);

4315

++}

4316

++

4317

++/*

4318

++ * Called when a new fs request (rq) is added to bfqq.  Check if there's

4319

++ * something we should do about it.

4320

++ */

4321

++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

4322

++			    struct request *rq)

4323

++{

4324

++	struct bfq_io_cq *bic = RQ_BIC(rq);

4325

++

4326

++	if (rq->cmd_flags & REQ_META)

4327

++		bfqq->meta_pending++;

4328

++

4329

++	bfq_update_io_thinktime(bfqd, bic);

4330

++	bfq_update_io_seektime(bfqd, bfqq, rq);

4331

++	if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {

4332

++		bfq_clear_bfqq_constantly_seeky(bfqq);

4333

++		if (!blk_queue_nonrot(bfqd->queue)) {

4334

++			BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);

4335

++			bfqd->const_seeky_busy_in_flight_queues--;

4336

++		}

4337

++	}

4338

++	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||

4339

++	    !BFQQ_SEEKY(bfqq))

4340

++		bfq_update_idle_window(bfqd, bfqq, bic);

4341

++

4342

++	bfq_log_bfqq(bfqd, bfqq,

4343

++		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",

4344

++		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),

4345

++		     (long long unsigned)bfqq->seek_mean);

4346

++

4347

++	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

4348

++

4349

++	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {

4350

++		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&

4351

++				 blk_rq_sectors(rq) < 32;

4352

++		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);

4353

++

4354

++		/*

4355

++		 * There is just this request queued: if the request

4356

++		 * is small and the queue is not to be expired, then

4357

++		 * just exit.

4358

++		 *

4359

++		 * In this way, if the disk is being idled to wait for

4360

++		 * a new request from the in-service queue, we avoid

4361

++		 * unplugging the device and committing the disk to serve

4362

++		 * just a small request. On the contrary, we wait for

4363

++		 * the block layer to decide when to unplug the device:

4364

++		 * hopefully, new requests will be merged to this one

4365

++		 * quickly, then the device will be unplugged and

4366

++		 * larger requests will be dispatched.

4367

++		 */

4368

++		if (small_req && !budget_timeout)

4369

++			return;

4370

++

4371

++		/*

4372

++		 * A large enough request arrived, or the queue is to

4373

++		 * be expired: in both cases disk idling is to be

4374

++		 * stopped, so clear wait_request flag and reset

4375

++		 * timer.

4376

++		 */

4377

++		bfq_clear_bfqq_wait_request(bfqq);

4378

++		del_timer(&bfqd->idle_slice_timer);

4379

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

4380

++		bfqg_stats_update_idle_time(bfqq_group(bfqq));

4381

++#endif

4382

++

4383

++		/*

4384

++		 * The queue is not empty, because a new request just

4385

++		 * arrived. Hence we can safely expire the queue, in

4386

++		 * case of budget timeout, without risking that the

4387

++		 * timestamps of the queue are not updated correctly.

4388

++		 * See [1] for more details.

4389

++		 */

4390

++		if (budget_timeout)

4391

++			bfq_bfqq_expire(bfqd, bfqq, false,

4392

++					BFQ_BFQQ_BUDGET_TIMEOUT);

4393

++

4394

++		/*

4395

++		 * Let the request rip immediately, or let a new queue be

4396

++		 * selected if bfqq has just been expired.

4397

++		 */

4398

++		__blk_run_queue(bfqd->queue);

4399

++	}

4400

++}

4401

++

4402

++static void bfq_insert_request(struct request_queue *q, struct request *rq)

4403

++{

4404

++	struct bfq_data *bfqd = q->elevator->elevator_data;

4405

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

4406

++

4407

++	assert_spin_locked(bfqd->queue->queue_lock);

4408

++

4409

++	bfq_add_request(rq);

4410

++

4411

++	rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];

4412

++	list_add_tail(&rq->queuelist, &bfqq->fifo);

4413

++

4414

++	bfq_rq_enqueued(bfqd, bfqq, rq);

4415

++}

4416

++

4417

++static void bfq_update_hw_tag(struct bfq_data *bfqd)

4418

++{

4419

++	bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,

4420

++				     bfqd->rq_in_driver);

4421

++

4422

++	if (bfqd->hw_tag == 1)

4423

++		return;

4424

++

4425

++	/*

4426

++	 * This sample is valid if the number of outstanding requests

4427

++	 * is large enough to allow a queueing behavior.  Note that the

4428

++	 * sum is not exact, as it's not taking into account deactivated

4429

++	 * requests.

4430

++	 */

4431

++	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)

4432

++		return;

4433

++

4434

++	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)

4435

++		return;

4436

++

4437

++	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;

4438

++	bfqd->max_rq_in_driver = 0;

4439

++	bfqd->hw_tag_samples = 0;

4440

++}

4441

++

4442

++static void bfq_completed_request(struct request_queue *q, struct request *rq)

4443

++{

4444

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

4445

++	struct bfq_data *bfqd = bfqq->bfqd;

4446

++	bool sync = bfq_bfqq_sync(bfqq);

4447

++

4448

++	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",

4449

++		     blk_rq_sectors(rq), sync);

4450

++

4451

++	bfq_update_hw_tag(bfqd);

4452

++

4453

++	BUG_ON(!bfqd->rq_in_driver);

4454

++	BUG_ON(!bfqq->dispatched);

4455

++	bfqd->rq_in_driver--;

4456

++	bfqq->dispatched--;

4457

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

4458

++	bfqg_stats_update_completion(bfqq_group(bfqq),

4459

++				     rq_start_time_ns(rq),

4460

++				     rq_io_start_time_ns(rq), rq->cmd_flags);

4461

++#endif

4462

++

4463

++	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {

4464

++		bfq_weights_tree_remove(bfqd, &bfqq->entity,

4465

++					&bfqd->queue_weights_tree);

4466

++		if (!blk_queue_nonrot(bfqd->queue)) {

4467

++			BUG_ON(!bfqd->busy_in_flight_queues);

4468

++			bfqd->busy_in_flight_queues--;

4469

++			if (bfq_bfqq_constantly_seeky(bfqq)) {

4470

++				BUG_ON(!bfqd->

4471

++					const_seeky_busy_in_flight_queues);

4472

++				bfqd->const_seeky_busy_in_flight_queues--;

4473

++			}

4474

++		}

4475

++	}

4476

++

4477

++	if (sync) {

4478

++		bfqd->sync_flight--;

4479

++		RQ_BIC(rq)->ttime.last_end_request = jiffies;

4480

++	}

4481

++

4482

++	/*

4483

++	 * If we are waiting to discover whether the request pattern of the

4484

++	 * task associated with the queue is actually isochronous, and

4485

++	 * both requisites for this condition to hold are satisfied, then

4486

++	 * compute soft_rt_next_start (see the comments to the function

4487

++	 * bfq_bfqq_softrt_next_start()).

4488

++	 */

4489

++	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&

4490

++	    RB_EMPTY_ROOT(&bfqq->sort_list))

4491

++		bfqq->soft_rt_next_start =

4492

++			bfq_bfqq_softrt_next_start(bfqd, bfqq);

4493

++

4494

++	/*

4495

++	 * If this is the in-service queue, check if it needs to be expired,

4496

++	 * or if we want to idle in case it has no pending requests.

4497

++	 */

4498

++	if (bfqd->in_service_queue == bfqq) {

4499

++		if (bfq_bfqq_budget_new(bfqq))

4500

++			bfq_set_budget_timeout(bfqd);

4501

++

4502

++		if (bfq_bfqq_must_idle(bfqq)) {

4503

++			bfq_arm_slice_timer(bfqd);

4504

++			goto out;

4505

++		} else if (bfq_may_expire_for_budg_timeout(bfqq))

4506

++			bfq_bfqq_expire(bfqd, bfqq, false,

4507

++					BFQ_BFQQ_BUDGET_TIMEOUT);

4508

++		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&

4509

++			 (bfqq->dispatched == 0 ||

4510

++			  !bfq_bfqq_may_idle(bfqq)))

4511

++			bfq_bfqq_expire(bfqd, bfqq, false,

4512

++					BFQ_BFQQ_NO_MORE_REQUESTS);

4513

++	}

4514

++

4515

++	if (!bfqd->rq_in_driver)

4516

++		bfq_schedule_dispatch(bfqd);

4517

++

4518

++out:

4519

++	return;

4520

++}

4521

++

4522

++static int __bfq_may_queue(struct bfq_queue *bfqq)

4523

++{

4524

++	/* Both flags are needed for MUST; must_alloc is then cleared. */

4525

++	if (!bfq_bfqq_wait_request(bfqq) || !bfq_bfqq_must_alloc(bfqq))

4526

++		return ELV_MQUEUE_MAY;

4527

++

4528

++	bfq_clear_bfqq_must_alloc(bfqq);

4529

++	return ELV_MQUEUE_MUST;

4530

++}

4531

++

4532

++static int bfq_may_queue(struct request_queue *q, int rw)

4533

++{

4534

++	struct bfq_data *bfqd = q->elevator->elevator_data;

4535

++	struct task_struct *tsk = current;

4536

++	struct bfq_io_cq *bic;

4537

++	struct bfq_queue *bfqq;

4538

++

4539

++	/*

4540

++	 * Don't force setup of a queue from here, as a call to may_queue

4541

++	 * does not necessarily imply that a request actually will be

4542

++	 * queued. So just lookup a possibly existing queue, or return

4543

++	 * 'may queue' if that fails.

4544

++	 */

4545

++	bic = bfq_bic_lookup(bfqd, tsk->io_context);

4546

++	if (!bic)

4547

++		return ELV_MQUEUE_MAY;

4548

++

4549

++	bfqq = bic_to_bfqq(bic, rw_is_sync(rw));

4550

++	if (bfqq)

4551

++		return __bfq_may_queue(bfqq);

4552

++

4553

++	return ELV_MQUEUE_MAY;

4554

++}

4555

++

4556

++/*

4557

++ * Undo bfq_set_request() for rq: give back its allocation slot, clear the

4558

++ * elevator private pointers and put the queue.  Queue lock held here.

4559

++ */

4560

++static void bfq_put_request(struct request *rq)

4561

++{

4562

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

4563

++	int dir;

4564

++

4565

++	if (!bfqq)

4566

++		return;

4567

++

4568

++	dir = rq_data_dir(rq);

4569

++	BUG_ON(!bfqq->allocated[dir]);

4570

++	bfqq->allocated[dir]--;

4571

++	rq->elv.priv[0] = NULL;

4572

++	rq->elv.priv[1] = NULL;

4573

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",

4574

++		     bfqq, atomic_read(&bfqq->ref));

4575

++	bfq_put_queue(bfqq);

4576

++}

4577

++

4578

++/*

4579

++ * Attach rq to the bfq_queue of its bic, calling bfq_get_queue() when the bic

4580

++ * has none or only the OOM queue.  Returns 0, or 1 when rq has no bic.

4581

++ */

4582

++static int bfq_set_request(struct request_queue *q, struct request *rq,

4583

++			   struct bio *bio, gfp_t gfp_mask)

4584

++{

4585

++	struct bfq_data *bfqd = q->elevator->elevator_data;

4586

++	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);

4587

++	const int rw = rq_data_dir(rq);

4588

++	const int is_sync = rq_is_sync(rq);

4589

++	struct bfq_queue *bfqq;

4590

++	unsigned long flags;

4591

++

4592

++	might_sleep_if(gfpflags_allow_blocking(gfp_mask));

4593

++

4594

++	/* The ioprio check dereferences bic, so it must not see a NULL one. */

4595

++	if (bic)

4596

++		bfq_check_ioprio_change(bic, bio);

4597

++

4598

++	spin_lock_irqsave(q->queue_lock, flags);

4599

++	if (!bic)

4600

++		goto queue_fail;

4601

++

4602

++	bfq_bic_update_cgroup(bic, bio);

4603

++

4604

++	bfqq = bic_to_bfqq(bic, is_sync);

4605

++	if (!bfqq || bfqq == &bfqd->oom_bfqq) {

4606

++		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask);

4607

++		bic_set_bfqq(bic, bfqq, is_sync);

4608

++		if (is_sync) {

4609

++			if (bfqd->large_burst)

4610

++				bfq_mark_bfqq_in_large_burst(bfqq);

4611

++			else

4612

++				bfq_clear_bfqq_in_large_burst(bfqq);

4613

++		}

4614

++	}

4615

++

4616

++	bfqq->allocated[rw]++;

4617

++	atomic_inc(&bfqq->ref);

4618

++	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,

4619

++		     atomic_read(&bfqq->ref));

4620

++

4621

++	rq->elv.priv[0] = bic;

4622

++	rq->elv.priv[1] = bfqq;

4623

++

4624

++	spin_unlock_irqrestore(q->queue_lock, flags);

4625

++	return 0;

4626

++

4627

++queue_fail:

4628

++	bfq_schedule_dispatch(bfqd);

4629

++	spin_unlock_irqrestore(q->queue_lock, flags);

4630

++	return 1;

4631

++}

4632

++

4633

++/* unplug_work handler: run the request queue with its lock held. */

4634

++static void bfq_kick_queue(struct work_struct *work)

4635

++{

4636

++	struct request_queue *queue =

4637

++		container_of(work, struct bfq_data, unplug_work)->queue;

4638

++

4639

++	spin_lock_irq(queue->queue_lock);

4640

++	__blk_run_queue(queue);

4641

++	spin_unlock_irq(queue->queue_lock);

4642

++}

4643

++

4644

++/*

4645

++ * Handler of the expiration of the timer running if the in-service queue

4646

++ * is idling inside its time slice.

4647

++ */

4648

++static void bfq_idle_slice_timer(unsigned long data)

4649

++{

4650

++	struct bfq_data *bfqd = (struct bfq_data *)data;

4651

++	struct bfq_queue *bfqq;

4652

++	unsigned long flags;

4653

++	enum bfqq_expiration reason;

4654

++

4655

++	spin_lock_irqsave(bfqd->queue->queue_lock, flags);

4656

++

4657

++	bfqq = bfqd->in_service_queue;

4658

++	/*

4659

++	 * Theoretical race here: the in-service queue can be NULL or

4660

++	 * different from the queue that was idling if the timer handler

4661

++	 * spins on the queue_lock and a new request arrives for the

4662

++	 * current queue and there is a full dispatch cycle that changes

4663

++	 * the in-service queue.  This can hardly happen, but in the worst

4664

++	 * case we just expire a queue too early.

4665

++	 */

4666

++	if (bfqq) {

4667

++		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");

4668

++		if (bfq_bfqq_budget_timeout(bfqq))

4669

++			/*

4670

++			 * Also here the queue can be safely expired

4671

++			 * for budget timeout without wasting

4672

++			 * guarantees

4673

++			 */

4674

++			reason = BFQ_BFQQ_BUDGET_TIMEOUT;

4675

++		else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)

4676

++			/*

4677

++			 * The queue may not be empty upon timer expiration,

4678

++			 * because we may not disable the timer when the

4679

++			 * first request of the in-service queue arrives

4680

++			 * during disk idling.

4681

++			 */

4682

++			reason = BFQ_BFQQ_TOO_IDLE;

4683

++		else

4684

++			goto schedule_dispatch;

4685

++

4686

++		bfq_bfqq_expire(bfqd, bfqq, true, reason);

4687

++	}

4688

++

4689

++schedule_dispatch:

4690

++	bfq_schedule_dispatch(bfqd);

4691

++

4692

++	spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);

4693

++}

4694

++

4695

++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)

4696

++{

4697

++	del_timer_sync(&bfqd->idle_slice_timer);

4698

++	cancel_work_sync(&bfqd->unplug_work);

4699

++}

4700

++

4701

++static void __bfq_put_async_bfqq(struct bfq_data *bfqd,

4702

++					struct bfq_queue **bfqq_ptr)

4703

++{

4704

++	struct bfq_group *root_group = bfqd->root_group;

4705

++	struct bfq_queue *bfqq = *bfqq_ptr;

4706

++

4707

++	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);

4708

++	if (bfqq) {

4709

++		bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);

4710

++		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",

4711

++			     bfqq, atomic_read(&bfqq->ref));

4712

++		bfq_put_queue(bfqq);

4713

++		*bfqq_ptr = NULL;

4714

++	}

4715

++}

4716

++

4717

++/*

4718

++ * Release all the bfqg references to its async queues.  If we are

4719

++ * deallocating the group these queues may still contain requests, so

4720

++ * we reparent them to the root cgroup (i.e., the only one that will

4721

++ * exist for sure until all the requests on a device are gone).

4722

++ */

4723

++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)

4724

++{

4725

++	int i, j;

4726

++

4727

++	for (i = 0; i < 2; i++)

4728

++		for (j = 0; j < IOPRIO_BE_NR; j++)

4729

++			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

4730

++

4731

++	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);

4732

++}

4733

++

4734

++static void bfq_exit_queue(struct elevator_queue *e)

4735

++{

4736

++	struct bfq_data *bfqd = e->elevator_data;

4737

++	struct request_queue *q = bfqd->queue;

4738

++	struct bfq_queue *bfqq, *n;

4739

++

4740

++	bfq_shutdown_timer_wq(bfqd);

4741

++

4742

++	spin_lock_irq(q->queue_lock);

4743

++

4744

++	BUG_ON(bfqd->in_service_queue);

4745

++	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)

4746

++		bfq_deactivate_bfqq(bfqd, bfqq, 0);

4747

++

4748

++	spin_unlock_irq(q->queue_lock);

4749

++

4750

++	bfq_shutdown_timer_wq(bfqd);

4751

++

4752

++	synchronize_rcu();

4753

++

4754

++	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

4755

++

4756

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

4757

++	blkcg_deactivate_policy(q, &blkcg_policy_bfq);

4758

++#else

4759

++	kfree(bfqd->root_group);

4760

++#endif

4761

++

4762

++	kfree(bfqd);

4763

++}

4764

++

4765

++static void bfq_init_root_group(struct bfq_group *root_group,

4766

++				struct bfq_data *bfqd)

4767

++{

4768

++	int i;

4769

++

4770

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

4771

++	root_group->entity.parent = NULL;

4772

++	root_group->my_entity = NULL;

4773

++	root_group->bfqd = bfqd;

4774

++#endif

4775

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)

4776

++		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

4777

++}

4778

++

4779

++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

4780

++{

4781

++	struct bfq_data *bfqd;

4782

++	struct elevator_queue *eq;

4783

++

4784

++	eq = elevator_alloc(q, e);

4785

++	if (!eq)

4786

++		return -ENOMEM;

4787

++

4788

++	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);

4789

++	if (!bfqd) {

4790

++		kobject_put(&eq->kobj);

4791

++		return -ENOMEM;

4792

++	}

4793

++	eq->elevator_data = bfqd;

4794

++

4795

++	/*

4796

++	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.

4797

++	 * Grab a permanent reference to it, so that the normal code flow

4798

++	 * will not attempt to free it.

4799

++	 */

4800

++	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);

4801

++	atomic_inc(&bfqd->oom_bfqq.ref);

4802

++	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;

4803

++	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;

4804

++	bfqd->oom_bfqq.entity.new_weight =

4805

++		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);

4806

++	/*

4807

++	 * Trigger weight initialization, according to ioprio, at the

4808

++	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio

4809

++	 * class won't be changed any more.

4810

++	 */

4811

++	bfqd->oom_bfqq.entity.prio_changed = 1;

4812

++

4813

++	bfqd->queue = q;

4814

++

4815

++	spin_lock_irq(q->queue_lock);

4816

++	q->elevator = eq;

4817

++	spin_unlock_irq(q->queue_lock);

4818

++

4819

++	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);

4820

++	if (!bfqd->root_group)

4821

++		goto out_free;

4822

++	bfq_init_root_group(bfqd->root_group, bfqd);

4823

++	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

4824

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

4825

++	bfqd->active_numerous_groups = 0;

4826

++#endif

4827

++

4828

++	init_timer(&bfqd->idle_slice_timer);

4829

++	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

4830

++	bfqd->idle_slice_timer.data = (unsigned long)bfqd;

4831

++

4832

++	bfqd->queue_weights_tree = RB_ROOT;

4833

++	bfqd->group_weights_tree = RB_ROOT;

4834

++

4835

++	INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);

4836

++

4837

++	INIT_LIST_HEAD(&bfqd->active_list);

4838

++	INIT_LIST_HEAD(&bfqd->idle_list);

4839

++	INIT_HLIST_HEAD(&bfqd->burst_list);

4840

++

4841

++	bfqd->hw_tag = -1;

4842

++

4843

++	bfqd->bfq_max_budget = bfq_default_max_budget;

4844

++

4845

++	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];

4846

++	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];

4847

++	bfqd->bfq_back_max = bfq_back_max;

4848

++	bfqd->bfq_back_penalty = bfq_back_penalty;

4849

++	bfqd->bfq_slice_idle = bfq_slice_idle;

4850

++	bfqd->bfq_class_idle_last_service = 0;

4851

++	bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;

4852

++	bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;

4853

++	bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;

4854

++

4855

++	bfqd->bfq_requests_within_timer = 120;

4856

++

4857

++	bfqd->bfq_large_burst_thresh = 11;

4858

++	bfqd->bfq_burst_interval = msecs_to_jiffies(500);

4859

++

4860

++	bfqd->low_latency = true;

4861

++

4862

++	bfqd->bfq_wr_coeff = 20;

4863

++	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);

4864

++	bfqd->bfq_wr_max_time = 0;

4865

++	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);

4866

++	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);

4867

++	bfqd->bfq_wr_max_softrt_rate = 7000; /*

4868

++					      * Approximate rate required

4869

++					      * to playback or record a

4870

++					      * high-definition compressed

4871

++					      * video.

4872

++					      */

4873

++	bfqd->wr_busy_queues = 0;

4874

++	bfqd->busy_in_flight_queues = 0;

4875

++	bfqd->const_seeky_busy_in_flight_queues = 0;

4876

++

4877

++	/*

4878

++	 * Begin by assuming, optimistically, that the device peak rate is

4879

++	 * equal to the highest reference rate.

4880

++	 */

4881

++	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *

4882

++			T_fast[blk_queue_nonrot(bfqd->queue)];

4883

++	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];

4884

++	bfqd->device_speed = BFQ_BFQD_FAST;

4885

++

4886

++	return 0;

4887

++

4888

++out_free:

4889

++	kfree(bfqd);

4890

++	kobject_put(&eq->kobj);

4891

++	return -ENOMEM;

4892

++}

4893

++

4894

++static void bfq_slab_kill(void)

4895

++{

4896

++	if (bfq_pool)

4897

++		kmem_cache_destroy(bfq_pool);

4898

++}

4899

++

4900

++static int __init bfq_slab_setup(void)

4901

++{

4902

++	bfq_pool = KMEM_CACHE(bfq_queue, 0);

4903

++	if (!bfq_pool)

4904

++		return -ENOMEM;

4905

++	return 0;

4906

++}

4907

++

4908

/*
 * Format @var into @page as an unsigned decimal number followed by a
 * newline.  Returns the number of characters written, not counting the
 * terminating NUL.
 */
static ssize_t bfq_var_show(unsigned int var, char *page)
{
	/* %u, not %d: @var is unsigned and may legitimately exceed INT_MAX. */
	return sprintf(page, "%u\n", var);
}

4912

++

4913

++static ssize_t bfq_var_store(unsigned long *var, const char *page,

4914

++			     size_t count)

4915

++{

4916

++	unsigned long new_val;

4917

++	int ret = kstrtoul(page, 10, &new_val);

4918

++

4919

++	if (ret == 0)

4920

++		*var = new_val;

4921

++

4922

++	return count;

4923

++}

4924

++

4925

++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)

4926

++{

4927

++	struct bfq_data *bfqd = e->elevator_data;

4928

++	return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?

4929

++		       jiffies_to_msecs(bfqd->bfq_wr_max_time) :

4930

++		       jiffies_to_msecs(bfq_wr_duration(bfqd)));

4931

++}

4932

++

4933

++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)

4934

++{

4935

++	struct bfq_queue *bfqq;

4936

++	struct bfq_data *bfqd = e->elevator_data;

4937

++	ssize_t num_char = 0;

4938

++

4939

++	num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",

4940

++			    bfqd->queued);

4941

++

4942

++	spin_lock_irq(bfqd->queue->queue_lock);

4943

++

4944

++	num_char += sprintf(page + num_char, "Active:\n");

4945

++	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {

4946

++	  num_char += sprintf(page + num_char,

4947

++			      "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n",

4948

++			      bfqq->pid,

4949

++			      bfqq->entity.weight,

4950

++			      bfqq->queued[0],

4951

++			      bfqq->queued[1],

4952

++			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),

4953

++			jiffies_to_msecs(bfqq->wr_cur_max_time));

4954

++	}

4955

++

4956

++	num_char += sprintf(page + num_char, "Idle:\n");

4957

++	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {

4958

++			num_char += sprintf(page + num_char,

4959

++				"pid%d: weight %hu, dur %d/%u\n",

4960

++				bfqq->pid,

4961

++				bfqq->entity.weight,

4962

++				jiffies_to_msecs(jiffies -

4963

++					bfqq->last_wr_start_finish),

4964

++				jiffies_to_msecs(bfqq->wr_cur_max_time));

4965

++	}

4966

++

4967

++	spin_unlock_irq(bfqd->queue->queue_lock);

4968

++

4969

++	return num_char;

4970

++}

4971

++

4972

++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\

4973

++static ssize_t __FUNC(struct elevator_queue *e, char *page)		\

4974

++{									\

4975

++	struct bfq_data *bfqd = e->elevator_data;			\

4976

++	unsigned int __data = __VAR;					\

4977

++	if (__CONV)							\

4978

++		__data = jiffies_to_msecs(__data);			\

4979

++	return bfq_var_show(__data, (page));				\

4980

++}

4981

++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);

4982

++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);

4983

++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);

4984

++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);

4985

++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);

4986

++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);

4987

++SHOW_FUNCTION(bfq_max_budget_async_rq_show,

4988

++	      bfqd->bfq_max_budget_async_rq, 0);

4989

++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);

4990

++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);

4991

++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);

4992

++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);

4993

++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);

4994

++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);

4995

++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,

4996

++	1);

4997

++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);

4998

++#undef SHOW_FUNCTION

4999

++

5000

++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\

5001

++static ssize_t								\

5002

++__FUNC(struct elevator_queue *e, const char *page, size_t count)	\

5003

++{									\

5004

++	struct bfq_data *bfqd = e->elevator_data;			\

5005

++	unsigned long uninitialized_var(__data);			\

5006

++	int ret = bfq_var_store(&__data, (page), count);		\

5007

++	if (__data < (MIN))						\

5008

++		__data = (MIN);						\

5009

++	else if (__data > (MAX))					\

5010

++		__data = (MAX);						\

5011

++	if (__CONV)							\

5012

++		*(__PTR) = msecs_to_jiffies(__data);			\

5013

++	else								\

5014

++		*(__PTR) = __data;					\

5015

++	return ret;							\

5016

++}

5017

++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,

5018

++		INT_MAX, 1);

5019

++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,

5020

++		INT_MAX, 1);

5021

++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);

5022

++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,

5023

++		INT_MAX, 0);

5024

++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);

5025

++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,

5026

++		1, INT_MAX, 0);

5027

++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,

5028

++		INT_MAX, 1);

5029

++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);

5030

++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);

5031

++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,

5032

++		1);

5033

++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,

5034

++		INT_MAX, 1);

5035

++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,

5036

++		&bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);

5037

++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,

5038

++		INT_MAX, 0);

5039

++#undef STORE_FUNCTION

5040

++

5041

++/* do nothing for the moment */

5042

++static ssize_t bfq_weights_store(struct elevator_queue *e,

5043

++				    const char *page, size_t count)

5044

++{

5045

++	return count;

5046

++}

5047

++

5048

++static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)

5049

++{

5050

++	u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

5051

++

5052

++	if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)

5053

++		return bfq_calc_max_budget(bfqd->peak_rate, timeout);

5054

++	else

5055

++		return bfq_default_max_budget;

5056

++}

5057

++

5058

++static ssize_t bfq_max_budget_store(struct elevator_queue *e,

5059

++				    const char *page, size_t count)

5060

++{

5061

++	struct bfq_data *bfqd = e->elevator_data;

5062

++	unsigned long uninitialized_var(__data);

5063

++	int ret = bfq_var_store(&__data, (page), count);

5064

++

5065

++	if (__data == 0)

5066

++		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);

5067

++	else {

5068

++		if (__data > INT_MAX)

5069

++			__data = INT_MAX;

5070

++		bfqd->bfq_max_budget = __data;

5071

++	}

5072

++

5073

++	bfqd->bfq_user_max_budget = __data;

5074

++

5075

++	return ret;

5076

++}

5077

++

5078

++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,

5079

++				      const char *page, size_t count)

5080

++{

5081

++	struct bfq_data *bfqd = e->elevator_data;

5082

++	unsigned long uninitialized_var(__data);

5083

++	int ret = bfq_var_store(&__data, (page), count);

5084

++

5085

++	if (__data < 1)

5086

++		__data = 1;

5087

++	else if (__data > INT_MAX)

5088

++		__data = INT_MAX;

5089

++

5090

++	bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);

5091

++	if (bfqd->bfq_user_max_budget == 0)

5092

++		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);

5093

++

5094

++	return ret;

5095

++}

5096

++

5097

++static ssize_t bfq_low_latency_store(struct elevator_queue *e,

5098

++				     const char *page, size_t count)

5099

++{

5100

++	struct bfq_data *bfqd = e->elevator_data;

5101

++	unsigned long uninitialized_var(__data);

5102

++	int ret = bfq_var_store(&__data, (page), count);

5103

++

5104

++	if (__data > 1)

5105

++		__data = 1;

5106

++	if (__data == 0 && bfqd->low_latency != 0)

5107

++		bfq_end_wr(bfqd);

5108

++	bfqd->low_latency = __data;

5109

++

5110

++	return ret;

5111

++}

5112

++

5113

++#define BFQ_ATTR(name) \

5114

++	__ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)

5115

++

5116

++static struct elv_fs_entry bfq_attrs[] = {

5117

++	BFQ_ATTR(fifo_expire_sync),

5118

++	BFQ_ATTR(fifo_expire_async),

5119

++	BFQ_ATTR(back_seek_max),

5120

++	BFQ_ATTR(back_seek_penalty),

5121

++	BFQ_ATTR(slice_idle),

5122

++	BFQ_ATTR(max_budget),

5123

++	BFQ_ATTR(max_budget_async_rq),

5124

++	BFQ_ATTR(timeout_sync),

5125

++	BFQ_ATTR(timeout_async),

5126

++	BFQ_ATTR(low_latency),

5127

++	BFQ_ATTR(wr_coeff),

5128

++	BFQ_ATTR(wr_max_time),

5129

++	BFQ_ATTR(wr_rt_max_time),

5130

++	BFQ_ATTR(wr_min_idle_time),

5131

++	BFQ_ATTR(wr_min_inter_arr_async),

5132

++	BFQ_ATTR(wr_max_softrt_rate),

5133

++	BFQ_ATTR(weights),

5134

++	__ATTR_NULL

5135

++};

5136

++

5137

++static struct elevator_type iosched_bfq = {

5138

++	.ops = {

5139

++		.elevator_merge_fn =		bfq_merge,

5140

++		.elevator_merged_fn =		bfq_merged_request,

5141

++		.elevator_merge_req_fn =	bfq_merged_requests,

5142

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5143

++		.elevator_bio_merged_fn =	bfq_bio_merged,

5144

++#endif

5145

++		.elevator_allow_merge_fn =	bfq_allow_merge,

5146

++		.elevator_dispatch_fn =		bfq_dispatch_requests,

5147

++		.elevator_add_req_fn =		bfq_insert_request,

5148

++		.elevator_activate_req_fn =	bfq_activate_request,

5149

++		.elevator_deactivate_req_fn =	bfq_deactivate_request,

5150

++		.elevator_completed_req_fn =	bfq_completed_request,

5151

++		.elevator_former_req_fn =	elv_rb_former_request,

5152

++		.elevator_latter_req_fn =	elv_rb_latter_request,

5153

++		.elevator_init_icq_fn =		bfq_init_icq,

5154

++		.elevator_exit_icq_fn =		bfq_exit_icq,

5155

++		.elevator_set_req_fn =		bfq_set_request,

5156

++		.elevator_put_req_fn =		bfq_put_request,

5157

++		.elevator_may_queue_fn =	bfq_may_queue,

5158

++		.elevator_init_fn =		bfq_init_queue,

5159

++		.elevator_exit_fn =		bfq_exit_queue,

5160

++	},

5161

++	.icq_size =		sizeof(struct bfq_io_cq),

5162

++	.icq_align =		__alignof__(struct bfq_io_cq),

5163

++	.elevator_attrs =	bfq_attrs,

5164

++	.elevator_name =	"bfq",

5165

++	.elevator_owner =	THIS_MODULE,

5166

++};

5167

++

5168

++static int __init bfq_init(void)

5169

++{

5170

++	int ret;

5171

++

5172

++	/*

5173

++	 * Can be 0 on HZ < 1000 setups.

5174

++	 */

5175

++	if (bfq_slice_idle == 0)

5176

++		bfq_slice_idle = 1;

5177

++

5178

++	if (bfq_timeout_async == 0)

5179

++		bfq_timeout_async = 1;

5180

++

5181

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5182

++	ret = blkcg_policy_register(&blkcg_policy_bfq);

5183

++	if (ret)

5184

++		return ret;

5185

++#endif

5186

++

5187

++	ret = -ENOMEM;

5188

++	if (bfq_slab_setup())

5189

++		goto err_pol_unreg;

5190

++

5191

++	/*

5192

++	 * Times to load large popular applications for the typical systems

5193

++	 * installed on the reference devices (see the comments before the

5194

++	 * definitions of the two arrays).

5195

++	 */

5196

++	T_slow[0] = msecs_to_jiffies(2600);

5197

++	T_slow[1] = msecs_to_jiffies(1000);

5198

++	T_fast[0] = msecs_to_jiffies(5500);

5199

++	T_fast[1] = msecs_to_jiffies(2000);

5200

++

5201

++	/*

5202

++	 * Thresholds that determine the switch between speed classes (see

5203

++	 * the comments before the definition of the array).

5204

++	 */

5205

++	device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;

5206

++	device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;

5207

++

5208

++	ret = elv_register(&iosched_bfq);

5209

++	if (ret)

5210

++		goto err_pol_unreg;

5211

++

5212

++	pr_info("BFQ I/O-scheduler: v7r11");

5213

++

5214

++	return 0;

5215

++

5216

++err_pol_unreg:

5217

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5218

++	blkcg_policy_unregister(&blkcg_policy_bfq);

5219

++#endif

5220

++	return ret;

5221

++}

5222

++

5223

++static void __exit bfq_exit(void)

5224

++{

5225

++	elv_unregister(&iosched_bfq);

5226

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5227

++	blkcg_policy_unregister(&blkcg_policy_bfq);

5228

++#endif

5229

++	bfq_slab_kill();

5230

++}

5231

++

5232

++module_init(bfq_init);

5233

++module_exit(bfq_exit);

5234

++

5235

++MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente");

5236

++MODULE_LICENSE("GPL");

5237

+diff --git a/block/bfq-sched.c b/block/bfq-sched.c

5238

+new file mode 100644

5239

+index 0000000..a64fec1

5240

+--- /dev/null

5241

++++ b/block/bfq-sched.c

5242

+@@ -0,0 +1,1200 @@

5243

++/*

5244

++ * BFQ: Hierarchical B-WF2Q+ scheduler.

5245

++ *

5246

++ * Based on ideas and code from CFQ:

5247

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

5248

++ *

5249

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

5250

++ *		      Paolo Valente <paolo.valente@×××××××.it>

5251

++ *

5252

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

5253

++ */

5254

++

5255

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5256

/*
 * Walk from @entity up through its ancestors by following ->parent until
 * it becomes NULL.  @entity itself is the loop cursor.
 */
#define for_each_entity(entity)	\
	for (; (entity); (entity) = (entity)->parent)

/*
 * Same walk, but the parent is saved in @parent before the loop body
 * runs and the cursor is advanced from that saved value.
 */
#define for_each_entity_safe(entity, parent) \
	for (; (entity) && ({ (parent) = (entity)->parent; 1; }); \
	     (entity) = (parent))

5261

++

5262

++

5263

++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

5264

++						 int extract,

5265

++						 struct bfq_data *bfqd);

5266

++

5267

++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

5268

++

5269

++static void bfq_update_budget(struct bfq_entity *next_in_service)

5270

++{

5271

++	struct bfq_entity *bfqg_entity;

5272

++	struct bfq_group *bfqg;

5273

++	struct bfq_sched_data *group_sd;

5274

++

5275

++	BUG_ON(!next_in_service);

5276

++

5277

++	group_sd = next_in_service->sched_data;

5278

++

5279

++	bfqg = container_of(group_sd, struct bfq_group, sched_data);

5280

++	/*

5281

++	 * bfq_group's my_entity field is not NULL only if the group

5282

++	 * is not the root group. We must not touch the root entity

5283

++	 * as it must never become an in-service entity.

5284

++	 */

5285

++	bfqg_entity = bfqg->my_entity;

5286

++	if (bfqg_entity)

5287

++		bfqg_entity->budget = next_in_service->budget;

5288

++}

5289

++

5290

++static int bfq_update_next_in_service(struct bfq_sched_data *sd)

5291

++{

5292

++	struct bfq_entity *next_in_service;

5293

++

5294

++	if (sd->in_service_entity)

5295

++		/* will update/requeue at the end of service */

5296

++		return 0;

5297

++

5298

++	/*

5299

++	 * NOTE: this can be improved in many ways, such as returning

5300

++	 * 1 (and thus propagating upwards the update) only when the

5301

++	 * budget changes, or caching the bfqq that will be scheduled

5302

++	 * next from this subtree.  By now we worry more about

5303

++	 * correctness than about performance...

5304

++	 */

5305

++	next_in_service = bfq_lookup_next_entity(sd, 0, NULL);

5306

++	sd->next_in_service = next_in_service;

5307

++

5308

++	if (next_in_service)

5309

++		bfq_update_budget(next_in_service);

5310

++

5311

++	return 1;

5312

++}

5313

++

5314

++static void bfq_check_next_in_service(struct bfq_sched_data *sd,

5315

++				      struct bfq_entity *entity)

5316

++{

5317

++	BUG_ON(sd->next_in_service != entity);

5318

++}

5319

++#else

5320

/*
 * Variants used without group scheduling: the loop body runs at most
 * once, for @entity itself, because the cursor is set to NULL afterwards.
 */
#define for_each_entity(entity)	\
	for (; (entity); (entity) = NULL)

#define for_each_entity_safe(entity, parent) \
	for ((parent) = NULL; (entity); (entity) = (parent))

5325

++

5326

/* Without group scheduling there is nothing to update; always returns 0. */
static int bfq_update_next_in_service(struct bfq_sched_data *sd)
{
	return 0;
}

/* Without group scheduling this check is a no-op. */
static void bfq_check_next_in_service(struct bfq_sched_data *sd,
				      struct bfq_entity *entity)
{
}

/* Without group scheduling there is no group budget to update. */
static void bfq_update_budget(struct bfq_entity *next_in_service)
{
}

5339

++#endif

5340

++

5341

/*
 * Shift used in timestamp calculations.  Its value is a trade-off
 * between three quantities: the maximum service allowed in a single
 * timestamp delta (increased by small shift values), the maximum total
 * weight that can be used for the queues in the system (increased by big
 * shift values), and the period of virtual time wraparounds.
 */
#define WFQ_SERVICE_SHIFT	22

5349

++

5350

++/**

5351

++ * bfq_gt - compare two timestamps.

5352

++ * @a: first ts.

5353

++ * @b: second ts.

5354

++ *

5355

++ * Return @a > @b, dealing with wrapping correctly.

5356

++ */

5357

++static int bfq_gt(u64 a, u64 b)

5358

++{

5359

++	return (s64)(a - b) > 0;

5360

++}

5361

++

5362

++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)

5363

++{

5364

++	struct bfq_queue *bfqq = NULL;

5365

++

5366

++	BUG_ON(!entity);

5367

++

5368

++	if (!entity->my_sched_data)

5369

++		bfqq = container_of(entity, struct bfq_queue, entity);

5370

++

5371

++	return bfqq;

5372

++}

5373

++

5374

++

5375

++/**

5376

++ * bfq_delta - map service into the virtual time domain.

5377

++ * @service: amount of service.

5378

++ * @weight: scale factor (weight of an entity or weight sum).

5379

++ */

5380

++static u64 bfq_delta(unsigned long service, unsigned long weight)

5381

++{

5382

++	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

5383

++

5384

++	do_div(d, weight);

5385

++	return d;

5386

++}

5387

++

5388

++/**

5389

++ * bfq_calc_finish - assign the finish time to an entity.

5390

++ * @entity: the entity to act upon.

5391

++ * @service: the service to be charged to the entity.

5392

++ */

5393

++static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)

5394

++{

5395

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5396

++

5397

++	BUG_ON(entity->weight == 0);

5398

++

5399

++	entity->finish = entity->start +

5400

++		bfq_delta(service, entity->weight);

5401

++

5402

++	if (bfqq) {

5403

++		bfq_log_bfqq(bfqq->bfqd, bfqq,

5404

++			"calc_finish: serv %lu, w %d",

5405

++			service, entity->weight);

5406

++		bfq_log_bfqq(bfqq->bfqd, bfqq,

5407

++			"calc_finish: start %llu, finish %llu, delta %llu",

5408

++			entity->start, entity->finish,

5409

++			bfq_delta(service, entity->weight));

5410

++	}

5411

++}

5412

++

5413

++/**

5414

++ * bfq_entity_of - get an entity from a node.

5415

++ * @node: the node field of the entity.

5416

++ *

5417

++ * Convert a node pointer to the relative entity.  This is used only

5418

++ * to simplify the logic of some functions and not as the generic

5419

++ * conversion mechanism because, e.g., in the tree walking functions,

5420

++ * the check for a %NULL value would be redundant.

5421

++ */

5422

++static struct bfq_entity *bfq_entity_of(struct rb_node *node)

5423

++{

5424

++	struct bfq_entity *entity = NULL;

5425

++

5426

++	if (node)

5427

++		entity = rb_entry(node, struct bfq_entity, rb_node);

5428

++

5429

++	return entity;

5430

++}

5431

++

5432

++/**

5433

++ * bfq_extract - remove an entity from a tree.

5434

++ * @root: the tree root.

5435

++ * @entity: the entity to remove.

5436

++ */

5437

++static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)

5438

++{

5439

++	BUG_ON(entity->tree != root);

5440

++

5441

++	entity->tree = NULL;

5442

++	rb_erase(&entity->rb_node, root);

5443

++}

5444

++

5445

++/**

5446

++ * bfq_idle_extract - extract an entity from the idle tree.

5447

++ * @st: the service tree of the owning @entity.

5448

++ * @entity: the entity being removed.

5449

++ */

5450

++static void bfq_idle_extract(struct bfq_service_tree *st,

5451

++			     struct bfq_entity *entity)

5452

++{

5453

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5454

++	struct rb_node *next;

5455

++

5456

++	BUG_ON(entity->tree != &st->idle);

5457

++

5458

++	if (entity == st->first_idle) {

5459

++		next = rb_next(&entity->rb_node);

5460

++		st->first_idle = bfq_entity_of(next);

5461

++	}

5462

++

5463

++	if (entity == st->last_idle) {

5464

++		next = rb_prev(&entity->rb_node);

5465

++		st->last_idle = bfq_entity_of(next);

5466

++	}

5467

++

5468

++	bfq_extract(&st->idle, entity);

5469

++

5470

++	if (bfqq)

5471

++		list_del(&bfqq->bfqq_list);

5472

++}

5473

++

5474

++/**

5475

++ * bfq_insert - generic tree insertion.

5476

++ * @root: tree root.

5477

++ * @entity: entity to insert.

5478

++ *

5479

++ * This is used for the idle and the active tree, since they are both

5480

++ * ordered by finish time.

5481

++ */

5482

++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)

5483

++{

5484

++	struct bfq_entity *entry;

5485

++	struct rb_node **node = &root->rb_node;

5486

++	struct rb_node *parent = NULL;

5487

++

5488

++	BUG_ON(entity->tree);

5489

++

5490

++	while (*node) {

5491

++		parent = *node;

5492

++		entry = rb_entry(parent, struct bfq_entity, rb_node);

5493

++

5494

++		if (bfq_gt(entry->finish, entity->finish))

5495

++			node = &parent->rb_left;

5496

++		else

5497

++			node = &parent->rb_right;

5498

++	}

5499

++

5500

++	rb_link_node(&entity->rb_node, parent, node);

5501

++	rb_insert_color(&entity->rb_node, root);

5502

++

5503

++	entity->tree = root;

5504

++}

5505

++

5506

++/**

5507

++ * bfq_update_min - update the min_start field of a entity.

5508

++ * @entity: the entity to update.

5509

++ * @node: one of its children.

5510

++ *

5511

++ * This function is called when @entity may store an invalid value for

5512

++ * min_start due to updates to the active tree.  The function  assumes

5513

++ * that the subtree rooted at @node (which may be its left or its right

5514

++ * child) has a valid min_start value.

5515

++ */

5516

++static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)

5517

++{

5518

++	struct bfq_entity *child;

5519

++

5520

++	if (node) {

5521

++		child = rb_entry(node, struct bfq_entity, rb_node);

5522

++		if (bfq_gt(entity->min_start, child->min_start))

5523

++			entity->min_start = child->min_start;

5524

++	}

5525

++}

5526

++

5527

++/**

5528

++ * bfq_update_active_node - recalculate min_start.

5529

++ * @node: the node to update.

5530

++ *

5531

++ * @node may have changed position or one of its children may have moved,

5532

++ * this function updates its min_start value.  The left and right subtrees

5533

++ * are assumed to hold a correct min_start value.

5534

++ */

5535

++static void bfq_update_active_node(struct rb_node *node)

5536

++{

5537

++	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

5538

++

5539

++	entity->min_start = entity->start;

5540

++	bfq_update_min(entity, node->rb_right);

5541

++	bfq_update_min(entity, node->rb_left);

5542

++}

5543

++

5544

++/**

5545

++ * bfq_update_active_tree - update min_start for the whole active tree.

5546

++ * @node: the starting node.

5547

++ *

5548

++ * @node must be the deepest modified node after an update.  This function

5549

++ * updates its min_start using the values held by its children, assuming

5550

++ * that they did not change, and then updates all the nodes that may have

5551

++ * changed in the path to the root.  The only nodes that may have changed

5552

++ * are the ones in the path or their siblings.

5553

++ */

5554

++static void bfq_update_active_tree(struct rb_node *node)

5555

++{

5556

++	struct rb_node *parent;

5557

++

5558

++up:

5559

++	bfq_update_active_node(node);

5560

++

5561

++	parent = rb_parent(node);

5562

++	if (!parent)

5563

++		return;

5564

++

5565

++	if (node == parent->rb_left && parent->rb_right)

5566

++		bfq_update_active_node(parent->rb_right);

5567

++	else if (parent->rb_left)

5568

++		bfq_update_active_node(parent->rb_left);

5569

++

5570

++	node = parent;

5571

++	goto up;

5572

++}

5573

++

5574

++static void bfq_weights_tree_add(struct bfq_data *bfqd,

5575

++				 struct bfq_entity *entity,

5576

++				 struct rb_root *root);

5577

++

5578

++static void bfq_weights_tree_remove(struct bfq_data *bfqd,

5579

++				    struct bfq_entity *entity,

5580

++				    struct rb_root *root);

5581

++

5582

++

5583

++/**

5584

++ * bfq_active_insert - insert an entity in the active tree of its

5585

++ *                     group/device.

5586

++ * @st: the service tree of the entity.

5587

++ * @entity: the entity being inserted.

5588

++ *

5589

++ * The active tree is ordered by finish time, but an extra key is kept

5590

++ * per each node, containing the minimum value for the start times of

5591

++ * its children (and the node itself), so it's possible to search for

5592

++ * the eligible node with the lowest finish time in logarithmic time.

5593

++ */

5594

++static void bfq_active_insert(struct bfq_service_tree *st,

5595

++			      struct bfq_entity *entity)

5596

++{

5597

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5598

++	struct rb_node *node = &entity->rb_node;

5599

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5600

++	struct bfq_sched_data *sd = NULL;

5601

++	struct bfq_group *bfqg = NULL;

5602

++	struct bfq_data *bfqd = NULL;

5603

++#endif

5604

++

5605

++	bfq_insert(&st->active, entity);

5606

++

5607

++	if (node->rb_left)

5608

++		node = node->rb_left;

5609

++	else if (node->rb_right)

5610

++		node = node->rb_right;

5611

++

5612

++	bfq_update_active_tree(node);

5613

++

5614

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5615

++	sd = entity->sched_data;

5616

++	bfqg = container_of(sd, struct bfq_group, sched_data);

5617

++	BUG_ON(!bfqg);

5618

++	bfqd = (struct bfq_data *)bfqg->bfqd;

5619

++#endif

5620

++	if (bfqq)

5621

++		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);

5622

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5623

++	else { /* bfq_group */

5624

++		BUG_ON(!bfqd);

5625

++		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);

5626

++	}

5627

++	if (bfqg != bfqd->root_group) {

5628

++		BUG_ON(!bfqg);

5629

++		BUG_ON(!bfqd);

5630

++		bfqg->active_entities++;

5631

++		if (bfqg->active_entities == 2)

5632

++			bfqd->active_numerous_groups++;

5633

++	}

5634

++#endif

5635

++}

5636

++

5637

++/**

5638

++ * bfq_ioprio_to_weight - calc a weight from an ioprio.

5639

++ * @ioprio: the ioprio value to convert.

5640

++ */

5641

++static unsigned short bfq_ioprio_to_weight(int ioprio)

5642

++{

5643

++	BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);

5644

++	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio;

5645

++}

5646

++

5647

++/**

5648

++ * bfq_weight_to_ioprio - calc an ioprio from a weight.

5649

++ * @weight: the weight value to convert.

5650

++ *

5651

++ * To preserve as much as possible the old only-ioprio user interface,

5652

++ * 0 is used as an escape ioprio value for weights (numerically) equal or

5653

++ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.

5654

++ */

5655

++static unsigned short bfq_weight_to_ioprio(int weight)

5656

++{

5657

++	BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);

5658

++	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ?

5659

++		0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight;

5660

++}

5661

++

5662

++static void bfq_get_entity(struct bfq_entity *entity)

5663

++{

5664

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5665

++

5666

++	if (bfqq) {

5667

++		atomic_inc(&bfqq->ref);

5668

++		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",

5669

++			     bfqq, atomic_read(&bfqq->ref));

5670

++	}

5671

++}

5672

++

5673

++/**

5674

++ * bfq_find_deepest - find the deepest node that an extraction can modify.

5675

++ * @node: the node being removed.

5676

++ *

5677

++ * Do the first step of an extraction in an rb tree, looking for the

5678

++ * node that will replace @node, and returning the deepest node that

5679

++ * the following modifications to the tree can touch.  If @node is the

5680

++ * last node in the tree return %NULL.

5681

++ */

5682

++static struct rb_node *bfq_find_deepest(struct rb_node *node)

5683

++{

5684

++	struct rb_node *deepest;

5685

++

5686

++	if (!node->rb_right && !node->rb_left)

5687

++		deepest = rb_parent(node);

5688

++	else if (!node->rb_right)

5689

++		deepest = node->rb_left;

5690

++	else if (!node->rb_left)

5691

++		deepest = node->rb_right;

5692

++	else {

5693

++		deepest = rb_next(node);

5694

++		if (deepest->rb_right)

5695

++			deepest = deepest->rb_right;

5696

++		else if (rb_parent(deepest) != node)

5697

++			deepest = rb_parent(deepest);

5698

++	}

5699

++

5700

++	return deepest;

5701

++}

5702

++

5703

++/**

5704

++ * bfq_active_extract - remove an entity from the active tree.

5705

++ * @st: the service_tree containing the tree.

5706

++ * @entity: the entity being removed.

5707

++ */

5708

++static void bfq_active_extract(struct bfq_service_tree *st,

5709

++			       struct bfq_entity *entity)

5710

++{

5711

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5712

++	struct rb_node *node;

5713

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5714

++	struct bfq_sched_data *sd = NULL;

5715

++	struct bfq_group *bfqg = NULL;

5716

++	struct bfq_data *bfqd = NULL;

5717

++#endif

5718

++

5719

++	node = bfq_find_deepest(&entity->rb_node);

5720

++	bfq_extract(&st->active, entity);

5721

++

5722

++	if (node)

5723

++		bfq_update_active_tree(node);

5724

++

5725

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5726

++	sd = entity->sched_data;

5727

++	bfqg = container_of(sd, struct bfq_group, sched_data);

5728

++	BUG_ON(!bfqg);

5729

++	bfqd = (struct bfq_data *)bfqg->bfqd;

5730

++#endif

5731

++	if (bfqq)

5732

++		list_del(&bfqq->bfqq_list);

5733

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5734

++	else { /* bfq_group */

5735

++		BUG_ON(!bfqd);

5736

++		bfq_weights_tree_remove(bfqd, entity,

5737

++					&bfqd->group_weights_tree);

5738

++	}

5739

++	if (bfqg != bfqd->root_group) {

5740

++		BUG_ON(!bfqg);

5741

++		BUG_ON(!bfqd);

5742

++		BUG_ON(!bfqg->active_entities);

5743

++		bfqg->active_entities--;

5744

++		if (bfqg->active_entities == 1) {

5745

++			BUG_ON(!bfqd->active_numerous_groups);

5746

++			bfqd->active_numerous_groups--;

5747

++		}

5748

++	}

5749

++#endif

5750

++}

5751

++

5752

++/**

5753

++ * bfq_idle_insert - insert an entity into the idle tree.

5754

++ * @st: the service tree containing the tree.

5755

++ * @entity: the entity to insert.

5756

++ */

5757

++static void bfq_idle_insert(struct bfq_service_tree *st,

5758

++			    struct bfq_entity *entity)

5759

++{

5760

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5761

++	struct bfq_entity *first_idle = st->first_idle;

5762

++	struct bfq_entity *last_idle = st->last_idle;

5763

++

5764

++	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))

5765

++		st->first_idle = entity;

5766

++	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))

5767

++		st->last_idle = entity;

5768

++

5769

++	bfq_insert(&st->idle, entity);

5770

++

5771

++	if (bfqq)

5772

++		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);

5773

++}

5774

++

5775

++/**

5776

++ * bfq_forget_entity - remove an entity from the wfq trees.

5777

++ * @st: the service tree.

5778

++ * @entity: the entity being removed.

5779

++ *

5780

++ * Update the device status and forget everything about @entity, putting

5781

++ * the device reference to it, if it is a queue.  Entities belonging to

5782

++ * groups are not refcounted.

5783

++ */

5784

++static void bfq_forget_entity(struct bfq_service_tree *st,

5785

++			      struct bfq_entity *entity)

5786

++{

5787

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5788

++	struct bfq_sched_data *sd;

5789

++

5790

++	BUG_ON(!entity->on_st);

5791

++

5792

++	entity->on_st = 0;

5793

++	st->wsum -= entity->weight;

5794

++	if (bfqq) {

5795

++		sd = entity->sched_data;

5796

++		bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",

5797

++			     bfqq, atomic_read(&bfqq->ref));

5798

++		bfq_put_queue(bfqq);

5799

++	}

5800

++}

5801

++

5802

++/**

5803

++ * bfq_put_idle_entity - release the idle tree ref of an entity.

5804

++ * @st: service tree for the entity.

5805

++ * @entity: the entity being released.

5806

++ */

5807

++static void bfq_put_idle_entity(struct bfq_service_tree *st,

5808

++				struct bfq_entity *entity)

5809

++{

5810

++	bfq_idle_extract(st, entity);

5811

++	bfq_forget_entity(st, entity);

5812

++}

5813

++

5814

++/**

5815

++ * bfq_forget_idle - update the idle tree if necessary.

5816

++ * @st: the service tree to act upon.

5817

++ *

5818

++ * To preserve the global O(log N) complexity we only remove one entry here;

5819

++ * as the idle tree will not grow indefinitely this can be done safely.

5820

++ */

5821

++static void bfq_forget_idle(struct bfq_service_tree *st)

5822

++{

5823

++	struct bfq_entity *first_idle = st->first_idle;

5824

++	struct bfq_entity *last_idle = st->last_idle;

5825

++

5826

++	if (RB_EMPTY_ROOT(&st->active) && last_idle &&

5827

++	    !bfq_gt(last_idle->finish, st->vtime)) {

5828

++		/*

5829

++		 * Forget the whole idle tree, increasing the vtime past

5830

++		 * the last finish time of idle entities.

5831

++		 */

5832

++		st->vtime = last_idle->finish;

5833

++	}

5834

++

5835

++	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))

5836

++		bfq_put_idle_entity(st, first_idle);

5837

++}

5838

++

5839

++static struct bfq_service_tree *

5840

++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,

5841

++			 struct bfq_entity *entity)

5842

++{

5843

++	struct bfq_service_tree *new_st = old_st;

5844

++

5845

++	if (entity->prio_changed) {

5846

++		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5847

++		unsigned short prev_weight, new_weight;

5848

++		struct bfq_data *bfqd = NULL;

5849

++		struct rb_root *root;

5850

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5851

++		struct bfq_sched_data *sd;

5852

++		struct bfq_group *bfqg;

5853

++#endif

5854

++

5855

++		if (bfqq)

5856

++			bfqd = bfqq->bfqd;

5857

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5858

++		else {

5859

++			sd = entity->my_sched_data;

5860

++			bfqg = container_of(sd, struct bfq_group, sched_data);

5861

++			BUG_ON(!bfqg);

5862

++			bfqd = (struct bfq_data *)bfqg->bfqd;

5863

++			BUG_ON(!bfqd);

5864

++		}

5865

++#endif

5866

++

5867

++		BUG_ON(old_st->wsum < entity->weight);

5868

++		old_st->wsum -= entity->weight;

5869

++

5870

++		if (entity->new_weight != entity->orig_weight) {

5871

++			if (entity->new_weight < BFQ_MIN_WEIGHT ||

5872

++			    entity->new_weight > BFQ_MAX_WEIGHT) {

5873

++				printk(KERN_CRIT "update_weight_prio: "

5874

++						 "new_weight %d\n",

5875

++					entity->new_weight);

5876

++				BUG();

5877

++			}

5878

++			entity->orig_weight = entity->new_weight;

5879

++			if (bfqq)

5880

++				bfqq->ioprio =

5881

++				  bfq_weight_to_ioprio(entity->orig_weight);

5882

++		}

5883

++

5884

++		if (bfqq)

5885

++			bfqq->ioprio_class = bfqq->new_ioprio_class;

5886

++		entity->prio_changed = 0;

5887

++

5888

++		/*

5889

++		 * NOTE: here we may be changing the weight too early,

5890

++		 * this will cause unfairness.  The correct approach

5891

++		 * would have required additional complexity to defer

5892

++		 * weight changes to the proper time instants (i.e.,

5893

++		 * when entity->finish <= old_st->vtime).

5894

++		 */

5895

++		new_st = bfq_entity_service_tree(entity);

5896

++

5897

++		prev_weight = entity->weight;

5898

++		new_weight = entity->orig_weight *

5899

++			     (bfqq ? bfqq->wr_coeff : 1);

5900

++		/*

5901

++		 * If the weight of the entity changes, remove the entity

5902

++		 * from its old weight counter (if there is a counter

5903

++		 * associated with the entity), and add it to the counter

5904

++		 * associated with its new weight.

5905

++		 */

5906

++		if (prev_weight != new_weight) {

5907

++			root = bfqq ? &bfqd->queue_weights_tree :

5908

++				      &bfqd->group_weights_tree;

5909

++			bfq_weights_tree_remove(bfqd, entity, root);

5910

++		}

5911

++		entity->weight = new_weight;

5912

++		/*

5913

++		 * Add the entity to its weights tree only if it is

5914

++		 * not associated with a weight-raised queue.

5915

++		 */

5916

++		if (prev_weight != new_weight &&

5917

++		    (bfqq ? bfqq->wr_coeff == 1 : 1))

5918

++			/* If we get here, root has been initialized. */

5919

++			bfq_weights_tree_add(bfqd, entity, root);

5920

++

5921

++		new_st->wsum += entity->weight;

5922

++

5923

++		if (new_st != old_st)

5924

++			entity->start = new_st->vtime;

5925

++	}

5926

++

5927

++	return new_st;

5928

++}

5929

++

5930

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5931

++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);

5932

++#endif

5933

++

5934

++/**

5935

++ * bfq_bfqq_served - update the scheduler status after selection for

5936

++ *                   service.

5937

++ * @bfqq: the queue being served.

5938

++ * @served: bytes to transfer.

5939

++ *

5940

++ * NOTE: this can be optimized, as the timestamps of upper level entities

5941

++ * are synchronized every time a new bfqq is selected for service.  By now,

5942

++ * we keep it to better check consistency.

5943

++ */

5944

++static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)

5945

++{

5946

++	struct bfq_entity *entity = &bfqq->entity;

5947

++	struct bfq_service_tree *st;

5948

++

5949

++	for_each_entity(entity) {

5950

++		st = bfq_entity_service_tree(entity);

5951

++

5952

++		entity->service += served;

5953

++		BUG_ON(entity->service > entity->budget);

5954

++		BUG_ON(st->wsum == 0);

5955

++

5956

++		st->vtime += bfq_delta(served, st->wsum);

5957

++		bfq_forget_idle(st);

5958

++	}

5959

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

5960

++	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));

5961

++#endif

5962

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);

5963

++}

5964

++

5965

++/**

5966

++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.

5967

++ * @bfqq: the queue that needs a service update.

5968

++ *

5969

++ * When it's not possible to be fair in the service domain, because

5970

++ * a queue is not consuming its budget fast enough (the meaning of

5971

++ * fast depends on the timeout parameter), we charge it a full

5972

++ * budget.  In this way we should obtain a sort of time-domain

5973

++ * fairness among all the seeky/slow queues.

5974

++ */

5975

++static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)

5976

++{

5977

++	struct bfq_entity *entity = &bfqq->entity;

5978

++

5979

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");

5980

++

5981

++	bfq_bfqq_served(bfqq, entity->budget - entity->service);

5982

++}

5983

++

5984

++/**

5985

++ * __bfq_activate_entity - activate an entity.

5986

++ * @entity: the entity being activated.

5987

++ *

5988

++ * Called whenever an entity is activated, i.e., it is not active and one

5989

++ * of its children receives a new request, or has to be reactivated due to

5990

++ * budget exhaustion.  It uses the current budget of the entity (and the

5991

++ * service received if @entity is active) of the queue to calculate its

5992

++ * timestamps.

5993

++ */

5994

++static void __bfq_activate_entity(struct bfq_entity *entity)

5995

++{

5996

++	struct bfq_sched_data *sd = entity->sched_data;

5997

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

5998

++

5999

++	if (entity == sd->in_service_entity) {

6000

++		BUG_ON(entity->tree);

6001

++		/*

6002

++		 * If we are requeueing the current entity we have

6003

++		 * to take care of not charging to it service it has

6004

++		 * not received.

6005

++		 */

6006

++		bfq_calc_finish(entity, entity->service);

6007

++		entity->start = entity->finish;

6008

++		sd->in_service_entity = NULL;

6009

++	} else if (entity->tree == &st->active) {

6010

++		/*

6011

++		 * Requeueing an entity due to a change of some

6012

++		 * next_in_service entity below it.  We reuse the

6013

++		 * old start time.

6014

++		 */

6015

++		bfq_active_extract(st, entity);

6016

++	} else if (entity->tree == &st->idle) {

6017

++		/*

6018

++		 * Must be on the idle tree, bfq_idle_extract() will

6019

++		 * check for that.

6020

++		 */

6021

++		bfq_idle_extract(st, entity);

6022

++		entity->start = bfq_gt(st->vtime, entity->finish) ?

6023

++				       st->vtime : entity->finish;

6024

++	} else {

6025

++		/*

6026

++		 * The finish time of the entity may be invalid, and

6027

++		 * it is in the past for sure, otherwise the queue

6028

++		 * would have been on the idle tree.

6029

++		 */

6030

++		entity->start = st->vtime;

6031

++		st->wsum += entity->weight;

6032

++		bfq_get_entity(entity);

6033

++

6034

++		BUG_ON(entity->on_st);

6035

++		entity->on_st = 1;

6036

++	}

6037

++

6038

++	st = __bfq_entity_update_weight_prio(st, entity);

6039

++	bfq_calc_finish(entity, entity->budget);

6040

++	bfq_active_insert(st, entity);

6041

++}

6042

++

6043

++/**

6044

++ * bfq_activate_entity - activate an entity and its ancestors if necessary.

6045

++ * @entity: the entity to activate.

6046

++ *

6047

++ * Activate @entity and all the entities on the path from it to the root.

6048

++ */

6049

++static void bfq_activate_entity(struct bfq_entity *entity)

6050

++{

6051

++	struct bfq_sched_data *sd;

6052

++

6053

++	for_each_entity(entity) {

6054

++		__bfq_activate_entity(entity);

6055

++

6056

++		sd = entity->sched_data;

6057

++		if (!bfq_update_next_in_service(sd))

6058

++			/*

6059

++			 * No need to propagate the activation to the

6060

++			 * upper entities, as they will be updated when

6061

++			 * the in-service entity is rescheduled.

6062

++			 */

6063

++			break;

6064

++	}

6065

++}

6066

++

6067

++/**

6068

++ * __bfq_deactivate_entity - deactivate an entity from its service tree.

6069

++ * @entity: the entity to deactivate.

6070

++ * @requeue: if false, the entity will not be put into the idle tree.

6071

++ *

6072

++ * Deactivate an entity, independently from its previous state.  If the

6073

++ * entity was not on a service tree just return, otherwise if it is on

6074

++ * any scheduler tree, extract it from that tree, and if necessary

6075

++ * and if the caller did specify @requeue, put it on the idle tree.

6076

++ *

6077

++ * Return %1 if the caller should update the entity hierarchy, i.e.,

6078

++ * if the entity was in service or if it was the next_in_service for

6079

++ * its sched_data; return %0 otherwise.

6080

++ */

6081

++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)

6082

++{

6083

++	struct bfq_sched_data *sd = entity->sched_data;

6084

++	struct bfq_service_tree *st;

6085

++	int was_in_service;

6086

++	int ret = 0;

6087

++

6088

++	if (sd == NULL || !entity->on_st) /* never activated, or inactive */

6089

++		return 0;

6090

++

6091

++	st = bfq_entity_service_tree(entity);

6092

++	was_in_service = entity == sd->in_service_entity;

6093

++

6094

++	BUG_ON(was_in_service && entity->tree);

6095

++

6096

++	if (was_in_service) {

6097

++		bfq_calc_finish(entity, entity->service);

6098

++		sd->in_service_entity = NULL;

6099

++	} else if (entity->tree == &st->active)

6100

++		bfq_active_extract(st, entity);

6101

++	else if (entity->tree == &st->idle)

6102

++		bfq_idle_extract(st, entity);

6103

++	else if (entity->tree)

6104

++		BUG();

6105

++

6106

++	if (was_in_service || sd->next_in_service == entity)

6107

++		ret = bfq_update_next_in_service(sd);

6108

++

6109

++	if (!requeue || !bfq_gt(entity->finish, st->vtime))

6110

++		bfq_forget_entity(st, entity);

6111

++	else

6112

++		bfq_idle_insert(st, entity);

6113

++

6114

++	BUG_ON(sd->in_service_entity == entity);

6115

++	BUG_ON(sd->next_in_service == entity);

6116

++

6117

++	return ret;

6118

++}

6119

++

6120

++/**

6121

++ * bfq_deactivate_entity - deactivate an entity.

6122

++ * @entity: the entity to deactivate.

6123

++ * @requeue: true if the entity can be put on the idle tree

6124

++ */

6125

++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)

6126

++{

6127

++	struct bfq_sched_data *sd;

6128

++	struct bfq_entity *parent;

6129

++

6130

++	for_each_entity_safe(entity, parent) {

6131

++		sd = entity->sched_data;

6132

++

6133

++		if (!__bfq_deactivate_entity(entity, requeue))

6134

++			/*

6135

++			 * The parent entity is still backlogged, and

6136

++			 * we don't need to update it as it is still

6137

++			 * in service.

6138

++			 */

6139

++			break;

6140

++

6141

++		if (sd->next_in_service)

6142

++			/*

6143

++			 * The parent entity is still backlogged and

6144

++			 * the budgets on the path towards the root

6145

++			 * need to be updated.

6146

++			 */

6147

++			goto update;

6148

++

6149

++		/*

6150

++		 * If we reach here the parent is no longer backlogged and

6151

++		 * we want to propagate the dequeue upwards.

6152

++		 */

6153

++		requeue = 1;

6154

++	}

6155

++

6156

++	return;

6157

++

6158

++update:

6159

++	entity = parent;

6160

++	for_each_entity(entity) {

6161

++		__bfq_activate_entity(entity);

6162

++

6163

++		sd = entity->sched_data;

6164

++		if (!bfq_update_next_in_service(sd))

6165

++			break;

6166

++	}

6167

++}

6168

++

6169

++/**

6170

++ * bfq_update_vtime - update vtime if necessary.

6171

++ * @st: the service tree to act upon.

6172

++ *

6173

++ * If necessary update the service tree vtime to have at least one

6174

++ * eligible entity, skipping to its start time.  Assumes that the

6175

++ * active tree of the device is not empty.

6176

++ *

6177

++ * NOTE: this hierarchical implementation updates vtimes quite often,

6178

++ * we may end up with reactivated processes getting timestamps after a

6179

++ * vtime skip done because we needed a ->first_active entity on some

6180

++ * intermediate node.

6181

++ */

6182

++static void bfq_update_vtime(struct bfq_service_tree *st)

6183

++{

6184

++	struct bfq_entity *entry;

6185

++	struct rb_node *node = st->active.rb_node;

6186

++

6187

++	entry = rb_entry(node, struct bfq_entity, rb_node);

6188

++	if (bfq_gt(entry->min_start, st->vtime)) {

6189

++		st->vtime = entry->min_start;

6190

++		bfq_forget_idle(st);

6191

++	}

6192

++}

6193

++

6194

++/**

6195

++ * bfq_first_active_entity - find the eligible entity with

6196

++ *                           the smallest finish time

6197

++ * @st: the service tree to select from.

6198

++ *

6199

++ * This function searches the first schedulable entity, starting from the

6200

++ * root of the tree and going on the left every time on this side there is

6201

++ * a subtree with at least one eligible (start <= vtime) entity. The path on

6202

++ * the right is followed only if a) the left subtree contains no eligible

6203

++ * entities and b) no eligible entity has been found yet.

6204

++ */

6205

++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)

6206

++{

6207

++	struct bfq_entity *entry, *first = NULL;

6208

++	struct rb_node *node = st->active.rb_node;

6209

++

6210

++	while (node) {

6211

++		entry = rb_entry(node, struct bfq_entity, rb_node);

6212

++left:

6213

++		if (!bfq_gt(entry->start, st->vtime))

6214

++			first = entry;

6215

++

6216

++		BUG_ON(bfq_gt(entry->min_start, st->vtime));

6217

++

6218

++		if (node->rb_left) {

6219

++			entry = rb_entry(node->rb_left,

6220

++					 struct bfq_entity, rb_node);

6221

++			if (!bfq_gt(entry->min_start, st->vtime)) {

6222

++				node = node->rb_left;

6223

++				goto left;

6224

++			}

6225

++		}

6226

++		if (first)

6227

++			break;

6228

++		node = node->rb_right;

6229

++	}

6230

++

6231

++	BUG_ON(!first && !RB_EMPTY_ROOT(&st->active));

6232

++	return first;

6233

++}

6234

++

6235

++/**

6236

++ * __bfq_lookup_next_entity - return the first eligible entity in @st.

6237

++ * @st: the service tree.

6238

++ *

6239

++ * Update the virtual time in @st and return the first eligible entity

6240

++ * it contains.

6241

++ */

6242

++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,

6243

++						   bool force)

6244

++{

6245

++	struct bfq_entity *entity, *new_next_in_service = NULL;

6246

++

6247

++	if (RB_EMPTY_ROOT(&st->active))

6248

++		return NULL;

6249

++

6250

++	bfq_update_vtime(st);

6251

++	entity = bfq_first_active_entity(st);

6252

++	BUG_ON(bfq_gt(entity->start, st->vtime));

6253

++

6254

++	/*

6255

++	 * If the chosen entity does not match with the sched_data's

6256

++	 * next_in_service and we are forcedly serving the IDLE priority

6257

++	 * class tree, bubble up budget update.

6258

++	 */

6259

++	if (unlikely(force && entity != entity->sched_data->next_in_service)) {

6260

++		new_next_in_service = entity;

6261

++		for_each_entity(new_next_in_service)

6262

++			bfq_update_budget(new_next_in_service);

6263

++	}

6264

++

6265

++	return entity;

6266

++}

6267

++

6268

++/**

6269

++ * bfq_lookup_next_entity - return the first eligible entity in @sd.

6270

++ * @sd: the sched_data.

6271

++ * @extract: if true the returned entity will be also extracted from @sd.

6272

++ *

6273

++ * NOTE: since we cache the next_in_service entity at each level of the

6274

++ * hierarchy, the complexity of the lookup can be decreased with

6275

++ * absolutely no effort just returning the cached next_in_service value;

6276

++ * we prefer to do full lookups to test the consistency of the data

6277

++ * structures.

6278

++ */

6279

++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

6280

++						 int extract,

6281

++						 struct bfq_data *bfqd)

6282

++{

6283

++	struct bfq_service_tree *st = sd->service_tree;

6284

++	struct bfq_entity *entity;

6285

++	int i = 0;

6286

++

6287

++	BUG_ON(sd->in_service_entity);

6288

++

6289

++	if (bfqd &&

6290

++	    jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {

6291

++		entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,

6292

++						  true);

6293

++		if (entity) {

6294

++			i = BFQ_IOPRIO_CLASSES - 1;

6295

++			bfqd->bfq_class_idle_last_service = jiffies;

6296

++			sd->next_in_service = entity;

6297

++		}

6298

++	}

6299

++	for (; i < BFQ_IOPRIO_CLASSES; i++) {

6300

++		entity = __bfq_lookup_next_entity(st + i, false);

6301

++		if (entity) {

6302

++			if (extract) {

6303

++				bfq_check_next_in_service(sd, entity);

6304

++				bfq_active_extract(st + i, entity);

6305

++				sd->in_service_entity = entity;

6306

++				sd->next_in_service = NULL;

6307

++			}

6308

++			break;

6309

++		}

6310

++	}

6311

++

6312

++	return entity;

6313

++}

6314

++

6315

++/*

6316

++ * Get next queue for service.

6317

++ */

6318

++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)

6319

++{

6320

++	struct bfq_entity *entity = NULL;

6321

++	struct bfq_sched_data *sd;

6322

++	struct bfq_queue *bfqq;

6323

++

6324

++	BUG_ON(bfqd->in_service_queue);

6325

++

6326

++	if (bfqd->busy_queues == 0)

6327

++		return NULL;

6328

++

6329

++	sd = &bfqd->root_group->sched_data;

6330

++	for (; sd ; sd = entity->my_sched_data) {

6331

++		entity = bfq_lookup_next_entity(sd, 1, bfqd);

6332

++		BUG_ON(!entity);

6333

++		entity->service = 0;

6334

++	}

6335

++

6336

++	bfqq = bfq_entity_to_bfqq(entity);

6337

++	BUG_ON(!bfqq);

6338

++

6339

++	return bfqq;

6340

++}

6341

++

6342

++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)

6343

++{

6344

++	if (bfqd->in_service_bic) {

6345

++		put_io_context(bfqd->in_service_bic->icq.ioc);

6346

++		bfqd->in_service_bic = NULL;

6347

++	}

6348

++

6349

++	bfqd->in_service_queue = NULL;

6350

++	del_timer(&bfqd->idle_slice_timer);

6351

++}

6352

++

6353

++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

6354

++				int requeue)

6355

++{

6356

++	struct bfq_entity *entity = &bfqq->entity;

6357

++

6358

++	if (bfqq == bfqd->in_service_queue)

6359

++		__bfq_bfqd_reset_in_service(bfqd);

6360

++

6361

++	bfq_deactivate_entity(entity, requeue);

6362

++}

6363

++

6364

++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

6365

++{

6366

++	struct bfq_entity *entity = &bfqq->entity;

6367

++

6368

++	bfq_activate_entity(entity);

6369

++}

6370

++

6371

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

6372

++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);

6373

++#endif

6374

++

6375

++/*

6376

++ * Called when the bfqq no longer has requests pending, remove it from

6377

++ * the service tree.

6378

++ */

6379

++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,

6380

++			      int requeue)

6381

++{

6382

++	BUG_ON(!bfq_bfqq_busy(bfqq));

6383

++	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

6384

++

6385

++	bfq_log_bfqq(bfqd, bfqq, "del from busy");

6386

++

6387

++	bfq_clear_bfqq_busy(bfqq);

6388

++

6389

++	BUG_ON(bfqd->busy_queues == 0);

6390

++	bfqd->busy_queues--;

6391

++

6392

++	if (!bfqq->dispatched) {

6393

++		bfq_weights_tree_remove(bfqd, &bfqq->entity,

6394

++					&bfqd->queue_weights_tree);

6395

++		if (!blk_queue_nonrot(bfqd->queue)) {

6396

++			BUG_ON(!bfqd->busy_in_flight_queues);

6397

++			bfqd->busy_in_flight_queues--;

6398

++			if (bfq_bfqq_constantly_seeky(bfqq)) {

6399

++				BUG_ON(!bfqd->

6400

++					const_seeky_busy_in_flight_queues);

6401

++				bfqd->const_seeky_busy_in_flight_queues--;

6402

++			}

6403

++		}

6404

++	}

6405

++	if (bfqq->wr_coeff > 1)

6406

++		bfqd->wr_busy_queues--;

6407

++

6408

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

6409

++	bfqg_stats_update_dequeue(bfqq_group(bfqq));

6410

++#endif

6411

++

6412

++	bfq_deactivate_bfqq(bfqd, bfqq, requeue);

6413

++}

6414

++

6415

++/*

6416

++ * Called when an inactive queue receives a new request.

6417

++ */

6418

++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)

6419

++{

6420

++	BUG_ON(bfq_bfqq_busy(bfqq));

6421

++	BUG_ON(bfqq == bfqd->in_service_queue);

6422

++

6423

++	bfq_log_bfqq(bfqd, bfqq, "add to busy");

6424

++

6425

++	bfq_activate_bfqq(bfqd, bfqq);

6426

++

6427

++	bfq_mark_bfqq_busy(bfqq);

6428

++	bfqd->busy_queues++;

6429

++

6430

++	if (!bfqq->dispatched) {

6431

++		if (bfqq->wr_coeff == 1)

6432

++			bfq_weights_tree_add(bfqd, &bfqq->entity,

6433

++					     &bfqd->queue_weights_tree);

6434

++		if (!blk_queue_nonrot(bfqd->queue)) {

6435

++			bfqd->busy_in_flight_queues++;

6436

++			if (bfq_bfqq_constantly_seeky(bfqq))

6437

++				bfqd->const_seeky_busy_in_flight_queues++;

6438

++		}

6439

++	}

6440

++	if (bfqq->wr_coeff > 1)

6441

++		bfqd->wr_busy_queues++;

6442

++}

6443

+diff --git a/block/bfq.h b/block/bfq.h

6444

+new file mode 100644

6445

+index 0000000..3bb7df2

6446

+--- /dev/null

6447

++++ b/block/bfq.h

6448

+@@ -0,0 +1,801 @@

6449

++/*

6450

++ * BFQ-v7r11 for 4.4.0: data structures and common functions prototypes.

6451

++ *

6452

++ * Based on ideas and code from CFQ:

6453

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

6454

++ *

6455

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

6456

++ *		      Paolo Valente <paolo.valente@×××××××.it>

6457

++ *

6458

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

6459

++ */

6460

++

6461

++#ifndef _BFQ_H

6462

++#define _BFQ_H

6463

++

6464

++#include <linux/blktrace_api.h>

6465

++#include <linux/hrtimer.h>

6466

++#include <linux/ioprio.h>

6467

++#include <linux/rbtree.h>

6468

++#include <linux/blk-cgroup.h>

6469

++

6470

++#define BFQ_IOPRIO_CLASSES	3

6471

++#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

6472

++

6473

++#define BFQ_MIN_WEIGHT			1

6474

++#define BFQ_MAX_WEIGHT			1000

6475

++#define BFQ_WEIGHT_CONVERSION_COEFF	10

6476

++

6477

++#define BFQ_DEFAULT_QUEUE_IOPRIO	4

6478

++

6479

++#define BFQ_DEFAULT_GRP_WEIGHT	10

6480

++#define BFQ_DEFAULT_GRP_IOPRIO	0

6481

++#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

6482

++

6483

++struct bfq_entity;

6484

++

6485

++/**

6486

++ * struct bfq_service_tree - per ioprio_class service tree.

6487

++ * @active: tree for active entities (i.e., those backlogged).

6488

++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).

6489

++ * @first_idle: idle entity with minimum F_i.

6490

++ * @last_idle: idle entity with maximum F_i.

6491

++ * @vtime: scheduler virtual time.

6492

++ * @wsum: scheduler weight sum; active and idle entities contribute to it.

6493

++ *

6494

++ * Each service tree represents a B-WF2Q+ scheduler on its own.  Each

6495

++ * ioprio_class has its own independent scheduler, and so its own

6496

++ * bfq_service_tree.  All the fields are protected by the queue lock

6497

++ * of the containing bfqd.

6498

++ */

6499

++struct bfq_service_tree {

6500

++	struct rb_root active;

6501

++	struct rb_root idle;

6502

++

6503

++	struct bfq_entity *first_idle;

6504

++	struct bfq_entity *last_idle;

6505

++

6506

++	u64 vtime;

6507

++	unsigned long wsum;

6508

++};

6509

++

6510

++/**

6511

++ * struct bfq_sched_data - multi-class scheduler.

6512

++ * @in_service_entity: entity in service.

6513

++ * @next_in_service: head-of-the-line entity in the scheduler.

6514

++ * @service_tree: array of service trees, one per ioprio_class.

6515

++ *

6516

++ * bfq_sched_data is the basic scheduler queue.  It supports three

6517

++ * ioprio_classes, and can be used either as a toplevel queue or as

6518

++ * an intermediate queue on a hierarchical setup.

6519

++ * @next_in_service points to the active entity of the sched_data

6520

++ * service trees that will be scheduled next.

6521

++ *

6522

++ * The supported ioprio_classes are the same as in CFQ, in descending

6523

++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.

6524

++ * Requests from higher priority queues are served before all the

6525

++ * requests from lower priority queues; among requests of the same

6526

++ * queue requests are served according to B-WF2Q+.

6527

++ * All the fields are protected by the queue lock of the containing bfqd.

6528

++ */

6529

++struct bfq_sched_data {

6530

++	struct bfq_entity *in_service_entity;

6531

++	struct bfq_entity *next_in_service;

6532

++	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];

6533

++};

6534

++

6535

++/**

6536

++ * struct bfq_weight_counter - counter of the number of all active entities

6537

++ *                             with a given weight.

6538

++ * @weight: weight of the entities that this counter refers to.

6539

++ * @num_active: number of active entities with this weight.

6540

++ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree

6541

++ *                and @group_weights_tree).

6542

++ */

6543

++struct bfq_weight_counter {

6544

++	short int weight;

6545

++	unsigned int num_active;

6546

++	struct rb_node weights_node;

6547

++};

6548

++

6549

++/**

6550

++ * struct bfq_entity - schedulable entity.

6551

++ * @rb_node: service_tree member.

6552

++ * @weight_counter: pointer to the weight counter associated with this entity.

6553

++ * @on_st: flag, true if the entity is on a tree (either the active or

6554

++ *         the idle one of its service_tree).

6555

++ * @finish: B-WF2Q+ finish timestamp (aka F_i).

6556

++ * @start: B-WF2Q+ start timestamp (aka S_i).

6557

++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.

6558

++ * @min_start: minimum start time of the (active) subtree rooted at

6559

++ *             this entity; used for O(log N) lookups into active trees.

6560

++ * @service: service received during the last round of service.

6561

++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.

6562

++ * @weight: weight of the queue

6563

++ * @parent: parent entity, for hierarchical scheduling.

6564

++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the

6565

++ *                 associated scheduler queue, %NULL on leaf nodes.

6566

++ * @sched_data: the scheduler queue this entity belongs to.

6567

++ * @ioprio: the ioprio in use.

6568

++ * @new_weight: when a weight change is requested, the new weight value.

6569

++ * @orig_weight: original weight, used to implement weight boosting

6570

++ * @prio_changed: flag, true when the user requested a weight, ioprio or

6571

++ *		  ioprio_class change.

6572

++ *

6573

++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the

6574

++ * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each

6575

++ * entity belongs to the sched_data of the parent group in the cgroup

6576

++ * hierarchy.  Non-leaf entities have also their own sched_data, stored

6577

++ * in @my_sched_data.

6578

++ *

6579

++ * Each entity stores independently its priority values; this would

6580

++ * allow different weights on different devices, but this

6581

++ * functionality is not exported to userspace by now.  Priorities and

6582

++ * weights are updated lazily, first storing the new values into the

6583

++ * new_* fields, then setting the @prio_changed flag.  As soon as

6584

++ * there is a transition in the entity state that allows the priority

6585

++ * update to take place the effective and the requested priority

6586

++ * values are synchronized.

6587

++ *

6588

++ * Unless cgroups are used, the weight value is calculated from the

6589

++ * ioprio to export the same interface as CFQ.  When dealing with

6590

++ * ``well-behaved'' queues (i.e., queues that do not spend too much

6591

++ * time to consume their budget and have true sequential behavior, and

6592

++ * when there are no external factors breaking anticipation) the

6593

++ * relative weights at each level of the cgroups hierarchy should be

6594

++ * guaranteed.  All the fields are protected by the queue lock of the

6595

++ * containing bfqd.

6596

++ */

6597

++struct bfq_entity {

6598

++	struct rb_node rb_node;

6599

++	struct bfq_weight_counter *weight_counter;

6600

++

6601

++	int on_st;

6602

++

6603

++	u64 finish;

6604

++	u64 start;

6605

++

6606

++	struct rb_root *tree;

6607

++

6608

++	u64 min_start;

6609

++

6610

++	int service, budget;

6611

++	unsigned short weight, new_weight;

6612

++	unsigned short orig_weight;

6613

++

6614

++	struct bfq_entity *parent;

6615

++

6616

++	struct bfq_sched_data *my_sched_data;

6617

++	struct bfq_sched_data *sched_data;

6618

++

6619

++	int prio_changed;

6620

++};

6621

++

6622

++struct bfq_group;

6623

++

6624

++/**

6625

++ * struct bfq_queue - leaf schedulable entity.

6626

++ * @ref: reference counter.

6627

++ * @bfqd: parent bfq_data.

6628

++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.

6629

++ * @ioprio_class: the ioprio_class in use.

6630

++ * @new_ioprio_class: when an ioprio_class change is requested, the new

6631

++ *                    ioprio_class value.

6632

++ * @new_bfqq: shared bfq_queue if queue is cooperating with

6633

++ *           one or more other queues.

6634

++ * @sort_list: sorted list of pending requests.

6635

++ * @next_rq: if fifo isn't expired, next request to serve.

6636

++ * @queued: nr of requests queued in @sort_list.

6637

++ * @allocated: currently allocated requests.

6638

++ * @meta_pending: pending metadata requests.

6639

++ * @fifo: fifo list of requests in sort_list.

6640

++ * @entity: entity representing this queue in the scheduler.

6641

++ * @max_budget: maximum budget allowed from the feedback mechanism.

6642

++ * @budget_timeout: budget expiration (in jiffies).

6643

++ * @dispatched: number of requests on the dispatch list or inside driver.

6644

++ * @flags: status flags.

6645

++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.

6646

++ * @burst_list_node: node for the device's burst list.

6647

++ * @seek_samples: number of seeks sampled

6648

++ * @seek_total: sum of the distances of the seeks sampled

6649

++ * @seek_mean: mean seek distance

6650

++ * @last_request_pos: position of the last request enqueued

6651

++ * @requests_within_timer: number of consecutive pairs of request completion

6652

++ *                         and arrival, such that the queue becomes idle

6653

++ *                         after the completion, but the next request arrives

6654

++ *                         within an idle time slice; used only if the queue's

6655

++ *                         IO_bound has been cleared.

6656

++ * @pid: pid of the process owning the queue, used for logging purposes.

6657

++ * @last_wr_start_finish: start time of the current weight-raising period if

6658

++ *                        the @bfq_queue is being weight-raised, otherwise

6659

++ *                        finish time of the last weight-raising period

6660

++ * @wr_cur_max_time: current max raising time for this queue

6661

++ * @soft_rt_next_start: minimum time instant such that, only if a new

6662

++ *                      request is enqueued after this time instant in an

6663

++ *                      idle @bfq_queue with no outstanding requests, then

6664

++ *                      the task associated with the queue is deemed as

6665

++ *                      soft real-time (see the comments to the function

6666

++ *                      bfq_bfqq_softrt_next_start())

6667

++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from

6668

++ *                      idle to backlogged

6669

++ * @service_from_backlogged: cumulative service received from the @bfq_queue

6670

++ *                           since the last transition from idle to

6671

++ *                           backlogged

6672

++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the

6673

++ *	 queue is shared

6674

++ *

6675

++ * A bfq_queue is a leaf request queue; it can be associated with an

6676

++ * io_context or more, if it  is  async or shared  between  cooperating

6677

++ * processes. @cgroup holds a reference to the cgroup, to be sure that it

6678

++ * does not disappear while a bfqq still references it (mostly to avoid

6679

++ * races between request issuing and task migration followed by cgroup

6680

++ * destruction).

6681

++ * All the fields are protected by the queue lock of the containing bfqd.

6682

++ */

6683

++struct bfq_queue {

6684

++	atomic_t ref;

6685

++	struct bfq_data *bfqd;

6686

++

6687

++	unsigned short ioprio, new_ioprio;

6688

++	unsigned short ioprio_class, new_ioprio_class;

6689

++

6690

++	/* fields for cooperating queues handling */

6691

++	struct bfq_queue *new_bfqq;

6692

++	struct rb_node pos_node;

6693

++	struct rb_root *pos_root;

6694

++

6695

++	struct rb_root sort_list;

6696

++	struct request *next_rq;

6697

++	int queued[2];

6698

++	int allocated[2];

6699

++	int meta_pending;

6700

++	struct list_head fifo;

6701

++

6702

++	struct bfq_entity entity;

6703

++

6704

++	int max_budget;

6705

++	unsigned long budget_timeout;

6706

++

6707

++	int dispatched;

6708

++

6709

++	unsigned int flags;

6710

++

6711

++	struct list_head bfqq_list;

6712

++

6713

++	struct hlist_node burst_list_node;

6714

++

6715

++	unsigned int seek_samples;

6716

++	u64 seek_total;

6717

++	sector_t seek_mean;

6718

++	sector_t last_request_pos;

6719

++

6720

++	unsigned int requests_within_timer;

6721

++

6722

++	pid_t pid;

6723

++	struct bfq_io_cq *bic;

6724

++

6725

++	/* weight-raising fields */

6726

++	unsigned long wr_cur_max_time;

6727

++	unsigned long soft_rt_next_start;

6728

++	unsigned long last_wr_start_finish;

6729

++	unsigned int wr_coeff;

6730

++	unsigned long last_idle_bklogged;

6731

++	unsigned long service_from_backlogged;

6732

++};

6733

++

6734

++/**

6735

++ * struct bfq_ttime - per process thinktime stats.

6736

++ * @ttime_total: total process thinktime

6737

++ * @ttime_samples: number of thinktime samples

6738

++ * @ttime_mean: average process thinktime

6739

++ */

6740

++struct bfq_ttime {

6741

++	unsigned long last_end_request;

6742

++

6743

++	unsigned long ttime_total;

6744

++	unsigned long ttime_samples;

6745

++	unsigned long ttime_mean;

6746

++};

6747

++

6748

++/**

6749

++ * struct bfq_io_cq - per (request_queue, io_context) structure.

6750

++ * @icq: associated io_cq structure

6751

++ * @bfqq: array of two process queues, the sync and the async

6752

++ * @ttime: associated @bfq_ttime struct

6753

++ * @ioprio: per (request_queue, blkcg) ioprio.

6754

++ * @blkcg_id: id of the blkcg the related io_cq belongs to.

6755

++ */

6756

++struct bfq_io_cq {

6757

++	struct io_cq icq; /* must be the first member */

6758

++	struct bfq_queue *bfqq[2];

6759

++	struct bfq_ttime ttime;

6760

++	int ioprio;

6761

++

6762

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

6763

++	uint64_t blkcg_id; /* the current blkcg ID */

6764

++#endif

6765

++};

6766

++

6767

++enum bfq_device_speed {

6768

++	BFQ_BFQD_FAST,

6769

++	BFQ_BFQD_SLOW,

6770

++};

6771

++

6772

++/**

6773

++ * struct bfq_data - per device data structure.

6774

++ * @queue: request queue for the managed device.

6775

++ * @root_group: root bfq_group for the device.

6776

++ * @active_numerous_groups: number of bfq_groups containing more than one

6777

++ *                          active @bfq_entity.

6778

++ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by

6779

++ *                      weight. Used to keep track of whether all @bfq_queues

6780

++ *                     have the same weight. The tree contains one counter

6781

++ *                     for each distinct weight associated to some active

6782

++ *                     and not weight-raised @bfq_queue (see the comments to

6783

++ *                      the functions bfq_weights_tree_[add|remove] for

6784

++ *                     further details).

6785

++ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted

6786

++ *                      by weight. Used to keep track of whether all

6787

++ *                     @bfq_groups have the same weight. The tree contains

6788

++ *                     one counter for each distinct weight associated to

6789

++ *                     some active @bfq_group (see the comments to the

6790

++ *                     functions bfq_weights_tree_[add|remove] for further

6791

++ *                     details).

6792

++ * @busy_queues: number of bfq_queues containing requests (including the

6793

++ *		 queue in service, even if it is idling).

6794

++ * @busy_in_flight_queues: number of @bfq_queues containing pending or

6795

++ *                         in-flight requests, plus the @bfq_queue in

6796

++ *                         service, even if idle but waiting for the

6797

++ *                         possible arrival of its next sync request. This

6798

++ *                         field is updated only if the device is rotational,

6799

++ *                         but used only if the device is also NCQ-capable.

6800

++ *                         The reason why the field is updated also for non-

6801

++ *                         NCQ-capable rotational devices is related to the

6802

++ *                         fact that the value of @hw_tag may be set also

6803

++ *                         later than when busy_in_flight_queues may need to

6804

++ *                         be incremented for the first time(s). Taking also

6805

++ *                         this possibility into account, to avoid unbalanced

6806

++ *                         increments/decrements, would imply more overhead

6807

++ *                         than just updating busy_in_flight_queues

6808

++ *                         regardless of the value of @hw_tag.

6809

++ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues

6810

++ *                                     (that is, seeky queues that expired

6811

++ *                                     for budget timeout at least once)

6812

++ *                                     containing pending or in-flight

6813

++ *                                     requests, including the in-service

6814

++ *                                     @bfq_queue if constantly seeky. This

6815

++ *                                     field is updated only if the device

6816

++ *                                     is rotational, but used only if the

6817

++ *                                     device is also NCQ-capable (see the

6818

++ *                                     comments to @busy_in_flight_queues).

6819

++ * @wr_busy_queues: number of weight-raised busy @bfq_queues.

6820

++ * @queued: number of queued requests.

6821

++ * @rq_in_driver: number of requests dispatched and waiting for completion.

6822

++ * @sync_flight: number of sync requests in the driver.

6823

++ * @max_rq_in_driver: max number of reqs in driver in the last

6824

++ *                    @hw_tag_samples completed requests.

6825

++ * @hw_tag_samples: nr of samples used to calculate hw_tag.

6826

++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.

6827

++ * @budgets_assigned: number of budgets assigned.

6828

++ * @idle_slice_timer: timer set when idling for the next sequential request

6829

++ *                    from the queue in service.

6830

++ * @unplug_work: delayed work to restart dispatching on the request queue.

6831

++ * @in_service_queue: bfq_queue in service.

6832

++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.

6833

++ * @last_position: on-disk position of the last served request.

6834

++ * @last_budget_start: beginning of the last budget.

6835

++ * @last_idling_start: beginning of the last idle slice.

6836

++ * @peak_rate: peak transfer rate observed for a budget.

6837

++ * @peak_rate_samples: number of samples used to calculate @peak_rate.

6838

++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before

6839

++ *                  rescheduling.

6840

++ * @active_list: list of all the bfq_queues active on the device.

6841

++ * @idle_list: list of all the bfq_queues idle on the device.

6842

++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires

6843

++ *                   requests are served in fifo order.

6844

++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.

6845

++ * @bfq_back_max: maximum allowed backward seek.

6846

++ * @bfq_slice_idle: maximum idling time.

6847

++ * @bfq_user_max_budget: user-configured max budget value

6848

++ *                       (0 for auto-tuning).

6849

++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to

6850

++ *                           async queues.

6851

++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to

6852

++ *               prevent seeky queues from imposing long latencies on well

6853

++ *               behaved ones (this also implies that seeky queues cannot

6854

++ *               receive guarantees in the service domain; after a timeout

6855

++ *               they are charged for the whole allocated budget, to try

6856

++ *               to preserve a behavior reasonably fair among them, but

6857

++ *               without service-domain guarantees).

6858

++ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is

6859

++ *                   no more granted any weight-raising.

6860

++ * @bfq_failed_cooperations: number of consecutive failed cooperation

6861

++ *                           chances after which weight-raising is restored

6862

++ *                           to a queue subject to more than bfq_coop_thresh

6863

++ *                           queue merges.

6864

++ * @bfq_requests_within_timer: number of consecutive requests that must be

6865

++ *                             issued within the idle time slice to set

6866

++ *                             again idling to a queue which was marked as

6867

++ *                             non-I/O-bound (see the definition of the

6868

++ *                             IO_bound flag for further details).

6869

++ * @last_ins_in_burst: last time at which a queue entered the current

6870

++ *                     burst of queues being activated shortly after

6871

++ *                     each other; for more details about this and the

6872

++ *                     following parameters related to a burst of

6873

++ *                     activations, see the comments to the function

6874

++ *                     @bfq_handle_burst.

6875

++ * @bfq_burst_interval: reference time interval used to decide whether a

6876

++ *                      queue has been activated shortly after

6877

++ *                      @last_ins_in_burst.

6878

++ * @burst_size: number of queues in the current burst of queue activations.

6879

++ * @bfq_large_burst_thresh: maximum burst size above which the current

6880

++ * 			    queue-activation burst is deemed as 'large'.

6881

++ * @large_burst: true if a large queue-activation burst is in progress.

6882

++ * @burst_list: head of the burst list (as for the above fields, more details

6883

++ * 		in the comments to the function bfq_handle_burst).

6884

++ * @low_latency: if set to true, low-latency heuristics are enabled.

6885

++ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised

6886

++ *                queue is multiplied.

6887

++ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).

6888

++ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.

6889

++ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising

6890

++ *			  may be reactivated for a queue (in jiffies).

6891

++ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals

6892

++ *				after which weight-raising may be

6893

++ *				reactivated for an already busy queue

6894

++ *				(in jiffies).

6895

++ * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue,

6896

++ *			    sectors per second.

6897

++ * @RT_prod: cached value of the product R*T used for computing the maximum

6898

++ *	     duration of the weight raising automatically.

6899

++ * @device_speed: device-speed class for the low-latency heuristic.

6900

++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.

6901

++ *

6902

++ * All the fields are protected by the @queue lock.

6903

++ */

6904

++struct bfq_data {

6905

++	struct request_queue *queue;

6906

++

6907

++	struct bfq_group *root_group;

6908

++

6909

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

6910

++	int active_numerous_groups;

6911

++#endif

6912

++

6913

++	struct rb_root queue_weights_tree;

6914

++	struct rb_root group_weights_tree;

6915

++

6916

++	int busy_queues;

6917

++	int busy_in_flight_queues;

6918

++	int const_seeky_busy_in_flight_queues;

6919

++	int wr_busy_queues;

6920

++	int queued;

6921

++	int rq_in_driver;

6922

++	int sync_flight;

6923

++

6924

++	int max_rq_in_driver;

6925

++	int hw_tag_samples;

6926

++	int hw_tag;

6927

++

6928

++	int budgets_assigned;

6929

++

6930

++	struct timer_list idle_slice_timer;

6931

++	struct work_struct unplug_work;

6932

++

6933

++	struct bfq_queue *in_service_queue;

6934

++	struct bfq_io_cq *in_service_bic;

6935

++

6936

++	sector_t last_position;

6937

++

6938

++	ktime_t last_budget_start;

6939

++	ktime_t last_idling_start;

6940

++	int peak_rate_samples;

6941

++	u64 peak_rate;

6942

++	int bfq_max_budget;

6943

++

6944

++	struct list_head active_list;

6945

++	struct list_head idle_list;

6946

++

6947

++	unsigned int bfq_fifo_expire[2];

6948

++	unsigned int bfq_back_penalty;

6949

++	unsigned int bfq_back_max;

6950

++	unsigned int bfq_slice_idle;

6951

++	u64 bfq_class_idle_last_service;

6952

++

6953

++	int bfq_user_max_budget;

6954

++	int bfq_max_budget_async_rq;

6955

++	unsigned int bfq_timeout[2];

6956

++

6957

++	unsigned int bfq_coop_thresh;

6958

++	unsigned int bfq_failed_cooperations;

6959

++	unsigned int bfq_requests_within_timer;

6960

++

6961

++	unsigned long last_ins_in_burst;

6962

++	unsigned long bfq_burst_interval;

6963

++	int burst_size;

6964

++	unsigned long bfq_large_burst_thresh;

6965

++	bool large_burst;

6966

++	struct hlist_head burst_list;

6967

++

6968

++	bool low_latency;

6969

++

6970

++	/* parameters of the low_latency heuristics */

6971

++	unsigned int bfq_wr_coeff;

6972

++	unsigned int bfq_wr_max_time;

6973

++	unsigned int bfq_wr_rt_max_time;

6974

++	unsigned int bfq_wr_min_idle_time;

6975

++	unsigned long bfq_wr_min_inter_arr_async;

6976

++	unsigned int bfq_wr_max_softrt_rate;

6977

++	u64 RT_prod;

6978

++	enum bfq_device_speed device_speed;

6979

++

6980

++	struct bfq_queue oom_bfqq;

6981

++};

6982

++

6983

++enum bfqq_state_flags {

6984

++	BFQ_BFQQ_FLAG_busy = 0,		/* has requests or is in service */

6985

++	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */

6986

++	BFQ_BFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */

6987

++	BFQ_BFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */

6988

++	BFQ_BFQQ_FLAG_idle_window,	/* slice idling enabled */

6989

++	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */

6990

++	BFQ_BFQQ_FLAG_budget_new,	/* no completion with this budget */

6991

++	BFQ_BFQQ_FLAG_IO_bound,		/*

6992

++					 * bfqq has timed-out at least once

6993

++					 * having consumed at most 2/10 of

6994

++					 * its budget

6995

++					 */

6996

++	BFQ_BFQQ_FLAG_in_large_burst,	/*

6997

++					 * bfqq activated in a large burst,

6998

++					 * see comments to bfq_handle_burst.

6999

++					 */

7000

++	BFQ_BFQQ_FLAG_constantly_seeky,	/*

7001

++					 * bfqq has proved to be slow and

7002

++					 * seeky until budget timeout

7003

++					 */

7004

++	BFQ_BFQQ_FLAG_softrt_update,	/*

7005

++					 * may need softrt-next-start

7006

++					 * update

7007

++					 */

7008

++};

7009

++

7010

++#define BFQ_BFQQ_FNS(name)						\

7011

++static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\

7012

++{									\

7013

++	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name);			\

7014

++}									\

7015

++static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\

7016

++{									\

7017

++	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name);			\

7018

++}									\

7019

++static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\

7020

++{									\

7021

++	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\

7022

++}

7023

++

7024

++BFQ_BFQQ_FNS(busy);

7025

++BFQ_BFQQ_FNS(wait_request);

7026

++BFQ_BFQQ_FNS(must_alloc);

7027

++BFQ_BFQQ_FNS(fifo_expire);

7028

++BFQ_BFQQ_FNS(idle_window);

7029

++BFQ_BFQQ_FNS(sync);

7030

++BFQ_BFQQ_FNS(budget_new);

7031

++BFQ_BFQQ_FNS(IO_bound);

7032

++BFQ_BFQQ_FNS(in_large_burst);

7033

++BFQ_BFQQ_FNS(constantly_seeky);

7034

++BFQ_BFQQ_FNS(softrt_update);

7035

++#undef BFQ_BFQQ_FNS

7036

++

7037

++/* Logging facilities. */

7038

++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \

7039

++	blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)

7040

++

7041

++#define bfq_log(bfqd, fmt, args...) \

7042

++	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)

7043

++

7044

++/* Expiration reasons. */

7045

++enum bfqq_expiration {

7046

++	BFQ_BFQQ_TOO_IDLE = 0,		/*

7047

++					 * queue has been idling for

7048

++					 * too long

7049

++					 */

7050

++	BFQ_BFQQ_BUDGET_TIMEOUT,	/* budget took too long to be used */

7051

++	BFQ_BFQQ_BUDGET_EXHAUSTED,	/* budget consumed */

7052

++	BFQ_BFQQ_NO_MORE_REQUESTS,	/* the queue has no more requests */

7053

++};

7054

++

7055

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

7056

++

7057

++struct bfqg_stats {

7058

++	/* total bytes transferred */

7059

++	struct blkg_rwstat		service_bytes;

7060

++	/* total IOs serviced, post merge */

7061

++	struct blkg_rwstat		serviced;

7062

++	/* number of ios merged */

7063

++	struct blkg_rwstat		merged;

7064

++	/* total time spent on device in ns, may not be accurate w/ queueing */

7065

++	struct blkg_rwstat		service_time;

7066

++	/* total time spent waiting in scheduler queue in ns */

7067

++	struct blkg_rwstat		wait_time;

7068

++	/* number of IOs queued up */

7069

++	struct blkg_rwstat		queued;

7070

++	/* total sectors transferred */

7071

++	struct blkg_stat		sectors;

7072

++	/* total disk time and nr sectors dispatched by this group */

7073

++	struct blkg_stat		time;

7074

++	/* time not charged to this cgroup */

7075

++	struct blkg_stat		unaccounted_time;

7076

++	/* sum of number of ios queued across all samples */

7077

++	struct blkg_stat		avg_queue_size_sum;

7078

++	/* count of samples taken for average */

7079

++	struct blkg_stat		avg_queue_size_samples;

7080

++	/* how many times this group has been removed from service tree */

7081

++	struct blkg_stat		dequeue;

7082

++	/* total time spent waiting for it to be assigned a timeslice. */

7083

++	struct blkg_stat		group_wait_time;

7084

++	/* time spent idling for this blkcg_gq */

7085

++	struct blkg_stat		idle_time;

7086

++	/* total time with empty current active q with other requests queued */

7087

++	struct blkg_stat		empty_time;

7088

++	/* fields after this shouldn't be cleared on stat reset */

7089

++	uint64_t			start_group_wait_time;

7090

++	uint64_t			start_idle_time;

7091

++	uint64_t			start_empty_time;

7092

++	uint16_t			flags;

7093

++};

7094

++

7095

++/*

7096

++ * struct bfq_group_data - per-blkcg storage for the blkio subsystem.

7097

++ *

7098

++ * @pd: @blkcg_policy_data that this structure inherits

7099

++ * @weight: weight of the bfq_group

7100

++ */

7101

++struct bfq_group_data {

7102

++	/* must be the first member */

7103

++	struct blkcg_policy_data pd;

7104

++

7105

++	unsigned short weight;

7106

++};

7107

++

7108

++/**

7109

++ * struct bfq_group - per (device, cgroup) data structure.

7110

++ * @entity: schedulable entity to insert into the parent group sched_data.

7111

++ * @sched_data: own sched_data, to contain child entities (they may be

7112

++ *              both bfq_queues and bfq_groups).

7113

++ * @bfqd: the bfq_data for the device this group acts upon.

7114

++ * @async_bfqq: array of async queues for all the tasks belonging to

7115

++ *              the group, one queue per ioprio value per ioprio_class,

7116

++ *              except for the idle class that has only one queue.

7117

++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).

7118

++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used

7119

++ *             to avoid too many special cases during group creation/

7120

++ *             migration.

7121

++ * @active_entities: number of active entities belonging to the group;

7122

++ *                   unused for the root group. Used to know whether there

7123

++ *                   are groups with more than one active @bfq_entity

7124

++ *                   (see the comments to the function

7125

++ *                   bfq_bfqq_must_not_expire()).

7126

++ *

7127

++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup

7128

++ * there is a set of bfq_groups, each one collecting the lower-level

7129

++ * entities belonging to the group that are acting on the same device.

7130

++ *

7131

++ * Locking works as follows:

7132

++ *    o @bfqd is protected by the queue lock, RCU is used to access it

7133

++ *      from the readers.

7134

++ *    o All the other fields are protected by the @bfqd queue lock.

7135

++ */

7136

++struct bfq_group {

7137

++	/* must be the first member */

7138

++	struct blkg_policy_data pd;

7139

++

7140

++	struct bfq_entity entity;

7141

++	struct bfq_sched_data sched_data;

7142

++

7143

++	void *bfqd;

7144

++

7145

++	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];

7146

++	struct bfq_queue *async_idle_bfqq;

7147

++

7148

++	struct bfq_entity *my_entity;

7149

++

7150

++	int active_entities;

7151

++

7152

++	struct bfqg_stats stats;

7153

++	struct bfqg_stats dead_stats;	/* stats pushed from dead children */

7154

++};

7155

++

7156

++#else

7157

++struct bfq_group {

7158

++	struct bfq_sched_data sched_data;

7159

++

7160

++	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];

7161

++	struct bfq_queue *async_idle_bfqq;

7162

++};

7163

++#endif

7164

++

7165

++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);

7166

++

7167

++static struct bfq_service_tree *

7168

++bfq_entity_service_tree(struct bfq_entity *entity)

7169

++{

7170

++	struct bfq_sched_data *sched_data = entity->sched_data;

7171

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

7172

++	unsigned int idx = bfqq ? bfqq->ioprio_class - 1 :

7173

++				  BFQ_DEFAULT_GRP_CLASS;

7174

++

7175

++	BUG_ON(idx >= BFQ_IOPRIO_CLASSES);

7176

++	BUG_ON(sched_data == NULL);

7177

++

7178

++	return sched_data->service_tree + idx;

7179

++}

7180

++

7181

++static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)

7182

++{

7183

++	return bic->bfqq[is_sync];

7184

++}

7185

++

7186

++static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,

7187

++			 bool is_sync)

7188

++{

7189

++	bic->bfqq[is_sync] = bfqq;

7190

++}

7191

++

7192

++static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)

7193

++{

7194

++	return bic->icq.q->elevator->elevator_data;

7195

++}

7196

++

7197

++/**

7198

++ * bfq_get_bfqd_locked - get a lock on a bfqd using an RCU-protected pointer.

7199

++ * @ptr: a pointer to a bfqd.

7200

++ * @flags: storage for the flags to be saved.

7201

++ *

7202

++ * This function allows bfqg->bfqd to be protected by the

7203

++ * queue lock of the bfqd they reference; the pointer is dereferenced

7204

++ * under RCU, so the storage for bfqd is assured to be safe as long

7205

++ * as the RCU read side critical section does not end.  After the

7206

++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be

7207

++ * sure that no other writer accessed it.  If we raced with a writer,

7208

++ * the function returns NULL, with the queue unlocked, otherwise it

7209

++ * returns the dereferenced pointer, with the queue locked.

7210

++ */

7211

++static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags)

7212

++{

7213

++	struct bfq_data *bfqd;

7214

++

7215

++	rcu_read_lock();

7216

++	bfqd = rcu_dereference(*(struct bfq_data **)ptr);

7217

++

7218

++	if (bfqd != NULL) {

7219

++		spin_lock_irqsave(bfqd->queue->queue_lock, *flags);

7220

++		if (ptr == NULL)

7221

++			printk(KERN_CRIT "get_bfqd_locked pointer NULL\n");

7222

++		else if (*ptr == bfqd)

7223

++			goto out;

7224

++		spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

7225

++	}

7226

++

7227

++	bfqd = NULL;

7228

++out:

7229

++	rcu_read_unlock();

7230

++	return bfqd;

7231

++}

7232

++

7233

++static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags)

7234

++{

7235

++	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

7236

++}

7237

++

7238

++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);

7239

++static void bfq_put_queue(struct bfq_queue *bfqq);

7240

++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);

7241

++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,

7242

++				       struct bio *bio, int is_sync,

7243

++				       struct bfq_io_cq *bic, gfp_t gfp_mask);

7244

++static void bfq_end_wr_async_queues(struct bfq_data *bfqd,

7245

++				    struct bfq_group *bfqg);

7246

++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);

7247

++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);

7248

++

7249

++#endif /* _BFQ_H */

7250

+--

7251

+1.9.1

7252

+

7253

7254

diff --git a/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.4.patch b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.4.patch

7255

new file mode 100644

7256

index 0000000..a49c430

7257

--- /dev/null

7258

+++ b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.4.patch

7259

@@ -0,0 +1,1101 @@

7260

+From d3deade9dc903f58c2bf79e316b785f6eaf2441f Mon Sep 17 00:00:00 2001

7261

+From: Mauro Andreolini <mauro.andreolini@×××××××.it>

7262

+Date: Sun, 6 Sep 2015 16:09:05 +0200

7263

+Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r11 for

7264

+ 4.4.0

7265

+

7266

+A set of processes may happen  to  perform interleaved reads, i.e., requests

7267

+whose union would give rise to a  sequential read  pattern.  There are two

7268

+typical  cases: in the first  case,   processes  read  fixed-size chunks of

7269

+data at a fixed distance from each other, while in the second case processes

7270

+may read variable-size chunks at  variable distances. The latter case occurs

7271

+for  example with  QEMU, which  splits the  I/O generated  by the  guest into

7272

+multiple chunks,  and lets these chunks  be served by a  pool of cooperating

7273

+processes,  iteratively  assigning  the  next  chunk of  I/O  to  the first

7274

+available  process. CFQ  uses actual  queue merging  for the  first type of

7275

+processes, whereas it  uses preemption to get a sequential  read pattern out

7276

+of the read requests  performed by the second type of  processes. In the end

7277

+it uses  two different  mechanisms to  achieve the  same goal: boosting the

7278

+throughput with interleaved I/O.

7279

+

7280

+This patch introduces  Early Queue Merge (EQM), a unified mechanism to get a

7281

+sequential  read pattern  with both  types of  processes. The  main idea is

7282

+checking newly arrived requests against the next request of the active queue

7283

+both in case of actual request insert and in case of request merge. By doing

7284

+so, both the types of processes can be handled by just merging their queues.

7285

+EQM is  then simpler and  more compact than the  pair of mechanisms used in

7286

+CFQ.

7287

+

7288

+Finally, EQM  also preserves the  typical low-latency properties of BFQ, by

7289

+properly restoring the weight-raising state of a queue when it gets back to

7290

+a non-merged state.

7291

+

7292

+Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>

7293

+Signed-off-by: Arianna Avanzini <avanzini@××××××.com>

7294

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

7295

+Signed-off-by: Linus Walleij <linus.walleij@××××××.org>

7296

+---

7297

+ block/bfq-cgroup.c  |   4 +

7298

+ block/bfq-iosched.c | 687 ++++++++++++++++++++++++++++++++++++++++++++++++++--

7299

+ block/bfq.h         |  66 +++++

7300

+ 3 files changed, 743 insertions(+), 14 deletions(-)

7301

+

7302

+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c

7303

+index 8610cd6..5ee99ec 100644

7304

+--- a/block/bfq-cgroup.c

7305

++++ b/block/bfq-cgroup.c

7306

+@@ -437,6 +437,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd)

7307

+ 				   */

7308

+ 	bfqg->bfqd = bfqd;

7309

+ 	bfqg->active_entities = 0;

7310

++	bfqg->rq_pos_tree = RB_ROOT;

7311

+ }

7312

+

7313

+ static void bfq_pd_free(struct blkg_policy_data *pd)

7314

+@@ -530,6 +531,8 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,

7315

+ 	return bfqg;

7316

+ }

7317

+

7318

++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);

7319

++

7320

+ /**

7321

+  * bfq_bfqq_move - migrate @bfqq to @bfqg.

7322

+  * @bfqd: queue descriptor.

7323

+@@ -577,6 +580,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,

7324

+ 	bfqg_get(bfqg);

7325

+

7326

+ 	if (busy) {

7327

++		bfq_pos_tree_add_move(bfqd, bfqq);

7328

+ 		if (resume)

7329

+ 			bfq_activate_bfqq(bfqd, bfqq);

7330

+ 	}

7331

+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

7332

+index f9787a6..d1f648d 100644

7333

+--- a/block/bfq-iosched.c

7334

++++ b/block/bfq-iosched.c

7335

+@@ -296,6 +296,72 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,

7336

+ 	}

7337

+ }

7338

+

7339

++static struct bfq_queue *

7340

++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,

7341

++		     sector_t sector, struct rb_node **ret_parent,

7342

++		     struct rb_node ***rb_link)

7343

++{

7344

++	struct rb_node **p, *parent;

7345

++	struct bfq_queue *bfqq = NULL;

7346

++

7347

++	parent = NULL;

7348

++	p = &root->rb_node;

7349

++	while (*p) {

7350

++		struct rb_node **n;

7351

++

7352

++		parent = *p;

7353

++		bfqq = rb_entry(parent, struct bfq_queue, pos_node);

7354

++

7355

++		/*

7356

++		 * Sort strictly based on sector. Smallest to the left,

7357

++		 * largest to the right.

7358

++		 */

7359

++		if (sector > blk_rq_pos(bfqq->next_rq))

7360

++			n = &(*p)->rb_right;

7361

++		else if (sector < blk_rq_pos(bfqq->next_rq))

7362

++			n = &(*p)->rb_left;

7363

++		else

7364

++			break;

7365

++		p = n;

7366

++		bfqq = NULL;

7367

++	}

7368

++

7369

++	*ret_parent = parent;

7370

++	if (rb_link)

7371

++		*rb_link = p;

7372

++

7373

++	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",

7374

++		(long long unsigned)sector,

7375

++		bfqq ? bfqq->pid : 0);

7376

++

7377

++	return bfqq;

7378

++}

7379

++

7380

++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7381

++{

7382

++	struct rb_node **p, *parent;

7383

++	struct bfq_queue *__bfqq;

7384

++

7385

++	if (bfqq->pos_root) {

7386

++		rb_erase(&bfqq->pos_node, bfqq->pos_root);

7387

++		bfqq->pos_root = NULL;

7388

++	}

7389

++

7390

++	if (bfq_class_idle(bfqq))

7391

++		return;

7392

++	if (!bfqq->next_rq)

7393

++		return;

7394

++

7395

++	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;

7396

++	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,

7397

++			blk_rq_pos(bfqq->next_rq), &parent, &p);

7398

++	if (!__bfqq) {

7399

++		rb_link_node(&bfqq->pos_node, parent, p);

7400

++		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);

7401

++	} else

7402

++		bfqq->pos_root = NULL;

7403

++}

7404

++

7405

+ /*

7406

+  * Tell whether there are active queues or groups with differentiated weights.

7407

+  */

7408

+@@ -528,6 +594,57 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)

7409

+ 	return dur;

7410

+ }

7411

+

7412

++static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq)

7413

++{

7414

++	return bfqq->bic ? bfqq->bic->cooperations : 0;

7415

++}

7416

++

7417

++static void

7418

++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)

7419

++{

7420

++	if (bic->saved_idle_window)

7421

++		bfq_mark_bfqq_idle_window(bfqq);

7422

++	else

7423

++		bfq_clear_bfqq_idle_window(bfqq);

7424

++	if (bic->saved_IO_bound)

7425

++		bfq_mark_bfqq_IO_bound(bfqq);

7426

++	else

7427

++		bfq_clear_bfqq_IO_bound(bfqq);

7428

++	/* Assuming that the flag in_large_burst is already correctly set */

7429

++	if (bic->wr_time_left && bfqq->bfqd->low_latency &&

7430

++	    !bfq_bfqq_in_large_burst(bfqq) &&

7431

++	    bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {

7432

++		/*

7433

++		 * Start a weight raising period with the duration given by

7434

++		 * the wr_time_left snapshot.

7435

++		 */

7436

++		if (bfq_bfqq_busy(bfqq))

7437

++			bfqq->bfqd->wr_busy_queues++;

7438

++		bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;

7439

++		bfqq->wr_cur_max_time = bic->wr_time_left;

7440

++		bfqq->last_wr_start_finish = jiffies;

7441

++		bfqq->entity.prio_changed = 1;

7442

++	}

7443

++	/*

7444

++	 * Clear wr_time_left to prevent bfq_bfqq_save_state() from

7445

++	 * getting confused about the queue's need of a weight-raising

7446

++	 * period.

7447

++	 */

7448

++	bic->wr_time_left = 0;

7449

++}

7450

++

7451

++static int bfqq_process_refs(struct bfq_queue *bfqq)

7452

++{

7453

++	int process_refs, io_refs;

7454

++

7455

++	lockdep_assert_held(bfqq->bfqd->queue->queue_lock);

7456

++

7457

++	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];

7458

++	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;

7459

++	BUG_ON(process_refs < 0);

7460

++	return process_refs;

7461

++}

7462

++

7463

+ /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */

7464

+ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7465

+ {

7466

+@@ -764,8 +881,14 @@ static void bfq_add_request(struct request *rq)

7467

+ 	BUG_ON(!next_rq);

7468

+ 	bfqq->next_rq = next_rq;

7469

+

7470

++	/*

7471

++	 * Adjust priority tree position, if next_rq changes.

7472

++	 */

7473

++	if (prev != bfqq->next_rq)

7474

++		bfq_pos_tree_add_move(bfqd, bfqq);

7475

++

7476

+ 	if (!bfq_bfqq_busy(bfqq)) {

7477

+-		bool soft_rt, in_burst,

7478

++		bool soft_rt, coop_or_in_burst,

7479

+ 		     idle_for_long_time = time_is_before_jiffies(

7480

+ 						bfqq->budget_timeout +

7481

+ 						bfqd->bfq_wr_min_idle_time);

7482

+@@ -793,11 +916,12 @@ static void bfq_add_request(struct request *rq)

7483

+ 				bfqd->last_ins_in_burst = jiffies;

7484

+ 		}

7485

+

7486

+-		in_burst = bfq_bfqq_in_large_burst(bfqq);

7487

++		coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||

7488

++			bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;

7489

+ 		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&

7490

+-			!in_burst &&

7491

++			!coop_or_in_burst &&

7492

+ 			time_is_before_jiffies(bfqq->soft_rt_next_start);

7493

+-		interactive = !in_burst && idle_for_long_time;

7494

++		interactive = !coop_or_in_burst && idle_for_long_time;

7495

+ 		entity->budget = max_t(unsigned long, bfqq->max_budget,

7496

+ 				       bfq_serv_to_charge(next_rq, bfqq));

7497

+

7498

+@@ -816,6 +940,9 @@ static void bfq_add_request(struct request *rq)

7499

+ 		if (!bfqd->low_latency)

7500

+ 			goto add_bfqq_busy;

7501

+

7502

++		if (bfq_bfqq_just_split(bfqq))

7503

++			goto set_prio_changed;

7504

++

7505

+ 		/*

7506

+ 		 * If the queue:

7507

+ 		 * - is not being boosted,

7508

+@@ -840,7 +967,7 @@ static void bfq_add_request(struct request *rq)

7509

+ 		} else if (old_wr_coeff > 1) {

7510

+ 			if (interactive)

7511

+ 				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

7512

+-			else if (in_burst ||

7513

++			else if (coop_or_in_burst ||

7514

+ 				 (bfqq->wr_cur_max_time ==

7515

+ 				  bfqd->bfq_wr_rt_max_time &&

7516

+ 				  !soft_rt)) {

7517

+@@ -905,6 +1032,7 @@ static void bfq_add_request(struct request *rq)

7518

+ 					bfqd->bfq_wr_rt_max_time;

7519

+ 			}

7520

+ 		}

7521

++set_prio_changed:

7522

+ 		if (old_wr_coeff != bfqq->wr_coeff)

7523

+ 			entity->prio_changed = 1;

7524

+ add_bfqq_busy:

7525

+@@ -1047,6 +1175,15 @@ static void bfq_merged_request(struct request_queue *q, struct request *req,

7526

+ 					 bfqd->last_position);

7527

+ 		BUG_ON(!next_rq);

7528

+ 		bfqq->next_rq = next_rq;

7529

++		/*

7530

++		 * If next_rq changes, update both the queue's budget to

7531

++		 * fit the new request and the queue's position in its

7532

++		 * rq_pos_tree.

7533

++		 */

7534

++		if (prev != bfqq->next_rq) {

7535

++			bfq_updated_next_req(bfqd, bfqq);

7536

++			bfq_pos_tree_add_move(bfqd, bfqq);

7537

++		}

7538

+ 	}

7539

+ }

7540

+

7541

+@@ -1129,11 +1266,346 @@ static void bfq_end_wr(struct bfq_data *bfqd)

7542

+ 	spin_unlock_irq(bfqd->queue->queue_lock);

7543

+ }

7544

+

7545

++static sector_t bfq_io_struct_pos(void *io_struct, bool request)

7546

++{

7547

++	if (request)

7548

++		return blk_rq_pos(io_struct);

7549

++	else

7550

++		return ((struct bio *)io_struct)->bi_iter.bi_sector;

7551

++}

7552

++

7553

++static int bfq_rq_close_to_sector(void *io_struct, bool request,

7554

++				  sector_t sector)

7555

++{

7556

++	return abs(bfq_io_struct_pos(io_struct, request) - sector) <=

7557

++	       BFQQ_SEEK_THR;

7558

++}

7559

++

7560

++static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,

7561

++					 struct bfq_queue *bfqq,

7562

++					 sector_t sector)

7563

++{

7564

++	struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;

7565

++	struct rb_node *parent, *node;

7566

++	struct bfq_queue *__bfqq;

7567

++

7568

++	if (RB_EMPTY_ROOT(root))

7569

++		return NULL;

7570

++

7571

++	/*

7572

++	 * First, if we find a request starting at the end of the last

7573

++	 * request, choose it.

7574

++	 */

7575

++	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);

7576

++	if (__bfqq)

7577

++		return __bfqq;

7578

++

7579

++	/*

7580

++	 * If the exact sector wasn't found, the parent of the NULL leaf

7581

++	 * will contain the closest sector (rq_pos_tree sorted by

7582

++	 * next_request position).

7583

++	 */

7584

++	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);

7585

++	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))

7586

++		return __bfqq;

7587

++

7588

++	if (blk_rq_pos(__bfqq->next_rq) < sector)

7589

++		node = rb_next(&__bfqq->pos_node);

7590

++	else

7591

++		node = rb_prev(&__bfqq->pos_node);

7592

++	if (!node)

7593

++		return NULL;

7594

++

7595

++	__bfqq = rb_entry(node, struct bfq_queue, pos_node);

7596

++	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))

7597

++		return __bfqq;

7598

++

7599

++	return NULL;

7600

++}

7601

++

7602

++static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,

7603

++						   struct bfq_queue *cur_bfqq,

7604

++						   sector_t sector)

7605

++{

7606

++	struct bfq_queue *bfqq;

7607

++

7608

++	/*

7609

++	 * We shall notice if some of the queues are cooperating,

7610

++	 * e.g., working closely on the same area of the device. In

7611

++	 * that case, we can group them together and: 1) don't waste

7612

++	 * time idling, and 2) serve the union of their requests in

7613

++	 * the best possible order for throughput.

7614

++	 */

7615

++	bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);

7616

++	if (!bfqq || bfqq == cur_bfqq)

7617

++		return NULL;

7618

++

7619

++	return bfqq;

7620

++}

7621

++

7622

++static struct bfq_queue *

7623

++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

7624

++{

7625

++	int process_refs, new_process_refs;

7626

++	struct bfq_queue *__bfqq;

7627

++

7628

++	/*

7629

++	 * If there are no process references on the new_bfqq, then it is

7630

++	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain

7631

++	 * may have dropped their last reference (not just their last process

7632

++	 * reference).

7633

++	 */

7634

++	if (!bfqq_process_refs(new_bfqq))

7635

++		return NULL;

7636

++

7637

++	/* Avoid a circular list and skip interim queue merges. */

7638

++	while ((__bfqq = new_bfqq->new_bfqq)) {

7639

++		if (__bfqq == bfqq)

7640

++			return NULL;

7641

++		new_bfqq = __bfqq;

7642

++	}

7643

++

7644

++	process_refs = bfqq_process_refs(bfqq);

7645

++	new_process_refs = bfqq_process_refs(new_bfqq);

7646

++	/*

7647

++	 * If the process for the bfqq has gone away, there is no

7648

++	 * sense in merging the queues.

7649

++	 */

7650

++	if (process_refs == 0 || new_process_refs == 0)

7651

++		return NULL;

7652

++

7653

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",

7654

++		new_bfqq->pid);

7655

++

7656

++	/*

7657

++	 * Merging is just a redirection: the requests of the process

7658

++	 * owning one of the two queues are redirected to the other queue.

7659

++	 * The latter queue, in its turn, is set as shared if this is the

7660

++	 * first time that the requests of some process are redirected to

7661

++	 * it.

7662

++	 *

7663

++	 * We redirect bfqq to new_bfqq and not the opposite, because we

7664

++	 * are in the context of the process owning bfqq, hence we have

7665

++	 * the io_cq of this process. So we can immediately configure this

7666

++	 * io_cq to redirect the requests of the process to new_bfqq.

7667

++	 *

7668

++	 * NOTE, even if new_bfqq coincides with the in-service queue, the

7669

++	 * io_cq of new_bfqq is not available, because, if the in-service

7670

++	 * queue is shared, bfqd->in_service_bic may not point to the

7671

++	 * io_cq of the in-service queue.

7672

++	 * Redirecting the requests of the process owning bfqq to the

7673

++	 * currently in-service queue is in any case the best option, as

7674

++	 * we feed the in-service queue with new requests close to the

7675

++	 * last request served and, by doing so, hopefully increase the

7676

++	 * throughput.

7677

++	 */

7678

++	bfqq->new_bfqq = new_bfqq;

7679

++	atomic_add(process_refs, &new_bfqq->ref);

7680

++	return new_bfqq;

7681

++}

7682

++

7683

++static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,

7684

++					struct bfq_queue *new_bfqq)

7685

++{

7686

++	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||

7687

++	    (bfqq->ioprio_class != new_bfqq->ioprio_class))

7688

++		return false;

7689

++

7690

++	/*

7691

++	 * If either of the queues has already been detected as seeky,

7692

++	 * then merging it with the other queue is unlikely to lead to

7693

++	 * sequential I/O.

7694

++	 */

7695

++	if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))

7696

++		return false;

7697

++

7698

++	/*

7699

++	 * Interleaved I/O is known to be done by (some) applications

7700

++	 * only for reads, so it does not make sense to merge async

7701

++	 * queues.

7702

++	 */

7703

++	if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))

7704

++		return false;

7705

++

7706

++	return true;

7707

++}

7708

++

7709

++/*

7710

++ * Attempt to schedule a merge of bfqq with the currently in-service queue

7711

++ * or with a close queue among the scheduled queues.

7712

++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue

7713

++ * structure otherwise.

7714

++ *

7715

++ * The OOM queue is not allowed to participate in cooperation: in fact, since

7716

++ * the requests temporarily redirected to the OOM queue could be redirected

7717

++ * again to dedicated queues at any time, the state needed to correctly

7718

++ * handle merging with the OOM queue would be quite complex and expensive

7719

++ * to maintain. Besides, in such a critical condition as an out of memory,

7720

++ * the benefits of queue merging may be little relevant, or even negligible.

7721

++ */

7722

++static struct bfq_queue *

7723

++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,

7724

++		     void *io_struct, bool request)

7725

++{

7726

++	struct bfq_queue *in_service_bfqq, *new_bfqq;

7727

++

7728

++	if (bfqq->new_bfqq)

7729

++		return bfqq->new_bfqq;

7730

++	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))

7731

++		return NULL;

7732

++	/* If device has only one backlogged bfq_queue, don't search. */

7733

++	if (bfqd->busy_queues == 1)

7734

++		return NULL;

7735

++

7736

++	in_service_bfqq = bfqd->in_service_queue;

7737

++

7738

++	if (!in_service_bfqq || in_service_bfqq == bfqq ||

7739

++	    !bfqd->in_service_bic ||

7740

++	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))

7741

++		goto check_scheduled;

7742

++

7743

++	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&

7744

++	    bfqq->entity.parent == in_service_bfqq->entity.parent &&

7745

++	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {

7746

++		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);

7747

++		if (new_bfqq)

7748

++			return new_bfqq;

7749

++	}

7750

++	/*

7751

++	 * Check whether there is a cooperator among currently scheduled

7752

++	 * queues. The only thing we need is that the bio/request is not

7753

++	 * NULL, as we need it to establish whether a cooperator exists.

7754

++	 */

7755

++check_scheduled:

7756

++	new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,

7757

++			bfq_io_struct_pos(io_struct, request));

7758

++

7759

++	BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent);

7760

++

7761

++	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&

7762

++	    bfq_may_be_close_cooperator(bfqq, new_bfqq))

7763

++		return bfq_setup_merge(bfqq, new_bfqq);

7764

++

7765

++	return NULL;

7766

++}

7767

++

7768

++static void bfq_bfqq_save_state(struct bfq_queue *bfqq)

7769

++{

7770

++	/*

7771

++	 * If !bfqq->bic, the queue is already shared or its requests

7772

++	 * have already been redirected to a shared queue; both idle window

7773

++	 * and weight raising state have already been saved. Do nothing.

7774

++	 */

7775

++	if (!bfqq->bic)

7776

++		return;

7777

++	if (bfqq->bic->wr_time_left)

7778

++		/*

7779

++		 * This is the queue of a just-started process, and would

7780

++		 * deserve weight raising: we set wr_time_left to the full

7781

++		 * weight-raising duration to trigger weight-raising when

7782

++		 * and if the queue is split and the first request of the

7783

++		 * queue is enqueued.

7784

++		 */

7785

++		bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd);

7786

++	else if (bfqq->wr_coeff > 1) {

7787

++		unsigned long wr_duration =

7788

++			jiffies - bfqq->last_wr_start_finish;

7789

++		/*

7790

++		 * It may happen that a queue's weight raising period lasts

7791

++		 * longer than its wr_cur_max_time, as weight raising is

7792

++		 * handled only when a request is enqueued or dispatched (it

7793

++		 * does not use any timer). If the weight raising period is

7794

++		 * about to end, don't save it.

7795

++		 */

7796

++		if (bfqq->wr_cur_max_time <= wr_duration)

7797

++			bfqq->bic->wr_time_left = 0;

7798

++		else

7799

++			bfqq->bic->wr_time_left =

7800

++				bfqq->wr_cur_max_time - wr_duration;

7801

++		/*

7802

++		 * The bfq_queue is becoming shared or the requests of the

7803

++		 * process owning the queue are being redirected to a shared

7804

++		 * queue. Stop the weight raising period of the queue, as in

7805

++		 * both cases it should not be owned by an interactive or

7806

++		 * soft real-time application.

7807

++		 */

7808

++		bfq_bfqq_end_wr(bfqq);

7809

++	} else

7810

++		bfqq->bic->wr_time_left = 0;

7811

++	bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);

7812

++	bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);

7813

++	bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);

7814

++	bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);

7815

++	bfqq->bic->cooperations++;

7816

++	bfqq->bic->failed_cooperations = 0;

7817

++}

7818

++

7819

++static void bfq_get_bic_reference(struct bfq_queue *bfqq)

7820

++{

7821

++	/*

7822

++	 * If bfqq->bic has a non-NULL value, the bic to which it belongs

7823

++	 * is about to begin using a shared bfq_queue.

7824

++	 */

7825

++	if (bfqq->bic)

7826

++		atomic_long_inc(&bfqq->bic->icq.ioc->refcount);

7827

++}

7828

++

7829

++static void

7830

++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

7831

++		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

7832

++{

7833

++	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",

7834

++		(long unsigned)new_bfqq->pid);

7835

++	/* Save weight raising and idle window of the merged queues */

7836

++	bfq_bfqq_save_state(bfqq);

7837

++	bfq_bfqq_save_state(new_bfqq);

7838

++	if (bfq_bfqq_IO_bound(bfqq))

7839

++		bfq_mark_bfqq_IO_bound(new_bfqq);

7840

++	bfq_clear_bfqq_IO_bound(bfqq);

7841

++	/*

7842

++	 * Grab a reference to the bic, to prevent it from being destroyed

7843

++	 * before being possibly touched by a bfq_split_bfqq().

7844

++	 */

7845

++	bfq_get_bic_reference(bfqq);

7846

++	bfq_get_bic_reference(new_bfqq);

7847

++	/*

7848

++	 * Merge queues (that is, let bic redirect its requests to new_bfqq)

7849

++	 */

7850

++	bic_set_bfqq(bic, new_bfqq, 1);

7851

++	bfq_mark_bfqq_coop(new_bfqq);

7852

++	/*

7853

++	 * new_bfqq now belongs to at least two bics (it is a shared queue):

7854

++	 * set new_bfqq->bic to NULL. bfqq either:

7855

++	 * - does not belong to any bic any more, and hence bfqq->bic must

7856

++	 *   be set to NULL, or

7857

++	 * - is a queue whose owning bics have already been redirected to a

7858

++	 *   different queue, hence the queue is destined to not belong to

7859

++	 *   any bic soon and bfqq->bic is already NULL (therefore the next

7860

++	 *   assignment causes no harm).

7861

++	 */

7862

++	new_bfqq->bic = NULL;

7863

++	bfqq->bic = NULL;

7864

++	bfq_put_queue(bfqq);

7865

++}

7866

++

7867

++static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)

7868

++{

7869

++	struct bfq_io_cq *bic = bfqq->bic;

7870

++	struct bfq_data *bfqd = bfqq->bfqd;

7871

++

7872

++	if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) {

7873

++		bic->failed_cooperations++;

7874

++		if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations)

7875

++			bic->cooperations = 0;

7876

++	}

7877

++}

7878

++

7879

+ static int bfq_allow_merge(struct request_queue *q, struct request *rq,

7880

+ 			   struct bio *bio)

7881

+ {

7882

+ 	struct bfq_data *bfqd = q->elevator->elevator_data;

7883

+ 	struct bfq_io_cq *bic;

7884

++	struct bfq_queue *bfqq, *new_bfqq;

7885

+

7886

+ 	/*

7887

+ 	 * Disallow merge of a sync bio into an async request.

7888

+@@ -1150,7 +1622,26 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,

7889

+ 	if (!bic)

7890

+ 		return 0;

7891

+

7892

+-	return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq);

7893

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

7894

++	/*

7895

++	 * We take advantage of this function to perform an early merge

7896

++	 * of the queues of possible cooperating processes.

7897

++	 */

7898

++	if (bfqq) {

7899

++		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);

7900

++		if (new_bfqq) {

7901

++			bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);

7902

++			/*

7903

++			 * If we get here, the bio will be queued in the

7904

++			 * shared queue, i.e., new_bfqq, so use new_bfqq

7905

++			 * to decide whether bio and rq can be merged.

7906

++			 */

7907

++			bfqq = new_bfqq;

7908

++		} else

7909

++			bfq_bfqq_increase_failed_cooperations(bfqq);

7910

++	}

7911

++

7912

++	return bfqq == RQ_BFQQ(rq);

7913

+ }

7914

+

7915

+ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

7916

+@@ -1349,6 +1840,15 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7917

+

7918

+ 	__bfq_bfqd_reset_in_service(bfqd);

7919

+

7920

++	/*

7921

++	 * If this bfqq is shared between multiple processes, check

7922

++	 * to make sure that those processes are still issuing I/Os

7923

++	 * within the mean seek distance. If not, it may be time to

7924

++	 * break the queues apart again.

7925

++	 */

7926

++	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))

7927

++		bfq_mark_bfqq_split_coop(bfqq);

7928

++

7929

+ 	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

7930

+ 		/*

7931

+ 		 * Overloading budget_timeout field to store the time

7932

+@@ -1357,8 +1857,13 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7933

+ 		 */

7934

+ 		bfqq->budget_timeout = jiffies;

7935

+ 		bfq_del_bfqq_busy(bfqd, bfqq, 1);

7936

+-	} else

7937

++	} else {

7938

+ 		bfq_activate_bfqq(bfqd, bfqq);

7939

++		/*

7940

++		 * Resort priority tree of potential close cooperators.

7941

++		 */

7942

++		bfq_pos_tree_add_move(bfqd, bfqq);

7943

++	}

7944

+ }

7945

+

7946

+ /**

7947

+@@ -2242,10 +2747,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7948

+ 		/*

7949

+ 		 * If the queue was activated in a burst, or

7950

+ 		 * too much time has elapsed from the beginning

7951

+-		 * of this weight-raising period, then end weight

7952

+-		 * raising.

7953

++		 * of this weight-raising period, or the queue has

7954

++		 * exceeded the acceptable number of cooperations,

7955

++		 * then end weight raising.

7956

+ 		 */

7957

+ 		if (bfq_bfqq_in_large_burst(bfqq) ||

7958

++		    bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||

7959

+ 		    time_is_before_jiffies(bfqq->last_wr_start_finish +

7960

+ 					   bfqq->wr_cur_max_time)) {

7961

+ 			bfqq->last_wr_start_finish = jiffies;

7962

+@@ -2474,6 +2981,25 @@ static void bfq_put_queue(struct bfq_queue *bfqq)

7963

+ #endif

7964

+ }

7965

+

7966

++static void bfq_put_cooperator(struct bfq_queue *bfqq)

7967

++{

7968

++	struct bfq_queue *__bfqq, *next;

7969

++

7970

++	/*

7971

++	 * If this queue was scheduled to merge with another queue, be

7972

++	 * sure to drop the reference taken on that queue (and others in

7973

++	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.

7974

++	 */

7975

++	__bfqq = bfqq->new_bfqq;

7976

++	while (__bfqq) {

7977

++		if (__bfqq == bfqq)

7978

++			break;

7979

++		next = __bfqq->new_bfqq;

7980

++		bfq_put_queue(__bfqq);

7981

++		__bfqq = next;

7982

++	}

7983

++}

7984

++

7985

+ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7986

+ {

7987

+ 	if (bfqq == bfqd->in_service_queue) {

7988

+@@ -2484,6 +3010,8 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7989

+ 	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,

7990

+ 		     atomic_read(&bfqq->ref));

7991

+

7992

++	bfq_put_cooperator(bfqq);

7993

++

7994

+ 	bfq_put_queue(bfqq);

7995

+ }

7996

+

7997

+@@ -2492,6 +3020,25 @@ static void bfq_init_icq(struct io_cq *icq)

7998

+ 	struct bfq_io_cq *bic = icq_to_bic(icq);

7999

+

8000

+ 	bic->ttime.last_end_request = jiffies;

8001

++	/*

8002

++	 * A newly created bic indicates that the process has just

8003

++	 * started doing I/O, and is probably mapping into memory its

8004

++	 * executable and libraries: it definitely needs weight raising.

8005

++	 * There is however the possibility that the process performs,

8006

++	 * for a while, I/O close to some other process. EQM intercepts

8007

++	 * this behavior and may merge the queue corresponding to the

8008

++	 * process  with some other queue, BEFORE the weight of the queue

8009

++	 * is raised. Merged queues are not weight-raised (they are assumed

8010

++	 * to belong to processes that benefit only from high throughput).

8011

++	 * If the merge is basically the consequence of an accident, then

8012

++	 * the queue will be split soon and will get back its old weight.

8013

++	 * It is then important to write down somewhere that this queue

8014

++	 * does need weight raising, even if it did not make it to get its

8015

++	 * weight raised before being merged. To this purpose, we overload

8016

++	 * the field wr_time_left and assign 1 to it, to mark the queue

8017

++	 * as needing weight raising.

8018

++	 */

8019

++	bic->wr_time_left = 1;

8020

+ }

8021

+

8022

+ static void bfq_exit_icq(struct io_cq *icq)

8023

+@@ -2505,6 +3052,13 @@ static void bfq_exit_icq(struct io_cq *icq)

8024

+ 	}

8025

+

8026

+ 	if (bic->bfqq[BLK_RW_SYNC]) {

8027

++		/*

8028

++		 * If the bic is using a shared queue, put the reference

8029

++		 * taken on the io_context when the bic started using a

8030

++		 * shared bfq_queue.

8031

++		 */

8032

++		if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))

8033

++			put_io_context(icq->ioc);

8034

+ 		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);

8035

+ 		bic->bfqq[BLK_RW_SYNC] = NULL;

8036

+ 	}

8037

+@@ -2809,6 +3363,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,

8038

+ 	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))

8039

+ 		return;

8040

+

8041

++	/* Idle window just restored, statistics are meaningless. */

8042

++	if (bfq_bfqq_just_split(bfqq))

8043

++		return;

8044

++

8045

+ 	enable_idle = bfq_bfqq_idle_window(bfqq);

8046

+

8047

+ 	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||

8048

+@@ -2856,6 +3414,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

8049

+ 	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||

8050

+ 	    !BFQQ_SEEKY(bfqq))

8051

+ 		bfq_update_idle_window(bfqd, bfqq, bic);

8052

++	bfq_clear_bfqq_just_split(bfqq);

8053

+

8054

+ 	bfq_log_bfqq(bfqd, bfqq,

8055

+ 		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",

8056

+@@ -2920,12 +3479,47 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

8057

+ static void bfq_insert_request(struct request_queue *q, struct request *rq)

8058

+ {

8059

+ 	struct bfq_data *bfqd = q->elevator->elevator_data;

8060

+-	struct bfq_queue *bfqq = RQ_BFQQ(rq);

8061

++	struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;

8062

+

8063

+ 	assert_spin_locked(bfqd->queue->queue_lock);

8064

+

8065

++	/*

8066

++	 * An unplug may trigger a requeue of a request from the device

8067

++	 * driver: make sure we are in process context while trying to

8068

++	 * merge two bfq_queues.

8069

++	 */

8070

++	if (!in_interrupt()) {

8071

++		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);

8072

++		if (new_bfqq) {

8073

++			if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)

8074

++				new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);

8075

++			/*

8076

++			 * Release the request's reference to the old bfqq

8077

++			 * and make sure one is taken to the shared queue.

8078

++			 */

8079

++			new_bfqq->allocated[rq_data_dir(rq)]++;

8080

++			bfqq->allocated[rq_data_dir(rq)]--;

8081

++			atomic_inc(&new_bfqq->ref);

8082

++			bfq_put_queue(bfqq);

8083

++			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)

8084

++				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),

8085

++						bfqq, new_bfqq);

8086

++			rq->elv.priv[1] = new_bfqq;

8087

++			bfqq = new_bfqq;

8088

++		} else

8089

++			bfq_bfqq_increase_failed_cooperations(bfqq);

8090

++	}

8091

++

8092

+ 	bfq_add_request(rq);

8093

+

8094

++	/*

8095

++	 * Here a newly-created bfq_queue has already started a weight-raising

8096

++	 * period: clear wr_time_left to prevent bfq_bfqq_save_state()

8097

++	 * from assigning it a full weight-raising period. See the detailed

8098

++	 * comments about this field in bfq_init_icq().

8099

++	 */

8100

++	if (bfqq->bic)

8101

++		bfqq->bic->wr_time_left = 0;

8102

+ 	rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];

8103

+ 	list_add_tail(&rq->queuelist, &bfqq->fifo);

8104

+

8105

+@@ -3094,6 +3688,32 @@ static void bfq_put_request(struct request *rq)

8106

+ }

8107

+

8108

+ /*

8109

++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this

8110

++ * was the last process referring to said bfqq.

8111

++ */

8112

++static struct bfq_queue *

8113

++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)

8114

++{

8115

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

8116

++

8117

++	put_io_context(bic->icq.ioc);

8118

++

8119

++	if (bfqq_process_refs(bfqq) == 1) {

8120

++		bfqq->pid = current->pid;

8121

++		bfq_clear_bfqq_coop(bfqq);

8122

++		bfq_clear_bfqq_split_coop(bfqq);

8123

++		return bfqq;

8124

++	}

8125

++

8126

++	bic_set_bfqq(bic, NULL, 1);

8127

++

8128

++	bfq_put_cooperator(bfqq);

8129

++

8130

++	bfq_put_queue(bfqq);

8131

++	return NULL;

8132

++}

8133

++

8134

++/*

8135

+  * Allocate bfq data structures associated with this request.

8136

+  */

8137

+ static int bfq_set_request(struct request_queue *q, struct request *rq,

8138

+@@ -3105,6 +3725,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,

8139

+ 	const int is_sync = rq_is_sync(rq);

8140

+ 	struct bfq_queue *bfqq;

8141

+ 	unsigned long flags;

8142

++	bool split = false;

8143

+

8144

+ 	might_sleep_if(gfpflags_allow_blocking(gfp_mask));

8145

+

8146

+@@ -3117,15 +3738,30 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,

8147

+

8148

+ 	bfq_bic_update_cgroup(bic, bio);

8149

+

8150

++new_queue:

8151

+ 	bfqq = bic_to_bfqq(bic, is_sync);

8152

+ 	if (!bfqq || bfqq == &bfqd->oom_bfqq) {

8153

+ 		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask);

8154

+ 		bic_set_bfqq(bic, bfqq, is_sync);

8155

+-		if (is_sync) {

8156

+-			if (bfqd->large_burst)

8157

++		if (split && is_sync) {

8158

++			if ((bic->was_in_burst_list && bfqd->large_burst) ||

8159

++			    bic->saved_in_large_burst)

8160

+ 				bfq_mark_bfqq_in_large_burst(bfqq);

8161

+-			else

8162

+-				bfq_clear_bfqq_in_large_burst(bfqq);

8163

++			else {

8164

++			    bfq_clear_bfqq_in_large_burst(bfqq);

8165

++			    if (bic->was_in_burst_list)

8166

++			       hlist_add_head(&bfqq->burst_list_node,

8167

++				              &bfqd->burst_list);

8168

++			}

8169

++		}

8170

++	} else {

8171

++		/* If the queue was seeky for too long, break it apart. */

8172

++		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {

8173

++			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");

8174

++			bfqq = bfq_split_bfqq(bic, bfqq);

8175

++			split = true;

8176

++			if (!bfqq)

8177

++				goto new_queue;

8178

+ 		}

8179

+ 	}

8180

+

8181

+@@ -3137,6 +3773,26 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,

8182

+ 	rq->elv.priv[0] = bic;

8183

+ 	rq->elv.priv[1] = bfqq;

8184

+

8185

++	/*

8186

++	 * If a bfq_queue has only one process reference, it is owned

8187

++	 * by only one bfq_io_cq: we can set the bic field of the

8188

++	 * bfq_queue to the address of that structure. Also, if the

8189

++	 * queue has just been split, mark a flag so that the

8190

++	 * information is available to the other scheduler hooks.

8191

++	 */

8192

++	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {

8193

++		bfqq->bic = bic;

8194

++		if (split) {

8195

++			bfq_mark_bfqq_just_split(bfqq);

8196

++			/*

8197

++			 * If the queue has just been split from a shared

8198

++			 * queue, restore the idle window and the possible

8199

++			 * weight raising period.

8200

++			 */

8201

++			bfq_bfqq_resume_state(bfqq, bic);

8202

++		}

8203

++	}

8204

++

8205

+ 	spin_unlock_irqrestore(q->queue_lock, flags);

8206

+

8207

+ 	return 0;

8208

+@@ -3290,6 +3946,7 @@ static void bfq_init_root_group(struct bfq_group *root_group,

8209

+ 	root_group->my_entity = NULL;

8210

+ 	root_group->bfqd = bfqd;

8211

+ #endif

8212

++	root_group->rq_pos_tree = RB_ROOT;

8213

+ 	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)

8214

+ 		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

8215

+ }

8216

+@@ -3370,6 +4027,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

8217

+ 	bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;

8218

+ 	bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;

8219

+

8220

++	bfqd->bfq_coop_thresh = 2;

8221

++	bfqd->bfq_failed_cooperations = 7000;

8222

+ 	bfqd->bfq_requests_within_timer = 120;

8223

+

8224

+ 	bfqd->bfq_large_burst_thresh = 11;

8225

+diff --git a/block/bfq.h b/block/bfq.h

8226

+index 3bb7df2..32dfcee 100644

8227

+--- a/block/bfq.h

8228

++++ b/block/bfq.h

8229

+@@ -183,6 +183,8 @@ struct bfq_group;

8230

+  *                    ioprio_class value.

8231

+  * @new_bfqq: shared bfq_queue if queue is cooperating with

8232

+  *           one or more other queues.

8233

++ * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree).

8234

++ * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree).

8235

+  * @sort_list: sorted list of pending requests.

8236

+  * @next_rq: if fifo isn't expired, next request to serve.

8237

+  * @queued: nr of requests queued in @sort_list.

8238

+@@ -304,6 +306,26 @@ struct bfq_ttime {

8239

+  * @ttime: associated @bfq_ttime struct

8240

+  * @ioprio: per (request_queue, blkcg) ioprio.

8241

+  * @blkcg_id: id of the blkcg the related io_cq belongs to.

8242

++ * @wr_time_left: snapshot of the time left before weight raising ends

8243

++ *                for the sync queue associated to this process; this

8244

++ *		  snapshot is taken to remember this value while the weight

8245

++ *		  raising is suspended because the queue is merged with a

8246

++ *		  shared queue, and is used to set @raising_cur_max_time

8247

++ *		  when the queue is split from the shared queue and its

8248

++ *		  weight is raised again

8249

++ * @saved_idle_window: same purpose as the previous field for the idle

8250

++ *                     window

8251

++ * @saved_IO_bound: same purpose as the previous two fields for the I/O

8252

++ *                  bound classification of a queue

8253

++ * @saved_in_large_burst: same purpose as the previous fields for the

8254

++ *                        value of the field keeping the queue's belonging

8255

++ *                        to a large burst

8256

++ * @was_in_burst_list: true if the queue belonged to a burst list

8257

++ *                     before its merge with another cooperating queue

8258

++ * @cooperations: counter of consecutive successful queue merges undergone

8259

++ *                by any of the process' @bfq_queues

8260

++ * @failed_cooperations: counter of consecutive failed queue merges of any

8261

++ *                       of the process' @bfq_queues

8262

+  */

8263

+ struct bfq_io_cq {

8264

+ 	struct io_cq icq; /* must be the first member */

8265

+@@ -314,6 +336,16 @@ struct bfq_io_cq {

8266

+ #ifdef CONFIG_BFQ_GROUP_IOSCHED

8267

+ 	uint64_t blkcg_id; /* the current blkcg ID */

8268

+ #endif

8269

++

8270

++	unsigned int wr_time_left;

8271

++	bool saved_idle_window;

8272

++	bool saved_IO_bound;

8273

++

8274

++	bool saved_in_large_burst;

8275

++	bool was_in_burst_list;

8276

++

8277

++	unsigned int cooperations;

8278

++	unsigned int failed_cooperations;

8279

+ };

8280

+

8281

+ enum bfq_device_speed {

8282

+@@ -557,6 +589,9 @@ enum bfqq_state_flags {

8283

+ 					 * may need softrt-next-start

8284

+ 					 * update

8285

+ 					 */

8286

++	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */

8287

++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */

8288

++	BFQ_BFQQ_FLAG_just_split,	/* queue has just been split */

8289

+ };

8290

+

8291

+ #define BFQ_BFQQ_FNS(name)						\

8292

+@@ -583,6 +618,9 @@ BFQ_BFQQ_FNS(budget_new);

8293

+ BFQ_BFQQ_FNS(IO_bound);

8294

+ BFQ_BFQQ_FNS(in_large_burst);

8295

+ BFQ_BFQQ_FNS(constantly_seeky);

8296

++BFQ_BFQQ_FNS(coop);

8297

++BFQ_BFQQ_FNS(split_coop);

8298

++BFQ_BFQQ_FNS(just_split);

8299

+ BFQ_BFQQ_FNS(softrt_update);

8300

+ #undef BFQ_BFQQ_FNS

8301

+

8302

+@@ -675,6 +713,9 @@ struct bfq_group_data {

8303

+  *                   are groups with more than one active @bfq_entity

8304

+  *                   (see the comments to the function

8305

+  *                   bfq_bfqq_must_not_expire()).

8306

++ * @rq_pos_tree: rbtree sorted by next_request position, used when

8307

++ *               determining if two or more queues have interleaving

8308

++ *               requests (see bfq_find_close_cooperator()).

8309

+  *

8310

+  * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup

8311

+  * there is a set of bfq_groups, each one collecting the lower-level

8312

+@@ -701,6 +742,8 @@ struct bfq_group {

8313

+

8314

+ 	int active_entities;

8315

+

8316

++	struct rb_root rq_pos_tree;

8317

++

8318

+ 	struct bfqg_stats stats;

8319

+ 	struct bfqg_stats dead_stats;	/* stats pushed from dead children */

8320

+ };

8321

+@@ -711,6 +754,8 @@ struct bfq_group {

8322

+

8323

+ 	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];

8324

+ 	struct bfq_queue *async_idle_bfqq;

8325

++

8326

++	struct rb_root rq_pos_tree;

8327

+ };

8328

+ #endif

8329

+

8330

+@@ -787,6 +832,27 @@ static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags)

8331

+ 	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

8332

+ }

8333

+

8334

++#ifdef CONFIG_BFQ_GROUP_IOSCHED

8335

++

8336

++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)

8337

++{

8338

++	struct bfq_entity *group_entity = bfqq->entity.parent;

8339

++

8340

++	if (!group_entity)

8341

++		group_entity = &bfqq->bfqd->root_group->entity;

8342

++

8343

++	return container_of(group_entity, struct bfq_group, entity);

8344

++}

8345

++

8346

++#else

8347

++

8348

++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)

8349

++{

8350

++	return bfqq->bfqd->root_group;

8351

++}

8352

++

8353

++#endif

8354

++

8355

+ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);

8356

+ static void bfq_put_queue(struct bfq_queue *bfqq);

8357

+ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);

8358

+--

8359

+1.9.1

8360

+

Gentoo Archives: gentoo-commits