[gentoo-commits] proj/linux-patches:4.0 commit in: / - gentoo-commits

From:	Mike Pagano <mpagano@g.o>
To:	gentoo-commits@l.g.o
Subject:	[gentoo-commits] proj/linux-patches:4.0 commit in: /
Date:	Wed, 29 Apr 2015 17:33:48
Message-Id:	`1430328103.f8edf410c4ddd523917f01dfbef4378b4ad4c1b0.mpagano@gentoo`

1

commit:     f8edf410c4ddd523917f01dfbef4378b4ad4c1b0

2

Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>

3

AuthorDate: Wed Apr 29 17:21:43 2015 +0000

4

Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>

5

CommitDate: Wed Apr 29 17:21:43 2015 +0000

6

URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=f8edf410

7

8

BFQ patchset for 4.0, v7r7.

9

10

 0000_README                                        |   12 +

11

 ...roups-kconfig-build-bits-for-BFQ-v7r7-4.0.patch |  104 +

12

 ...introduce-the-BFQ-v7r7-I-O-sched-for-4.0.patch1 | 6966 ++++++++++++++++++++

13

 ...rly-Queue-Merge-EQM-to-BFQ-v7r7-for-4.0.0.patch | 1222 ++++

14

 4 files changed, 8304 insertions(+)

15

16

diff --git a/0000_README b/0000_README

17

index 483ca42..bcce967 100644

18

--- a/0000_README

19

+++ b/0000_README

20

@@ -83,6 +83,18 @@ Patch:  5000_enable-additional-cpu-optimizations-for-gcc.patch

21

 From:   https://github.com/graysky2/kernel_gcc_patch/

22

 Desc:   Kernel patch enables gcc < v4.9 optimizations for additional CPUs.

23

24

+Patch:  5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r7-4.0.patch

25

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

26

+Desc:   BFQ v7r7 patch 1 for 4.0: Build, cgroups and kconfig bits

27

+

28

+Patch:  5002_block-introduce-the-BFQ-v7r7-I-O-sched-for-4.0.patch1

29

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

30

+Desc:   BFQ v7r7 patch 2 for 4.0: BFQ Scheduler

31

+

32

+Patch:  5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r7-for-4.0.0.patch

33

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

34

+Desc:   BFQ v7r7 patch 3 for 4.0: Early Queue Merge (EQM)

35

+

36

 Patch:  5010_enable-additional-cpu-optimizations-for-gcc-4.9.patch

37

 From:   https://github.com/graysky2/kernel_gcc_patch/

38

 Desc:   Kernel patch enables gcc >= v4.9 optimizations for additional CPUs.

39

40

diff --git a/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r7-4.0.patch b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r7-4.0.patch

41

new file mode 100644

42

index 0000000..468d157

43

--- /dev/null

44

+++ b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r7-4.0.patch

45

@@ -0,0 +1,104 @@

46

+From 63e26848e2df36a3c29d2d38ce8b008539d64a5d Mon Sep 17 00:00:00 2001

47

+From: Paolo Valente <paolo.valente@×××××××.it>

48

+Date: Tue, 7 Apr 2015 13:39:12 +0200

49

+Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r7-4.0

50

+

51

+Update Kconfig.iosched and do the related Makefile changes to include

52

+kernel configuration options for BFQ. Also add the bfqio controller

53

+to the cgroups subsystem.

54

+

55

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

56

+Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>

57

+---

58

+ block/Kconfig.iosched         | 32 ++++++++++++++++++++++++++++++++

59

+ block/Makefile                |  1 +

60

+ include/linux/cgroup_subsys.h |  4 ++++

61

+ 3 files changed, 37 insertions(+)

62

+

63

+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched

64

+index 421bef9..0ee5f0f 100644

65

+--- a/block/Kconfig.iosched

66

++++ b/block/Kconfig.iosched

67

+@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED

68

+ 	---help---

69

+ 	  Enable group IO scheduling in CFQ.

70

+

71

++config IOSCHED_BFQ

72

++	tristate "BFQ I/O scheduler"

73

++	default n

74

++	---help---

75

++	  The BFQ I/O scheduler tries to distribute bandwidth among

76

++	  all processes according to their weights.

77

++	  It aims at distributing the bandwidth as desired, independently of

78

++	  the disk parameters and with any workload. It also tries to

79

++	  guarantee low latency to interactive and soft real-time

80

++	  applications. If compiled built-in (saying Y here), BFQ can

81

++	  be configured to support hierarchical scheduling.

82

++

83

++config CGROUP_BFQIO

84

++	bool "BFQ hierarchical scheduling support"

85

++	depends on CGROUPS && IOSCHED_BFQ=y

86

++	default n

87

++	---help---

88

++	  Enable hierarchical scheduling in BFQ, using the cgroups

89

++	  filesystem interface.  The name of the subsystem will be

90

++	  bfqio.

91

++

92

+ choice

93

+ 	prompt "Default I/O scheduler"

94

+ 	default DEFAULT_CFQ

95

+@@ -52,6 +73,16 @@ choice

96

+ 	config DEFAULT_CFQ

97

+ 		bool "CFQ" if IOSCHED_CFQ=y

98

+

99

++	config DEFAULT_BFQ

100

++		bool "BFQ" if IOSCHED_BFQ=y

101

++		help

102

++		  Selects BFQ as the default I/O scheduler which will be

103

++		  used by default for all block devices.

104

++		  The BFQ I/O scheduler aims at distributing the bandwidth

105

++		  as desired, independently of the disk parameters and with

106

++		  any workload. It also tries to guarantee low latency to

107

++		  interactive and soft real-time applications.

108

++

109

+ 	config DEFAULT_NOOP

110

+ 		bool "No-op"

111

+

112

+@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED

113

+ 	string

114

+ 	default "deadline" if DEFAULT_DEADLINE

115

+ 	default "cfq" if DEFAULT_CFQ

116

++	default "bfq" if DEFAULT_BFQ

117

+ 	default "noop" if DEFAULT_NOOP

118

+

119

+ endmenu

120

+diff --git a/block/Makefile b/block/Makefile

121

+index 00ecc97..1ed86d5 100644

122

+--- a/block/Makefile

123

++++ b/block/Makefile

124

+@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o

125

+ obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o

126

+ obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o

127

+ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o

128

++obj-$(CONFIG_IOSCHED_BFQ)	+= bfq-iosched.o

129

+

130

+ obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o

131

+ obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o

132

+diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h

133

+index e4a96fb..267d681 100644

134

+--- a/include/linux/cgroup_subsys.h

135

++++ b/include/linux/cgroup_subsys.h

136

+@@ -35,6 +35,10 @@ SUBSYS(freezer)

137

+ SUBSYS(net_cls)

138

+ #endif

139

+

140

++#if IS_ENABLED(CONFIG_CGROUP_BFQIO)

141

++SUBSYS(bfqio)

142

++#endif

143

++

144

+ #if IS_ENABLED(CONFIG_CGROUP_PERF)

145

+ SUBSYS(perf_event)

146

+ #endif

147

+--

148

+2.1.0

149

+

150

151

diff --git a/5002_block-introduce-the-BFQ-v7r7-I-O-sched-for-4.0.patch1 b/5002_block-introduce-the-BFQ-v7r7-I-O-sched-for-4.0.patch1

152

new file mode 100644

153

index 0000000..a6cfc58

154

--- /dev/null

155

+++ b/5002_block-introduce-the-BFQ-v7r7-I-O-sched-for-4.0.patch1

156

@@ -0,0 +1,6966 @@

157

+From 8cdf2dae6ee87049c7bb086d34e2ce981b545813 Mon Sep 17 00:00:00 2001

158

+From: Paolo Valente <paolo.valente@×××××××.it>

159

+Date: Thu, 9 May 2013 19:10:02 +0200

160

+Subject: [PATCH 2/3] block: introduce the BFQ-v7r7 I/O sched for 4.0

161

+

162

+Add the BFQ-v7r7 I/O scheduler to 4.0.

163

+The general structure is borrowed from CFQ, as much of the code for

164

+handling I/O contexts. Over time, several useful features have been

165

+ported from CFQ as well (details in the changelog in README.BFQ). A

166

+(bfq_)queue is associated to each task doing I/O on a device, and each

167

+time a scheduling decision has to be made a queue is selected and served

168

+until it expires.

169

+

170

+    - Slices are given in the service domain: tasks are assigned

171

+      budgets, measured in number of sectors. Once it gets the disk, a task

172

+      must however consume its assigned budget within a configurable

173

+      maximum time (by default, the maximum possible value of the

174

+      budgets is automatically computed to comply with this timeout).

175

+      This allows the desired latency vs "throughput boosting" tradeoff

176

+      to be set.

177

+

178

+    - Budgets are scheduled according to a variant of WF2Q+, implemented

179

+      using an augmented rb-tree to take eligibility into account while

180

+      preserving an O(log N) overall complexity.

181

+

182

+    - A low-latency tunable is provided; if enabled, both interactive

183

+      and soft real-time applications are guaranteed a very low latency.

184

+

185

+    - Latency guarantees are preserved also in the presence of NCQ.

186

+

187

+    - Also with flash-based devices, a high throughput is achieved

188

+      while still preserving latency guarantees.

189

+

190

+    - BFQ features Early Queue Merge (EQM), a sort of fusion of the

191

+      cooperating-queue-merging and the preemption mechanisms present

192

+      in CFQ. EQM is in fact a unified mechanism that tries to get a

193

+      sequential read pattern, and hence a high throughput, with any

194

+      set of processes performing interleaved I/O over a contiguous

195

+      sequence of sectors.

196

+

197

+    - BFQ supports full hierarchical scheduling, exporting a cgroups

198

+      interface.  Since each node has a full scheduler, each group can

199

+      be assigned its own weight.

200

+

201

+    - If the cgroups interface is not used, only I/O priorities can be

202

+      assigned to processes, with ioprio values mapped to weights

203

+      with the relation weight = IOPRIO_BE_NR - ioprio.

204

+

205

+    - ioprio classes are served in strict priority order, i.e., lower

206

+      priority queues are not served as long as there are higher

207

+      priority queues.  Among queues in the same class the bandwidth is

208

+      distributed in proportion to the weight of each queue. A very

209

+      thin extra bandwidth is however guaranteed to the Idle class, to

210

+      prevent it from starving.

211

+

212

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

213

+Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>

214

+---

215

+ block/bfq-cgroup.c  |  936 ++++++++++++

216

+ block/bfq-ioc.c     |   36 +

217

+ block/bfq-iosched.c | 3902 +++++++++++++++++++++++++++++++++++++++++++++++++++

218

+ block/bfq-sched.c   | 1214 ++++++++++++++++

219

+ block/bfq.h         |  775 ++++++++++

220

+ 5 files changed, 6863 insertions(+)

221

+ create mode 100644 block/bfq-cgroup.c

222

+ create mode 100644 block/bfq-ioc.c

223

+ create mode 100644 block/bfq-iosched.c

224

+ create mode 100644 block/bfq-sched.c

225

+ create mode 100644 block/bfq.h

226

+

227

+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c

228

+new file mode 100644

229

+index 0000000..11e2f1d

230

+--- /dev/null

231

++++ b/block/bfq-cgroup.c

232

+@@ -0,0 +1,936 @@

233

++/*

234

++ * BFQ: CGROUPS support.

235

++ *

236

++ * Based on ideas and code from CFQ:

237

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

238

++ *

239

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

240

++ *		      Paolo Valente <paolo.valente@×××××××.it>

241

++ *

242

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

243

++ *

244

++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ

245

++ * file.

246

++ */

247

++

248

++#ifdef CONFIG_CGROUP_BFQIO

249

++

250

++static DEFINE_MUTEX(bfqio_mutex);

251

++

252

++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)

253

++{

254

++	return bgrp ? !bgrp->online : false;

255

++}

256

++

257

++static struct bfqio_cgroup bfqio_root_cgroup = {

258

++	.weight = BFQ_DEFAULT_GRP_WEIGHT,

259

++	.ioprio = BFQ_DEFAULT_GRP_IOPRIO,

260

++	.ioprio_class = BFQ_DEFAULT_GRP_CLASS,

261

++};

262

++

263

++static inline void bfq_init_entity(struct bfq_entity *entity,

264

++				   struct bfq_group *bfqg)

265

++{

266

++	entity->weight = entity->new_weight;

267

++	entity->orig_weight = entity->new_weight;

268

++	entity->ioprio = entity->new_ioprio;

269

++	entity->ioprio_class = entity->new_ioprio_class;

270

++	entity->parent = bfqg->my_entity;

271

++	entity->sched_data = &bfqg->sched_data;

272

++}

273

++

274

++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)

275

++{

276

++	return css ? container_of(css, struct bfqio_cgroup, css) : NULL;

277

++}

278

++

279

++/*

280

++ * Search the bfq_group for bfqd in the hash table (by now only a list)

281

++ * of bgrp.  Must be called under rcu_read_lock().

282

++ */

283

++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,

284

++					    struct bfq_data *bfqd)

285

++{

286

++	struct bfq_group *bfqg;

287

++	void *key;

288

++

289

++	hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {

290

++		key = rcu_dereference(bfqg->bfqd);

291

++		if (key == bfqd)

292

++			return bfqg;

293

++	}

294

++

295

++	return NULL;

296

++}

297

++

298

++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,

299

++					 struct bfq_group *bfqg)

300

++{

301

++	struct bfq_entity *entity = &bfqg->entity;

302

++

303

++	/*

304

++	 * If the weight of the entity has never been set via the sysfs

305

++	 * interface, then bgrp->weight == 0. In this case we initialize

306

++	 * the weight from the current ioprio value. Otherwise, the group

307

++	 * weight, if set, has priority over the ioprio value.

308

++	 */

309

++	if (bgrp->weight == 0) {

310

++		entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);

311

++		entity->new_ioprio = bgrp->ioprio;

312

++	} else {

313

++		if (bgrp->weight < BFQ_MIN_WEIGHT ||

314

++		    bgrp->weight > BFQ_MAX_WEIGHT) {

315

++			printk(KERN_CRIT "bfq_group_init_entity: "

316

++					 "bgrp->weight %d\n", bgrp->weight);

317

++			BUG();

318

++		}

319

++		entity->new_weight = bgrp->weight;

320

++		entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);

321

++	}

322

++	entity->orig_weight = entity->weight = entity->new_weight;

323

++	entity->ioprio = entity->new_ioprio;

324

++	entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;

325

++	entity->my_sched_data = &bfqg->sched_data;

326

++	bfqg->active_entities = 0;

327

++}

328

++

329

++static inline void bfq_group_set_parent(struct bfq_group *bfqg,

330

++					struct bfq_group *parent)

331

++{

332

++	struct bfq_entity *entity;

333

++

334

++	BUG_ON(parent == NULL);

335

++	BUG_ON(bfqg == NULL);

336

++

337

++	entity = &bfqg->entity;

338

++	entity->parent = parent->my_entity;

339

++	entity->sched_data = &parent->sched_data;

340

++}

341

++

342

++/**

343

++ * bfq_group_chain_alloc - allocate a chain of groups.

344

++ * @bfqd: queue descriptor.

345

++ * @css: the leaf cgroup_subsys_state this chain starts from.

346

++ *

347

++ * Allocate a chain of groups starting from the one belonging to

348

++ * @cgroup up to the root cgroup.  Stop if a cgroup on the chain

349

++ * to the root has already an allocated group on @bfqd.

350

++ */

351

++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,

352

++					       struct cgroup_subsys_state *css)

353

++{

354

++	struct bfqio_cgroup *bgrp;

355

++	struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;

356

++

357

++	for (; css != NULL; css = css->parent) {

358

++		bgrp = css_to_bfqio(css);

359

++

360

++		bfqg = bfqio_lookup_group(bgrp, bfqd);

361

++		if (bfqg != NULL) {

362

++			/*

363

++			 * All the cgroups in the path from there to the

364

++			 * root must have a bfq_group for bfqd, so we don't

365

++			 * need any more allocations.

366

++			 */

367

++			break;

368

++		}

369

++

370

++		bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);

371

++		if (bfqg == NULL)

372

++			goto cleanup;

373

++

374

++		bfq_group_init_entity(bgrp, bfqg);

375

++		bfqg->my_entity = &bfqg->entity;

376

++

377

++		if (leaf == NULL) {

378

++			leaf = bfqg;

379

++			prev = leaf;

380

++		} else {

381

++			bfq_group_set_parent(prev, bfqg);

382

++			/*

383

++			 * Build a list of allocated nodes using the bfqd

384

++			 * field, that is still unused and will be

385

++			 * initialized only after the node will be

386

++			 * connected.

387

++			 */

388

++			prev->bfqd = bfqg;

389

++			prev = bfqg;

390

++		}

391

++	}

392

++

393

++	return leaf;

394

++

395

++cleanup:

396

++	while (leaf != NULL) {

397

++		prev = leaf;

398

++		leaf = leaf->bfqd;

399

++		kfree(prev);

400

++	}

401

++

402

++	return NULL;

403

++}

404

++

405

++/**

406

++ * bfq_group_chain_link - link an allocated group chain to a cgroup

407

++ *                        hierarchy.

408

++ * @bfqd: the queue descriptor.

409

++ * @css: the leaf cgroup_subsys_state to start from.

410

++ * @leaf: the leaf group (to be associated to @cgroup).

411

++ *

412

++ * Try to link a chain of groups to a cgroup hierarchy, connecting the

413

++ * nodes bottom-up, so we can be sure that when we find a cgroup in the

414

++ * hierarchy that already has a group associated to @bfqd all the nodes

415

++ * in the path to the root cgroup have one too.

416

++ *

417

++ * On locking: the queue lock protects the hierarchy (there is a hierarchy

418

++ * per device) while the bfqio_cgroup lock protects the list of groups

419

++ * belonging to the same cgroup.

420

++ */

421

++static void bfq_group_chain_link(struct bfq_data *bfqd,

422

++				 struct cgroup_subsys_state *css,

423

++				 struct bfq_group *leaf)

424

++{

425

++	struct bfqio_cgroup *bgrp;

426

++	struct bfq_group *bfqg, *next, *prev = NULL;

427

++	unsigned long flags;

428

++

429

++	assert_spin_locked(bfqd->queue->queue_lock);

430

++

431

++	for (; css != NULL && leaf != NULL; css = css->parent) {

432

++		bgrp = css_to_bfqio(css);

433

++		next = leaf->bfqd;

434

++

435

++		bfqg = bfqio_lookup_group(bgrp, bfqd);

436

++		BUG_ON(bfqg != NULL);

437

++

438

++		spin_lock_irqsave(&bgrp->lock, flags);

439

++

440

++		rcu_assign_pointer(leaf->bfqd, bfqd);

441

++		hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);

442

++		hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);

443

++

444

++		spin_unlock_irqrestore(&bgrp->lock, flags);

445

++

446

++		prev = leaf;

447

++		leaf = next;

448

++	}

449

++

450

++	BUG_ON(css == NULL && leaf != NULL);

451

++	if (css != NULL && prev != NULL) {

452

++		bgrp = css_to_bfqio(css);

453

++		bfqg = bfqio_lookup_group(bgrp, bfqd);

454

++		bfq_group_set_parent(prev, bfqg);

455

++	}

456

++}

457

++

458

++/**

459

++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.

460

++ * @bfqd: queue descriptor.

461

++ * @css: the cgroup_subsys_state of the cgroup being searched for.

462

++ *

463

++ * Return a group associated to @bfqd in @cgroup, allocating one if

464

++ * necessary.  When a group is returned all the cgroups in the path

465

++ * to the root have a group associated to @bfqd.

466

++ *

467

++ * If the allocation fails, return the root group: this breaks guarantees

468

++ * but is a safe fallback.  If this loss becomes a problem it can be

469

++ * mitigated using the equivalent weight (given by the product of the

470

++ * weights of the groups in the path from @group to the root) in the

471

++ * root scheduler.

472

++ *

473

++ * We allocate all the missing nodes in the path from the leaf cgroup

474

++ * to the root and we connect the nodes only after all the allocations

475

++ * have been successful.

476

++ */

477

++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,

478

++					      struct cgroup_subsys_state *css)

479

++{

480

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);

481

++	struct bfq_group *bfqg;

482

++

483

++	bfqg = bfqio_lookup_group(bgrp, bfqd);

484

++	if (bfqg != NULL)

485

++		return bfqg;

486

++

487

++	bfqg = bfq_group_chain_alloc(bfqd, css);

488

++	if (bfqg != NULL)

489

++		bfq_group_chain_link(bfqd, css, bfqg);

490

++	else

491

++		bfqg = bfqd->root_group;

492

++

493

++	return bfqg;

494

++}

495

++

496

++/**

497

++ * bfq_bfqq_move - migrate @bfqq to @bfqg.

498

++ * @bfqd: queue descriptor.

499

++ * @bfqq: the queue to move.

500

++ * @entity: @bfqq's entity.

501

++ * @bfqg: the group to move to.

502

++ *

503

++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating

504

++ * it on the new one.  Avoid putting the entity on the old group idle tree.

505

++ *

506

++ * Must be called under the queue lock; the cgroup owning @bfqg must

507

++ * not disappear (by now this just means that we are called under

508

++ * rcu_read_lock()).

509

++ */

510

++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,

511

++			  struct bfq_entity *entity, struct bfq_group *bfqg)

512

++{

513

++	int busy, resume;

514

++

515

++	busy = bfq_bfqq_busy(bfqq);

516

++	resume = !RB_EMPTY_ROOT(&bfqq->sort_list);

517

++

518

++	BUG_ON(resume && !entity->on_st);

519

++	BUG_ON(busy && !resume && entity->on_st &&

520

++	       bfqq != bfqd->in_service_queue);

521

++

522

++	if (busy) {

523

++		BUG_ON(atomic_read(&bfqq->ref) < 2);

524

++

525

++		if (!resume)

526

++			bfq_del_bfqq_busy(bfqd, bfqq, 0);

527

++		else

528

++			bfq_deactivate_bfqq(bfqd, bfqq, 0);

529

++	} else if (entity->on_st)

530

++		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);

531

++

532

++	/*

533

++	 * Here we use a reference to bfqg.  We don't need a refcounter

534

++	 * as the cgroup reference will not be dropped, so that its

535

++	 * destroy() callback will not be invoked.

536

++	 */

537

++	entity->parent = bfqg->my_entity;

538

++	entity->sched_data = &bfqg->sched_data;

539

++

540

++	if (busy && resume)

541

++		bfq_activate_bfqq(bfqd, bfqq);

542

++

543

++	if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)

544

++		bfq_schedule_dispatch(bfqd);

545

++}

546

++

547

++/**

548

++ * __bfq_bic_change_cgroup - move @bic to @cgroup.

549

++ * @bfqd: the queue descriptor.

550

++ * @bic: the bic to move.

551

++ * @css: the cgroup_subsys_state of the cgroup to move to.

552

++ *

553

++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller

554

++ * has to make sure that the reference to cgroup is valid across the call.

555

++ *

556

++ * NOTE: an alternative approach might have been to store the current

557

++ * cgroup in bfqq and getting a reference to it, reducing the lookup

558

++ * time here, at the price of slightly more complex code.

559

++ */

560

++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,

561

++						struct bfq_io_cq *bic,

562

++						struct cgroup_subsys_state *css)

563

++{

564

++	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);

565

++	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);

566

++	struct bfq_entity *entity;

567

++	struct bfq_group *bfqg;

568

++	struct bfqio_cgroup *bgrp;

569

++

570

++	bgrp = css_to_bfqio(css);

571

++

572

++	bfqg = bfq_find_alloc_group(bfqd, css);

573

++	if (async_bfqq != NULL) {

574

++		entity = &async_bfqq->entity;

575

++

576

++		if (entity->sched_data != &bfqg->sched_data) {

577

++			bic_set_bfqq(bic, NULL, 0);

578

++			bfq_log_bfqq(bfqd, async_bfqq,

579

++				     "bic_change_group: %p %d",

580

++				     async_bfqq, atomic_read(&async_bfqq->ref));

581

++			bfq_put_queue(async_bfqq);

582

++		}

583

++	}

584

++

585

++	if (sync_bfqq != NULL) {

586

++		entity = &sync_bfqq->entity;

587

++		if (entity->sched_data != &bfqg->sched_data)

588

++			bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);

589

++	}

590

++

591

++	return bfqg;

592

++}

593

++

594

++/**

595

++ * bfq_bic_change_cgroup - move @bic to @cgroup.

596

++ * @bic: the bic being migrated.

597

++ * @css: the cgroup_subsys_state of the destination cgroup.

598

++ *

599

++ * When the task owning @bic is moved to @cgroup, @bic is immediately

600

++ * moved into its new parent group.

601

++ */

602

++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,

603

++				  struct cgroup_subsys_state *css)

604

++{

605

++	struct bfq_data *bfqd;

606

++	unsigned long uninitialized_var(flags);

607

++

608

++	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),

609

++				   &flags);

610

++	if (bfqd != NULL) {

611

++		__bfq_bic_change_cgroup(bfqd, bic, css);

612

++		bfq_put_bfqd_unlock(bfqd, &flags);

613

++	}

614

++}

615

++

616

++/**

617

++ * bfq_bic_update_cgroup - update the cgroup of @bic.

618

++ * @bic: the @bic to update.

619

++ *

620

++ * Make sure that @bic is enqueued in the cgroup of the current task.

621

++ * We need this in addition to moving bics during the cgroup attach

622

++ * phase because the task owning @bic could be at its first disk

623

++ * access or we may end up in the root cgroup as the result of a

624

++ * memory allocation failure and here we try to move to the right

625

++ * group.

626

++ *

627

++ * Must be called under the queue lock.  It is safe to use the returned

628

++ * value even after the rcu_read_unlock() as the migration/destruction

629

++ * paths act under the queue lock too.  IOW it is impossible to race with

630

++ * group migration/destruction and end up with an invalid group as:

631

++ *   a) here cgroup has not yet been destroyed, nor its destroy callback

632

++ *      has started execution, as current holds a reference to it,

633

++ *   b) if it is destroyed after rcu_read_unlock() [after current is

634

++ *      migrated to a different cgroup] its attach() callback will have

635

++ *      taken care of removing all the references to the old cgroup data.

636

++ */

637

++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)

638

++{

639

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

640

++	struct bfq_group *bfqg;

641

++	struct cgroup_subsys_state *css;

642

++

643

++	BUG_ON(bfqd == NULL);

644

++

645

++	rcu_read_lock();

646

++	css = task_css(current, bfqio_cgrp_id);

647

++	bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);

648

++	rcu_read_unlock();

649

++

650

++	return bfqg;

651

++}

652

++

653

++/**

654

++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.

655

++ * @st: the service tree being flushed.

656

++ */

657

++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)

658

++{

659

++	struct bfq_entity *entity = st->first_idle;

660

++

661

++	for (; entity != NULL; entity = st->first_idle)

662

++		__bfq_deactivate_entity(entity, 0);

663

++}

664

++

665

++/**

666

++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.

667

++ * @bfqd: the device data structure with the root group.

668

++ * @entity: the entity to move.

669

++ */

670

++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,

671

++					    struct bfq_entity *entity)

672

++{

673

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

674

++

675

++	BUG_ON(bfqq == NULL);

676

++	bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);

677

++	return;

678

++}

679

++

680

++/**

681

++ * bfq_reparent_active_entities - move to the root group all active

682

++ *                                entities.

683

++ * @bfqd: the device data structure with the root group.

684

++ * @bfqg: the group to move from.

685

++ * @st: the service tree with the entities.

686

++ *

687

++ * Needs queue_lock to be taken and reference to be valid over the call.

688

++ */

689

++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,

690

++						struct bfq_group *bfqg,

691

++						struct bfq_service_tree *st)

692

++{

693

++	struct rb_root *active = &st->active;

694

++	struct bfq_entity *entity = NULL;

695

++

696

++	if (!RB_EMPTY_ROOT(&st->active))

697

++		entity = bfq_entity_of(rb_first(active));

698

++

699

++	for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))

700

++		bfq_reparent_leaf_entity(bfqd, entity);

701

++

702

++	if (bfqg->sched_data.in_service_entity != NULL)

703

++		bfq_reparent_leaf_entity(bfqd,

704

++			bfqg->sched_data.in_service_entity);

705

++

706

++	return;

707

++}

708

++

709

++/**

710

++ * bfq_destroy_group - destroy @bfqg.

711

++ * @bgrp: the bfqio_cgroup containing @bfqg.

712

++ * @bfqg: the group being destroyed.

713

++ *

714

++ * Destroy @bfqg, making sure that it is not referenced from its parent.

715

++ */

716

++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)

717

++{

718

++	struct bfq_data *bfqd;

719

++	struct bfq_service_tree *st;

720

++	struct bfq_entity *entity = bfqg->my_entity;

721

++	unsigned long uninitialized_var(flags);

722

++	int i;

723

++

724

++	hlist_del(&bfqg->group_node);

725

++

726

++	/*

727

++	 * Empty all service_trees belonging to this group before

728

++	 * deactivating the group itself.

729

++	 */

730

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {

731

++		st = bfqg->sched_data.service_tree + i;

732

++

733

++		/*

734

++		 * The idle tree may still contain bfq_queues belonging

735

++		 * to exited tasks because they never migrated to a different

736

++		 * cgroup from the one being destroyed now.  No one else

737

++		 * can access them so it's safe to act without any lock.

738

++		 */

739

++		bfq_flush_idle_tree(st);

740

++

741

++		/*

742

++		 * It may happen that some queues are still active

743

++		 * (busy) upon group destruction (if the corresponding

744

++		 * processes have been forced to terminate). We move

745

++		 * all the leaf entities corresponding to these queues

746

++		 * to the root_group.

747

++		 * Also, it may happen that the group has an entity

748

++		 * in service, which is disconnected from the active

749

++		 * tree: it must be moved, too.

750

++		 * There is no need to put the sync queues, as the

751

++		 * scheduler has taken no reference.

752

++		 */

753

++		bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);

754

++		if (bfqd != NULL) {

755

++			bfq_reparent_active_entities(bfqd, bfqg, st);

756

++			bfq_put_bfqd_unlock(bfqd, &flags);

757

++		}

758

++		BUG_ON(!RB_EMPTY_ROOT(&st->active));

759

++		BUG_ON(!RB_EMPTY_ROOT(&st->idle));

760

++	}

761

++	BUG_ON(bfqg->sched_data.next_in_service != NULL);

762

++	BUG_ON(bfqg->sched_data.in_service_entity != NULL);

763

++

764

++	/*

765

++	 * We may race with device destruction, take extra care when

766

++	 * dereferencing bfqg->bfqd.

767

++	 */

768

++	bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);

769

++	if (bfqd != NULL) {

770

++		hlist_del(&bfqg->bfqd_node);

771

++		__bfq_deactivate_entity(entity, 0);

772

++		bfq_put_async_queues(bfqd, bfqg);

773

++		bfq_put_bfqd_unlock(bfqd, &flags);

774

++	}

775

++	BUG_ON(entity->tree != NULL);

776

++

777

++	/*

778

++	 * No need to defer the kfree() to the end of the RCU grace

779

++	 * period: we are called from the destroy() callback of our

780

++	 * cgroup, so we can be sure that no one is a) still using

781

++	 * this cgroup or b) doing lookups in it.

782

++	 */

783

++	kfree(bfqg);

784

++}

785

++

786

++static void bfq_end_wr_async(struct bfq_data *bfqd)

787

++{

788

++	struct hlist_node *tmp;

789

++	struct bfq_group *bfqg;

790

++

791

++	hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)

792

++		bfq_end_wr_async_queues(bfqd, bfqg);

793

++	bfq_end_wr_async_queues(bfqd, bfqd->root_group);

794

++}

795

++

796

++/**

797

++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.

798

++ * @bfqd: the device descriptor being exited.

799

++ *

800

++ * When the device exits we just make sure that no lookup can return

801

++ * the now unused group structures.  They will be deallocated on cgroup

802

++ * destruction.

803

++ */

804

++static void bfq_disconnect_groups(struct bfq_data *bfqd)

805

++{

806

++	struct hlist_node *tmp;

807

++	struct bfq_group *bfqg;

808

++

809

++	bfq_log(bfqd, "disconnect_groups beginning");

810

++	hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {

811

++		hlist_del(&bfqg->bfqd_node);

812

++

813

++		__bfq_deactivate_entity(bfqg->my_entity, 0);

814

++

815

++		/*

816

++		 * Don't remove from the group hash, just set an

817

++		 * invalid key.  No lookups can race with the

818

++		 * assignment as bfqd is being destroyed; this

819

++		 * implies also that new elements cannot be added

820

++		 * to the list.

821

++		 */

822

++		rcu_assign_pointer(bfqg->bfqd, NULL);

823

++

824

++		bfq_log(bfqd, "disconnect_groups: put async for group %p",

825

++			bfqg);

826

++		bfq_put_async_queues(bfqd, bfqg);

827

++	}

828

++}

829

++

830

++static inline void bfq_free_root_group(struct bfq_data *bfqd)

831

++{

832

++	struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;

833

++	struct bfq_group *bfqg = bfqd->root_group;

834

++

835

++	bfq_put_async_queues(bfqd, bfqg);

836

++

837

++	spin_lock_irq(&bgrp->lock);

838

++	hlist_del_rcu(&bfqg->group_node);

839

++	spin_unlock_irq(&bgrp->lock);

840

++

841

++	/*

842

++	 * No need to synchronize_rcu() here: since the device is gone

843

++	 * there cannot be any read-side access to its root_group.

844

++	 */

845

++	kfree(bfqg);

846

++}

847

++

848

++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)

849

++{

850

++	struct bfq_group *bfqg;

851

++	struct bfqio_cgroup *bgrp;

852

++	int i;

853

++

854

++	bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);

855

++	if (bfqg == NULL)

856

++		return NULL;

857

++

858

++	bfqg->entity.parent = NULL;

859

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)

860

++		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

861

++

862

++	bgrp = &bfqio_root_cgroup;

863

++	spin_lock_irq(&bgrp->lock);

864

++	rcu_assign_pointer(bfqg->bfqd, bfqd);

865

++	hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);

866

++	spin_unlock_irq(&bgrp->lock);

867

++

868

++	return bfqg;

869

++}

870

++

871

++#define SHOW_FUNCTION(__VAR)						\

872

++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \

873

++				       struct cftype *cftype)		\

874

++{									\

875

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);			\

876

++	u64 ret = -ENODEV;						\

877

++									\

878

++	mutex_lock(&bfqio_mutex);					\

879

++	if (bfqio_is_removed(bgrp))					\

880

++		goto out_unlock;					\

881

++									\

882

++	spin_lock_irq(&bgrp->lock);					\

883

++	ret = bgrp->__VAR;						\

884

++	spin_unlock_irq(&bgrp->lock);					\

885

++									\

886

++out_unlock:								\

887

++	mutex_unlock(&bfqio_mutex);					\

888

++	return ret;							\

889

++}

890

++

891

++SHOW_FUNCTION(weight);

892

++SHOW_FUNCTION(ioprio);

893

++SHOW_FUNCTION(ioprio_class);

894

++#undef SHOW_FUNCTION

895

++

896

++#define STORE_FUNCTION(__VAR, __MIN, __MAX)				\

897

++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\

898

++					struct cftype *cftype,		\

899

++					u64 val)			\

900

++{									\

901

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);			\

902

++	struct bfq_group *bfqg;						\

903

++	int ret = -EINVAL;						\

904

++									\

905

++	if (val < (__MIN) || val > (__MAX))				\

906

++		return ret;						\

907

++									\

908

++	ret = -ENODEV;							\

909

++	mutex_lock(&bfqio_mutex);					\

910

++	if (bfqio_is_removed(bgrp))					\

911

++		goto out_unlock;					\

912

++	ret = 0;							\

913

++									\

914

++	spin_lock_irq(&bgrp->lock);					\

915

++	bgrp->__VAR = (unsigned short)val;				\

916

++	hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) {	\

917

++		/*							\

918

++		 * Setting the ioprio_changed flag of the entity        \

919

++		 * to 1 with new_##__VAR == ##__VAR would re-set        \

920

++		 * the value of the weight to its ioprio mapping.       \

921

++		 * Set the flag only if necessary.			\

922

++		 */							\

923

++		if ((unsigned short)val != bfqg->entity.new_##__VAR) {  \

924

++			bfqg->entity.new_##__VAR = (unsigned short)val; \

925

++			/*						\

926

++			 * Make sure that the above new value has been	\

927

++			 * stored in bfqg->entity.new_##__VAR before	\

928

++			 * setting the ioprio_changed flag. In fact,	\

929

++			 * this flag may be read asynchronously (in	\

930

++			 * critical sections protected by a different	\

931

++			 * lock than that held here), and finding this	\

932

++			 * flag set may cause the execution of the code	\

933

++			 * for updating parameters whose value may	\

934

++			 * depend also on bfqg->entity.new_##__VAR (in	\

935

++			 * __bfq_entity_update_weight_prio).		\

936

++			 * This barrier makes sure that the new value	\

937

++			 * of bfqg->entity.new_##__VAR is correctly	\

938

++			 * seen in that code.				\

939

++			 */						\

940

++			smp_wmb();                                      \

941

++			bfqg->entity.ioprio_changed = 1;                \

942

++		}							\

943

++	}								\

944

++	spin_unlock_irq(&bgrp->lock);					\

945

++									\

946

++out_unlock:								\

947

++	mutex_unlock(&bfqio_mutex);					\

948

++	return ret;							\

949

++}

950

++

951

++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);

952

++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);

953

++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);

954

++#undef STORE_FUNCTION

955

++

956

++static struct cftype bfqio_files[] = {

957

++	{

958

++		.name = "weight",

959

++		.read_u64 = bfqio_cgroup_weight_read,

960

++		.write_u64 = bfqio_cgroup_weight_write,

961

++	},

962

++	{

963

++		.name = "ioprio",

964

++		.read_u64 = bfqio_cgroup_ioprio_read,

965

++		.write_u64 = bfqio_cgroup_ioprio_write,

966

++	},

967

++	{

968

++		.name = "ioprio_class",

969

++		.read_u64 = bfqio_cgroup_ioprio_class_read,

970

++		.write_u64 = bfqio_cgroup_ioprio_class_write,

971

++	},

972

++	{ },	/* terminate */

973

++};

974

++

975

++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state

976

++						*parent_css)

977

++{

978

++	struct bfqio_cgroup *bgrp;

979

++

980

++	if (parent_css != NULL) {

981

++		bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);

982

++		if (bgrp == NULL)

983

++			return ERR_PTR(-ENOMEM);

984

++	} else

985

++		bgrp = &bfqio_root_cgroup;

986

++

987

++	spin_lock_init(&bgrp->lock);

988

++	INIT_HLIST_HEAD(&bgrp->group_data);

989

++	bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;

990

++	bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;

991

++

992

++	return &bgrp->css;

993

++}

994

++

995

++/*

996

++ * We cannot support shared io contexts, as we have no means to support

997

++ * two tasks with the same ioc in two different groups without major rework

998

++ * of the main bic/bfqq data structures.  For now we allow a task to change

999

++ * its cgroup only if it's the only owner of its ioc; the drawback of this

1000

++ * behavior is that a group containing a task that forked using CLONE_IO

1001

++ * will not be destroyed until the tasks sharing the ioc die.

1002

++ */

1003

++static int bfqio_can_attach(struct cgroup_subsys_state *css,

1004

++			    struct cgroup_taskset *tset)

1005

++{

1006

++	struct task_struct *task;

1007

++	struct io_context *ioc;

1008

++	int ret = 0;

1009

++

1010

++	cgroup_taskset_for_each(task, tset) {

1011

++		/*

1012

++		 * task_lock() is needed to avoid races with

1013

++		 * exit_io_context()

1014

++		 */

1015

++		task_lock(task);

1016

++		ioc = task->io_context;

1017

++		if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)

1018

++			/*

1019

++			 * ioc == NULL means that the task is either too

1020

++			 * young or exiting: if it has still no ioc the

1021

++			 * ioc can't be shared, if the task is exiting the

1022

++			 * attach will fail anyway, no matter what we

1023

++			 * return here.

1024

++			 */

1025

++			ret = -EINVAL;

1026

++		task_unlock(task);

1027

++		if (ret)

1028

++			break;

1029

++	}

1030

++

1031

++	return ret;

1032

++}

1033

++

1034

++static void bfqio_attach(struct cgroup_subsys_state *css,

1035

++			 struct cgroup_taskset *tset)

1036

++{

1037

++	struct task_struct *task;

1038

++	struct io_context *ioc;

1039

++	struct io_cq *icq;

1040

++

1041

++	/*

1042

++	 * IMPORTANT NOTE: The move of more than one process at a time to a

1043

++	 * new group has not yet been tested.

1044

++	 */

1045

++	cgroup_taskset_for_each(task, tset) {

1046

++		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);

1047

++		if (ioc) {

1048

++			/*

1049

++			 * Handle cgroup change here.

1050

++			 */

1051

++			rcu_read_lock();

1052

++			hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)

1053

++				if (!strncmp(

1054

++					icq->q->elevator->type->elevator_name,

1055

++					"bfq", ELV_NAME_MAX))

1056

++					bfq_bic_change_cgroup(icq_to_bic(icq),

1057

++							      css);

1058

++			rcu_read_unlock();

1059

++			put_io_context(ioc);

1060

++		}

1061

++	}

1062

++}

1063

++

1064

++static void bfqio_destroy(struct cgroup_subsys_state *css)

1065

++{

1066

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);

1067

++	struct hlist_node *tmp;

1068

++	struct bfq_group *bfqg;

1069

++

1070

++	/*

1071

++	 * Since we are destroying the cgroup, there are no more tasks

1072

++	 * referencing it, and all the RCU grace periods that may have

1073

++	 * referenced it are ended (as the destruction of the parent

1074

++	 * cgroup is RCU-safe); bgrp->group_data will not be accessed by

1075

++	 * anything else and we don't need any synchronization.

1076

++	 */

1077

++	hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)

1078

++		bfq_destroy_group(bgrp, bfqg);

1079

++

1080

++	BUG_ON(!hlist_empty(&bgrp->group_data));

1081

++

1082

++	kfree(bgrp);

1083

++}

1084

++

1085

++static int bfqio_css_online(struct cgroup_subsys_state *css)

1086

++{

1087

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);

1088

++

1089

++	mutex_lock(&bfqio_mutex);

1090

++	bgrp->online = true;

1091

++	mutex_unlock(&bfqio_mutex);

1092

++

1093

++	return 0;

1094

++}

1095

++

1096

++static void bfqio_css_offline(struct cgroup_subsys_state *css)

1097

++{

1098

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);

1099

++

1100

++	mutex_lock(&bfqio_mutex);

1101

++	bgrp->online = false;

1102

++	mutex_unlock(&bfqio_mutex);

1103

++}

1104

++

1105

++struct cgroup_subsys bfqio_cgrp_subsys = {

1106

++	.css_alloc = bfqio_create,

1107

++	.css_online = bfqio_css_online,

1108

++	.css_offline = bfqio_css_offline,

1109

++	.can_attach = bfqio_can_attach,

1110

++	.attach = bfqio_attach,

1111

++	.css_free = bfqio_destroy,

1112

++	.legacy_cftypes = bfqio_files,

1113

++};

1114

++#else

1115

++static inline void bfq_init_entity(struct bfq_entity *entity,

1116

++				   struct bfq_group *bfqg)

1117

++{

1118

++	entity->weight = entity->new_weight;

1119

++	entity->orig_weight = entity->new_weight;

1120

++	entity->ioprio = entity->new_ioprio;

1121

++	entity->ioprio_class = entity->new_ioprio_class;

1122

++	entity->sched_data = &bfqg->sched_data;

1123

++}

1124

++

1125

++static inline struct bfq_group *

1126

++bfq_bic_update_cgroup(struct bfq_io_cq *bic)

1127

++{

1128

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

1129

++	return bfqd->root_group;

1130

++}

1131

++

1132

++static inline void bfq_bfqq_move(struct bfq_data *bfqd,

1133

++				 struct bfq_queue *bfqq,

1134

++				 struct bfq_entity *entity,

1135

++				 struct bfq_group *bfqg)

1136

++{

1137

++}

1138

++

1139

++static void bfq_end_wr_async(struct bfq_data *bfqd)

1140

++{

1141

++	bfq_end_wr_async_queues(bfqd, bfqd->root_group);

1142

++}

1143

++

1144

++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)

1145

++{

1146

++	bfq_put_async_queues(bfqd, bfqd->root_group);

1147

++}

1148

++

1149

++static inline void bfq_free_root_group(struct bfq_data *bfqd)

1150

++{

1151

++	kfree(bfqd->root_group);

1152

++}

1153

++

1154

++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)

1155

++{

1156

++	struct bfq_group *bfqg;

1157

++	int i;

1158

++

1159

++	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);

1160

++	if (bfqg == NULL)

1161

++		return NULL;

1162

++

1163

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)

1164

++		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

1165

++

1166

++	return bfqg;

1167

++}

1168

++#endif

1169

+diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c

1170

+new file mode 100644

1171

+index 0000000..7f6b000

1172

+--- /dev/null

1173

++++ b/block/bfq-ioc.c

1174

+@@ -0,0 +1,36 @@

1175

++/*

1176

++ * BFQ: I/O context handling.

1177

++ *

1178

++ * Based on ideas and code from CFQ:

1179

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

1180

++ *

1181

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

1182

++ *		      Paolo Valente <paolo.valente@×××××××.it>

1183

++ *

1184

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

1185

++ */

1186

++

1187

++/**

1188

++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.

1189

++ * @icq: the iocontext queue.

1190

++ */

1191

++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)

1192

++{

1193

++	/* bic->icq is the first member, %NULL will convert to %NULL */

1194

++	return container_of(icq, struct bfq_io_cq, icq);

1195

++}

1196

++

1197

++/**

1198

++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.

1199

++ * @bfqd: the lookup key.

1200

++ * @ioc: the io_context of the process doing I/O.

1201

++ *

1202

++ * Queue lock must be held.

1203

++ */

1204

++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,

1205

++					       struct io_context *ioc)

1206

++{

1207

++	if (ioc)

1208

++		return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));

1209

++	return NULL;

1210

++}

1211

+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

1212

+new file mode 100644

1213

+index 0000000..97ee934

1214

+--- /dev/null

1215

++++ b/block/bfq-iosched.c

1216

+@@ -0,0 +1,3902 @@

1217

++/*

1218

++ * Budget Fair Queueing (BFQ) disk scheduler.

1219

++ *

1220

++ * Based on ideas and code from CFQ:

1221

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

1222

++ *

1223

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

1224

++ *		      Paolo Valente <paolo.valente@×××××××.it>

1225

++ *

1226

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

1227

++ *

1228

++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ

1229

++ * file.

1230

++ *

1231

++ * BFQ is a proportional-share storage-I/O scheduling algorithm based on

1232

++ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,

1233

++ * measured in number of sectors, to processes instead of time slices. The

1234

++ * device is not granted to the in-service process for a given time slice,

1235

++ * but until it has exhausted its assigned budget. This change from the time

1236

++ * to the service domain allows BFQ to distribute the device throughput

1237

++ * among processes as desired, without any distortion due to ZBR, workload

1238

++ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,

1239

++ * called B-WF2Q+, to schedule processes according to their budgets. More

1240

++ * precisely, BFQ schedules queues associated to processes. Thanks to the

1241

++ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to

1242

++ * I/O-bound processes issuing sequential requests (to boost the

1243

++ * throughput), and yet guarantee a low latency to interactive and soft

1244

++ * real-time applications.

1245

++ *

1246

++ * BFQ is described in [1], where also a reference to the initial, more

1247

++ * theoretical paper on BFQ can be found. The interested reader can find

1248

++ * in the latter paper full details on the main algorithm, as well as

1249

++ * formulas of the guarantees and formal proofs of all the properties.

1250

++ * With respect to the version of BFQ presented in these papers, this

1251

++ * implementation adds a few more heuristics, such as the one that

1252

++ * guarantees a low latency to soft real-time applications, and a

1253

++ * hierarchical extension based on H-WF2Q+.

1254

++ *

1255

++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with

1256

++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)

1257

++ * complexity derives from the one introduced with EEVDF in [3].

1258

++ *

1259

++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness

1260

++ *     with the BFQ Disk I/O Scheduler'',

1261

++ *     Proceedings of the 5th Annual International Systems and Storage

1262

++ *     Conference (SYSTOR '12), June 2012.

1263

++ *

1264

++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf

1265

++ *

1266

++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing

1267

++ *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,

1268

++ *     Oct 1997.

1269

++ *

1270

++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz

1271

++ *

1272

++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline

1273

++ *     First: A Flexible and Accurate Mechanism for Proportional Share

1274

++ *     Resource Allocation,'' technical report.

1275

++ *

1276

++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf

1277

++ */

1278

++#include <linux/module.h>

1279

++#include <linux/slab.h>

1280

++#include <linux/blkdev.h>

1281

++#include <linux/cgroup.h>

1282

++#include <linux/elevator.h>

1283

++#include <linux/jiffies.h>

1284

++#include <linux/rbtree.h>

1285

++#include <linux/ioprio.h>

1286

++#include "bfq.h"

1287

++#include "blk.h"

1288

++

1289

++/* Max number of dispatches in one round of service. */

1290

++static const int bfq_quantum = 4;

1291

++

1292

++/* Expiration time of sync (0) and async (1) requests, in jiffies. */

1293

++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };

1294

++

1295

++/* Maximum backwards seek, in KiB. */

1296

++static const int bfq_back_max = 16 * 1024;

1297

++

1298

++/* Penalty of a backwards seek, in number of sectors. */

1299

++static const int bfq_back_penalty = 2;

1300

++

1301

++/* Idling period duration, in jiffies. */

1302

++static int bfq_slice_idle = HZ / 125;

1303

++

1304

++/* Default maximum budget values, in sectors and number of requests. */

1305

++static const int bfq_default_max_budget = 16 * 1024;

1306

++static const int bfq_max_budget_async_rq = 4;

1307

++

1308

++/*

1309

++ * Async to sync throughput distribution is controlled as follows:

1310

++ * when an async request is served, the entity is charged the number

1311

++ * of sectors of the request, multiplied by the factor below

1312

++ */

1313

++static const int bfq_async_charge_factor = 10;

1314

++

1315

++/* Default timeout values, in jiffies, approximating CFQ defaults. */

1316

++static const int bfq_timeout_sync = HZ / 8;

1317

++static int bfq_timeout_async = HZ / 25;

1318

++

1319

++struct kmem_cache *bfq_pool;

1320

++

1321

++/* Below this threshold (in ms), we consider thinktime immediate. */

1322

++#define BFQ_MIN_TT		2

1323

++

1324

++/* hw_tag detection: parallel requests threshold and min samples needed. */

1325

++#define BFQ_HW_QUEUE_THRESHOLD	4

1326

++#define BFQ_HW_QUEUE_SAMPLES	32

1327

++

1328

++#define BFQQ_SEEK_THR	 (sector_t)(8 * 1024)

1329

++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)

1330

++

1331

++/* Min samples used for peak rate estimation (for autotuning). */

1332

++#define BFQ_PEAK_RATE_SAMPLES	32

1333

++

1334

++/* Shift used for peak rate fixed precision calculations. */

1335

++#define BFQ_RATE_SHIFT		16

1336

++

1337

++/*

1338

++ * By default, BFQ computes the duration of the weight raising for

1339

++ * interactive applications automatically, using the following formula:

1340

++ * duration = (R / r) * T, where r is the peak rate of the device, and

1341

++ * R and T are two reference parameters.

1342

++ * In particular, R is the peak rate of the reference device (see below),

1343

++ * and T is a reference time: given the systems that are likely to be

1344

++ * installed on the reference device according to its speed class, T is

1345

++ * about the maximum time needed, under BFQ and while reading two files in

1346

++ * parallel, to load typical large applications on these systems.

1347

++ * In practice, the slower/faster the device at hand is, the more/less it

1348

++ * takes to load applications with respect to the reference device.

1349

++ * Accordingly, the longer/shorter BFQ grants weight raising to interactive

1350

++ * applications.

1351

++ *

1352

++ * BFQ uses four different reference pairs (R, T), depending on:

1353

++ * . whether the device is rotational or non-rotational;

1354

++ * . whether the device is slow, such as old or portable HDDs, as well as

1355

++ *   SD cards, or fast, such as newer HDDs and SSDs.

1356

++ *

1357

++ * The device's speed class is dynamically (re)detected in

1358

++ * bfq_update_peak_rate() every time the estimated peak rate is updated.

1359

++ *

1360

++ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]

1361

++ * are the reference values for a slow/fast rotational device, whereas

1362

++ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for

1363

++ * a slow/fast non-rotational device. Finally, device_speed_thresh are the

1364

++ * thresholds used to switch between speed classes.

1365

++ * Both the reference peak rates and the thresholds are measured in

1366

++ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.

1367

++ */

1368

++static int R_slow[2] = {1536, 10752};

1369

++static int R_fast[2] = {17415, 34791};

1370

++/*

1371

++ * To improve readability, a conversion function is used to initialize the

1372

++ * following arrays, which entails that they can be initialized only in a

1373

++ * function.

1374

++ */

1375

++static int T_slow[2];

1376

++static int T_fast[2];

1377

++static int device_speed_thresh[2];

1378

++

1379

++#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\

1380

++				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

1381

++

1382

++#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])

1383

++#define RQ_BFQQ(rq)		((rq)->elv.priv[1])

1384

++

1385

++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);

1386

++

1387

++#include "bfq-ioc.c"

1388

++#include "bfq-sched.c"

1389

++#include "bfq-cgroup.c"

1390

++

1391

++#define bfq_class_idle(bfqq)	((bfqq)->entity.ioprio_class ==\

1392

++				 IOPRIO_CLASS_IDLE)

1393

++#define bfq_class_rt(bfqq)	((bfqq)->entity.ioprio_class ==\

1394

++				 IOPRIO_CLASS_RT)

1395

++

1396

++#define bfq_sample_valid(samples)	((samples) > 80)

1397

++

1398

++/*

1399

++ * We regard a request as SYNC, if either it's a read or has the SYNC bit

1400

++ * set (in which case it could also be a direct WRITE).

1401

++ */

1402

++static inline int bfq_bio_sync(struct bio *bio)

1403

++{

1404

++	if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))

1405

++		return 1;

1406

++

1407

++	return 0;

1408

++}

1409

++

1410

++/*

1411

++ * Scheduler run of queue, if there are requests pending and no one in the

1412

++ * driver that will restart queueing.

1413

++ */

1414

++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)

1415

++{

1416

++	if (bfqd->queued != 0) {

1417

++		bfq_log(bfqd, "schedule dispatch");

1418

++		kblockd_schedule_work(&bfqd->unplug_work);

1419

++	}

1420

++}

1421

++

1422

++/*

1423

++ * Lifted from AS - choose which of rq1 and rq2 is best served now.

1424

++ * We choose the request that is closer to the head right now.  Distance

1425

++ * behind the head is penalized and only allowed to a certain extent.

1426

++ */

1427

++static struct request *bfq_choose_req(struct bfq_data *bfqd,

1428

++				      struct request *rq1,

1429

++				      struct request *rq2,

1430

++				      sector_t last)

1431

++{

1432

++	sector_t s1, s2, d1 = 0, d2 = 0;

1433

++	unsigned long back_max;

1434

++#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */

1435

++#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */

1436

++	unsigned wrap = 0; /* bit mask: requests behind the disk head? */

1437

++

1438

++	if (rq1 == NULL || rq1 == rq2)

1439

++		return rq2;

1440

++	if (rq2 == NULL)

1441

++		return rq1;

1442

++

1443

++	if (rq_is_sync(rq1) && !rq_is_sync(rq2))

1444

++		return rq1;

1445

++	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))

1446

++		return rq2;

1447

++	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))

1448

++		return rq1;

1449

++	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))

1450

++		return rq2;

1451

++

1452

++	s1 = blk_rq_pos(rq1);

1453

++	s2 = blk_rq_pos(rq2);

1454

++

1455

++	/*

1456

++	 * By definition, 1KiB is 2 sectors.

1457

++	 */

1458

++	back_max = bfqd->bfq_back_max * 2;

1459

++

1460

++	/*

1461

++	 * Strict one way elevator _except_ in the case where we allow

1462

++	 * short backward seeks which are biased as twice the cost of a

1463

++	 * similar forward seek.

1464

++	 */

1465

++	if (s1 >= last)

1466

++		d1 = s1 - last;

1467

++	else if (s1 + back_max >= last)

1468

++		d1 = (last - s1) * bfqd->bfq_back_penalty;

1469

++	else

1470

++		wrap |= BFQ_RQ1_WRAP;

1471

++

1472

++	if (s2 >= last)

1473

++		d2 = s2 - last;

1474

++	else if (s2 + back_max >= last)

1475

++		d2 = (last - s2) * bfqd->bfq_back_penalty;

1476

++	else

1477

++		wrap |= BFQ_RQ2_WRAP;

1478

++

1479

++	/* Found required data */

1480

++

1481

++	/*

1482

++	 * By doing switch() on the bit mask "wrap" we avoid having to

1483

++	 * check two variables for all permutations: --> faster!

1484

++	 */

1485

++	switch (wrap) {

1486

++	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */

1487

++		if (d1 < d2)

1488

++			return rq1;

1489

++		else if (d2 < d1)

1490

++			return rq2;

1491

++		else {

1492

++			if (s1 >= s2)

1493

++				return rq1;

1494

++			else

1495

++				return rq2;

1496

++		}

1497

++

1498

++	case BFQ_RQ2_WRAP:

1499

++		return rq1;

1500

++	case BFQ_RQ1_WRAP:

1501

++		return rq2;

1502

++	case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */

1503

++	default:

1504

++		/*

1505

++		 * Since both rqs are wrapped,

1506

++		 * start with the one that's further behind head

1507

++		 * (--> only *one* back seek required),

1508

++		 * since back seek takes more time than forward.

1509

++		 */

1510

++		if (s1 <= s2)

1511

++			return rq1;

1512

++		else

1513

++			return rq2;

1514

++	}

1515

++}

1516

++

1517

++static struct bfq_queue *

1518

++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,

1519

++		     sector_t sector, struct rb_node **ret_parent,

1520

++		     struct rb_node ***rb_link)

1521

++{

1522

++	struct rb_node **p, *parent;

1523

++	struct bfq_queue *bfqq = NULL;

1524

++

1525

++	parent = NULL;

1526

++	p = &root->rb_node;

1527

++	while (*p) {

1528

++		struct rb_node **n;

1529

++

1530

++		parent = *p;

1531

++		bfqq = rb_entry(parent, struct bfq_queue, pos_node);

1532

++

1533

++		/*

1534

++		 * Sort strictly based on sector. Smallest to the left,

1535

++		 * largest to the right.

1536

++		 */

1537

++		if (sector > blk_rq_pos(bfqq->next_rq))

1538

++			n = &(*p)->rb_right;

1539

++		else if (sector < blk_rq_pos(bfqq->next_rq))

1540

++			n = &(*p)->rb_left;

1541

++		else

1542

++			break;

1543

++		p = n;

1544

++		bfqq = NULL;

1545

++	}

1546

++

1547

++	*ret_parent = parent;

1548

++	if (rb_link)

1549

++		*rb_link = p;

1550

++

1551

++	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",

1552

++		(long long unsigned)sector,

1553

++		bfqq != NULL ? bfqq->pid : 0);

1554

++

1555

++	return bfqq;

1556

++}

1557

++

1558

++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)

1559

++{

1560

++	struct rb_node **p, *parent;

1561

++	struct bfq_queue *__bfqq;

1562

++

1563

++	if (bfqq->pos_root != NULL) {

1564

++		rb_erase(&bfqq->pos_node, bfqq->pos_root);

1565

++		bfqq->pos_root = NULL;

1566

++	}

1567

++

1568

++	if (bfq_class_idle(bfqq))

1569

++		return;

1570

++	if (!bfqq->next_rq)

1571

++		return;

1572

++

1573

++	bfqq->pos_root = &bfqd->rq_pos_tree;

1574

++	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,

1575

++			blk_rq_pos(bfqq->next_rq), &parent, &p);

1576

++	if (__bfqq == NULL) {

1577

++		rb_link_node(&bfqq->pos_node, parent, p);

1578

++		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);

1579

++	} else

1580

++		bfqq->pos_root = NULL;

1581

++}

1582

++

1583

++/*

1584

++ * Tell whether there are active queues or groups with differentiated weights.

1585

++ */

1586

++static inline bool bfq_differentiated_weights(struct bfq_data *bfqd)

1587

++{

1588

++	BUG_ON(!bfqd->hw_tag);

1589

++	/*

1590

++	 * For weights to differ, at least one of the trees must contain

1591

++	 * at least two nodes.

1592

++	 */

1593

++	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&

1594

++		(bfqd->queue_weights_tree.rb_node->rb_left ||

1595

++		 bfqd->queue_weights_tree.rb_node->rb_right)

1596

++#ifdef CONFIG_CGROUP_BFQIO

1597

++	       ) ||

1598

++	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&

1599

++		(bfqd->group_weights_tree.rb_node->rb_left ||

1600

++		 bfqd->group_weights_tree.rb_node->rb_right)

1601

++#endif

1602

++	       );

1603

++}

1604

++

1605

++/*

1606

++ * If the weight-counter tree passed as input contains no counter for

1607

++ * the weight of the input entity, then add that counter; otherwise just

1608

++ * increment the existing counter.

1609

++ *

1610

++ * Note that weight-counter trees contain few nodes in mostly symmetric

1611

++ * scenarios. For example, if all queues have the same weight, then the

1612

++ * weight-counter tree for the queues may contain at most one node.

1613

++ * This holds even if low_latency is on, because weight-raised queues

1614

++ * are not inserted in the tree.

1615

++ * In most scenarios, the rate at which nodes are created/destroyed

1616

++ * should be low too.

1617

++ */

1618

++static void bfq_weights_tree_add(struct bfq_data *bfqd,

1619

++				 struct bfq_entity *entity,

1620

++				 struct rb_root *root)

1621

++{

1622

++	struct rb_node **new = &(root->rb_node), *parent = NULL;

1623

++

1624

++	/*

1625

++	 * Do not insert if:

1626

++	 * - the device does not support queueing;

1627

++	 * - the entity is already associated with a counter, which happens if:

1628

++	 *   1) the entity is associated with a queue, 2) a request arrival

1629

++	 *   has caused the queue to become both non-weight-raised, and hence

1630

++	 *   change its weight, and backlogged; in this respect, each

1631

++	 *   of the two events causes an invocation of this function,

1632

++	 *   3) this is the invocation of this function caused by the second

1633

++	 *   event. This second invocation is actually useless, and we handle

1634

++	 *   this fact by exiting immediately. More efficient or clearer

1635

++	 *   solutions might possibly be adopted.

1636

++	 */

1637

++	if (!bfqd->hw_tag || entity->weight_counter)

1638

++		return;

1639

++

1640

++	while (*new) {

1641

++		struct bfq_weight_counter *__counter = container_of(*new,

1642

++						struct bfq_weight_counter,

1643

++						weights_node);

1644

++		parent = *new;

1645

++

1646

++		if (entity->weight == __counter->weight) {

1647

++			entity->weight_counter = __counter;

1648

++			goto inc_counter;

1649

++		}

1650

++		if (entity->weight < __counter->weight)

1651

++			new = &((*new)->rb_left);

1652

++		else

1653

++			new = &((*new)->rb_right);

1654

++	}

1655

++

1656

++	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),

1657

++					 GFP_ATOMIC);

1658

++	entity->weight_counter->weight = entity->weight;

1659

++	rb_link_node(&entity->weight_counter->weights_node, parent, new);

1660

++	rb_insert_color(&entity->weight_counter->weights_node, root);

1661

++

1662

++inc_counter:

1663

++	entity->weight_counter->num_active++;

1664

++}

1665

++

1666

++/*

1667

++ * Decrement the weight counter associated with the entity, and, if the

1668

++ * counter reaches 0, remove the counter from the tree.

1669

++ * See the comments to the function bfq_weights_tree_add() for considerations

1670

++ * about overhead.

1671

++ */

1672

++static void bfq_weights_tree_remove(struct bfq_data *bfqd,

1673

++				    struct bfq_entity *entity,

1674

++				    struct rb_root *root)

1675

++{

1676

++	/*

1677

++	 * Check whether the entity is actually associated with a counter.

1678

++	 * In fact, the device may not be considered NCQ-capable for a while,

1679

++	 * which implies that no insertion in the weight trees is performed,

1680

++	 * after which the device may start to be deemed NCQ-capable, and hence

1681

++	 * this function may start to be invoked. This may cause the function

1682

++	 * to be invoked for entities that are not associated with any counter.

1683

++	 */

1684

++	if (!entity->weight_counter)

1685

++		return;

1686

++

1687

++	BUG_ON(RB_EMPTY_ROOT(root));

1688

++	BUG_ON(entity->weight_counter->weight != entity->weight);

1689

++

1690

++	BUG_ON(!entity->weight_counter->num_active);

1691

++	entity->weight_counter->num_active--;

1692

++	if (entity->weight_counter->num_active > 0)

1693

++		goto reset_entity_pointer;

1694

++

1695

++	rb_erase(&entity->weight_counter->weights_node, root);

1696

++	kfree(entity->weight_counter);

1697

++

1698

++reset_entity_pointer:

1699

++	entity->weight_counter = NULL;

1700

++}

1701

++

1702

++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,

1703

++					struct bfq_queue *bfqq,

1704

++					struct request *last)

1705

++{

1706

++	struct rb_node *rbnext = rb_next(&last->rb_node);

1707

++	struct rb_node *rbprev = rb_prev(&last->rb_node);

1708

++	struct request *next = NULL, *prev = NULL;

1709

++

1710

++	BUG_ON(RB_EMPTY_NODE(&last->rb_node));

1711

++

1712

++	if (rbprev != NULL)

1713

++		prev = rb_entry_rq(rbprev);

1714

++

1715

++	if (rbnext != NULL)

1716

++		next = rb_entry_rq(rbnext);

1717

++	else {

1718

++		rbnext = rb_first(&bfqq->sort_list);

1719

++		if (rbnext && rbnext != &last->rb_node)

1720

++			next = rb_entry_rq(rbnext);

1721

++	}

1722

++

1723

++	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));

1724

++}

1725

++

1726

++/* see the definition of bfq_async_charge_factor for details */

1727

++static inline unsigned long bfq_serv_to_charge(struct request *rq,

1728

++					       struct bfq_queue *bfqq)

1729

++{

1730

++	return blk_rq_sectors(rq) *

1731

++		(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *

1732

++		bfq_async_charge_factor));

1733

++}

1734

++

1735

++/**

1736

++ * bfq_updated_next_req - update the queue after a new next_rq selection.

1737

++ * @bfqd: the device data the queue belongs to.

1738

++ * @bfqq: the queue to update.

1739

++ *

1740

++ * If the first request of a queue changes we make sure that the queue

1741

++ * has enough budget to serve at least its first request (if the

1742

++ * request has grown).  We do this because if the queue has not enough

1743

++ * budget for its first request, it has to go through two dispatch

1744

++ * rounds to actually get it dispatched.

1745

++ */

1746

++static void bfq_updated_next_req(struct bfq_data *bfqd,

1747

++				 struct bfq_queue *bfqq)

1748

++{

1749

++	struct bfq_entity *entity = &bfqq->entity;

1750

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

1751

++	struct request *next_rq = bfqq->next_rq;

1752

++	unsigned long new_budget;

1753

++

1754

++	if (next_rq == NULL)

1755

++		return;

1756

++

1757

++	if (bfqq == bfqd->in_service_queue)

1758

++		/*

1759

++		 * In order not to break guarantees, budgets cannot be

1760

++		 * changed after an entity has been selected.

1761

++		 */

1762

++		return;

1763

++

1764

++	BUG_ON(entity->tree != &st->active);

1765

++	BUG_ON(entity == entity->sched_data->in_service_entity);

1766

++

1767

++	new_budget = max_t(unsigned long, bfqq->max_budget,

1768

++			   bfq_serv_to_charge(next_rq, bfqq));

1769

++	if (entity->budget != new_budget) {

1770

++		entity->budget = new_budget;

1771

++		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",

1772

++					 new_budget);

1773

++		bfq_activate_bfqq(bfqd, bfqq);

1774

++	}

1775

++}

1776

++

1777

++static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)

1778

++{

1779

++	u64 dur;

1780

++

1781

++	if (bfqd->bfq_wr_max_time > 0)

1782

++		return bfqd->bfq_wr_max_time;

1783

++

1784

++	dur = bfqd->RT_prod;

1785

++	do_div(dur, bfqd->peak_rate);

1786

++

1787

++	return dur;

1788

++}

1789

++

1790

++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */

1791

++static inline void bfq_reset_burst_list(struct bfq_data *bfqd,

1792

++					struct bfq_queue *bfqq)

1793

++{

1794

++	struct bfq_queue *item;

1795

++	struct hlist_node *n;

1796

++

1797

++	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)

1798

++		hlist_del_init(&item->burst_list_node);

1799

++	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);

1800

++	bfqd->burst_size = 1;

1801

++}

1802

++

1803

++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */

1804

++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)

1805

++{

1806

++	/* Increment burst size to take into account also bfqq */

1807

++	bfqd->burst_size++;

1808

++

1809

++	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {

1810

++		struct bfq_queue *pos, *bfqq_item;

1811

++		struct hlist_node *n;

1812

++

1813

++		/*

1814

++		 * Enough queues have been activated shortly after each

1815

++		 * other to consider this burst as large.

1816

++		 */

1817

++		bfqd->large_burst = true;

1818

++

1819

++		/*

1820

++		 * We can now mark all queues in the burst list as

1821

++		 * belonging to a large burst.

1822

++		 */

1823

++		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,

1824

++				     burst_list_node)

1825

++		        bfq_mark_bfqq_in_large_burst(bfqq_item);

1826

++		bfq_mark_bfqq_in_large_burst(bfqq);

1827

++

1828

++		/*

1829

++		 * From now on, and until the current burst finishes, any

1830

++		 * new queue being activated shortly after the last queue

1831

++		 * was inserted in the burst can be immediately marked as

1832

++		 * belonging to a large burst. So the burst list is not

1833

++		 * needed any more. Remove it.

1834

++		 */

1835

++		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,

1836

++					  burst_list_node)

1837

++			hlist_del_init(&pos->burst_list_node);

1838

++	} else /* burst not yet large: add bfqq to the burst list */

1839

++		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);

1840

++}

1841

++

1842

++/*

1843

++ * If many queues happen to become active shortly after each other, then,

1844

++ * to help the processes associated to these queues get their job done as

1845

++ * soon as possible, it is usually better to not grant either weight-raising

1846

++ * or device idling to these queues. In this comment we describe, firstly,

1847

++ * the reasons why this fact holds, and, secondly, the next function, which

1848

++ * implements the main steps needed to properly mark these queues so that

1849

++ * they can then be treated in a different way.

1850

++ *

1851

++ * As for the terminology, we say that a queue becomes active, i.e.,

1852

++ * switches from idle to backlogged, either when it is created (as a

1853

++ * consequence of the arrival of an I/O request), or, if already existing,

1854

++ * when a new request for the queue arrives while the queue is idle.

1855

++ * Bursts of activations, i.e., activations of different queues occurring

1856

++ * shortly after each other, are typically caused by services or applications

1857

++ * that spawn or reactivate many parallel threads/processes. Examples are

1858

++ * systemd during boot or git grep.

1859

++ *

1860

++ * These services or applications benefit mostly from a high throughput:

1861

++ * the quicker the requests of the activated queues are cumulatively served,

1862

++ * the sooner the target job of these queues gets completed. As a consequence,

1863

++ * weight-raising any of these queues, which also implies idling the device

1864

++ * for it, is almost always counterproductive: in most cases it just lowers

1865

++ * throughput.

1866

++ *

1867

++ * On the other hand, a burst of activations may be also caused by the start

1868

++ * of an application that does not consist of a lot of parallel I/O-bound

1869

++ * threads. In fact, with a complex application, the burst may be just a

1870

++ * consequence of the fact that several processes need to be executed to

1871

++ * start up the application. To start an application as quickly as possible,

1872

++ * the best thing to do is to privilege the I/O related to the application

1873

++ * with respect to all other I/O. Therefore, the best strategy to start as

1874

++ * quickly as possible an application that causes a burst of activations is

1875

++ * to weight-raise all the queues activated during the burst. This is the

1876

++ * exact opposite of the best strategy for the other type of bursts.

1877

++ *

1878

++ * In the end, to take the best action for each of the two cases, the two

1879

++ * types of bursts need to be distinguished. Fortunately, this seems

1880

++ * relatively easy to do, by looking at the sizes of the bursts. In

1881

++ * particular, we found a threshold such that bursts with a larger size

1882

++ * than that threshold are apparently caused only by services or commands

1883

++ * such as systemd or git grep. For brevity, hereafter we call just 'large'

1884

++ * these bursts. BFQ *does not* weight-raise queues whose activations occur

1885

++ * in a large burst. In addition, for each of these queues BFQ performs or

1886

++ * does not perform idling depending on which choice boosts the throughput

1887

++ * most. The exact choice depends on the device and request pattern at

1888

++ * hand.

1889

++ *

1890

++ * Turning back to the next function, it implements all the steps needed

1891

++ * to detect the occurrence of a large burst and to properly mark all the

1892

++ * queues belonging to it (so that they can then be treated in a different

1893

++ * way). This goal is achieved by maintaining a special "burst list" that

1894

++ * holds, temporarily, the queues that belong to the burst in progress. The

1895

++ * list is then used to mark these queues as belonging to a large burst if

1896

++ * the burst does become large. The main steps are the following.

1897

++ *

1898

++ * . when the very first queue is activated, the queue is inserted into the

1899

++ *   list (as it could be the first queue in a possible burst)

1900

++ *

1901

++ * . if the current burst has not yet become large, and a queue Q that does

1902

++ *   not yet belong to the burst is activated shortly after the last time

1903

++ *   at which a new queue entered the burst list, then the function appends

1904

++ *   Q to the burst list

1905

++ *

1906

++ * . if, as a consequence of the previous step, the burst size reaches

1907

++ *   the large-burst threshold, then

1908

++ *

1909

++ *     . all the queues in the burst list are marked as belonging to a

1910

++ *       large burst

1911

++ *

1912

++ *     . the burst list is deleted; in fact, the burst list already served

1913

++ *       its purpose (keeping temporarily track of the queues in a burst,

1914

++ *       so as to be able to mark them as belonging to a large burst in the

1915

++ *       previous sub-step), and now is not needed any more

1916

++ *

1917

++ *     . the device enters a large-burst mode

1918

++ *

1919

++ * . if a queue Q that does not belong to the burst is activated while

1920

++ *   the device is in large-burst mode and shortly after the last time

1921

++ *   at which a queue either entered the burst list or was marked as

1922

++ *   belonging to the current large burst, then Q is immediately marked

1923

++ *   as belonging to a large burst.

1924

++ *

1925

++ * . if a queue Q that does not belong to the burst is activated a while

1926

++ *   after, i.e., not shortly after, the last time at which a queue

1927

++ *   either entered the burst list or was marked as belonging to the

1928

++ *   current large burst, then the current burst is deemed as finished and:

1929

++ *

1930

++ *        . the large-burst mode is reset if set

1931

++ *

1932

++ *        . the burst list is emptied

1933

++ *

1934

++ *        . Q is inserted in the burst list, as Q may be the first queue

1935

++ *          in a possible new burst (then the burst list contains just Q

1936

++ *          after this step).

1937

++ */

1938

++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,

1939

++			     bool idle_for_long_time)

1940

++{

1941

++	/*

1942

++	 * If bfqq happened to be activated in a burst, but has been idle

1943

++	 * for at least as long as an interactive queue, then we assume

1944

++	 * that, in the overall I/O initiated in the burst, the I/O

1945

++	 * associated to bfqq is finished. So bfqq does not need to be

1946

++	 * treated as a queue belonging to a burst anymore. Accordingly,

1947

++	 * we reset bfqq's in_large_burst flag if set, and remove bfqq

1948

++	 * from the burst list if it's there. We do not, however, decrement

1949

++	 * burst_size, because the fact that bfqq does not need to belong

1950

++	 * to the burst list any more does not invalidate the fact that

1951

++	 * bfqq may have been activated during the current burst.

1952

++	 */

1953

++	if (idle_for_long_time) {

1954

++		hlist_del_init(&bfqq->burst_list_node);

1955

++		bfq_clear_bfqq_in_large_burst(bfqq);

1956

++	}

1957

++

1958

++	/*

1959

++	 * If bfqq is already in the burst list or is part of a large

1960

++	 * burst, then there is nothing else to do.

1961

++	 */

1962

++	if (!hlist_unhashed(&bfqq->burst_list_node) ||

1963

++	    bfq_bfqq_in_large_burst(bfqq))

1964

++		return;

1965

++

1966

++	/*

1967

++	 * If bfqq's activation happens late enough, then the current

1968

++	 * burst is finished, and related data structures must be reset.

1969

++	 *

1970

++	 * In this respect, consider the special case where bfqq is the very

1971

++	 * first queue being activated. In this case, last_ins_in_burst is

1972

++	 * not yet significant when we get here. But it is easy to verify

1973

++	 * that, whether or not the following condition is true, bfqq will

1974

++	 * end up being inserted into the burst list. In particular the

1975

++	 * list will happen to contain only bfqq. And this is exactly what

1976

++	 * has to happen, as bfqq may be the first queue in a possible

1977

++	 * burst.

1978

++	 */

1979

++	if (time_is_before_jiffies(bfqd->last_ins_in_burst +

1980

++	    bfqd->bfq_burst_interval)) {

1981

++		bfqd->large_burst = false;

1982

++		bfq_reset_burst_list(bfqd, bfqq);

1983

++		return;

1984

++	}

1985

++

1986

++	/*

1987

++	 * If we get here, then bfqq is being activated shortly after the

1988

++	 * last queue. So, if the current burst is also large, we can mark

1989

++	 * bfqq as belonging to this large burst immediately.

1990

++	 */

1991

++	if (bfqd->large_burst) {

1992

++		bfq_mark_bfqq_in_large_burst(bfqq);

1993

++		return;

1994

++	}

1995

++

1996

++	/*

1997

++	 * If we get here, then a large-burst state has not yet been

1998

++	 * reached, but bfqq is being activated shortly after the last

1999

++	 * queue. Then we add bfqq to the burst.

2000

++	 */

2001

++	bfq_add_to_burst(bfqd, bfqq);

2002

++}

2003

++

2004

++static void bfq_add_request(struct request *rq)

2005

++{

2006

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2007

++	struct bfq_entity *entity = &bfqq->entity;

2008

++	struct bfq_data *bfqd = bfqq->bfqd;

2009

++	struct request *next_rq, *prev;

2010

++	unsigned long old_wr_coeff = bfqq->wr_coeff;

2011

++	bool interactive = false;

2012

++

2013

++	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));

2014

++	bfqq->queued[rq_is_sync(rq)]++;

2015

++	bfqd->queued++;

2016

++

2017

++	elv_rb_add(&bfqq->sort_list, rq);

2018

++

2019

++	/*

2020

++	 * Check if this request is a better next-serve candidate.

2021

++	 */

2022

++	prev = bfqq->next_rq;

2023

++	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);

2024

++	BUG_ON(next_rq == NULL);

2025

++	bfqq->next_rq = next_rq;

2026

++

2027

++	/*

2028

++	 * Adjust priority tree position, if next_rq changes.

2029

++	 */

2030

++	if (prev != bfqq->next_rq)

2031

++		bfq_rq_pos_tree_add(bfqd, bfqq);

2032

++

2033

++	if (!bfq_bfqq_busy(bfqq)) {

2034

++		bool soft_rt,

2035

++		     idle_for_long_time = time_is_before_jiffies(

2036

++						bfqq->budget_timeout +

2037

++						bfqd->bfq_wr_min_idle_time);

2038

++

2039

++		if (bfq_bfqq_sync(bfqq)) {

2040

++			bool already_in_burst =

2041

++			   !hlist_unhashed(&bfqq->burst_list_node) ||

2042

++			   bfq_bfqq_in_large_burst(bfqq);

2043

++			bfq_handle_burst(bfqd, bfqq, idle_for_long_time);

2044

++			/*

2045

++			 * If bfqq was not already in the current burst,

2046

++			 * then, at this point, bfqq either has been

2047

++			 * added to the current burst or has caused the

2048

++			 * current burst to terminate. In particular, in

2049

++			 * the second case, bfqq has become the first

2050

++			 * queue in a possible new burst.

2051

++			 * In both cases last_ins_in_burst needs to be

2052

++			 * moved forward.

2053

++			 */

2054

++			if (!already_in_burst)

2055

++				bfqd->last_ins_in_burst = jiffies;

2056

++		}

2057

++

2058

++		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&

2059

++			!bfq_bfqq_in_large_burst(bfqq) &&

2060

++			time_is_before_jiffies(bfqq->soft_rt_next_start);

2061

++		interactive = !bfq_bfqq_in_large_burst(bfqq) &&

2062

++			      idle_for_long_time;

2063

++		entity->budget = max_t(unsigned long, bfqq->max_budget,

2064

++				       bfq_serv_to_charge(next_rq, bfqq));

2065

++

2066

++		if (!bfq_bfqq_IO_bound(bfqq)) {

2067

++			if (time_before(jiffies,

2068

++					RQ_BIC(rq)->ttime.last_end_request +

2069

++					bfqd->bfq_slice_idle)) {

2070

++				bfqq->requests_within_timer++;

2071

++				if (bfqq->requests_within_timer >=

2072

++				    bfqd->bfq_requests_within_timer)

2073

++					bfq_mark_bfqq_IO_bound(bfqq);

2074

++			} else

2075

++				bfqq->requests_within_timer = 0;

2076

++		}

2077

++

2078

++		if (!bfqd->low_latency)

2079

++			goto add_bfqq_busy;

2080

++

2081

++		/*

2082

++		 * If the queue is not being weight-raised and is deemed

2083

++		 * interactive or soft real-time, start a weight-raising period.

2084

++		 */

2085

++		if (old_wr_coeff == 1 && (interactive || soft_rt)) {

2086

++			bfqq->wr_coeff = bfqd->bfq_wr_coeff;

2087

++			if (interactive)

2088

++				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

2089

++			else

2090

++				bfqq->wr_cur_max_time =

2091

++					bfqd->bfq_wr_rt_max_time;

2092

++			bfq_log_bfqq(bfqd, bfqq,

2093

++				     "wrais starting at %lu, rais_max_time %u",

2094

++				     jiffies,

2095

++				     jiffies_to_msecs(bfqq->wr_cur_max_time));

2096

++		} else if (old_wr_coeff > 1) {

2097

++			if (interactive)

2098

++				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

2099

++			else if (bfq_bfqq_in_large_burst(bfqq) ||

2100

++				 (bfqq->wr_cur_max_time ==

2101

++				  bfqd->bfq_wr_rt_max_time &&

2102

++				  !soft_rt)) {

2103

++				bfqq->wr_coeff = 1;

2104

++				bfq_log_bfqq(bfqd, bfqq,

2105

++					"wrais ending at %lu, rais_max_time %u",

2106

++					jiffies,

2107

++					jiffies_to_msecs(bfqq->

2108

++						wr_cur_max_time));

2109

++			} else if (time_before(

2110

++					bfqq->last_wr_start_finish +

2111

++					bfqq->wr_cur_max_time,

2112

++					jiffies +

2113

++					bfqd->bfq_wr_rt_max_time) &&

2114

++				   soft_rt) {

2115

++				/*

2116

++				 *

2117

++				 * The remaining weight-raising time is lower

2118

++				 * than bfqd->bfq_wr_rt_max_time, which

2119

++				 * means that the application is enjoying

2120

++				 * weight raising either because deemed soft-

2121

++				 * rt in the near past, or because deemed

2122

++				 * interactive long ago. In both cases,

2123

++				 * resetting now the current remaining weight-

2124

++				 * raising time for the application to the

2125

++				 * weight-raising duration for soft rt

2126

++				 * applications would not cause any latency

2127

++				 * increase for the application (as the new

2128

++				 * duration would be higher than the remaining

2129

++				 * time).

2130

++				 *

2131

++				 * In addition, the application is now meeting

2132

++				 * the requirements for being deemed soft rt.

2133

++				 * In the end we can correctly and safely

2134

++				 * (re)charge the weight-raising duration for

2135

++				 * the application with the weight-raising

2136

++				 * duration for soft rt applications.

2137

++				 *

2138

++				 * In particular, doing this recharge now, i.e.,

2139

++				 * before the weight-raising period for the

2140

++				 * application finishes, reduces the probability

2141

++				 * of the following negative scenario:

2142

++				 * 1) the weight of a soft rt application is

2143

++				 *    raised at startup (as for any newly

2144

++				 *    created application),

2145

++				 * 2) since the application is not interactive,

2146

++				 *    at a certain time weight-raising is

2147

++				 *    stopped for the application,

2148

++				 * 3) at that time the application happens to

2149

++				 *    still have pending requests, and hence

2150

++				 *    is destined to not have a chance to be

2151

++				 *    deemed soft rt before these requests are

2152

++				 *    completed (see the comments to the

2153

++				 *    function bfq_bfqq_softrt_next_start()

2154

++				 *    for details on soft rt detection),

2155

++				 * 4) these pending requests experience a high

2156

++				 *    latency because the application is not

2157

++				 *    weight-raised while they are pending.

2158

++				 */

2159

++				bfqq->last_wr_start_finish = jiffies;

2160

++				bfqq->wr_cur_max_time =

2161

++					bfqd->bfq_wr_rt_max_time;

2162

++			}

2163

++		}

2164

++		if (old_wr_coeff != bfqq->wr_coeff)

2165

++			entity->ioprio_changed = 1;

2166

++add_bfqq_busy:

2167

++		bfqq->last_idle_bklogged = jiffies;

2168

++		bfqq->service_from_backlogged = 0;

2169

++		bfq_clear_bfqq_softrt_update(bfqq);

2170

++		bfq_add_bfqq_busy(bfqd, bfqq);

2171

++	} else {

2172

++		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&

2173

++		    time_is_before_jiffies(

2174

++				bfqq->last_wr_start_finish +

2175

++				bfqd->bfq_wr_min_inter_arr_async)) {

2176

++			bfqq->wr_coeff = bfqd->bfq_wr_coeff;

2177

++			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

2178

++

2179

++			bfqd->wr_busy_queues++;

2180

++			entity->ioprio_changed = 1;

2181

++			bfq_log_bfqq(bfqd, bfqq,

2182

++			    "non-idle wrais starting at %lu, rais_max_time %u",

2183

++			    jiffies,

2184

++			    jiffies_to_msecs(bfqq->wr_cur_max_time));

2185

++		}

2186

++		if (prev != bfqq->next_rq)

2187

++			bfq_updated_next_req(bfqd, bfqq);

2188

++	}

2189

++

2190

++	if (bfqd->low_latency &&

2191

++		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))

2192

++		bfqq->last_wr_start_finish = jiffies;

2193

++}

2194

++

2195

++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,

2196

++					  struct bio *bio)

2197

++{

2198

++	struct task_struct *tsk = current;

2199

++	struct bfq_io_cq *bic;

2200

++	struct bfq_queue *bfqq;

2201

++

2202

++	bic = bfq_bic_lookup(bfqd, tsk->io_context);

2203

++	if (bic == NULL)

2204

++		return NULL;

2205

++

2206

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

2207

++	if (bfqq != NULL)

2208

++		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));

2209

++

2210

++	return NULL;

2211

++}

2212

++

2213

++static void bfq_activate_request(struct request_queue *q, struct request *rq)

2214

++{

2215

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2216

++

2217

++	bfqd->rq_in_driver++;

2218

++	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);

2219

++	bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",

2220

++		(long long unsigned)bfqd->last_position);

2221

++}

2222

++

2223

++static inline void bfq_deactivate_request(struct request_queue *q,

2224

++					  struct request *rq)

2225

++{

2226

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2227

++

2228

++	BUG_ON(bfqd->rq_in_driver == 0);

2229

++	bfqd->rq_in_driver--;

2230

++}

2231

++

2232

++static void bfq_remove_request(struct request *rq)

2233

++{

2234

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2235

++	struct bfq_data *bfqd = bfqq->bfqd;

2236

++	const int sync = rq_is_sync(rq);

2237

++

2238

++	if (bfqq->next_rq == rq) {

2239

++		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);

2240

++		bfq_updated_next_req(bfqd, bfqq);

2241

++	}

2242

++

2243

++	list_del_init(&rq->queuelist);

2244

++	BUG_ON(bfqq->queued[sync] == 0);

2245

++	bfqq->queued[sync]--;

2246

++	bfqd->queued--;

2247

++	elv_rb_del(&bfqq->sort_list, rq);

2248

++

2249

++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

2250

++		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)

2251

++			bfq_del_bfqq_busy(bfqd, bfqq, 1);

2252

++		/*

2253

++		 * Remove queue from request-position tree as it is empty.

2254

++		 */

2255

++		if (bfqq->pos_root != NULL) {

2256

++			rb_erase(&bfqq->pos_node, bfqq->pos_root);

2257

++			bfqq->pos_root = NULL;

2258

++		}

2259

++	}

2260

++

2261

++	if (rq->cmd_flags & REQ_META) {

2262

++		BUG_ON(bfqq->meta_pending == 0);

2263

++		bfqq->meta_pending--;

2264

++	}

2265

++}

2266

++

2267

++static int bfq_merge(struct request_queue *q, struct request **req,

2268

++		     struct bio *bio)

2269

++{

2270

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2271

++	struct request *__rq;

2272

++

2273

++	__rq = bfq_find_rq_fmerge(bfqd, bio);

2274

++	if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {

2275

++		*req = __rq;

2276

++		return ELEVATOR_FRONT_MERGE;

2277

++	}

2278

++

2279

++	return ELEVATOR_NO_MERGE;

2280

++}

2281

++

2282

++static void bfq_merged_request(struct request_queue *q, struct request *req,

2283

++			       int type)

2284

++{

2285

++	if (type == ELEVATOR_FRONT_MERGE &&

2286

++	    rb_prev(&req->rb_node) &&

2287

++	    blk_rq_pos(req) <

2288

++	    blk_rq_pos(container_of(rb_prev(&req->rb_node),

2289

++				    struct request, rb_node))) {

2290

++		struct bfq_queue *bfqq = RQ_BFQQ(req);

2291

++		struct bfq_data *bfqd = bfqq->bfqd;

2292

++		struct request *prev, *next_rq;

2293

++

2294

++		/* Reposition request in its sort_list */

2295

++		elv_rb_del(&bfqq->sort_list, req);

2296

++		elv_rb_add(&bfqq->sort_list, req);

2297

++		/* Choose next request to be served for bfqq */

2298

++		prev = bfqq->next_rq;

2299

++		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,

2300

++					 bfqd->last_position);

2301

++		BUG_ON(next_rq == NULL);

2302

++		bfqq->next_rq = next_rq;

2303

++		/*

2304

++		 * If next_rq changes, update both the queue's budget to

2305

++		 * fit the new request and the queue's position in its

2306

++		 * rq_pos_tree.

2307

++		 */

2308

++		if (prev != bfqq->next_rq) {

2309

++			bfq_updated_next_req(bfqd, bfqq);

2310

++			bfq_rq_pos_tree_add(bfqd, bfqq);

2311

++		}

2312

++	}

2313

++}

2314

++

2315

++static void bfq_merged_requests(struct request_queue *q, struct request *rq,

2316

++				struct request *next)

2317

++{

2318

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2319

++

2320

++	/*

2321

++	 * Reposition in fifo if next is older than rq.

2322

++	 */

2323

++	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&

2324

++	    time_before(next->fifo_time, rq->fifo_time)) {

2325

++		list_move(&rq->queuelist, &next->queuelist);

2326

++		rq->fifo_time = next->fifo_time;

2327

++	}

2328

++

2329

++	if (bfqq->next_rq == next)

2330

++		bfqq->next_rq = rq;

2331

++

2332

++	bfq_remove_request(next);

2333

++}

2334

++

2335

++/* Must be called with bfqq != NULL */

2336

++static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq)

2337

++{

2338

++	BUG_ON(bfqq == NULL);

2339

++	if (bfq_bfqq_busy(bfqq))

2340

++		bfqq->bfqd->wr_busy_queues--;

2341

++	bfqq->wr_coeff = 1;

2342

++	bfqq->wr_cur_max_time = 0;

2343

++	/* Trigger a weight change on the next activation of the queue */

2344

++	bfqq->entity.ioprio_changed = 1;

2345

++}

2346

++

2347

++static void bfq_end_wr_async_queues(struct bfq_data *bfqd,

2348

++				    struct bfq_group *bfqg)

2349

++{

2350

++	int i, j;

2351

++

2352

++	for (i = 0; i < 2; i++)

2353

++		for (j = 0; j < IOPRIO_BE_NR; j++)

2354

++			if (bfqg->async_bfqq[i][j] != NULL)

2355

++				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);

2356

++	if (bfqg->async_idle_bfqq != NULL)

2357

++		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);

2358

++}

2359

++

2360

++static void bfq_end_wr(struct bfq_data *bfqd)

2361

++{

2362

++	struct bfq_queue *bfqq;

2363

++

2364

++	spin_lock_irq(bfqd->queue->queue_lock);

2365

++

2366

++	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)

2367

++		bfq_bfqq_end_wr(bfqq);

2368

++	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)

2369

++		bfq_bfqq_end_wr(bfqq);

2370

++	bfq_end_wr_async(bfqd);

2371

++

2372

++	spin_unlock_irq(bfqd->queue->queue_lock);

2373

++}

2374

++

2375

++static int bfq_allow_merge(struct request_queue *q, struct request *rq,

2376

++			   struct bio *bio)

2377

++{

2378

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2379

++	struct bfq_io_cq *bic;

2380

++	struct bfq_queue *bfqq;

2381

++

2382

++	/*

2383

++	 * Disallow merge of a sync bio into an async request.

2384

++	 */

2385

++	if (bfq_bio_sync(bio) && !rq_is_sync(rq))

2386

++		return 0;

2387

++

2388

++	/*

2389

++	 * Lookup the bfqq that this bio will be queued with. Allow

2390

++	 * merge only if rq is queued there.

2391

++	 * Queue lock is held here.

2392

++	 */

2393

++	bic = bfq_bic_lookup(bfqd, current->io_context);

2394

++	if (bic == NULL)

2395

++		return 0;

2396

++

2397

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

2398

++	return bfqq == RQ_BFQQ(rq);

2399

++}

2400

++

2401

++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

2402

++				       struct bfq_queue *bfqq)

2403

++{

2404

++	if (bfqq != NULL) {

2405

++		bfq_mark_bfqq_must_alloc(bfqq);

2406

++		bfq_mark_bfqq_budget_new(bfqq);

2407

++		bfq_clear_bfqq_fifo_expire(bfqq);

2408

++

2409

++		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

2410

++

2411

++		bfq_log_bfqq(bfqd, bfqq,

2412

++			     "set_in_service_queue, cur-budget = %lu",

2413

++			     bfqq->entity.budget);

2414

++	}

2415

++

2416

++	bfqd->in_service_queue = bfqq;

2417

++}

2418

++

2419

++/*

2420

++ * Get and set a new queue for service.

2421

++ */

2422

++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,

2423

++						  struct bfq_queue *bfqq)

2424

++{

2425

++	if (!bfqq)

2426

++		bfqq = bfq_get_next_queue(bfqd);

2427

++	else

2428

++		bfq_get_next_queue_forced(bfqd, bfqq);

2429

++

2430

++	__bfq_set_in_service_queue(bfqd, bfqq);

2431

++	return bfqq;

2432

++}

2433

++

2434

++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,

2435

++					  struct request *rq)

2436

++{

2437

++	if (blk_rq_pos(rq) >= bfqd->last_position)

2438

++		return blk_rq_pos(rq) - bfqd->last_position;

2439

++	else

2440

++		return bfqd->last_position - blk_rq_pos(rq);

2441

++}

2442

++

2443

++/*

2444

++ * Return true if rq is close enough to bfqd->last_position, i.e., if its

2445

++ * distance from bfqd->last_position, in either direction, is at most

2446

++ * BFQQ_SEEK_THR

2447

++ */

2448

++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)

2449

++{

2450

++	return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;

2451

++}

2452

++

2453

++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

2454

++{

2455

++	struct rb_root *root = &bfqd->rq_pos_tree;

2456

++	struct rb_node *parent, *node;

2457

++	struct bfq_queue *__bfqq;

2458

++	sector_t sector = bfqd->last_position;

2459

++

2460

++	if (RB_EMPTY_ROOT(root))

2461

++		return NULL;

2462

++

2463

++	/*

2464

++	 * First, if we find a request starting at the end of the last

2465

++	 * request, choose it.

2466

++	 */

2467

++	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);

2468

++	if (__bfqq != NULL)

2469

++		return __bfqq;

2470

++

2471

++	/*

2472

++	 * If the exact sector wasn't found, the parent of the NULL leaf

2473

++	 * will contain the closest sector (rq_pos_tree sorted by

2474

++	 * next_request position).

2475

++	 */

2476

++	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);

2477

++	if (bfq_rq_close(bfqd, __bfqq->next_rq))

2478

++		return __bfqq;

2479

++

2480

++	if (blk_rq_pos(__bfqq->next_rq) < sector)

2481

++		node = rb_next(&__bfqq->pos_node);

2482

++	else

2483

++		node = rb_prev(&__bfqq->pos_node);

2484

++	if (node == NULL)

2485

++		return NULL;

2486

++

2487

++	__bfqq = rb_entry(node, struct bfq_queue, pos_node);

2488

++	if (bfq_rq_close(bfqd, __bfqq->next_rq))

2489

++		return __bfqq;

2490

++

2491

++	return NULL;

2492

++}

2493

++

2494

++/*

2495

++ * bfqd - obvious

2496

++ * cur_bfqq - passed in so that we don't decide that the current queue

2497

++ *            is closely cooperating with itself.

2498

++ *

2499

++ * We are assuming that cur_bfqq has dispatched at least one request,

2500

++ * and that bfqd->last_position reflects a position on the disk associated

2501

++ * with the I/O issued by cur_bfqq.

2502

++ */

2503

++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,

2504

++					      struct bfq_queue *cur_bfqq)

2505

++{

2506

++	struct bfq_queue *bfqq;

2507

++

2508

++	if (bfq_class_idle(cur_bfqq))

2509

++		return NULL;

2510

++	if (!bfq_bfqq_sync(cur_bfqq))

2511

++		return NULL;

2512

++	if (BFQQ_SEEKY(cur_bfqq))

2513

++		return NULL;

2514

++

2515

++	/* If device has only one backlogged bfq_queue, don't search. */

2516

++	if (bfqd->busy_queues == 1)

2517

++		return NULL;

2518

++

2519

++	/*

2520

++	 * We should notice if some of the queues are cooperating, e.g.

2521

++	 * working closely on the same area of the disk. In that case,

2522

++	 * we can group them together and avoid wasting time idling.

2523

++	 */

2524

++	bfqq = bfqq_close(bfqd);

2525

++	if (bfqq == NULL || bfqq == cur_bfqq)

2526

++		return NULL;

2527

++

2528

++	/*

2529

++	 * Do not merge queues from different bfq_groups.

2530

++	*/

2531

++	if (bfqq->entity.parent != cur_bfqq->entity.parent)

2532

++		return NULL;

2533

++

2534

++	/*

2535

++	 * It only makes sense to merge sync queues.

2536

++	 */

2537

++	if (!bfq_bfqq_sync(bfqq))

2538

++		return NULL;

2539

++	if (BFQQ_SEEKY(bfqq))

2540

++		return NULL;

2541

++

2542

++	/*

2543

++	 * Do not merge queues of different priority classes.

2544

++	 */

2545

++	if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))

2546

++		return NULL;

2547

++

2548

++	return bfqq;

2549

++}

2550

++

2551

++/*

2552

++ * If enough samples have been computed, return the current max budget

2553

++ * stored in bfqd, which is dynamically updated according to the

2554

++ * estimated disk peak rate; otherwise return the default max budget

2555

++ */

2556

++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)

2557

++{

2558

++	if (bfqd->budgets_assigned < 194)

2559

++		return bfq_default_max_budget;

2560

++	else

2561

++		return bfqd->bfq_max_budget;

2562

++}

2563

++

2564

++/*

2565

++ * Return min budget, which is a fraction of the current or default

2566

++ * max budget (trying with 1/32)

2567

++ */

2568

++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)

2569

++{

2570

++	if (bfqd->budgets_assigned < 194)

2571

++		return bfq_default_max_budget / 32;

2572

++	else

2573

++		return bfqd->bfq_max_budget / 32;

2574

++}

2575

++

2576

++static void bfq_arm_slice_timer(struct bfq_data *bfqd)

2577

++{

2578

++	struct bfq_queue *bfqq = bfqd->in_service_queue;

2579

++	struct bfq_io_cq *bic;

2580

++	unsigned long sl;

2581

++

2582

++	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

2583

++

2584

++	/* Processes have exited, don't wait. */

2585

++	bic = bfqd->in_service_bic;

2586

++	if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)

2587

++		return;

2588

++

2589

++	bfq_mark_bfqq_wait_request(bfqq);

2590

++

2591

++	/*

2592

++	 * We don't want to idle for seeks, but we do want to allow

2593

++	 * fair distribution of slice time for a process doing back-to-back

2594

++	 * seeks. So allow a little bit of time for it to submit a new rq.

2595

++	 *

2596

++	 * To prevent processes with (partly) seeky workloads from

2597

++	 * being too ill-treated, grant them a small fraction of the

2598

++	 * assigned budget before reducing the waiting time to

2599

++	 * BFQ_MIN_TT. This happened to help reduce latency.

2600

++	 */

2601

++	sl = bfqd->bfq_slice_idle;

2602

++	/*

2603

++	 * Unless the queue is being weight-raised, grant only minimum idle

2604

++	 * time if the queue either has been seeky for long enough or has

2605

++	 * already proved to be constantly seeky.

2606

++	 */

2607

++	if (bfq_sample_valid(bfqq->seek_samples) &&

2608

++	    ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >

2609

++				  bfq_max_budget(bfqq->bfqd) / 8) ||

2610

++	      bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1)

2611

++		sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));

2612

++	else if (bfqq->wr_coeff > 1)

2613

++		sl = sl * 3;

2614

++	bfqd->last_idling_start = ktime_get();

2615

++	mod_timer(&bfqd->idle_slice_timer, jiffies + sl);

2616

++	bfq_log(bfqd, "arm idle: %u/%u ms",

2617

++		jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));

2618

++}

2619

++

2620

++/*

2621

++ * Set the maximum time for the in-service queue to consume its

2622

++ * budget. This prevents seeky processes from lowering the disk

2623

++ * throughput (always guaranteed with a time slice scheme as in CFQ).

2624

++ */

2625

++static void bfq_set_budget_timeout(struct bfq_data *bfqd)

2626

++{

2627

++	struct bfq_queue *bfqq = bfqd->in_service_queue;

2628

++	unsigned int timeout_coeff;

2629

++	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)

2630

++		timeout_coeff = 1;

2631

++	else

2632

++		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;

2633

++

2634

++	bfqd->last_budget_start = ktime_get();

2635

++

2636

++	bfq_clear_bfqq_budget_new(bfqq);

2637

++	bfqq->budget_timeout = jiffies +

2638

++		bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;

2639

++

2640

++	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",

2641

++		jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *

2642

++		timeout_coeff));

2643

++}

2644

++

2645

++/*

2646

++ * Move request from internal lists to the request queue dispatch list.

2647

++ */

2648

++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)

2649

++{

2650

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2651

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2652

++

2653

++	/*

2654

++	 * For consistency, the next instruction should have been executed

2655

++	 * after removing the request from the queue and dispatching it.

2656

++	 * We execute instead this instruction before bfq_remove_request()

2657

++	 * (and hence introduce a temporary inconsistency), for efficiency.

2658

++	 * In fact, in a forced_dispatch, this prevents two counters related

2659

++	 * to bfqq->dispatched from being uselessly decremented if bfqq

2660

++	 * is not in service, and then incremented again after

2661

++	 * incrementing bfqq->dispatched.

2662

++	 */

2663

++	bfqq->dispatched++;

2664

++	bfq_remove_request(rq);

2665

++	elv_dispatch_sort(q, rq);

2666

++

2667

++	if (bfq_bfqq_sync(bfqq))

2668

++		bfqd->sync_flight++;

2669

++}

2670

++

2671

++/*

2672

++ * Return expired entry, or NULL to just start from scratch in rbtree.

2673

++ */

2674

++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)

2675

++{

2676

++	struct request *rq = NULL;

2677

++

2678

++	if (bfq_bfqq_fifo_expire(bfqq))

2679

++		return NULL;

2680

++

2681

++	bfq_mark_bfqq_fifo_expire(bfqq);

2682

++

2683

++	if (list_empty(&bfqq->fifo))

2684

++		return NULL;

2685

++

2686

++	rq = rq_entry_fifo(bfqq->fifo.next);

2687

++

2688

++	if (time_before(jiffies, rq->fifo_time))

2689

++		return NULL;

2690

++

2691

++	return rq;

2692

++}

2693

++

2694

++/* Must be called with the queue_lock held. */

2695

++static int bfqq_process_refs(struct bfq_queue *bfqq)

2696

++{

2697

++	int process_refs, io_refs;

2698

++

2699

++	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];

2700

++	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;

2701

++	BUG_ON(process_refs < 0);

2702

++	return process_refs;

2703

++}

2704

++

2705

++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

2706

++{

2707

++	int process_refs, new_process_refs;

2708

++	struct bfq_queue *__bfqq;

2709

++

2710

++	/*

2711

++	 * If there are no process references on the new_bfqq, then it is

2712

++	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain

2713

++	 * may have dropped their last reference (not just their last process

2714

++	 * reference).

2715

++	 */

2716

++	if (!bfqq_process_refs(new_bfqq))

2717

++		return;

2718

++

2719

++	/* Avoid a circular list and skip interim queue merges. */

2720

++	while ((__bfqq = new_bfqq->new_bfqq)) {

2721

++		if (__bfqq == bfqq)

2722

++			return;

2723

++		new_bfqq = __bfqq;

2724

++	}

2725

++

2726

++	process_refs = bfqq_process_refs(bfqq);

2727

++	new_process_refs = bfqq_process_refs(new_bfqq);

2728

++	/*

2729

++	 * If the process for the bfqq has gone away, there is no

2730

++	 * sense in merging the queues.

2731

++	 */

2732

++	if (process_refs == 0 || new_process_refs == 0)

2733

++		return;

2734

++

2735

++	/*

2736

++	 * Merge in the direction of the lesser amount of work.

2737

++	 */

2738

++	if (new_process_refs >= process_refs) {

2739

++		bfqq->new_bfqq = new_bfqq;

2740

++		atomic_add(process_refs, &new_bfqq->ref);

2741

++	} else {

2742

++		new_bfqq->new_bfqq = bfqq;

2743

++		atomic_add(new_process_refs, &bfqq->ref);

2744

++	}

2745

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",

2746

++		new_bfqq->pid);

2747

++}

2748

++

2749

++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)

2750

++{

2751

++	struct bfq_entity *entity = &bfqq->entity;

2752

++	return entity->budget - entity->service;

2753

++}

2754

++

2755

++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)

2756

++{

2757

++	BUG_ON(bfqq != bfqd->in_service_queue);

2758

++

2759

++	__bfq_bfqd_reset_in_service(bfqd);

2760

++

2761

++	/*

2762

++	 * If this bfqq is shared between multiple processes, check

2763

++	 * to make sure that those processes are still issuing I/Os

2764

++	 * within the mean seek distance. If not, it may be time to

2765

++	 * break the queues apart again.

2766

++	 */

2767

++	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))

2768

++		bfq_mark_bfqq_split_coop(bfqq);

2769

++

2770

++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

2771

++		/*

2772

++		 * Overloading budget_timeout field to store the time

2773

++		 * at which the queue remains with no backlog; used by

2774

++		 * the weight-raising mechanism.

2775

++		 */

2776

++		bfqq->budget_timeout = jiffies;

2777

++		bfq_del_bfqq_busy(bfqd, bfqq, 1);

2778

++	} else {

2779

++		bfq_activate_bfqq(bfqd, bfqq);

2780

++		/*

2781

++		 * Resort priority tree of potential close cooperators.

2782

++		 */

2783

++		bfq_rq_pos_tree_add(bfqd, bfqq);

2784

++	}

2785

++}

2786

++

2787

++/**

2788

++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.

2789

++ * @bfqd: device data.

2790

++ * @bfqq: queue to update.

2791

++ * @reason: reason for expiration.

2792

++ *

2793

++ * Handle the feedback on @bfqq budget.  See the body for detailed

2794

++ * comments.

2795

++ */

2796

++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,

2797

++				     struct bfq_queue *bfqq,

2798

++				     enum bfqq_expiration reason)

2799

++{

2800

++	struct request *next_rq;

2801

++	unsigned long budget, min_budget;

2802

++

2803

++	budget = bfqq->max_budget;

2804

++	min_budget = bfq_min_budget(bfqd);

2805

++

2806

++	BUG_ON(bfqq != bfqd->in_service_queue);

2807

++

2808

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",

2809

++		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));

2810

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",

2811

++		budget, bfq_min_budget(bfqd));

2812

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",

2813

++		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

2814

++

2815

++	if (bfq_bfqq_sync(bfqq)) {

2816

++		switch (reason) {

2817

++		/*

2818

++		 * Caveat: in all the following cases we trade latency

2819

++		 * for throughput.

2820

++		 */

2821

++		case BFQ_BFQQ_TOO_IDLE:

2822

++			/*

2823

++			 * This is the only case where we may reduce

2824

++			 * the budget: if there is no request of the

2825

++			 * process still waiting for completion, then

2826

++			 * we assume (tentatively) that the timer has

2827

++			 * expired because the batch of requests of

2828

++			 * the process could have been served with a

2829

++			 * smaller budget.  Hence, betting that

2830

++			 * process will behave in the same way when it

2831

++			 * becomes backlogged again, we reduce its

2832

++			 * next budget.  As long as we guess right,

2833

++			 * this budget cut reduces the latency

2834

++			 * experienced by the process.

2835

++			 *

2836

++			 * However, if there are still outstanding

2837

++			 * requests, then the process may have not yet

2838

++			 * issued its next request just because it is

2839

++			 * still waiting for the completion of some of

2840

++			 * the still outstanding ones.  So in this

2841

++			 * subcase we do not reduce its budget, on the

2842

++			 * contrary we increase it to possibly boost

2843

++			 * the throughput, as discussed in the

2844

++			 * comments to the BUDGET_TIMEOUT case.

2845

++			 */

2846

++			if (bfqq->dispatched > 0) /* still outstanding reqs */

2847

++				budget = min(budget * 2, bfqd->bfq_max_budget);

2848

++			else {

2849

++				if (budget > 5 * min_budget)

2850

++					budget -= 4 * min_budget;

2851

++				else

2852

++					budget = min_budget;

2853

++			}

2854

++			break;

2855

++		case BFQ_BFQQ_BUDGET_TIMEOUT:

2856

++			/*

2857

++			 * We double the budget here because: 1) it

2858

++			 * gives the chance to boost the throughput if

2859

++			 * this is not a seeky process (which may have

2860

++			 * bumped into this timeout because of, e.g.,

2861

++			 * ZBR), 2) together with charge_full_budget

2862

++			 * it helps give seeky processes higher

2863

++			 * timestamps, and hence be served less

2864

++			 * frequently.

2865

++			 */

2866

++			budget = min(budget * 2, bfqd->bfq_max_budget);

2867

++			break;

2868

++		case BFQ_BFQQ_BUDGET_EXHAUSTED:

2869

++			/*

2870

++			 * The process still has backlog, and did not

2871

++			 * let either the budget timeout or the disk

2872

++			 * idling timeout expire. Hence it is not

2873

++			 * seeky, has a short thinktime and may be

2874

++			 * happy with a higher budget too. So

2875

++			 * definitely increase the budget of this good

2876

++			 * candidate to boost the disk throughput.

2877

++			 */

2878

++			budget = min(budget * 4, bfqd->bfq_max_budget);

2879

++			break;

2880

++		case BFQ_BFQQ_NO_MORE_REQUESTS:

2881

++		       /*

2882

++			* Leave the budget unchanged.

2883

++			*/

2884

++		default:

2885

++			return;

2886

++		}

2887

++	} else /* async queue */

2888

++	    /* async queues always get the maximum possible budget

2889

++	     * (their ability to dispatch is limited by

2890

++	     * @bfqd->bfq_max_budget_async_rq).

2891

++	     */

2892

++		budget = bfqd->bfq_max_budget;

2893

++

2894

++	bfqq->max_budget = budget;

2895

++

2896

++	if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&

2897

++	    bfqq->max_budget > bfqd->bfq_max_budget)

2898

++		bfqq->max_budget = bfqd->bfq_max_budget;

2899

++

2900

++	/*

2901

++	 * Make sure that we have enough budget for the next request.

2902

++	 * Since the finish time of the bfqq must be kept in sync with

2903

++	 * the budget, be sure to call __bfq_bfqq_expire() after the

2904

++	 * update.

2905

++	 */

2906

++	next_rq = bfqq->next_rq;

2907

++	if (next_rq != NULL)

2908

++		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,

2909

++					    bfq_serv_to_charge(next_rq, bfqq));

2910

++	else

2911

++		bfqq->entity.budget = bfqq->max_budget;

2912

++

2913

++	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",

2914

++			next_rq != NULL ? blk_rq_sectors(next_rq) : 0,

2915

++			bfqq->entity.budget);

2916

++}

2917

++

2918

++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)

2919

++{

2920

++	unsigned long max_budget;

2921

++

2922

++	/*

2923

++	 * The max_budget calculated when autotuning is equal to the

2924

++	 * amount of sectors transferred in timeout_sync at the

2925

++	 * estimated peak rate.

2926

++	 */

2927

++	max_budget = (unsigned long)(peak_rate * 1000 *

2928

++				     timeout >> BFQ_RATE_SHIFT);

2929

++

2930

++	return max_budget;

2931

++}

2932

++

2933

++/*

2934

++ * In addition to updating the peak rate, checks whether the process

2935

++ * is "slow", and returns 1 if so. This slow flag is used, in addition

2936

++ * to the budget timeout, to reduce the amount of service provided to

2937

++ * seeky processes, and hence reduce their chances to lower the

2938

++ * throughput. See the code for more details.

2939

++ */

2940

++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,

2941

++				int compensate, enum bfqq_expiration reason)

2942

++{

2943

++	u64 bw, usecs, expected, timeout;

2944

++	ktime_t delta;

2945

++	int update = 0;

2946

++

2947

++	if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))

2948

++		return 0;

2949

++

2950

++	if (compensate)

2951

++		delta = bfqd->last_idling_start;

2952

++	else

2953

++		delta = ktime_get();

2954

++	delta = ktime_sub(delta, bfqd->last_budget_start);

2955

++	usecs = ktime_to_us(delta);

2956

++

2957

++	/* Don't trust short/unrealistic values. */

2958

++	if (usecs < 100 || usecs >= LONG_MAX)

2959

++		return 0;

2960

++

2961

++	/*

2962

++	 * Calculate the bandwidth for the last slice.  We use a 64 bit

2963

++	 * value to store the peak rate, in sectors per usec in fixed

2964

++	 * point math.  We do so to have enough precision in the estimate

2965

++	 * and to avoid overflows.

2966

++	 */

2967

++	bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;

2968

++	do_div(bw, (unsigned long)usecs);

2969

++

2970

++	timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

2971

++

2972

++	/*

2973

++	 * Use only long (> 20ms) intervals to filter out spikes for

2974

++	 * the peak rate estimation.

2975

++	 */

2976

++	if (usecs > 20000) {

2977

++		if (bw > bfqd->peak_rate ||

2978

++		   (!BFQQ_SEEKY(bfqq) &&

2979

++		    reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {

2980

++			bfq_log(bfqd, "measured bw =%llu", bw);

2981

++			/*

2982

++			 * To smooth oscillations use a low-pass filter with

2983

++			 * alpha=7/8, i.e.,

2984

++			 * new_rate = (7/8) * old_rate + (1/8) * bw

2985

++			 */

2986

++			do_div(bw, 8);

2987

++			if (bw == 0)

2988

++				return 0;

2989

++			bfqd->peak_rate *= 7;

2990

++			do_div(bfqd->peak_rate, 8);

2991

++			bfqd->peak_rate += bw;

2992

++			update = 1;

2993

++			bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);

2994

++		}

2995

++

2996

++		update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;

2997

++

2998

++		if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)

2999

++			bfqd->peak_rate_samples++;

3000

++

3001

++		if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&

3002

++		    update) {

3003

++			int dev_type = blk_queue_nonrot(bfqd->queue);

3004

++			if (bfqd->bfq_user_max_budget == 0) {

3005

++				bfqd->bfq_max_budget =

3006

++					bfq_calc_max_budget(bfqd->peak_rate,

3007

++							    timeout);

3008

++				bfq_log(bfqd, "new max_budget=%lu",

3009

++					bfqd->bfq_max_budget);

3010

++			}

3011

++			if (bfqd->device_speed == BFQ_BFQD_FAST &&

3012

++			    bfqd->peak_rate < device_speed_thresh[dev_type]) {

3013

++				bfqd->device_speed = BFQ_BFQD_SLOW;

3014

++				bfqd->RT_prod = R_slow[dev_type] *

3015

++						T_slow[dev_type];

3016

++			} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&

3017

++			    bfqd->peak_rate > device_speed_thresh[dev_type]) {

3018

++				bfqd->device_speed = BFQ_BFQD_FAST;

3019

++				bfqd->RT_prod = R_fast[dev_type] *

3020

++						T_fast[dev_type];

3021

++			}

3022

++		}

3023

++	}

3024

++

3025

++	/*

3026

++	 * If the process has been served for too short a time

3027

++	 * interval to let its possible sequential accesses prevail on

3028

++	 * the initial seek time needed to move the disk head on the

3029

++	 * first sector it requested, then give the process a chance

3030

++	 * and for the moment return false.

3031

++	 */

3032

++	if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)

3033

++		return 0;

3034

++

3035

++	/*

3036

++	 * A process is considered ``slow'' (i.e., seeky, so that we

3037

++	 * cannot treat it fairly in the service domain, as it would

3038

++	 * slow down too much the other processes) if, when a slice

3039

++	 * ends for whatever reason, it has received service at a

3040

++	 * rate that would not be high enough to complete the budget

3041

++	 * before the budget timeout expiration.

3042

++	 */

3043

++	expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;

3044

++

3045

++	/*

3046

++	 * Caveat: processes doing IO in the slower disk zones will

3047

++	 * tend to be slow(er) even if not seeky. And the estimated

3048

++	 * peak rate will actually be an average over the disk

3049

++	 * surface. Hence, to not be too harsh with unlucky processes,

3050

++	 * we keep a budget/3 margin of safety before declaring a

3051

++	 * process slow.

3052

++	 */

3053

++	return expected > (4 * bfqq->entity.budget) / 3;

3054

++}

3055

++

3056

++/*

3057

++ * To be deemed as soft real-time, an application must meet two

3058

++ * requirements. First, the application must not require an average

3059

++ * bandwidth higher than the approximate bandwidth required to playback or

3060

++ * record a compressed high-definition video.

3061

++ * The next function is invoked on the completion of the last request of a

3062

++ * batch, to compute the next-start time instant, soft_rt_next_start, such

3063

++ * that, if the next request of the application does not arrive before

3064

++ * soft_rt_next_start, then the above requirement on the bandwidth is met.

3065

++ *

3066

++ * The second requirement is that the request pattern of the application is

3067

++ * isochronous, i.e., that, after issuing a request or a batch of requests,

3068

++ * the application stops issuing new requests until all its pending requests

3069

++ * have been completed. After that, the application may issue a new batch,

3070

++ * and so on.

3071

++ * For this reason the next function is invoked to compute

3072

++ * soft_rt_next_start only for applications that meet this requirement,

3073

++ * whereas soft_rt_next_start is set to infinity for applications that do

3074

++ * not.

3075

++ *

3076

++ * Unfortunately, even a greedy application may happen to behave in an

3077

++ * isochronous way if the CPU load is high. In fact, the application may

3078

++ * stop issuing requests while the CPUs are busy serving other processes,

3079

++ * then restart, then stop again for a while, and so on. In addition, if

3080

++ * the disk achieves a low enough throughput with the request pattern

3081

++ * issued by the application (e.g., because the request pattern is random

3082

++ * and/or the device is slow), then the application may meet the above

3083

++ * bandwidth requirement too. To prevent such a greedy application from

3084

++ * being deemed soft real-time, a further rule is used in the computation of

3085

++ * soft_rt_next_start: soft_rt_next_start must be higher than the current

3086

++ * time plus the maximum time for which the arrival of a request is waited

3087

++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.

3088

++ * This filters out greedy applications, as the latter issue instead their

3089

++ * next request as soon as possible after the last one has been completed

3090

++ * (in contrast, when a batch of requests is completed, a soft real-time

3091

++ * application spends some time processing data).

3092

++ *

3093

++ * Unfortunately, the last filter may easily generate false positives if

3094

++ * only bfqd->bfq_slice_idle is used as a reference time interval and one

3095

++ * or both the following cases occur:

3096

++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher

3097

++ *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with

3098

++ *    HZ=100.

3099

++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing

3100

++ *    for a while, then suddenly 'jump' by several units to recover the lost

3101

++ *    increments. This seems to happen, e.g., inside virtual machines.

3102

++ * To address this issue, we do not use as a reference time interval just

3103

++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In

3104

++ * particular we add the minimum number of jiffies for which the filter

3105

++ * seems to be quite precise also in embedded systems and KVM/QEMU virtual

3106

++ * machines.

3107

++ */

3108

++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,

3109

++						       struct bfq_queue *bfqq)

3110

++{

3111

++	return max(bfqq->last_idle_bklogged +

3112

++		   HZ * bfqq->service_from_backlogged /

3113

++		   bfqd->bfq_wr_max_softrt_rate,

3114

++		   jiffies + bfqq->bfqd->bfq_slice_idle + 4);

3115

++}

3116

++

3117

++/*

3118

++ * Return the largest-possible time instant such that, for as long as possible,

3119

++ * the current time will be lower than this time instant according to the macro

3120

++ * time_is_before_jiffies().

3121

++ */

3122

++static inline unsigned long bfq_infinity_from_now(unsigned long now)

3123

++{

3124

++	return now + ULONG_MAX / 2;

3125

++}

3126

++

3127

++/**

3128

++ * bfq_bfqq_expire - expire a queue.

3129

++ * @bfqd: device owning the queue.

3130

++ * @bfqq: the queue to expire.

3131

++ * @compensate: if true, compensate for the time spent idling.

3132

++ * @reason: the reason causing the expiration.

3133

++ *

3134

++ *

3135

++ * If the process associated to the queue is slow (i.e., seeky), or in

3136

++ * case of budget timeout, or, finally, if it is async, we

3137

++ * artificially charge it an entire budget (independently of the

3138

++ * actual service it received). As a consequence, the queue will get

3139

++ * higher timestamps than the correct ones upon reactivation, and

3140

++ * hence it will be rescheduled as if it had received more service

3141

++ * than what it actually received. In the end, this class of processes

3142

++ * will receive less service in proportion to how slowly they consume

3143

++ * their budgets (and hence how seriously they tend to lower the

3144

++ * throughput).

3145

++ *

3146

++ * In contrast, when a queue expires because it has been idling for

3147

++ * too long or because it exhausted its budget, we do not touch the

3148

++ * amount of service it has received. Hence when the queue is

3149

++ * reactivated and its timestamps updated, the latter will be in sync

3150

++ * with the actual service received by the queue until expiration.

3151

++ *

3152

++ * Charging a full budget to the first type of queues and the exact

3153

++ * service to the others has the effect of using the WF2Q+ policy to

3154

++ * schedule the former on a timeslice basis, without violating the

3155

++ * service domain guarantees of the latter.

3156

++ */

3157

++static void bfq_bfqq_expire(struct bfq_data *bfqd,

3158

++			    struct bfq_queue *bfqq,

3159

++			    int compensate,

3160

++			    enum bfqq_expiration reason)

3161

++{

3162

++	int slow;

3163

++	BUG_ON(bfqq != bfqd->in_service_queue);

3164

++

3165

++	/* Update disk peak rate for autotuning and check whether the

3166

++	 * process is slow (see bfq_update_peak_rate).

3167

++	 */

3168

++	slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);

3169

++

3170

++	/*

3171

++	 * As explained above, 'punish' slow (i.e., seeky), timed-out

3172

++	 * and async queues, to favor sequential sync workloads.

3173

++	 *

3174

++	 * Processes doing I/O in the slower disk zones will tend to be

3175

++	 * slow(er) even if not seeky. Hence, since the estimated peak

3176

++	 * rate is actually an average over the disk surface, these

3177

++	 * processes may timeout just for bad luck. To avoid punishing

3178

++	 * them we do not charge a full budget to a process that

3179

++	 * succeeded in consuming at least 2/3 of its budget.

3180

++	 */

3181

++	if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&

3182

++		     bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3))

3183

++		bfq_bfqq_charge_full_budget(bfqq);

3184

++

3185

++	bfqq->service_from_backlogged += bfqq->entity.service;

3186

++

3187

++	if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&

3188

++	    !bfq_bfqq_constantly_seeky(bfqq)) {

3189

++		bfq_mark_bfqq_constantly_seeky(bfqq);

3190

++		if (!blk_queue_nonrot(bfqd->queue))

3191

++			bfqd->const_seeky_busy_in_flight_queues++;

3192

++	}

3193

++

3194

++	if (reason == BFQ_BFQQ_TOO_IDLE &&

3195

++	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10 )

3196

++		bfq_clear_bfqq_IO_bound(bfqq);

3197

++

3198

++	if (bfqd->low_latency && bfqq->wr_coeff == 1)

3199

++		bfqq->last_wr_start_finish = jiffies;

3200

++

3201

++	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&

3202

++	    RB_EMPTY_ROOT(&bfqq->sort_list)) {

3203

++		/*

3204

++		 * If we get here, and there are no outstanding requests,

3205

++		 * then the request pattern is isochronous (see the comments

3206

++		 * to the function bfq_bfqq_softrt_next_start()). Hence we

3207

++		 * can compute soft_rt_next_start. If, instead, the queue

3208

++		 * still has outstanding requests, then we have to wait

3209

++		 * for the completion of all the outstanding requests to

3210

++		 * discover whether the request pattern is actually

3211

++		 * isochronous.

3212

++		 */

3213

++		if (bfqq->dispatched == 0)

3214

++			bfqq->soft_rt_next_start =

3215

++				bfq_bfqq_softrt_next_start(bfqd, bfqq);

3216

++		else {

3217

++			/*

3218

++			 * The application is still waiting for the

3219

++			 * completion of one or more requests:

3220

++			 * prevent it from possibly being incorrectly

3221

++			 * deemed as soft real-time by setting its

3222

++			 * soft_rt_next_start to infinity. In fact,

3223

++			 * without this assignment, the application

3224

++			 * would be incorrectly deemed as soft

3225

++			 * real-time if:

3226

++			 * 1) it issued a new request before the

3227

++			 *    completion of all its in-flight

3228

++			 *    requests, and

3229

++			 * 2) at that time, its soft_rt_next_start

3230

++			 *    happened to be in the past.

3231

++			 */

3232

++			bfqq->soft_rt_next_start =

3233

++				bfq_infinity_from_now(jiffies);

3234

++			/*

3235

++			 * Schedule an update of soft_rt_next_start to when

3236

++			 * the task may be discovered to be isochronous.

3237

++			 */

3238

++			bfq_mark_bfqq_softrt_update(bfqq);

3239

++		}

3240

++	}

3241

++

3242

++	bfq_log_bfqq(bfqd, bfqq,

3243

++		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,

3244

++		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

3245

++

3246

++	/*

3247

++	 * Increase, decrease or leave budget unchanged according to

3248

++	 * reason.

3249

++	 */

3250

++	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);

3251

++	__bfq_bfqq_expire(bfqd, bfqq);

3252

++}

3253

++

3254

++/*

3255

++ * Budget timeout is not implemented through a dedicated timer, but

3256

++ * just checked on request arrivals and completions, as well as on

3257

++ * idle timer expirations.

3258

++ */

3259

++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)

3260

++{

3261

++	if (bfq_bfqq_budget_new(bfqq) ||

3262

++	    time_before(jiffies, bfqq->budget_timeout))

3263

++		return 0;

3264

++	return 1;

3265

++}

3266

++

3267

++/*

3268

++ * If we expire a queue that is waiting for the arrival of a new

3269

++ * request, we may prevent the fictitious timestamp back-shifting that

3270

++ * allows the guarantees of the queue to be preserved (see [1] for

3271

++ * this tricky aspect). Hence we return true only if this condition

3272

++ * does not hold, or if the queue is slow enough to deserve only to be

3273

++ * kicked off for preserving a high throughput.

3274

++*/

3275

++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)

3276

++{

3277

++	bfq_log_bfqq(bfqq->bfqd, bfqq,

3278

++		"may_budget_timeout: wait_request %d left %d timeout %d",

3279

++		bfq_bfqq_wait_request(bfqq),

3280

++			bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3,

3281

++		bfq_bfqq_budget_timeout(bfqq));

3282

++

3283

++	return (!bfq_bfqq_wait_request(bfqq) ||

3284

++		bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3)

3285

++		&&

3286

++		bfq_bfqq_budget_timeout(bfqq);

3287

++}

3288

++

3289

++/*

3290

++ * Device idling is allowed only for the queues for which this function

3291

++ * returns true. For this reason, the return value of this function plays a

3292

++ * critical role for both throughput boosting and service guarantees. The

3293

++ * return value is computed through a logical expression. In this rather

3294

++ * long comment, we try to briefly describe all the details and motivations

3295

++ * behind the components of this logical expression.

3296

++ *

3297

++ * First, the expression is false if bfqq is not sync, or if: bfqq happened

3298

++ * to become active during a large burst of queue activations, and the

3299

++ * pattern of requests bfqq contains boosts the throughput if bfqq is

3300

++ * expired. In fact, queues that became active during a large burst benefit

3301

++ * only from throughput, as discussed in the comments to bfq_handle_burst.

3302

++ * In this respect, expiring bfqq certainly boosts the throughput on NCQ-

3303

++ * capable flash-based devices, whereas, on rotational devices, it boosts

3304

++ * the throughput only if bfqq contains random requests.

3305

++ *

3306

++ * On the opposite end, if (a) bfqq is sync, (b) the above burst-related

3307

++ * condition does not hold, and (c) bfqq is being weight-raised, then the

3308

++ * expression always evaluates to true, as device idling is instrumental

3309

++ * for preserving low-latency guarantees (see [1]). If, instead, conditions

3310

++ * (a) and (b) do hold, but (c) does not, then the expression evaluates to

3311

++ * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and

3312

++ * (2) at least one of the following two conditions holds.

3313

++ * The first condition is that the device is not performing NCQ, because

3314

++ * idling the device most certainly boosts the throughput if this condition

3315

++ * holds and bfqq is I/O-bound and has been granted a non-null idle window.

3316

++ * The second compound condition is made of the logical AND of two components.

3317

++ *

3318

++ * The first component is true only if there is no weight-raised busy

3319

++ * queue. This guarantees that the device is not idled for a sync non-

3320

++ * weight-raised queue when there are busy weight-raised queues. The former

3321

++ * is then expired immediately if empty. Combined with the timestamping

3322

++ * rules of BFQ (see [1] for details), this causes sync non-weight-raised

3323

++ * queues to get a lower number of requests served, and hence to ask for a

3324

++ * lower number of requests from the request pool, before the busy weight-

3325

++ * raised queues get served again.

3326

++ *

3327

++ * This is beneficial for the processes associated with weight-raised

3328

++ * queues, when the request pool is saturated (e.g., in the presence of

3329

++ * write hogs). In fact, if the processes associated with the other queues

3330

++ * ask for requests at a lower rate, then weight-raised processes have a

3331

++ * higher probability to get a request from the pool immediately (or at

3332

++ * least soon) when they need one. Hence they have a higher probability to

3333

++ * actually get a fraction of the disk throughput proportional to their

3334

++ * high weight. This is especially true with NCQ-capable drives, which

3335

++ * enqueue several requests in advance and further reorder internally-

3336

++ * queued requests.

3337

++ *

3338

++ * In the end, mistreating non-weight-raised queues when there are busy

3339

++ * weight-raised queues seems to mitigate starvation problems in the

3340

++ * presence of heavy write workloads and NCQ, and hence to guarantee a

3341

++ * higher application and system responsiveness in these hostile scenarios.

3342

++ *

3343

++ * If the first component of the compound condition is instead true, i.e.,

3344

++ * there is no weight-raised busy queue, then the second component of the

3345

++ * compound condition takes into account service-guarantee and throughput

3346

++ * issues related to NCQ (recall that the compound condition is evaluated

3347

++ * only if the device is detected as supporting NCQ).

3348

++ *

3349

++ * As for service guarantees, allowing the drive to enqueue more than one

3350

++ * request at a time, and hence delegating de facto final scheduling

3351

++ * decisions to the drive's internal scheduler, causes loss of control on

3352

++ * the actual request service order. In this respect, when the drive is

3353

++ * allowed to enqueue more than one request at a time, the service

3354

++ * distribution enforced by the drive's internal scheduler is likely to

3355

++ * coincide with the desired device-throughput distribution only in the

3356

++ * following, perfectly symmetric, scenario:

3357

++ * 1) all active queues have the same weight,

3358

++ * 2) all active groups at the same level in the groups tree have the same

3359

++ *    weight,

3360

++ * 3) all active groups at the same level in the groups tree have the same

3361

++ *    number of children.

3362

++ *

3363

++ * Even in such a scenario, sequential I/O may still receive a preferential

3364

++ * treatment, but this is not likely to be a big issue with flash-based

3365

++ * devices, because of their non-dramatic loss of throughput with random

3366

++ * I/O. Things do differ with HDDs, for which additional care is taken, as

3367

++ * explained after completing the discussion for flash-based devices.

3368

++ *

3369

++ * Unfortunately, keeping the necessary state for evaluating exactly the

3370

++ * above symmetry conditions would be quite complex and time-consuming.

3371

++ * Therefore BFQ evaluates instead the following stronger sub-conditions,

3372

++ * for which it is much easier to maintain the needed state:

3373

++ * 1) all active queues have the same weight,

3374

++ * 2) all active groups have the same weight,

3375

++ * 3) all active groups have at most one active child each.

3376

++ * In particular, the last two conditions are always true if hierarchical

3377

++ * support and the cgroups interface are not enabled, hence no state needs

3378

++ * to be maintained in this case.

3379

++ *

3380

++ * According to the above considerations, the second component of the

3381

++ * compound condition evaluates to true if any of the above symmetry

3382

++ * sub-conditions does not hold, or the device is not flash-based. Therefore,

3383

++ * if also the first component is true, then idling is allowed for a sync

3384

++ * queue. These are the only sub-conditions considered if the device is

3385

++ * flash-based, as, for such a device, it is sensible to force idling only

3386

++ * for service-guarantee issues. In fact, as for throughput, idling

3387

++ * NCQ-capable flash-based devices would not boost the throughput even

3388

++ * with sequential I/O; rather it would lower the throughput in proportion

3389

++ * to how fast the device is. In the end, (only) if all the three

3390

++ * sub-conditions hold and the device is flash-based, the compound

3391

++ * condition evaluates to false and therefore no idling is performed.

3392

++ *

3393

++ * As already said, things change with a rotational device, where idling

3394

++ * boosts the throughput with sequential I/O (even with NCQ). Hence, for

3395

++ * such a device the second component of the compound condition evaluates

3396

++ * to true also if the following additional sub-condition does not hold:

3397

++ * the queue is constantly seeky. Unfortunately, this different behavior

3398

++ * with respect to flash-based devices causes an additional asymmetry: if

3399

++ * some sync queues enjoy idling and some other sync queues do not, then

3400

++ * the latter get a low share of the device throughput, simply because the

3401

++ * former get many requests served after being set as in service, whereas

3402

++ * the latter do not. As a consequence, to guarantee the desired throughput

3403

++ * distribution, on HDDs the compound expression evaluates to true (and

3404

++ * hence device idling is performed) also if the following last symmetry

3405

++ * condition does not hold: no other queue is benefiting from idling. Also

3406

++ * this last condition is actually replaced with a simpler-to-maintain and

3407

++ * stronger condition: there is no busy queue which is not constantly seeky

3408

++ * (and hence may also benefit from idling).

3409

++ *

3410

++ * To sum up, when all the required symmetry and throughput-boosting

3411

++ * sub-conditions hold, the second component of the compound condition

3412

++ * evaluates to false, and hence no idling is performed. This helps to

3413

++ * keep the drives' internal queues full on NCQ-capable devices, and hence

3414

++ * to boost the throughput, without causing 'almost' any loss of service

3415

++ * guarantees. The 'almost' follows from the fact that, if the internal

3416

++ * queue of one such device is filled while all the sub-conditions hold,

3417

++ * but at some point in time some sub-condition ceases to hold, then it may

3418

++ * become impossible to let requests be served in the new desired order

3419

++ * until all the requests already queued in the device have been served.

3420

++ */

3421

++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)

3422

++{

3423

++	struct bfq_data *bfqd = bfqq->bfqd;

3424

++#ifdef CONFIG_CGROUP_BFQIO

3425

++#define symmetric_scenario	  (!bfqd->active_numerous_groups && \

3426

++				   !bfq_differentiated_weights(bfqd))

3427

++#else

3428

++#define symmetric_scenario	  (!bfq_differentiated_weights(bfqd))

3429

++#endif

3430

++#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \

3431

++				   bfqd->busy_in_flight_queues == \

3432

++				   bfqd->const_seeky_busy_in_flight_queues)

3433

++

3434

++#define cond_for_expiring_in_burst	(bfq_bfqq_in_large_burst(bfqq) && \

3435

++					 bfqd->hw_tag && \

3436

++					 (blk_queue_nonrot(bfqd->queue) || \

3437

++					  bfq_bfqq_constantly_seeky(bfqq)))

3438

++

3439

++/*

3440

++ * Condition for expiring a non-weight-raised queue (and hence not idling

3441

++ * the device).

3442

++ */

3443

++#define cond_for_expiring_non_wr  (bfqd->hw_tag && \

3444

++				   (bfqd->wr_busy_queues > 0 || \

3445

++				    (symmetric_scenario && \

3446

++				     (blk_queue_nonrot(bfqd->queue) || \

3447

++				      cond_for_seeky_on_ncq_hdd))))

3448

++

3449

++	return bfq_bfqq_sync(bfqq) &&

3450

++		!cond_for_expiring_in_burst &&

3451

++		(bfqq->wr_coeff > 1 ||

3452

++		 (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) &&

3453

++		  !cond_for_expiring_non_wr)

3454

++	);

3455

++}

3456

++

3457

++/*

3458

++ * If the in-service queue is empty but sync, and the function

3459

++ * bfq_bfqq_must_not_expire returns true, then:

3460

++ * 1) the queue must remain in service and cannot be expired, and

3461

++ * 2) the disk must be idled to wait for the possible arrival of a new

3462

++ *    request for the queue.

3463

++ * See the comments to the function bfq_bfqq_must_not_expire for the reasons

3464

++ * why performing device idling is the best choice to boost the throughput

3465

++ * and preserve service guarantees when bfq_bfqq_must_not_expire itself

3466

++ * returns true.

3467

++ */

3468

++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)

3469

++{

3470

++	struct bfq_data *bfqd = bfqq->bfqd;

3471

++

3472

++	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&

3473

++	       bfq_bfqq_must_not_expire(bfqq);

3474

++}

3475

++

3476

++/*

3477

++ * Select a queue for service.  If we have a current queue in service,

3478

++ * check whether to continue servicing it, or retrieve and set a new one.

3479

++ */

3480

++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

3481

++{

3482

++	struct bfq_queue *bfqq, *new_bfqq = NULL;

3483

++	struct request *next_rq;

3484

++	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;

3485

++

3486

++	bfqq = bfqd->in_service_queue;

3487

++	if (bfqq == NULL)

3488

++		goto new_queue;

3489

++

3490

++	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

3491

++

3492

++	/*

3493

++         * If another queue has a request waiting within our mean seek

3494

++         * distance, let it run. The expire code will check for close

3495

++         * cooperators and put the close queue at the front of the

3496

++         * service tree. If possible, merge the expiring queue with the

3497

++         * new bfqq.

3498

++         */

3499

++        new_bfqq = bfq_close_cooperator(bfqd, bfqq);

3500

++        if (new_bfqq != NULL && bfqq->new_bfqq == NULL)

3501

++                bfq_setup_merge(bfqq, new_bfqq);

3502

++

3503

++	if (bfq_may_expire_for_budg_timeout(bfqq) &&

3504

++	    !timer_pending(&bfqd->idle_slice_timer) &&

3505

++	    !bfq_bfqq_must_idle(bfqq))

3506

++		goto expire;

3507

++

3508

++	next_rq = bfqq->next_rq;

3509

++	/*

3510

++	 * If bfqq has requests queued and it has enough budget left to

3511

++	 * serve them, keep the queue, otherwise expire it.

3512

++	 */

3513

++	if (next_rq != NULL) {

3514

++		if (bfq_serv_to_charge(next_rq, bfqq) >

3515

++			bfq_bfqq_budget_left(bfqq)) {

3516

++			reason = BFQ_BFQQ_BUDGET_EXHAUSTED;

3517

++			goto expire;

3518

++		} else {

3519

++			/*

3520

++			 * The idle timer may be pending because we may

3521

++			 * not disable disk idling even when a new request

3522

++			 * arrives.

3523

++			 */

3524

++			if (timer_pending(&bfqd->idle_slice_timer)) {

3525

++				/*

3526

++				 * If we get here: 1) at least one new request

3527

++				 * has arrived but we have not disabled the

3528

++				 * timer because the request was too small,

3529

++				 * 2) then the block layer has unplugged

3530

++				 * the device, causing the dispatch to be

3531

++				 * invoked.

3532

++				 *

3533

++				 * Since the device is unplugged, now the

3534

++				 * requests are probably large enough to

3535

++				 * provide a reasonable throughput.

3536

++				 * So we disable idling.

3537

++				 */

3538

++				bfq_clear_bfqq_wait_request(bfqq);

3539

++				del_timer(&bfqd->idle_slice_timer);

3540

++			}

3541

++			if (new_bfqq == NULL)

3542

++				goto keep_queue;

3543

++			else

3544

++				goto expire;

3545

++		}

3546

++	}

3547

++

3548

++	/*

3549

++	 * No requests pending.  If the in-service queue still has requests

3550

++	 * in flight (possibly waiting for a completion) or is idling for a

3551

++	 * new request, then keep it.

3552

++	 */

3553

++	if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||

3554

++	    (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {

3555

++		bfqq = NULL;

3556

++		goto keep_queue;

3557

++	} else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {

3558

++		/*

3559

++		 * Expiring the queue because there is a close cooperator,

3560

++		 * cancel timer.

3561

++		 */

3562

++		bfq_clear_bfqq_wait_request(bfqq);

3563

++		del_timer(&bfqd->idle_slice_timer);

3564

++	}

3565

++

3566

++	reason = BFQ_BFQQ_NO_MORE_REQUESTS;

3567

++expire:

3568

++	bfq_bfqq_expire(bfqd, bfqq, 0, reason);

3569

++new_queue:

3570

++	bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);

3571

++	bfq_log(bfqd, "select_queue: new queue %d returned",

3572

++		bfqq != NULL ? bfqq->pid : 0);

3573

++keep_queue:

3574

++	return bfqq;

3575

++}

3576

++

3577

++static void bfq_update_wr_data(struct bfq_data *bfqd,

3578

++			       struct bfq_queue *bfqq)

3579

++{

3580

++	if (bfqq->wr_coeff > 1) { /* queue is being boosted */

3581

++		struct bfq_entity *entity = &bfqq->entity;

3582

++

3583

++		bfq_log_bfqq(bfqd, bfqq,

3584

++			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",

3585

++			jiffies_to_msecs(jiffies -

3586

++				bfqq->last_wr_start_finish),

3587

++			jiffies_to_msecs(bfqq->wr_cur_max_time),

3588

++			bfqq->wr_coeff,

3589

++			bfqq->entity.weight, bfqq->entity.orig_weight);

3590

++

3591

++		BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=

3592

++		       entity->orig_weight * bfqq->wr_coeff);

3593

++		if (entity->ioprio_changed)

3594

++			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

3595

++		/*

3596

++		 * If the queue was activated in a burst, or

3597

++		 * too much time has elapsed from the beginning

3598

++		 * of this weight-raising, then end weight raising.

3599

++		 */

3600

++		if (bfq_bfqq_in_large_burst(bfqq) ||

3601

++		    time_is_before_jiffies(bfqq->last_wr_start_finish +

3602

++					   bfqq->wr_cur_max_time)) {

3603

++			bfqq->last_wr_start_finish = jiffies;

3604

++			bfq_log_bfqq(bfqd, bfqq,

3605

++				     "wrais ending at %lu, rais_max_time %u",

3606

++				     bfqq->last_wr_start_finish,

3607

++				     jiffies_to_msecs(bfqq->wr_cur_max_time));

3608

++			bfq_bfqq_end_wr(bfqq);

3609

++			__bfq_entity_update_weight_prio(

3610

++				bfq_entity_service_tree(entity),

3611

++				entity);

3612

++		}

3613

++	}

3614

++}

3615

++

3616

++/*

3617

++ * Dispatch one request from bfqq, moving it to the request queue

3618

++ * dispatch list.

3619

++ */

3620

++static int bfq_dispatch_request(struct bfq_data *bfqd,

3621

++				struct bfq_queue *bfqq)

3622

++{

3623

++	int dispatched = 0;

3624

++	struct request *rq;

3625

++	unsigned long service_to_charge;

3626

++

3627

++	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));

3628

++

3629

++	/* Follow expired path, else get first next available. */

3630

++	rq = bfq_check_fifo(bfqq);

3631

++	if (rq == NULL)

3632

++		rq = bfqq->next_rq;

3633

++	service_to_charge = bfq_serv_to_charge(rq, bfqq);

3634

++

3635

++	if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {

3636

++		/*

3637

++		 * This may happen if the next rq is chosen in fifo order

3638

++		 * instead of sector order. The budget is properly

3639

++		 * dimensioned to be always sufficient to serve the next

3640

++		 * request only if it is chosen in sector order. The reason

3641

++		 * is that it would be quite inefficient and of little use

3642

++		 * to always make sure that the budget is large enough to

3643

++		 * serve even the possible next rq in fifo order.

3644

++		 * In fact, requests are seldom served in fifo order.

3645

++		 *

3646

++		 * Expire the queue for budget exhaustion, and make sure

3647

++		 * that the next act_budget is enough to serve the next

3648

++		 * request, even if it comes from the fifo expired path.

3649

++		 */

3650

++		bfqq->next_rq = rq;

3651

++		/*

3652

++		 * Since this dispatch has failed, make sure that

3653

++		 * a new one will be performed

3654

++		 */

3655

++		if (!bfqd->rq_in_driver)

3656

++			bfq_schedule_dispatch(bfqd);

3657

++		goto expire;

3658

++	}

3659

++

3660

++	/* Finally, insert request into driver dispatch list. */

3661

++	bfq_bfqq_served(bfqq, service_to_charge);

3662

++	bfq_dispatch_insert(bfqd->queue, rq);

3663

++

3664

++	bfq_update_wr_data(bfqd, bfqq);

3665

++

3666

++	bfq_log_bfqq(bfqd, bfqq,

3667

++			"dispatched %u sec req (%llu), budg left %lu",

3668

++			blk_rq_sectors(rq),

3669

++			(long long unsigned)blk_rq_pos(rq),

3670

++			bfq_bfqq_budget_left(bfqq));

3671

++

3672

++	dispatched++;

3673

++

3674

++	if (bfqd->in_service_bic == NULL) {

3675

++		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);

3676

++		bfqd->in_service_bic = RQ_BIC(rq);

3677

++	}

3678

++

3679

++	if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&

3680

++	    dispatched >= bfqd->bfq_max_budget_async_rq) ||

3681

++	    bfq_class_idle(bfqq)))

3682

++		goto expire;

3683

++

3684

++	return dispatched;

3685

++

3686

++expire:

3687

++	bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);

3688

++	return dispatched;

3689

++}

3690

++

3691

++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)

3692

++{

3693

++	int dispatched = 0;

3694

++

3695

++	while (bfqq->next_rq != NULL) {

3696

++		bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);

3697

++		dispatched++;

3698

++	}

3699

++

3700

++	BUG_ON(!list_empty(&bfqq->fifo));

3701

++	return dispatched;

3702

++}

3703

++

3704

++/*

3705

++ * Drain our current requests.

3706

++ * Used for barriers and when switching io schedulers on-the-fly.

3707

++ */

3708

++static int bfq_forced_dispatch(struct bfq_data *bfqd)

3709

++{

3710

++	struct bfq_queue *bfqq, *n;

3711

++	struct bfq_service_tree *st;

3712

++	int dispatched = 0;

3713

++

3714

++	bfqq = bfqd->in_service_queue;

3715

++	if (bfqq != NULL)

3716

++		__bfq_bfqq_expire(bfqd, bfqq);

3717

++

3718

++	/*

3719

++	 * Loop through classes, and be careful to leave the scheduler

3720

++	 * in a consistent state, as feedback mechanisms and vtime

3721

++	 * updates cannot be disabled during the process.

3722

++	 */

3723

++	list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {

3724

++		st = bfq_entity_service_tree(&bfqq->entity);

3725

++

3726

++		dispatched += __bfq_forced_dispatch_bfqq(bfqq);

3727

++		bfqq->max_budget = bfq_max_budget(bfqd);

3728

++

3729

++		bfq_forget_idle(st);

3730

++	}

3731

++

3732

++	BUG_ON(bfqd->busy_queues != 0);

3733

++

3734

++	return dispatched;

3735

++}

3736

++

3737

++static int bfq_dispatch_requests(struct request_queue *q, int force)

3738

++{

3739

++	struct bfq_data *bfqd = q->elevator->elevator_data;

3740

++	struct bfq_queue *bfqq;

3741

++	int max_dispatch;

3742

++

3743

++	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

3744

++	if (bfqd->busy_queues == 0)

3745

++		return 0;

3746

++

3747

++	if (unlikely(force))

3748

++		return bfq_forced_dispatch(bfqd);

3749

++

3750

++	bfqq = bfq_select_queue(bfqd);

3751

++	if (bfqq == NULL)

3752

++		return 0;

3753

++

3754

++	max_dispatch = bfqd->bfq_quantum;

3755

++	if (bfq_class_idle(bfqq))

3756

++		max_dispatch = 1;

3757

++

3758

++	if (!bfq_bfqq_sync(bfqq))

3759

++		max_dispatch = bfqd->bfq_max_budget_async_rq;

3760

++

3761

++	if (bfqq->dispatched >= max_dispatch) {

3762

++		if (bfqd->busy_queues > 1)

3763

++			return 0;

3764

++		if (bfqq->dispatched >= 4 * max_dispatch)

3765

++			return 0;

3766

++	}

3767

++

3768

++	if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))

3769

++		return 0;

3770

++

3771

++	bfq_clear_bfqq_wait_request(bfqq);

3772

++	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

3773

++

3774

++	if (!bfq_dispatch_request(bfqd, bfqq))

3775

++		return 0;

3776

++

3777

++	bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",

3778

++			bfqq->pid, max_dispatch);

3779

++

3780

++	return 1;

3781

++}

3782

++

3783

++/*

3784

++ * Task holds one reference to the queue, dropped when task exits.  Each rq

3785

++ * in-flight on this queue also holds a reference, dropped when rq is freed.

3786

++ *

3787

++ * Queue lock must be held here.

3788

++ */

3789

++static void bfq_put_queue(struct bfq_queue *bfqq)

3790

++{

3791

++	struct bfq_data *bfqd = bfqq->bfqd;

3792

++

3793

++	BUG_ON(atomic_read(&bfqq->ref) <= 0);

3794

++

3795

++	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,

3796

++		     atomic_read(&bfqq->ref));

3797

++	if (!atomic_dec_and_test(&bfqq->ref))

3798

++		return;

3799

++

3800

++	BUG_ON(rb_first(&bfqq->sort_list) != NULL);

3801

++	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);

3802

++	BUG_ON(bfqq->entity.tree != NULL);

3803

++	BUG_ON(bfq_bfqq_busy(bfqq));

3804

++	BUG_ON(bfqd->in_service_queue == bfqq);

3805

++

3806

++	if (bfq_bfqq_sync(bfqq))

3807

++		/*

3808

++		 * The fact that this queue is being destroyed does not

3809

++		 * invalidate the fact that this queue may have been

3810

++		 * activated during the current burst. As a consequence,

3811

++		 * although the queue does not exist anymore, and hence

3812

++		 * needs to be removed from the burst list if there,

3813

++		 * the burst size must not be decremented.

3814

++		 */

3815

++		hlist_del_init(&bfqq->burst_list_node);

3816

++

3817

++	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);

3818

++

3819

++	kmem_cache_free(bfq_pool, bfqq);

3820

++}

3821

++

3822

++static void bfq_put_cooperator(struct bfq_queue *bfqq)

3823

++{

3824

++	struct bfq_queue *__bfqq, *next;

3825

++

3826

++	/*

3827

++	 * If this queue was scheduled to merge with another queue, be

3828

++	 * sure to drop the reference taken on that queue (and others in

3829

++	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.

3830

++	 */

3831

++	__bfqq = bfqq->new_bfqq;

3832

++	while (__bfqq) {

3833

++		if (__bfqq == bfqq)

3834

++			break;

3835

++		next = __bfqq->new_bfqq;

3836

++		bfq_put_queue(__bfqq);

3837

++		__bfqq = next;

3838

++	}

3839

++}

3840

++

3841

++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

3842

++{

3843

++	if (bfqq == bfqd->in_service_queue) {

3844

++		__bfq_bfqq_expire(bfqd, bfqq);

3845

++		bfq_schedule_dispatch(bfqd);

3846

++	}

3847

++

3848

++	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,

3849

++		     atomic_read(&bfqq->ref));

3850

++

3851

++	bfq_put_cooperator(bfqq);

3852

++

3853

++	bfq_put_queue(bfqq);

3854

++}

3855

++

3856

++static inline void bfq_init_icq(struct io_cq *icq)

3857

++{

3858

++	struct bfq_io_cq *bic = icq_to_bic(icq);

3859

++

3860

++	bic->ttime.last_end_request = jiffies;

3861

++}

3862

++

3863

++static void bfq_exit_icq(struct io_cq *icq)

3864

++{

3865

++	struct bfq_io_cq *bic = icq_to_bic(icq);

3866

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

3867

++

3868

++	if (bic->bfqq[BLK_RW_ASYNC]) {

3869

++		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);

3870

++		bic->bfqq[BLK_RW_ASYNC] = NULL;

3871

++	}

3872

++

3873

++	if (bic->bfqq[BLK_RW_SYNC]) {

3874

++		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);

3875

++		bic->bfqq[BLK_RW_SYNC] = NULL;

3876

++	}

3877

++}

3878

++

3879

++/*

3880

++ * Update the entity prio values; note that the new values will not

3881

++ * be used until the next (re)activation.

3882

++ */

3883

++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)

3884

++{

3885

++	struct task_struct *tsk = current;

3886

++	int ioprio_class;

3887

++

3888

++	if (!bfq_bfqq_prio_changed(bfqq))

3889

++		return;

3890

++

3891

++	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);

3892

++	switch (ioprio_class) {

3893

++	default:

3894

++		dev_err(bfqq->bfqd->queue->backing_dev_info.dev,

3895

++			"bfq: bad prio class %d\n", ioprio_class);

3896

++	case IOPRIO_CLASS_NONE:

3897

++		/*

3898

++		 * No prio set, inherit CPU scheduling settings.

3899

++		 */

3900

++		bfqq->entity.new_ioprio = task_nice_ioprio(tsk);

3901

++		bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);

3902

++		break;

3903

++	case IOPRIO_CLASS_RT:

3904

++		bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

3905

++		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;

3906

++		break;

3907

++	case IOPRIO_CLASS_BE:

3908

++		bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

3909

++		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;

3910

++		break;

3911

++	case IOPRIO_CLASS_IDLE:

3912

++		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;

3913

++		bfqq->entity.new_ioprio = 7;

3914

++		bfq_clear_bfqq_idle_window(bfqq);

3915

++		break;

3916

++	}

3917

++

3918

++	if (bfqq->entity.new_ioprio < 0 ||

3919

++	    bfqq->entity.new_ioprio >= IOPRIO_BE_NR) {

3920

++		printk(KERN_CRIT "bfq_init_prio_data: new_ioprio %d\n",

3921

++				 bfqq->entity.new_ioprio);

3922

++		BUG();

3923

++	}

3924

++

3925

++	bfqq->entity.ioprio_changed = 1;

3926

++

3927

++	bfq_clear_bfqq_prio_changed(bfqq);

3928

++}

3929

++

3930

++static void bfq_changed_ioprio(struct bfq_io_cq *bic)

3931

++{

3932

++	struct bfq_data *bfqd;

3933

++	struct bfq_queue *bfqq, *new_bfqq;

3934

++	struct bfq_group *bfqg;

3935

++	unsigned long uninitialized_var(flags);

3936

++	int ioprio = bic->icq.ioc->ioprio;

3937

++

3938

++	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),

3939

++				   &flags);

3940

++	/*

3941

++	 * This condition may trigger on a newly created bic, be sure to

3942

++	 * drop the lock before returning.

3943

++	 */

3944

++	if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))

3945

++		goto out;

3946

++

3947

++	bfqq = bic->bfqq[BLK_RW_ASYNC];

3948

++	if (bfqq != NULL) {

3949

++		bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,

3950

++				    sched_data);

3951

++		new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,

3952

++					 GFP_ATOMIC);

3953

++		if (new_bfqq != NULL) {

3954

++			bic->bfqq[BLK_RW_ASYNC] = new_bfqq;

3955

++			bfq_log_bfqq(bfqd, bfqq,

3956

++				     "changed_ioprio: bfqq %p %d",

3957

++				     bfqq, atomic_read(&bfqq->ref));

3958

++			bfq_put_queue(bfqq);

3959

++		}

3960

++	}

3961

++

3962

++	bfqq = bic->bfqq[BLK_RW_SYNC];

3963

++	if (bfqq != NULL)

3964

++		bfq_mark_bfqq_prio_changed(bfqq);

3965

++

3966

++	bic->ioprio = ioprio;

3967

++

3968

++out:

3969

++	bfq_put_bfqd_unlock(bfqd, &flags);

3970

++}

3971

++

3972

++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

3973

++			  pid_t pid, int is_sync)

3974

++{

3975

++	RB_CLEAR_NODE(&bfqq->entity.rb_node);

3976

++	INIT_LIST_HEAD(&bfqq->fifo);

3977

++	INIT_HLIST_NODE(&bfqq->burst_list_node);

3978

++

3979

++	atomic_set(&bfqq->ref, 0);

3980

++	bfqq->bfqd = bfqd;

3981

++

3982

++	bfq_mark_bfqq_prio_changed(bfqq);

3983

++

3984

++	if (is_sync) {

3985

++		if (!bfq_class_idle(bfqq))

3986

++			bfq_mark_bfqq_idle_window(bfqq);

3987

++		bfq_mark_bfqq_sync(bfqq);

3988

++	}

3989

++	bfq_mark_bfqq_IO_bound(bfqq);

3990

++

3991

++	/* Tentative initial value to trade off between thr and lat */

3992

++	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;

3993

++	bfqq->pid = pid;

3994

++

3995

++	bfqq->wr_coeff = 1;

3996

++	bfqq->last_wr_start_finish = 0;

3997

++	/*

3998

++	 * Set to the value for which bfqq will not be deemed as

3999

++	 * soft rt when it becomes backlogged.

4000

++	 */

4001

++	bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);

4002

++}

4003

++

4004

++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,

4005

++					      struct bfq_group *bfqg,

4006

++					      int is_sync,

4007

++					      struct bfq_io_cq *bic,

4008

++					      gfp_t gfp_mask)

4009

++{

4010

++	struct bfq_queue *bfqq, *new_bfqq = NULL;

4011

++

4012

++retry:

4013

++	/* bic always exists here */

4014

++	bfqq = bic_to_bfqq(bic, is_sync);

4015

++

4016

++	/*

4017

++	 * Always try a new alloc if we fall back to the OOM bfqq

4018

++	 * originally, since it should just be a temporary situation.

4019

++	 */

4020

++	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {

4021

++		bfqq = NULL;

4022

++		if (new_bfqq != NULL) {

4023

++			bfqq = new_bfqq;

4024

++			new_bfqq = NULL;

4025

++		} else if (gfp_mask & __GFP_WAIT) {

4026

++			spin_unlock_irq(bfqd->queue->queue_lock);

4027

++			new_bfqq = kmem_cache_alloc_node(bfq_pool,

4028

++					gfp_mask | __GFP_ZERO,

4029

++					bfqd->queue->node);

4030

++			spin_lock_irq(bfqd->queue->queue_lock);

4031

++			if (new_bfqq != NULL)

4032

++				goto retry;

4033

++		} else {

4034

++			bfqq = kmem_cache_alloc_node(bfq_pool,

4035

++					gfp_mask | __GFP_ZERO,

4036

++					bfqd->queue->node);

4037

++		}

4038

++

4039

++		if (bfqq != NULL) {

4040

++			bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);

4041

++			bfq_init_prio_data(bfqq, bic);

4042

++			bfq_init_entity(&bfqq->entity, bfqg);

4043

++			bfq_log_bfqq(bfqd, bfqq, "allocated");

4044

++		} else {

4045

++			bfqq = &bfqd->oom_bfqq;

4046

++			bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");

4047

++		}

4048

++	}

4049

++

4050

++	if (new_bfqq != NULL)

4051

++		kmem_cache_free(bfq_pool, new_bfqq);

4052

++

4053

++	return bfqq;

4054

++}

4055

++

4056

++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,

4057

++					       struct bfq_group *bfqg,

4058

++					       int ioprio_class, int ioprio)

4059

++{

4060

++	switch (ioprio_class) {

4061

++	case IOPRIO_CLASS_RT:

4062

++		return &bfqg->async_bfqq[0][ioprio];

4063

++	case IOPRIO_CLASS_NONE:

4064

++		ioprio = IOPRIO_NORM;

4065

++		/* fall through */

4066

++	case IOPRIO_CLASS_BE:

4067

++		return &bfqg->async_bfqq[1][ioprio];

4068

++	case IOPRIO_CLASS_IDLE:

4069

++		return &bfqg->async_idle_bfqq;

4070

++	default:

4071

++		BUG();

4072

++	}

4073

++}

4074

++

4075

++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,

4076

++				       struct bfq_group *bfqg, int is_sync,

4077

++				       struct bfq_io_cq *bic, gfp_t gfp_mask)

4078

++{

4079

++	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

4080

++	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);

4081

++	struct bfq_queue **async_bfqq = NULL;

4082

++	struct bfq_queue *bfqq = NULL;

4083

++

4084

++	if (!is_sync) {

4085

++		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,

4086

++						  ioprio);

4087

++		bfqq = *async_bfqq;

4088

++	}

4089

++

4090

++	if (bfqq == NULL)

4091

++		bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);

4092

++

4093

++	/*

4094

++	 * Pin the queue now that it's allocated, scheduler exit will

4095

++	 * prune it.

4096

++	 */

4097

++	if (!is_sync && *async_bfqq == NULL) {

4098

++		atomic_inc(&bfqq->ref);

4099

++		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",

4100

++			     bfqq, atomic_read(&bfqq->ref));

4101

++		*async_bfqq = bfqq;

4102

++	}

4103

++

4104

++	atomic_inc(&bfqq->ref);

4105

++	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,

4106

++		     atomic_read(&bfqq->ref));

4107

++	return bfqq;

4108

++}

4109

++

4110

++static void bfq_update_io_thinktime(struct bfq_data *bfqd,

4111

++				    struct bfq_io_cq *bic)

4112

++{

4113

++	unsigned long elapsed = jiffies - bic->ttime.last_end_request;

4114

++	unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);

4115

++

4116

++	bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;

4117

++	bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;

4118

++	bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /

4119

++				bic->ttime.ttime_samples;

4120

++}

4121

++

4122

++static void bfq_update_io_seektime(struct bfq_data *bfqd,

4123

++				   struct bfq_queue *bfqq,

4124

++				   struct request *rq)

4125

++{

4126

++	sector_t sdist;

4127

++	u64 total;

4128

++

4129

++	if (bfqq->last_request_pos < blk_rq_pos(rq))

4130

++		sdist = blk_rq_pos(rq) - bfqq->last_request_pos;

4131

++	else

4132

++		sdist = bfqq->last_request_pos - blk_rq_pos(rq);

4133

++

4134

++	/*

4135

++	 * Don't allow the seek distance to get too large from the

4136

++	 * odd fragment, pagein, etc.

4137

++	 */

4138

++	if (bfqq->seek_samples == 0) /* first request, not really a seek */

4139

++		sdist = 0;

4140

++	else if (bfqq->seek_samples <= 60) /* second & third seek */

4141

++		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);

4142

++	else

4143

++		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);

4144

++

4145

++	bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;

4146

++	bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;

4147

++	total = bfqq->seek_total + (bfqq->seek_samples/2);

4148

++	do_div(total, bfqq->seek_samples);

4149

++	bfqq->seek_mean = (sector_t)total;

4150

++

4151

++	bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,

4152

++			(u64)bfqq->seek_mean);

4153

++}

4154

++

4155

++/*

4156

++ * Disable idle window if the process thinks too long or seeks so much that

4157

++ * it doesn't matter.

4158

++ */

4159

++static void bfq_update_idle_window(struct bfq_data *bfqd,

4160

++				   struct bfq_queue *bfqq,

4161

++				   struct bfq_io_cq *bic)

4162

++{

4163

++	int enable_idle;

4164

++

4165

++	/* Don't idle for async or idle io prio class. */

4166

++	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))

4167

++		return;

4168

++

4169

++	enable_idle = bfq_bfqq_idle_window(bfqq);

4170

++

4171

++	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||

4172

++	    bfqd->bfq_slice_idle == 0 ||

4173

++		(bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&

4174

++			bfqq->wr_coeff == 1))

4175

++		enable_idle = 0;

4176

++	else if (bfq_sample_valid(bic->ttime.ttime_samples)) {

4177

++		if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&

4178

++			bfqq->wr_coeff == 1)

4179

++			enable_idle = 0;

4180

++		else

4181

++			enable_idle = 1;

4182

++	}

4183

++	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",

4184

++		enable_idle);

4185

++

4186

++	if (enable_idle)

4187

++		bfq_mark_bfqq_idle_window(bfqq);

4188

++	else

4189

++		bfq_clear_bfqq_idle_window(bfqq);

4190

++}

4191

++

4192

++/*

4193

++ * Called when a new fs request (rq) is added to bfqq.  Check if there's

4194

++ * something we should do about it.

4195

++ */

4196

++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

4197

++			    struct request *rq)

4198

++{

4199

++	struct bfq_io_cq *bic = RQ_BIC(rq);

4200

++

4201

++	if (rq->cmd_flags & REQ_META)

4202

++		bfqq->meta_pending++;

4203

++

4204

++	bfq_update_io_thinktime(bfqd, bic);

4205

++	bfq_update_io_seektime(bfqd, bfqq, rq);

4206

++	if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {

4207

++		bfq_clear_bfqq_constantly_seeky(bfqq);

4208

++		if (!blk_queue_nonrot(bfqd->queue)) {

4209

++			BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);

4210

++			bfqd->const_seeky_busy_in_flight_queues--;

4211

++		}

4212

++	}

4213

++	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||

4214

++	    !BFQQ_SEEKY(bfqq))

4215

++		bfq_update_idle_window(bfqd, bfqq, bic);

4216

++

4217

++	bfq_log_bfqq(bfqd, bfqq,

4218

++		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",

4219

++		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),

4220

++		     (long long unsigned)bfqq->seek_mean);

4221

++

4222

++	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

4223

++

4224

++	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {

4225

++		int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&

4226

++				blk_rq_sectors(rq) < 32;

4227

++		int budget_timeout = bfq_bfqq_budget_timeout(bfqq);

4228

++

4229

++		/*

4230

++		 * There is just this request queued: if the request

4231

++		 * is small and the queue is not to be expired, then

4232

++		 * just exit.

4233

++		 *

4234

++		 * In this way, if the disk is being idled to wait for

4235

++		 * a new request from the in-service queue, we avoid

4236

++		 * unplugging the device and committing the disk to serve

4237

++		 * just a small request. On the contrary, we wait for

4238

++		 * the block layer to decide when to unplug the device:

4239

++		 * hopefully, new requests will be merged to this one

4240

++		 * quickly, then the device will be unplugged and

4241

++		 * larger requests will be dispatched.

4242

++		 */

4243

++		if (small_req && !budget_timeout)

4244

++			return;

4245

++

4246

++		/*

4247

++		 * A large enough request arrived, or the queue is to

4248

++		 * be expired: in both cases disk idling is to be

4249

++		 * stopped, so clear wait_request flag and reset

4250

++		 * timer.

4251

++		 */

4252

++		bfq_clear_bfqq_wait_request(bfqq);

4253

++		del_timer(&bfqd->idle_slice_timer);

4254

++

4255

++		/*

4256

++		 * The queue is not empty, because a new request just

4257

++		 * arrived. Hence we can safely expire the queue, in

4258

++		 * case of budget timeout, without risking that the

4259

++		 * timestamps of the queue are not updated correctly.

4260

++		 * See [1] for more details.

4261

++		 */

4262

++		if (budget_timeout)

4263

++			bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);

4264

++

4265

++		/*

4266

++		 * Let the request rip immediately, or let a new queue be

4267

++		 * selected if bfqq has just been expired.

4268

++		 */

4269

++		__blk_run_queue(bfqd->queue);

4270

++	}

4271

++}

4272

++

4273

++static void bfq_insert_request(struct request_queue *q, struct request *rq)

4274

++{

4275

++	struct bfq_data *bfqd = q->elevator->elevator_data;

4276

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

4277

++

4278

++	assert_spin_locked(bfqd->queue->queue_lock);

4279

++	bfq_init_prio_data(bfqq, RQ_BIC(rq));

4280

++

4281

++	bfq_add_request(rq);

4282

++

4283

++	rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];

4284

++	list_add_tail(&rq->queuelist, &bfqq->fifo);

4285

++

4286

++	bfq_rq_enqueued(bfqd, bfqq, rq);

4287

++}

4288

++

4289

++static void bfq_update_hw_tag(struct bfq_data *bfqd)

4290

++{

4291

++	bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,

4292

++				     bfqd->rq_in_driver);

4293

++

4294

++	if (bfqd->hw_tag == 1)

4295

++		return;

4296

++

4297

++	/*

4298

++	 * This sample is valid if the number of outstanding requests

4299

++	 * is large enough to allow a queueing behavior.  Note that the

4300

++	 * sum is not exact, as it's not taking into account deactivated

4301

++	 * requests.

4302

++	 */

4303

++	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)

4304

++		return;

4305

++

4306

++	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)

4307

++		return;

4308

++

4309

++	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;

4310

++	bfqd->max_rq_in_driver = 0;

4311

++	bfqd->hw_tag_samples = 0;

4312

++}

4313

++

4314

++static void bfq_completed_request(struct request_queue *q, struct request *rq)

4315

++{

4316

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

4317

++	struct bfq_data *bfqd = bfqq->bfqd;

4318

++	bool sync = bfq_bfqq_sync(bfqq);

4319

++

4320

++	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",

4321

++		     blk_rq_sectors(rq), sync);

4322

++

4323

++	bfq_update_hw_tag(bfqd);

4324

++

4325

++	BUG_ON(!bfqd->rq_in_driver);

4326

++	BUG_ON(!bfqq->dispatched);

4327

++	bfqd->rq_in_driver--;

4328

++	bfqq->dispatched--;

4329

++

4330

++	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {

4331

++		bfq_weights_tree_remove(bfqd, &bfqq->entity,

4332

++					&bfqd->queue_weights_tree);

4333

++		if (!blk_queue_nonrot(bfqd->queue)) {

4334

++			BUG_ON(!bfqd->busy_in_flight_queues);

4335

++			bfqd->busy_in_flight_queues--;

4336

++			if (bfq_bfqq_constantly_seeky(bfqq)) {

4337

++				BUG_ON(!bfqd->

4338

++					const_seeky_busy_in_flight_queues);

4339

++				bfqd->const_seeky_busy_in_flight_queues--;

4340

++			}

4341

++		}

4342

++	}

4343

++

4344

++	if (sync) {

4345

++		bfqd->sync_flight--;

4346

++		RQ_BIC(rq)->ttime.last_end_request = jiffies;

4347

++	}

4348

++

4349

++	/*

4350

++	 * If we are waiting to discover whether the request pattern of the

4351

++	 * task associated with the queue is actually isochronous, and

4352

++	 * both requisites for this condition to hold are satisfied, then

4353

++	 * compute soft_rt_next_start (see the comments to the function

4354

++	 * bfq_bfqq_softrt_next_start()).

4355

++	 */

4356

++	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&

4357

++	    RB_EMPTY_ROOT(&bfqq->sort_list))

4358

++		bfqq->soft_rt_next_start =

4359

++			bfq_bfqq_softrt_next_start(bfqd, bfqq);

4360

++

4361

++	/*

4362

++	 * If this is the in-service queue, check if it needs to be expired,

4363

++	 * or if we want to idle in case it has no pending requests.

4364

++	 */

4365

++	if (bfqd->in_service_queue == bfqq) {

4366

++		if (bfq_bfqq_budget_new(bfqq))

4367

++			bfq_set_budget_timeout(bfqd);

4368

++

4369

++		if (bfq_bfqq_must_idle(bfqq)) {

4370

++			bfq_arm_slice_timer(bfqd);

4371

++			goto out;

4372

++		} else if (bfq_may_expire_for_budg_timeout(bfqq))

4373

++			bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);

4374

++		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&

4375

++			 (bfqq->dispatched == 0 ||

4376

++			  !bfq_bfqq_must_not_expire(bfqq)))

4377

++			bfq_bfqq_expire(bfqd, bfqq, 0,

4378

++					BFQ_BFQQ_NO_MORE_REQUESTS);

4379

++	}

4380

++

4381

++	if (!bfqd->rq_in_driver)

4382

++		bfq_schedule_dispatch(bfqd);

4383

++

4384

++out:

4385

++	return;

4386

++}

4387

++

4388

++static inline int __bfq_may_queue(struct bfq_queue *bfqq)

4389

++{

4390

++	if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {

4391

++		bfq_clear_bfqq_must_alloc(bfqq);

4392

++		return ELV_MQUEUE_MUST;

4393

++	}

4394

++

4395

++	return ELV_MQUEUE_MAY;

4396

++}

4397

++

4398

++static int bfq_may_queue(struct request_queue *q, int rw)

4399

++{

4400

++	struct bfq_data *bfqd = q->elevator->elevator_data;

4401

++	struct task_struct *tsk = current;

4402

++	struct bfq_io_cq *bic;

4403

++	struct bfq_queue *bfqq;

4404

++

4405

++	/*

4406

++	 * Don't force setup of a queue from here, as a call to may_queue

4407

++	 * does not necessarily imply that a request actually will be

4408

++	 * queued. So just lookup a possibly existing queue, or return

4409

++	 * 'may queue' if that fails.

4410

++	 */

4411

++	bic = bfq_bic_lookup(bfqd, tsk->io_context);

4412

++	if (bic == NULL)

4413

++		return ELV_MQUEUE_MAY;

4414

++

4415

++	bfqq = bic_to_bfqq(bic, rw_is_sync(rw));

4416

++	if (bfqq != NULL) {

4417

++		bfq_init_prio_data(bfqq, bic);

4418

++

4419

++		return __bfq_may_queue(bfqq);

4420

++	}

4421

++

4422

++	return ELV_MQUEUE_MAY;

4423

++}

4424

++

4425

++/*

4426

++ * Queue lock held here.

4427

++ */

4428

++static void bfq_put_request(struct request *rq)

4429

++{

4430

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

4431

++

4432

++	if (bfqq != NULL) {

4433

++		const int rw = rq_data_dir(rq);

4434

++

4435

++		BUG_ON(!bfqq->allocated[rw]);

4436

++		bfqq->allocated[rw]--;

4437

++

4438

++		rq->elv.priv[0] = NULL;

4439

++		rq->elv.priv[1] = NULL;

4440

++

4441

++		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",

4442

++			     bfqq, atomic_read(&bfqq->ref));

4443

++		bfq_put_queue(bfqq);

4444

++	}

4445

++}

4446

++

4447

++static struct bfq_queue *

4448

++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

4449

++		struct bfq_queue *bfqq)

4450

++{

4451

++	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",

4452

++		(long unsigned)bfqq->new_bfqq->pid);

4453

++	bic_set_bfqq(bic, bfqq->new_bfqq, 1);

4454

++	bfq_mark_bfqq_coop(bfqq->new_bfqq);

4455

++	bfq_put_queue(bfqq);

4456

++	return bic_to_bfqq(bic, 1);

4457

++}

4458

++

4459

++/*

4460

++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this

4461

++ * was the last process referring to said bfqq.

4462

++ */

4463

++static struct bfq_queue *

4464

++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)

4465

++{

4466

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

4467

++	if (bfqq_process_refs(bfqq) == 1) {

4468

++		bfqq->pid = current->pid;

4469

++		bfq_clear_bfqq_coop(bfqq);

4470

++		bfq_clear_bfqq_split_coop(bfqq);

4471

++		return bfqq;

4472

++	}

4473

++

4474

++	bic_set_bfqq(bic, NULL, 1);

4475

++

4476

++	bfq_put_cooperator(bfqq);

4477

++

4478

++	bfq_put_queue(bfqq);

4479

++	return NULL;

4480

++}

4481

++

4482

++/*

4483

++ * Allocate bfq data structures associated with this request.

4484

++ */

4485

++static int bfq_set_request(struct request_queue *q, struct request *rq,

4486

++			   struct bio *bio, gfp_t gfp_mask)

4487

++{

4488

++	struct bfq_data *bfqd = q->elevator->elevator_data;

4489

++	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);

4490

++	const int rw = rq_data_dir(rq);

4491

++	const int is_sync = rq_is_sync(rq);

4492

++	struct bfq_queue *bfqq;

4493

++	struct bfq_group *bfqg;

4494

++	unsigned long flags;

4495

++

4496

++	might_sleep_if(gfp_mask & __GFP_WAIT);

4497

++

4498

++	bfq_changed_ioprio(bic);

4499

++

4500

++	spin_lock_irqsave(q->queue_lock, flags);

4501

++

4502

++	if (bic == NULL)

4503

++		goto queue_fail;

4504

++

4505

++	bfqg = bfq_bic_update_cgroup(bic);

4506

++

4507

++new_queue:

4508

++	bfqq = bic_to_bfqq(bic, is_sync);

4509

++	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {

4510

++		bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);

4511

++		bic_set_bfqq(bic, bfqq, is_sync);

4512

++	} else {

4513

++		/*

4514

++		 * If the queue was seeky for too long, break it apart.

4515

++		 */

4516

++		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {

4517

++			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");

4518

++			bfqq = bfq_split_bfqq(bic, bfqq);

4519

++			if (!bfqq)

4520

++				goto new_queue;

4521

++		}

4522

++

4523

++		/*

4524

++		 * Check to see if this queue is scheduled to merge with

4525

++		 * another closely cooperating queue. The merging of queues

4526

++		 * happens here as it must be done in process context.

4527

++		 * The reference on new_bfqq was taken in merge_bfqqs.

4528

++		 */

4529

++		if (bfqq->new_bfqq != NULL)

4530

++			bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);

4531

++	}

4532

++

4533

++	bfqq->allocated[rw]++;

4534

++	atomic_inc(&bfqq->ref);

4535

++	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,

4536

++		     atomic_read(&bfqq->ref));

4537

++

4538

++	rq->elv.priv[0] = bic;

4539

++	rq->elv.priv[1] = bfqq;

4540

++

4541

++	spin_unlock_irqrestore(q->queue_lock, flags);

4542

++

4543

++	return 0;

4544

++

4545

++queue_fail:

4546

++	bfq_schedule_dispatch(bfqd);

4547

++	spin_unlock_irqrestore(q->queue_lock, flags);

4548

++

4549

++	return 1;

4550

++}

4551

++

4552

++static void bfq_kick_queue(struct work_struct *work)

4553

++{

4554

++	struct bfq_data *bfqd =

4555

++		container_of(work, struct bfq_data, unplug_work);

4556

++	struct request_queue *q = bfqd->queue;

4557

++

4558

++	spin_lock_irq(q->queue_lock);

4559

++	__blk_run_queue(q);

4560

++	spin_unlock_irq(q->queue_lock);

4561

++}

4562

++

4563

++/*

4564

++ * Handler of the expiration of the timer running if the in-service queue

4565

++ * is idling inside its time slice.

4566

++ */

4567

++static void bfq_idle_slice_timer(unsigned long data)

4568

++{

4569

++	struct bfq_data *bfqd = (struct bfq_data *)data;

4570

++	struct bfq_queue *bfqq;

4571

++	unsigned long flags;

4572

++	enum bfqq_expiration reason;

4573

++

4574

++	spin_lock_irqsave(bfqd->queue->queue_lock, flags);

4575

++

4576

++	bfqq = bfqd->in_service_queue;

4577

++	/*

4578

++	 * Theoretical race here: the in-service queue can be NULL or

4579

++	 * different from the queue that was idling if the timer handler

4580

++	 * spins on the queue_lock and a new request arrives for the

4581

++	 * current queue and there is a full dispatch cycle that changes

4582

++	 * the in-service queue.  This can hardly happen, but in the worst

4583

++	 * case we just expire a queue too early.

4584

++	 */

4585

++	if (bfqq != NULL) {

4586

++		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");

4587

++		if (bfq_bfqq_budget_timeout(bfqq))

4588

++			/*

4589

++			 * Also here the queue can be safely expired

4590

++			 * for budget timeout without wasting

4591

++			 * guarantees

4592

++			 */

4593

++			reason = BFQ_BFQQ_BUDGET_TIMEOUT;

4594

++		else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)

4595

++			/*

4596

++			 * The queue may not be empty upon timer expiration,

4597

++			 * because we may not disable the timer when the

4598

++			 * first request of the in-service queue arrives

4599

++			 * during disk idling.

4600

++			 */

4601

++			reason = BFQ_BFQQ_TOO_IDLE;

4602

++		else

4603

++			goto schedule_dispatch;

4604

++

4605

++		bfq_bfqq_expire(bfqd, bfqq, 1, reason);

4606

++	}

4607

++

4608

++schedule_dispatch:

4609

++	bfq_schedule_dispatch(bfqd);

4610

++

4611

++	spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);

4612

++}

4613

++

4614

++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)

4615

++{

4616

++	del_timer_sync(&bfqd->idle_slice_timer);

4617

++	cancel_work_sync(&bfqd->unplug_work);

4618

++}

4619

++

4620

++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,

4621

++					struct bfq_queue **bfqq_ptr)

4622

++{

4623

++	struct bfq_group *root_group = bfqd->root_group;

4624

++	struct bfq_queue *bfqq = *bfqq_ptr;

4625

++

4626

++	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);

4627

++	if (bfqq != NULL) {

4628

++		bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);

4629

++		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",

4630

++			     bfqq, atomic_read(&bfqq->ref));

4631

++		bfq_put_queue(bfqq);

4632

++		*bfqq_ptr = NULL;

4633

++	}

4634

++}

4635

++

4636

++/*

4637

++ * Release all the bfqg references to its async queues.  If we are

4638

++ * deallocating the group these queues may still contain requests, so

4639

++ * we reparent them to the root cgroup (i.e., the only one that will

4640

++ * exist for sure until all the requests on a device are gone).

4641

++ */

4642

++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)

4643

++{

4644

++	int i, j;

4645

++

4646

++	for (i = 0; i < 2; i++)

4647

++		for (j = 0; j < IOPRIO_BE_NR; j++)

4648

++			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

4649

++

4650

++	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);

4651

++}

4652

++

4653

++static void bfq_exit_queue(struct elevator_queue *e)

4654

++{

4655

++	struct bfq_data *bfqd = e->elevator_data;

4656

++	struct request_queue *q = bfqd->queue;

4657

++	struct bfq_queue *bfqq, *n;

4658

++

4659

++	bfq_shutdown_timer_wq(bfqd);

4660

++

4661

++	spin_lock_irq(q->queue_lock);

4662

++

4663

++	BUG_ON(bfqd->in_service_queue != NULL);

4664

++	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)

4665

++		bfq_deactivate_bfqq(bfqd, bfqq, 0);

4666

++

4667

++	bfq_disconnect_groups(bfqd);

4668

++	spin_unlock_irq(q->queue_lock);

4669

++

4670

++	bfq_shutdown_timer_wq(bfqd);

4671

++

4672

++	synchronize_rcu();

4673

++

4674

++	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

4675

++

4676

++	bfq_free_root_group(bfqd);

4677

++	kfree(bfqd);

4678

++}

4679

++

4680

++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

4681

++{

4682

++	struct bfq_group *bfqg;

4683

++	struct bfq_data *bfqd;

4684

++	struct elevator_queue *eq;

4685

++

4686

++	eq = elevator_alloc(q, e);

4687

++	if (eq == NULL)

4688

++		return -ENOMEM;

4689

++

4690

++	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);

4691

++	if (bfqd == NULL) {

4692

++		kobject_put(&eq->kobj);

4693

++		return -ENOMEM;

4694

++	}

4695

++	eq->elevator_data = bfqd;

4696

++

4697

++	/*

4698

++	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.

4699

++	 * Grab a permanent reference to it, so that the normal code flow

4700

++	 * will not attempt to free it.

4701

++	 */

4702

++	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);

4703

++	atomic_inc(&bfqd->oom_bfqq.ref);

4704

++	bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;

4705

++	bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE;

4706

++	/*

4707

++	 * Trigger weight initialization, according to ioprio, at the

4708

++	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio

4709

++	 * class won't be changed any more.

4710

++	 */

4711

++	bfqd->oom_bfqq.entity.ioprio_changed = 1;

4712

++

4713

++	bfqd->queue = q;

4714

++

4715

++	spin_lock_irq(q->queue_lock);

4716

++	q->elevator = eq;

4717

++	spin_unlock_irq(q->queue_lock);

4718

++

4719

++	bfqg = bfq_alloc_root_group(bfqd, q->node);

4720

++	if (bfqg == NULL) {

4721

++		kfree(bfqd);

4722

++		kobject_put(&eq->kobj);

4723

++		return -ENOMEM;

4724

++	}

4725

++

4726

++	bfqd->root_group = bfqg;

4727

++	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

4728

++#ifdef CONFIG_CGROUP_BFQIO

4729

++	bfqd->active_numerous_groups = 0;

4730

++#endif

4731

++

4732

++	init_timer(&bfqd->idle_slice_timer);

4733

++	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

4734

++	bfqd->idle_slice_timer.data = (unsigned long)bfqd;

4735

++

4736

++	bfqd->rq_pos_tree = RB_ROOT;

4737

++	bfqd->queue_weights_tree = RB_ROOT;

4738

++	bfqd->group_weights_tree = RB_ROOT;

4739

++

4740

++	INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);

4741

++

4742

++	INIT_LIST_HEAD(&bfqd->active_list);

4743

++	INIT_LIST_HEAD(&bfqd->idle_list);

4744

++	INIT_HLIST_HEAD(&bfqd->burst_list);

4745

++

4746

++	bfqd->hw_tag = -1;

4747

++

4748

++	bfqd->bfq_max_budget = bfq_default_max_budget;

4749

++

4750

++	bfqd->bfq_quantum = bfq_quantum;

4751

++	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];

4752

++	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];

4753

++	bfqd->bfq_back_max = bfq_back_max;

4754

++	bfqd->bfq_back_penalty = bfq_back_penalty;

4755

++	bfqd->bfq_slice_idle = bfq_slice_idle;

4756

++	bfqd->bfq_class_idle_last_service = 0;

4757

++	bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;

4758

++	bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;

4759

++	bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;

4760

++

4761

++	bfqd->bfq_coop_thresh = 2;

4762

++	bfqd->bfq_failed_cooperations = 7000;

4763

++	bfqd->bfq_requests_within_timer = 120;

4764

++

4765

++	bfqd->bfq_large_burst_thresh = 11;

4766

++	bfqd->bfq_burst_interval = msecs_to_jiffies(500);

4767

++

4768

++	bfqd->low_latency = true;

4769

++

4770

++	bfqd->bfq_wr_coeff = 20;

4771

++	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);

4772

++	bfqd->bfq_wr_max_time = 0;

4773

++	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);

4774

++	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);

4775

++	bfqd->bfq_wr_max_softrt_rate = 7000; /*

4776

++					      * Approximate rate required

4777

++					      * to playback or record a

4778

++					      * high-definition compressed

4779

++					      * video.

4780

++					      */

4781

++	bfqd->wr_busy_queues = 0;

4782

++	bfqd->busy_in_flight_queues = 0;

4783

++	bfqd->const_seeky_busy_in_flight_queues = 0;

4784

++

4785

++	/*

4786

++	 * Begin by assuming, optimistically, that the device peak rate is

4787

++	 * equal to the highest reference rate.

4788

++	 */

4789

++	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *

4790

++			T_fast[blk_queue_nonrot(bfqd->queue)];

4791

++	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];

4792

++	bfqd->device_speed = BFQ_BFQD_FAST;

4793

++

4794

++	return 0;

4795

++}

4796

++

4797

++static void bfq_slab_kill(void)

4798

++{

4799

++	if (bfq_pool != NULL)

4800

++		kmem_cache_destroy(bfq_pool);

4801

++}

4802

++

4803

++static int __init bfq_slab_setup(void)

4804

++{

4805

++	bfq_pool = KMEM_CACHE(bfq_queue, 0);

4806

++	if (bfq_pool == NULL)

4807

++		return -ENOMEM;

4808

++	return 0;

4809

++}

4810

++

4811

++static ssize_t bfq_var_show(unsigned int var, char *page)

4812

++{

4813

++	return sprintf(page, "%d\n", var);

4814

++}

4815

++

4816

++static ssize_t bfq_var_store(unsigned long *var, const char *page,

4817

++			     size_t count)

4818

++{

4819

++	unsigned long new_val;

4820

++	int ret = kstrtoul(page, 10, &new_val);

4821

++

4822

++	if (ret == 0)

4823

++		*var = new_val;

4824

++

4825

++	return count;

4826

++}

4827

++

4828

++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)

4829

++{

4830

++	struct bfq_data *bfqd = e->elevator_data;

4831

++	return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?

4832

++		       jiffies_to_msecs(bfqd->bfq_wr_max_time) :

4833

++		       jiffies_to_msecs(bfq_wr_duration(bfqd)));

4834

++}

4835

++

4836

++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)

4837

++{

4838

++	struct bfq_queue *bfqq;

4839

++	struct bfq_data *bfqd = e->elevator_data;

4840

++	ssize_t num_char = 0;

4841

++

4842

++	num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",

4843

++			    bfqd->queued);

4844

++

4845

++	spin_lock_irq(bfqd->queue->queue_lock);

4846

++

4847

++	num_char += sprintf(page + num_char, "Active:\n");

4848

++	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {

4849

++	  num_char += sprintf(page + num_char,

4850

++			      "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n",

4851

++			      bfqq->pid,

4852

++			      bfqq->entity.weight,

4853

++			      bfqq->queued[0],

4854

++			      bfqq->queued[1],

4855

++			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),

4856

++			jiffies_to_msecs(bfqq->wr_cur_max_time));

4857

++	}

4858

++

4859

++	num_char += sprintf(page + num_char, "Idle:\n");

4860

++	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {

4861

++			num_char += sprintf(page + num_char,

4862

++				"pid%d: weight %hu, dur %d/%u\n",

4863

++				bfqq->pid,

4864

++				bfqq->entity.weight,

4865

++				jiffies_to_msecs(jiffies -

4866

++					bfqq->last_wr_start_finish),

4867

++				jiffies_to_msecs(bfqq->wr_cur_max_time));

4868

++	}

4869

++

4870

++	spin_unlock_irq(bfqd->queue->queue_lock);

4871

++

4872

++	return num_char;

4873

++}

4874

++

4875

++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\

4876

++static ssize_t __FUNC(struct elevator_queue *e, char *page)		\

4877

++{									\

4878

++	struct bfq_data *bfqd = e->elevator_data;			\

4879

++	unsigned int __data = __VAR;					\

4880

++	if (__CONV)							\

4881

++		__data = jiffies_to_msecs(__data);			\

4882

++	return bfq_var_show(__data, (page));				\

4883

++}

4884

++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);

4885

++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);

4886

++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);

4887

++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);

4888

++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);

4889

++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);

4890

++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);

4891

++SHOW_FUNCTION(bfq_max_budget_async_rq_show,

4892

++	      bfqd->bfq_max_budget_async_rq, 0);

4893

++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);

4894

++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);

4895

++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);

4896

++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);

4897

++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);

4898

++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);

4899

++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,

4900

++	1);

4901

++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);

4902

++#undef SHOW_FUNCTION

4903

++

4904

++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\

4905

++static ssize_t								\

4906

++__FUNC(struct elevator_queue *e, const char *page, size_t count)	\

4907

++{									\

4908

++	struct bfq_data *bfqd = e->elevator_data;			\

4909

++	unsigned long uninitialized_var(__data);			\

4910

++	int ret = bfq_var_store(&__data, (page), count);		\

4911

++	if (__data < (MIN))						\

4912

++		__data = (MIN);						\

4913

++	else if (__data > (MAX))					\

4914

++		__data = (MAX);						\

4915

++	if (__CONV)							\

4916

++		*(__PTR) = msecs_to_jiffies(__data);			\

4917

++	else								\

4918

++		*(__PTR) = __data;					\

4919

++	return ret;							\

4920

++}

4921

++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);

4922

++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,

4923

++		INT_MAX, 1);

4924

++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,

4925

++		INT_MAX, 1);

4926

++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);

4927

++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,

4928

++		INT_MAX, 0);

4929

++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);

4930

++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,

4931

++		1, INT_MAX, 0);

4932

++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,

4933

++		INT_MAX, 1);

4934

++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);

4935

++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);

4936

++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,

4937

++		1);

4938

++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,

4939

++		INT_MAX, 1);

4940

++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,

4941

++		&bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);

4942

++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,

4943

++		INT_MAX, 0);

4944

++#undef STORE_FUNCTION

4945

++

4946

++/* do nothing for the moment */

4947

++static ssize_t bfq_weights_store(struct elevator_queue *e,

4948

++				    const char *page, size_t count)

4949

++{

4950

++	return count;

4951

++}

4952

++

4953

++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)

4954

++{

4955

++	u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

4956

++

4957

++	if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)

4958

++		return bfq_calc_max_budget(bfqd->peak_rate, timeout);

4959

++	else

4960

++		return bfq_default_max_budget;

4961

++}

4962

++

4963

++static ssize_t bfq_max_budget_store(struct elevator_queue *e,

4964

++				    const char *page, size_t count)

4965

++{

4966

++	struct bfq_data *bfqd = e->elevator_data;

4967

++	unsigned long uninitialized_var(__data);

4968

++	int ret = bfq_var_store(&__data, (page), count);

4969

++

4970

++	if (__data == 0)

4971

++		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);

4972

++	else {

4973

++		if (__data > INT_MAX)

4974

++			__data = INT_MAX;

4975

++		bfqd->bfq_max_budget = __data;

4976

++	}

4977

++

4978

++	bfqd->bfq_user_max_budget = __data;

4979

++

4980

++	return ret;

4981

++}

4982

++

4983

++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,

4984

++				      const char *page, size_t count)

4985

++{

4986

++	struct bfq_data *bfqd = e->elevator_data;

4987

++	unsigned long uninitialized_var(__data);

4988

++	int ret = bfq_var_store(&__data, (page), count);

4989

++

4990

++	if (__data < 1)

4991

++		__data = 1;

4992

++	else if (__data > INT_MAX)

4993

++		__data = INT_MAX;

4994

++

4995

++	bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);

4996

++	if (bfqd->bfq_user_max_budget == 0)

4997

++		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);

4998

++

4999

++	return ret;

5000

++}

5001

++

5002

++static ssize_t bfq_low_latency_store(struct elevator_queue *e,

5003

++				     const char *page, size_t count)

5004

++{

5005

++	struct bfq_data *bfqd = e->elevator_data;

5006

++	unsigned long uninitialized_var(__data);

5007

++	int ret = bfq_var_store(&__data, (page), count);

5008

++

5009

++	if (__data > 1)

5010

++		__data = 1;

5011

++	if (__data == 0 && bfqd->low_latency != 0)

5012

++		bfq_end_wr(bfqd);

5013

++	bfqd->low_latency = __data;

5014

++

5015

++	return ret;

5016

++}

5017

++

5018

++#define BFQ_ATTR(name) \

5019

++	__ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)

5020

++

5021

++static struct elv_fs_entry bfq_attrs[] = {

5022

++	BFQ_ATTR(quantum),

5023

++	BFQ_ATTR(fifo_expire_sync),

5024

++	BFQ_ATTR(fifo_expire_async),

5025

++	BFQ_ATTR(back_seek_max),

5026

++	BFQ_ATTR(back_seek_penalty),

5027

++	BFQ_ATTR(slice_idle),

5028

++	BFQ_ATTR(max_budget),

5029

++	BFQ_ATTR(max_budget_async_rq),

5030

++	BFQ_ATTR(timeout_sync),

5031

++	BFQ_ATTR(timeout_async),

5032

++	BFQ_ATTR(low_latency),

5033

++	BFQ_ATTR(wr_coeff),

5034

++	BFQ_ATTR(wr_max_time),

5035

++	BFQ_ATTR(wr_rt_max_time),

5036

++	BFQ_ATTR(wr_min_idle_time),

5037

++	BFQ_ATTR(wr_min_inter_arr_async),

5038

++	BFQ_ATTR(wr_max_softrt_rate),

5039

++	BFQ_ATTR(weights),

5040

++	__ATTR_NULL

5041

++};

5042

++

5043

++static struct elevator_type iosched_bfq = {

5044

++	.ops = {

5045

++		.elevator_merge_fn =		bfq_merge,

5046

++		.elevator_merged_fn =		bfq_merged_request,

5047

++		.elevator_merge_req_fn =	bfq_merged_requests,

5048

++		.elevator_allow_merge_fn =	bfq_allow_merge,

5049

++		.elevator_dispatch_fn =		bfq_dispatch_requests,

5050

++		.elevator_add_req_fn =		bfq_insert_request,

5051

++		.elevator_activate_req_fn =	bfq_activate_request,

5052

++		.elevator_deactivate_req_fn =	bfq_deactivate_request,

5053

++		.elevator_completed_req_fn =	bfq_completed_request,

5054

++		.elevator_former_req_fn =	elv_rb_former_request,

5055

++		.elevator_latter_req_fn =	elv_rb_latter_request,

5056

++		.elevator_init_icq_fn =		bfq_init_icq,

5057

++		.elevator_exit_icq_fn =		bfq_exit_icq,

5058

++		.elevator_set_req_fn =		bfq_set_request,

5059

++		.elevator_put_req_fn =		bfq_put_request,

5060

++		.elevator_may_queue_fn =	bfq_may_queue,

5061

++		.elevator_init_fn =		bfq_init_queue,

5062

++		.elevator_exit_fn =		bfq_exit_queue,

5063

++	},

5064

++	.icq_size =		sizeof(struct bfq_io_cq),

5065

++	.icq_align =		__alignof__(struct bfq_io_cq),

5066

++	.elevator_attrs =	bfq_attrs,

5067

++	.elevator_name =	"bfq",

5068

++	.elevator_owner =	THIS_MODULE,

5069

++};

5070

++

5071

++static int __init bfq_init(void)

5072

++{

5073

++	/*

5074

++	 * Can be 0 on HZ < 1000 setups.

5075

++	 */

5076

++	if (bfq_slice_idle == 0)

5077

++		bfq_slice_idle = 1;

5078

++

5079

++	if (bfq_timeout_async == 0)

5080

++		bfq_timeout_async = 1;

5081

++

5082

++	if (bfq_slab_setup())

5083

++		return -ENOMEM;

5084

++

5085

++	/*

5086

++	 * Times to load large popular applications for the typical systems

5087

++	 * installed on the reference devices (see the comments before the

5088

++	 * definitions of the two arrays).

5089

++	 */

5090

++	T_slow[0] = msecs_to_jiffies(2600);

5091

++	T_slow[1] = msecs_to_jiffies(1000);

5092

++	T_fast[0] = msecs_to_jiffies(5500);

5093

++	T_fast[1] = msecs_to_jiffies(2000);

5094

++

5095

++	/*

5096

++	 * Thresholds that determine the switch between speed classes (see

5097

++	 * the comments before the definition of the array).

5098

++	 */

5099

++	device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;

5100

++	device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;

5101

++

5102

++	elv_register(&iosched_bfq);

5103

++	pr_info("BFQ I/O-scheduler version: v7r7");

5104

++

5105

++	return 0;

5106

++}

5107

++

5108

++static void __exit bfq_exit(void)

5109

++{

5110

++	elv_unregister(&iosched_bfq);

5111

++	bfq_slab_kill();

5112

++}

5113

++

5114

++module_init(bfq_init);

5115

++module_exit(bfq_exit);

5116

++

5117

++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");

5118

++MODULE_LICENSE("GPL");

5119

+diff --git a/block/bfq-sched.c b/block/bfq-sched.c

5120

+new file mode 100644

5121

+index 0000000..2931563

5122

+--- /dev/null

5123

++++ b/block/bfq-sched.c

5124

+@@ -0,0 +1,1214 @@

5125

++/*

5126

++ * BFQ: Hierarchical B-WF2Q+ scheduler.

5127

++ *

5128

++ * Based on ideas and code from CFQ:

5129

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

5130

++ *

5131

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

5132

++ *		      Paolo Valente <paolo.valente@×××××××.it>

5133

++ *

5134

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

5135

++ */

5136

++

5137

++#ifdef CONFIG_CGROUP_BFQIO

5138

++#define for_each_entity(entity)	\

5139

++	for (; entity != NULL; entity = entity->parent)

5140

++

5141

++#define for_each_entity_safe(entity, parent) \

5142

++	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)

5143

++

5144

++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

5145

++						 int extract,

5146

++						 struct bfq_data *bfqd);

5147

++

5148

++static inline void bfq_update_budget(struct bfq_entity *next_in_service)

5149

++{

5150

++	struct bfq_entity *bfqg_entity;

5151

++	struct bfq_group *bfqg;

5152

++	struct bfq_sched_data *group_sd;

5153

++

5154

++	BUG_ON(next_in_service == NULL);

5155

++

5156

++	group_sd = next_in_service->sched_data;

5157

++

5158

++	bfqg = container_of(group_sd, struct bfq_group, sched_data);

5159

++	/*

5160

++	 * bfq_group's my_entity field is not NULL only if the group

5161

++	 * is not the root group. We must not touch the root entity

5162

++	 * as it must never become an in-service entity.

5163

++	 */

5164

++	bfqg_entity = bfqg->my_entity;

5165

++	if (bfqg_entity != NULL)

5166

++		bfqg_entity->budget = next_in_service->budget;

5167

++}

5168

++

5169

++static int bfq_update_next_in_service(struct bfq_sched_data *sd)

5170

++{

5171

++	struct bfq_entity *next_in_service;

5172

++

5173

++	if (sd->in_service_entity != NULL)

5174

++		/* will update/requeue at the end of service */

5175

++		return 0;

5176

++

5177

++	/*

5178

++	 * NOTE: this can be improved in many ways, such as returning

5179

++	 * 1 (and thus propagating upwards the update) only when the

5180

++	 * budget changes, or caching the bfqq that will be scheduled

5181

++	 * next from this subtree.  For now we worry more about

5182

++	 * correctness than about performance...

5183

++	 */

5184

++	next_in_service = bfq_lookup_next_entity(sd, 0, NULL);

5185

++	sd->next_in_service = next_in_service;

5186

++

5187

++	if (next_in_service != NULL)

5188

++		bfq_update_budget(next_in_service);

5189

++

5190

++	return 1;

5191

++}

5192

++

5193

++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,

5194

++					     struct bfq_entity *entity)

5195

++{

5196

++	BUG_ON(sd->next_in_service != entity);

5197

++}

5198

++#else

5199

++#define for_each_entity(entity)	\

5200

++	for (; entity != NULL; entity = NULL)

5201

++

5202

++#define for_each_entity_safe(entity, parent) \

5203

++	for (parent = NULL; entity != NULL; entity = parent)

5204

++

5205

++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)

5206

++{

5207

++	return 0;

5208

++}

5209

++

5210

++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,

5211

++					     struct bfq_entity *entity)

5212

++{

5213

++}

5214

++

5215

++static inline void bfq_update_budget(struct bfq_entity *next_in_service)

5216

++{

5217

++}

5218

++#endif

5219

++

5220

++/*

5221

++ * Shift for timestamp calculations.  This actually limits the maximum

5222

++ * service allowed in one timestamp delta (small shift values increase it),

5223

++ * the maximum total weight that can be used for the queues in the system

5224

++ * (big shift values increase it), and the period of virtual time

5225

++ * wraparounds.

5226

++ */

5227

++#define WFQ_SERVICE_SHIFT	22

5228

++

5229

++/**

5230

++ * bfq_gt - compare two timestamps.

5231

++ * @a: first ts.

5232

++ * @b: second ts.

5233

++ *

5234

++ * Return @a > @b, dealing with wrapping correctly.

5235

++ */

5236

++static inline int bfq_gt(u64 a, u64 b)

5237

++{

5238

++	return (s64)(a - b) > 0;

5239

++}

5240

++

5241

++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)

5242

++{

5243

++	struct bfq_queue *bfqq = NULL;

5244

++

5245

++	BUG_ON(entity == NULL);

5246

++

5247

++	if (entity->my_sched_data == NULL)

5248

++		bfqq = container_of(entity, struct bfq_queue, entity);

5249

++

5250

++	return bfqq;

5251

++}

5252

++

5253

++

5254

++/**

5255

++ * bfq_delta - map service into the virtual time domain.

5256

++ * @service: amount of service.

5257

++ * @weight: scale factor (weight of an entity or weight sum).

5258

++ */

5259

++static inline u64 bfq_delta(unsigned long service,

5260

++					unsigned long weight)

5261

++{

5262

++	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

5263

++

5264

++	do_div(d, weight);

5265

++	return d;

5266

++}

5267

++

5268

++/**

5269

++ * bfq_calc_finish - assign the finish time to an entity.

5270

++ * @entity: the entity to act upon.

5271

++ * @service: the service to be charged to the entity.

5272

++ */

5273

++static inline void bfq_calc_finish(struct bfq_entity *entity,

5274

++				   unsigned long service)

5275

++{

5276

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5277

++

5278

++	BUG_ON(entity->weight == 0);

5279

++

5280

++	entity->finish = entity->start +

5281

++		bfq_delta(service, entity->weight);

5282

++

5283

++	if (bfqq != NULL) {

5284

++		bfq_log_bfqq(bfqq->bfqd, bfqq,

5285

++			"calc_finish: serv %lu, w %d",

5286

++			service, entity->weight);

5287

++		bfq_log_bfqq(bfqq->bfqd, bfqq,

5288

++			"calc_finish: start %llu, finish %llu, delta %llu",

5289

++			entity->start, entity->finish,

5290

++			bfq_delta(service, entity->weight));

5291

++	}

5292

++}

5293

++

5294

++/**

5295

++ * bfq_entity_of - get an entity from a node.

5296

++ * @node: the node field of the entity.

5297

++ *

5298

++ * Convert a node pointer to the corresponding entity.  This is used only

5299

++ * to simplify the logic of some functions and not as the generic

5300

++ * conversion mechanism because, e.g., in the tree walking functions,

5301

++ * the check for a %NULL value would be redundant.

5302

++ */

5303

++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)

5304

++{

5305

++	struct bfq_entity *entity = NULL;

5306

++

5307

++	if (node != NULL)

5308

++		entity = rb_entry(node, struct bfq_entity, rb_node);

5309

++

5310

++	return entity;

5311

++}

5312

++

5313

++/**

5314

++ * bfq_extract - remove an entity from a tree.

5315

++ * @root: the tree root.

5316

++ * @entity: the entity to remove.

5317

++ */

5318

++static inline void bfq_extract(struct rb_root *root,

5319

++			       struct bfq_entity *entity)

5320

++{

5321

++	BUG_ON(entity->tree != root);

5322

++

5323

++	entity->tree = NULL;

5324

++	rb_erase(&entity->rb_node, root);

5325

++}

5326

++

5327

++/**

5328

++ * bfq_idle_extract - extract an entity from the idle tree.

5329

++ * @st: the service tree of the owning @entity.

5330

++ * @entity: the entity being removed.

5331

++ */

5332

++static void bfq_idle_extract(struct bfq_service_tree *st,

5333

++			     struct bfq_entity *entity)

5334

++{

5335

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5336

++	struct rb_node *next;

5337

++

5338

++	BUG_ON(entity->tree != &st->idle);

5339

++

5340

++	if (entity == st->first_idle) {

5341

++		next = rb_next(&entity->rb_node);

5342

++		st->first_idle = bfq_entity_of(next);

5343

++	}

5344

++

5345

++	if (entity == st->last_idle) {

5346

++		next = rb_prev(&entity->rb_node);

5347

++		st->last_idle = bfq_entity_of(next);

5348

++	}

5349

++

5350

++	bfq_extract(&st->idle, entity);

5351

++

5352

++	if (bfqq != NULL)

5353

++		list_del(&bfqq->bfqq_list);

5354

++}

5355

++

5356

++/**

5357

++ * bfq_insert - generic tree insertion.

5358

++ * @root: tree root.

5359

++ * @entity: entity to insert.

5360

++ *

5361

++ * This is used for the idle and the active tree, since they are both

5362

++ * ordered by finish time.

5363

++ */

5364

++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)

5365

++{

5366

++	struct bfq_entity *entry;

5367

++	struct rb_node **node = &root->rb_node;

5368

++	struct rb_node *parent = NULL;

5369

++

5370

++	BUG_ON(entity->tree != NULL);

5371

++

5372

++	while (*node != NULL) {

5373

++		parent = *node;

5374

++		entry = rb_entry(parent, struct bfq_entity, rb_node);

5375

++

5376

++		if (bfq_gt(entry->finish, entity->finish))

5377

++			node = &parent->rb_left;

5378

++		else

5379

++			node = &parent->rb_right;

5380

++	}

5381

++

5382

++	rb_link_node(&entity->rb_node, parent, node);

5383

++	rb_insert_color(&entity->rb_node, root);

5384

++

5385

++	entity->tree = root;

5386

++}

5387

++

5388

++/**

5389

++ * bfq_update_min - update the min_start field of an entity.

5390

++ * @entity: the entity to update.

5391

++ * @node: one of its children.

5392

++ *

5393

++ * This function is called when @entity may store an invalid value for

5394

++ * min_start due to updates to the active tree.  The function assumes

5395

++ * that the subtree rooted at @node (which may be its left or its right

5396

++ * child) has a valid min_start value.

5397

++ */

5398

++static inline void bfq_update_min(struct bfq_entity *entity,

5399

++				  struct rb_node *node)

5400

++{

5401

++	struct bfq_entity *child;

5402

++

5403

++	if (node != NULL) {

5404

++		child = rb_entry(node, struct bfq_entity, rb_node);

5405

++		if (bfq_gt(entity->min_start, child->min_start))

5406

++			entity->min_start = child->min_start;

5407

++	}

5408

++}

5409

++

5410

++/**

5411

++ * bfq_update_active_node - recalculate min_start.

5412

++ * @node: the node to update.

5413

++ *

5414

++ * @node may have changed position or one of its children may have moved,

5415

++ * this function updates its min_start value.  The left and right subtrees

5416

++ * are assumed to hold a correct min_start value.

5417

++ */

5418

++static inline void bfq_update_active_node(struct rb_node *node)

5419

++{

5420

++	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

5421

++

5422

++	entity->min_start = entity->start;

5423

++	bfq_update_min(entity, node->rb_right);

5424

++	bfq_update_min(entity, node->rb_left);

5425

++}

5426

++

5427

++/**

5428

++ * bfq_update_active_tree - update min_start for the whole active tree.

5429

++ * @node: the starting node.

5430

++ *

5431

++ * @node must be the deepest modified node after an update.  This function

5432

++ * updates its min_start using the values held by its children, assuming

5433

++ * that they did not change, and then updates all the nodes that may have

5434

++ * changed in the path to the root.  The only nodes that may have changed

5435

++ * are the ones in the path or their siblings.

5436

++ */

5437

++static void bfq_update_active_tree(struct rb_node *node)

5438

++{

5439

++	struct rb_node *parent;

5440

++

5441

++up:

5442

++	bfq_update_active_node(node);

5443

++

5444

++	parent = rb_parent(node);

5445

++	if (parent == NULL)

5446

++		return;

5447

++

5448

++	if (node == parent->rb_left && parent->rb_right != NULL)

5449

++		bfq_update_active_node(parent->rb_right);

5450

++	else if (parent->rb_left != NULL)

5451

++		bfq_update_active_node(parent->rb_left);

5452

++

5453

++	node = parent;

5454

++	goto up;

5455

++}

5456

++

5457

++static void bfq_weights_tree_add(struct bfq_data *bfqd,

5458

++				 struct bfq_entity *entity,

5459

++				 struct rb_root *root);

5460

++

5461

++static void bfq_weights_tree_remove(struct bfq_data *bfqd,

5462

++				    struct bfq_entity *entity,

5463

++				    struct rb_root *root);

5464

++

5465

++

5466

++/**

5467

++ * bfq_active_insert - insert an entity in the active tree of its

5468

++ *                     group/device.

5469

++ * @st: the service tree of the entity.

5470

++ * @entity: the entity being inserted.

5471

++ *

5472

++ * The active tree is ordered by finish time, but an extra key is kept

5473

++ * for each node, containing the minimum value for the start times of

5474

++ * its children (and the node itself), so it's possible to search for

5475

++ * the eligible node with the lowest finish time in logarithmic time.

5476

++ */

5477

++static void bfq_active_insert(struct bfq_service_tree *st,

5478

++			      struct bfq_entity *entity)

5479

++{

5480

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5481

++	struct rb_node *node = &entity->rb_node;

5482

++#ifdef CONFIG_CGROUP_BFQIO

5483

++	struct bfq_sched_data *sd = NULL;

5484

++	struct bfq_group *bfqg = NULL;

5485

++	struct bfq_data *bfqd = NULL;

5486

++#endif

5487

++

5488

++	bfq_insert(&st->active, entity);

5489

++

5490

++	if (node->rb_left != NULL)

5491

++		node = node->rb_left;

5492

++	else if (node->rb_right != NULL)

5493

++		node = node->rb_right;

5494

++

5495

++	bfq_update_active_tree(node);

5496

++

5497

++#ifdef CONFIG_CGROUP_BFQIO

5498

++	sd = entity->sched_data;

5499

++	bfqg = container_of(sd, struct bfq_group, sched_data);

5500

++	BUG_ON(!bfqg);

5501

++	bfqd = (struct bfq_data *)bfqg->bfqd;

5502

++#endif

5503

++	if (bfqq != NULL)

5504

++		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);

5505

++#ifdef CONFIG_CGROUP_BFQIO

5506

++	else { /* bfq_group */

5507

++		BUG_ON(!bfqd);

5508

++		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);

5509

++	}

5510

++	if (bfqg != bfqd->root_group) {

5511

++		BUG_ON(!bfqg);

5512

++		BUG_ON(!bfqd);

5513

++		bfqg->active_entities++;

5514

++		if (bfqg->active_entities == 2)

5515

++			bfqd->active_numerous_groups++;

5516

++	}

5517

++#endif

5518

++}

5519

++

5520

++/**

5521

++ * bfq_ioprio_to_weight - calc a weight from an ioprio.

5522

++ * @ioprio: the ioprio value to convert.

5523

++ */

5524

++static inline unsigned short bfq_ioprio_to_weight(int ioprio)

5525

++{

5526

++	BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);

5527

++	return IOPRIO_BE_NR - ioprio;

5528

++}

5529

++

5530

++/**

5531

++ * bfq_weight_to_ioprio - calc an ioprio from a weight.

5532

++ * @weight: the weight value to convert.

5533

++ *

5534

++ * To preserve as much as possible the old only-ioprio user interface,

5535

++ * 0 is used as an escape ioprio value for weights (numerically) equal or

5536

++ * larger than IOPRIO_BE_NR

5537

++ */

5538

++static inline unsigned short bfq_weight_to_ioprio(int weight)

5539

++{

5540

++	BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);

5541

++	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;

5542

++}

5543

++

5544

++static inline void bfq_get_entity(struct bfq_entity *entity)

5545

++{

5546

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5547

++

5548

++	if (bfqq != NULL) {

5549

++		atomic_inc(&bfqq->ref);

5550

++		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",

5551

++			     bfqq, atomic_read(&bfqq->ref));

5552

++	}

5553

++}

5554

++

5555

++/**

5556

++ * bfq_find_deepest - find the deepest node that an extraction can modify.

5557

++ * @node: the node being removed.

5558

++ *

5559

++ * Do the first step of an extraction in an rb tree, looking for the

5560

++ * node that will replace @node, and returning the deepest node that

5561

++ * the following modifications to the tree can touch.  If @node is the

5562

++ * last node in the tree return %NULL.

5563

++ */

5564

++static struct rb_node *bfq_find_deepest(struct rb_node *node)

5565

++{

5566

++	struct rb_node *deepest;

5567

++

5568

++	if (node->rb_right == NULL && node->rb_left == NULL)

5569

++		deepest = rb_parent(node);

5570

++	else if (node->rb_right == NULL)

5571

++		deepest = node->rb_left;

5572

++	else if (node->rb_left == NULL)

5573

++		deepest = node->rb_right;

5574

++	else {

5575

++		deepest = rb_next(node);

5576

++		if (deepest->rb_right != NULL)

5577

++			deepest = deepest->rb_right;

5578

++		else if (rb_parent(deepest) != node)

5579

++			deepest = rb_parent(deepest);

5580

++	}

5581

++

5582

++	return deepest;

5583

++}

5584

++

5585

++/**

5586

++ * bfq_active_extract - remove an entity from the active tree.

5587

++ * @st: the service_tree containing the tree.

5588

++ * @entity: the entity being removed.

5589

++ */

5590

++static void bfq_active_extract(struct bfq_service_tree *st,

5591

++			       struct bfq_entity *entity)

5592

++{

5593

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5594

++	struct rb_node *node;

5595

++#ifdef CONFIG_CGROUP_BFQIO

5596

++	struct bfq_sched_data *sd = NULL;

5597

++	struct bfq_group *bfqg = NULL;

5598

++	struct bfq_data *bfqd = NULL;

5599

++#endif

5600

++

5601

++	node = bfq_find_deepest(&entity->rb_node);

5602

++	bfq_extract(&st->active, entity);

5603

++

5604

++	if (node != NULL)

5605

++		bfq_update_active_tree(node);

5606

++

5607

++#ifdef CONFIG_CGROUP_BFQIO

5608

++	sd = entity->sched_data;

5609

++	bfqg = container_of(sd, struct bfq_group, sched_data);

5610

++	BUG_ON(!bfqg);

5611

++	bfqd = (struct bfq_data *)bfqg->bfqd;

5612

++#endif

5613

++	if (bfqq != NULL)

5614

++		list_del(&bfqq->bfqq_list);

5615

++#ifdef CONFIG_CGROUP_BFQIO

5616

++	else { /* bfq_group */

5617

++		BUG_ON(!bfqd);

5618

++		bfq_weights_tree_remove(bfqd, entity,

5619

++					&bfqd->group_weights_tree);

5620

++	}

5621

++	if (bfqg != bfqd->root_group) {

5622

++		BUG_ON(!bfqg);

5623

++		BUG_ON(!bfqd);

5624

++		BUG_ON(!bfqg->active_entities);

5625

++		bfqg->active_entities--;

5626

++		if (bfqg->active_entities == 1) {

5627

++			BUG_ON(!bfqd->active_numerous_groups);

5628

++			bfqd->active_numerous_groups--;

5629

++		}

5630

++	}

5631

++#endif

5632

++}

5633

++

5634

++/**

5635

++ * bfq_idle_insert - insert an entity into the idle tree.

5636

++ * @st: the service tree containing the tree.

5637

++ * @entity: the entity to insert.

5638

++ */

5639

++static void bfq_idle_insert(struct bfq_service_tree *st,

5640

++			    struct bfq_entity *entity)

5641

++{

5642

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5643

++	struct bfq_entity *first_idle = st->first_idle;

5644

++	struct bfq_entity *last_idle = st->last_idle;

5645

++

5646

++	if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))

5647

++		st->first_idle = entity;

5648

++	if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))

5649

++		st->last_idle = entity;

5650

++

5651

++	bfq_insert(&st->idle, entity);

5652

++

5653

++	if (bfqq != NULL)

5654

++		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);

5655

++}

5656

++

5657

++/**

5658

++ * bfq_forget_entity - remove an entity from the wfq trees.

5659

++ * @st: the service tree.

5660

++ * @entity: the entity being removed.

5661

++ *

5662

++ * Update the device status and forget everything about @entity, putting

5663

++ * the device reference to it, if it is a queue.  Entities belonging to

5664

++ * groups are not refcounted.

5665

++ */

5666

++static void bfq_forget_entity(struct bfq_service_tree *st,

5667

++			      struct bfq_entity *entity)

5668

++{

5669

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5670

++	struct bfq_sched_data *sd;

5671

++

5672

++	BUG_ON(!entity->on_st);

5673

++

5674

++	entity->on_st = 0;

5675

++	st->wsum -= entity->weight;

5676

++	if (bfqq != NULL) {

5677

++		sd = entity->sched_data;

5678

++		bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",

5679

++			     bfqq, atomic_read(&bfqq->ref));

5680

++		bfq_put_queue(bfqq);

5681

++	}

5682

++}

5683

++

5684

++/**

5685

++ * bfq_put_idle_entity - release the idle tree ref of an entity.

5686

++ * @st: service tree for the entity.

5687

++ * @entity: the entity being released.

5688

++ */

5689

++static void bfq_put_idle_entity(struct bfq_service_tree *st,

5690

++				struct bfq_entity *entity)

5691

++{

5692

++	bfq_idle_extract(st, entity);

5693

++	bfq_forget_entity(st, entity);

5694

++}

5695

++

5696

++/**

5697

++ * bfq_forget_idle - update the idle tree if necessary.

5698

++ * @st: the service tree to act upon.

5699

++ *

5700

++ * To preserve the global O(log N) complexity we only remove one entry here;

5701

++ * as the idle tree will not grow indefinitely this can be done safely.

5702

++ */

5703

++static void bfq_forget_idle(struct bfq_service_tree *st)

5704

++{

5705

++	struct bfq_entity *first_idle = st->first_idle;

5706

++	struct bfq_entity *last_idle = st->last_idle;

5707

++

5708

++	if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&

5709

++	    !bfq_gt(last_idle->finish, st->vtime)) {

5710

++		/*

5711

++		 * Forget the whole idle tree, increasing the vtime past

5712

++		 * the last finish time of idle entities.

5713

++		 */

5714

++		st->vtime = last_idle->finish;

5715

++	}

5716

++

5717

++	if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))

5718

++		bfq_put_idle_entity(st, first_idle);

5719

++}

5720

++

5721

++static struct bfq_service_tree *

5722

++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,

5723

++			 struct bfq_entity *entity)

5724

++{

5725

++	struct bfq_service_tree *new_st = old_st;

5726

++

5727

++	if (entity->ioprio_changed) {

5728

++		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5729

++		unsigned short prev_weight, new_weight;

5730

++		struct bfq_data *bfqd = NULL;

5731

++		struct rb_root *root;

5732

++#ifdef CONFIG_CGROUP_BFQIO

5733

++		struct bfq_sched_data *sd;

5734

++		struct bfq_group *bfqg;

5735

++#endif

5736

++

5737

++		if (bfqq != NULL)

5738

++			bfqd = bfqq->bfqd;

5739

++#ifdef CONFIG_CGROUP_BFQIO

5740

++		else {

5741

++			sd = entity->my_sched_data;

5742

++			bfqg = container_of(sd, struct bfq_group, sched_data);

5743

++			BUG_ON(!bfqg);

5744

++			bfqd = (struct bfq_data *)bfqg->bfqd;

5745

++			BUG_ON(!bfqd);

5746

++		}

5747

++#endif

5748

++

5749

++		BUG_ON(old_st->wsum < entity->weight);

5750

++		old_st->wsum -= entity->weight;

5751

++

5752

++		if (entity->new_weight != entity->orig_weight) {

5753

++			if (entity->new_weight < BFQ_MIN_WEIGHT ||

5754

++			    entity->new_weight > BFQ_MAX_WEIGHT) {

5755

++				printk(KERN_CRIT "update_weight_prio: "

5756

++						 "new_weight %d\n",

5757

++					entity->new_weight);

5758

++				BUG();

5759

++			}

5760

++			entity->orig_weight = entity->new_weight;

5761

++			entity->ioprio =

5762

++				bfq_weight_to_ioprio(entity->orig_weight);

5763

++		} else if (entity->new_ioprio != entity->ioprio) {

5764

++			entity->ioprio = entity->new_ioprio;

5765

++			entity->orig_weight =

5766

++					bfq_ioprio_to_weight(entity->ioprio);

5767

++		} else

5768

++			entity->new_weight = entity->orig_weight =

5769

++				bfq_ioprio_to_weight(entity->ioprio);

5770

++

5771

++		entity->ioprio_class = entity->new_ioprio_class;

5772

++		entity->ioprio_changed = 0;

5773

++

5774

++		/*

5775

++		 * NOTE: here we may be changing the weight too early,

5776

++		 * this will cause unfairness.  The correct approach

5777

++		 * would have required additional complexity to defer

5778

++		 * weight changes to the proper time instants (i.e.,

5779

++		 * when entity->finish <= old_st->vtime).

5780

++		 */

5781

++		new_st = bfq_entity_service_tree(entity);

5782

++

5783

++		prev_weight = entity->weight;

5784

++		new_weight = entity->orig_weight *

5785

++			     (bfqq != NULL ? bfqq->wr_coeff : 1);

5786

++		/*

5787

++		 * If the weight of the entity changes, remove the entity

5788

++		 * from its old weight counter (if there is a counter

5789

++		 * associated with the entity), and add it to the counter

5790

++		 * associated with its new weight.

5791

++		 */

5792

++		if (prev_weight != new_weight) {

5793

++			root = bfqq ? &bfqd->queue_weights_tree :

5794

++				      &bfqd->group_weights_tree;

5795

++			bfq_weights_tree_remove(bfqd, entity, root);

5796

++		}

5797

++		entity->weight = new_weight;

5798

++		/*

5799

++		 * Add the entity to its weights tree only if it is

5800

++		 * not associated with a weight-raised queue.

5801

++		 */

5802

++		if (prev_weight != new_weight &&

5803

++		    (bfqq ? bfqq->wr_coeff == 1 : 1))

5804

++			/* If we get here, root has been initialized. */

5805

++			bfq_weights_tree_add(bfqd, entity, root);

5806

++

5807

++		new_st->wsum += entity->weight;

5808

++

5809

++		if (new_st != old_st)

5810

++			entity->start = new_st->vtime;

5811

++	}

5812

++

5813

++	return new_st;

5814

++}

5815

++

5816

++/**

5817

++ * bfq_bfqq_served - update the scheduler status after selection for

5818

++ *                   service.

5819

++ * @bfqq: the queue being served.

5820

++ * @served: bytes to transfer.

5821

++ *

5822

++ * NOTE: this can be optimized, as the timestamps of upper level entities

5823

++ * are synchronized every time a new bfqq is selected for service.  By now,

5824

++ * we keep it to better check consistency.

5825

++ */

5826

++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)

5827

++{

5828

++	struct bfq_entity *entity = &bfqq->entity;

5829

++	struct bfq_service_tree *st;

5830

++

5831

++	for_each_entity(entity) {

5832

++		st = bfq_entity_service_tree(entity);

5833

++

5834

++		entity->service += served;

5835

++		BUG_ON(entity->service > entity->budget);

5836

++		BUG_ON(st->wsum == 0);

5837

++

5838

++		st->vtime += bfq_delta(served, st->wsum);

5839

++		bfq_forget_idle(st);

5840

++	}

5841

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);

5842

++}

5843

++

5844

++/**

5845

++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.

5846

++ * @bfqq: the queue that needs a service update.

5847

++ *

5848

++ * When it's not possible to be fair in the service domain, because

5849

++ * a queue is not consuming its budget fast enough (the meaning of

5850

++ * fast depends on the timeout parameter), we charge it a full

5851

++ * budget.  In this way we should obtain a sort of time-domain

5852

++ * fairness among all the seeky/slow queues.

5853

++ */

5854

++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)

5855

++{

5856

++	struct bfq_entity *entity = &bfqq->entity;

5857

++

5858

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");

5859

++

5860

++	bfq_bfqq_served(bfqq, entity->budget - entity->service);

5861

++}

5862

++

5863

++/**

5864

++ * __bfq_activate_entity - activate an entity.

5865

++ * @entity: the entity being activated.

5866

++ *

5867

++ * Called whenever an entity is activated, i.e., it is not active and one

5868

++ * of its children receives a new request, or has to be reactivated due to

5869

++ * budget exhaustion.  It uses the current budget of the entity (and the

5870

++ * service received if @entity is active) of the queue to calculate its

5871

++ * timestamps.

5872

++ */

5873

++static void __bfq_activate_entity(struct bfq_entity *entity)

5874

++{

5875

++	struct bfq_sched_data *sd = entity->sched_data;

5876

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

5877

++

5878

++	if (entity == sd->in_service_entity) {

5879

++		BUG_ON(entity->tree != NULL);

5880

++		/*

5881

++		 * If we are requeueing the current entity we have

5882

++		 * to take care of not charging to it service it has

5883

++		 * not received.

5884

++		 */

5885

++		bfq_calc_finish(entity, entity->service);

5886

++		entity->start = entity->finish;

5887

++		sd->in_service_entity = NULL;

5888

++	} else if (entity->tree == &st->active) {

5889

++		/*

5890

++		 * Requeueing an entity due to a change of some

5891

++		 * next_in_service entity below it.  We reuse the

5892

++		 * old start time.

5893

++		 */

5894

++		bfq_active_extract(st, entity);

5895

++	} else if (entity->tree == &st->idle) {

5896

++		/*

5897

++		 * Must be on the idle tree, bfq_idle_extract() will

5898

++		 * check for that.

5899

++		 */

5900

++		bfq_idle_extract(st, entity);

5901

++		entity->start = bfq_gt(st->vtime, entity->finish) ?

5902

++				       st->vtime : entity->finish;

5903

++	} else {

5904

++		/*

5905

++		 * The finish time of the entity may be invalid, and

5906

++		 * it is in the past for sure, otherwise the queue

5907

++		 * would have been on the idle tree.

5908

++		 */

5909

++		entity->start = st->vtime;

5910

++		st->wsum += entity->weight;

5911

++		bfq_get_entity(entity);

5912

++

5913

++		BUG_ON(entity->on_st);

5914

++		entity->on_st = 1;

5915

++	}

5916

++

5917

++	st = __bfq_entity_update_weight_prio(st, entity);

5918

++	bfq_calc_finish(entity, entity->budget);

5919

++	bfq_active_insert(st, entity);

5920

++}

5921

++

5922

++/**

5923

++ * bfq_activate_entity - activate an entity and its ancestors if necessary.

5924

++ * @entity: the entity to activate.

5925

++ *

5926

++ * Activate @entity and all the entities on the path from it to the root.

5927

++ */

5928

++static void bfq_activate_entity(struct bfq_entity *entity)

5929

++{

5930

++	struct bfq_sched_data *sd;

5931

++

5932

++	for_each_entity(entity) {

5933

++		__bfq_activate_entity(entity);

5934

++

5935

++		sd = entity->sched_data;

5936

++		if (!bfq_update_next_in_service(sd))

5937

++			/*

5938

++			 * No need to propagate the activation to the

5939

++			 * upper entities, as they will be updated when

5940

++			 * the in-service entity is rescheduled.

5941

++			 */

5942

++			break;

5943

++	}

5944

++}

5945

++

5946

++/**

5947

++ * __bfq_deactivate_entity - deactivate an entity from its service tree.

5948

++ * @entity: the entity to deactivate.

5949

++ * @requeue: if false, the entity will not be put into the idle tree.

5950

++ *

5951

++ * Deactivate an entity, independently from its previous state.  If the

5952

++ * entity was not on a service tree just return, otherwise if it is on

5953

++ * any scheduler tree, extract it from that tree, and if necessary

5954

++ * and if the caller did specify @requeue, put it on the idle tree.

5955

++ *

5956

++ * Return %1 if the caller should update the entity hierarchy, i.e.,

5957

++ * if the entity was in service or if it was the next_in_service for

5958

++ * its sched_data; return %0 otherwise.

5959

++ */

5960

++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)

5961

++{

5962

++	struct bfq_sched_data *sd = entity->sched_data;

5963

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

5964

++	int was_in_service = entity == sd->in_service_entity;

5965

++	int ret = 0;

5966

++

5967

++	if (!entity->on_st)

5968

++		return 0;

5969

++

5970

++	BUG_ON(was_in_service && entity->tree != NULL);

5971

++

5972

++	if (was_in_service) {

5973

++		bfq_calc_finish(entity, entity->service);

5974

++		sd->in_service_entity = NULL;

5975

++	} else if (entity->tree == &st->active)

5976

++		bfq_active_extract(st, entity);

5977

++	else if (entity->tree == &st->idle)

5978

++		bfq_idle_extract(st, entity);

5979

++	else if (entity->tree != NULL)

5980

++		BUG();

5981

++

5982

++	if (was_in_service || sd->next_in_service == entity)

5983

++		ret = bfq_update_next_in_service(sd);

5984

++

5985

++	if (!requeue || !bfq_gt(entity->finish, st->vtime))

5986

++		bfq_forget_entity(st, entity);

5987

++	else

5988

++		bfq_idle_insert(st, entity);

5989

++

5990

++	BUG_ON(sd->in_service_entity == entity);

5991

++	BUG_ON(sd->next_in_service == entity);

5992

++

5993

++	return ret;

5994

++}

5995

++

5996

++/**

5997

++ * bfq_deactivate_entity - deactivate an entity.

5998

++ * @entity: the entity to deactivate.

5999

++ * @requeue: true if the entity can be put on the idle tree

6000

++ */

6001

++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)

6002

++{

6003

++	struct bfq_sched_data *sd;

6004

++	struct bfq_entity *parent;

6005

++

6006

++	for_each_entity_safe(entity, parent) {

6007

++		sd = entity->sched_data;

6008

++

6009

++		if (!__bfq_deactivate_entity(entity, requeue))

6010

++			/*

6011

++			 * The parent entity is still backlogged, and

6012

++			 * we don't need to update it as it is still

6013

++			 * in service.

6014

++			 */

6015

++			break;

6016

++

6017

++		if (sd->next_in_service != NULL)

6018

++			/*

6019

++			 * The parent entity is still backlogged and

6020

++			 * the budgets on the path towards the root

6021

++			 * need to be updated.

6022

++			 */

6023

++			goto update;

6024

++

6025

++		/*

6026

++		 * If we get here, the parent is no longer backlogged and

6027

++		 * we want to propagate the dequeue upwards.

6028

++		 */

6029

++		requeue = 1;

6030

++	}

6031

++

6032

++	return;

6033

++

6034

++update:

6035

++	entity = parent;

6036

++	for_each_entity(entity) {

6037

++		__bfq_activate_entity(entity);

6038

++

6039

++		sd = entity->sched_data;

6040

++		if (!bfq_update_next_in_service(sd))

6041

++			break;

6042

++	}

6043

++}

6044

++

6045

++/**

6046

++ * bfq_update_vtime - update vtime if necessary.

6047

++ * @st: the service tree to act upon.

6048

++ *

6049

++ * If necessary update the service tree vtime to have at least one

6050

++ * eligible entity, skipping to its start time.  Assumes that the

6051

++ * active tree of the device is not empty.

6052

++ *

6053

++ * NOTE: this hierarchical implementation updates vtimes quite often,

6054

++ * we may end up with reactivated processes getting timestamps after a

6055

++ * vtime skip done because we needed a ->first_active entity on some

6056

++ * intermediate node.

6057

++ */

6058

++static void bfq_update_vtime(struct bfq_service_tree *st)

6059

++{

6060

++	struct bfq_entity *entry;

6061

++	struct rb_node *node = st->active.rb_node;

6062

++

6063

++	entry = rb_entry(node, struct bfq_entity, rb_node);

6064

++	if (bfq_gt(entry->min_start, st->vtime)) {

6065

++		st->vtime = entry->min_start;

6066

++		bfq_forget_idle(st);

6067

++	}

6068

++}

6069

++

6070

++/**

6071

++ * bfq_first_active_entity - find the eligible entity with

6072

++ *                           the smallest finish time

6073

++ * @st: the service tree to select from.

6074

++ *

6075

++ * This function searches the first schedulable entity, starting from the

6076

++ * root of the tree and going on the left every time on this side there is

6077

++ * a subtree with at least one eligible (start <= vtime) entity. The path on

6078

++ * the right is followed only if a) the left subtree contains no eligible

6079

++ * entities and b) no eligible entity has been found yet.

6080

++ */

6081

++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)

6082

++{

6083

++	struct bfq_entity *entry, *first = NULL;

6084

++	struct rb_node *node = st->active.rb_node;

6085

++

6086

++	while (node != NULL) {

6087

++		entry = rb_entry(node, struct bfq_entity, rb_node);

6088

++left:

6089

++		if (!bfq_gt(entry->start, st->vtime))

6090

++			first = entry;

6091

++

6092

++		BUG_ON(bfq_gt(entry->min_start, st->vtime));

6093

++

6094

++		if (node->rb_left != NULL) {

6095

++			entry = rb_entry(node->rb_left,

6096

++					 struct bfq_entity, rb_node);

6097

++			if (!bfq_gt(entry->min_start, st->vtime)) {

6098

++				node = node->rb_left;

6099

++				goto left;

6100

++			}

6101

++		}

6102

++		if (first != NULL)

6103

++			break;

6104

++		node = node->rb_right;

6105

++	}

6106

++

6107

++	BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));

6108

++	return first;

6109

++}

6110

++

6111

++/**

6112

++ * __bfq_lookup_next_entity - return the first eligible entity in @st.

6113

++ * @st: the service tree.

6114

++ *

6115

++ * Update the virtual time in @st and return the first eligible entity

6116

++ * it contains.

6117

++ */

6118

++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,

6119

++						   bool force)

6120

++{

6121

++	struct bfq_entity *entity, *new_next_in_service = NULL;

6122

++

6123

++	if (RB_EMPTY_ROOT(&st->active))

6124

++		return NULL;

6125

++

6126

++	bfq_update_vtime(st);

6127

++	entity = bfq_first_active_entity(st);

6128

++	BUG_ON(bfq_gt(entity->start, st->vtime));

6129

++

6130

++	/*

6131

++	 * If the chosen entity does not match with the sched_data's

6132

++	 * next_in_service and we are forcibly serving the IDLE priority

6133

++	 * class tree, bubble up budget update.

6134

++	 */

6135

++	if (unlikely(force && entity != entity->sched_data->next_in_service)) {

6136

++		new_next_in_service = entity;

6137

++		for_each_entity(new_next_in_service)

6138

++			bfq_update_budget(new_next_in_service);

6139

++	}

6140

++

6141

++	return entity;

6142

++}

6143

++

6144

++/**

6145

++ * bfq_lookup_next_entity - return the first eligible entity in @sd.

6146

++ * @sd: the sched_data.

6147

++ * @extract: if true the returned entity will also be extracted from @sd.

6148

++ *

6149

++ * NOTE: since we cache the next_in_service entity at each level of the

6150

++ * hierarchy, the complexity of the lookup can be decreased with

6151

++ * absolutely no effort just returning the cached next_in_service value;

6152

++ * we prefer to do full lookups to test the consistency of the data

6153

++ * structures.

6154

++ */

6155

++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

6156

++						 int extract,

6157

++						 struct bfq_data *bfqd)

6158

++{

6159

++	struct bfq_service_tree *st = sd->service_tree;

6160

++	struct bfq_entity *entity;

6161

++	int i = 0;

6162

++

6163

++	BUG_ON(sd->in_service_entity != NULL);

6164

++

6165

++	if (bfqd != NULL &&

6166

++	    jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {

6167

++		entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,

6168

++						  true);

6169

++		if (entity != NULL) {

6170

++			i = BFQ_IOPRIO_CLASSES - 1;

6171

++			bfqd->bfq_class_idle_last_service = jiffies;

6172

++			sd->next_in_service = entity;

6173

++		}

6174

++	}

6175

++	for (; i < BFQ_IOPRIO_CLASSES; i++) {

6176

++		entity = __bfq_lookup_next_entity(st + i, false);

6177

++		if (entity != NULL) {

6178

++			if (extract) {

6179

++				bfq_check_next_in_service(sd, entity);

6180

++				bfq_active_extract(st + i, entity);

6181

++				sd->in_service_entity = entity;

6182

++				sd->next_in_service = NULL;

6183

++			}

6184

++			break;

6185

++		}

6186

++	}

6187

++

6188

++	return entity;

6189

++}

6190

++

6191

++/*

6192

++ * Get next queue for service.

6193

++ */

6194

++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)

6195

++{

6196

++	struct bfq_entity *entity = NULL;

6197

++	struct bfq_sched_data *sd;

6198

++	struct bfq_queue *bfqq;

6199

++

6200

++	BUG_ON(bfqd->in_service_queue != NULL);

6201

++

6202

++	if (bfqd->busy_queues == 0)

6203

++		return NULL;

6204

++

6205

++	sd = &bfqd->root_group->sched_data;

6206

++	for (; sd != NULL; sd = entity->my_sched_data) {

6207

++		entity = bfq_lookup_next_entity(sd, 1, bfqd);

6208

++		BUG_ON(entity == NULL);

6209

++		entity->service = 0;

6210

++	}

6211

++

6212

++	bfqq = bfq_entity_to_bfqq(entity);

6213

++	BUG_ON(bfqq == NULL);

6214

++

6215

++	return bfqq;

6216

++}

6217

++

6218

++/*

6219

++ * Forced extraction of the given queue.

6220

++ */

6221

++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,

6222

++				      struct bfq_queue *bfqq)

6223

++{

6224

++	struct bfq_entity *entity;

6225

++	struct bfq_sched_data *sd;

6226

++

6227

++	BUG_ON(bfqd->in_service_queue != NULL);

6228

++

6229

++	entity = &bfqq->entity;

6230

++	/*

6231

++	 * Bubble up extraction/update from the leaf to the root.

6232

++	 */

6233

++	for_each_entity(entity) {

6234

++		sd = entity->sched_data;

6235

++		bfq_update_budget(entity);

6236

++		bfq_update_vtime(bfq_entity_service_tree(entity));

6237

++		bfq_active_extract(bfq_entity_service_tree(entity), entity);

6238

++		sd->in_service_entity = entity;

6239

++		sd->next_in_service = NULL;

6240

++		entity->service = 0;

6241

++	}

6242

++

6243

++	return;

6244

++}

6245

++

6246

++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)

6247

++{

6248

++	if (bfqd->in_service_bic != NULL) {

6249

++		put_io_context(bfqd->in_service_bic->icq.ioc);

6250

++		bfqd->in_service_bic = NULL;

6251

++	}

6252

++

6253

++	bfqd->in_service_queue = NULL;

6254

++	del_timer(&bfqd->idle_slice_timer);

6255

++}

6256

++

6257

++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

6258

++				int requeue)

6259

++{

6260

++	struct bfq_entity *entity = &bfqq->entity;

6261

++

6262

++	if (bfqq == bfqd->in_service_queue)

6263

++		__bfq_bfqd_reset_in_service(bfqd);

6264

++

6265

++	bfq_deactivate_entity(entity, requeue);

6266

++}

6267

++

6268

++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

6269

++{

6270

++	struct bfq_entity *entity = &bfqq->entity;

6271

++

6272

++	bfq_activate_entity(entity);

6273

++}

6274

++

6275

++/*

6276

++ * Called when the bfqq no longer has requests pending, remove it from

6277

++ * the service tree.

6278

++ */

6279

++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,

6280

++			      int requeue)

6281

++{

6282

++	BUG_ON(!bfq_bfqq_busy(bfqq));

6283

++	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

6284

++

6285

++	bfq_log_bfqq(bfqd, bfqq, "del from busy");

6286

++

6287

++	bfq_clear_bfqq_busy(bfqq);

6288

++

6289

++	BUG_ON(bfqd->busy_queues == 0);

6290

++	bfqd->busy_queues--;

6291

++

6292

++	if (!bfqq->dispatched) {

6293

++		bfq_weights_tree_remove(bfqd, &bfqq->entity,

6294

++					&bfqd->queue_weights_tree);

6295

++		if (!blk_queue_nonrot(bfqd->queue)) {

6296

++			BUG_ON(!bfqd->busy_in_flight_queues);

6297

++			bfqd->busy_in_flight_queues--;

6298

++			if (bfq_bfqq_constantly_seeky(bfqq)) {

6299

++				BUG_ON(!bfqd->

6300

++					const_seeky_busy_in_flight_queues);

6301

++				bfqd->const_seeky_busy_in_flight_queues--;

6302

++			}

6303

++		}

6304

++	}

6305

++	if (bfqq->wr_coeff > 1)

6306

++		bfqd->wr_busy_queues--;

6307

++

6308

++	bfq_deactivate_bfqq(bfqd, bfqq, requeue);

6309

++}

6310

++

6311

++/*

6312

++ * Called when an inactive queue receives a new request.

6313

++ */

6314

++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)

6315

++{

6316

++	BUG_ON(bfq_bfqq_busy(bfqq));

6317

++	BUG_ON(bfqq == bfqd->in_service_queue);

6318

++

6319

++	bfq_log_bfqq(bfqd, bfqq, "add to busy");

6320

++

6321

++	bfq_activate_bfqq(bfqd, bfqq);

6322

++

6323

++	bfq_mark_bfqq_busy(bfqq);

6324

++	bfqd->busy_queues++;

6325

++

6326

++	if (!bfqq->dispatched) {

6327

++		if (bfqq->wr_coeff == 1)

6328

++			bfq_weights_tree_add(bfqd, &bfqq->entity,

6329

++					     &bfqd->queue_weights_tree);

6330

++		if (!blk_queue_nonrot(bfqd->queue)) {

6331

++			bfqd->busy_in_flight_queues++;

6332

++			if (bfq_bfqq_constantly_seeky(bfqq))

6333

++				bfqd->const_seeky_busy_in_flight_queues++;

6334

++		}

6335

++	}

6336

++	if (bfqq->wr_coeff > 1)

6337

++		bfqd->wr_busy_queues++;

6338

++}

6339

+diff --git a/block/bfq.h b/block/bfq.h

6340

+new file mode 100644

6341

+index 0000000..518f2ac

6342

+--- /dev/null

6343

++++ b/block/bfq.h

6344

+@@ -0,0 +1,775 @@

6345

++/*

6346

++ * BFQ-v7r7 for 4.0.0: data structures and common functions prototypes.

6347

++ *

6348

++ * Based on ideas and code from CFQ:

6349

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

6350

++ *

6351

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

6352

++ *		      Paolo Valente <paolo.valente@×××××××.it>

6353

++ *

6354

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

6355

++ */

6356

++

6357

++#ifndef _BFQ_H

6358

++#define _BFQ_H

6359

++

6360

++#include <linux/blktrace_api.h>

6361

++#include <linux/hrtimer.h>

6362

++#include <linux/ioprio.h>

6363

++#include <linux/rbtree.h>

6364

++

6365

++#define BFQ_IOPRIO_CLASSES	3

6366

++#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

6367

++

6368

++#define BFQ_MIN_WEIGHT	1

6369

++#define BFQ_MAX_WEIGHT	1000

6370

++

6371

++#define BFQ_DEFAULT_QUEUE_IOPRIO	4

6372

++

6373

++#define BFQ_DEFAULT_GRP_WEIGHT	10

6374

++#define BFQ_DEFAULT_GRP_IOPRIO	0

6375

++#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

6376

++

6377

++struct bfq_entity;

6378

++

6379

++/**

6380

++ * struct bfq_service_tree - per ioprio_class service tree.

6381

++ * @active: tree for active entities (i.e., those backlogged).

6382

++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).

6383

++ * @first_idle: idle entity with minimum F_i.

6384

++ * @last_idle: idle entity with maximum F_i.

6385

++ * @vtime: scheduler virtual time.

6386

++ * @wsum: scheduler weight sum; active and idle entities contribute to it.

6387

++ *

6388

++ * Each service tree represents a B-WF2Q+ scheduler on its own.  Each

6389

++ * ioprio_class has its own independent scheduler, and so its own

6390

++ * bfq_service_tree.  All the fields are protected by the queue lock

6391

++ * of the containing bfqd.

6392

++ */

6393

++struct bfq_service_tree {

6394

++	struct rb_root active;

6395

++	struct rb_root idle;

6396

++

6397

++	struct bfq_entity *first_idle;

6398

++	struct bfq_entity *last_idle;

6399

++

6400

++	u64 vtime;

6401

++	unsigned long wsum;

6402

++};

6403

++

6404

++/**

6405

++ * struct bfq_sched_data - multi-class scheduler.

6406

++ * @in_service_entity: entity in service.

6407

++ * @next_in_service: head-of-the-line entity in the scheduler.

6408

++ * @service_tree: array of service trees, one per ioprio_class.

6409

++ *

6410

++ * bfq_sched_data is the basic scheduler queue.  It supports three

6411

++ * ioprio_classes, and can be used either as a toplevel queue or as

6412

++ * an intermediate queue on a hierarchical setup.

6413

++ * @next_in_service points to the active entity of the sched_data

6414

++ * service trees that will be scheduled next.

6415

++ *

6416

++ * The supported ioprio_classes are the same as in CFQ, in descending

6417

++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.

6418

++ * Requests from higher priority queues are served before all the

6419

++ * requests from lower priority queues; among requests of the same

6420

++ * queue requests are served according to B-WF2Q+.

6421

++ * All the fields are protected by the queue lock of the containing bfqd.

6422

++ */

6423

++struct bfq_sched_data {

6424

++	struct bfq_entity *in_service_entity;

6425

++	struct bfq_entity *next_in_service;

6426

++	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];

6427

++};

6428

++

6429

++/**

6430

++ * struct bfq_weight_counter - counter of the number of all active entities

6431

++ *                             with a given weight.

6432

++ * @weight: weight of the entities that this counter refers to.

6433

++ * @num_active: number of active entities with this weight.

6434

++ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree

6435

++ *                and @group_weights_tree).

6436

++ */

6437

++struct bfq_weight_counter {

6438

++	short int weight;

6439

++	unsigned int num_active;

6440

++	struct rb_node weights_node;

6441

++};

6442

++

6443

++/**

6444

++ * struct bfq_entity - schedulable entity.

6445

++ * @rb_node: service_tree member.

6446

++ * @weight_counter: pointer to the weight counter associated with this entity.

6447

++ * @on_st: flag, true if the entity is on a tree (either the active or

6448

++ *         the idle one of its service_tree).

6449

++ * @finish: B-WF2Q+ finish timestamp (aka F_i).

6450

++ * @start: B-WF2Q+ start timestamp (aka S_i).

6451

++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.

6452

++ * @min_start: minimum start time of the (active) subtree rooted at

6453

++ *             this entity; used for O(log N) lookups into active trees.

6454

++ * @service: service received during the last round of service.

6455

++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.

6456

++ * @weight: weight of the queue

6457

++ * @parent: parent entity, for hierarchical scheduling.

6458

++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the

6459

++ *                 associated scheduler queue, %NULL on leaf nodes.

6460

++ * @sched_data: the scheduler queue this entity belongs to.

6461

++ * @ioprio: the ioprio in use.

6462

++ * @new_weight: when a weight change is requested, the new weight value.

6463

++ * @orig_weight: original weight, used to implement weight boosting

6464

++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.

6465

++ * @ioprio_class: the ioprio_class in use.

6466

++ * @new_ioprio_class: when an ioprio_class change is requested, the new

6467

++ *                    ioprio_class value.

6468

++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or

6469

++ *                  ioprio_class change.

6470

++ *

6471

++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the

6472

++ * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each

6473

++ * entity belongs to the sched_data of the parent group in the cgroup

6474

++ * hierarchy.  Non-leaf entities have also their own sched_data, stored

6475

++ * in @my_sched_data.

6476

++ *

6477

++ * Each entity stores independently its priority values; this would

6478

++ * allow different weights on different devices, but this

6479

++ * functionality is not exported to userspace by now.  Priorities and

6480

++ * weights are updated lazily, first storing the new values into the

6481

++ * new_* fields, then setting the @ioprio_changed flag.  As soon as

6482

++ * there is a transition in the entity state that allows the priority

6483

++ * update to take place the effective and the requested priority

6484

++ * values are synchronized.

6485

++ *

6486

++ * Unless cgroups are used, the weight value is calculated from the

6487

++ * ioprio to export the same interface as CFQ.  When dealing with

6488

++ * ``well-behaved'' queues (i.e., queues that do not spend too much

6489

++ * time to consume their budget and have true sequential behavior, and

6490

++ * when there are no external factors breaking anticipation) the

6491

++ * relative weights at each level of the cgroups hierarchy should be

6492

++ * guaranteed.  All the fields are protected by the queue lock of the

6493

++ * containing bfqd.

6494

++ */

6495

++struct bfq_entity {

6496

++	struct rb_node rb_node;

6497

++	struct bfq_weight_counter *weight_counter;

6498

++

6499

++	int on_st;

6500

++

6501

++	u64 finish;

6502

++	u64 start;

6503

++

6504

++	struct rb_root *tree;

6505

++

6506

++	u64 min_start;

6507

++

6508

++	unsigned long service, budget;

6509

++	unsigned short weight, new_weight;

6510

++	unsigned short orig_weight;

6511

++

6512

++	struct bfq_entity *parent;

6513

++

6514

++	struct bfq_sched_data *my_sched_data;

6515

++	struct bfq_sched_data *sched_data;

6516

++

6517

++	unsigned short ioprio, new_ioprio;

6518

++	unsigned short ioprio_class, new_ioprio_class;

6519

++

6520

++	int ioprio_changed;

6521

++};

6522

++

6523

++struct bfq_group;

6524

++

6525

++/**

6526

++ * struct bfq_queue - leaf schedulable entity.

6527

++ * @ref: reference counter.

6528

++ * @bfqd: parent bfq_data.

6529

++ * @new_bfqq: shared bfq_queue if queue is cooperating with

6530

++ *           one or more other queues.

6531

++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).

6532

++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).

6533

++ * @sort_list: sorted list of pending requests.

6534

++ * @next_rq: if fifo isn't expired, next request to serve.

6535

++ * @queued: nr of requests queued in @sort_list.

6536

++ * @allocated: currently allocated requests.

6537

++ * @meta_pending: pending metadata requests.

6538

++ * @fifo: fifo list of requests in sort_list.

6539

++ * @entity: entity representing this queue in the scheduler.

6540

++ * @max_budget: maximum budget allowed from the feedback mechanism.

6541

++ * @budget_timeout: budget expiration (in jiffies).

6542

++ * @dispatched: number of requests on the dispatch list or inside driver.

6543

++ * @flags: status flags.

6544

++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.

6545

++ * @burst_list_node: node for the device's burst list.

6546

++ * @seek_samples: number of seeks sampled

6547

++ * @seek_total: sum of the distances of the seeks sampled

6548

++ * @seek_mean: mean seek distance

6549

++ * @last_request_pos: position of the last request enqueued

6550

++ * @requests_within_timer: number of consecutive pairs of request completion

6551

++ *                         and arrival, such that the queue becomes idle

6552

++ *                         after the completion, but the next request arrives

6553

++ *                         within an idle time slice; used only if the queue's

6554

++ *                         IO_bound has been cleared.

6555

++ * @pid: pid of the process owning the queue, used for logging purposes.

6556

++ * @last_wr_start_finish: start time of the current weight-raising period if

6557

++ *                        the @bfq_queue is being weight-raised, otherwise

6558

++ *                        finish time of the last weight-raising period

6559

++ * @wr_cur_max_time: current max raising time for this queue

6560

++ * @soft_rt_next_start: minimum time instant such that, only if a new

6561

++ *                      request is enqueued after this time instant in an

6562

++ *                      idle @bfq_queue with no outstanding requests, then

6563

++ *                      the task associated with the queue is deemed as

6564

++ *                      soft real-time (see the comments to the function

6565

++ *                      bfq_bfqq_softrt_next_start()).

6566

++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from

6567

++ *                      idle to backlogged

6568

++ * @service_from_backlogged: cumulative service received from the @bfq_queue

6569

++ *                           since the last transition from idle to

6570

++ *                           backlogged

6571

++ *

6572

++ * A bfq_queue is a leaf request queue; it can be associated with an io_context

6573

++ * or more, if it is async or shared between cooperating processes. @cgroup

6574

++ * holds a reference to the cgroup, to be sure that it does not disappear while

6575

++ * a bfqq still references it (mostly to avoid races between request issuing and

6576

++ * task migration followed by cgroup destruction).

6577

++ * All the fields are protected by the queue lock of the containing bfqd.

6578

++ */

6579

++struct bfq_queue {

6580

++	atomic_t ref;

6581

++	struct bfq_data *bfqd;

6582

++

6583

++	/* fields for cooperating queues handling */

6584

++	struct bfq_queue *new_bfqq;

6585

++	struct rb_node pos_node;

6586

++	struct rb_root *pos_root;

6587

++

6588

++	struct rb_root sort_list;

6589

++	struct request *next_rq;

6590

++	int queued[2];

6591

++	int allocated[2];

6592

++	int meta_pending;

6593

++	struct list_head fifo;

6594

++

6595

++	struct bfq_entity entity;

6596

++

6597

++	unsigned long max_budget;

6598

++	unsigned long budget_timeout;

6599

++

6600

++	int dispatched;

6601

++

6602

++	unsigned int flags;

6603

++

6604

++	struct list_head bfqq_list;

6605

++

6606

++	struct hlist_node burst_list_node;

6607

++

6608

++	unsigned int seek_samples;

6609

++	u64 seek_total;

6610

++	sector_t seek_mean;

6611

++	sector_t last_request_pos;

6612

++

6613

++	unsigned int requests_within_timer;

6614

++

6615

++	pid_t pid;

6616

++

6617

++	/* weight-raising fields */

6618

++	unsigned long wr_cur_max_time;

6619

++	unsigned long soft_rt_next_start;

6620

++	unsigned long last_wr_start_finish;

6621

++	unsigned int wr_coeff;

6622

++	unsigned long last_idle_bklogged;

6623

++	unsigned long service_from_backlogged;

6624

++};

6625

++

6626

++/**

6627

++ * struct bfq_ttime - per process thinktime stats.

6628

++ * @ttime_total: total process thinktime

6629

++ * @ttime_samples: number of thinktime samples

6630

++ * @ttime_mean: average process thinktime

6631

++ */

6632

++struct bfq_ttime {

6633

++	unsigned long last_end_request;

6634

++

6635

++	unsigned long ttime_total;

6636

++	unsigned long ttime_samples;

6637

++	unsigned long ttime_mean;

6638

++};

6639

++

6640

++/**

6641

++ * struct bfq_io_cq - per (request_queue, io_context) structure.

6642

++ * @icq: associated io_cq structure

6643

++ * @bfqq: array of two process queues, the sync and the async

6644

++ * @ttime: associated @bfq_ttime struct

6645

++ */

6646

++struct bfq_io_cq {

6647

++	struct io_cq icq; /* must be the first member */

6648

++	struct bfq_queue *bfqq[2];

6649

++	struct bfq_ttime ttime;

6650

++	int ioprio;

6651

++};

6652

++

6653

++enum bfq_device_speed {

6654

++	BFQ_BFQD_FAST,

6655

++	BFQ_BFQD_SLOW,

6656

++};

6657

++

6658

++/**

6659

++ * struct bfq_data - per device data structure.

6660

++ * @queue: request queue for the managed device.

6661

++ * @root_group: root bfq_group for the device.

6662

++ * @rq_pos_tree: rbtree sorted by next_request position, used when

6663

++ *               determining if two or more queues have interleaving

6664

++ *               requests (see bfq_close_cooperator()).

6665

++ * @active_numerous_groups: number of bfq_groups containing more than one

6666

++ *                          active @bfq_entity.

6667

++ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by

6668

++ *                      weight. Used to keep track of whether all @bfq_queues

6669

++ *                     have the same weight. The tree contains one counter

6670

++ *                     for each distinct weight associated to some active

6671

++ *                     and not weight-raised @bfq_queue (see the comments to

6672

++ *                      the functions bfq_weights_tree_[add|remove] for

6673

++ *                     further details).

6674

++ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted

6675

++ *                      by weight. Used to keep track of whether all

6676

++ *                     @bfq_groups have the same weight. The tree contains

6677

++ *                     one counter for each distinct weight associated to

6678

++ *                     some active @bfq_group (see the comments to the

6679

++ *                     functions bfq_weights_tree_[add|remove] for further

6680

++ *                     details).

6681

++ * @busy_queues: number of bfq_queues containing requests (including the

6682

++ *		 queue in service, even if it is idling).

6683

++ * @busy_in_flight_queues: number of @bfq_queues containing pending or

6684

++ *                         in-flight requests, plus the @bfq_queue in

6685

++ *                         service, even if idle but waiting for the

6686

++ *                         possible arrival of its next sync request. This

6687

++ *                         field is updated only if the device is rotational,

6688

++ *                         but used only if the device is also NCQ-capable.

6689

++ *                         The reason why the field is updated also for non-

6690

++ *                         NCQ-capable rotational devices is related to the

6691

++ *                         fact that the value of @hw_tag may be set also

6692

++ *                         later than when busy_in_flight_queues may need to

6693

++ *                         be incremented for the first time(s). Taking also

6694

++ *                         this possibility into account, to avoid unbalanced

6695

++ *                         increments/decrements, would imply more overhead

6696

++ *                         than just updating busy_in_flight_queues

6697

++ *                         regardless of the value of @hw_tag.

6698

++ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues

6699

++ *                                     (that is, seeky queues that expired

6700

++ *                                     for budget timeout at least once)

6701

++ *                                     containing pending or in-flight

6702

++ *                                     requests, including the in-service

6703

++ *                                     @bfq_queue if constantly seeky. This

6704

++ *                                     field is updated only if the device

6705

++ *                                     is rotational, but used only if the

6706

++ *                                     device is also NCQ-capable (see the

6707

++ *                                     comments to @busy_in_flight_queues).

6708

++ * @wr_busy_queues: number of weight-raised busy @bfq_queues.

6709

++ * @queued: number of queued requests.

6710

++ * @rq_in_driver: number of requests dispatched and waiting for completion.

6711

++ * @sync_flight: number of sync requests in the driver.

6712

++ * @max_rq_in_driver: max number of reqs in driver in the last

6713

++ *                    @hw_tag_samples completed requests.

6714

++ * @hw_tag_samples: nr of samples used to calculate hw_tag.

6715

++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.

6716

++ * @budgets_assigned: number of budgets assigned.

6717

++ * @idle_slice_timer: timer set when idling for the next sequential request

6718

++ *                    from the queue in service.

6719

++ * @unplug_work: delayed work to restart dispatching on the request queue.

6720

++ * @in_service_queue: bfq_queue in service.

6721

++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.

6722

++ * @last_position: on-disk position of the last served request.

6723

++ * @last_budget_start: beginning of the last budget.

6724

++ * @last_idling_start: beginning of the last idle slice.

6725

++ * @peak_rate: peak transfer rate observed for a budget.

6726

++ * @peak_rate_samples: number of samples used to calculate @peak_rate.

6727

++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before

6728

++ *                  rescheduling.

6729

++ * @group_list: list of all the bfq_groups active on the device.

6730

++ * @active_list: list of all the bfq_queues active on the device.

6731

++ * @idle_list: list of all the bfq_queues idle on the device.

6732

++ * @bfq_quantum: max number of requests dispatched per dispatch round.

6733

++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires

6734

++ *                   requests are served in fifo order.

6735

++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.

6736

++ * @bfq_back_max: maximum allowed backward seek.

6737

++ * @bfq_slice_idle: maximum idling time.

6738

++ * @bfq_user_max_budget: user-configured max budget value

6739

++ *                       (0 for auto-tuning).

6740

++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to

6741

++ *                           async queues.

6742

++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to

6743

++ *               prevent seeky queues from imposing long latencies on well

6744

++ *               behaved ones (this also implies that seeky queues cannot

6745

++ *               receive guarantees in the service domain; after a timeout

6746

++ *               they are charged for the whole allocated budget, to try

6747

++ *               to preserve a behavior reasonably fair among them, but

6748

++ *               without service-domain guarantees).

6749

++ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is

6750

++ *                   no more granted any weight-raising.

6751

++ * @bfq_failed_cooperations: number of consecutive failed cooperation

6752

++ *                           chances after which weight-raising is restored

6753

++ *                           to a queue subject to more than bfq_coop_thresh

6754

++ *                           queue merges.

6755

++ * @bfq_requests_within_timer: number of consecutive requests that must be

6756

++ *                             issued within the idle time slice to set

6757

++ *                             again idling to a queue which was marked as

6758

++ *                             non-I/O-bound (see the definition of the

6759

++ *                             IO_bound flag for further details).

6760

++ * @last_ins_in_burst: last time at which a queue entered the current

6761

++ *                     burst of queues being activated shortly after

6762

++ *                     each other; for more details about this and the

6763

++ *                     following parameters related to a burst of

6764

++ *                     activations, see the comments to the function

6765

++ *                     @bfq_handle_burst.

6766

++ * @bfq_burst_interval: reference time interval used to decide whether a

6767

++ *                      queue has been activated shortly after

6768

++ *                      @last_ins_in_burst.

6769

++ * @burst_size: number of queues in the current burst of queue activations.

6770

++ * @bfq_large_burst_thresh: maximum burst size above which the current

6771

++ * 			    queue-activation burst is deemed as 'large'.

6772

++ * @large_burst: true if a large queue-activation burst is in progress.

6773

++ * @burst_list: head of the burst list (as for the above fields, more details

6774

++ * 		in the comments to the function bfq_handle_burst).

6775

++ * @low_latency: if set to true, low-latency heuristics are enabled.

6776

++ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised

6777

++ *                queue is multiplied.

6778

++ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).

6779

++ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.

6780

++ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising

6781

++ *			  may be reactivated for a queue (in jiffies).

6782

++ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals

6783

++ *				after which weight-raising may be

6784

++ *				reactivated for an already busy queue

6785

++ *				(in jiffies).

6786

++ * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue,

6787

++ *			    sectors per second.

6788

++ * @RT_prod: cached value of the product R*T used for computing the maximum

6789

++ *	     duration of the weight raising automatically.

6790

++ * @device_speed: device-speed class for the low-latency heuristic.

6791

++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.

6792

++ *

6793

++ * All the fields are protected by the @queue lock.

6794

++ */

6795

++struct bfq_data {

6796

++	struct request_queue *queue;

6797

++

6798

++	struct bfq_group *root_group;

6799

++	struct rb_root rq_pos_tree;

6800

++

6801

++#ifdef CONFIG_CGROUP_BFQIO

6802

++	int active_numerous_groups;

6803

++#endif

6804

++

6805

++	struct rb_root queue_weights_tree;

6806

++	struct rb_root group_weights_tree;

6807

++

6808

++	int busy_queues;

6809

++	int busy_in_flight_queues;

6810

++	int const_seeky_busy_in_flight_queues;

6811

++	int wr_busy_queues;

6812

++	int queued;

6813

++	int rq_in_driver;

6814

++	int sync_flight;

6815

++

6816

++	int max_rq_in_driver;

6817

++	int hw_tag_samples;

6818

++	int hw_tag;

6819

++

6820

++	int budgets_assigned;

6821

++

6822

++	struct timer_list idle_slice_timer;

6823

++	struct work_struct unplug_work;

6824

++

6825

++	struct bfq_queue *in_service_queue;

6826

++	struct bfq_io_cq *in_service_bic;

6827

++

6828

++	sector_t last_position;

6829

++

6830

++	ktime_t last_budget_start;

6831

++	ktime_t last_idling_start;

6832

++	int peak_rate_samples;

6833

++	u64 peak_rate;

6834

++	unsigned long bfq_max_budget;

6835

++

6836

++	struct hlist_head group_list;

6837

++	struct list_head active_list;

6838

++	struct list_head idle_list;

6839

++

6840

++	unsigned int bfq_quantum;

6841

++	unsigned int bfq_fifo_expire[2];

6842

++	unsigned int bfq_back_penalty;

6843

++	unsigned int bfq_back_max;

6844

++	unsigned int bfq_slice_idle;

6845

++	u64 bfq_class_idle_last_service;

6846

++

6847

++	unsigned int bfq_user_max_budget;

6848

++	unsigned int bfq_max_budget_async_rq;

6849

++	unsigned int bfq_timeout[2];

6850

++

6851

++	unsigned int bfq_coop_thresh;

6852

++	unsigned int bfq_failed_cooperations;

6853

++	unsigned int bfq_requests_within_timer;

6854

++

6855

++	unsigned long last_ins_in_burst;

6856

++	unsigned long bfq_burst_interval;

6857

++	int burst_size;

6858

++	unsigned long bfq_large_burst_thresh;

6859

++	bool large_burst;

6860

++	struct hlist_head burst_list;

6861

++

6862

++	bool low_latency;

6863

++

6864

++	/* parameters of the low_latency heuristics */

6865

++	unsigned int bfq_wr_coeff;

6866

++	unsigned int bfq_wr_max_time;

6867

++	unsigned int bfq_wr_rt_max_time;

6868

++	unsigned int bfq_wr_min_idle_time;

6869

++	unsigned long bfq_wr_min_inter_arr_async;

6870

++	unsigned int bfq_wr_max_softrt_rate;

6871

++	u64 RT_prod;

6872

++	enum bfq_device_speed device_speed;

6873

++

6874

++	struct bfq_queue oom_bfqq;

6875

++};

6876

++

6877

++enum bfqq_state_flags {

6878

++	BFQ_BFQQ_FLAG_busy = 0,		/* has requests or is in service */

6879

++	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */

6880

++	BFQ_BFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */

6881

++	BFQ_BFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */

6882

++	BFQ_BFQQ_FLAG_idle_window,	/* slice idling enabled */

6883

++	BFQ_BFQQ_FLAG_prio_changed,	/* task priority has changed */

6884

++	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */

6885

++	BFQ_BFQQ_FLAG_budget_new,	/* no completion with this budget */

6886

++	BFQ_BFQQ_FLAG_IO_bound,         /*

6887

++					 * bfqq has timed-out at least once

6888

++					 * having consumed at most 2/10 of

6889

++					 * its budget

6890

++					 */

6891

++	BFQ_BFQQ_FLAG_in_large_burst,	/*

6892

++					 * bfqq activated in a large burst,

6893

++					 * see comments to bfq_handle_burst.

6894

++					 */

6895

++	BFQ_BFQQ_FLAG_constantly_seeky,	/*

6896

++					 * bfqq has proved to be slow and

6897

++					 * seeky until budget timeout

6898

++					 */

6899

++	BFQ_BFQQ_FLAG_softrt_update,    /*

6900

++					 * may need softrt-next-start

6901

++					 * update

6902

++					 */

6903

++	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */

6904

++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */

6905

++};

6906

++

6907

++#define BFQ_BFQQ_FNS(name)						\

6908

++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\

6909

++{									\

6910

++	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name);			\

6911

++}									\

6912

++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)	\

6913

++{									\

6914

++	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name);			\

6915

++}									\

6916

++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\

6917

++{									\

6918

++	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\

6919

++}

6920

++

6921

++BFQ_BFQQ_FNS(busy);

6922

++BFQ_BFQQ_FNS(wait_request);

6923

++BFQ_BFQQ_FNS(must_alloc);

6924

++BFQ_BFQQ_FNS(fifo_expire);

6925

++BFQ_BFQQ_FNS(idle_window);

6926

++BFQ_BFQQ_FNS(prio_changed);

6927

++BFQ_BFQQ_FNS(sync);

6928

++BFQ_BFQQ_FNS(budget_new);

6929

++BFQ_BFQQ_FNS(IO_bound);

6930

++BFQ_BFQQ_FNS(in_large_burst);

6931

++BFQ_BFQQ_FNS(constantly_seeky);

6932

++BFQ_BFQQ_FNS(coop);

6933

++BFQ_BFQQ_FNS(split_coop);

6934

++BFQ_BFQQ_FNS(softrt_update);

6935

++#undef BFQ_BFQQ_FNS

6936

++

6937

++/* Logging facilities. */

6938

++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \

6939

++	blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)

6940

++

6941

++#define bfq_log(bfqd, fmt, args...) \

6942

++	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)

6943

++

6944

++/* Expiration reasons. */

6945

++enum bfqq_expiration {

6946

++	BFQ_BFQQ_TOO_IDLE = 0,		/*

6947

++					 * queue has been idling for

6948

++					 * too long

6949

++					 */

6950

++	BFQ_BFQQ_BUDGET_TIMEOUT,	/* budget took too long to be used */

6951

++	BFQ_BFQQ_BUDGET_EXHAUSTED,	/* budget consumed */

6952

++	BFQ_BFQQ_NO_MORE_REQUESTS,	/* the queue has no more requests */

6953

++};

6954

++

6955

++#ifdef CONFIG_CGROUP_BFQIO

6956

++/**

6957

++ * struct bfq_group - per (device, cgroup) data structure.

6958

++ * @entity: schedulable entity to insert into the parent group sched_data.

6959

++ * @sched_data: own sched_data, to contain child entities (they may be

6960

++ *              both bfq_queues and bfq_groups).

6961

++ * @group_node: node to be inserted into the bfqio_cgroup->group_data

6962

++ *              list of the containing cgroup's bfqio_cgroup.

6963

++ * @bfqd_node: node to be inserted into the @bfqd->group_list list

6964

++ *             of the groups active on the same device; used for cleanup.

6965

++ * @bfqd: the bfq_data for the device this group acts upon.

6966

++ * @async_bfqq: array of async queues for all the tasks belonging to

6967

++ *              the group, one queue per ioprio value per ioprio_class,

6968

++ *              except for the idle class that has only one queue.

6969

++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).

6970

++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used

6971

++ *             to avoid too many special cases during group creation/

6972

++ *             migration.

6973

++ * @active_entities: number of active entities belonging to the group;

6974

++ *                   unused for the root group. Used to know whether there

6975

++ *                   are groups with more than one active @bfq_entity

6976

++ *                   (see the comments to the function

6977

++ *                   bfq_bfqq_must_not_expire()).

6978

++ *

6979

++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup

6980

++ * there is a set of bfq_groups, each one collecting the lower-level

6981

++ * entities belonging to the group that are acting on the same device.

6982

++ *

6983

++ * Locking works as follows:

6984

++ *    o @group_node is protected by the bfqio_cgroup lock, and is accessed

6985

++ *      via RCU from its readers.

6986

++ *    o @bfqd is protected by the queue lock, RCU is used to access it

6987

++ *      from the readers.

6988

++ *    o All the other fields are protected by the @bfqd queue lock.

6989

++ */

6990

++struct bfq_group {

6991

++	struct bfq_entity entity;

6992

++	struct bfq_sched_data sched_data;

6993

++

6994

++	struct hlist_node group_node;

6995

++	struct hlist_node bfqd_node;

6996

++

6997

++	void *bfqd;

6998

++

6999

++	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];

7000

++	struct bfq_queue *async_idle_bfqq;

7001

++

7002

++	struct bfq_entity *my_entity;

7003

++

7004

++	int active_entities;

7005

++};

7006

++

7007

++/**

7008

++ * struct bfqio_cgroup - bfq cgroup data structure.

7009

++ * @css: subsystem state for bfq in the containing cgroup.

7010

++ * @online: flag marked when the subsystem is inserted.

7011

++ * @weight: cgroup weight.

7012

++ * @ioprio: cgroup ioprio.

7013

++ * @ioprio_class: cgroup ioprio_class.

7014

++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.

7015

++ * @group_data: list containing the bfq_group belonging to this cgroup.

7016

++ *

7017

++ * @group_data is accessed using RCU, with @lock protecting the updates,

7018

++ * @ioprio and @ioprio_class are protected by @lock.

7019

++ */

7020

++struct bfqio_cgroup {

7021

++	struct cgroup_subsys_state css;

7022

++	bool online;

7023

++

7024

++	unsigned short weight, ioprio, ioprio_class;

7025

++

7026

++	spinlock_t lock;

7027

++	struct hlist_head group_data;

7028

++};

7029

++#else

7030

++struct bfq_group {

7031

++	struct bfq_sched_data sched_data;

7032

++

7033

++	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];

7034

++	struct bfq_queue *async_idle_bfqq;

7035

++};

7036

++#endif

7037

++

7038

++static inline struct bfq_service_tree *

7039

++bfq_entity_service_tree(struct bfq_entity *entity)

7040

++{

7041

++	struct bfq_sched_data *sched_data = entity->sched_data;

7042

++	unsigned int idx = entity->ioprio_class - 1;

7043

++

7044

++	BUG_ON(idx >= BFQ_IOPRIO_CLASSES);

7045

++	BUG_ON(sched_data == NULL);

7046

++

7047

++	return sched_data->service_tree + idx;

7048

++}

7049

++

7050

++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,

7051

++					    bool is_sync)

7052

++{

7053

++	return bic->bfqq[is_sync];

7054

++}

7055

++

7056

++static inline void bic_set_bfqq(struct bfq_io_cq *bic,

7057

++				struct bfq_queue *bfqq, bool is_sync)

7058

++{

7059

++	bic->bfqq[is_sync] = bfqq;

7060

++}

7061

++

7062

++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)

7063

++{

7064

++	return bic->icq.q->elevator->elevator_data;

7065

++}

7066

++

7067

++/**

7068

++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.

7069

++ * @ptr: a pointer to a bfqd.

7070

++ * @flags: storage for the flags to be saved.

7071

++ *

7072

++ * This function allows bfqg->bfqd to be protected by the

7073

++ * queue lock of the bfqd they reference; the pointer is dereferenced

7074

++ * under RCU, so the storage for bfqd is assured to be safe as long

7075

++ * as the RCU read side critical section does not end.  After the

7076

++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be

7077

++ * sure that no other writer accessed it.  If we raced with a writer,

7078

++ * the function returns NULL, with the queue unlocked, otherwise it

7079

++ * returns the dereferenced pointer, with the queue locked.

7080

++ */

7081

++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,

7082

++						   unsigned long *flags)

7083

++{

7084

++	struct bfq_data *bfqd;

7085

++

7086

++	rcu_read_lock();

7087

++	bfqd = rcu_dereference(*(struct bfq_data **)ptr);

7088

++

7089

++	if (bfqd != NULL) {

7090

++		spin_lock_irqsave(bfqd->queue->queue_lock, *flags);

7091

++		if (*ptr == bfqd)

7092

++			goto out;

7093

++		spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

7094

++	}

7095

++

7096

++	bfqd = NULL;

7097

++out:

7098

++	rcu_read_unlock();

7099

++	return bfqd;

7100

++}

7101

++

7102

++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,

7103

++				       unsigned long *flags)

7104

++{

7105

++	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

7106

++}

7107

++

7108

++static void bfq_changed_ioprio(struct bfq_io_cq *bic);

7109

++static void bfq_put_queue(struct bfq_queue *bfqq);

7110

++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);

7111

++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,

7112

++				       struct bfq_group *bfqg, int is_sync,

7113

++				       struct bfq_io_cq *bic, gfp_t gfp_mask);

7114

++static void bfq_end_wr_async_queues(struct bfq_data *bfqd,

7115

++				    struct bfq_group *bfqg);

7116

++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);

7117

++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);

7118

++

7119

++#endif /* _BFQ_H */

7120

+--

7121

+2.1.0

7122

+

7123

7124

diff --git a/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r7-for-4.0.0.patch b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r7-for-4.0.0.patch

7125

new file mode 100644

7126

index 0000000..53267cd

7127

--- /dev/null

7128

+++ b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r7-for-4.0.0.patch

7129

@@ -0,0 +1,1222 @@

7130

+From d49cf2e7913ec1c4b86a9de657140d9ec5fa8c19 Mon Sep 17 00:00:00 2001

7131

+From: Mauro Andreolini <mauro.andreolini@×××××××.it>

7132

+Date: Thu, 18 Dec 2014 21:32:08 +0100

7133

+Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r7 for

7134

+ 4.0.0

7135

+

7136

+A set of processes may happen  to  perform interleaved reads, i.e., requests

7137

+whose union would give rise to a  sequential read  pattern.  There are two

7138

+typical  cases: in the first  case,   processes  read  fixed-size chunks of

7139

+data at a fixed distance from each other, while in the second case processes

7140

+may read variable-size chunks at  variable distances. The latter case occurs

7141

+for  example with  QEMU, which  splits the  I/O generated  by the  guest into

7142

+multiple chunks,  and lets these chunks  be served by a  pool of cooperating

7143

+processes,  iteratively  assigning  the  next  chunk of  I/O  to  the first

7144

+available  process. CFQ  uses actual  queue merging  for the  first type of

7145

+processes, whereas it  uses preemption to get a sequential  read pattern out

7146

+of the read requests  performed by the second type of  processes. In the end

7147

+it uses  two different  mechanisms to  achieve the  same goal: boosting the

7148

+throughput with interleaved I/O.

7149

+

7150

+This patch introduces  Early Queue Merge (EQM), a unified mechanism to get a

7151

+sequential  read pattern  with both  types of  processes. The  main idea is

7152

+checking newly arrived requests against the next request of the active queue

7153

+both in case of actual request insert and in case of request merge. By doing

7154

+so, both the types of processes can be handled by just merging their queues.

7155

+EQM is  then simpler and  more compact than the  pair of mechanisms used in

7156

+CFQ.

7157

+

7158

+Finally, EQM  also preserves the  typical low-latency properties of BFQ, by

7159

+properly restoring the weight-raising state of a queue when it gets back to

7160

+a non-merged state.

7161

+

7162

+Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>

7163

+Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>

7164

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

7165

+---

7166

+ block/bfq-iosched.c | 751 +++++++++++++++++++++++++++++++++++++---------------

7167

+ block/bfq-sched.c   |  28 --

7168

+ block/bfq.h         |  54 +++-

7169

+ 3 files changed, 581 insertions(+), 252 deletions(-)

7170

+

7171

+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

7172

+index 97ee934..328f33c 100644

7173

+--- a/block/bfq-iosched.c

7174

++++ b/block/bfq-iosched.c

7175

+@@ -571,6 +571,57 @@ static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)

7176

+ 	return dur;

7177

+ }

7178

+

7179

++static inline unsigned

7180

++bfq_bfqq_cooperations(struct bfq_queue *bfqq)

7181

++{

7182

++	return bfqq->bic ? bfqq->bic->cooperations : 0;

7183

++}

7184

++

7185

++static inline void

7186

++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)

7187

++{

7188

++	if (bic->saved_idle_window)

7189

++		bfq_mark_bfqq_idle_window(bfqq);

7190

++	else

7191

++		bfq_clear_bfqq_idle_window(bfqq);

7192

++	if (bic->saved_IO_bound)

7193

++		bfq_mark_bfqq_IO_bound(bfqq);

7194

++	else

7195

++		bfq_clear_bfqq_IO_bound(bfqq);

7196

++	/* Assuming that the flag in_large_burst is already correctly set */

7197

++	if (bic->wr_time_left && bfqq->bfqd->low_latency &&

7198

++	    !bfq_bfqq_in_large_burst(bfqq) &&

7199

++	    bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {

7200

++		/*

7201

++		 * Start a weight raising period with the duration given by

7202

++		 * the wr_time_left snapshot.

7203

++		 */

7204

++		if (bfq_bfqq_busy(bfqq))

7205

++			bfqq->bfqd->wr_busy_queues++;

7206

++		bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;

7207

++		bfqq->wr_cur_max_time = bic->wr_time_left;

7208

++		bfqq->last_wr_start_finish = jiffies;

7209

++		bfqq->entity.ioprio_changed = 1;

7210

++	}

7211

++	/*

7212

++	 * Clear wr_time_left to prevent bfq_bfqq_save_state() from

7213

++	 * getting confused about the queue's need of a weight-raising

7214

++	 * period.

7215

++	 */

7216

++	bic->wr_time_left = 0;

7217

++}

7218

++

7219

++/* Must be called with the queue_lock held. */

7220

++static int bfqq_process_refs(struct bfq_queue *bfqq)

7221

++{

7222

++	int process_refs, io_refs;

7223

++

7224

++	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];

7225

++	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;

7226

++	BUG_ON(process_refs < 0);

7227

++	return process_refs;

7228

++}

7229

++

7230

+ /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */

7231

+ static inline void bfq_reset_burst_list(struct bfq_data *bfqd,

7232

+ 					struct bfq_queue *bfqq)

7233

+@@ -815,7 +866,7 @@ static void bfq_add_request(struct request *rq)

7234

+ 		bfq_rq_pos_tree_add(bfqd, bfqq);

7235

+

7236

+ 	if (!bfq_bfqq_busy(bfqq)) {

7237

+-		bool soft_rt,

7238

++		bool soft_rt, coop_or_in_burst,

7239

+ 		     idle_for_long_time = time_is_before_jiffies(

7240

+ 						bfqq->budget_timeout +

7241

+ 						bfqd->bfq_wr_min_idle_time);

7242

+@@ -839,11 +890,12 @@ static void bfq_add_request(struct request *rq)

7243

+ 				bfqd->last_ins_in_burst = jiffies;

7244

+ 		}

7245

+

7246

++		coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||

7247

++			bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;

7248

+ 		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&

7249

+-			!bfq_bfqq_in_large_burst(bfqq) &&

7250

++			!coop_or_in_burst &&

7251

+ 			time_is_before_jiffies(bfqq->soft_rt_next_start);

7252

+-		interactive = !bfq_bfqq_in_large_burst(bfqq) &&

7253

+-			      idle_for_long_time;

7254

++		interactive = !coop_or_in_burst && idle_for_long_time;

7255

+ 		entity->budget = max_t(unsigned long, bfqq->max_budget,

7256

+ 				       bfq_serv_to_charge(next_rq, bfqq));

7257

+

7258

+@@ -862,11 +914,20 @@ static void bfq_add_request(struct request *rq)

7259

+ 		if (!bfqd->low_latency)

7260

+ 			goto add_bfqq_busy;

7261

+

7262

++		if (bfq_bfqq_just_split(bfqq))

7263

++			goto set_ioprio_changed;

7264

++

7265

+ 		/*

7266

+-		 * If the queue is not being boosted and has been idle

7267

+-		 * for enough time, start a weight-raising period

7268

++		 * If the queue:

7269

++		 * - is not being boosted,

7270

++		 * - has been idle for enough time,

7271

++		 * - is not a sync queue or is linked to a bfq_io_cq (it is

7272

++		 *   shared "for its nature" or it is not shared and its

7273

++		 *   requests have not been redirected to a shared queue)

7274

++		 * start a weight-raising period.

7275

+ 		 */

7276

+-		if (old_wr_coeff == 1 && (interactive || soft_rt)) {

7277

++		if (old_wr_coeff == 1 && (interactive || soft_rt) &&

7278

++		    (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {

7279

+ 			bfqq->wr_coeff = bfqd->bfq_wr_coeff;

7280

+ 			if (interactive)

7281

+ 				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

7282

+@@ -880,7 +941,7 @@ static void bfq_add_request(struct request *rq)

7283

+ 		} else if (old_wr_coeff > 1) {

7284

+ 			if (interactive)

7285

+ 				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

7286

+-			else if (bfq_bfqq_in_large_burst(bfqq) ||

7287

++			else if (coop_or_in_burst ||

7288

+ 				 (bfqq->wr_cur_max_time ==

7289

+ 				  bfqd->bfq_wr_rt_max_time &&

7290

+ 				  !soft_rt)) {

7291

+@@ -899,18 +960,18 @@ static void bfq_add_request(struct request *rq)

7292

+ 				/*

7293

+ 				 *

7294

+ 				 * The remaining weight-raising time is lower

7295

+-				 * than bfqd->bfq_wr_rt_max_time, which

7296

+-				 * means that the application is enjoying

7297

+-				 * weight raising either because deemed soft-

7298

+-				 * rt in the near past, or because deemed

7299

+-				 * interactive a long ago. In both cases,

7300

+-				 * resetting now the current remaining weight-

7301

+-				 * raising time for the application to the

7302

+-				 * weight-raising duration for soft rt

7303

+-				 * applications would not cause any latency

7304

+-				 * increase for the application (as the new

7305

+-				 * duration would be higher than the remaining

7306

+-				 * time).

7307

++				 * than bfqd->bfq_wr_rt_max_time, which means

7308

++				 * that the application is enjoying weight

7309

++				 * raising either because deemed soft-rt in

7310

++				 * the near past, or because deemed interactive

7311

++				 * long ago.

7312

++				 * In both cases, resetting now the current

7313

++				 * remaining weight-raising time for the

7314

++				 * application to the weight-raising duration

7315

++				 * for soft rt applications would not cause any

7316

++				 * latency increase for the application (as the

7317

++				 * new duration would be higher than the

7318

++				 * remaining time).

7319

+ 				 *

7320

+ 				 * In addition, the application is now meeting

7321

+ 				 * the requirements for being deemed soft rt.

7322

+@@ -945,6 +1006,7 @@ static void bfq_add_request(struct request *rq)

7323

+ 					bfqd->bfq_wr_rt_max_time;

7324

+ 			}

7325

+ 		}

7326

++set_ioprio_changed:

7327

+ 		if (old_wr_coeff != bfqq->wr_coeff)

7328

+ 			entity->ioprio_changed = 1;

7329

+ add_bfqq_busy:

7330

+@@ -1156,90 +1218,35 @@ static void bfq_end_wr(struct bfq_data *bfqd)

7331

+ 	spin_unlock_irq(bfqd->queue->queue_lock);

7332

+ }

7333

+

7334

+-static int bfq_allow_merge(struct request_queue *q, struct request *rq,

7335

+-			   struct bio *bio)

7336

++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)

7337

+ {

7338

+-	struct bfq_data *bfqd = q->elevator->elevator_data;

7339

+-	struct bfq_io_cq *bic;

7340

+-	struct bfq_queue *bfqq;

7341

+-

7342

+-	/*

7343

+-	 * Disallow merge of a sync bio into an async request.

7344

+-	 */

7345

+-	if (bfq_bio_sync(bio) && !rq_is_sync(rq))

7346

+-		return 0;

7347

+-

7348

+-	/*

7349

+-	 * Lookup the bfqq that this bio will be queued with. Allow

7350

+-	 * merge only if rq is queued there.

7351

+-	 * Queue lock is held here.

7352

+-	 */

7353

+-	bic = bfq_bic_lookup(bfqd, current->io_context);

7354

+-	if (bic == NULL)

7355

+-		return 0;

7356

+-

7357

+-	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

7358

+-	return bfqq == RQ_BFQQ(rq);

7359

+-}

7360

+-

7361

+-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

7362

+-				       struct bfq_queue *bfqq)

7363

+-{

7364

+-	if (bfqq != NULL) {

7365

+-		bfq_mark_bfqq_must_alloc(bfqq);

7366

+-		bfq_mark_bfqq_budget_new(bfqq);

7367

+-		bfq_clear_bfqq_fifo_expire(bfqq);

7368

+-

7369

+-		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

7370

+-

7371

+-		bfq_log_bfqq(bfqd, bfqq,

7372

+-			     "set_in_service_queue, cur-budget = %lu",

7373

+-			     bfqq->entity.budget);

7374

+-	}

7375

+-

7376

+-	bfqd->in_service_queue = bfqq;

7377

+-}

7378

+-

7379

+-/*

7380

+- * Get and set a new queue for service.

7381

+- */

7382

+-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,

7383

+-						  struct bfq_queue *bfqq)

7384

+-{

7385

+-	if (!bfqq)

7386

+-		bfqq = bfq_get_next_queue(bfqd);

7387

++	if (request)

7388

++		return blk_rq_pos(io_struct);

7389

+ 	else

7390

+-		bfq_get_next_queue_forced(bfqd, bfqq);

7391

+-

7392

+-	__bfq_set_in_service_queue(bfqd, bfqq);

7393

+-	return bfqq;

7394

++		return ((struct bio *)io_struct)->bi_iter.bi_sector;

7395

+ }

7396

+

7397

+-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,

7398

+-					  struct request *rq)

7399

++static inline sector_t bfq_dist_from(sector_t pos1,

7400

++				     sector_t pos2)

7401

+ {

7402

+-	if (blk_rq_pos(rq) >= bfqd->last_position)

7403

+-		return blk_rq_pos(rq) - bfqd->last_position;

7404

++	if (pos1 >= pos2)

7405

++		return pos1 - pos2;

7406

+ 	else

7407

+-		return bfqd->last_position - blk_rq_pos(rq);

7408

++		return pos2 - pos1;

7409

+ }

7410

+

7411

+-/*

7412

+- * Return true if bfqq has no request pending and rq is close enough to

7413

+- * bfqd->last_position, or if rq is closer to bfqd->last_position than

7414

+- * bfqq->next_rq

7415

+- */

7416

+-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)

7417

++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,

7418

++					 sector_t sector)

7419

+ {

7420

+-	return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;

7421

++	return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=

7422

++	       BFQQ_SEEK_THR;

7423

+ }

7424

+

7425

+-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

7426

++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)

7427

+ {

7428

+ 	struct rb_root *root = &bfqd->rq_pos_tree;

7429

+ 	struct rb_node *parent, *node;

7430

+ 	struct bfq_queue *__bfqq;

7431

+-	sector_t sector = bfqd->last_position;

7432

+

7433

+ 	if (RB_EMPTY_ROOT(root))

7434

+ 		return NULL;

7435

+@@ -1258,7 +1265,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

7436

+ 	 * next_request position).

7437

+ 	 */

7438

+ 	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);

7439

+-	if (bfq_rq_close(bfqd, __bfqq->next_rq))

7440

++	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))

7441

+ 		return __bfqq;

7442

+

7443

+ 	if (blk_rq_pos(__bfqq->next_rq) < sector)

7444

+@@ -1269,7 +1276,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

7445

+ 		return NULL;

7446

+

7447

+ 	__bfqq = rb_entry(node, struct bfq_queue, pos_node);

7448

+-	if (bfq_rq_close(bfqd, __bfqq->next_rq))

7449

++	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))

7450

+ 		return __bfqq;

7451

+

7452

+ 	return NULL;

7453

+@@ -1278,14 +1285,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

7454

+ /*

7455

+  * bfqd - obvious

7456

+  * cur_bfqq - passed in so that we don't decide that the current queue

7457

+- *            is closely cooperating with itself.

7458

+- *

7459

+- * We are assuming that cur_bfqq has dispatched at least one request,

7460

+- * and that bfqd->last_position reflects a position on the disk associated

7461

+- * with the I/O issued by cur_bfqq.

7462

++ *            is closely cooperating with itself

7463

++ * sector - used as a reference point to search for a close queue

7464

+  */

7465

+ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,

7466

+-					      struct bfq_queue *cur_bfqq)

7467

++					      struct bfq_queue *cur_bfqq,

7468

++					      sector_t sector)

7469

+ {

7470

+ 	struct bfq_queue *bfqq;

7471

+

7472

+@@ -1305,7 +1310,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,

7473

+ 	 * working closely on the same area of the disk. In that case,

7474

+ 	 * we can group them together and don't waste time idling.

7475

+ 	 */

7476

+-	bfqq = bfqq_close(bfqd);

7477

++	bfqq = bfqq_close(bfqd, sector);

7478

+ 	if (bfqq == NULL || bfqq == cur_bfqq)

7479

+ 		return NULL;

7480

+

7481

+@@ -1332,6 +1337,315 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,

7482

+ 	return bfqq;

7483

+ }

7484

+

7485

++static struct bfq_queue *

7486

++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

7487

++{

7488

++	int process_refs, new_process_refs;

7489

++	struct bfq_queue *__bfqq;

7490

++

7491

++	/*

7492

++	 * If there are no process references on the new_bfqq, then it is

7493

++	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain

7494

++	 * may have dropped their last reference (not just their last process

7495

++	 * reference).

7496

++	 */

7497

++	if (!bfqq_process_refs(new_bfqq))

7498

++		return NULL;

7499

++

7500

++	/* Avoid a circular list and skip interim queue merges. */

7501

++	while ((__bfqq = new_bfqq->new_bfqq)) {

7502

++		if (__bfqq == bfqq)

7503

++			return NULL;

7504

++		new_bfqq = __bfqq;

7505

++	}

7506

++

7507

++	process_refs = bfqq_process_refs(bfqq);

7508

++	new_process_refs = bfqq_process_refs(new_bfqq);

7509

++	/*

7510

++	 * If the process for the bfqq has gone away, there is no

7511

++	 * sense in merging the queues.

7512

++	 */

7513

++	if (process_refs == 0 || new_process_refs == 0)

7514

++		return NULL;

7515

++

7516

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",

7517

++		new_bfqq->pid);

7518

++

7519

++	/*

7520

++	 * Merging is just a redirection: the requests of the process

7521

++	 * owning one of the two queues are redirected to the other queue.

7522

++	 * The latter queue, in its turn, is set as shared if this is the

7523

++	 * first time that the requests of some process are redirected to

7524

++	 * it.

7525

++	 *

7526

++	 * We redirect bfqq to new_bfqq and not the opposite, because we

7527

++	 * are in the context of the process owning bfqq, hence we have

7528

++	 * the io_cq of this process. So we can immediately configure this

7529

++	 * io_cq to redirect the requests of the process to new_bfqq.

7530

++	 *

7531

++	 * NOTE, even if new_bfqq coincides with the in-service queue, the

7532

++	 * io_cq of new_bfqq is not available, because, if the in-service

7533

++	 * queue is shared, bfqd->in_service_bic may not point to the

7534

++	 * io_cq of the in-service queue.

7535

++	 * Redirecting the requests of the process owning bfqq to the

7536

++	 * currently in-service queue is in any case the best option, as

7537

++	 * we feed the in-service queue with new requests close to the

7538

++	 * last request served and, by doing so, hopefully increase the

7539

++	 * throughput.

7540

++	 */

7541

++	bfqq->new_bfqq = new_bfqq;

7542

++	atomic_add(process_refs, &new_bfqq->ref);

7543

++	return new_bfqq;

7544

++}

7545

++

7546

++/*

7547

++ * Attempt to schedule a merge of bfqq with the currently in-service queue

7548

++ * or with a close queue among the scheduled queues.

7549

++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue

7550

++ * structure otherwise.

7551

++ *

7552

++ * The OOM queue is not allowed to participate in cooperation: in fact, since

7553

++ * the requests temporarily redirected to the OOM queue could be redirected

7554

++ * again to dedicated queues at any time, the state needed to correctly

7555

++ * handle merging with the OOM queue would be quite complex and expensive

7556

++ * to maintain. Besides, in such a critical condition as an out of memory,

7557

++ * the benefits of queue merging may be little relevant, or even negligible.

7558

++ */

7559

++static struct bfq_queue *

7560

++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,

7561

++		     void *io_struct, bool request)

7562

++{

7563

++	struct bfq_queue *in_service_bfqq, *new_bfqq;

7564

++

7565

++	if (bfqq->new_bfqq)

7566

++		return bfqq->new_bfqq;

7567

++

7568

++	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))

7569

++		return NULL;

7570

++

7571

++	in_service_bfqq = bfqd->in_service_queue;

7572

++

7573

++	if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||

7574

++	    !bfqd->in_service_bic ||

7575

++	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))

7576

++		goto check_scheduled;

7577

++

7578

++	if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))

7579

++		goto check_scheduled;

7580

++

7581

++	if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))

7582

++		goto check_scheduled;

7583

++

7584

++	if (in_service_bfqq->entity.parent != bfqq->entity.parent)

7585

++		goto check_scheduled;

7586

++

7587

++	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&

7588

++	    bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {

7589

++		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);

7590

++		if (new_bfqq != NULL)

7591

++			return new_bfqq; /* Merge with in-service queue */

7592

++	}

7593

++

7594

++	/*

7595

++	 * Check whether there is a cooperator among currently scheduled

7596

++	 * queues. The only thing we need is that the bio/request is not

7597

++	 * NULL, as we need it to establish whether a cooperator exists.

7598

++	 */

7599

++check_scheduled:

7600

++	new_bfqq = bfq_close_cooperator(bfqd, bfqq,

7601

++					bfq_io_struct_pos(io_struct, request));

7602

++	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq))

7603

++		return bfq_setup_merge(bfqq, new_bfqq);

7604

++

7605

++	return NULL;

7606

++}

7607

++

7608

++static inline void

7609

++bfq_bfqq_save_state(struct bfq_queue *bfqq)

7610

++{

7611

++	/*

7612

++	 * If bfqq->bic == NULL, the queue is already shared or its requests

7613

++	 * have already been redirected to a shared queue; both idle window

7614

++	 * and weight raising state have already been saved. Do nothing.

7615

++	 */

7616

++	if (bfqq->bic == NULL)

7617

++		return;

7618

++	if (bfqq->bic->wr_time_left)

7619

++		/*

7620

++		 * This is the queue of a just-started process, and would

7621

++		 * deserve weight raising: we set wr_time_left to the full

7622

++		 * weight-raising duration to trigger weight-raising when

7623

++		 * and if the queue is split and the first request of the

7624

++		 * queue is enqueued.

7625

++		 */

7626

++		bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd);

7627

++	else if (bfqq->wr_coeff > 1) {

7628

++		unsigned long wr_duration =

7629

++			jiffies - bfqq->last_wr_start_finish;

7630

++		/*

7631

++		 * It may happen that a queue's weight raising period lasts

7632

++		 * longer than its wr_cur_max_time, as weight raising is

7633

++		 * handled only when a request is enqueued or dispatched (it

7634

++		 * does not use any timer). If the weight raising period is

7635

++		 * about to end, don't save it.

7636

++		 */

7637

++		if (bfqq->wr_cur_max_time <= wr_duration)

7638

++			bfqq->bic->wr_time_left = 0;

7639

++		else

7640

++			bfqq->bic->wr_time_left =

7641

++				bfqq->wr_cur_max_time - wr_duration;

7642

++		/*

7643

++		 * The bfq_queue is becoming shared or the requests of the

7644

++		 * process owning the queue are being redirected to a shared

7645

++		 * queue. Stop the weight raising period of the queue, as in

7646

++		 * both cases it should not be owned by an interactive or

7647

++		 * soft real-time application.

7648

++		 */

7649

++		bfq_bfqq_end_wr(bfqq);

7650

++	} else

7651

++		bfqq->bic->wr_time_left = 0;

7652

++	bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);

7653

++	bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);

7654

++	bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);

7655

++	bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);

7656

++	bfqq->bic->cooperations++;

7657

++	bfqq->bic->failed_cooperations = 0;

7658

++}

7659

++

7660

++static inline void

7661

++bfq_get_bic_reference(struct bfq_queue *bfqq)

7662

++{

7663

++	/*

7664

++	 * If bfqq->bic has a non-NULL value, the bic to which it belongs

7665

++	 * is about to begin using a shared bfq_queue.

7666

++	 */

7667

++	if (bfqq->bic)

7668

++		atomic_long_inc(&bfqq->bic->icq.ioc->refcount);

7669

++}

7670

++

7671

++static void

7672

++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

7673

++		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

7674

++{

7675

++	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",

7676

++		(long unsigned)new_bfqq->pid);

7677

++	/* Save weight raising and idle window of the merged queues */

7678

++	bfq_bfqq_save_state(bfqq);

7679

++	bfq_bfqq_save_state(new_bfqq);

7680

++	if (bfq_bfqq_IO_bound(bfqq))

7681

++		bfq_mark_bfqq_IO_bound(new_bfqq);

7682

++	bfq_clear_bfqq_IO_bound(bfqq);

7683

++	/*

7684

++	 * Grab a reference to the bic, to prevent it from being destroyed

7685

++	 * before being possibly touched by a bfq_split_bfqq().

7686

++	 */

7687

++	bfq_get_bic_reference(bfqq);

7688

++	bfq_get_bic_reference(new_bfqq);

7689

++	/*

7690

++	 * Merge queues (that is, let bic redirect its requests to new_bfqq)

7691

++	 */

7692

++	bic_set_bfqq(bic, new_bfqq, 1);

7693

++	bfq_mark_bfqq_coop(new_bfqq);

7694

++	/*

7695

++	 * new_bfqq now belongs to at least two bics (it is a shared queue):

7696

++	 * set new_bfqq->bic to NULL. bfqq either:

7697

++	 * - does not belong to any bic any more, and hence bfqq->bic must

7698

++	 *   be set to NULL, or

7699

++	 * - is a queue whose owning bics have already been redirected to a

7700

++	 *   different queue, hence the queue is destined to not belong to

7701

++	 *   any bic soon and bfqq->bic is already NULL (therefore the next

7702

++	 *   assignment causes no harm).

7703

++	 */

7704

++	new_bfqq->bic = NULL;

7705

++	bfqq->bic = NULL;

7706

++	bfq_put_queue(bfqq);

7707

++}

7708

++

7709

++static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)

7710

++{

7711

++	struct bfq_io_cq *bic = bfqq->bic;

7712

++	struct bfq_data *bfqd = bfqq->bfqd;

7713

++

7714

++	if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) {

7715

++		bic->failed_cooperations++;

7716

++		if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations)

7717

++			bic->cooperations = 0;

7718

++	}

7719

++}

7720

++

7721

++static int bfq_allow_merge(struct request_queue *q, struct request *rq,

7722

++			   struct bio *bio)

7723

++{

7724

++	struct bfq_data *bfqd = q->elevator->elevator_data;

7725

++	struct bfq_io_cq *bic;

7726

++	struct bfq_queue *bfqq, *new_bfqq;

7727

++

7728

++	/*

7729

++	 * Disallow merge of a sync bio into an async request.

7730

++	 */

7731

++	if (bfq_bio_sync(bio) && !rq_is_sync(rq))

7732

++		return 0;

7733

++

7734

++	/*

7735

++	 * Lookup the bfqq that this bio will be queued with. Allow

7736

++	 * merge only if rq is queued there.

7737

++	 * Queue lock is held here.

7738

++	 */

7739

++	bic = bfq_bic_lookup(bfqd, current->io_context);

7740

++	if (bic == NULL)

7741

++		return 0;

7742

++

7743

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

7744

++	/*

7745

++	 * We take advantage of this function to perform an early merge

7746

++	 * of the queues of possible cooperating processes.

7747

++	 */

7748

++	if (bfqq != NULL) {

7749

++		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);

7750

++		if (new_bfqq != NULL) {

7751

++			bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);

7752

++			/*

7753

++			 * If we get here, the bio will be queued in the

7754

++			 * shared queue, i.e., new_bfqq, so use new_bfqq

7755

++			 * to decide whether bio and rq can be merged.

7756

++			 */

7757

++			bfqq = new_bfqq;

7758

++		} else

7759

++			bfq_bfqq_increase_failed_cooperations(bfqq);

7760

++	}

7761

++

7762

++	return bfqq == RQ_BFQQ(rq);

7763

++}

7764

++

7765

++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

7766

++				       struct bfq_queue *bfqq)

7767

++{

7768

++	if (bfqq != NULL) {

7769

++		bfq_mark_bfqq_must_alloc(bfqq);

7770

++		bfq_mark_bfqq_budget_new(bfqq);

7771

++		bfq_clear_bfqq_fifo_expire(bfqq);

7772

++

7773

++		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

7774

++

7775

++		bfq_log_bfqq(bfqd, bfqq,

7776

++			     "set_in_service_queue, cur-budget = %lu",

7777

++			     bfqq->entity.budget);

7778

++	}

7779

++

7780

++	bfqd->in_service_queue = bfqq;

7781

++}

7782

++

7783

++/*

7784

++ * Get and set a new queue for service.

7785

++ */

7786

++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)

7787

++{

7788

++	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);

7789

++

7790

++	__bfq_set_in_service_queue(bfqd, bfqq);

7791

++	return bfqq;

7792

++}

7793

++

7794

+ /*

7795

+  * If enough samples have been computed, return the current max budget

7796

+  * stored in bfqd, which is dynamically updated according to the

7797

+@@ -1475,61 +1789,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)

7798

+ 	return rq;

7799

+ }

7800

+

7801

+-/* Must be called with the queue_lock held. */

7802

+-static int bfqq_process_refs(struct bfq_queue *bfqq)

7803

+-{

7804

+-	int process_refs, io_refs;

7805

+-

7806

+-	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];

7807

+-	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;

7808

+-	BUG_ON(process_refs < 0);

7809

+-	return process_refs;

7810

+-}

7811

+-

7812

+-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

7813

+-{

7814

+-	int process_refs, new_process_refs;

7815

+-	struct bfq_queue *__bfqq;

7816

+-

7817

+-	/*

7818

+-	 * If there are no process references on the new_bfqq, then it is

7819

+-	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain

7820

+-	 * may have dropped their last reference (not just their last process

7821

+-	 * reference).

7822

+-	 */

7823

+-	if (!bfqq_process_refs(new_bfqq))

7824

+-		return;

7825

+-

7826

+-	/* Avoid a circular list and skip interim queue merges. */

7827

+-	while ((__bfqq = new_bfqq->new_bfqq)) {

7828

+-		if (__bfqq == bfqq)

7829

+-			return;

7830

+-		new_bfqq = __bfqq;

7831

+-	}

7832

+-

7833

+-	process_refs = bfqq_process_refs(bfqq);

7834

+-	new_process_refs = bfqq_process_refs(new_bfqq);

7835

+-	/*

7836

+-	 * If the process for the bfqq has gone away, there is no

7837

+-	 * sense in merging the queues.

7838

+-	 */

7839

+-	if (process_refs == 0 || new_process_refs == 0)

7840

+-		return;

7841

+-

7842

+-	/*

7843

+-	 * Merge in the direction of the lesser amount of work.

7844

+-	 */

7845

+-	if (new_process_refs >= process_refs) {

7846

+-		bfqq->new_bfqq = new_bfqq;

7847

+-		atomic_add(process_refs, &new_bfqq->ref);

7848

+-	} else {

7849

+-		new_bfqq->new_bfqq = bfqq;

7850

+-		atomic_add(new_process_refs, &bfqq->ref);

7851

+-	}

7852

+-	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",

7853

+-		new_bfqq->pid);

7854

+-}

7855

+-

7856

+ static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)

7857

+ {

7858

+ 	struct bfq_entity *entity = &bfqq->entity;

7859

+@@ -2263,7 +2522,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)

7860

+  */

7861

+ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

7862

+ {

7863

+-	struct bfq_queue *bfqq, *new_bfqq = NULL;

7864

++	struct bfq_queue *bfqq;

7865

+ 	struct request *next_rq;

7866

+ 	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;

7867

+

7868

+@@ -2273,17 +2532,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

7869

+

7870

+ 	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

7871

+

7872

+-	/*

7873

+-         * If another queue has a request waiting within our mean seek

7874

+-         * distance, let it run. The expire code will check for close

7875

+-         * cooperators and put the close queue at the front of the

7876

+-         * service tree. If possible, merge the expiring queue with the

7877

+-         * new bfqq.

7878

+-         */

7879

+-        new_bfqq = bfq_close_cooperator(bfqd, bfqq);

7880

+-        if (new_bfqq != NULL && bfqq->new_bfqq == NULL)

7881

+-                bfq_setup_merge(bfqq, new_bfqq);

7882

+-

7883

+ 	if (bfq_may_expire_for_budg_timeout(bfqq) &&

7884

+ 	    !timer_pending(&bfqd->idle_slice_timer) &&

7885

+ 	    !bfq_bfqq_must_idle(bfqq))

7886

+@@ -2322,10 +2570,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

7887

+ 				bfq_clear_bfqq_wait_request(bfqq);

7888

+ 				del_timer(&bfqd->idle_slice_timer);

7889

+ 			}

7890

+-			if (new_bfqq == NULL)

7891

+-				goto keep_queue;

7892

+-			else

7893

+-				goto expire;

7894

++			goto keep_queue;

7895

+ 		}

7896

+ 	}

7897

+

7898

+@@ -2334,40 +2579,30 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

7899

+ 	 * in flight (possibly waiting for a completion) or is idling for a

7900

+ 	 * new request, then keep it.

7901

+ 	 */

7902

+-	if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||

7903

+-	    (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {

7904

++	if (timer_pending(&bfqd->idle_slice_timer) ||

7905

++	    (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {

7906

+ 		bfqq = NULL;

7907

+ 		goto keep_queue;

7908

+-	} else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {

7909

+-		/*

7910

+-		 * Expiring the queue because there is a close cooperator,

7911

+-		 * cancel timer.

7912

+-		 */

7913

+-		bfq_clear_bfqq_wait_request(bfqq);

7914

+-		del_timer(&bfqd->idle_slice_timer);

7915

+ 	}

7916

+

7917

+ 	reason = BFQ_BFQQ_NO_MORE_REQUESTS;

7918

+ expire:

7919

+ 	bfq_bfqq_expire(bfqd, bfqq, 0, reason);

7920

+ new_queue:

7921

+-	bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);

7922

++	bfqq = bfq_set_in_service_queue(bfqd);

7923

+ 	bfq_log(bfqd, "select_queue: new queue %d returned",

7924

+ 		bfqq != NULL ? bfqq->pid : 0);

7925

+ keep_queue:

7926

+ 	return bfqq;

7927

+ }

7928

+

7929

+-static void bfq_update_wr_data(struct bfq_data *bfqd,

7930

+-			       struct bfq_queue *bfqq)

7931

++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7932

+ {

7933

+-	if (bfqq->wr_coeff > 1) { /* queue is being boosted */

7934

+-		struct bfq_entity *entity = &bfqq->entity;

7935

+-

7936

++	struct bfq_entity *entity = &bfqq->entity;

7937

++	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */

7938

+ 		bfq_log_bfqq(bfqd, bfqq,

7939

+ 			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",

7940

+-			jiffies_to_msecs(jiffies -

7941

+-				bfqq->last_wr_start_finish),

7942

++			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),

7943

+ 			jiffies_to_msecs(bfqq->wr_cur_max_time),

7944

+ 			bfqq->wr_coeff,

7945

+ 			bfqq->entity.weight, bfqq->entity.orig_weight);

7946

+@@ -2376,12 +2611,16 @@ static void bfq_update_wr_data(struct bfq_data *bfqd,

7947

+ 		       entity->orig_weight * bfqq->wr_coeff);

7948

+ 		if (entity->ioprio_changed)

7949

+ 			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

7950

++

7951

+ 		/*

7952

+ 		 * If the queue was activated in a burst, or

7953

+ 		 * too much time has elapsed from the beginning

7954

+-		 * of this weight-raising, then end weight raising.

7955

++		 * of this weight-raising period, or the queue has

7956

++		 * exceeded the acceptable number of cooperations,

7957

++		 * then end weight raising.

7958

+ 		 */

7959

+ 		if (bfq_bfqq_in_large_burst(bfqq) ||

7960

++		    bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||

7961

+ 		    time_is_before_jiffies(bfqq->last_wr_start_finish +

7962

+ 					   bfqq->wr_cur_max_time)) {

7963

+ 			bfqq->last_wr_start_finish = jiffies;

7964

+@@ -2390,11 +2629,13 @@ static void bfq_update_wr_data(struct bfq_data *bfqd,

7965

+ 				     bfqq->last_wr_start_finish,

7966

+ 				     jiffies_to_msecs(bfqq->wr_cur_max_time));

7967

+ 			bfq_bfqq_end_wr(bfqq);

7968

+-			__bfq_entity_update_weight_prio(

7969

+-				bfq_entity_service_tree(entity),

7970

+-				entity);

7971

+ 		}

7972

+ 	}

7973

++	/* Update weight both if it must be raised and if it must be lowered */

7974

++	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))

7975

++		__bfq_entity_update_weight_prio(

7976

++			bfq_entity_service_tree(entity),

7977

++			entity);

7978

+ }

7979

+

7980

+ /*

7981

+@@ -2642,6 +2883,25 @@ static inline void bfq_init_icq(struct io_cq *icq)

7982

+ 	struct bfq_io_cq *bic = icq_to_bic(icq);

7983

+

7984

+ 	bic->ttime.last_end_request = jiffies;

7985

++	/*

7986

++	 * A newly created bic indicates that the process has just

7987

++	 * started doing I/O, and is probably mapping into memory its

7988

++	 * executable and libraries: it definitely needs weight raising.

7989

++	 * There is however the possibility that the process performs,

7990

++	 * for a while, I/O close to some other process. EQM intercepts

7991

++	 * this behavior and may merge the queue corresponding to the

7992

++	 * process with some other queue, BEFORE the weight of the queue

7993

++	 * is raised. Merged queues are not weight-raised (they are assumed

7994

++	 * to belong to processes that benefit only from high throughput).

7995

++	 * If the merge is basically the consequence of an accident, then

7996

++	 * the queue will be split soon and will get back its old weight.

7997

++	 * It is then important to write down somewhere that this queue

7998

++	 * does need weight raising, even if it did not make it to get its

7999

++	 * weight raised before being merged. To this purpose, we overload

8000

++	 * the field wr_time_left and assign 1 to it, to mark the queue

8001

++	 * as needing weight raising.

8002

++	 */

8003

++	bic->wr_time_left = 1;

8004

+ }

8005

+

8006

+ static void bfq_exit_icq(struct io_cq *icq)

8007

+@@ -2655,6 +2915,13 @@ static void bfq_exit_icq(struct io_cq *icq)

8008

+ 	}

8009

+

8010

+ 	if (bic->bfqq[BLK_RW_SYNC]) {

8011

++		/*

8012

++		 * If the bic is using a shared queue, put the reference

8013

++		 * taken on the io_context when the bic started using a

8014

++		 * shared bfq_queue.

8015

++		 */

8016

++		if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))

8017

++			put_io_context(icq->ioc);

8018

+ 		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);

8019

+ 		bic->bfqq[BLK_RW_SYNC] = NULL;

8020

+ 	}

8021

+@@ -2950,6 +3217,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,

8022

+ 	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))

8023

+ 		return;

8024

+

8025

++	/* Idle window just restored, statistics are meaningless. */

8026

++	if (bfq_bfqq_just_split(bfqq))

8027

++		return;

8028

++

8029

+ 	enable_idle = bfq_bfqq_idle_window(bfqq);

8030

+

8031

+ 	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||

8032

+@@ -2997,6 +3268,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

8033

+ 	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||

8034

+ 	    !BFQQ_SEEKY(bfqq))

8035

+ 		bfq_update_idle_window(bfqd, bfqq, bic);

8036

++	bfq_clear_bfqq_just_split(bfqq);

8037

+

8038

+ 	bfq_log_bfqq(bfqd, bfqq,

8039

+ 		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",

8040

+@@ -3057,13 +3329,49 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

8041

+ static void bfq_insert_request(struct request_queue *q, struct request *rq)

8042

+ {

8043

+ 	struct bfq_data *bfqd = q->elevator->elevator_data;

8044

+-	struct bfq_queue *bfqq = RQ_BFQQ(rq);

8045

++	struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;

8046

+

8047

+ 	assert_spin_locked(bfqd->queue->queue_lock);

8048

++

8049

++	/*

8050

++	 * An unplug may trigger a requeue of a request from the device

8051

++	 * driver: make sure we are in process context while trying to

8052

++	 * merge two bfq_queues.

8053

++	 */

8054

++	if (!in_interrupt()) {

8055

++		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);

8056

++		if (new_bfqq != NULL) {

8057

++			if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)

8058

++				new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);

8059

++			/*

8060

++			 * Release the request's reference to the old bfqq

8061

++			 * and make sure one is taken to the shared queue.

8062

++			 */

8063

++			new_bfqq->allocated[rq_data_dir(rq)]++;

8064

++			bfqq->allocated[rq_data_dir(rq)]--;

8065

++			atomic_inc(&new_bfqq->ref);

8066

++			bfq_put_queue(bfqq);

8067

++			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)

8068

++				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),

8069

++						bfqq, new_bfqq);

8070

++			rq->elv.priv[1] = new_bfqq;

8071

++			bfqq = new_bfqq;

8072

++		} else

8073

++			bfq_bfqq_increase_failed_cooperations(bfqq);

8074

++	}

8075

++

8076

+ 	bfq_init_prio_data(bfqq, RQ_BIC(rq));

8077

+

8078

+ 	bfq_add_request(rq);

8079

+

8080

++	/*

8081

++	 * Here a newly-created bfq_queue has already started a weight-raising

8082

++	 * period: clear wr_time_left to prevent bfq_bfqq_save_state()

8083

++	 * from assigning it a full weight-raising period. See the detailed

8084

++	 * comments about this field in bfq_init_icq().

8085

++	 */

8086

++	if (bfqq->bic != NULL)

8087

++		bfqq->bic->wr_time_left = 0;

8088

+ 	rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];

8089

+ 	list_add_tail(&rq->queuelist, &bfqq->fifo);

8090

+

8091

+@@ -3228,18 +3536,6 @@ static void bfq_put_request(struct request *rq)

8092

+ 	}

8093

+ }

8094

+

8095

+-static struct bfq_queue *

8096

+-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

8097

+-		struct bfq_queue *bfqq)

8098

+-{

8099

+-	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",

8100

+-		(long unsigned)bfqq->new_bfqq->pid);

8101

+-	bic_set_bfqq(bic, bfqq->new_bfqq, 1);

8102

+-	bfq_mark_bfqq_coop(bfqq->new_bfqq);

8103

+-	bfq_put_queue(bfqq);

8104

+-	return bic_to_bfqq(bic, 1);

8105

+-}

8106

+-

8107

+ /*

8108

+  * Returns NULL if a new bfqq should be allocated, or the old bfqq if this

8109

+  * was the last process referring to said bfqq.

8110

+@@ -3248,6 +3544,9 @@ static struct bfq_queue *

8111

+ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)

8112

+ {

8113

+ 	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

8114

++

8115

++	put_io_context(bic->icq.ioc);

8116

++

8117

+ 	if (bfqq_process_refs(bfqq) == 1) {

8118

+ 		bfqq->pid = current->pid;

8119

+ 		bfq_clear_bfqq_coop(bfqq);

8120

+@@ -3276,6 +3575,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,

8121

+ 	struct bfq_queue *bfqq;

8122

+ 	struct bfq_group *bfqg;

8123

+ 	unsigned long flags;

8124

++	bool split = false;

8125

+

8126

+ 	might_sleep_if(gfp_mask & __GFP_WAIT);

8127

+

8128

+@@ -3293,25 +3593,26 @@ new_queue:

8129

+ 	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {

8130

+ 		bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);

8131

+ 		bic_set_bfqq(bic, bfqq, is_sync);

8132

++		if (split && is_sync) {

8133

++			if ((bic->was_in_burst_list && bfqd->large_burst) ||

8134

++			    bic->saved_in_large_burst)

8135

++				bfq_mark_bfqq_in_large_burst(bfqq);

8136

++			else {

8137

++			    bfq_clear_bfqq_in_large_burst(bfqq);

8138

++			    if (bic->was_in_burst_list)

8139

++			       hlist_add_head(&bfqq->burst_list_node,

8140

++				              &bfqd->burst_list);

8141

++			}

8142

++		}

8143

+ 	} else {

8144

+-		/*

8145

+-		 * If the queue was seeky for too long, break it apart.

8146

+-		 */

8147

++		/* If the queue was seeky for too long, break it apart. */

8148

+ 		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {

8149

+ 			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");

8150

+ 			bfqq = bfq_split_bfqq(bic, bfqq);

8151

++			split = true;

8152

+ 			if (!bfqq)

8153

+ 				goto new_queue;

8154

+ 		}

8155

+-

8156

+-		/*

8157

+-		 * Check to see if this queue is scheduled to merge with

8158

+-		 * another closely cooperating queue. The merging of queues

8159

+-		 * happens here as it must be done in process context.

8160

+-		 * The reference on new_bfqq was taken in merge_bfqqs.

8161

+-		 */

8162

+-		if (bfqq->new_bfqq != NULL)

8163

+-			bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);

8164

+ 	}

8165

+

8166

+ 	bfqq->allocated[rw]++;

8167

+@@ -3322,6 +3623,26 @@ new_queue:

8168

+ 	rq->elv.priv[0] = bic;

8169

+ 	rq->elv.priv[1] = bfqq;

8170

+

8171

++	/*

8172

++	 * If a bfq_queue has only one process reference, it is owned

8173

++	 * by only one bfq_io_cq: we can set the bic field of the

8174

++	 * bfq_queue to the address of that structure. Also, if the

8175

++	 * queue has just been split, mark a flag so that the

8176

++	 * information is available to the other scheduler hooks.

8177

++	 */

8178

++	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {

8179

++		bfqq->bic = bic;

8180

++		if (split) {

8181

++			bfq_mark_bfqq_just_split(bfqq);

8182

++			/*

8183

++			 * If the queue has just been split from a shared

8184

++			 * queue, restore the idle window and the possible

8185

++			 * weight raising period.

8186

++			 */

8187

++			bfq_bfqq_resume_state(bfqq, bic);

8188

++		}

8189

++	}

8190

++

8191

+ 	spin_unlock_irqrestore(q->queue_lock, flags);

8192

+

8193

+ 	return 0;

8194

+diff --git a/block/bfq-sched.c b/block/bfq-sched.c

8195

+index 2931563..6764a7e 100644

8196

+--- a/block/bfq-sched.c

8197

++++ b/block/bfq-sched.c

8198

+@@ -1091,34 +1091,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)

8199

+ 	return bfqq;

8200

+ }

8201

+

8202

+-/*

8203

+- * Forced extraction of the given queue.

8204

+- */

8205

+-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,

8206

+-				      struct bfq_queue *bfqq)

8207

+-{

8208

+-	struct bfq_entity *entity;

8209

+-	struct bfq_sched_data *sd;

8210

+-

8211

+-	BUG_ON(bfqd->in_service_queue != NULL);

8212

+-

8213

+-	entity = &bfqq->entity;

8214

+-	/*

8215

+-	 * Bubble up extraction/update from the leaf to the root.

8216

+-	*/

8217

+-	for_each_entity(entity) {

8218

+-		sd = entity->sched_data;

8219

+-		bfq_update_budget(entity);

8220

+-		bfq_update_vtime(bfq_entity_service_tree(entity));

8221

+-		bfq_active_extract(bfq_entity_service_tree(entity), entity);

8222

+-		sd->in_service_entity = entity;

8223

+-		sd->next_in_service = NULL;

8224

+-		entity->service = 0;

8225

+-	}

8226

+-

8227

+-	return;

8228

+-}

8229

+-

8230

+ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)

8231

+ {

8232

+ 	if (bfqd->in_service_bic != NULL) {

8233

+diff --git a/block/bfq.h b/block/bfq.h

8234

+index 518f2ac..4f519ea 100644

8235

+--- a/block/bfq.h

8236

++++ b/block/bfq.h

8237

+@@ -218,18 +218,21 @@ struct bfq_group;

8238

+  *                      idle @bfq_queue with no outstanding requests, then

8239

+  *                      the task associated with the queue it is deemed as

8240

+  *                      soft real-time (see the comments to the function

8241

+- *                      bfq_bfqq_softrt_next_start()).

8242

++ *                      bfq_bfqq_softrt_next_start())

8243

+  * @last_idle_bklogged: time of the last transition of the @bfq_queue from

8244

+  *                      idle to backlogged

8245

+  * @service_from_backlogged: cumulative service received from the @bfq_queue

8246

+  *                           since the last transition from idle to

8247

+  *                           backlogged

8248

++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the

8249

++ *	 queue is shared

8250

+  *

8251

+- * A bfq_queue is a leaf request queue; it can be associated with an io_context

8252

+- * or more, if it is async or shared between cooperating processes. @cgroup

8253

+- * holds a reference to the cgroup, to be sure that it does not disappear while

8254

+- * a bfqq still references it (mostly to avoid races between request issuing and

8255

+- * task migration followed by cgroup destruction).

8256

++ * A bfq_queue is a leaf request queue; it can be associated with an

8257

++ * io_context or more, if it is async or shared between cooperating

8258

++ * processes. @cgroup holds a reference to the cgroup, to be sure that it

8259

++ * does not disappear while a bfqq still references it (mostly to avoid

8260

++ * races between request issuing and task migration followed by cgroup

8261

++ * destruction).

8262

+  * All the fields are protected by the queue lock of the containing bfqd.

8263

+  */

8264

+ struct bfq_queue {

8265

+@@ -269,6 +272,7 @@ struct bfq_queue {

8266

+ 	unsigned int requests_within_timer;

8267

+

8268

+ 	pid_t pid;

8269

++	struct bfq_io_cq *bic;

8270

+

8271

+ 	/* weight-raising fields */

8272

+ 	unsigned long wr_cur_max_time;

8273

+@@ -298,12 +302,42 @@ struct bfq_ttime {

8274

+  * @icq: associated io_cq structure

8275

+  * @bfqq: array of two process queues, the sync and the async

8276

+  * @ttime: associated @bfq_ttime struct

8277

++ * @wr_time_left: snapshot of the time left before weight raising ends

8278

++ *                for the sync queue associated to this process; this

8279

++ *		  snapshot is taken to remember this value while the weight

8280

++ *		  raising is suspended because the queue is merged with a

8281

++ *		  shared queue, and is used to set @wr_cur_max_time

8282

++ *		  when the queue is split from the shared queue and its

8283

++ *		  weight is raised again

8284

++ * @saved_idle_window: same purpose as the previous field for the idle

8285

++ *                     window

8286

++ * @saved_IO_bound: same purpose as the previous two fields for the I/O

8287

++ *                  bound classification of a queue

8288

++ * @saved_in_large_burst: same purpose as the previous fields for the

8289

++ *                        value of the field keeping the queue's belonging

8290

++ *                        to a large burst

8291

++ * @was_in_burst_list: true if the queue belonged to a burst list

8292

++ *                     before its merge with another cooperating queue

8293

++ * @cooperations: counter of consecutive successful queue merges undergone

8294

++ *                by any of the process' @bfq_queues

8295

++ * @failed_cooperations: counter of consecutive failed queue merges of any

8296

++ *                       of the process' @bfq_queues

8297

+  */

8298

+ struct bfq_io_cq {

8299

+ 	struct io_cq icq; /* must be the first member */

8300

+ 	struct bfq_queue *bfqq[2];

8301

+ 	struct bfq_ttime ttime;

8302

+ 	int ioprio;

8303

++

8304

++	unsigned int wr_time_left;

8305

++	bool saved_idle_window;

8306

++	bool saved_IO_bound;

8307

++

8308

++	bool saved_in_large_burst;

8309

++	bool was_in_burst_list;

8310

++

8311

++	unsigned int cooperations;

8312

++	unsigned int failed_cooperations;

8313

+ };

8314

+

8315

+ enum bfq_device_speed {

8316

+@@ -539,7 +573,7 @@ enum bfqq_state_flags {

8317

+ 	BFQ_BFQQ_FLAG_prio_changed,	/* task priority has changed */

8318

+ 	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */

8319

+ 	BFQ_BFQQ_FLAG_budget_new,	/* no completion with this budget */

8320

+-	BFQ_BFQQ_FLAG_IO_bound,         /*

8321

++	BFQ_BFQQ_FLAG_IO_bound,		/*

8322

+ 					 * bfqq has timed-out at least once

8323

+ 					 * having consumed at most 2/10 of

8324

+ 					 * its budget

8325

+@@ -552,12 +586,13 @@ enum bfqq_state_flags {

8326

+ 					 * bfqq has proved to be slow and

8327

+ 					 * seeky until budget timeout

8328

+ 					 */

8329

+-	BFQ_BFQQ_FLAG_softrt_update,    /*

8330

++	BFQ_BFQQ_FLAG_softrt_update,	/*

8331

+ 					 * may need softrt-next-start

8332

+ 					 * update

8333

+ 					 */

8334

+ 	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */

8335

+-	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be splitted */

8336

++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */

8337

++	BFQ_BFQQ_FLAG_just_split,	/* queue has just been split */

8338

+ };

8339

+

8340

+ #define BFQ_BFQQ_FNS(name)						\

8341

+@@ -587,6 +622,7 @@ BFQ_BFQQ_FNS(in_large_burst);

8342

+ BFQ_BFQQ_FNS(constantly_seeky);

8343

+ BFQ_BFQQ_FNS(coop);

8344

+ BFQ_BFQQ_FNS(split_coop);

8345

++BFQ_BFQQ_FNS(just_split);

8346

+ BFQ_BFQQ_FNS(softrt_update);

8347

+ #undef BFQ_BFQQ_FNS

8348

+

8349

+--

8350

+2.1.0

8351

+

Gentoo Archives: gentoo-commits