[gentoo-commits] proj/linux-patches:4.2 commit in: / - gentoo-commits

From:	Mike Pagano <mpagano@g.o>
To:	gentoo-commits@l.g.o
Subject:	[gentoo-commits] proj/linux-patches:4.2 commit in: /
Date:	Mon, 28 Sep 2015 16:50:09
Message-Id:	`1443458985.24113c3716b8d5a19a98dca269fbd61c48ce37dc.mpagano@gentoo`

1

commit:     24113c3716b8d5a19a98dca269fbd61c48ce37dc

2

Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>

3

AuthorDate: Mon Sep 28 16:49:45 2015 +0000

4

Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>

5

CommitDate: Mon Sep 28 16:49:45 2015 +0000

6

URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=24113c37

7

8

Add BFQ v7r8.

9

10

 0000_README                                        |   12 +

11

 ...roups-kconfig-build-bits-for-BFQ-v7r8-4.2.patch |  104 +

12

 ...introduce-the-BFQ-v7r8-I-O-sched-for-4.2.patch1 | 6952 ++++++++++++++++++++

13

 ...Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.2.patch | 1220 ++++

14

 4 files changed, 8288 insertions(+)

15

16

diff --git a/0000_README b/0000_README

17

index 7050114..93b94b6 100644

18

--- a/0000_README

19

+++ b/0000_README

20

@@ -79,6 +79,18 @@ Patch:  5000_enable-additional-cpu-optimizations-for-gcc.patch

21

 From:   https://github.com/graysky2/kernel_gcc_patch/

22

 Desc:   Kernel patch enables gcc < v4.9 optimizations for additional CPUs.

23

24

+Patch:  5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r8-4.2.patch

25

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

26

+Desc:   BFQ v7r8 patch 1 for 4.2: Build, cgroups and kconfig bits

27

+

28

+Patch:  5002_block-introduce-the-BFQ-v7r8-I-O-sched-for-4.2.patch1

29

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

30

+Desc:   BFQ v7r8 patch 2 for 4.2: BFQ Scheduler

31

+

32

+Patch:  5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.2.0.patch

33

+From:   http://algo.ing.unimo.it/people/paolo/disk_sched/

34

+Desc:   BFQ v7r8 patch 3 for 4.2: Early Queue Merge (EQM)

35

+

36

 Patch:  5010_enable-additional-cpu-optimizations-for-gcc-4.9.patch

37

 From:   https://github.com/graysky2/kernel_gcc_patch/

38

 Desc:   Kernel patch enables gcc >= v4.9 optimizations for additional CPUs.

39

40

diff --git a/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r8-4.2.patch b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r8-4.2.patch

41

new file mode 100644

42

index 0000000..daf9be7

43

--- /dev/null

44

+++ b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r8-4.2.patch

45

@@ -0,0 +1,104 @@

46

+From c710d693f32c3d4952626aa2bdcf68ac7b40dd0e Mon Sep 17 00:00:00 2001

47

+From: Paolo Valente <paolo.valente@×××××××.it>

48

+Date: Tue, 7 Apr 2015 13:39:12 +0200

49

+Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r8-4.2

50

+

51

+Update Kconfig.iosched and do the related Makefile changes to include

52

+kernel configuration options for BFQ. Also add the bfqio controller

53

+to the cgroups subsystem.

54

+

55

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

56

+Signed-off-by: Arianna Avanzini <avanzini@××××××.com>

57

+---

58

+ block/Kconfig.iosched         | 32 ++++++++++++++++++++++++++++++++

59

+ block/Makefile                |  1 +

60

+ include/linux/cgroup_subsys.h |  4 ++++

61

+ 3 files changed, 37 insertions(+)

62

+

63

+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched

64

+index 421bef9..0ee5f0f 100644

65

+--- a/block/Kconfig.iosched

66

++++ b/block/Kconfig.iosched

67

+@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED

68

+ 	---help---

69

+ 	  Enable group IO scheduling in CFQ.

70

+

71

++config IOSCHED_BFQ

72

++	tristate "BFQ I/O scheduler"

73

++	default n

74

++	---help---

75

++	  The BFQ I/O scheduler tries to distribute bandwidth among

76

++	  all processes according to their weights.

77

++	  It aims at distributing the bandwidth as desired, independently of

78

++	  the disk parameters and with any workload. It also tries to

79

++	  guarantee low latency to interactive and soft real-time

80

++	  applications. If compiled built-in (saying Y here), BFQ can

81

++	  be configured to support hierarchical scheduling.

82

++

83

++config CGROUP_BFQIO

84

++	bool "BFQ hierarchical scheduling support"

85

++	depends on CGROUPS && IOSCHED_BFQ=y

86

++	default n

87

++	---help---

88

++	  Enable hierarchical scheduling in BFQ, using the cgroups

89

++	  filesystem interface.  The name of the subsystem will be

90

++	  bfqio.

91

++

92

+ choice

93

+ 	prompt "Default I/O scheduler"

94

+ 	default DEFAULT_CFQ

95

+@@ -52,6 +73,16 @@ choice

96

+ 	config DEFAULT_CFQ

97

+ 		bool "CFQ" if IOSCHED_CFQ=y

98

+

99

++	config DEFAULT_BFQ

100

++		bool "BFQ" if IOSCHED_BFQ=y

101

++		help

102

++		  Selects BFQ as the default I/O scheduler which will be

103

++		  used by default for all block devices.

104

++		  The BFQ I/O scheduler aims at distributing the bandwidth

105

++		  as desired, independently of the disk parameters and with

106

++		  any workload. It also tries to guarantee low latency to

107

++		  interactive and soft real-time applications.

108

++

109

+ 	config DEFAULT_NOOP

110

+ 		bool "No-op"

111

+

112

+@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED

113

+ 	string

114

+ 	default "deadline" if DEFAULT_DEADLINE

115

+ 	default "cfq" if DEFAULT_CFQ

116

++	default "bfq" if DEFAULT_BFQ

117

+ 	default "noop" if DEFAULT_NOOP

118

+

119

+ endmenu

120

+diff --git a/block/Makefile b/block/Makefile

121

+index 00ecc97..1ed86d5 100644

122

+--- a/block/Makefile

123

++++ b/block/Makefile

124

+@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o

125

+ obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o

126

+ obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o

127

+ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o

128

++obj-$(CONFIG_IOSCHED_BFQ)	+= bfq-iosched.o

129

+

130

+ obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o

131

+ obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o

132

+diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h

133

+index 1a96fda..81ad8a0 100644

134

+--- a/include/linux/cgroup_subsys.h

135

++++ b/include/linux/cgroup_subsys.h

136

+@@ -46,6 +46,10 @@ SUBSYS(freezer)

137

+ SUBSYS(net_cls)

138

+ #endif

139

+

140

++#if IS_ENABLED(CONFIG_CGROUP_BFQIO)

141

++SUBSYS(bfqio)

142

++#endif

143

++

144

+ #if IS_ENABLED(CONFIG_CGROUP_PERF)

145

+ SUBSYS(perf_event)

146

+ #endif

147

+--

148

+1.9.1

149

+

150

151

diff --git a/5002_block-introduce-the-BFQ-v7r8-I-O-sched-for-4.2.patch1 b/5002_block-introduce-the-BFQ-v7r8-I-O-sched-for-4.2.patch1

152

new file mode 100644

153

index 0000000..4cc232d

154

--- /dev/null

155

+++ b/5002_block-introduce-the-BFQ-v7r8-I-O-sched-for-4.2.patch1

156

@@ -0,0 +1,6952 @@

157

+From a364e1785d2eef24c2ca0ade5db036721b86c185 Mon Sep 17 00:00:00 2001

158

+From: Paolo Valente <paolo.valente@×××××××.it>

159

+Date: Thu, 9 May 2013 19:10:02 +0200

160

+Subject: [PATCH 2/3] block: introduce the BFQ-v7r8 I/O sched for 4.2

161

+

162

+Add the BFQ-v7r8 I/O scheduler to 4.2.

163

+The general structure is borrowed from CFQ, as much of the code for

164

+handling I/O contexts. Over time, several useful features have been

165

+ported from CFQ as well (details in the changelog in README.BFQ). A

166

+(bfq_)queue is associated to each task doing I/O on a device, and each

167

+time a scheduling decision has to be made a queue is selected and served

168

+until it expires.

169

+

170

+    - Slices are given in the service domain: tasks are assigned

171

+      budgets, measured in number of sectors. Once it has the disk, a task

172

+      must however consume its assigned budget within a configurable

173

+      maximum time (by default, the maximum possible value of the

174

+      budgets is automatically computed to comply with this timeout).

175

+      This allows the desired latency vs "throughput boosting" tradeoff

176

+      to be set.

177

+

178

+    - Budgets are scheduled according to a variant of WF2Q+, implemented

179

+      using an augmented rb-tree to take eligibility into account while

180

+      preserving an O(log N) overall complexity.

181

+

182

+    - A low-latency tunable is provided; if enabled, both interactive

183

+      and soft real-time applications are guaranteed a very low latency.

184

+

185

+    - Latency guarantees are preserved also in the presence of NCQ.

186

+

187

+    - Also with flash-based devices, a high throughput is achieved

188

+      while still preserving latency guarantees.

189

+

190

+    - BFQ features Early Queue Merge (EQM), a sort of fusion of the

191

+      cooperating-queue-merging and the preemption mechanisms present

192

+      in CFQ. EQM is in fact a unified mechanism that tries to get a

193

+      sequential read pattern, and hence a high throughput, with any

194

+      set of processes performing interleaved I/O over a contiguous

195

+      sequence of sectors.

196

+

197

+    - BFQ supports full hierarchical scheduling, exporting a cgroups

198

+      interface.  Since each node has a full scheduler, each group can

199

+      be assigned its own weight.

200

+

201

+    - If the cgroups interface is not used, only I/O priorities can be

202

+      assigned to processes, with ioprio values mapped to weights

203

+      with the relation weight = IOPRIO_BE_NR - ioprio.

204

+

205

+    - ioprio classes are served in strict priority order, i.e., lower

206

+      priority queues are not served as long as there are higher

207

+      priority queues.  Among queues in the same class the bandwidth is

208

+      distributed in proportion to the weight of each queue. A very

209

+      thin extra bandwidth is however guaranteed to the Idle class, to

210

+      prevent it from starving.

211

+

212

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

213

+Signed-off-by: Arianna Avanzini <avanzini@××××××.com>

214

+---

215

+ block/bfq-cgroup.c  |  936 +++++++++++++

216

+ block/bfq-ioc.c     |   36 +

217

+ block/bfq-iosched.c | 3898 +++++++++++++++++++++++++++++++++++++++++++++++++++

218

+ block/bfq-sched.c   | 1208 ++++++++++++++++

219

+ block/bfq.h         |  771 ++++++++++

220

+ 5 files changed, 6849 insertions(+)

221

+ create mode 100644 block/bfq-cgroup.c

222

+ create mode 100644 block/bfq-ioc.c

223

+ create mode 100644 block/bfq-iosched.c

224

+ create mode 100644 block/bfq-sched.c

225

+ create mode 100644 block/bfq.h

226

+

227

+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c

228

+new file mode 100644

229

+index 0000000..11e2f1d

230

+--- /dev/null

231

++++ b/block/bfq-cgroup.c

232

+@@ -0,0 +1,936 @@

233

++/*

234

++ * BFQ: CGROUPS support.

235

++ *

236

++ * Based on ideas and code from CFQ:

237

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

238

++ *

239

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

240

++ *		      Paolo Valente <paolo.valente@×××××××.it>

241

++ *

242

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

243

++ *

244

++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ

245

++ * file.

246

++ */

247

++

248

++#ifdef CONFIG_CGROUP_BFQIO

249

++

250

++static DEFINE_MUTEX(bfqio_mutex);

251

++

252

++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)

253

++{

254

++	return bgrp ? !bgrp->online : false;

255

++}

256

++

257

++static struct bfqio_cgroup bfqio_root_cgroup = {

258

++	.weight = BFQ_DEFAULT_GRP_WEIGHT,

259

++	.ioprio = BFQ_DEFAULT_GRP_IOPRIO,

260

++	.ioprio_class = BFQ_DEFAULT_GRP_CLASS,

261

++};

262

++

263

++static inline void bfq_init_entity(struct bfq_entity *entity,

264

++				   struct bfq_group *bfqg)

265

++{

266

++	entity->weight = entity->new_weight;

267

++	entity->orig_weight = entity->new_weight;

268

++	entity->ioprio = entity->new_ioprio;

269

++	entity->ioprio_class = entity->new_ioprio_class;

270

++	entity->parent = bfqg->my_entity;

271

++	entity->sched_data = &bfqg->sched_data;

272

++}

273

++

274

++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)

275

++{

276

++	return css ? container_of(css, struct bfqio_cgroup, css) : NULL;

277

++}

278

++

279

++/*

280

++ * Search the bfq_group for bfqd into the hash table (by now only a list)

281

++ * of bgrp.  Must be called under rcu_read_lock().

282

++ */

283

++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,

284

++					    struct bfq_data *bfqd)

285

++{

286

++	struct bfq_group *bfqg;

287

++	void *key;

288

++

289

++	hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {

290

++		key = rcu_dereference(bfqg->bfqd);

291

++		if (key == bfqd)

292

++			return bfqg;

293

++	}

294

++

295

++	return NULL;

296

++}

297

++

298

++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,

299

++					 struct bfq_group *bfqg)

300

++{

301

++	struct bfq_entity *entity = &bfqg->entity;

302

++

303

++	/*

304

++	 * If the weight of the entity has never been set via the sysfs

305

++	 * interface, then bgrp->weight == 0. In this case we initialize

306

++	 * the weight from the current ioprio value. Otherwise, the group

307

++	 * weight, if set, has priority over the ioprio value.

308

++	 */

309

++	if (bgrp->weight == 0) {

310

++		entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);

311

++		entity->new_ioprio = bgrp->ioprio;

312

++	} else {

313

++		if (bgrp->weight < BFQ_MIN_WEIGHT ||

314

++		    bgrp->weight > BFQ_MAX_WEIGHT) {

315

++			printk(KERN_CRIT "bfq_group_init_entity: "

316

++					 "bgrp->weight %d\n", bgrp->weight);

317

++			BUG();

318

++		}

319

++		entity->new_weight = bgrp->weight;

320

++		entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);

321

++	}

322

++	entity->orig_weight = entity->weight = entity->new_weight;

323

++	entity->ioprio = entity->new_ioprio;

324

++	entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;

325

++	entity->my_sched_data = &bfqg->sched_data;

326

++	bfqg->active_entities = 0;

327

++}

328

++

329

++static inline void bfq_group_set_parent(struct bfq_group *bfqg,

330

++					struct bfq_group *parent)

331

++{

332

++	struct bfq_entity *entity;

333

++

334

++	BUG_ON(parent == NULL);

335

++	BUG_ON(bfqg == NULL);

336

++

337

++	entity = &bfqg->entity;

338

++	entity->parent = parent->my_entity;

339

++	entity->sched_data = &parent->sched_data;

340

++}

341

++

342

++/**

343

++ * bfq_group_chain_alloc - allocate a chain of groups.

344

++ * @bfqd: queue descriptor.

345

++ * @css: the leaf cgroup_subsys_state this chain starts from.

346

++ *

347

++ * Allocate a chain of groups starting from the one belonging to

348

++ * @css up to the root cgroup.  Stop if a cgroup on the chain

349

++ * to the root has already an allocated group on @bfqd.

350

++ */

351

++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,

352

++					       struct cgroup_subsys_state *css)

353

++{

354

++	struct bfqio_cgroup *bgrp;

355

++	struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;

356

++

357

++	for (; css != NULL; css = css->parent) {

358

++		bgrp = css_to_bfqio(css);

359

++

360

++		bfqg = bfqio_lookup_group(bgrp, bfqd);

361

++		if (bfqg != NULL) {

362

++			/*

363

++			 * All the cgroups in the path from there to the

364

++			 * root must have a bfq_group for bfqd, so we don't

365

++			 * need any more allocations.

366

++			 */

367

++			break;

368

++		}

369

++

370

++		bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);

371

++		if (bfqg == NULL)

372

++			goto cleanup;

373

++

374

++		bfq_group_init_entity(bgrp, bfqg);

375

++		bfqg->my_entity = &bfqg->entity;

376

++

377

++		if (leaf == NULL) {

378

++			leaf = bfqg;

379

++			prev = leaf;

380

++		} else {

381

++			bfq_group_set_parent(prev, bfqg);

382

++			/*

383

++			 * Build a list of allocated nodes using the bfqd

384

++			 * field, that is still unused and will be

385

++			 * initialized only after the node will be

386

++			 * connected.

387

++			 */

388

++			prev->bfqd = bfqg;

389

++			prev = bfqg;

390

++		}

391

++	}

392

++

393

++	return leaf;

394

++

395

++cleanup:

396

++	while (leaf != NULL) {

397

++		prev = leaf;

398

++		leaf = leaf->bfqd;

399

++		kfree(prev);

400

++	}

401

++

402

++	return NULL;

403

++}

404

++

405

++/**

406

++ * bfq_group_chain_link - link an allocated group chain to a cgroup

407

++ *                        hierarchy.

408

++ * @bfqd: the queue descriptor.

409

++ * @css: the leaf cgroup_subsys_state to start from.

410

++ * @leaf: the leaf group (to be associated to @css).

411

++ *

412

++ * Try to link a chain of groups to a cgroup hierarchy, connecting the

413

++ * nodes bottom-up, so we can be sure that when we find a cgroup in the

414

++ * hierarchy that already has a group associated to @bfqd all the nodes

415

++ * in the path to the root cgroup have one too.

416

++ *

417

++ * On locking: the queue lock protects the hierarchy (there is a hierarchy

418

++ * per device) while the bfqio_cgroup lock protects the list of groups

419

++ * belonging to the same cgroup.

420

++ */

421

++static void bfq_group_chain_link(struct bfq_data *bfqd,

422

++				 struct cgroup_subsys_state *css,

423

++				 struct bfq_group *leaf)

424

++{

425

++	struct bfqio_cgroup *bgrp;

426

++	struct bfq_group *bfqg, *next, *prev = NULL;

427

++	unsigned long flags;

428

++

429

++	assert_spin_locked(bfqd->queue->queue_lock);

430

++

431

++	for (; css != NULL && leaf != NULL; css = css->parent) {

432

++		bgrp = css_to_bfqio(css);

433

++		next = leaf->bfqd;

434

++

435

++		bfqg = bfqio_lookup_group(bgrp, bfqd);

436

++		BUG_ON(bfqg != NULL);

437

++

438

++		spin_lock_irqsave(&bgrp->lock, flags);

439

++

440

++		rcu_assign_pointer(leaf->bfqd, bfqd);

441

++		hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);

442

++		hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);

443

++

444

++		spin_unlock_irqrestore(&bgrp->lock, flags);

445

++

446

++		prev = leaf;

447

++		leaf = next;

448

++	}

449

++

450

++	BUG_ON(css == NULL && leaf != NULL);

451

++	if (css != NULL && prev != NULL) {

452

++		bgrp = css_to_bfqio(css);

453

++		bfqg = bfqio_lookup_group(bgrp, bfqd);

454

++		bfq_group_set_parent(prev, bfqg);

455

++	}

456

++}

457

++

458

++/**

459

++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.

460

++ * @bfqd: queue descriptor.

461

++ * @cgroup: cgroup being searched for.

462

++ *

463

++ * Return a group associated to @bfqd in @cgroup, allocating one if

464

++ * necessary.  When a group is returned all the cgroups in the path

465

++ * to the root have a group associated to @bfqd.

466

++ *

467

++ * If the allocation fails, return the root group: this breaks guarantees

468

++ * but is a safe fallback.  If this loss becomes a problem it can be

469

++ * mitigated using the equivalent weight (given by the product of the

470

++ * weights of the groups in the path from @group to the root) in the

471

++ * root scheduler.

472

++ *

473

++ * We allocate all the missing nodes in the path from the leaf cgroup

474

++ * to the root and we connect the nodes only after all the allocations

475

++ * have been successful.

476

++ */

477

++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,

478

++					      struct cgroup_subsys_state *css)

479

++{

480

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);

481

++	struct bfq_group *bfqg;

482

++

483

++	bfqg = bfqio_lookup_group(bgrp, bfqd);

484

++	if (bfqg != NULL)

485

++		return bfqg;

486

++

487

++	bfqg = bfq_group_chain_alloc(bfqd, css);

488

++	if (bfqg != NULL)

489

++		bfq_group_chain_link(bfqd, css, bfqg);

490

++	else

491

++		bfqg = bfqd->root_group;

492

++

493

++	return bfqg;

494

++}

495

++

496

++/**

497

++ * bfq_bfqq_move - migrate @bfqq to @bfqg.

498

++ * @bfqd: queue descriptor.

499

++ * @bfqq: the queue to move.

500

++ * @entity: @bfqq's entity.

501

++ * @bfqg: the group to move to.

502

++ *

503

++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating

504

++ * it on the new one.  Avoid putting the entity on the old group idle tree.

505

++ *

506

++ * Must be called under the queue lock; the cgroup owning @bfqg must

507

++ * not disappear (by now this just means that we are called under

508

++ * rcu_read_lock()).

509

++ */

510

++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,

511

++			  struct bfq_entity *entity, struct bfq_group *bfqg)

512

++{

513

++	int busy, resume;

514

++

515

++	busy = bfq_bfqq_busy(bfqq);

516

++	resume = !RB_EMPTY_ROOT(&bfqq->sort_list);

517

++

518

++	BUG_ON(resume && !entity->on_st);

519

++	BUG_ON(busy && !resume && entity->on_st &&

520

++	       bfqq != bfqd->in_service_queue);

521

++

522

++	if (busy) {

523

++		BUG_ON(atomic_read(&bfqq->ref) < 2);

524

++

525

++		if (!resume)

526

++			bfq_del_bfqq_busy(bfqd, bfqq, 0);

527

++		else

528

++			bfq_deactivate_bfqq(bfqd, bfqq, 0);

529

++	} else if (entity->on_st)

530

++		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);

531

++

532

++	/*

533

++	 * Here we use a reference to bfqg.  We don't need a refcounter

534

++	 * as the cgroup reference will not be dropped, so that its

535

++	 * destroy() callback will not be invoked.

536

++	 */

537

++	entity->parent = bfqg->my_entity;

538

++	entity->sched_data = &bfqg->sched_data;

539

++

540

++	if (busy && resume)

541

++		bfq_activate_bfqq(bfqd, bfqq);

542

++

543

++	if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)

544

++		bfq_schedule_dispatch(bfqd);

545

++}

546

++

547

++/**

548

++ * __bfq_bic_change_cgroup - move @bic to @cgroup.

549

++ * @bfqd: the queue descriptor.

550

++ * @bic: the bic to move.

551

++ * @cgroup: the cgroup to move to.

552

++ *

553

++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller

554

++ * has to make sure that the reference to cgroup is valid across the call.

555

++ *

556

++ * NOTE: an alternative approach might have been to store the current

557

++ * cgroup in bfqq and getting a reference to it, reducing the lookup

558

++ * time here, at the price of slightly more complex code.

559

++ */

560

++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,

561

++						struct bfq_io_cq *bic,

562

++						struct cgroup_subsys_state *css)

563

++{

564

++	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);

565

++	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);

566

++	struct bfq_entity *entity;

567

++	struct bfq_group *bfqg;

568

++	struct bfqio_cgroup *bgrp;

569

++

570

++	bgrp = css_to_bfqio(css);

571

++

572

++	bfqg = bfq_find_alloc_group(bfqd, css);

573

++	if (async_bfqq != NULL) {

574

++		entity = &async_bfqq->entity;

575

++

576

++		if (entity->sched_data != &bfqg->sched_data) {

577

++			bic_set_bfqq(bic, NULL, 0);

578

++			bfq_log_bfqq(bfqd, async_bfqq,

579

++				     "bic_change_group: %p %d",

580

++				     async_bfqq, atomic_read(&async_bfqq->ref));

581

++			bfq_put_queue(async_bfqq);

582

++		}

583

++	}

584

++

585

++	if (sync_bfqq != NULL) {

586

++		entity = &sync_bfqq->entity;

587

++		if (entity->sched_data != &bfqg->sched_data)

588

++			bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);

589

++	}

590

++

591

++	return bfqg;

592

++}

593

++

594

++/**

595

++ * bfq_bic_change_cgroup - move @bic to @cgroup.

596

++ * @bic: the bic being migrated.

597

++ * @cgroup: the destination cgroup.

598

++ *

599

++ * When the task owning @bic is moved to @cgroup, @bic is immediately

600

++ * moved into its new parent group.

601

++ */

602

++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,

603

++				  struct cgroup_subsys_state *css)

604

++{

605

++	struct bfq_data *bfqd;

606

++	unsigned long uninitialized_var(flags);

607

++

608

++	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),

609

++				   &flags);

610

++	if (bfqd != NULL) {

611

++		__bfq_bic_change_cgroup(bfqd, bic, css);

612

++		bfq_put_bfqd_unlock(bfqd, &flags);

613

++	}

614

++}

615

++

616

++/**

617

++ * bfq_bic_update_cgroup - update the cgroup of @bic.

618

++ * @bic: the @bic to update.

619

++ *

620

++ * Make sure that @bic is enqueued in the cgroup of the current task.

621

++ * We need this in addition to moving bics during the cgroup attach

622

++ * phase because the task owning @bic could be at its first disk

623

++ * access or we may end up in the root cgroup as the result of a

624

++ * memory allocation failure and here we try to move to the right

625

++ * group.

626

++ *

627

++ * Must be called under the queue lock.  It is safe to use the returned

628

++ * value even after the rcu_read_unlock() as the migration/destruction

629

++ * paths act under the queue lock too.  IOW it is impossible to race with

630

++ * group migration/destruction and end up with an invalid group as:

631

++ *   a) here cgroup has not yet been destroyed, nor its destroy callback

632

++ *      has started execution, as current holds a reference to it,

633

++ *   b) if it is destroyed after rcu_read_unlock() [after current is

634

++ *      migrated to a different cgroup] its attach() callback will have

635

++ *      taken care of removing all the references to the old cgroup data.

636

++ */

637

++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)

638

++{

639

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

640

++	struct bfq_group *bfqg;

641

++	struct cgroup_subsys_state *css;

642

++

643

++	BUG_ON(bfqd == NULL);

644

++

645

++	rcu_read_lock();

646

++	css = task_css(current, bfqio_cgrp_id);

647

++	bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);

648

++	rcu_read_unlock();

649

++

650

++	return bfqg;

651

++}

652

++

653

++/**

654

++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.

655

++ * @st: the service tree being flushed.

656

++ */

657

++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)

658

++{

659

++	struct bfq_entity *entity = st->first_idle;

660

++

661

++	for (; entity != NULL; entity = st->first_idle)

662

++		__bfq_deactivate_entity(entity, 0);

663

++}

664

++

665

++/**

666

++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.

667

++ * @bfqd: the device data structure with the root group.

668

++ * @entity: the entity to move.

669

++ */

670

++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,

671

++					    struct bfq_entity *entity)

672

++{

673

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

674

++

675

++	BUG_ON(bfqq == NULL);

676

++	bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);

677

++	return;

678

++}

679

++

680

++/**

681

++ * bfq_reparent_active_entities - move to the root group all active

682

++ *                                entities.

683

++ * @bfqd: the device data structure with the root group.

684

++ * @bfqg: the group to move from.

685

++ * @st: the service tree with the entities.

686

++ *

687

++ * Needs queue_lock to be taken and reference to be valid over the call.

688

++ */

689

++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,

690

++						struct bfq_group *bfqg,

691

++						struct bfq_service_tree *st)

692

++{

693

++	struct rb_root *active = &st->active;

694

++	struct bfq_entity *entity = NULL;

695

++

696

++	if (!RB_EMPTY_ROOT(&st->active))

697

++		entity = bfq_entity_of(rb_first(active));

698

++

699

++	for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))

700

++		bfq_reparent_leaf_entity(bfqd, entity);

701

++

702

++	if (bfqg->sched_data.in_service_entity != NULL)

703

++		bfq_reparent_leaf_entity(bfqd,

704

++			bfqg->sched_data.in_service_entity);

705

++

706

++	return;

707

++}

708

++

709

++/**

710

++ * bfq_destroy_group - destroy @bfqg.

711

++ * @bgrp: the bfqio_cgroup containing @bfqg.

712

++ * @bfqg: the group being destroyed.

713

++ *

714

++ * Destroy @bfqg, making sure that it is not referenced from its parent.

715

++ */

716

++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)

717

++{

718

++	struct bfq_data *bfqd;

719

++	struct bfq_service_tree *st;

720

++	struct bfq_entity *entity = bfqg->my_entity;

721

++	unsigned long uninitialized_var(flags);

722

++	int i;

723

++

724

++	hlist_del(&bfqg->group_node);

725

++

726

++	/*

727

++	 * Empty all service_trees belonging to this group before

728

++	 * deactivating the group itself.

729

++	 */

730

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {

731

++		st = bfqg->sched_data.service_tree + i;

732

++

733

++		/*

734

++		 * The idle tree may still contain bfq_queues belonging

735

++		 * to exited tasks because they never migrated to a different

736

++		 * cgroup from the one being destroyed now.  No one else

737

++		 * can access them so it's safe to act without any lock.

738

++		 */

739

++		bfq_flush_idle_tree(st);

740

++

741

++		/*

742

++		 * It may happen that some queues are still active

743

++		 * (busy) upon group destruction (if the corresponding

744

++		 * processes have been forced to terminate). We move

745

++		 * all the leaf entities corresponding to these queues

746

++		 * to the root_group.

747

++		 * Also, it may happen that the group has an entity

748

++		 * in service, which is disconnected from the active

749

++		 * tree: it must be moved, too.

750

++		 * There is no need to put the sync queues, as the

751

++		 * scheduler has taken no reference.

752

++		 */

753

++		bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);

754

++		if (bfqd != NULL) {

755

++			bfq_reparent_active_entities(bfqd, bfqg, st);

756

++			bfq_put_bfqd_unlock(bfqd, &flags);

757

++		}

758

++		BUG_ON(!RB_EMPTY_ROOT(&st->active));

759

++		BUG_ON(!RB_EMPTY_ROOT(&st->idle));

760

++	}

761

++	BUG_ON(bfqg->sched_data.next_in_service != NULL);

762

++	BUG_ON(bfqg->sched_data.in_service_entity != NULL);

763

++

764

++	/*

765

++	 * We may race with device destruction, take extra care when

766

++	 * dereferencing bfqg->bfqd.

767

++	 */

768

++	bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);

769

++	if (bfqd != NULL) {

770

++		hlist_del(&bfqg->bfqd_node);

771

++		__bfq_deactivate_entity(entity, 0);

772

++		bfq_put_async_queues(bfqd, bfqg);

773

++		bfq_put_bfqd_unlock(bfqd, &flags);

774

++	}

775

++	BUG_ON(entity->tree != NULL);

776

++

777

++	/*

778

++	 * No need to defer the kfree() to the end of the RCU grace

779

++	 * period: we are called from the destroy() callback of our

780

++	 * cgroup, so we can be sure that no one is a) still using

781

++	 * this cgroup or b) doing lookups in it.

782

++	 */

783

++	kfree(bfqg);

784

++}

785

++

786

++static void bfq_end_wr_async(struct bfq_data *bfqd)

787

++{

788

++	struct hlist_node *tmp;

789

++	struct bfq_group *bfqg;

790

++

791

++	hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)

792

++		bfq_end_wr_async_queues(bfqd, bfqg);

793

++	bfq_end_wr_async_queues(bfqd, bfqd->root_group);

794

++}

795

++

796

++/**

797

++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.

798

++ * @bfqd: the device descriptor being exited.

799

++ *

800

++ * When the device exits we just make sure that no lookup can return

801

++ * the now unused group structures.  They will be deallocated on cgroup

802

++ * destruction.

803

++ */

804

++static void bfq_disconnect_groups(struct bfq_data *bfqd)

805

++{

806

++	struct hlist_node *tmp;

807

++	struct bfq_group *bfqg;

808

++

809

++	bfq_log(bfqd, "disconnect_groups beginning");

810

++	hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {

811

++		hlist_del(&bfqg->bfqd_node);

812

++

813

++		__bfq_deactivate_entity(bfqg->my_entity, 0);

814

++

815

++		/*

816

++		 * Don't remove from the group hash, just set an

817

++		 * invalid key.  No lookups can race with the

818

++		 * assignment as bfqd is being destroyed; this

819

++		 * implies also that new elements cannot be added

820

++		 * to the list.

821

++		 */

822

++		rcu_assign_pointer(bfqg->bfqd, NULL);

823

++

824

++		bfq_log(bfqd, "disconnect_groups: put async for group %p",

825

++			bfqg);

826

++		bfq_put_async_queues(bfqd, bfqg);

827

++	}

828

++}

829

++

830

++static inline void bfq_free_root_group(struct bfq_data *bfqd)

831

++{

832

++	struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;

833

++	struct bfq_group *bfqg = bfqd->root_group;

834

++

835

++	bfq_put_async_queues(bfqd, bfqg);

836

++

837

++	spin_lock_irq(&bgrp->lock);

838

++	hlist_del_rcu(&bfqg->group_node);

839

++	spin_unlock_irq(&bgrp->lock);

840

++

841

++	/*

842

++	 * No need to synchronize_rcu() here: since the device is gone

843

++	 * there cannot be any read-side access to its root_group.

844

++	 */

845

++	kfree(bfqg);

846

++}

847

++

848

++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)

849

++{

850

++	struct bfq_group *bfqg;

851

++	struct bfqio_cgroup *bgrp;

852

++	int i;

853

++

854

++	bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);

855

++	if (bfqg == NULL)

856

++		return NULL;

857

++

858

++	bfqg->entity.parent = NULL;

859

++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)

860

++		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

861

++

862

++	bgrp = &bfqio_root_cgroup;

863

++	spin_lock_irq(&bgrp->lock);

864

++	rcu_assign_pointer(bfqg->bfqd, bfqd);

865

++	hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);

866

++	spin_unlock_irq(&bgrp->lock);

867

++

868

++	return bfqg;

869

++}

870

++

871

++#define SHOW_FUNCTION(__VAR)						\

872

++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \

873

++				       struct cftype *cftype)		\

874

++{									\

875

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);			\

876

++	u64 ret = -ENODEV;						\

877

++									\

878

++	mutex_lock(&bfqio_mutex);					\

879

++	if (bfqio_is_removed(bgrp))					\

880

++		goto out_unlock;					\

881

++									\

882

++	spin_lock_irq(&bgrp->lock);					\

883

++	ret = bgrp->__VAR;						\

884

++	spin_unlock_irq(&bgrp->lock);					\

885

++									\

886

++out_unlock:								\

887

++	mutex_unlock(&bfqio_mutex);					\

888

++	return ret;							\

889

++}

890

++

891

++SHOW_FUNCTION(weight);

892

++SHOW_FUNCTION(ioprio);

893

++SHOW_FUNCTION(ioprio_class);

894

++#undef SHOW_FUNCTION

895

++

896

++#define STORE_FUNCTION(__VAR, __MIN, __MAX)				\

897

++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\

898

++					struct cftype *cftype,		\

899

++					u64 val)			\

900

++{									\

901

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);			\

902

++	struct bfq_group *bfqg;						\

903

++	int ret = -EINVAL;						\

904

++									\

905

++	if (val < (__MIN) || val > (__MAX))				\

906

++		return ret;						\

907

++									\

908

++	ret = -ENODEV;							\

909

++	mutex_lock(&bfqio_mutex);					\

910

++	if (bfqio_is_removed(bgrp))					\

911

++		goto out_unlock;					\

912

++	ret = 0;							\

913

++									\

914

++	spin_lock_irq(&bgrp->lock);					\

915

++	bgrp->__VAR = (unsigned short)val;				\

916

++	hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) {	\

917

++		/*							\

918

++		 * Setting the ioprio_changed flag of the entity        \

919

++		 * to 1 with new_##__VAR == ##__VAR would re-set        \

920

++		 * the value of the weight to its ioprio mapping.       \

921

++		 * Set the flag only if necessary.			\

922

++		 */							\

923

++		if ((unsigned short)val != bfqg->entity.new_##__VAR) {  \

924

++			bfqg->entity.new_##__VAR = (unsigned short)val; \

925

++			/*						\

926

++			 * Make sure that the above new value has been	\

927

++			 * stored in bfqg->entity.new_##__VAR before	\

928

++			 * setting the ioprio_changed flag. In fact,	\

929

++			 * this flag may be read asynchronously (in	\

930

++			 * critical sections protected by a different	\

931

++			 * lock than that held here), and finding this	\

932

++			 * flag set may cause the execution of the code	\

933

++			 * for updating parameters whose value may	\

934

++			 * depend also on bfqg->entity.new_##__VAR (in	\

935

++			 * __bfq_entity_update_weight_prio).		\

936

++			 * This barrier makes sure that the new value	\

937

++			 * of bfqg->entity.new_##__VAR is correctly	\

938

++			 * seen in that code.				\

939

++			 */						\

940

++			smp_wmb();                                      \

941

++			bfqg->entity.ioprio_changed = 1;                \

942

++		}							\

943

++	}								\

944

++	spin_unlock_irq(&bgrp->lock);					\

945

++									\

946

++out_unlock:								\

947

++	mutex_unlock(&bfqio_mutex);					\

948

++	return ret;							\

949

++}

950

++

951

++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);

952

++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);

953

++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);

954

++#undef STORE_FUNCTION

955

++

956

++static struct cftype bfqio_files[] = {

957

++	{

958

++		.name = "weight",

959

++		.read_u64 = bfqio_cgroup_weight_read,

960

++		.write_u64 = bfqio_cgroup_weight_write,

961

++	},

962

++	{

963

++		.name = "ioprio",

964

++		.read_u64 = bfqio_cgroup_ioprio_read,

965

++		.write_u64 = bfqio_cgroup_ioprio_write,

966

++	},

967

++	{

968

++		.name = "ioprio_class",

969

++		.read_u64 = bfqio_cgroup_ioprio_class_read,

970

++		.write_u64 = bfqio_cgroup_ioprio_class_write,

971

++	},

972

++	{ },	/* terminate */

973

++};

974

++

975

++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state

976

++						*parent_css)

977

++{

978

++	struct bfqio_cgroup *bgrp;

979

++

980

++	if (parent_css != NULL) {

981

++		bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);

982

++		if (bgrp == NULL)

983

++			return ERR_PTR(-ENOMEM);

984

++	} else

985

++		bgrp = &bfqio_root_cgroup;

986

++

987

++	spin_lock_init(&bgrp->lock);

988

++	INIT_HLIST_HEAD(&bgrp->group_data);

989

++	bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;

990

++	bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;

991

++

992

++	return &bgrp->css;

993

++}

994

++

995

++/*

996

++ * We cannot support shared io contexts, as we have no means to support

997

++ * two tasks with the same ioc in two different groups without major rework

998

++ * of the main bic/bfqq data structures.  For now we allow a task to change

999

++ * its cgroup only if it's the only owner of its ioc; the drawback of this

1000

++ * behavior is that a group containing a task that forked using CLONE_IO

1001

++ * will not be destroyed until the tasks sharing the ioc die.

1002

++ */

1003

++static int bfqio_can_attach(struct cgroup_subsys_state *css,

1004

++			    struct cgroup_taskset *tset)

1005

++{

1006

++	struct task_struct *task;

1007

++	struct io_context *ioc;

1008

++	int ret = 0;

1009

++

1010

++	cgroup_taskset_for_each(task, tset) {

1011

++		/*

1012

++		 * task_lock() is needed to avoid races with

1013

++		 * exit_io_context()

1014

++		 */

1015

++		task_lock(task);

1016

++		ioc = task->io_context;

1017

++		if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)

1018

++			/*

1019

++			 * ioc == NULL means that the task is either too

1020

++			 * young or exiting: if it has still no ioc the

1021

++			 * ioc can't be shared, if the task is exiting the

1022

++			 * attach will fail anyway, no matter what we

1023

++			 * return here.

1024

++			 */

1025

++			ret = -EINVAL;

1026

++		task_unlock(task);

1027

++		if (ret)

1028

++			break;

1029

++	}

1030

++

1031

++	return ret;

1032

++}

1033

++

1034

++static void bfqio_attach(struct cgroup_subsys_state *css,

1035

++			 struct cgroup_taskset *tset)

1036

++{

1037

++	struct task_struct *task;

1038

++	struct io_context *ioc;

1039

++	struct io_cq *icq;

1040

++

1041

++	/*

1042

++	 * IMPORTANT NOTE: The move of more than one process at a time to a

1043

++	 * new group has not yet been tested.

1044

++	 */

1045

++	cgroup_taskset_for_each(task, tset) {

1046

++		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);

1047

++		if (ioc) {

1048

++			/*

1049

++			 * Handle cgroup change here.

1050

++			 */

1051

++			rcu_read_lock();

1052

++			hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)

1053

++				if (!strncmp(

1054

++					icq->q->elevator->type->elevator_name,

1055

++					"bfq", ELV_NAME_MAX))

1056

++					bfq_bic_change_cgroup(icq_to_bic(icq),

1057

++							      css);

1058

++			rcu_read_unlock();

1059

++			put_io_context(ioc);

1060

++		}

1061

++	}

1062

++}

1063

++

1064

++static void bfqio_destroy(struct cgroup_subsys_state *css)

1065

++{

1066

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);

1067

++	struct hlist_node *tmp;

1068

++	struct bfq_group *bfqg;

1069

++

1070

++	/*

1071

++	 * Since we are destroying the cgroup, there are no more tasks

1072

++	 * referencing it, and all the RCU grace periods that may have

1073

++	 * referenced it are ended (as the destruction of the parent

1074

++	 * cgroup is RCU-safe); bgrp->group_data will not be accessed by

1075

++	 * anything else and we don't need any synchronization.

1076

++	 */

1077

++	hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)

1078

++		bfq_destroy_group(bgrp, bfqg);

1079

++

1080

++	BUG_ON(!hlist_empty(&bgrp->group_data));

1081

++

1082

++	kfree(bgrp);

1083

++}

1084

++

1085

++static int bfqio_css_online(struct cgroup_subsys_state *css)

1086

++{

1087

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);

1088

++

1089

++	mutex_lock(&bfqio_mutex);

1090

++	bgrp->online = true;

1091

++	mutex_unlock(&bfqio_mutex);

1092

++

1093

++	return 0;

1094

++}

1095

++

1096

++static void bfqio_css_offline(struct cgroup_subsys_state *css)

1097

++{

1098

++	struct bfqio_cgroup *bgrp = css_to_bfqio(css);

1099

++

1100

++	mutex_lock(&bfqio_mutex);

1101

++	bgrp->online = false;

1102

++	mutex_unlock(&bfqio_mutex);

1103

++}

1104

++

1105

++struct cgroup_subsys bfqio_cgrp_subsys = {

1106

++	.css_alloc = bfqio_create,

1107

++	.css_online = bfqio_css_online,

1108

++	.css_offline = bfqio_css_offline,

1109

++	.can_attach = bfqio_can_attach,

1110

++	.attach = bfqio_attach,

1111

++	.css_free = bfqio_destroy,

1112

++	.legacy_cftypes = bfqio_files,

1113

++};

1114

++#else

1115

++static inline void bfq_init_entity(struct bfq_entity *entity,

1116

++				   struct bfq_group *bfqg)

1117

++{

1118

++	entity->weight = entity->new_weight;

1119

++	entity->orig_weight = entity->new_weight;

1120

++	entity->ioprio = entity->new_ioprio;

1121

++	entity->ioprio_class = entity->new_ioprio_class;

1122

++	entity->sched_data = &bfqg->sched_data;

1123

++}

1124

++

1125

++static inline struct bfq_group *

1126

++bfq_bic_update_cgroup(struct bfq_io_cq *bic)

1127

++{

1128

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

1129

++	return bfqd->root_group;

1130

++}

1131

++

1132

++static inline void bfq_bfqq_move(struct bfq_data *bfqd,

1133

++				 struct bfq_queue *bfqq,

1134

++				 struct bfq_entity *entity,

1135

++				 struct bfq_group *bfqg)

1136

++{

1137

++}

1138

++

1139

++static void bfq_end_wr_async(struct bfq_data *bfqd)

1140

++{

1141

++	bfq_end_wr_async_queues(bfqd, bfqd->root_group);

1142

++}

1143

++

1144

++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)

1145

++{

1146

++	bfq_put_async_queues(bfqd, bfqd->root_group);

1147

++}

1148

++

1149

++static inline void bfq_free_root_group(struct bfq_data *bfqd)

1150

++{

1151

++	kfree(bfqd->root_group);

1152

++}

1153

++

1154

++/*
++ * Allocate the root group for @bfqd on NUMA node @node.
++ *
++ * The group is zero-initialized and every per-class service tree in its
++ * sched_data is reset to BFQ_SERVICE_TREE_INIT.  Returns the new group, or
++ * NULL if the allocation fails.
++ */
++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
++{
++	struct bfq_group *bfqg;
++	int i;
++
++	/* Same zeroing allocator as the other bfq_alloc_root_group() variant. */
++	bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
++	if (bfqg == NULL)
++		return NULL;
++
++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
++		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
++
++	return bfqg;
++}

1168

++#endif

1169

+diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c

1170

+new file mode 100644

1171

+index 0000000..7f6b000

1172

+--- /dev/null

1173

++++ b/block/bfq-ioc.c

1174

+@@ -0,0 +1,36 @@

1175

++/*

1176

++ * BFQ: I/O context handling.

1177

++ *

1178

++ * Based on ideas and code from CFQ:

1179

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

1180

++ *

1181

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

1182

++ *		      Paolo Valente <paolo.valente@×××××××.it>

1183

++ *

1184

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

1185

++ */

1186

++

1187

++/**

1188

++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.

1189

++ * @icq: the iocontext queue.

1190

++ */

1191

++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)

1192

++{

1193

++	/* bic->icq is the first member, %NULL will convert to %NULL */

1194

++	return container_of(icq, struct bfq_io_cq, icq);

1195

++}

1196

++

1197

++/**

1198

++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.

1199

++ * @bfqd: the lookup key.

1200

++ * @ioc: the io_context of the process doing I/O.

1201

++ *

1202

++ * Queue lock must be held.

1203

++ */

1204

++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,

1205

++					       struct io_context *ioc)

1206

++{

1207

++	if (ioc)

1208

++		return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));

1209

++	return NULL;

1210

++}

1211

+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

1212

+new file mode 100644

1213

+index 0000000..773b2ee

1214

+--- /dev/null

1215

++++ b/block/bfq-iosched.c

1216

+@@ -0,0 +1,3898 @@

1217

++/*

1218

++ * Budget Fair Queueing (BFQ) disk scheduler.

1219

++ *

1220

++ * Based on ideas and code from CFQ:

1221

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

1222

++ *

1223

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

1224

++ *		      Paolo Valente <paolo.valente@×××××××.it>

1225

++ *

1226

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

1227

++ *

1228

++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ

1229

++ * file.

1230

++ *

1231

++ * BFQ is a proportional-share storage-I/O scheduling algorithm based on

1232

++ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,

1233

++ * measured in number of sectors, to processes instead of time slices. The

1234

++ * device is not granted to the in-service process for a given time slice,

1235

++ * but until it has exhausted its assigned budget. This change from the time

1236

++ * to the service domain allows BFQ to distribute the device throughput

1237

++ * among processes as desired, without any distortion due to ZBR, workload

1238

++ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,

1239

++ * called B-WF2Q+, to schedule processes according to their budgets. More

1240

++ * precisely, BFQ schedules queues associated to processes. Thanks to the

1241

++ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to

1242

++ * I/O-bound processes issuing sequential requests (to boost the

1243

++ * throughput), and yet guarantee a low latency to interactive and soft

1244

++ * real-time applications.

1245

++ *

1246

++ * BFQ is described in [1], where also a reference to the initial, more

1247

++ * theoretical paper on BFQ can be found. The interested reader can find

1248

++ * in the latter paper full details on the main algorithm, as well as

1249

++ * formulas of the guarantees and formal proofs of all the properties.

1250

++ * With respect to the version of BFQ presented in these papers, this

1251

++ * implementation adds a few more heuristics, such as the one that

1252

++ * guarantees a low latency to soft real-time applications, and a

1253

++ * hierarchical extension based on H-WF2Q+.

1254

++ *

1255

++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with

1256

++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)

1257

++ * complexity derives from the one introduced with EEVDF in [3].

1258

++ *

1259

++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness

1260

++ *     with the BFQ Disk I/O Scheduler'',

1261

++ *     Proceedings of the 5th Annual International Systems and Storage

1262

++ *     Conference (SYSTOR '12), June 2012.

1263

++ *

1264

++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf

1265

++ *

1266

++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing

1267

++ *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,

1268

++ *     Oct 1997.

1269

++ *

1270

++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz

1271

++ *

1272

++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline

1273

++ *     First: A Flexible and Accurate Mechanism for Proportional Share

1274

++ *     Resource Allocation,'' technical report.

1275

++ *

1276

++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf

1277

++ */

1278

++#include <linux/module.h>

1279

++#include <linux/slab.h>

1280

++#include <linux/blkdev.h>

1281

++#include <linux/cgroup.h>

1282

++#include <linux/elevator.h>

1283

++#include <linux/jiffies.h>

1284

++#include <linux/rbtree.h>

1285

++#include <linux/ioprio.h>

1286

++#include "bfq.h"

1287

++#include "blk.h"

1288

++

1289

++/* Expiration time of sync (0) and async (1) requests, in jiffies. */

1290

++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };

1291

++

1292

++/* Maximum backwards seek, in KiB. */

1293

++static const int bfq_back_max = 16 * 1024;

1294

++

1295

++/* Penalty of a backwards seek, in number of sectors. */

1296

++static const int bfq_back_penalty = 2;

1297

++

1298

++/* Idling period duration, in jiffies. */

1299

++static int bfq_slice_idle = HZ / 125;

1300

++

1301

++/* Default maximum budget values, in sectors and number of requests. */

1302

++static const int bfq_default_max_budget = 16 * 1024;

1303

++static const int bfq_max_budget_async_rq = 4;

1304

++

1305

++/*

1306

++ * Async to sync throughput distribution is controlled as follows:

1307

++ * when an async request is served, the entity is charged the number

1308

++ * of sectors of the request, multiplied by the factor below

1309

++ */

1310

++static const int bfq_async_charge_factor = 10;

1311

++

1312

++/* Default timeout values, in jiffies, approximating CFQ defaults. */

1313

++static const int bfq_timeout_sync = HZ / 8;

1314

++static int bfq_timeout_async = HZ / 25;

1315

++

1316

++struct kmem_cache *bfq_pool;

1317

++

1318

++/* Below this threshold (in ms), we consider thinktime immediate. */

1319

++#define BFQ_MIN_TT		2

1320

++

1321

++/* hw_tag detection: parallel requests threshold and min samples needed. */

1322

++#define BFQ_HW_QUEUE_THRESHOLD	4

1323

++#define BFQ_HW_QUEUE_SAMPLES	32

1324

++

1325

++#define BFQQ_SEEK_THR	 (sector_t)(8 * 1024)

1326

++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)

1327

++

1328

++/* Min samples used for peak rate estimation (for autotuning). */

1329

++#define BFQ_PEAK_RATE_SAMPLES	32

1330

++

1331

++/* Shift used for peak rate fixed precision calculations. */

1332

++#define BFQ_RATE_SHIFT		16

1333

++

1334

++/*

1335

++ * By default, BFQ computes the duration of the weight raising for

1336

++ * interactive applications automatically, using the following formula:

1337

++ * duration = (R / r) * T, where r is the peak rate of the device, and

1338

++ * R and T are two reference parameters.

1339

++ * In particular, R is the peak rate of the reference device (see below),

1340

++ * and T is a reference time: given the systems that are likely to be

1341

++ * installed on the reference device according to its speed class, T is

1342

++ * about the maximum time needed, under BFQ and while reading two files in

1343

++ * parallel, to load typical large applications on these systems.

1344

++ * In practice, the slower/faster the device at hand is, the more/less it

1345

++ * takes to load applications with respect to the reference device.

1346

++ * Accordingly, the longer/shorter BFQ grants weight raising to interactive

1347

++ * applications.

1348

++ *

1349

++ * BFQ uses four different reference pairs (R, T), depending on:

1350

++ * . whether the device is rotational or non-rotational;

1351

++ * . whether the device is slow, such as old or portable HDDs, as well as

1352

++ *   SD cards, or fast, such as newer HDDs and SSDs.

1353

++ *

1354

++ * The device's speed class is dynamically (re)detected in

1355

++ * bfq_update_peak_rate() every time the estimated peak rate is updated.

1356

++ *

1357

++ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]

1358

++ * are the reference values for a slow/fast rotational device, whereas

1359

++ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for

1360

++ * a slow/fast non-rotational device. Finally, device_speed_thresh are the

1361

++ * thresholds used to switch between speed classes.

1362

++ * Both the reference peak rates and the thresholds are measured in

1363

++ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.

1364

++ */

1365

++static int R_slow[2] = {1536, 10752};

1366

++static int R_fast[2] = {17415, 34791};

1367

++/*

1368

++ * To improve readability, a conversion function is used to initialize the

1369

++ * following arrays, which entails that they can be initialized only in a

1370

++ * function.

1371

++ */

1372

++static int T_slow[2];

1373

++static int T_fast[2];

1374

++static int device_speed_thresh[2];

1375

++

1376

++#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\

1377

++				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

1378

++

1379

++#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])

1380

++#define RQ_BFQQ(rq)		((rq)->elv.priv[1])

1381

++

1382

++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);

1383

++

1384

++#include "bfq-ioc.c"

1385

++#include "bfq-sched.c"

1386

++#include "bfq-cgroup.c"

1387

++

1388

++#define bfq_class_idle(bfqq)	((bfqq)->entity.ioprio_class ==\

1389

++				 IOPRIO_CLASS_IDLE)

1390

++#define bfq_class_rt(bfqq)	((bfqq)->entity.ioprio_class ==\

1391

++				 IOPRIO_CLASS_RT)

1392

++

1393

++#define bfq_sample_valid(samples)	((samples) > 80)

1394

++

1395

++/*

1396

++ * The following macro groups conditions that need to be evaluated when

1397

++ * checking if existing queues and groups form a symmetric scenario

1398

++ * and therefore idling can be reduced or disabled for some of the

1399

++ * queues. See the comment to the function bfq_bfqq_must_not_expire()

1400

++ * for further details.

1401

++ */

1402

++#ifdef CONFIG_CGROUP_BFQIO

1403

++#define symmetric_scenario	  (!bfqd->active_numerous_groups && \

1404

++				   !bfq_differentiated_weights(bfqd))

1405

++#else

1406

++#define symmetric_scenario	  (!bfq_differentiated_weights(bfqd))

1407

++#endif

1408

++

1409

++/*

1410

++ * We regard a request as SYNC, if either it's a read or has the SYNC bit

1411

++ * set (in which case it could also be a direct WRITE).

1412

++ */

1413

++static inline int bfq_bio_sync(struct bio *bio)

1414

++{

1415

++	if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))

1416

++		return 1;

1417

++

1418

++	return 0;

1419

++}

1420

++

1421

++/*

1422

++ * Scheduler run of queue, if there are requests pending and no one in the

1423

++ * driver that will restart queueing.

1424

++ */

1425

++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)

1426

++{

1427

++	if (bfqd->queued != 0) {

1428

++		bfq_log(bfqd, "schedule dispatch");

1429

++		kblockd_schedule_work(&bfqd->unplug_work);

1430

++	}

1431

++}

1432

++

1433

++/*

1434

++ * Lifted from AS - choose which of rq1 and rq2 is best served now.

1435

++ * We choose the request that is closest to the head right now.  Distance

1436

++ * behind the head is penalized and only allowed to a certain extent.

1437

++ */

1438

++static struct request *bfq_choose_req(struct bfq_data *bfqd,

1439

++				      struct request *rq1,

1440

++				      struct request *rq2,

1441

++				      sector_t last)

1442

++{

1443

++	sector_t s1, s2, d1 = 0, d2 = 0;

1444

++	unsigned long back_max;

1445

++#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */

1446

++#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */

1447

++	unsigned wrap = 0; /* bit mask: requests behind the disk head? */

1448

++

1449

++	if (rq1 == NULL || rq1 == rq2)

1450

++		return rq2;

1451

++	if (rq2 == NULL)

1452

++		return rq1;

1453

++

1454

++	if (rq_is_sync(rq1) && !rq_is_sync(rq2))

1455

++		return rq1;

1456

++	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))

1457

++		return rq2;

1458

++	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))

1459

++		return rq1;

1460

++	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))

1461

++		return rq2;

1462

++

1463

++	s1 = blk_rq_pos(rq1);

1464

++	s2 = blk_rq_pos(rq2);

1465

++

1466

++	/*

1467

++	 * By definition, 1KiB is 2 sectors.

1468

++	 */

1469

++	back_max = bfqd->bfq_back_max * 2;

1470

++

1471

++	/*

1472

++	 * Strict one way elevator _except_ in the case where we allow

1473

++	 * short backward seeks which are biased as twice the cost of a

1474

++	 * similar forward seek.

1475

++	 */

1476

++	if (s1 >= last)

1477

++		d1 = s1 - last;

1478

++	else if (s1 + back_max >= last)

1479

++		d1 = (last - s1) * bfqd->bfq_back_penalty;

1480

++	else

1481

++		wrap |= BFQ_RQ1_WRAP;

1482

++

1483

++	if (s2 >= last)

1484

++		d2 = s2 - last;

1485

++	else if (s2 + back_max >= last)

1486

++		d2 = (last - s2) * bfqd->bfq_back_penalty;

1487

++	else

1488

++		wrap |= BFQ_RQ2_WRAP;

1489

++

1490

++	/* Found required data */

1491

++

1492

++	/*

1493

++	 * By doing switch() on the bit mask "wrap" we avoid having to

1494

++	 * check two variables for all permutations: --> faster!

1495

++	 */

1496

++	switch (wrap) {

1497

++	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */

1498

++		if (d1 < d2)

1499

++			return rq1;

1500

++		else if (d2 < d1)

1501

++			return rq2;

1502

++		else {

1503

++			if (s1 >= s2)

1504

++				return rq1;

1505

++			else

1506

++				return rq2;

1507

++		}

1508

++

1509

++	case BFQ_RQ2_WRAP:

1510

++		return rq1;

1511

++	case BFQ_RQ1_WRAP:

1512

++		return rq2;

1513

++	case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */

1514

++	default:

1515

++		/*

1516

++		 * Since both rqs are wrapped,

1517

++		 * start with the one that's further behind head

1518

++		 * (--> only *one* back seek required),

1519

++		 * since back seek takes more time than forward.

1520

++		 */

1521

++		if (s1 <= s2)

1522

++			return rq1;

1523

++		else

1524

++			return rq2;

1525

++	}

1526

++}

1527

++

1528

++static struct bfq_queue *

1529

++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,

1530

++		     sector_t sector, struct rb_node **ret_parent,

1531

++		     struct rb_node ***rb_link)

1532

++{

1533

++	struct rb_node **p, *parent;

1534

++	struct bfq_queue *bfqq = NULL;

1535

++

1536

++	parent = NULL;

1537

++	p = &root->rb_node;

1538

++	while (*p) {

1539

++		struct rb_node **n;

1540

++

1541

++		parent = *p;

1542

++		bfqq = rb_entry(parent, struct bfq_queue, pos_node);

1543

++

1544

++		/*

1545

++		 * Sort strictly based on sector. Smallest to the left,

1546

++		 * largest to the right.

1547

++		 */

1548

++		if (sector > blk_rq_pos(bfqq->next_rq))

1549

++			n = &(*p)->rb_right;

1550

++		else if (sector < blk_rq_pos(bfqq->next_rq))

1551

++			n = &(*p)->rb_left;

1552

++		else

1553

++			break;

1554

++		p = n;

1555

++		bfqq = NULL;

1556

++	}

1557

++

1558

++	*ret_parent = parent;

1559

++	if (rb_link)

1560

++		*rb_link = p;

1561

++

1562

++	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",

1563

++		(long long unsigned)sector,

1564

++		bfqq != NULL ? bfqq->pid : 0);

1565

++

1566

++	return bfqq;

1567

++}

1568

++

1569

++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)

1570

++{

1571

++	struct rb_node **p, *parent;

1572

++	struct bfq_queue *__bfqq;

1573

++

1574

++	if (bfqq->pos_root != NULL) {

1575

++		rb_erase(&bfqq->pos_node, bfqq->pos_root);

1576

++		bfqq->pos_root = NULL;

1577

++	}

1578

++

1579

++	if (bfq_class_idle(bfqq))

1580

++		return;

1581

++	if (!bfqq->next_rq)

1582

++		return;

1583

++

1584

++	bfqq->pos_root = &bfqd->rq_pos_tree;

1585

++	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,

1586

++			blk_rq_pos(bfqq->next_rq), &parent, &p);

1587

++	if (__bfqq == NULL) {

1588

++		rb_link_node(&bfqq->pos_node, parent, p);

1589

++		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);

1590

++	} else

1591

++		bfqq->pos_root = NULL;

1592

++}

1593

++

1594

++/*

1595

++ * Tell whether there are active queues or groups with differentiated weights.

1596

++ */

1597

++static inline bool bfq_differentiated_weights(struct bfq_data *bfqd)

1598

++{

1599

++	/*

1600

++	 * For weights to differ, at least one of the trees must contain

1601

++	 * at least two nodes.

1602

++	 */

1603

++	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&

1604

++		(bfqd->queue_weights_tree.rb_node->rb_left ||

1605

++		 bfqd->queue_weights_tree.rb_node->rb_right)

1606

++#ifdef CONFIG_CGROUP_BFQIO

1607

++	       ) ||

1608

++	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&

1609

++		(bfqd->group_weights_tree.rb_node->rb_left ||

1610

++		 bfqd->group_weights_tree.rb_node->rb_right)

1611

++#endif

1612

++	       );

1613

++}

1614

++

1615

++/*
++ * If the weight-counter tree passed as input contains no counter for
++ * the weight of the input entity, then add that counter; otherwise just
++ * increment the existing counter.
++ *
++ * Note that weight-counter trees contain few nodes in mostly symmetric
++ * scenarios. For example, if all queues have the same weight, then the
++ * weight-counter tree for the queues may contain at most one node.
++ * This holds even if low_latency is on, because weight-raised queues
++ * are not inserted in the tree.
++ * In most scenarios, the rate at which nodes are created/destroyed
++ * should be low too.
++ *
++ * A new counter is allocated with GFP_ATOMIC. If that allocation fails,
++ * the function returns without touching the tree and leaves
++ * entity->weight_counter NULL, i.e., the weight of the entity is simply
++ * not accounted for.
++ */
++static void bfq_weights_tree_add(struct bfq_data *bfqd,
++				 struct bfq_entity *entity,
++				 struct rb_root *root)
++{
++	struct rb_node **new = &(root->rb_node), *parent = NULL;
++
++	/*
++	 * Do not insert if the entity is already associated with a
++	 * counter, which happens if:
++	 *   1) the entity is associated with a queue,
++	 *   2) a request arrival has caused the queue to become both
++	 *      non-weight-raised, and hence change its weight, and
++	 *      backlogged; in this respect, each of the two events
++	 *      causes an invocation of this function,
++	 *   3) this is the invocation of this function caused by the
++	 *      second event. This second invocation is actually useless,
++	 *      and we handle this fact by exiting immediately. More
++	 *      efficient or clearer solutions might possibly be adopted.
++	 */
++	if (entity->weight_counter)
++		return;
++
++	/* Look for an existing counter for this weight (tree is keyed by weight). */
++	while (*new) {
++		struct bfq_weight_counter *__counter = container_of(*new,
++						struct bfq_weight_counter,
++						weights_node);
++		parent = *new;
++
++		if (entity->weight == __counter->weight) {
++			entity->weight_counter = __counter;
++			goto inc_counter;
++		}
++		if (entity->weight < __counter->weight)
++			new = &((*new)->rb_left);
++		else
++			new = &((*new)->rb_right);
++	}
++
++	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
++					 GFP_ATOMIC);
++	/*
++	 * An atomic allocation may fail. In that case just exit instead
++	 * of dereferencing a NULL pointer: the tree is left unchanged
++	 * and the weight of this entity is not counted, so the scenario
++	 * may be wrongly deemed symmetric if this weight was the only
++	 * one differing from the others.
++	 * NOTE(review): confirm that bfq_weights_tree_remove() tolerates
++	 * an entity whose weight_counter is NULL.
++	 */
++	if (unlikely(!entity->weight_counter))
++		return;
++
++	entity->weight_counter->weight = entity->weight;
++	rb_link_node(&entity->weight_counter->weights_node, parent, new);
++	rb_insert_color(&entity->weight_counter->weights_node, root);
++
++inc_counter:
++	entity->weight_counter->num_active++;
++}

1675

++

1676

++/*

1677

++ * Decrement the weight counter associated with the entity, and, if the

1678

++ * counter reaches 0, remove the counter from the tree.

1679

++ * See the comments to the function bfq_weights_tree_add() for considerations

1680

++ * about overhead.

1681

++ */

1682

++static void bfq_weights_tree_remove(struct bfq_data *bfqd,

1683

++				    struct bfq_entity *entity,

1684

++				    struct rb_root *root)

1685

++{

1686

++	if (!entity->weight_counter)

1687

++		return;

1688

++

1689

++	BUG_ON(RB_EMPTY_ROOT(root));

1690

++	BUG_ON(entity->weight_counter->weight != entity->weight);

1691

++

1692

++	BUG_ON(!entity->weight_counter->num_active);

1693

++	entity->weight_counter->num_active--;

1694

++	if (entity->weight_counter->num_active > 0)

1695

++		goto reset_entity_pointer;

1696

++

1697

++	rb_erase(&entity->weight_counter->weights_node, root);

1698

++	kfree(entity->weight_counter);

1699

++

1700

++reset_entity_pointer:

1701

++	entity->weight_counter = NULL;

1702

++}

1703

++

1704

++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,

1705

++					struct bfq_queue *bfqq,

1706

++					struct request *last)

1707

++{

1708

++	struct rb_node *rbnext = rb_next(&last->rb_node);

1709

++	struct rb_node *rbprev = rb_prev(&last->rb_node);

1710

++	struct request *next = NULL, *prev = NULL;

1711

++

1712

++	BUG_ON(RB_EMPTY_NODE(&last->rb_node));

1713

++

1714

++	if (rbprev != NULL)

1715

++		prev = rb_entry_rq(rbprev);

1716

++

1717

++	if (rbnext != NULL)

1718

++		next = rb_entry_rq(rbnext);

1719

++	else {

1720

++		rbnext = rb_first(&bfqq->sort_list);

1721

++		if (rbnext && rbnext != &last->rb_node)

1722

++			next = rb_entry_rq(rbnext);

1723

++	}

1724

++

1725

++	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));

1726

++}

1727

++

1728

++/* see the definition of bfq_async_charge_factor for details */

1729

++static inline unsigned long bfq_serv_to_charge(struct request *rq,

1730

++					       struct bfq_queue *bfqq)

1731

++{

1732

++	return blk_rq_sectors(rq) *

1733

++		(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *

1734

++		bfq_async_charge_factor));

1735

++}

1736

++

1737

++/**

1738

++ * bfq_updated_next_req - update the queue after a new next_rq selection.

1739

++ * @bfqd: the device data the queue belongs to.

1740

++ * @bfqq: the queue to update.

1741

++ *

1742

++ * If the first request of a queue changes we make sure that the queue

1743

++ * has enough budget to serve at least its first request (if the

1744

++ * request has grown).  We do this because if the queue has not enough

1745

++ * budget for its first request, it has to go through two dispatch

1746

++ * rounds to actually get it dispatched.

1747

++ */

1748

++static void bfq_updated_next_req(struct bfq_data *bfqd,

1749

++				 struct bfq_queue *bfqq)

1750

++{

1751

++	struct bfq_entity *entity = &bfqq->entity;

1752

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

1753

++	struct request *next_rq = bfqq->next_rq;

1754

++	unsigned long new_budget;

1755

++

1756

++	if (next_rq == NULL)

1757

++		return;

1758

++

1759

++	if (bfqq == bfqd->in_service_queue)

1760

++		/*

1761

++		 * In order not to break guarantees, budgets cannot be

1762

++		 * changed after an entity has been selected.

1763

++		 */

1764

++		return;

1765

++

1766

++	BUG_ON(entity->tree != &st->active);

1767

++	BUG_ON(entity == entity->sched_data->in_service_entity);

1768

++

1769

++	new_budget = max_t(unsigned long, bfqq->max_budget,

1770

++			   bfq_serv_to_charge(next_rq, bfqq));

1771

++	if (entity->budget != new_budget) {

1772

++		entity->budget = new_budget;

1773

++		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",

1774

++					 new_budget);

1775

++		bfq_activate_bfqq(bfqd, bfqq);

1776

++	}

1777

++}

1778

++

1779

++static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)

1780

++{

1781

++	u64 dur;

1782

++

1783

++	if (bfqd->bfq_wr_max_time > 0)

1784

++		return bfqd->bfq_wr_max_time;

1785

++

1786

++	dur = bfqd->RT_prod;

1787

++	do_div(dur, bfqd->peak_rate);

1788

++

1789

++	return dur;

1790

++}

1791

++

1792

++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */

1793

++static inline void bfq_reset_burst_list(struct bfq_data *bfqd,

1794

++					struct bfq_queue *bfqq)

1795

++{

1796

++	struct bfq_queue *item;

1797

++	struct hlist_node *n;

1798

++

1799

++	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)

1800

++		hlist_del_init(&item->burst_list_node);

1801

++	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);

1802

++	bfqd->burst_size = 1;

1803

++}

1804

++

1805

++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */

1806

++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)

1807

++{

1808

++	/* Increment burst size to also take bfqq into account */

1809

++	bfqd->burst_size++;

1810

++

1811

++	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {

1812

++		struct bfq_queue *pos, *bfqq_item;

1813

++		struct hlist_node *n;

1814

++

1815

++		/*

1816

++		 * Enough queues have been activated shortly after each

1817

++		 * other to consider this burst as large.

1818

++		 */

1819

++		bfqd->large_burst = true;

1820

++

1821

++		/*

1822

++		 * We can now mark all queues in the burst list as

1823

++		 * belonging to a large burst.

1824

++		 */

1825

++		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,

1826

++				     burst_list_node)

1827

++		        bfq_mark_bfqq_in_large_burst(bfqq_item);

1828

++		bfq_mark_bfqq_in_large_burst(bfqq);

1829

++

1830

++		/*

1831

++		 * From now on, and until the current burst finishes, any

1832

++		 * new queue being activated shortly after the last queue

1833

++		 * was inserted in the burst can be immediately marked as

1834

++		 * belonging to a large burst. So the burst list is not

1835

++		 * needed any more. Remove it.

1836

++		 */

1837

++		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,

1838

++					  burst_list_node)

1839

++			hlist_del_init(&pos->burst_list_node);

1840

++	} else /* burst not yet large: add bfqq to the burst list */

1841

++		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);

1842

++}

1843

++

1844

++/*

1845

++ * If many queues happen to become active shortly after each other, then,

1846

++ * to help the processes associated with these queues get their job done as

1847

++ * soon as possible, it is usually better to not grant either weight-raising

1848

++ * or device idling to these queues. In this comment we describe, firstly,

1849

++ * the reasons why this fact holds, and, secondly, the next function, which

1850

++ * implements the main steps needed to properly mark these queues so that

1851

++ * they can then be treated in a different way.

1852

++ *

1853

++ * As for the terminology, we say that a queue becomes active, i.e.,

1854

++ * switches from idle to backlogged, either when it is created (as a

1855

++ * consequence of the arrival of an I/O request), or, if already existing,

1856

++ * when a new request for the queue arrives while the queue is idle.

1857

++ * Bursts of activations, i.e., activations of different queues occurring

1858

++ * shortly after each other, are typically caused by services or applications

1859

++ * that spawn or reactivate many parallel threads/processes. Examples are

1860

++ * systemd during boot or git grep.

1861

++ *

1862

++ * These services or applications benefit mostly from a high throughput:

1863

++ * the quicker the requests of the activated queues are cumulatively served,

1864

++ * the sooner the target job of these queues gets completed. As a consequence,

1865

++ * weight-raising any of these queues, which also implies idling the device

1866

++ * for it, is almost always counterproductive: in most cases it just lowers

1867

++ * throughput.

1868

++ *

1869

++ * On the other hand, a burst of activations may be also caused by the start

1870

++ * of an application that does not consist of a lot of parallel I/O-bound

1871

++ * threads. In fact, with a complex application, the burst may be just a

1872

++ * consequence of the fact that several processes need to be executed to

1873

++ * start up the application. To start an application as quickly as possible,

1874

++ * the best thing to do is to privilege the I/O related to the application

1875

++ * with respect to all other I/O. Therefore, the best strategy to start as

1876

++ * quickly as possible an application that causes a burst of activations is

1877

++ * to weight-raise all the queues activated during the burst. This is the

1878

++ * exact opposite of the best strategy for the other type of bursts.

1879

++ *

1880

++ * In the end, to take the best action for each of the two cases, the two

1881

++ * types of bursts need to be distinguished. Fortunately, this seems

1882

++ * relatively easy to do, by looking at the sizes of the bursts. In

1883

++ * particular, we found a threshold such that bursts with a larger size

1884

++ * than that threshold are apparently caused only by services or commands

1885

++ * such as systemd or git grep. For brevity, hereafter we call just 'large'

1886

++ * these bursts. BFQ *does not* weight-raise queues whose activations occur

1887

++ * in a large burst. In addition, for each of these queues BFQ performs or

1888

++ * does not perform idling depending on which choice boosts the throughput

1889

++ * most. The exact choice depends on the device and request pattern at

1890

++ * hand.

1891

++ *

1892

++ * Turning back to the next function, it implements all the steps needed

1893

++ * to detect the occurrence of a large burst and to properly mark all the

1894

++ * queues belonging to it (so that they can then be treated in a different

1895

++ * way). This goal is achieved by maintaining a special "burst list" that

1896

++ * holds, temporarily, the queues that belong to the burst in progress. The

1897

++ * list is then used to mark these queues as belonging to a large burst if

1898

++ * the burst does become large. The main steps are the following.

1899

++ *

1900

++ * . when the very first queue is activated, the queue is inserted into the

1901

++ *   list (as it could be the first queue in a possible burst)

1902

++ *

1903

++ * . if the current burst has not yet become large, and a queue Q that does

1904

++ *   not yet belong to the burst is activated shortly after the last time

1905

++ *   at which a new queue entered the burst list, then the function appends

1906

++ *   Q to the burst list

1907

++ *

1908

++ * . if, as a consequence of the previous step, the burst size reaches

1909

++ *   the large-burst threshold, then

1910

++ *

1911

++ *     . all the queues in the burst list are marked as belonging to a

1912

++ *       large burst

1913

++ *

1914

++ *     . the burst list is deleted; in fact, the burst list already served

1915

++ *       its purpose (keeping temporarily track of the queues in a burst,

1916

++ *       so as to be able to mark them as belonging to a large burst in the

1917

++ *       previous sub-step), and now is not needed any more

1918

++ *

1919

++ *     . the device enters a large-burst mode

1920

++ *

1921

++ * . if a queue Q that does not belong to the burst is activated while

1922

++ *   the device is in large-burst mode and shortly after the last time

1923

++ *   at which a queue either entered the burst list or was marked as

1924

++ *   belonging to the current large burst, then Q is immediately marked

1925

++ *   as belonging to a large burst.

1926

++ *

1927

++ * . if a queue Q that does not belong to the burst is activated a while

1928

++ *   later, i.e., not shortly after, than the last time at which a queue

1929

++ *   either entered the burst list or was marked as belonging to the

1930

++ *   current large burst, then the current burst is deemed as finished and:

1931

++ *

1932

++ *        . the large-burst mode is reset if set

1933

++ *

1934

++ *        . the burst list is emptied

1935

++ *

1936

++ *        . Q is inserted in the burst list, as Q may be the first queue

1937

++ *          in a possible new burst (then the burst list contains just Q

1938

++ *          after this step).

1939

++ */

1940

++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,

1941

++			     bool idle_for_long_time)

1942

++{

1943

++	/*

1944

++	 * If bfqq happened to be activated in a burst, but has been idle

1945

++	 * for at least as long as an interactive queue, then we assume

1946

++	 * that, in the overall I/O initiated in the burst, the I/O

1947

++	 * associated with bfqq is finished. So bfqq does not need to be

1948

++	 * treated as a queue belonging to a burst anymore. Accordingly,

1949

++	 * we reset bfqq's in_large_burst flag if set, and remove bfqq

1950

++	 * from the burst list if it's there. We do not, however, decrement

1951

++	 * burst_size, because the fact that bfqq does not need to belong

1952

++	 * to the burst list any more does not invalidate the fact that

1953

++	 * bfqq may have been activated during the current burst.

1954

++	 */

1955

++	if (idle_for_long_time) {

1956

++		hlist_del_init(&bfqq->burst_list_node);

1957

++		bfq_clear_bfqq_in_large_burst(bfqq);

1958

++	}

1959

++

1960

++	/*

1961

++	 * If bfqq is already in the burst list or is part of a large

1962

++	 * burst, then there is nothing else to do.

1963

++	 */

1964

++	if (!hlist_unhashed(&bfqq->burst_list_node) ||

1965

++	    bfq_bfqq_in_large_burst(bfqq))

1966

++		return;

1967

++

1968

++	/*

1969

++	 * If bfqq's activation happens late enough, then the current

1970

++	 * burst is finished, and related data structures must be reset.

1971

++	 *

1972

++	 * In this respect, consider the special case where bfqq is the very

1973

++	 * first queue being activated. In this case, last_ins_in_burst is

1974

++	 * not yet significant when we get here. But it is easy to verify

1975

++	 * that, whether or not the following condition is true, bfqq will

1976

++	 * end up being inserted into the burst list. In particular the

1977

++	 * list will happen to contain only bfqq. And this is exactly what

1978

++	 * has to happen, as bfqq may be the first queue in a possible

1979

++	 * burst.

1980

++	 */

1981

++	if (time_is_before_jiffies(bfqd->last_ins_in_burst +

1982

++	    bfqd->bfq_burst_interval)) {

1983

++		bfqd->large_burst = false;

1984

++		bfq_reset_burst_list(bfqd, bfqq);

1985

++		return;

1986

++	}

1987

++

1988

++	/*

1989

++	 * If we get here, then bfqq is being activated shortly after the

1990

++	 * last queue. So, if the current burst is also large, we can mark

1991

++	 * bfqq as belonging to this large burst immediately.

1992

++	 */

1993

++	if (bfqd->large_burst) {

1994

++		bfq_mark_bfqq_in_large_burst(bfqq);

1995

++		return;

1996

++	}

1997

++

1998

++	/*

1999

++	 * If we get here, then a large-burst state has not yet been

2000

++	 * reached, but bfqq is being activated shortly after the last

2001

++	 * queue. Then we add bfqq to the burst.

2002

++	 */

2003

++	bfq_add_to_burst(bfqd, bfqq);

2004

++}

2005

++

2006

++static void bfq_add_request(struct request *rq)

2007

++{

2008

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2009

++	struct bfq_entity *entity = &bfqq->entity;

2010

++	struct bfq_data *bfqd = bfqq->bfqd;

2011

++	struct request *next_rq, *prev;

2012

++	unsigned long old_wr_coeff = bfqq->wr_coeff;

2013

++	bool interactive = false;

2014

++

2015

++	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));

2016

++	bfqq->queued[rq_is_sync(rq)]++;

2017

++	bfqd->queued++;

2018

++

2019

++	elv_rb_add(&bfqq->sort_list, rq);

2020

++

2021

++	/*

2022

++	 * Check if this request is a better next-serve candidate.

2023

++	 */

2024

++	prev = bfqq->next_rq;

2025

++	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);

2026

++	BUG_ON(next_rq == NULL);

2027

++	bfqq->next_rq = next_rq;

2028

++

2029

++	/*

2030

++	 * Adjust priority tree position, if next_rq changes.

2031

++	 */

2032

++	if (prev != bfqq->next_rq)

2033

++		bfq_rq_pos_tree_add(bfqd, bfqq);

2034

++

2035

++	if (!bfq_bfqq_busy(bfqq)) {

2036

++		bool soft_rt,

2037

++		     idle_for_long_time = time_is_before_jiffies(

2038

++						bfqq->budget_timeout +

2039

++						bfqd->bfq_wr_min_idle_time);

2040

++

2041

++		if (bfq_bfqq_sync(bfqq)) {

2042

++			bool already_in_burst =

2043

++			   !hlist_unhashed(&bfqq->burst_list_node) ||

2044

++			   bfq_bfqq_in_large_burst(bfqq);

2045

++			bfq_handle_burst(bfqd, bfqq, idle_for_long_time);

2046

++			/*

2047

++			 * If bfqq was not already in the current burst,

2048

++			 * then, at this point, bfqq either has been

2049

++			 * added to the current burst or has caused the

2050

++			 * current burst to terminate. In particular, in

2051

++			 * the second case, bfqq has become the first

2052

++			 * queue in a possible new burst.

2053

++			 * In both cases last_ins_in_burst needs to be

2054

++			 * moved forward.

2055

++			 */

2056

++			if (!already_in_burst)

2057

++				bfqd->last_ins_in_burst = jiffies;

2058

++		}

2059

++

2060

++		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&

2061

++			!bfq_bfqq_in_large_burst(bfqq) &&

2062

++			time_is_before_jiffies(bfqq->soft_rt_next_start);

2063

++		interactive = !bfq_bfqq_in_large_burst(bfqq) &&

2064

++			      idle_for_long_time;

2065

++		entity->budget = max_t(unsigned long, bfqq->max_budget,

2066

++				       bfq_serv_to_charge(next_rq, bfqq));

2067

++

2068

++		if (!bfq_bfqq_IO_bound(bfqq)) {

2069

++			if (time_before(jiffies,

2070

++					RQ_BIC(rq)->ttime.last_end_request +

2071

++					bfqd->bfq_slice_idle)) {

2072

++				bfqq->requests_within_timer++;

2073

++				if (bfqq->requests_within_timer >=

2074

++				    bfqd->bfq_requests_within_timer)

2075

++					bfq_mark_bfqq_IO_bound(bfqq);

2076

++			} else

2077

++				bfqq->requests_within_timer = 0;

2078

++		}

2079

++

2080

++		if (!bfqd->low_latency)

2081

++			goto add_bfqq_busy;

2082

++

2083

++		/*

2084

++		 * If the queue is not being boosted and has been idle

2085

++		 * for enough time, start a weight-raising period

2086

++		 */

2087

++		if (old_wr_coeff == 1 && (interactive || soft_rt)) {

2088

++			bfqq->wr_coeff = bfqd->bfq_wr_coeff;

2089

++			if (interactive)

2090

++				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

2091

++			else

2092

++				bfqq->wr_cur_max_time =

2093

++					bfqd->bfq_wr_rt_max_time;

2094

++			bfq_log_bfqq(bfqd, bfqq,

2095

++				     "wrais starting at %lu, rais_max_time %u",

2096

++				     jiffies,

2097

++				     jiffies_to_msecs(bfqq->wr_cur_max_time));

2098

++		} else if (old_wr_coeff > 1) {

2099

++			if (interactive)

2100

++				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

2101

++			else if (bfq_bfqq_in_large_burst(bfqq) ||

2102

++				 (bfqq->wr_cur_max_time ==

2103

++				  bfqd->bfq_wr_rt_max_time &&

2104

++				  !soft_rt)) {

2105

++				bfqq->wr_coeff = 1;

2106

++				bfq_log_bfqq(bfqd, bfqq,

2107

++					"wrais ending at %lu, rais_max_time %u",

2108

++					jiffies,

2109

++					jiffies_to_msecs(bfqq->

2110

++						wr_cur_max_time));

2111

++			} else if (time_before(

2112

++					bfqq->last_wr_start_finish +

2113

++					bfqq->wr_cur_max_time,

2114

++					jiffies +

2115

++					bfqd->bfq_wr_rt_max_time) &&

2116

++				   soft_rt) {

2117

++				/*

2118

++				 *

2119

++				 * The remaining weight-raising time is lower

2120

++				 * than bfqd->bfq_wr_rt_max_time, which

2121

++				 * means that the application is enjoying

2122

++				 * weight raising either because deemed soft-

2123

++				 * rt in the near past, or because deemed

2124

++				 * interactive long ago. In both cases,

2125

++				 * resetting now the current remaining weight-

2126

++				 * raising time for the application to the

2127

++				 * weight-raising duration for soft rt

2128

++				 * applications would not cause any latency

2129

++				 * increase for the application (as the new

2130

++				 * duration would be higher than the remaining

2131

++				 * time).

2132

++				 *

2133

++				 * In addition, the application is now meeting

2134

++				 * the requirements for being deemed soft rt.

2135

++				 * In the end we can correctly and safely

2136

++				 * (re)charge the weight-raising duration for

2137

++				 * the application with the weight-raising

2138

++				 * duration for soft rt applications.

2139

++				 *

2140

++				 * In particular, doing this recharge now, i.e.,

2141

++				 * before the weight-raising period for the

2142

++				 * application finishes, reduces the probability

2143

++				 * of the following negative scenario:

2144

++				 * 1) the weight of a soft rt application is

2145

++				 *    raised at startup (as for any newly

2146

++				 *    created application),

2147

++				 * 2) since the application is not interactive,

2148

++				 *    at a certain time weight-raising is

2149

++				 *    stopped for the application,

2150

++				 * 3) at that time the application happens to

2151

++				 *    still have pending requests, and hence

2152

++				 *    is destined to not have a chance to be

2153

++				 *    deemed soft rt before these requests are

2154

++				 *    completed (see the comments to the

2155

++				 *    function bfq_bfqq_softrt_next_start()

2156

++				 *    for details on soft rt detection),

2157

++				 * 4) these pending requests experience a high

2158

++				 *    latency because the application is not

2159

++				 *    weight-raised while they are pending.

2160

++				 */

2161

++				bfqq->last_wr_start_finish = jiffies;

2162

++				bfqq->wr_cur_max_time =

2163

++					bfqd->bfq_wr_rt_max_time;

2164

++			}

2165

++		}

2166

++		if (old_wr_coeff != bfqq->wr_coeff)

2167

++			entity->ioprio_changed = 1;

2168

++add_bfqq_busy:

2169

++		bfqq->last_idle_bklogged = jiffies;

2170

++		bfqq->service_from_backlogged = 0;

2171

++		bfq_clear_bfqq_softrt_update(bfqq);

2172

++		bfq_add_bfqq_busy(bfqd, bfqq);

2173

++	} else {

2174

++		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&

2175

++		    time_is_before_jiffies(

2176

++				bfqq->last_wr_start_finish +

2177

++				bfqd->bfq_wr_min_inter_arr_async)) {

2178

++			bfqq->wr_coeff = bfqd->bfq_wr_coeff;

2179

++			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

2180

++

2181

++			bfqd->wr_busy_queues++;

2182

++			entity->ioprio_changed = 1;

2183

++			bfq_log_bfqq(bfqd, bfqq,

2184

++			    "non-idle wrais starting at %lu, rais_max_time %u",

2185

++			    jiffies,

2186

++			    jiffies_to_msecs(bfqq->wr_cur_max_time));

2187

++		}

2188

++		if (prev != bfqq->next_rq)

2189

++			bfq_updated_next_req(bfqd, bfqq);

2190

++	}

2191

++

2192

++	if (bfqd->low_latency &&

2193

++		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))

2194

++		bfqq->last_wr_start_finish = jiffies;

2195

++}

2196

++

2197

++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,

2198

++					  struct bio *bio)

2199

++{

2200

++	struct task_struct *tsk = current;

2201

++	struct bfq_io_cq *bic;

2202

++	struct bfq_queue *bfqq;

2203

++

2204

++	bic = bfq_bic_lookup(bfqd, tsk->io_context);

2205

++	if (bic == NULL)

2206

++		return NULL;

2207

++

2208

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

2209

++	if (bfqq != NULL)

2210

++		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));

2211

++

2212

++	return NULL;

2213

++}

2214

++

2215

++static void bfq_activate_request(struct request_queue *q, struct request *rq)

2216

++{

2217

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2218

++

2219

++	bfqd->rq_in_driver++;

2220

++	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);

2221

++	bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",

2222

++		(long long unsigned)bfqd->last_position);

2223

++}

2224

++

2225

++static inline void bfq_deactivate_request(struct request_queue *q,

2226

++					  struct request *rq)

2227

++{

2228

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2229

++

2230

++	BUG_ON(bfqd->rq_in_driver == 0);

2231

++	bfqd->rq_in_driver--;

2232

++}

2233

++

2234

++static void bfq_remove_request(struct request *rq)

2235

++{

2236

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2237

++	struct bfq_data *bfqd = bfqq->bfqd;

2238

++	const int sync = rq_is_sync(rq);

2239

++

2240

++	if (bfqq->next_rq == rq) {

2241

++		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);

2242

++		bfq_updated_next_req(bfqd, bfqq);

2243

++	}

2244

++

2245

++	if (rq->queuelist.prev != &rq->queuelist)

2246

++		list_del_init(&rq->queuelist);

2247

++	BUG_ON(bfqq->queued[sync] == 0);

2248

++	bfqq->queued[sync]--;

2249

++	bfqd->queued--;

2250

++	elv_rb_del(&bfqq->sort_list, rq);

2251

++

2252

++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

2253

++		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)

2254

++			bfq_del_bfqq_busy(bfqd, bfqq, 1);

2255

++		/*

2256

++		 * Remove queue from request-position tree as it is empty.

2257

++		 */

2258

++		if (bfqq->pos_root != NULL) {

2259

++			rb_erase(&bfqq->pos_node, bfqq->pos_root);

2260

++			bfqq->pos_root = NULL;

2261

++		}

2262

++	}

2263

++

2264

++	if (rq->cmd_flags & REQ_META) {

2265

++		BUG_ON(bfqq->meta_pending == 0);

2266

++		bfqq->meta_pending--;

2267

++	}

2268

++}

2269

++

2270

++static int bfq_merge(struct request_queue *q, struct request **req,

2271

++		     struct bio *bio)

2272

++{

2273

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2274

++	struct request *__rq;

2275

++

2276

++	__rq = bfq_find_rq_fmerge(bfqd, bio);

2277

++	if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {

2278

++		*req = __rq;

2279

++		return ELEVATOR_FRONT_MERGE;

2280

++	}

2281

++

2282

++	return ELEVATOR_NO_MERGE;

2283

++}

2284

++

2285

++static void bfq_merged_request(struct request_queue *q, struct request *req,

2286

++			       int type)

2287

++{

2288

++	if (type == ELEVATOR_FRONT_MERGE &&

2289

++	    rb_prev(&req->rb_node) &&

2290

++	    blk_rq_pos(req) <

2291

++	    blk_rq_pos(container_of(rb_prev(&req->rb_node),

2292

++				    struct request, rb_node))) {

2293

++		struct bfq_queue *bfqq = RQ_BFQQ(req);

2294

++		struct bfq_data *bfqd = bfqq->bfqd;

2295

++		struct request *prev, *next_rq;

2296

++

2297

++		/* Reposition request in its sort_list */

2298

++		elv_rb_del(&bfqq->sort_list, req);

2299

++		elv_rb_add(&bfqq->sort_list, req);

2300

++		/* Choose next request to be served for bfqq */

2301

++		prev = bfqq->next_rq;

2302

++		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,

2303

++					 bfqd->last_position);

2304

++		BUG_ON(next_rq == NULL);

2305

++		bfqq->next_rq = next_rq;

2306

++		/*

2307

++		 * If next_rq changes, update both the queue's budget to

2308

++		 * fit the new request and the queue's position in its

2309

++		 * rq_pos_tree.

2310

++		 */

2311

++		if (prev != bfqq->next_rq) {

2312

++			bfq_updated_next_req(bfqd, bfqq);

2313

++			bfq_rq_pos_tree_add(bfqd, bfqq);

2314

++		}

2315

++	}

2316

++}

2317

++

2318

++static void bfq_merged_requests(struct request_queue *q, struct request *rq,

2319

++				struct request *next)

2320

++{

2321

++	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);

2322

++

2323

++	/*

2324

++	 * If next and rq belong to the same bfq_queue and next is older

2325

++	 * than rq, then reposition rq in the fifo (by substituting next

2326

++	 * with rq). Otherwise, if next and rq belong to different

2327

++	 * bfq_queues, never reposition rq: in fact, we would have to

2328

++	 * reposition it with respect to next's position in its own fifo,

2329

++	 * which would most certainly be too expensive with respect to

2330

++	 * the benefits.

2331

++	 */

2332

++	if (bfqq == next_bfqq &&

2333

++	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&

2334

++	    time_before(next->fifo_time, rq->fifo_time)) {

2335

++		list_del_init(&rq->queuelist);

2336

++		list_replace_init(&next->queuelist, &rq->queuelist);

2337

++		rq->fifo_time = next->fifo_time;

2338

++	}

2339

++

2340

++	if (bfqq->next_rq == next)

2341

++		bfqq->next_rq = rq;

2342

++

2343

++	bfq_remove_request(next);

2344

++}

2345

++

2346

++/* Must be called with bfqq != NULL */

2347

++static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq)

2348

++{

2349

++	BUG_ON(bfqq == NULL);

2350

++	if (bfq_bfqq_busy(bfqq))

2351

++		bfqq->bfqd->wr_busy_queues--;

2352

++	bfqq->wr_coeff = 1;

2353

++	bfqq->wr_cur_max_time = 0;

2354

++	/* Trigger a weight change on the next activation of the queue */

2355

++	bfqq->entity.ioprio_changed = 1;

2356

++}

2357

++

2358

++static void bfq_end_wr_async_queues(struct bfq_data *bfqd,

2359

++				    struct bfq_group *bfqg)

2360

++{

2361

++	int i, j;

2362

++

2363

++	for (i = 0; i < 2; i++)

2364

++		for (j = 0; j < IOPRIO_BE_NR; j++)

2365

++			if (bfqg->async_bfqq[i][j] != NULL)

2366

++				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);

2367

++	if (bfqg->async_idle_bfqq != NULL)

2368

++		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);

2369

++}

2370

++

2371

++static void bfq_end_wr(struct bfq_data *bfqd)

2372

++{

2373

++	struct bfq_queue *bfqq;

2374

++

2375

++	spin_lock_irq(bfqd->queue->queue_lock);

2376

++

2377

++	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)

2378

++		bfq_bfqq_end_wr(bfqq);

2379

++	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)

2380

++		bfq_bfqq_end_wr(bfqq);

2381

++	bfq_end_wr_async(bfqd);

2382

++

2383

++	spin_unlock_irq(bfqd->queue->queue_lock);

2384

++}

2385

++

2386

++static int bfq_allow_merge(struct request_queue *q, struct request *rq,

2387

++			   struct bio *bio)

2388

++{

2389

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2390

++	struct bfq_io_cq *bic;

2391

++	struct bfq_queue *bfqq;

2392

++

2393

++	/*

2394

++	 * Disallow merge of a sync bio into an async request.

2395

++	 */

2396

++	if (bfq_bio_sync(bio) && !rq_is_sync(rq))

2397

++		return 0;

2398

++

2399

++	/*

2400

++	 * Lookup the bfqq that this bio will be queued with. Allow

2401

++	 * merge only if rq is queued there.

2402

++	 * Queue lock is held here.

2403

++	 */

2404

++	bic = bfq_bic_lookup(bfqd, current->io_context);

2405

++	if (bic == NULL)

2406

++		return 0;

2407

++

2408

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

2409

++	return bfqq == RQ_BFQQ(rq);

2410

++}

2411

++

2412

++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

2413

++				       struct bfq_queue *bfqq)

2414

++{

2415

++	if (bfqq != NULL) {

2416

++		bfq_mark_bfqq_must_alloc(bfqq);

2417

++		bfq_mark_bfqq_budget_new(bfqq);

2418

++		bfq_clear_bfqq_fifo_expire(bfqq);

2419

++

2420

++		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

2421

++

2422

++		bfq_log_bfqq(bfqd, bfqq,

2423

++			     "set_in_service_queue, cur-budget = %lu",

2424

++			     bfqq->entity.budget);

2425

++	}

2426

++

2427

++	bfqd->in_service_queue = bfqq;

2428

++}

2429

++

2430

++/*

2431

++ * Get and set a new queue for service.

2432

++ */

2433

++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,

2434

++						  struct bfq_queue *bfqq)

2435

++{

2436

++	if (!bfqq)

2437

++		bfqq = bfq_get_next_queue(bfqd);

2438

++	else

2439

++		bfq_get_next_queue_forced(bfqd, bfqq);

2440

++

2441

++	__bfq_set_in_service_queue(bfqd, bfqq);

2442

++	return bfqq;

2443

++}

2444

++

2445

++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,

2446

++					  struct request *rq)

2447

++{

2448

++	if (blk_rq_pos(rq) >= bfqd->last_position)

2449

++		return blk_rq_pos(rq) - bfqd->last_position;

2450

++	else

2451

++		return bfqd->last_position - blk_rq_pos(rq);

2452

++}

2453

++

2454

++/*

2455

++ * Return true if rq is close enough to bfqd->last_position, i.e., if

2456

++ * the distance between rq and bfqd->last_position does not exceed

2457

++ * BFQQ_SEEK_THR.

2458

++ */

2459

++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)

2460

++{

2461

++	return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;

2462

++}

2463

++

2464

++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

2465

++{

2466

++	struct rb_root *root = &bfqd->rq_pos_tree;

2467

++	struct rb_node *parent, *node;

2468

++	struct bfq_queue *__bfqq;

2469

++	sector_t sector = bfqd->last_position;

2470

++

2471

++	if (RB_EMPTY_ROOT(root))

2472

++		return NULL;

2473

++

2474

++	/*

2475

++	 * First, if we find a request starting at the end of the last

2476

++	 * request, choose it.

2477

++	 */

2478

++	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);

2479

++	if (__bfqq != NULL)

2480

++		return __bfqq;

2481

++

2482

++	/*

2483

++	 * If the exact sector wasn't found, the parent of the NULL leaf

2484

++	 * will contain the closest sector (rq_pos_tree sorted by

2485

++	 * next_request position).

2486

++	 */

2487

++	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);

2488

++	if (bfq_rq_close(bfqd, __bfqq->next_rq))

2489

++		return __bfqq;

2490

++

2491

++	if (blk_rq_pos(__bfqq->next_rq) < sector)

2492

++		node = rb_next(&__bfqq->pos_node);

2493

++	else

2494

++		node = rb_prev(&__bfqq->pos_node);

2495

++	if (node == NULL)

2496

++		return NULL;

2497

++

2498

++	__bfqq = rb_entry(node, struct bfq_queue, pos_node);

2499

++	if (bfq_rq_close(bfqd, __bfqq->next_rq))

2500

++		return __bfqq;

2501

++

2502

++	return NULL;

2503

++}

2504

++

2505

++/*

2506

++ * bfqd - obvious

2507

++ * cur_bfqq - passed in so that we don't decide that the current queue

2508

++ *            is closely cooperating with itself.

2509

++ *

2510

++ * We are assuming that cur_bfqq has dispatched at least one request,

2511

++ * and that bfqd->last_position reflects a position on the disk associated

2512

++ * with the I/O issued by cur_bfqq.

2513

++ */

2514

++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,

2515

++					      struct bfq_queue *cur_bfqq)

2516

++{

2517

++	struct bfq_queue *bfqq;

2518

++

2519

++	if (bfq_class_idle(cur_bfqq))

2520

++		return NULL;

2521

++	if (!bfq_bfqq_sync(cur_bfqq))

2522

++		return NULL;

2523

++	if (BFQQ_SEEKY(cur_bfqq))

2524

++		return NULL;

2525

++

2526

++	/* If device has only one backlogged bfq_queue, don't search. */

2527

++	if (bfqd->busy_queues == 1)

2528

++		return NULL;

2529

++

2530

++	/*

2531

++	 * We should notice if some of the queues are cooperating, e.g.

2532

++	 * working closely on the same area of the disk. In that case,

2533

++	 * we can group them together and don't waste time idling.

2534

++	 */

2535

++	bfqq = bfqq_close(bfqd);

2536

++	if (bfqq == NULL || bfqq == cur_bfqq)

2537

++		return NULL;

2538

++

2539

++	/*

2540

++	 * Do not merge queues from different bfq_groups.

2541

++	*/

2542

++	if (bfqq->entity.parent != cur_bfqq->entity.parent)

2543

++		return NULL;

2544

++

2545

++	/*

2546

++	 * It only makes sense to merge sync queues.

2547

++	 */

2548

++	if (!bfq_bfqq_sync(bfqq))

2549

++		return NULL;

2550

++	if (BFQQ_SEEKY(bfqq))

2551

++		return NULL;

2552

++

2553

++	/*

2554

++	 * Do not merge queues of different priority classes.

2555

++	 */

2556

++	if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))

2557

++		return NULL;

2558

++

2559

++	return bfqq;

2560

++}

2561

++

2562

++/*

2563

++ * If enough samples have been computed, return the current max budget

2564

++ * stored in bfqd, which is dynamically updated according to the

2565

++ * estimated disk peak rate; otherwise return the default max budget

2566

++ */

2567

++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)

2568

++{

2569

++	if (bfqd->budgets_assigned < 194)

2570

++		return bfq_default_max_budget;

2571

++	else

2572

++		return bfqd->bfq_max_budget;

2573

++}

2574

++

2575

++/*

2576

++ * Return min budget, which is a fraction of the current or default

2577

++ * max budget (trying with 1/32)

2578

++ */

2579

++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)

2580

++{

2581

++	if (bfqd->budgets_assigned < 194)

2582

++		return bfq_default_max_budget / 32;

2583

++	else

2584

++		return bfqd->bfq_max_budget / 32;

2585

++}

2586

++

2587

++static void bfq_arm_slice_timer(struct bfq_data *bfqd)

2588

++{

2589

++	struct bfq_queue *bfqq = bfqd->in_service_queue;

2590

++	struct bfq_io_cq *bic;

2591

++	unsigned long sl;

2592

++

2593

++	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

2594

++

2595

++	/* Processes have exited, don't wait. */

2596

++	bic = bfqd->in_service_bic;

2597

++	if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)

2598

++		return;

2599

++

2600

++	bfq_mark_bfqq_wait_request(bfqq);

2601

++

2602

++	/*

2603

++	 * We don't want to idle for seeks, but we do want to allow

2604

++	 * fair distribution of slice time for a process doing back-to-back

2605

++	 * seeks. So allow a little bit of time for him to submit a new rq.

2606

++	 *

2607

++	 * To prevent processes with (partly) seeky workloads from

2608

++	 * being too ill-treated, grant them a small fraction of the

2609

++	 * assigned budget before reducing the waiting time to

2610

++	 * BFQ_MIN_TT. This happened to help reduce latency.

2611

++	 */

2612

++	sl = bfqd->bfq_slice_idle;

2613

++	/*

2614

++	 * Unless the queue is being weight-raised or the scenario is

2615

++	 * asymmetric, grant only minimum idle time if the queue either

2616

++	 * has been seeky for long enough or has already proved to be

2617

++	 * constantly seeky.

2618

++	 */

2619

++	if (bfq_sample_valid(bfqq->seek_samples) &&

2620

++	    ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >

2621

++				  bfq_max_budget(bfqq->bfqd) / 8) ||

2622

++	      bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&

2623

++	    symmetric_scenario)

2624

++		sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));

2625

++	else if (bfqq->wr_coeff > 1)

2626

++		sl = sl * 3;

2627

++	bfqd->last_idling_start = ktime_get();

2628

++	mod_timer(&bfqd->idle_slice_timer, jiffies + sl);

2629

++	bfq_log(bfqd, "arm idle: %u/%u ms",

2630

++		jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));

2631

++}

2632

++

2633

++/*

2634

++ * Set the maximum time for the in-service queue to consume its

2635

++ * budget. This prevents seeky processes from lowering the disk

2636

++ * throughput (always guaranteed with a time slice scheme as in CFQ).

2637

++ */

2638

++static void bfq_set_budget_timeout(struct bfq_data *bfqd)

2639

++{

2640

++	struct bfq_queue *bfqq = bfqd->in_service_queue;

2641

++	unsigned int timeout_coeff;

2642

++	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)

2643

++		timeout_coeff = 1;

2644

++	else

2645

++		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;

2646

++

2647

++	bfqd->last_budget_start = ktime_get();

2648

++

2649

++	bfq_clear_bfqq_budget_new(bfqq);

2650

++	bfqq->budget_timeout = jiffies +

2651

++		bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;

2652

++

2653

++	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",

2654

++		jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *

2655

++		timeout_coeff));

2656

++}

2657

++

2658

++/*

2659

++ * Move request from internal lists to the request queue dispatch list.

2660

++ */

2661

++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)

2662

++{

2663

++	struct bfq_data *bfqd = q->elevator->elevator_data;

2664

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

2665

++

2666

++	/*

2667

++	 * For consistency, the next instruction should have been executed

2668

++	 * after removing the request from the queue and dispatching it.

2669

++	 * We execute instead this instruction before bfq_remove_request()

2670

++	 * (and hence introduce a temporary inconsistency), for efficiency.

2671

++	 * In fact, in a forced_dispatch, this prevents two counters related

2672

++	 * to bfqq->dispatched from being uselessly decremented if bfqq

2673

++	 * is not in service, and then to be incremented again after

2674

++	 * incrementing bfqq->dispatched.

2675

++	 */

2676

++	bfqq->dispatched++;

2677

++	bfq_remove_request(rq);

2678

++	elv_dispatch_sort(q, rq);

2679

++

2680

++	if (bfq_bfqq_sync(bfqq))

2681

++		bfqd->sync_flight++;

2682

++}

2683

++

2684

++/*

2685

++ * Return expired entry, or NULL to just start from scratch in rbtree.

2686

++ */

2687

++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)

2688

++{

2689

++	struct request *rq = NULL;

2690

++

2691

++	if (bfq_bfqq_fifo_expire(bfqq))

2692

++		return NULL;

2693

++

2694

++	bfq_mark_bfqq_fifo_expire(bfqq);

2695

++

2696

++	if (list_empty(&bfqq->fifo))

2697

++		return NULL;

2698

++

2699

++	rq = rq_entry_fifo(bfqq->fifo.next);

2700

++

2701

++	if (time_before(jiffies, rq->fifo_time))

2702

++		return NULL;

2703

++

2704

++	return rq;

2705

++}

2706

++

2707

++/* Must be called with the queue_lock held. */

2708

++static int bfqq_process_refs(struct bfq_queue *bfqq)

2709

++{

2710

++	int process_refs, io_refs;

2711

++

2712

++	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];

2713

++	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;

2714

++	BUG_ON(process_refs < 0);

2715

++	return process_refs;

2716

++}

2717

++

2718

++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

2719

++{

2720

++	int process_refs, new_process_refs;

2721

++	struct bfq_queue *__bfqq;

2722

++

2723

++	/*

2724

++	 * If there are no process references on the new_bfqq, then it is

2725

++	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain

2726

++	 * may have dropped their last reference (not just their last process

2727

++	 * reference).

2728

++	 */

2729

++	if (!bfqq_process_refs(new_bfqq))

2730

++		return;

2731

++

2732

++	/* Avoid a circular list and skip interim queue merges. */

2733

++	while ((__bfqq = new_bfqq->new_bfqq)) {

2734

++		if (__bfqq == bfqq)

2735

++			return;

2736

++		new_bfqq = __bfqq;

2737

++	}

2738

++

2739

++	process_refs = bfqq_process_refs(bfqq);

2740

++	new_process_refs = bfqq_process_refs(new_bfqq);

2741

++	/*

2742

++	 * If the process for the bfqq has gone away, there is no

2743

++	 * sense in merging the queues.

2744

++	 */

2745

++	if (process_refs == 0 || new_process_refs == 0)

2746

++		return;

2747

++

2748

++	/*

2749

++	 * Merge in the direction of the lesser amount of work.

2750

++	 */

2751

++	if (new_process_refs >= process_refs) {

2752

++		bfqq->new_bfqq = new_bfqq;

2753

++		atomic_add(process_refs, &new_bfqq->ref);

2754

++	} else {

2755

++		new_bfqq->new_bfqq = bfqq;

2756

++		atomic_add(new_process_refs, &bfqq->ref);

2757

++	}

2758

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",

2759

++		new_bfqq->pid);

2760

++}

2761

++

2762

++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)

2763

++{

2764

++	struct bfq_entity *entity = &bfqq->entity;

2765

++	return entity->budget - entity->service;

2766

++}

2767

++

2768

++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)

2769

++{

2770

++	BUG_ON(bfqq != bfqd->in_service_queue);

2771

++

2772

++	__bfq_bfqd_reset_in_service(bfqd);

2773

++

2774

++	/*

2775

++	 * If this bfqq is shared between multiple processes, check

2776

++	 * to make sure that those processes are still issuing I/Os

2777

++	 * within the mean seek distance. If not, it may be time to

2778

++	 * break the queues apart again.

2779

++	 */

2780

++	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))

2781

++		bfq_mark_bfqq_split_coop(bfqq);

2782

++

2783

++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {

2784

++		/*

2785

++		 * Overloading budget_timeout field to store the time

2786

++		 * at which the queue remains with no backlog; used by

2787

++		 * the weight-raising mechanism.

2788

++		 */

2789

++		bfqq->budget_timeout = jiffies;

2790

++		bfq_del_bfqq_busy(bfqd, bfqq, 1);

2791

++	} else {

2792

++		bfq_activate_bfqq(bfqd, bfqq);

2793

++		/*

2794

++		 * Resort priority tree of potential close cooperators.

2795

++		 */

2796

++		bfq_rq_pos_tree_add(bfqd, bfqq);

2797

++	}

2798

++}

2799

++

2800

++/**

2801

++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.

2802

++ * @bfqd: device data.

2803

++ * @bfqq: queue to update.

2804

++ * @reason: reason for expiration.

2805

++ *

2806

++ * Handle the feedback on @bfqq budget.  See the body for detailed

2807

++ * comments.

2808

++ */

2809

++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,

2810

++				     struct bfq_queue *bfqq,

2811

++				     enum bfqq_expiration reason)

2812

++{

2813

++	struct request *next_rq;

2814

++	unsigned long budget, min_budget;

2815

++

2816

++	budget = bfqq->max_budget;

2817

++	min_budget = bfq_min_budget(bfqd);

2818

++

2819

++	BUG_ON(bfqq != bfqd->in_service_queue);

2820

++

2821

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",

2822

++		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));

2823

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",

2824

++		budget, bfq_min_budget(bfqd));

2825

++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",

2826

++		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

2827

++

2828

++	if (bfq_bfqq_sync(bfqq)) {

2829

++		switch (reason) {

2830

++		/*

2831

++		 * Caveat: in all the following cases we trade latency

2832

++		 * for throughput.

2833

++		 */

2834

++		case BFQ_BFQQ_TOO_IDLE:

2835

++			/*

2836

++			 * This is the only case where we may reduce

2837

++			 * the budget: if there is no request of the

2838

++			 * process still waiting for completion, then

2839

++			 * we assume (tentatively) that the timer has

2840

++			 * expired because the batch of requests of

2841

++			 * the process could have been served with a

2842

++			 * smaller budget.  Hence, betting that

2843

++			 * process will behave in the same way when it

2844

++			 * becomes backlogged again, we reduce its

2845

++			 * next budget.  As long as we guess right,

2846

++			 * this budget cut reduces the latency

2847

++			 * experienced by the process.

2848

++			 *

2849

++			 * However, if there are still outstanding

2850

++			 * requests, then the process may have not yet

2851

++			 * issued its next request just because it is

2852

++			 * still waiting for the completion of some of

2853

++			 * the still outstanding ones.  So in this

2854

++			 * subcase we do not reduce its budget, on the

2855

++			 * contrary we increase it to possibly boost

2856

++			 * the throughput, as discussed in the

2857

++			 * comments to the BUDGET_TIMEOUT case.

2858

++			 */

2859

++			if (bfqq->dispatched > 0) /* still outstanding reqs */

2860

++				budget = min(budget * 2, bfqd->bfq_max_budget);

2861

++			else {

2862

++				if (budget > 5 * min_budget)

2863

++					budget -= 4 * min_budget;

2864

++				else

2865

++					budget = min_budget;

2866

++			}

2867

++			break;

2868

++		case BFQ_BFQQ_BUDGET_TIMEOUT:

2869

++			/*

2870

++			 * We double the budget here because: 1) it

2871

++			 * gives the chance to boost the throughput if

2872

++			 * this is not a seeky process (which may have

2873

++			 * bumped into this timeout because of, e.g.,

2874

++			 * ZBR), 2) together with charge_full_budget

2875

++			 * it helps give seeky processes higher

2876

++			 * timestamps, and hence be served less

2877

++			 * frequently.

2878

++			 */

2879

++			budget = min(budget * 2, bfqd->bfq_max_budget);

2880

++			break;

2881

++		case BFQ_BFQQ_BUDGET_EXHAUSTED:

2882

++			/*

2883

++			 * The process still has backlog, and did not

2884

++			 * let either the budget timeout or the disk

2885

++			 * idling timeout expire. Hence it is not

2886

++			 * seeky, has a short thinktime and may be

2887

++			 * happy with a higher budget too. So

2888

++			 * definitely increase the budget of this good

2889

++			 * candidate to boost the disk throughput.

2890

++			 */

2891

++			budget = min(budget * 4, bfqd->bfq_max_budget);

2892

++			break;

2893

++		case BFQ_BFQQ_NO_MORE_REQUESTS:

2894

++		       /*

2895

++			* Leave the budget unchanged.

2896

++			*/

2897

++		default:

2898

++			return;

2899

++		}

2900

++	} else /* async queue */

2901

++	    /* async queues always get the maximum possible budget

2902

++	     * (their ability to dispatch is limited by

2903

++	     * @bfqd->bfq_max_budget_async_rq).

2904

++	     */

2905

++		budget = bfqd->bfq_max_budget;

2906

++

2907

++	bfqq->max_budget = budget;

2908

++

2909

++	if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&

2910

++	    bfqq->max_budget > bfqd->bfq_max_budget)

2911

++		bfqq->max_budget = bfqd->bfq_max_budget;

2912

++

2913

++	/*

2914

++	 * Make sure that we have enough budget for the next request.

2915

++	 * Since the finish time of the bfqq must be kept in sync with

2916

++	 * the budget, be sure to call __bfq_bfqq_expire() after the

2917

++	 * update.

2918

++	 */

2919

++	next_rq = bfqq->next_rq;

2920

++	if (next_rq != NULL)

2921

++		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,

2922

++					    bfq_serv_to_charge(next_rq, bfqq));

2923

++	else

2924

++		bfqq->entity.budget = bfqq->max_budget;

2925

++

2926

++	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",

2927

++			next_rq != NULL ? blk_rq_sectors(next_rq) : 0,

2928

++			bfqq->entity.budget);

2929

++}

2930

++

2931

++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)

2932

++{

2933

++	unsigned long max_budget;

2934

++

2935

++	/*

2936

++	 * The max_budget calculated when autotuning is equal to the

2937

++	 * amount of sectors transferred in timeout_sync at the

2938

++	 * estimated peak rate.

2939

++	 */

2940

++	max_budget = (unsigned long)(peak_rate * 1000 *

2941

++				     timeout >> BFQ_RATE_SHIFT);

2942

++

2943

++	return max_budget;

2944

++}

2945

++

2946

++/*

2947

++ * In addition to updating the peak rate, checks whether the process

2948

++ * is "slow", and returns 1 if so. This slow flag is used, in addition

2949

++ * to the budget timeout, to reduce the amount of service provided to

2950

++ * seeky processes, and hence reduce their chances to lower the

2951

++ * throughput. See the code for more details.

2952

++ */

2953

++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,

2954

++				int compensate, enum bfqq_expiration reason)

2955

++{

2956

++	u64 bw, usecs, expected, timeout;

2957

++	ktime_t delta;

2958

++	int update = 0;

2959

++

2960

++	if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))

2961

++		return 0;

2962

++

2963

++	if (compensate)

2964

++		delta = bfqd->last_idling_start;

2965

++	else

2966

++		delta = ktime_get();

2967

++	delta = ktime_sub(delta, bfqd->last_budget_start);

2968

++	usecs = ktime_to_us(delta);

2969

++

2970

++	/* Don't trust short/unrealistic values. */

2971

++	if (usecs < 100 || usecs >= LONG_MAX)

2972

++		return 0;

2973

++

2974

++	/*

2975

++	 * Calculate the bandwidth for the last slice.  We use a 64 bit

2976

++	 * value to store the peak rate, in sectors per usec in fixed

2977

++	 * point math.  We do so to have enough precision in the estimate

2978

++	 * and to avoid overflows.

2979

++	 */

2980

++	bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;

2981

++	do_div(bw, (unsigned long)usecs);

2982

++

2983

++	timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

2984

++

2985

++	/*

2986

++	 * Use only long (> 20ms) intervals to filter out spikes for

2987

++	 * the peak rate estimation.

2988

++	 */

2989

++	if (usecs > 20000) {

2990

++		if (bw > bfqd->peak_rate ||

2991

++		   (!BFQQ_SEEKY(bfqq) &&

2992

++		    reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {

2993

++			bfq_log(bfqd, "measured bw =%llu", bw);

2994

++			/*

2995

++			 * To smooth oscillations use a low-pass filter with

2996

++			 * alpha=7/8, i.e.,

2997

++			 * new_rate = (7/8) * old_rate + (1/8) * bw

2998

++			 */

2999

++			do_div(bw, 8);

3000

++			if (bw == 0)

3001

++				return 0;

3002

++			bfqd->peak_rate *= 7;

3003

++			do_div(bfqd->peak_rate, 8);

3004

++			bfqd->peak_rate += bw;

3005

++			update = 1;

3006

++			bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);

3007

++		}

3008

++

3009

++		update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;

3010

++

3011

++		if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)

3012

++			bfqd->peak_rate_samples++;

3013

++

3014

++		if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&

3015

++		    update) {

3016

++			int dev_type = blk_queue_nonrot(bfqd->queue);

3017

++			if (bfqd->bfq_user_max_budget == 0) {

3018

++				bfqd->bfq_max_budget =

3019

++					bfq_calc_max_budget(bfqd->peak_rate,

3020

++							    timeout);

3021

++				bfq_log(bfqd, "new max_budget=%lu",

3022

++					bfqd->bfq_max_budget);

3023

++			}

3024

++			if (bfqd->device_speed == BFQ_BFQD_FAST &&

3025

++			    bfqd->peak_rate < device_speed_thresh[dev_type]) {

3026

++				bfqd->device_speed = BFQ_BFQD_SLOW;

3027

++				bfqd->RT_prod = R_slow[dev_type] *

3028

++						T_slow[dev_type];

3029

++			} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&

3030

++			    bfqd->peak_rate > device_speed_thresh[dev_type]) {

3031

++				bfqd->device_speed = BFQ_BFQD_FAST;

3032

++				bfqd->RT_prod = R_fast[dev_type] *

3033

++						T_fast[dev_type];

3034

++			}

3035

++		}

3036

++	}

3037

++

3038

++	/*

3039

++	 * If the process has been served for a too short time

3040

++	 * interval to let its possible sequential accesses prevail on

3041

++	 * the initial seek time needed to move the disk head on the

3042

++	 * first sector it requested, then give the process a chance

3043

++	 * and for the moment return false.

3044

++	 */

3045

++	if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)

3046

++		return 0;

3047

++

3048

++	/*

3049

++	 * A process is considered ``slow'' (i.e., seeky, so that we

3050

++	 * cannot treat it fairly in the service domain, as it would

3051

++	 * slow down too much the other processes) if, when a slice

3052

++	 * ends for whatever reason, it has received service at a

3053

++	 * rate that would not be high enough to complete the budget

3054

++	 * before the budget timeout expiration.

3055

++	 */

3056

++	expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;

3057

++

3058

++	/*

3059

++	 * Caveat: processes doing IO in the slower disk zones will

3060

++	 * tend to be slow(er) even if not seeky. And the estimated

3061

++	 * peak rate will actually be an average over the disk

3062

++	 * surface. Hence, to not be too harsh with unlucky processes,

3063

++	 * we keep a budget/3 margin of safety before declaring a

3064

++	 * process slow.

3065

++	 */

3066

++	return expected > (4 * bfqq->entity.budget) / 3;

3067

++}

3068

++

3069

++/*

3070

++ * To be deemed as soft real-time, an application must meet two

3071

++ * requirements. First, the application must not require an average

3072

++ * bandwidth higher than the approximate bandwidth required to playback or

3073

++ * record a compressed high-definition video.

3074

++ * The next function is invoked on the completion of the last request of a

3075

++ * batch, to compute the next-start time instant, soft_rt_next_start, such

3076

++ * that, if the next request of the application does not arrive before

3077

++ * soft_rt_next_start, then the above requirement on the bandwidth is met.

3078

++ *

3079

++ * The second requirement is that the request pattern of the application is

3080

++ * isochronous, i.e., that, after issuing a request or a batch of requests,

3081

++ * the application stops issuing new requests until all its pending requests

3082

++ * have been completed. After that, the application may issue a new batch,

3083

++ * and so on.

3084

++ * For this reason the next function is invoked to compute

3085

++ * soft_rt_next_start only for applications that meet this requirement,

3086

++ * whereas soft_rt_next_start is set to infinity for applications that do

3087

++ * not.

3088

++ *

3089

++ * Unfortunately, even a greedy application may happen to behave in an

3090

++ * isochronous way if the CPU load is high. In fact, the application may

3091

++ * stop issuing requests while the CPUs are busy serving other processes,

3092

++ * then restart, then stop again for a while, and so on. In addition, if

3093

++ * the disk achieves a low enough throughput with the request pattern

3094

++ * issued by the application (e.g., because the request pattern is random

3095

++ * and/or the device is slow), then the application may meet the above

3096

++ * bandwidth requirement too. To prevent such a greedy application from being

3097

++ * deemed as soft real-time, a further rule is used in the computation of

3098

++ * soft_rt_next_start: soft_rt_next_start must be higher than the current

3099

++ * time plus the maximum time for which the arrival of a request is waited

3100

++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.

3101

++ * This filters out greedy applications, as the latter issue instead their

3102

++ * next request as soon as possible after the last one has been completed

3103

++ * (in contrast, when a batch of requests is completed, a soft real-time

3104

++ * application spends some time processing data).

3105

++ *

3106

++ * Unfortunately, the last filter may easily generate false positives if

3107

++ * only bfqd->bfq_slice_idle is used as a reference time interval and one

3108

++ * or both the following cases occur:

3109

++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher

3110

++ *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with

3111

++ *    HZ=100.

3112

++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing

3113

++ *    for a while, then suddenly 'jump' by several units to recover the lost

3114

++ *    increments. This seems to happen, e.g., inside virtual machines.

3115

++ * To address this issue, we do not use as a reference time interval just

3116

++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In

3117

++ * particular we add the minimum number of jiffies for which the filter

3118

++ * seems to be quite precise also in embedded systems and KVM/QEMU virtual

3119

++ * machines.

3120

++ */

3121

++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,

3122

++						       struct bfq_queue *bfqq)

3123

++{

3124

++	return max(bfqq->last_idle_bklogged +

3125

++		   HZ * bfqq->service_from_backlogged /

3126

++		   bfqd->bfq_wr_max_softrt_rate,

3127

++		   jiffies + bfqq->bfqd->bfq_slice_idle + 4);

3128

++}

3129

++

3130

++/*

3131

++ * Return the largest-possible time instant such that, for as long as possible,

3132

++ * the current time will be lower than this time instant according to the macro

3133

++ * time_is_before_jiffies().

3134

++ */

3135

++static inline unsigned long bfq_infinity_from_now(unsigned long now)

3136

++{

3137

++	return now + ULONG_MAX / 2;

3138

++}

3139

++

3140

++/**

3141

++ * bfq_bfqq_expire - expire a queue.

3142

++ * @bfqd: device owning the queue.

3143

++ * @bfqq: the queue to expire.

3144

++ * @compensate: if true, compensate for the time spent idling.

3145

++ * @reason: the reason causing the expiration.

3146

++ *

3147

++ *

3148

++ * If the process associated to the queue is slow (i.e., seeky), or in

3149

++ * case of budget timeout, or, finally, if it is async, we

3150

++ * artificially charge it an entire budget (independently of the

3151

++ * actual service it received). As a consequence, the queue will get

3152

++ * higher timestamps than the correct ones upon reactivation, and

3153

++ * hence it will be rescheduled as if it had received more service

3154

++ * than what it actually received. In the end, this class of processes

3155

++ * will receive less service in proportion to how slowly they consume

3156

++ * their budgets (and hence how seriously they tend to lower the

3157

++ * throughput).

3158

++ *

3159

++ * In contrast, when a queue expires because it has been idling for

3160

++ * too much or because it exhausted its budget, we do not touch the

3161

++ * amount of service it has received. Hence when the queue will be

3162

++ * reactivated and its timestamps updated, the latter will be in sync

3163

++ * with the actual service received by the queue until expiration.

3164

++ *

3165

++ * Charging a full budget to the first type of queues and the exact

3166

++ * service to the others has the effect of using the WF2Q+ policy to

3167

++ * schedule the former on a timeslice basis, without violating the

3168

++ * service domain guarantees of the latter.

3169

++ */

3170

++static void bfq_bfqq_expire(struct bfq_data *bfqd,

3171

++			    struct bfq_queue *bfqq,

3172

++			    int compensate,

3173

++			    enum bfqq_expiration reason)

3174

++{

3175

++	int slow;

3176

++	BUG_ON(bfqq != bfqd->in_service_queue);

3177

++

3178

++	/* Update disk peak rate for autotuning and check whether the

3179

++	 * process is slow (see bfq_update_peak_rate).

3180

++	 */

3181

++	slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);

3182

++

3183

++	/*

3184

++	 * As explained above, 'punish' slow (i.e., seeky), timed-out

3185

++	 * and async queues, to favor sequential sync workloads.

3186

++	 *

3187

++	 * Processes doing I/O in the slower disk zones will tend to be

3188

++	 * slow(er) even if not seeky. Hence, since the estimated peak

3189

++	 * rate is actually an average over the disk surface, these

3190

++	 * processes may timeout just for bad luck. To avoid punishing

3191

++	 * them we do not charge a full budget to a process that

3192

++	 * succeeded in consuming at least 2/3 of its budget.

3193

++	 */

3194

++	if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&

3195

++		     bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3))

3196

++		bfq_bfqq_charge_full_budget(bfqq);

3197

++

3198

++	bfqq->service_from_backlogged += bfqq->entity.service;

3199

++

3200

++	if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&

3201

++	    !bfq_bfqq_constantly_seeky(bfqq)) {

3202

++		bfq_mark_bfqq_constantly_seeky(bfqq);

3203

++		if (!blk_queue_nonrot(bfqd->queue))

3204

++			bfqd->const_seeky_busy_in_flight_queues++;

3205

++	}

3206

++

3207

++	if (reason == BFQ_BFQQ_TOO_IDLE &&

3208

++	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10 )

3209

++		bfq_clear_bfqq_IO_bound(bfqq);

3210

++

3211

++	if (bfqd->low_latency && bfqq->wr_coeff == 1)

3212

++		bfqq->last_wr_start_finish = jiffies;

3213

++

3214

++	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&

3215

++	    RB_EMPTY_ROOT(&bfqq->sort_list)) {

3216

++		/*

3217

++		 * If we get here, and there are no outstanding requests,

3218

++		 * then the request pattern is isochronous (see the comments

3219

++		 * to the function bfq_bfqq_softrt_next_start()). Hence we

3220

++		 * can compute soft_rt_next_start. If, instead, the queue

3221

++		 * still has outstanding requests, then we have to wait

3222

++		 * for the completion of all the outstanding requests to

3223

++		 * discover whether the request pattern is actually

3224

++		 * isochronous.

3225

++		 */

3226

++		if (bfqq->dispatched == 0)

3227

++			bfqq->soft_rt_next_start =

3228

++				bfq_bfqq_softrt_next_start(bfqd, bfqq);

3229

++		else {

3230

++			/*

3231

++			 * The application is still waiting for the

3232

++			 * completion of one or more requests:

3233

++			 * prevent it from possibly being incorrectly

3234

++			 * deemed as soft real-time by setting its

3235

++			 * soft_rt_next_start to infinity. In fact,

3236

++			 * without this assignment, the application

3237

++			 * would be incorrectly deemed as soft

3238

++			 * real-time if:

3239

++			 * 1) it issued a new request before the

3240

++			 *    completion of all its in-flight

3241

++			 *    requests, and

3242

++			 * 2) at that time, its soft_rt_next_start

3243

++			 *    happened to be in the past.

3244

++			 */

3245

++			bfqq->soft_rt_next_start =

3246

++				bfq_infinity_from_now(jiffies);

3247

++			/*

3248

++			 * Schedule an update of soft_rt_next_start to when

3249

++			 * the task may be discovered to be isochronous.

3250

++			 */

3251

++			bfq_mark_bfqq_softrt_update(bfqq);

3252

++		}

3253

++	}

3254

++

3255

++	bfq_log_bfqq(bfqd, bfqq,

3256

++		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,

3257

++		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

3258

++

3259

++	/*

3260

++	 * Increase, decrease or leave budget unchanged according to

3261

++	 * reason.

3262

++	 */

3263

++	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);

3264

++	__bfq_bfqq_expire(bfqd, bfqq);

3265

++}

3266

++

3267

++/*

3268

++ * Budget timeout is not implemented through a dedicated timer, but

3269

++ * just checked on request arrivals and completions, as well as on

3270

++ * idle timer expirations.

3271

++ */

3272

++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)

3273

++{

3274

++	if (bfq_bfqq_budget_new(bfqq) ||

3275

++	    time_before(jiffies, bfqq->budget_timeout))

3276

++		return 0;

3277

++	return 1;

3278

++}

3279

++

3280

++/*

3281

++ * If we expire a queue that is waiting for the arrival of a new

3282

++ * request, we may prevent the fictitious timestamp back-shifting that

3283

++ * allows the guarantees of the queue to be preserved (see [1] for

3284

++ * this tricky aspect). Hence we return true only if this condition

3285

++ * does not hold, or if the queue is slow enough to deserve only to be

3286

++ * kicked off for preserving a high throughput.

3287

++*/

3288

++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)

3289

++{

3290

++	bfq_log_bfqq(bfqq->bfqd, bfqq,

3291

++		"may_budget_timeout: wait_request %d left %d timeout %d",

3292

++		bfq_bfqq_wait_request(bfqq),

3293

++			bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3,

3294

++		bfq_bfqq_budget_timeout(bfqq));

3295

++

3296

++	return (!bfq_bfqq_wait_request(bfqq) ||

3297

++		bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3)

3298

++		&&

3299

++		bfq_bfqq_budget_timeout(bfqq);

3300

++}

3301

++

3302

++/*

3303

++ * Device idling is allowed only for the queues for which this function

3304

++ * returns true. For this reason, the return value of this function plays a

3305

++ * critical role for both throughput boosting and service guarantees. The

3306

++ * return value is computed through a logical expression. In this rather

3307

++ * long comment, we try to briefly describe all the details and motivations

3308

++ * behind the components of this logical expression.

3309

++ *

3310

++ * First, the expression is false if bfqq is not sync, or if: bfqq happened

3311

++ * to become active during a large burst of queue activations, and the

3312

++ * pattern of requests bfqq contains boosts the throughput if bfqq is

3313

++ * expired. In fact, queues that became active during a large burst benefit

3314

++ * only from throughput, as discussed in the comments to bfq_handle_burst.

3315

++ * In this respect, expiring bfqq certainly boosts the throughput on NCQ-

3316

++ * capable flash-based devices, whereas, on rotational devices, it boosts

3317

++ * the throughput only if bfqq contains random requests.

3318

++ *

3319

++ * On the opposite end, if (a) bfqq is sync, (b) the above burst-related

3320

++ * condition does not hold, and (c) bfqq is being weight-raised, then the

3321

++ * expression always evaluates to true, as device idling is instrumental

3322

++ * for preserving low-latency guarantees (see [1]). If, instead, conditions

3323

++ * (a) and (b) do hold, but (c) does not, then the expression evaluates to

3324

++ * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and

3325

++ * (2) at least one of the following two conditions holds.

3326

++ * The first condition is that the device is not performing NCQ, because

3327

++ * idling the device most certainly boosts the throughput if this condition

3328

++ * holds and bfqq is I/O-bound and has been granted a non-null idle window.

3329

++ * The second compound condition is made of the logical AND of two components.

3330

++ *

3331

++ * The first component is true only if there is no weight-raised busy

3332

++ * queue. This guarantees that the device is not idled for a sync non-

3333

++ * weight-raised queue when there are busy weight-raised queues. The former

3334

++ * is then expired immediately if empty. Combined with the timestamping

3335

++ * rules of BFQ (see [1] for details), this causes sync non-weight-raised

3336

++ * queues to get a lower number of requests served, and hence to ask for a

3337

++ * lower number of requests from the request pool, before the busy weight-

3338

++ * raised queues get served again.

3339

++ *

3340

++ * This is beneficial for the processes associated with weight-raised

3341

++ * queues, when the request pool is saturated (e.g., in the presence of

3342

++ * write hogs). In fact, if the processes associated with the other queues

3343

++ * ask for requests at a lower rate, then weight-raised processes have a

3344

++ * higher probability to get a request from the pool immediately (or at

3345

++ * least soon) when they need one. Hence they have a higher probability to

3346

++ * actually get a fraction of the disk throughput proportional to their

3347

++ * high weight. This is especially true with NCQ-capable drives, which

3348

++ * enqueue several requests in advance and further reorder internally-

3349

++ * queued requests.

3350

++ *

3351

++ * In the end, mistreating non-weight-raised queues when there are busy

3352

++ * weight-raised queues seems to mitigate starvation problems in the

3353

++ * presence of heavy write workloads and NCQ, and hence to guarantee a

3354

++ * higher application and system responsiveness in these hostile scenarios.

3355

++ *

3356

++ * If the first component of the compound condition is instead true, i.e.,

3357

++ * there is no weight-raised busy queue, then the second component of the

3358

++ * compound condition takes into account service-guarantee and throughput

3359

++ * issues related to NCQ (recall that the compound condition is evaluated

3360

++ * only if the device is detected as supporting NCQ).

3361

++ *

3362

++ * As for service guarantees, allowing the drive to enqueue more than one

3363

++ * request at a time, and hence delegating de facto final scheduling

3364

++ * decisions to the drive's internal scheduler, causes loss of control on

3365

++ * the actual request service order. In this respect, when the drive is

3366

++ * allowed to enqueue more than one request at a time, the service

3367

++ * distribution enforced by the drive's internal scheduler is likely to

3368

++ * coincide with the desired device-throughput distribution only in the

3369

++ * following, perfectly symmetric, scenario:

3370

++ * 1) all active queues have the same weight,

3371

++ * 2) all active groups at the same level in the groups tree have the same

3372

++ *    weight,

3373

++ * 3) all active groups at the same level in the groups tree have the same

3374

++ *    number of children.

3375

++ *

3376

++ * Even in such a scenario, sequential I/O may still receive a preferential

3377

++ * treatment, but this is not likely to be a big issue with flash-based

3378

++ * devices, because of their non-dramatic loss of throughput with random

3379

++ * I/O. Things do differ with HDDs, for which additional care is taken, as

3380

++ * explained after completing the discussion for flash-based devices.

3381

++ *

3382

++ * Unfortunately, keeping the necessary state for evaluating exactly the

3383

++ * above symmetry conditions would be quite complex and time-consuming.

3384

++ * Therefore BFQ evaluates instead the following stronger sub-conditions,

3385

++ * for which it is much easier to maintain the needed state:

3386

++ * 1) all active queues have the same weight,

3387

++ * 2) all active groups have the same weight,

3388

++ * 3) all active groups have at most one active child each.

3389

++ * In particular, the last two conditions are always true if hierarchical

3390

++ * support and the cgroups interface are not enabled, hence no state needs

3391

++ * to be maintained in this case.

3392

++ *

3393

++ * According to the above considerations, the second component of the

3394

++ * compound condition evaluates to true if any of the above symmetry

3395

++ * sub-conditions does not hold, or the device is not flash-based. Therefore,

3396

++ * if also the first component is true, then idling is allowed for a sync

3397

++ * queue. These are the only sub-conditions considered if the device is

3398

++ * flash-based, as, for such a device, it is sensible to force idling only

3399

++ * for service-guarantee issues. In fact, as for throughput, idling

3400

++ * NCQ-capable flash-based devices would not boost the throughput even

3401

++ * with sequential I/O; rather it would lower the throughput in proportion

3402

++ * to how fast the device is. In the end, (only) if all the three

3403

++ * sub-conditions hold and the device is flash-based, the compound

3404

++ * condition evaluates to false and therefore no idling is performed.

3405

++ *

3406

++ * As already said, things change with a rotational device, where idling

3407

++ * boosts the throughput with sequential I/O (even with NCQ). Hence, for

3408

++ * such a device the second component of the compound condition evaluates

3409

++ * to true also if the following additional sub-condition does not hold:

3410

++ * the queue is constantly seeky. Unfortunately, this different behavior

3411

++ * with respect to flash-based devices causes an additional asymmetry: if

3412

++ * some sync queues enjoy idling and some other sync queues do not, then

3413

++ * the latter get a low share of the device throughput, simply because the

3414

++ * former get many requests served after being set as in service, whereas

3415

++ * the latter do not. As a consequence, to guarantee the desired throughput

3416

++ * distribution, on HDDs the compound expression evaluates to true (and

3417

++ * hence device idling is performed) also if the following last symmetry

3418

++ * condition does not hold: no other queue is benefiting from idling. Also

3419

++ * this last condition is actually replaced with a simpler-to-maintain and

3420

++ * stronger condition: there is no busy queue which is not constantly seeky

3421

++ * (and hence may also benefit from idling).

3422

++ *

3423

++ * To sum up, when all the required symmetry and throughput-boosting

3424

++ * sub-conditions hold, the second component of the compound condition

3425

++ * evaluates to false, and hence no idling is performed. This helps to

3426

++ * keep the drives' internal queues full on NCQ-capable devices, and hence

3427

++ * to boost the throughput, without causing 'almost' any loss of service

3428

++ * guarantees. The 'almost' follows from the fact that, if the internal

3429

++ * queue of one such device is filled while all the sub-conditions hold,

3430

++ * but at some point in time some sub-condition ceases to hold, then it may

3431

++ * become impossible to let requests be served in the new desired order

3432

++ * until all the requests already queued in the device have been served.

3433

++ */

3434

++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)

3435

++{

3436

++	struct bfq_data *bfqd = bfqq->bfqd;

3437

++#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \

3438

++				   bfqd->busy_in_flight_queues == \

3439

++				   bfqd->const_seeky_busy_in_flight_queues)

3440

++

3441

++#define cond_for_expiring_in_burst	(bfq_bfqq_in_large_burst(bfqq) && \

3442

++					 bfqd->hw_tag && \

3443

++					 (blk_queue_nonrot(bfqd->queue) || \

3444

++					  bfq_bfqq_constantly_seeky(bfqq)))

3445

++

3446

++/*

3447

++ * Condition for expiring a non-weight-raised queue (and hence not idling

3448

++ * the device).

3449

++ */

3450

++#define cond_for_expiring_non_wr  (bfqd->hw_tag && \

3451

++				   (bfqd->wr_busy_queues > 0 || \

3452

++				    (blk_queue_nonrot(bfqd->queue) || \

3453

++				      cond_for_seeky_on_ncq_hdd)))

3454

++

3455

++	return bfq_bfqq_sync(bfqq) &&

3456

++		!cond_for_expiring_in_burst &&

3457

++		(bfqq->wr_coeff > 1 || !symmetric_scenario ||

3458

++		 (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) &&

3459

++		  !cond_for_expiring_non_wr)

3460

++	);

3461

++}

3462

++

3463

++/*

3464

++ * If the in-service queue is empty but sync, and the function

3465

++ * bfq_bfqq_must_not_expire returns true, then:

3466

++ * 1) the queue must remain in service and cannot be expired, and

3467

++ * 2) the disk must be idled to wait for the possible arrival of a new

3468

++ *    request for the queue.

3469

++ * See the comments to the function bfq_bfqq_must_not_expire for the reasons

3470

++ * why performing device idling is the best choice to boost the throughput

3471

++ * and preserve service guarantees when bfq_bfqq_must_not_expire itself

3472

++ * returns true.

3473

++ */

3474

++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)

3475

++{

3476

++	struct bfq_data *bfqd = bfqq->bfqd;

3477

++

3478

++	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&

3479

++	       bfq_bfqq_must_not_expire(bfqq);

3480

++}

3481

++

3482

++/*

3483

++ * Select a queue for service.  If we have a current queue in service,

3484

++ * check whether to continue servicing it, or retrieve and set a new one.

3485

++ */

3486

++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

3487

++{

3488

++	struct bfq_queue *bfqq, *new_bfqq = NULL;

3489

++	struct request *next_rq;

3490

++	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;

3491

++

3492

++	bfqq = bfqd->in_service_queue;

3493

++	if (bfqq == NULL)

3494

++		goto new_queue;

3495

++

3496

++	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

3497

++

3498

++	/*

3499

++         * If another queue has a request waiting within our mean seek

3500

++         * distance, let it run. The expire code will check for close

3501

++         * cooperators and put the close queue at the front of the

3502

++         * service tree. If possible, merge the expiring queue with the

3503

++         * new bfqq.

3504

++         */

3505

++        new_bfqq = bfq_close_cooperator(bfqd, bfqq);

3506

++        if (new_bfqq != NULL && bfqq->new_bfqq == NULL)

3507

++                bfq_setup_merge(bfqq, new_bfqq);

3508

++

3509

++	if (bfq_may_expire_for_budg_timeout(bfqq) &&

3510

++	    !timer_pending(&bfqd->idle_slice_timer) &&

3511

++	    !bfq_bfqq_must_idle(bfqq))

3512

++		goto expire;

3513

++

3514

++	next_rq = bfqq->next_rq;

3515

++	/*

3516

++	 * If bfqq has requests queued and it has enough budget left to

3517

++	 * serve them, keep the queue, otherwise expire it.

3518

++	 */

3519

++	if (next_rq != NULL) {

3520

++		if (bfq_serv_to_charge(next_rq, bfqq) >

3521

++			bfq_bfqq_budget_left(bfqq)) {

3522

++			reason = BFQ_BFQQ_BUDGET_EXHAUSTED;

3523

++			goto expire;

3524

++		} else {

3525

++			/*

3526

++			 * The idle timer may be pending because we may

3527

++			 * not disable disk idling even when a new request

3528

++			 * arrives.

3529

++			 */

3530

++			if (timer_pending(&bfqd->idle_slice_timer)) {

3531

++				/*

3532

++				 * If we get here: 1) at least one new request

3533

++				 * has arrived but we have not disabled the

3534

++				 * timer because the request was too small,

3535

++				 * 2) then the block layer has unplugged

3536

++				 * the device, causing the dispatch to be

3537

++				 * invoked.

3538

++				 *

3539

++				 * Since the device is unplugged, now the

3540

++				 * requests are probably large enough to

3541

++				 * provide a reasonable throughput.

3542

++				 * So we disable idling.

3543

++				 */

3544

++				bfq_clear_bfqq_wait_request(bfqq);

3545

++				del_timer(&bfqd->idle_slice_timer);

3546

++			}

3547

++			if (new_bfqq == NULL)

3548

++				goto keep_queue;

3549

++			else

3550

++				goto expire;

3551

++		}

3552

++	}

3553

++

3554

++	/*

3555

++	 * No requests pending. However, if the in-service queue is idling

3556

++	 * for a new request, or has requests waiting for a completion and

3557

++	 * may idle after their completion, then keep it anyway.

3558

++	 */

3559

++	if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||

3560

++	    (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {

3561

++		bfqq = NULL;

3562

++		goto keep_queue;

3563

++	} else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {

3564

++		/*

3565

++		 * Expiring the queue because there is a close cooperator,

3566

++		 * cancel timer.

3567

++		 */

3568

++		bfq_clear_bfqq_wait_request(bfqq);

3569

++		del_timer(&bfqd->idle_slice_timer);

3570

++	}

3571

++

3572

++	reason = BFQ_BFQQ_NO_MORE_REQUESTS;

3573

++expire:

3574

++	bfq_bfqq_expire(bfqd, bfqq, 0, reason);

3575

++new_queue:

3576

++	bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);

3577

++	bfq_log(bfqd, "select_queue: new queue %d returned",

3578

++		bfqq != NULL ? bfqq->pid : 0);

3579

++keep_queue:

3580

++	return bfqq;

3581

++}

3582

++

3583

++static void bfq_update_wr_data(struct bfq_data *bfqd,

3584

++			       struct bfq_queue *bfqq)

3585

++{

3586

++	if (bfqq->wr_coeff > 1) { /* queue is being boosted */

3587

++		struct bfq_entity *entity = &bfqq->entity;

3588

++

3589

++		bfq_log_bfqq(bfqd, bfqq,

3590

++			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",

3591

++			jiffies_to_msecs(jiffies -

3592

++				bfqq->last_wr_start_finish),

3593

++			jiffies_to_msecs(bfqq->wr_cur_max_time),

3594

++			bfqq->wr_coeff,

3595

++			bfqq->entity.weight, bfqq->entity.orig_weight);

3596

++

3597

++		BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=

3598

++		       entity->orig_weight * bfqq->wr_coeff);

3599

++		if (entity->ioprio_changed)

3600

++			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

3601

++		/*

3602

++		 * If the queue was activated in a burst, or

3603

++		 * too much time has elapsed from the beginning

3604

++		 * of this weight-raising, then end weight raising.

3605

++		 */

3606

++		if (bfq_bfqq_in_large_burst(bfqq) ||

3607

++		    time_is_before_jiffies(bfqq->last_wr_start_finish +

3608

++					   bfqq->wr_cur_max_time)) {

3609

++			bfqq->last_wr_start_finish = jiffies;

3610

++			bfq_log_bfqq(bfqd, bfqq,

3611

++				     "wrais ending at %lu, rais_max_time %u",

3612

++				     bfqq->last_wr_start_finish,

3613

++				     jiffies_to_msecs(bfqq->wr_cur_max_time));

3614

++			bfq_bfqq_end_wr(bfqq);

3615

++			__bfq_entity_update_weight_prio(

3616

++				bfq_entity_service_tree(entity),

3617

++				entity);

3618

++		}

3619

++	}

3620

++}

3621

++

3622

++/*

3623

++ * Dispatch one request from bfqq, moving it to the request queue

3624

++ * dispatch list.

3625

++ */

3626

++static int bfq_dispatch_request(struct bfq_data *bfqd,

3627

++				struct bfq_queue *bfqq)

3628

++{

3629

++	int dispatched = 0;

3630

++	struct request *rq;

3631

++	unsigned long service_to_charge;

3632

++

3633

++	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));

3634

++

3635

++	/* Follow expired path, else get first next available. */

3636

++	rq = bfq_check_fifo(bfqq);

3637

++	if (rq == NULL)

3638

++		rq = bfqq->next_rq;

3639

++	service_to_charge = bfq_serv_to_charge(rq, bfqq);

3640

++

3641

++	if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {

3642

++		/*

3643

++		 * This may happen if the next rq is chosen in fifo order

3644

++		 * instead of sector order. The budget is properly

3645

++		 * dimensioned to be always sufficient to serve the next

3646

++		 * request only if it is chosen in sector order. The reason

3647

++		 * is that it would be quite inefficient and of little use

3648

++		 * to always make sure that the budget is large enough to

3649

++		 * serve even the possible next rq in fifo order.

3650

++		 * In fact, requests are seldom served in fifo order.

3651

++		 *

3652

++		 * Expire the queue for budget exhaustion, and make sure

3653

++		 * that the next act_budget is enough to serve the next

3654

++		 * request, even if it comes from the fifo expired path.

3655

++		 */

3656

++		bfqq->next_rq = rq;

3657

++		/*

3658

++		 * Since this dispatch has failed, make sure that

3659

++		 * a new one will be performed

3660

++		 */

3661

++		if (!bfqd->rq_in_driver)

3662

++			bfq_schedule_dispatch(bfqd);

3663

++		goto expire;

3664

++	}

3665

++

3666

++	/* Finally, insert request into driver dispatch list. */

3667

++	bfq_bfqq_served(bfqq, service_to_charge);

3668

++	bfq_dispatch_insert(bfqd->queue, rq);

3669

++

3670

++	bfq_update_wr_data(bfqd, bfqq);

3671

++

3672

++	bfq_log_bfqq(bfqd, bfqq,

3673

++			"dispatched %u sec req (%llu), budg left %lu",

3674

++			blk_rq_sectors(rq),

3675

++			(long long unsigned)blk_rq_pos(rq),

3676

++			bfq_bfqq_budget_left(bfqq));

3677

++

3678

++	dispatched++;

3679

++

3680

++	if (bfqd->in_service_bic == NULL) {

3681

++		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);

3682

++		bfqd->in_service_bic = RQ_BIC(rq);

3683

++	}

3684

++

3685

++	if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&

3686

++	    dispatched >= bfqd->bfq_max_budget_async_rq) ||

3687

++	    bfq_class_idle(bfqq)))

3688

++		goto expire;

3689

++

3690

++	return dispatched;

3691

++

3692

++expire:

3693

++	bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);

3694

++	return dispatched;

3695

++}

3696

++

3697

++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)

3698

++{

3699

++	int dispatched = 0;

3700

++

3701

++	while (bfqq->next_rq != NULL) {

3702

++		bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);

3703

++		dispatched++;

3704

++	}

3705

++

3706

++	BUG_ON(!list_empty(&bfqq->fifo));

3707

++	return dispatched;

3708

++}

3709

++

3710

++/*

3711

++ * Drain our current requests.

3712

++ * Used for barriers and when switching io schedulers on-the-fly.

3713

++ */

3714

++static int bfq_forced_dispatch(struct bfq_data *bfqd)

3715

++{

3716

++	struct bfq_queue *bfqq, *n;

3717

++	struct bfq_service_tree *st;

3718

++	int dispatched = 0;

3719

++

3720

++	bfqq = bfqd->in_service_queue;

3721

++	if (bfqq != NULL)

3722

++		__bfq_bfqq_expire(bfqd, bfqq);

3723

++

3724

++	/*

3725

++	 * Loop through classes, and be careful to leave the scheduler

3726

++	 * in a consistent state, as feedback mechanisms and vtime

3727

++	 * updates cannot be disabled during the process.

3728

++	 */

3729

++	list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {

3730

++		st = bfq_entity_service_tree(&bfqq->entity);

3731

++

3732

++		dispatched += __bfq_forced_dispatch_bfqq(bfqq);

3733

++		bfqq->max_budget = bfq_max_budget(bfqd);

3734

++

3735

++		bfq_forget_idle(st);

3736

++	}

3737

++

3738

++	BUG_ON(bfqd->busy_queues != 0);

3739

++

3740

++	return dispatched;

3741

++}

3742

++

3743

++static int bfq_dispatch_requests(struct request_queue *q, int force)

3744

++{

3745

++	struct bfq_data *bfqd = q->elevator->elevator_data;

3746

++	struct bfq_queue *bfqq;

3747

++	int max_dispatch;

3748

++

3749

++	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

3750

++	if (bfqd->busy_queues == 0)

3751

++		return 0;

3752

++

3753

++	if (unlikely(force))

3754

++		return bfq_forced_dispatch(bfqd);

3755

++

3756

++	bfqq = bfq_select_queue(bfqd);

3757

++	if (bfqq == NULL)

3758

++		return 0;

3759

++

3760

++	if (bfq_class_idle(bfqq))

3761

++		max_dispatch = 1;

3762

++

3763

++	if (!bfq_bfqq_sync(bfqq))

3764

++		max_dispatch = bfqd->bfq_max_budget_async_rq;

3765

++

3766

++	if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) {

3767

++		if (bfqd->busy_queues > 1)

3768

++			return 0;

3769

++		if (bfqq->dispatched >= 4 * max_dispatch)

3770

++			return 0;

3771

++	}

3772

++

3773

++	if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))

3774

++		return 0;

3775

++

3776

++	bfq_clear_bfqq_wait_request(bfqq);

3777

++	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

3778

++

3779

++	if (!bfq_dispatch_request(bfqd, bfqq))

3780

++		return 0;

3781

++

3782

++	bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",

3783

++			bfq_bfqq_sync(bfqq) ? "sync" : "async");

3784

++

3785

++	return 1;

3786

++}

3787

++

3788

++/*

3789

++ * Task holds one reference to the queue, dropped when task exits.  Each rq

3790

++ * in-flight on this queue also holds a reference, dropped when rq is freed.

3791

++ *

3792

++ * Queue lock must be held here.

3793

++ */

3794

++static void bfq_put_queue(struct bfq_queue *bfqq)

3795

++{

3796

++	struct bfq_data *bfqd = bfqq->bfqd;

3797

++

3798

++	BUG_ON(atomic_read(&bfqq->ref) <= 0);

3799

++

3800

++	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,

3801

++		     atomic_read(&bfqq->ref));

3802

++	if (!atomic_dec_and_test(&bfqq->ref))

3803

++		return;

3804

++

3805

++	BUG_ON(rb_first(&bfqq->sort_list) != NULL);

3806

++	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);

3807

++	BUG_ON(bfqq->entity.tree != NULL);

3808

++	BUG_ON(bfq_bfqq_busy(bfqq));

3809

++	BUG_ON(bfqd->in_service_queue == bfqq);

3810

++

3811

++	if (bfq_bfqq_sync(bfqq))

3812

++		/*

3813

++		 * The fact that this queue is being destroyed does not

3814

++		 * invalidate the fact that this queue may have been

3815

++		 * activated during the current burst. As a consequence,

3816

++		 * although the queue does not exist anymore, and hence

3817

++		 * needs to be removed from the burst list if there,

3818

++		 * the burst size must not be decremented.

3819

++		 */

3820

++		hlist_del_init(&bfqq->burst_list_node);

3821

++

3822

++	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);

3823

++

3824

++	kmem_cache_free(bfq_pool, bfqq);

3825

++}

3826

++

3827

++static void bfq_put_cooperator(struct bfq_queue *bfqq)

3828

++{

3829

++	struct bfq_queue *__bfqq, *next;

3830

++

3831

++	/*

3832

++	 * If this queue was scheduled to merge with another queue, be

3833

++	 * sure to drop the reference taken on that queue (and others in

3834

++	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.

3835

++	 */

3836

++	__bfqq = bfqq->new_bfqq;

3837

++	while (__bfqq) {

3838

++		if (__bfqq == bfqq)

3839

++			break;

3840

++		next = __bfqq->new_bfqq;

3841

++		bfq_put_queue(__bfqq);

3842

++		__bfqq = next;

3843

++	}

3844

++}

3845

++

3846

++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

3847

++{

3848

++	if (bfqq == bfqd->in_service_queue) {

3849

++		__bfq_bfqq_expire(bfqd, bfqq);

3850

++		bfq_schedule_dispatch(bfqd);

3851

++	}

3852

++

3853

++	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,

3854

++		     atomic_read(&bfqq->ref));

3855

++

3856

++	bfq_put_cooperator(bfqq);

3857

++

3858

++	bfq_put_queue(bfqq);

3859

++}

3860

++

3861

++static inline void bfq_init_icq(struct io_cq *icq)

3862

++{

3863

++	struct bfq_io_cq *bic = icq_to_bic(icq);

3864

++

3865

++	bic->ttime.last_end_request = jiffies;

3866

++}

3867

++

3868

++static void bfq_exit_icq(struct io_cq *icq)

3869

++{

3870

++	struct bfq_io_cq *bic = icq_to_bic(icq);

3871

++	struct bfq_data *bfqd = bic_to_bfqd(bic);

3872

++

3873

++	if (bic->bfqq[BLK_RW_ASYNC]) {

3874

++		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);

3875

++		bic->bfqq[BLK_RW_ASYNC] = NULL;

3876

++	}

3877

++

3878

++	if (bic->bfqq[BLK_RW_SYNC]) {

3879

++		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);

3880

++		bic->bfqq[BLK_RW_SYNC] = NULL;

3881

++	}

3882

++}

3883

++

3884

++/*

3885

++ * Update the entity prio values; note that the new values will not

3886

++ * be used until the next (re)activation.

3887

++ */

3888

++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)

3889

++{

3890

++	struct task_struct *tsk = current;

3891

++	int ioprio_class;

3892

++

3893

++	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);

3894

++	switch (ioprio_class) {

3895

++	default:

3896

++		dev_err(bfqq->bfqd->queue->backing_dev_info.dev,

3897

++			"bfq: bad prio class %d\n", ioprio_class);

3898

++	case IOPRIO_CLASS_NONE:

3899

++		/*

3900

++		 * No prio set, inherit CPU scheduling settings.

3901

++		 */

3902

++		bfqq->entity.new_ioprio = task_nice_ioprio(tsk);

3903

++		bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);

3904

++		break;

3905

++	case IOPRIO_CLASS_RT:

3906

++		bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

3907

++		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;

3908

++		break;

3909

++	case IOPRIO_CLASS_BE:

3910

++		bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

3911

++		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;

3912

++		break;

3913

++	case IOPRIO_CLASS_IDLE:

3914

++		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;

3915

++		bfqq->entity.new_ioprio = 7;

3916

++		bfq_clear_bfqq_idle_window(bfqq);

3917

++		break;

3918

++	}

3919

++

3920

++	if (bfqq->entity.new_ioprio < 0 ||

3921

++	    bfqq->entity.new_ioprio >= IOPRIO_BE_NR) {

3922

++		printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n",

3923

++				 bfqq->entity.new_ioprio);

3924

++		BUG();

3925

++	}

3926

++

3927

++	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio);

3928

++	bfqq->entity.ioprio_changed = 1;

3929

++}

3930

++

3931

++static void bfq_check_ioprio_change(struct bfq_io_cq *bic)

3932

++{

3933

++	struct bfq_data *bfqd;

3934

++	struct bfq_queue *bfqq, *new_bfqq;

3935

++	struct bfq_group *bfqg;

3936

++	unsigned long uninitialized_var(flags);

3937

++	int ioprio = bic->icq.ioc->ioprio;

3938

++

3939

++	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),

3940

++				   &flags);

3941

++	/*

3942

++	 * This condition may trigger on a newly created bic, be sure to

3943

++	 * drop the lock before returning.

3944

++	 */

3945

++	if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))

3946

++		goto out;

3947

++

3948

++	bic->ioprio = ioprio;

3949

++

3950

++	bfqq = bic->bfqq[BLK_RW_ASYNC];

3951

++	if (bfqq != NULL) {

3952

++		bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,

3953

++				    sched_data);

3954

++		new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,

3955

++					 GFP_ATOMIC);

3956

++		if (new_bfqq != NULL) {

3957

++			bic->bfqq[BLK_RW_ASYNC] = new_bfqq;

3958

++			bfq_log_bfqq(bfqd, bfqq,

3959

++				     "check_ioprio_change: bfqq %p %d",

3960

++				     bfqq, atomic_read(&bfqq->ref));

3961

++			bfq_put_queue(bfqq);

3962

++		}

3963

++	}

3964

++

3965

++	bfqq = bic->bfqq[BLK_RW_SYNC];

3966

++	if (bfqq != NULL)

3967

++		bfq_set_next_ioprio_data(bfqq, bic);

3968

++

3969

++out:

3970

++	bfq_put_bfqd_unlock(bfqd, &flags);

3971

++}

3972

++

3973

++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

3974

++			  struct bfq_io_cq *bic, pid_t pid, int is_sync)

3975

++{

3976

++	RB_CLEAR_NODE(&bfqq->entity.rb_node);

3977

++	INIT_LIST_HEAD(&bfqq->fifo);

3978

++	INIT_HLIST_NODE(&bfqq->burst_list_node);

3979

++

3980

++	atomic_set(&bfqq->ref, 0);

3981

++	bfqq->bfqd = bfqd;

3982

++

3983

++	if (bic)

3984

++		bfq_set_next_ioprio_data(bfqq, bic);

3985

++

3986

++	if (is_sync) {

3987

++		if (!bfq_class_idle(bfqq))

3988

++			bfq_mark_bfqq_idle_window(bfqq);

3989

++		bfq_mark_bfqq_sync(bfqq);

3990

++	}

3991

++	bfq_mark_bfqq_IO_bound(bfqq);

3992

++

3993

++	/* Tentative initial value to trade off between thr and lat */

3994

++	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;

3995

++	bfqq->pid = pid;

3996

++

3997

++	bfqq->wr_coeff = 1;

3998

++	bfqq->last_wr_start_finish = 0;

3999

++	/*

4000

++	 * Set to the value for which bfqq will not be deemed as

4001

++	 * soft rt when it becomes backlogged.

4002

++	 */

4003

++	bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);

4004

++}

4005

++

4006

++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,

4007

++					      struct bfq_group *bfqg,

4008

++					      int is_sync,

4009

++					      struct bfq_io_cq *bic,

4010

++					      gfp_t gfp_mask)

4011

++{

4012

++	struct bfq_queue *bfqq, *new_bfqq = NULL;

4013

++

4014

++retry:

4015

++	/* bic always exists here */

4016

++	bfqq = bic_to_bfqq(bic, is_sync);

4017

++

4018

++	/*

4019

++	 * Always try a new alloc if we fall back to the OOM bfqq

4020

++	 * originally, since it should just be a temporary situation.

4021

++	 */

4022

++	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {

4023

++		bfqq = NULL;

4024

++		if (new_bfqq != NULL) {

4025

++			bfqq = new_bfqq;

4026

++			new_bfqq = NULL;

4027

++		} else if (gfp_mask & __GFP_WAIT) {

4028

++			spin_unlock_irq(bfqd->queue->queue_lock);

4029

++			new_bfqq = kmem_cache_alloc_node(bfq_pool,

4030

++					gfp_mask | __GFP_ZERO,

4031

++					bfqd->queue->node);

4032

++			spin_lock_irq(bfqd->queue->queue_lock);

4033

++			if (new_bfqq != NULL)

4034

++				goto retry;

4035

++		} else {

4036

++			bfqq = kmem_cache_alloc_node(bfq_pool,

4037

++					gfp_mask | __GFP_ZERO,

4038

++					bfqd->queue->node);

4039

++		}

4040

++

4041

++		if (bfqq != NULL) {

4042

++			bfq_init_bfqq(bfqd, bfqq, bic, current->pid,

4043

++                                      is_sync);

4044

++			bfq_init_entity(&bfqq->entity, bfqg);

4045

++			bfq_log_bfqq(bfqd, bfqq, "allocated");

4046

++		} else {

4047

++			bfqq = &bfqd->oom_bfqq;

4048

++			bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");

4049

++		}

4050

++	}

4051

++

4052

++	if (new_bfqq != NULL)

4053

++		kmem_cache_free(bfq_pool, new_bfqq);

4054

++

4055

++	return bfqq;

4056

++}

4057

++

4058

++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,

4059

++					       struct bfq_group *bfqg,

4060

++					       int ioprio_class, int ioprio)

4061

++{

4062

++	switch (ioprio_class) {

4063

++	case IOPRIO_CLASS_RT:

4064

++		return &bfqg->async_bfqq[0][ioprio];

4065

++	case IOPRIO_CLASS_NONE:

4066

++		ioprio = IOPRIO_NORM;

4067

++		/* fall through */

4068

++	case IOPRIO_CLASS_BE:

4069

++		return &bfqg->async_bfqq[1][ioprio];

4070

++	case IOPRIO_CLASS_IDLE:

4071

++		return &bfqg->async_idle_bfqq;

4072

++	default:

4073

++		BUG();

4074

++	}

4075

++}

4076

++

4077

++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,

4078

++				       struct bfq_group *bfqg, int is_sync,

4079

++				       struct bfq_io_cq *bic, gfp_t gfp_mask)

4080

++{

4081

++	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);

4082

++	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);

4083

++	struct bfq_queue **async_bfqq = NULL;

4084

++	struct bfq_queue *bfqq = NULL;

4085

++

4086

++	if (!is_sync) {

4087

++		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,

4088

++						  ioprio);

4089

++		bfqq = *async_bfqq;

4090

++	}

4091

++

4092

++	if (bfqq == NULL)

4093

++		bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);

4094

++

4095

++	/*

4096

++	 * Pin the queue now that it's allocated, scheduler exit will

4097

++	 * prune it.

4098

++	 */

4099

++	if (!is_sync && *async_bfqq == NULL) {

4100

++		atomic_inc(&bfqq->ref);

4101

++		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",

4102

++			     bfqq, atomic_read(&bfqq->ref));

4103

++		*async_bfqq = bfqq;

4104

++	}

4105

++

4106

++	atomic_inc(&bfqq->ref);

4107

++	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,

4108

++		     atomic_read(&bfqq->ref));

4109

++	return bfqq;

4110

++}

4111

++

4112

++static void bfq_update_io_thinktime(struct bfq_data *bfqd,

4113

++				    struct bfq_io_cq *bic)

4114

++{

4115

++	unsigned long elapsed = jiffies - bic->ttime.last_end_request;

4116

++	unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);

4117

++

4118

++	bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;

4119

++	bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;

4120

++	bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /

4121

++				bic->ttime.ttime_samples;

4122

++}

4123

++

4124

++static void bfq_update_io_seektime(struct bfq_data *bfqd,

4125

++				   struct bfq_queue *bfqq,

4126

++				   struct request *rq)

4127

++{

4128

++	sector_t sdist;

4129

++	u64 total;

4130

++

4131

++	if (bfqq->last_request_pos < blk_rq_pos(rq))

4132

++		sdist = blk_rq_pos(rq) - bfqq->last_request_pos;

4133

++	else

4134

++		sdist = bfqq->last_request_pos - blk_rq_pos(rq);

4135

++

4136

++	/*

4137

++	 * Don't allow the seek distance to get too large from the

4138

++	 * odd fragment, pagein, etc.

4139

++	 */

4140

++	if (bfqq->seek_samples == 0) /* first request, not really a seek */

4141

++		sdist = 0;

4142

++	else if (bfqq->seek_samples <= 60) /* second & third seek */

4143

++		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);

4144

++	else

4145

++		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);

4146

++

4147

++	bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;

4148

++	bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;

4149

++	total = bfqq->seek_total + (bfqq->seek_samples/2);

4150

++	do_div(total, bfqq->seek_samples);

4151

++	bfqq->seek_mean = (sector_t)total;

4152

++

4153

++	bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,

4154

++			(u64)bfqq->seek_mean);

4155

++}

4156

++

4157

++/*

4158

++ * Disable idle window if the process thinks too long or seeks so much that

4159

++ * it doesn't matter.

4160

++ */

4161

++static void bfq_update_idle_window(struct bfq_data *bfqd,

4162

++				   struct bfq_queue *bfqq,

4163

++				   struct bfq_io_cq *bic)

4164

++{

4165

++	int enable_idle;

4166

++

4167

++	/* Don't idle for async or idle io prio class. */

4168

++	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))

4169

++		return;

4170

++

4171

++	enable_idle = bfq_bfqq_idle_window(bfqq);

4172

++

4173

++	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||

4174

++	    bfqd->bfq_slice_idle == 0 ||

4175

++		(bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&

4176

++			bfqq->wr_coeff == 1))

4177

++		enable_idle = 0;

4178

++	else if (bfq_sample_valid(bic->ttime.ttime_samples)) {

4179

++		if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&

4180

++			bfqq->wr_coeff == 1)

4181

++			enable_idle = 0;

4182

++		else

4183

++			enable_idle = 1;

4184

++	}

4185

++	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",

4186

++		enable_idle);

4187

++

4188

++	if (enable_idle)

4189

++		bfq_mark_bfqq_idle_window(bfqq);

4190

++	else

4191

++		bfq_clear_bfqq_idle_window(bfqq);

4192

++}

4193

++

4194

++/*

4195

++ * Called when a new fs request (rq) is added to bfqq.  Check if there's

4196

++ * something we should do about it.

4197

++ */

4198

++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

4199

++			    struct request *rq)

4200

++{

4201

++	struct bfq_io_cq *bic = RQ_BIC(rq);

4202

++

4203

++	if (rq->cmd_flags & REQ_META)

4204

++		bfqq->meta_pending++;

4205

++

4206

++	bfq_update_io_thinktime(bfqd, bic);

4207

++	bfq_update_io_seektime(bfqd, bfqq, rq);

4208

++	if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {

4209

++		bfq_clear_bfqq_constantly_seeky(bfqq);

4210

++		if (!blk_queue_nonrot(bfqd->queue)) {

4211

++			BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);

4212

++			bfqd->const_seeky_busy_in_flight_queues--;

4213

++		}

4214

++	}

4215

++	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||

4216

++	    !BFQQ_SEEKY(bfqq))

4217

++		bfq_update_idle_window(bfqd, bfqq, bic);

4218

++

4219

++	bfq_log_bfqq(bfqd, bfqq,

4220

++		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",

4221

++		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),

4222

++		     (long long unsigned)bfqq->seek_mean);

4223

++

4224

++	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

4225

++

4226

++	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {

4227

++		int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&

4228

++				blk_rq_sectors(rq) < 32;

4229

++		int budget_timeout = bfq_bfqq_budget_timeout(bfqq);

4230

++

4231

++		/*

4232

++		 * There is just this request queued: if the request

4233

++		 * is small and the queue is not to be expired, then

4234

++		 * just exit.

4235

++		 *

4236

++		 * In this way, if the disk is being idled to wait for

4237

++		 * a new request from the in-service queue, we avoid

4238

++		 * unplugging the device and committing the disk to serve

4239

++		 * just a small request. On the contrary, we wait for

4240

++		 * the block layer to decide when to unplug the device:

4241

++		 * hopefully, new requests will be merged to this one

4242

++		 * quickly, then the device will be unplugged and

4243

++		 * larger requests will be dispatched.

4244

++		 */

4245

++		if (small_req && !budget_timeout)

4246

++			return;

4247

++

4248

++		/*

4249

++		 * A large enough request arrived, or the queue is to

4250

++		 * be expired: in both cases disk idling is to be

4251

++		 * stopped, so clear wait_request flag and reset

4252

++		 * timer.

4253

++		 */

4254

++		bfq_clear_bfqq_wait_request(bfqq);

4255

++		del_timer(&bfqd->idle_slice_timer);

4256

++

4257

++		/*

4258

++		 * The queue is not empty, because a new request just

4259

++		 * arrived. Hence we can safely expire the queue, in

4260

++		 * case of budget timeout, without risking that the

4261

++		 * timestamps of the queue are not updated correctly.

4262

++		 * See [1] for more details.

4263

++		 */

4264

++		if (budget_timeout)

4265

++			bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);

4266

++

4267

++		/*

4268

++		 * Let the request rip immediately, or let a new queue be

4269

++		 * selected if bfqq has just been expired.

4270

++		 */

4271

++		__blk_run_queue(bfqd->queue);

4272

++	}

4273

++}

4274

++

4275

++static void bfq_insert_request(struct request_queue *q, struct request *rq)

4276

++{

4277

++	struct bfq_data *bfqd = q->elevator->elevator_data;

4278

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

4279

++

4280

++	assert_spin_locked(bfqd->queue->queue_lock);

4281

++

4282

++	bfq_add_request(rq);

4283

++

4284

++	rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];

4285

++	list_add_tail(&rq->queuelist, &bfqq->fifo);

4286

++

4287

++	bfq_rq_enqueued(bfqd, bfqq, rq);

4288

++}

4289

++

4290

++static void bfq_update_hw_tag(struct bfq_data *bfqd)

4291

++{

4292

++	bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,

4293

++				     bfqd->rq_in_driver);

4294

++

4295

++	if (bfqd->hw_tag == 1)

4296

++		return;

4297

++

4298

++	/*

4299

++	 * This sample is valid if the number of outstanding requests

4300

++	 * is large enough to allow a queueing behavior.  Note that the

4301

++	 * sum is not exact, as it's not taking into account deactivated

4302

++	 * requests.

4303

++	 */

4304

++	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)

4305

++		return;

4306

++

4307

++	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)

4308

++		return;

4309

++

4310

++	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;

4311

++	bfqd->max_rq_in_driver = 0;

4312

++	bfqd->hw_tag_samples = 0;

4313

++}

4314

++

4315

++static void bfq_completed_request(struct request_queue *q, struct request *rq)

4316

++{

4317

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

4318

++	struct bfq_data *bfqd = bfqq->bfqd;

4319

++	bool sync = bfq_bfqq_sync(bfqq);

4320

++

4321

++	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",

4322

++		     blk_rq_sectors(rq), sync);

4323

++

4324

++	bfq_update_hw_tag(bfqd);

4325

++

4326

++	BUG_ON(!bfqd->rq_in_driver);

4327

++	BUG_ON(!bfqq->dispatched);

4328

++	bfqd->rq_in_driver--;

4329

++	bfqq->dispatched--;

4330

++

4331

++	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {

4332

++		bfq_weights_tree_remove(bfqd, &bfqq->entity,

4333

++					&bfqd->queue_weights_tree);

4334

++		if (!blk_queue_nonrot(bfqd->queue)) {

4335

++			BUG_ON(!bfqd->busy_in_flight_queues);

4336

++			bfqd->busy_in_flight_queues--;

4337

++			if (bfq_bfqq_constantly_seeky(bfqq)) {

4338

++				BUG_ON(!bfqd->

4339

++					const_seeky_busy_in_flight_queues);

4340

++				bfqd->const_seeky_busy_in_flight_queues--;

4341

++			}

4342

++		}

4343

++	}

4344

++

4345

++	if (sync) {

4346

++		bfqd->sync_flight--;

4347

++		RQ_BIC(rq)->ttime.last_end_request = jiffies;

4348

++	}

4349

++

4350

++	/*

4351

++	 * If we are waiting to discover whether the request pattern of the

4352

++	 * task associated with the queue is actually isochronous, and

4353

++	 * both requisites for this condition to hold are satisfied, then

4354

++	 * compute soft_rt_next_start (see the comments to the function

4355

++	 * bfq_bfqq_softrt_next_start()).

4356

++	 */

4357

++	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&

4358

++	    RB_EMPTY_ROOT(&bfqq->sort_list))

4359

++		bfqq->soft_rt_next_start =

4360

++			bfq_bfqq_softrt_next_start(bfqd, bfqq);

4361

++

4362

++	/*

4363

++	 * If this is the in-service queue, check if it needs to be expired,

4364

++	 * or if we want to idle in case it has no pending requests.

4365

++	 */

4366

++	if (bfqd->in_service_queue == bfqq) {

4367

++		if (bfq_bfqq_budget_new(bfqq))

4368

++			bfq_set_budget_timeout(bfqd);

4369

++

4370

++		if (bfq_bfqq_must_idle(bfqq)) {

4371

++			bfq_arm_slice_timer(bfqd);

4372

++			goto out;

4373

++		} else if (bfq_may_expire_for_budg_timeout(bfqq))

4374

++			bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);

4375

++		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&

4376

++			 (bfqq->dispatched == 0 ||

4377

++			  !bfq_bfqq_must_not_expire(bfqq)))

4378

++			bfq_bfqq_expire(bfqd, bfqq, 0,

4379

++					BFQ_BFQQ_NO_MORE_REQUESTS);

4380

++	}

4381

++

4382

++	if (!bfqd->rq_in_driver)

4383

++		bfq_schedule_dispatch(bfqd);

4384

++

4385

++out:

4386

++	return;

4387

++}

4388

++

4389

++static inline int __bfq_may_queue(struct bfq_queue *bfqq)

4390

++{

4391

++	if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {

4392

++		bfq_clear_bfqq_must_alloc(bfqq);

4393

++		return ELV_MQUEUE_MUST;

4394

++	}

4395

++

4396

++	return ELV_MQUEUE_MAY;

4397

++}

4398

++

4399

++static int bfq_may_queue(struct request_queue *q, int rw)

4400

++{

4401

++	struct bfq_data *bfqd = q->elevator->elevator_data;

4402

++	struct task_struct *tsk = current;

4403

++	struct bfq_io_cq *bic;

4404

++	struct bfq_queue *bfqq;

4405

++

4406

++	/*

4407

++	 * Don't force setup of a queue from here, as a call to may_queue

4408

++	 * does not necessarily imply that a request actually will be

4409

++	 * queued. So just lookup a possibly existing queue, or return

4410

++	 * 'may queue' if that fails.

4411

++	 */

4412

++	bic = bfq_bic_lookup(bfqd, tsk->io_context);

4413

++	if (bic == NULL)

4414

++		return ELV_MQUEUE_MAY;

4415

++

4416

++	bfqq = bic_to_bfqq(bic, rw_is_sync(rw));

4417

++	if (bfqq != NULL)

4418

++		return __bfq_may_queue(bfqq);

4419

++

4420

++	return ELV_MQUEUE_MAY;

4421

++}

4422

++

4423

++/*

4424

++ * Queue lock held here.

4425

++ */

4426

++static void bfq_put_request(struct request *rq)

4427

++{

4428

++	struct bfq_queue *bfqq = RQ_BFQQ(rq);

4429

++

4430

++	if (bfqq != NULL) {

4431

++		const int rw = rq_data_dir(rq);

4432

++

4433

++		BUG_ON(!bfqq->allocated[rw]);

4434

++		bfqq->allocated[rw]--;

4435

++

4436

++		rq->elv.priv[0] = NULL;

4437

++		rq->elv.priv[1] = NULL;

4438

++

4439

++		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",

4440

++			     bfqq, atomic_read(&bfqq->ref));

4441

++		bfq_put_queue(bfqq);

4442

++	}

4443

++}

4444

++

4445

++static struct bfq_queue *

4446

++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

4447

++		struct bfq_queue *bfqq)

4448

++{

4449

++	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",

4450

++		(long unsigned)bfqq->new_bfqq->pid);

4451

++	bic_set_bfqq(bic, bfqq->new_bfqq, 1);

4452

++	bfq_mark_bfqq_coop(bfqq->new_bfqq);

4453

++	bfq_put_queue(bfqq);

4454

++	return bic_to_bfqq(bic, 1);

4455

++}

4456

++

4457

++/*

4458

++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this

4459

++ * was the last process referring to said bfqq.

4460

++ */

4461

++static struct bfq_queue *

4462

++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)

4463

++{

4464

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

4465

++	if (bfqq_process_refs(bfqq) == 1) {

4466

++		bfqq->pid = current->pid;

4467

++		bfq_clear_bfqq_coop(bfqq);

4468

++		bfq_clear_bfqq_split_coop(bfqq);

4469

++		return bfqq;

4470

++	}

4471

++

4472

++	bic_set_bfqq(bic, NULL, 1);

4473

++

4474

++	bfq_put_cooperator(bfqq);

4475

++

4476

++	bfq_put_queue(bfqq);

4477

++	return NULL;

4478

++}

4479

++

4480

++/*

4481

++ * Allocate bfq data structures associated with this request.

4482

++ */

4483

++static int bfq_set_request(struct request_queue *q, struct request *rq,

4484

++			   struct bio *bio, gfp_t gfp_mask)

4485

++{

4486

++	struct bfq_data *bfqd = q->elevator->elevator_data;

4487

++	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);

4488

++	const int rw = rq_data_dir(rq);

4489

++	const int is_sync = rq_is_sync(rq);

4490

++	struct bfq_queue *bfqq;

4491

++	struct bfq_group *bfqg;

4492

++	unsigned long flags;

4493

++

4494

++	might_sleep_if(gfp_mask & __GFP_WAIT);

4495

++

4496

++	bfq_check_ioprio_change(bic);

4497

++

4498

++	spin_lock_irqsave(q->queue_lock, flags);

4499

++

4500

++	if (bic == NULL)

4501

++		goto queue_fail;

4502

++

4503

++	bfqg = bfq_bic_update_cgroup(bic);

4504

++

4505

++new_queue:

4506

++	bfqq = bic_to_bfqq(bic, is_sync);

4507

++	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {

4508

++		bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);

4509

++		bic_set_bfqq(bic, bfqq, is_sync);

4510

++	} else {

4511

++		/*

4512

++		 * If the queue was seeky for too long, break it apart.

4513

++		 */

4514

++		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {

4515

++			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");

4516

++			bfqq = bfq_split_bfqq(bic, bfqq);

4517

++			if (!bfqq)

4518

++				goto new_queue;

4519

++		}

4520

++

4521

++		/*

4522

++		 * Check to see if this queue is scheduled to merge with

4523

++		 * another closely cooperating queue. The merging of queues

4524

++		 * happens here as it must be done in process context.

4525

++		 * The reference on new_bfqq was taken in merge_bfqqs.

4526

++		 */

4527

++		if (bfqq->new_bfqq != NULL)

4528

++			bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);

4529

++	}

4530

++

4531

++	bfqq->allocated[rw]++;

4532

++	atomic_inc(&bfqq->ref);

4533

++	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,

4534

++		     atomic_read(&bfqq->ref));

4535

++

4536

++	rq->elv.priv[0] = bic;

4537

++	rq->elv.priv[1] = bfqq;

4538

++

4539

++	spin_unlock_irqrestore(q->queue_lock, flags);

4540

++

4541

++	return 0;

4542

++

4543

++queue_fail:

4544

++	bfq_schedule_dispatch(bfqd);

4545

++	spin_unlock_irqrestore(q->queue_lock, flags);

4546

++

4547

++	return 1;

4548

++}

4549

++

4550

++static void bfq_kick_queue(struct work_struct *work)

4551

++{

4552

++	struct bfq_data *bfqd =

4553

++		container_of(work, struct bfq_data, unplug_work);

4554

++	struct request_queue *q = bfqd->queue;

4555

++

4556

++	spin_lock_irq(q->queue_lock);

4557

++	__blk_run_queue(q);

4558

++	spin_unlock_irq(q->queue_lock);

4559

++}

4560

++

4561

++/*

4562

++ * Handler of the expiration of the timer running if the in-service queue

4563

++ * is idling inside its time slice.

4564

++ */

4565

++static void bfq_idle_slice_timer(unsigned long data)

4566

++{

4567

++	struct bfq_data *bfqd = (struct bfq_data *)data;

4568

++	struct bfq_queue *bfqq;

4569

++	unsigned long flags;

4570

++	enum bfqq_expiration reason;

4571

++

4572

++	spin_lock_irqsave(bfqd->queue->queue_lock, flags);

4573

++

4574

++	bfqq = bfqd->in_service_queue;

4575

++	/*

4576

++	 * Theoretical race here: the in-service queue can be NULL or

4577

++	 * different from the queue that was idling if the timer handler

4578

++	 * spins on the queue_lock and a new request arrives for the

4579

++	 * current queue and there is a full dispatch cycle that changes

4580

++	 * the in-service queue.  This can hardly happen, but in the worst

4581

++	 * case we just expire a queue too early.

4582

++	 */

4583

++	if (bfqq != NULL) {

4584

++		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");

4585

++		if (bfq_bfqq_budget_timeout(bfqq))

4586

++			/*

4587

++			 * Also here the queue can be safely expired

4588

++			 * for budget timeout without wasting

4589

++			 * guarantees

4590

++			 */

4591

++			reason = BFQ_BFQQ_BUDGET_TIMEOUT;

4592

++		else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)

4593

++			/*

4594

++			 * The queue may not be empty upon timer expiration,

4595

++			 * because we may not disable the timer when the

4596

++			 * first request of the in-service queue arrives

4597

++			 * during disk idling.

4598

++			 */

4599

++			reason = BFQ_BFQQ_TOO_IDLE;

4600

++		else

4601

++			goto schedule_dispatch;

4602

++

4603

++		bfq_bfqq_expire(bfqd, bfqq, 1, reason);

4604

++	}

4605

++

4606

++schedule_dispatch:

4607

++	bfq_schedule_dispatch(bfqd);

4608

++

4609

++	spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);

4610

++}

4611

++

4612

++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)

4613

++{

4614

++	del_timer_sync(&bfqd->idle_slice_timer);

4615

++	cancel_work_sync(&bfqd->unplug_work);

4616

++}

4617

++

4618

++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,

4619

++					struct bfq_queue **bfqq_ptr)

4620

++{

4621

++	struct bfq_group *root_group = bfqd->root_group;

4622

++	struct bfq_queue *bfqq = *bfqq_ptr;

4623

++

4624

++	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);

4625

++	if (bfqq != NULL) {

4626

++		bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);

4627

++		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",

4628

++			     bfqq, atomic_read(&bfqq->ref));

4629

++		bfq_put_queue(bfqq);

4630

++		*bfqq_ptr = NULL;

4631

++	}

4632

++}

4633

++

4634

++/*

4635

++ * Release all the bfqg references to its async queues.  If we are

4636

++ * deallocating the group these queues may still contain requests, so

4637

++ * we reparent them to the root cgroup (i.e., the only one that will

4638

++ * exist for sure until all the requests on a device are gone).

4639

++ */

4640

++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)

4641

++{

4642

++	int i, j;

4643

++

4644

++	for (i = 0; i < 2; i++)

4645

++		for (j = 0; j < IOPRIO_BE_NR; j++)

4646

++			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

4647

++

4648

++	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);

4649

++}

4650

++

4651

++static void bfq_exit_queue(struct elevator_queue *e)

4652

++{

4653

++	struct bfq_data *bfqd = e->elevator_data;

4654

++	struct request_queue *q = bfqd->queue;

4655

++	struct bfq_queue *bfqq, *n;

4656

++

4657

++	bfq_shutdown_timer_wq(bfqd);

4658

++

4659

++	spin_lock_irq(q->queue_lock);

4660

++

4661

++	BUG_ON(bfqd->in_service_queue != NULL);

4662

++	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)

4663

++		bfq_deactivate_bfqq(bfqd, bfqq, 0);

4664

++

4665

++	bfq_disconnect_groups(bfqd);

4666

++	spin_unlock_irq(q->queue_lock);

4667

++

4668

++	bfq_shutdown_timer_wq(bfqd);

4669

++

4670

++	synchronize_rcu();

4671

++

4672

++	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

4673

++

4674

++	bfq_free_root_group(bfqd);

4675

++	kfree(bfqd);

4676

++}

4677

++

4678

++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

4679

++{

4680

++	struct bfq_group *bfqg;

4681

++	struct bfq_data *bfqd;

4682

++	struct elevator_queue *eq;

4683

++

4684

++	eq = elevator_alloc(q, e);

4685

++	if (eq == NULL)

4686

++		return -ENOMEM;

4687

++

4688

++	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);

4689

++	if (bfqd == NULL) {

4690

++		kobject_put(&eq->kobj);

4691

++		return -ENOMEM;

4692

++	}

4693

++	eq->elevator_data = bfqd;

4694

++

4695

++	/*

4696

++	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.

4697

++	 * Grab a permanent reference to it, so that the normal code flow

4698

++	 * will not attempt to free it.

4699

++	 */

4700

++	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);

4701

++	atomic_inc(&bfqd->oom_bfqq.ref);

4702

++	bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;

4703

++	bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE;

4704

++	bfqd->oom_bfqq.entity.new_weight =

4705

++		bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio);

4706

++	/*

4707

++	 * Trigger weight initialization, according to ioprio, at the

4708

++	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio

4709

++	 * class won't be changed any more.

4710

++	 */

4711

++	bfqd->oom_bfqq.entity.ioprio_changed = 1;

4712

++

4713

++	bfqd->queue = q;

4714

++

4715

++	spin_lock_irq(q->queue_lock);

4716

++	q->elevator = eq;

4717

++	spin_unlock_irq(q->queue_lock);

4718

++

4719

++	bfqg = bfq_alloc_root_group(bfqd, q->node);

4720

++	if (bfqg == NULL) {

4721

++		kfree(bfqd);

4722

++		kobject_put(&eq->kobj);

4723

++		return -ENOMEM;

4724

++	}

4725

++

4726

++	bfqd->root_group = bfqg;

4727

++	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

4728

++#ifdef CONFIG_CGROUP_BFQIO

4729

++	bfqd->active_numerous_groups = 0;

4730

++#endif

4731

++

4732

++	init_timer(&bfqd->idle_slice_timer);

4733

++	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

4734

++	bfqd->idle_slice_timer.data = (unsigned long)bfqd;

4735

++

4736

++	bfqd->rq_pos_tree = RB_ROOT;

4737

++	bfqd->queue_weights_tree = RB_ROOT;

4738

++	bfqd->group_weights_tree = RB_ROOT;

4739

++

4740

++	INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);

4741

++

4742

++	INIT_LIST_HEAD(&bfqd->active_list);

4743

++	INIT_LIST_HEAD(&bfqd->idle_list);

4744

++	INIT_HLIST_HEAD(&bfqd->burst_list);

4745

++

4746

++	bfqd->hw_tag = -1;

4747

++

4748

++	bfqd->bfq_max_budget = bfq_default_max_budget;

4749

++

4750

++	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];

4751

++	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];

4752

++	bfqd->bfq_back_max = bfq_back_max;

4753

++	bfqd->bfq_back_penalty = bfq_back_penalty;

4754

++	bfqd->bfq_slice_idle = bfq_slice_idle;

4755

++	bfqd->bfq_class_idle_last_service = 0;

4756

++	bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;

4757

++	bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;

4758

++	bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;

4759

++

4760

++	bfqd->bfq_coop_thresh = 2;

4761

++	bfqd->bfq_failed_cooperations = 7000;

4762

++	bfqd->bfq_requests_within_timer = 120;

4763

++

4764

++	bfqd->bfq_large_burst_thresh = 11;

4765

++	bfqd->bfq_burst_interval = msecs_to_jiffies(500);

4766

++

4767

++	bfqd->low_latency = true;

4768

++

4769

++	bfqd->bfq_wr_coeff = 20;

4770

++	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);

4771

++	bfqd->bfq_wr_max_time = 0;

4772

++	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);

4773

++	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);

4774

++	bfqd->bfq_wr_max_softrt_rate = 7000; /*

4775

++					      * Approximate rate required

4776

++					      * to play back or record a

4777

++					      * high-definition compressed

4778

++					      * video.

4779

++					      */

4780

++	bfqd->wr_busy_queues = 0;

4781

++	bfqd->busy_in_flight_queues = 0;

4782

++	bfqd->const_seeky_busy_in_flight_queues = 0;

4783

++

4784

++	/*

4785

++	 * Begin by assuming, optimistically, that the device peak rate is

4786

++	 * equal to the highest reference rate.

4787

++	 */

4788

++	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *

4789

++			T_fast[blk_queue_nonrot(bfqd->queue)];

4790

++	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];

4791

++	bfqd->device_speed = BFQ_BFQD_FAST;

4792

++

4793

++	return 0;

4794

++}

4795

++

4796

++static void bfq_slab_kill(void)

4797

++{

4798

++	if (bfq_pool != NULL)

4799

++		kmem_cache_destroy(bfq_pool);

4800

++}

4801

++

4802

++static int __init bfq_slab_setup(void)

4803

++{

4804

++	bfq_pool = KMEM_CACHE(bfq_queue, 0);

4805

++	if (bfq_pool == NULL)

4806

++		return -ENOMEM;

4807

++	return 0;

4808

++}

4809

++

4810

++static ssize_t bfq_var_show(unsigned int var, char *page)

4811

++{

4812

++	return sprintf(page, "%d\n", var);

4813

++}

4814

++

4815

++static ssize_t bfq_var_store(unsigned long *var, const char *page,

4816

++			     size_t count)

4817

++{

4818

++	unsigned long new_val;

4819

++	int ret = kstrtoul(page, 10, &new_val);

4820

++

4821

++	if (ret == 0)

4822

++		*var = new_val;

4823

++

4824

++	return count;

4825

++}

4826

++

4827

++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)

4828

++{

4829

++	struct bfq_data *bfqd = e->elevator_data;

4830

++	return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?

4831

++		       jiffies_to_msecs(bfqd->bfq_wr_max_time) :

4832

++		       jiffies_to_msecs(bfq_wr_duration(bfqd)));

4833

++}

4834

++

4835

++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)

4836

++{

4837

++	struct bfq_queue *bfqq;

4838

++	struct bfq_data *bfqd = e->elevator_data;

4839

++	ssize_t num_char = 0;

4840

++

4841

++	num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",

4842

++			    bfqd->queued);

4843

++

4844

++	spin_lock_irq(bfqd->queue->queue_lock);

4845

++

4846

++	num_char += sprintf(page + num_char, "Active:\n");

4847

++	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {

4848

++	  num_char += sprintf(page + num_char,

4849

++			      "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n",

4850

++			      bfqq->pid,

4851

++			      bfqq->entity.weight,

4852

++			      bfqq->queued[0],

4853

++			      bfqq->queued[1],

4854

++			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),

4855

++			jiffies_to_msecs(bfqq->wr_cur_max_time));

4856

++	}

4857

++

4858

++	num_char += sprintf(page + num_char, "Idle:\n");

4859

++	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {

4860

++			num_char += sprintf(page + num_char,

4861

++				"pid%d: weight %hu, dur %d/%u\n",

4862

++				bfqq->pid,

4863

++				bfqq->entity.weight,

4864

++				jiffies_to_msecs(jiffies -

4865

++					bfqq->last_wr_start_finish),

4866

++				jiffies_to_msecs(bfqq->wr_cur_max_time));

4867

++	}

4868

++

4869

++	spin_unlock_irq(bfqd->queue->queue_lock);

4870

++

4871

++	return num_char;

4872

++}

4873

++

4874

++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\

4875

++static ssize_t __FUNC(struct elevator_queue *e, char *page)		\

4876

++{									\

4877

++	struct bfq_data *bfqd = e->elevator_data;			\

4878

++	unsigned int __data = __VAR;					\

4879

++	if (__CONV)							\

4880

++		__data = jiffies_to_msecs(__data);			\

4881

++	return bfq_var_show(__data, (page));				\

4882

++}

4883

++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);

4884

++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);

4885

++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);

4886

++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);

4887

++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);

4888

++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);

4889

++SHOW_FUNCTION(bfq_max_budget_async_rq_show,

4890

++	      bfqd->bfq_max_budget_async_rq, 0);

4891

++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);

4892

++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);

4893

++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);

4894

++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);

4895

++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);

4896

++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);

4897

++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,

4898

++	1);

4899

++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);

4900

++#undef SHOW_FUNCTION

4901

++

4902

++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\

4903

++static ssize_t								\

4904

++__FUNC(struct elevator_queue *e, const char *page, size_t count)	\

4905

++{									\

4906

++	struct bfq_data *bfqd = e->elevator_data;			\

4907

++	unsigned long uninitialized_var(__data);			\

4908

++	int ret = bfq_var_store(&__data, (page), count);		\

4909

++	if (__data < (MIN))						\

4910

++		__data = (MIN);						\

4911

++	else if (__data > (MAX))					\

4912

++		__data = (MAX);						\

4913

++	if (__CONV)							\

4914

++		*(__PTR) = msecs_to_jiffies(__data);			\

4915

++	else								\

4916

++		*(__PTR) = __data;					\

4917

++	return ret;							\

4918

++}

4919

++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,

4920

++		INT_MAX, 1);

4921

++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,

4922

++		INT_MAX, 1);

4923

++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);

4924

++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,

4925

++		INT_MAX, 0);

4926

++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);

4927

++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,

4928

++		1, INT_MAX, 0);

4929

++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,

4930

++		INT_MAX, 1);

4931

++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);

4932

++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);

4933

++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,

4934

++		1);

4935

++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,

4936

++		INT_MAX, 1);

4937

++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,

4938

++		&bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);

4939

++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,

4940

++		INT_MAX, 0);

4941

++#undef STORE_FUNCTION

4942

++

4943

++/* do nothing for the moment */

4944

++static ssize_t bfq_weights_store(struct elevator_queue *e,

4945

++				    const char *page, size_t count)

4946

++{

4947

++	return count;

4948

++}

4949

++

4950

++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)

4951

++{

4952

++	u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

4953

++

4954

++	if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)

4955

++		return bfq_calc_max_budget(bfqd->peak_rate, timeout);

4956

++	else

4957

++		return bfq_default_max_budget;

4958

++}

4959

++

4960

++static ssize_t bfq_max_budget_store(struct elevator_queue *e,

4961

++				    const char *page, size_t count)

4962

++{

4963

++	struct bfq_data *bfqd = e->elevator_data;

4964

++	unsigned long uninitialized_var(__data);

4965

++	int ret = bfq_var_store(&__data, (page), count);

4966

++

4967

++	if (__data == 0)

4968

++		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);

4969

++	else {

4970

++		if (__data > INT_MAX)

4971

++			__data = INT_MAX;

4972

++		bfqd->bfq_max_budget = __data;

4973

++	}

4974

++

4975

++	bfqd->bfq_user_max_budget = __data;

4976

++

4977

++	return ret;

4978

++}

4979

++

4980

++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,

4981

++				      const char *page, size_t count)

4982

++{

4983

++	struct bfq_data *bfqd = e->elevator_data;

4984

++	unsigned long uninitialized_var(__data);

4985

++	int ret = bfq_var_store(&__data, (page), count);

4986

++

4987

++	if (__data < 1)

4988

++		__data = 1;

4989

++	else if (__data > INT_MAX)

4990

++		__data = INT_MAX;

4991

++

4992

++	bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);

4993

++	if (bfqd->bfq_user_max_budget == 0)

4994

++		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);

4995

++

4996

++	return ret;

4997

++}

4998

++

4999

++static ssize_t bfq_low_latency_store(struct elevator_queue *e,

5000

++				     const char *page, size_t count)

5001

++{

5002

++	struct bfq_data *bfqd = e->elevator_data;

5003

++	unsigned long uninitialized_var(__data);

5004

++	int ret = bfq_var_store(&__data, (page), count);

5005

++

5006

++	if (__data > 1)

5007

++		__data = 1;

5008

++	if (__data == 0 && bfqd->low_latency != 0)

5009

++		bfq_end_wr(bfqd);

5010

++	bfqd->low_latency = __data;

5011

++

5012

++	return ret;

5013

++}

5014

++

5015

++#define BFQ_ATTR(name) \

5016

++	__ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)

5017

++

5018

++static struct elv_fs_entry bfq_attrs[] = {

5019

++	BFQ_ATTR(fifo_expire_sync),

5020

++	BFQ_ATTR(fifo_expire_async),

5021

++	BFQ_ATTR(back_seek_max),

5022

++	BFQ_ATTR(back_seek_penalty),

5023

++	BFQ_ATTR(slice_idle),

5024

++	BFQ_ATTR(max_budget),

5025

++	BFQ_ATTR(max_budget_async_rq),

5026

++	BFQ_ATTR(timeout_sync),

5027

++	BFQ_ATTR(timeout_async),

5028

++	BFQ_ATTR(low_latency),

5029

++	BFQ_ATTR(wr_coeff),

5030

++	BFQ_ATTR(wr_max_time),

5031

++	BFQ_ATTR(wr_rt_max_time),

5032

++	BFQ_ATTR(wr_min_idle_time),

5033

++	BFQ_ATTR(wr_min_inter_arr_async),

5034

++	BFQ_ATTR(wr_max_softrt_rate),

5035

++	BFQ_ATTR(weights),

5036

++	__ATTR_NULL

5037

++};

5038

++

5039

++static struct elevator_type iosched_bfq = {

5040

++	.ops = {

5041

++		.elevator_merge_fn =		bfq_merge,

5042

++		.elevator_merged_fn =		bfq_merged_request,

5043

++		.elevator_merge_req_fn =	bfq_merged_requests,

5044

++		.elevator_allow_merge_fn =	bfq_allow_merge,

5045

++		.elevator_dispatch_fn =		bfq_dispatch_requests,

5046

++		.elevator_add_req_fn =		bfq_insert_request,

5047

++		.elevator_activate_req_fn =	bfq_activate_request,

5048

++		.elevator_deactivate_req_fn =	bfq_deactivate_request,

5049

++		.elevator_completed_req_fn =	bfq_completed_request,

5050

++		.elevator_former_req_fn =	elv_rb_former_request,

5051

++		.elevator_latter_req_fn =	elv_rb_latter_request,

5052

++		.elevator_init_icq_fn =		bfq_init_icq,

5053

++		.elevator_exit_icq_fn =		bfq_exit_icq,

5054

++		.elevator_set_req_fn =		bfq_set_request,

5055

++		.elevator_put_req_fn =		bfq_put_request,

5056

++		.elevator_may_queue_fn =	bfq_may_queue,

5057

++		.elevator_init_fn =		bfq_init_queue,

5058

++		.elevator_exit_fn =		bfq_exit_queue,

5059

++	},

5060

++	.icq_size =		sizeof(struct bfq_io_cq),

5061

++	.icq_align =		__alignof__(struct bfq_io_cq),

5062

++	.elevator_attrs =	bfq_attrs,

5063

++	.elevator_name =	"bfq",

5064

++	.elevator_owner =	THIS_MODULE,

5065

++};

5066

++

5067

++static int __init bfq_init(void)

5068

++{

5069

++	/*

5070

++	 * Can be 0 on HZ < 1000 setups.

5071

++	 */

5072

++	if (bfq_slice_idle == 0)

5073

++		bfq_slice_idle = 1;

5074

++

5075

++	if (bfq_timeout_async == 0)

5076

++		bfq_timeout_async = 1;

5077

++

5078

++	if (bfq_slab_setup())

5079

++		return -ENOMEM;

5080

++

5081

++	/*

5082

++	 * Times to load large popular applications for the typical systems

5083

++	 * installed on the reference devices (see the comments before the

5084

++	 * definitions of the two arrays).

5085

++	 */

5086

++	T_slow[0] = msecs_to_jiffies(2600);

5087

++	T_slow[1] = msecs_to_jiffies(1000);

5088

++	T_fast[0] = msecs_to_jiffies(5500);

5089

++	T_fast[1] = msecs_to_jiffies(2000);

5090

++

5091

++	/*

5092

++	 * Thresholds that determine the switch between speed classes (see

5093

++	 * the comments before the definition of the array).

5094

++	 */

5095

++	device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;

5096

++	device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;

5097

++

5098

++	elv_register(&iosched_bfq);

5099

++	pr_info("BFQ I/O-scheduler: v7r8");

5100

++

5101

++	return 0;

5102

++}

5103

++

5104

++static void __exit bfq_exit(void)

5105

++{

5106

++	elv_unregister(&iosched_bfq);

5107

++	bfq_slab_kill();

5108

++}

5109

++

5110

++module_init(bfq_init);

5111

++module_exit(bfq_exit);

5112

++

5113

++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");

5114

++MODULE_LICENSE("GPL");

5115

+diff --git a/block/bfq-sched.c b/block/bfq-sched.c

5116

+new file mode 100644

5117

+index 0000000..c343099

5118

+--- /dev/null

5119

++++ b/block/bfq-sched.c

5120

+@@ -0,0 +1,1208 @@

5121

++/*

5122

++ * BFQ: Hierarchical B-WF2Q+ scheduler.

5123

++ *

5124

++ * Based on ideas and code from CFQ:

5125

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

5126

++ *

5127

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

5128

++ *		      Paolo Valente <paolo.valente@×××××××.it>

5129

++ *

5130

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

5131

++ */

5132

++

5133

++#ifdef CONFIG_CGROUP_BFQIO

5134

++#define for_each_entity(entity)	\

5135

++	for (; entity != NULL; entity = entity->parent)

5136

++

5137

++#define for_each_entity_safe(entity, parent) \

5138

++	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)

5139

++

5140

++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

5141

++						 int extract,

5142

++						 struct bfq_data *bfqd);

5143

++

5144

++static inline void bfq_update_budget(struct bfq_entity *next_in_service)

5145

++{

5146

++	struct bfq_entity *bfqg_entity;

5147

++	struct bfq_group *bfqg;

5148

++	struct bfq_sched_data *group_sd;

5149

++

5150

++	BUG_ON(next_in_service == NULL);

5151

++

5152

++	group_sd = next_in_service->sched_data;

5153

++

5154

++	bfqg = container_of(group_sd, struct bfq_group, sched_data);

5155

++	/*

5156

++	 * bfq_group's my_entity field is not NULL only if the group

5157

++	 * is not the root group. We must not touch the root entity

5158

++	 * as it must never become an in-service entity.

5159

++	 */

5160

++	bfqg_entity = bfqg->my_entity;

5161

++	if (bfqg_entity != NULL)

5162

++		bfqg_entity->budget = next_in_service->budget;

5163

++}

5164

++

5165

++static int bfq_update_next_in_service(struct bfq_sched_data *sd)

5166

++{

5167

++	struct bfq_entity *next_in_service;

5168

++

5169

++	if (sd->in_service_entity != NULL)

5170

++		/* will update/requeue at the end of service */

5171

++		return 0;

5172

++

5173

++	/*

5174

++	 * NOTE: this can be improved in many ways, such as returning

5175

++	 * 1 (and thus propagating upwards the update) only when the

5176

++	 * budget changes, or caching the bfqq that will be scheduled

5177

++	 * next from this subtree.  For now we worry more about

5178

++	 * correctness than about performance...

5179

++	 */

5180

++	next_in_service = bfq_lookup_next_entity(sd, 0, NULL);

5181

++	sd->next_in_service = next_in_service;

5182

++

5183

++	if (next_in_service != NULL)

5184

++		bfq_update_budget(next_in_service);

5185

++

5186

++	return 1;

5187

++}

5188

++

5189

++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,

5190

++					     struct bfq_entity *entity)

5191

++{

5192

++	BUG_ON(sd->next_in_service != entity);

5193

++}

5194

++#else

5195

++#define for_each_entity(entity)	\

5196

++	for (; entity != NULL; entity = NULL)

5197

++

5198

++#define for_each_entity_safe(entity, parent) \

5199

++	for (parent = NULL; entity != NULL; entity = parent)

5200

++

5201

++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)

5202

++{

5203

++	return 0;

5204

++}

5205

++

5206

++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,

5207

++					     struct bfq_entity *entity)

5208

++{

5209

++}

5210

++

5211

++static inline void bfq_update_budget(struct bfq_entity *next_in_service)

5212

++{

5213

++}

5214

++#endif

5215

++

5216

++/*

5217

++ * Shift for timestamp calculations.  This actually limits the maximum

5218

++ * service allowed in one timestamp delta (small shift values increase it),

5219

++ * the maximum total weight that can be used for the queues in the system

5220

++ * (big shift values increase it), and the period of virtual time

5221

++ * wraparounds.

5222

++ */

5223

++#define WFQ_SERVICE_SHIFT	22

5224

++

5225

++/**

5226

++ * bfq_gt - compare two timestamps.

5227

++ * @a: first ts.

5228

++ * @b: second ts.

5229

++ *

5230

++ * Return @a > @b, dealing with wrapping correctly.

5231

++ */

5232

++static inline int bfq_gt(u64 a, u64 b)

5233

++{

5234

++	return (s64)(a - b) > 0;

5235

++}

5236

++

5237

++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)

5238

++{

5239

++	struct bfq_queue *bfqq = NULL;

5240

++

5241

++	BUG_ON(entity == NULL);

5242

++

5243

++	if (entity->my_sched_data == NULL)

5244

++		bfqq = container_of(entity, struct bfq_queue, entity);

5245

++

5246

++	return bfqq;

5247

++}

5248

++

5249

++

5250

++/**

5251

++ * bfq_delta - map service into the virtual time domain.

5252

++ * @service: amount of service.

5253

++ * @weight: scale factor (weight of an entity or weight sum).

5254

++ */

5255

++static inline u64 bfq_delta(unsigned long service,

5256

++					unsigned long weight)

5257

++{

5258

++	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

5259

++

5260

++	do_div(d, weight);

5261

++	return d;

5262

++}

5263

++

5264

++/**

5265

++ * bfq_calc_finish - assign the finish time to an entity.

5266

++ * @entity: the entity to act upon.

5267

++ * @service: the service to be charged to the entity.

5268

++ */

5269

++static inline void bfq_calc_finish(struct bfq_entity *entity,

5270

++				   unsigned long service)

5271

++{

5272

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5273

++

5274

++	BUG_ON(entity->weight == 0);

5275

++

5276

++	entity->finish = entity->start +

5277

++		bfq_delta(service, entity->weight);

5278

++

5279

++	if (bfqq != NULL) {

5280

++		bfq_log_bfqq(bfqq->bfqd, bfqq,

5281

++			"calc_finish: serv %lu, w %d",

5282

++			service, entity->weight);

5283

++		bfq_log_bfqq(bfqq->bfqd, bfqq,

5284

++			"calc_finish: start %llu, finish %llu, delta %llu",

5285

++			entity->start, entity->finish,

5286

++			bfq_delta(service, entity->weight));

5287

++	}

5288

++}

5289

++

5290

++/**

5291

++ * bfq_entity_of - get an entity from a node.

5292

++ * @node: the node field of the entity.

5293

++ *

5294

++ * Convert a node pointer to the corresponding entity.  This is used only

5295

++ * to simplify the logic of some functions and not as the generic

5296

++ * conversion mechanism because, e.g., in the tree walking functions,

5297

++ * the check for a %NULL value would be redundant.

5298

++ */

5299

++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)

5300

++{

5301

++	struct bfq_entity *entity = NULL;

5302

++

5303

++	if (node != NULL)

5304

++		entity = rb_entry(node, struct bfq_entity, rb_node);

5305

++

5306

++	return entity;

5307

++}

5308

++

5309

++/**

5310

++ * bfq_extract - remove an entity from a tree.

5311

++ * @root: the tree root.

5312

++ * @entity: the entity to remove.

5313

++ */

5314

++static inline void bfq_extract(struct rb_root *root,

5315

++			       struct bfq_entity *entity)

5316

++{

5317

++	BUG_ON(entity->tree != root);

5318

++

5319

++	entity->tree = NULL;

5320

++	rb_erase(&entity->rb_node, root);

5321

++}

5322

++

5323

++/**

5324

++ * bfq_idle_extract - extract an entity from the idle tree.

5325

++ * @st: the service tree of the owning @entity.

5326

++ * @entity: the entity being removed.

5327

++ */

5328

++static void bfq_idle_extract(struct bfq_service_tree *st,

5329

++			     struct bfq_entity *entity)

5330

++{

5331

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5332

++	struct rb_node *next;

5333

++

5334

++	BUG_ON(entity->tree != &st->idle);

5335

++

5336

++	if (entity == st->first_idle) {

5337

++		next = rb_next(&entity->rb_node);

5338

++		st->first_idle = bfq_entity_of(next);

5339

++	}

5340

++

5341

++	if (entity == st->last_idle) {

5342

++		next = rb_prev(&entity->rb_node);

5343

++		st->last_idle = bfq_entity_of(next);

5344

++	}

5345

++

5346

++	bfq_extract(&st->idle, entity);

5347

++

5348

++	if (bfqq != NULL)

5349

++		list_del(&bfqq->bfqq_list);

5350

++}

5351

++

5352

++/**

5353

++ * bfq_insert - generic tree insertion.

5354

++ * @root: tree root.

5355

++ * @entity: entity to insert.

5356

++ *

5357

++ * This is used for the idle and the active tree, since they are both

5358

++ * ordered by finish time.

5359

++ */

5360

++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)

5361

++{

5362

++	struct bfq_entity *entry;

5363

++	struct rb_node **node = &root->rb_node;

5364

++	struct rb_node *parent = NULL;

5365

++

5366

++	BUG_ON(entity->tree != NULL);

5367

++

5368

++	while (*node != NULL) {

5369

++		parent = *node;

5370

++		entry = rb_entry(parent, struct bfq_entity, rb_node);

5371

++

5372

++		if (bfq_gt(entry->finish, entity->finish))

5373

++			node = &parent->rb_left;

5374

++		else

5375

++			node = &parent->rb_right;

5376

++	}

5377

++

5378

++	rb_link_node(&entity->rb_node, parent, node);

5379

++	rb_insert_color(&entity->rb_node, root);

5380

++

5381

++	entity->tree = root;

5382

++}

5383

++

5384

++/**

5385

++ * bfq_update_min - update the min_start field of an entity.

5386

++ * @entity: the entity to update.

5387

++ * @node: one of its children.

5388

++ *

5389

++ * This function is called when @entity may store an invalid value for

5390

++ * min_start due to updates to the active tree.  The function assumes

5391

++ * that the subtree rooted at @node (which may be its left or its right

5392

++ * child) has a valid min_start value.

5393

++ */

5394

++static inline void bfq_update_min(struct bfq_entity *entity,

5395

++				  struct rb_node *node)

5396

++{

5397

++	struct bfq_entity *child;

5398

++

5399

++	if (node != NULL) {

5400

++		child = rb_entry(node, struct bfq_entity, rb_node);

5401

++		if (bfq_gt(entity->min_start, child->min_start))

5402

++			entity->min_start = child->min_start;

5403

++	}

5404

++}

5405

++

5406

++/**

5407

++ * bfq_update_active_node - recalculate min_start.

5408

++ * @node: the node to update.

5409

++ *

5410

++ * @node may have changed position or one of its children may have moved,

5411

++ * this function updates its min_start value.  The left and right subtrees

5412

++ * are assumed to hold a correct min_start value.

5413

++ */

5414

++static inline void bfq_update_active_node(struct rb_node *node)

5415

++{

5416

++	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

5417

++

5418

++	entity->min_start = entity->start;

5419

++	bfq_update_min(entity, node->rb_right);

5420

++	bfq_update_min(entity, node->rb_left);

5421

++}

5422

++

5423

++/**

5424

++ * bfq_update_active_tree - update min_start for the whole active tree.

5425

++ * @node: the starting node.

5426

++ *

5427

++ * @node must be the deepest modified node after an update.  This function

5428

++ * updates its min_start using the values held by its children, assuming

5429

++ * that they did not change, and then updates all the nodes that may have

5430

++ * changed in the path to the root.  The only nodes that may have changed

5431

++ * are the ones in the path or their siblings.

5432

++ */

5433

++static void bfq_update_active_tree(struct rb_node *node)

5434

++{

5435

++	struct rb_node *parent;

5436

++

5437

++up:

5438

++	bfq_update_active_node(node);

5439

++

5440

++	parent = rb_parent(node);

5441

++	if (parent == NULL)

5442

++		return;

5443

++

5444

++	if (node == parent->rb_left && parent->rb_right != NULL)

5445

++		bfq_update_active_node(parent->rb_right);

5446

++	else if (parent->rb_left != NULL)

5447

++		bfq_update_active_node(parent->rb_left);

5448

++

5449

++	node = parent;

5450

++	goto up;

5451

++}

5452

++

5453

++static void bfq_weights_tree_add(struct bfq_data *bfqd,

5454

++				 struct bfq_entity *entity,

5455

++				 struct rb_root *root);

5456

++

5457

++static void bfq_weights_tree_remove(struct bfq_data *bfqd,

5458

++				    struct bfq_entity *entity,

5459

++				    struct rb_root *root);

5460

++

5461

++

5462

++/**

5463

++ * bfq_active_insert - insert an entity in the active tree of its

5464

++ *                     group/device.

5465

++ * @st: the service tree of the entity.

5466

++ * @entity: the entity being inserted.

5467

++ *

5468

++ * The active tree is ordered by finish time, but an extra key is kept

5469

++ * for each node, containing the minimum value for the start times of

5470

++ * its children (and the node itself), so it's possible to search for

5471

++ * the eligible node with the lowest finish time in logarithmic time.

5472

++ */

5473

++static void bfq_active_insert(struct bfq_service_tree *st,

5474

++			      struct bfq_entity *entity)

5475

++{

5476

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5477

++	struct rb_node *node = &entity->rb_node;

5478

++#ifdef CONFIG_CGROUP_BFQIO

5479

++	struct bfq_sched_data *sd = NULL;

5480

++	struct bfq_group *bfqg = NULL;

5481

++	struct bfq_data *bfqd = NULL;

5482

++#endif

5483

++

5484

++	bfq_insert(&st->active, entity);

5485

++

5486

++	if (node->rb_left != NULL)

5487

++		node = node->rb_left;

5488

++	else if (node->rb_right != NULL)

5489

++		node = node->rb_right;

5490

++

5491

++	bfq_update_active_tree(node);

5492

++

5493

++#ifdef CONFIG_CGROUP_BFQIO

5494

++	sd = entity->sched_data;

5495

++	bfqg = container_of(sd, struct bfq_group, sched_data);

5496

++	BUG_ON(!bfqg);

5497

++	bfqd = (struct bfq_data *)bfqg->bfqd;

5498

++#endif

5499

++	if (bfqq != NULL)

5500

++		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);

5501

++#ifdef CONFIG_CGROUP_BFQIO

5502

++	else { /* bfq_group */

5503

++		BUG_ON(!bfqd);

5504

++		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);

5505

++	}

5506

++	if (bfqg != bfqd->root_group) {

5507

++		BUG_ON(!bfqg);

5508

++		BUG_ON(!bfqd);

5509

++		bfqg->active_entities++;

5510

++		if (bfqg->active_entities == 2)

5511

++			bfqd->active_numerous_groups++;

5512

++	}

5513

++#endif

5514

++}

5515

++

5516

++/**

5517

++ * bfq_ioprio_to_weight - calc a weight from an ioprio.

5518

++ * @ioprio: the ioprio value to convert.

5519

++ */

5520

++static inline unsigned short bfq_ioprio_to_weight(int ioprio)

5521

++{

5522

++	BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);

5523

++	return IOPRIO_BE_NR - ioprio;

5524

++}

5525

++

5526

++/**

5527

++ * bfq_weight_to_ioprio - calc an ioprio from a weight.

5528

++ * @weight: the weight value to convert.

5529

++ *

5530

++ * To preserve as much as possible the old only-ioprio user interface,

5531

++ * 0 is used as an escape ioprio value for weights (numerically) equal to or

5532

++ * larger than IOPRIO_BE_NR.

5533

++ */

5534

++static inline unsigned short bfq_weight_to_ioprio(int weight)

5535

++{

5536

++	BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);

5537

++	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;

5538

++}

5539

++

5540

++static inline void bfq_get_entity(struct bfq_entity *entity)

5541

++{

5542

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5543

++

5544

++	if (bfqq != NULL) {

5545

++		atomic_inc(&bfqq->ref);

5546

++		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",

5547

++			     bfqq, atomic_read(&bfqq->ref));

5548

++	}

5549

++}

5550

++

5551

++/**

5552

++ * bfq_find_deepest - find the deepest node that an extraction can modify.

5553

++ * @node: the node being removed.

5554

++ *

5555

++ * Do the first step of an extraction in an rb tree, looking for the

5556

++ * node that will replace @node, and returning the deepest node that

5557

++ * the following modifications to the tree can touch.  If @node is the

5558

++ * last node in the tree return %NULL.

5559

++ */

5560

++static struct rb_node *bfq_find_deepest(struct rb_node *node)

5561

++{

5562

++	struct rb_node *deepest;

5563

++

5564

++	if (node->rb_right == NULL && node->rb_left == NULL)

5565

++		deepest = rb_parent(node);

5566

++	else if (node->rb_right == NULL)

5567

++		deepest = node->rb_left;

5568

++	else if (node->rb_left == NULL)

5569

++		deepest = node->rb_right;

5570

++	else {

5571

++		deepest = rb_next(node);

5572

++		if (deepest->rb_right != NULL)

5573

++			deepest = deepest->rb_right;

5574

++		else if (rb_parent(deepest) != node)

5575

++			deepest = rb_parent(deepest);

5576

++	}

5577

++

5578

++	return deepest;

5579

++}

5580

++

5581

++/**

5582

++ * bfq_active_extract - remove an entity from the active tree.

5583

++ * @st: the service_tree containing the tree.

5584

++ * @entity: the entity being removed.

5585

++ */

5586

++static void bfq_active_extract(struct bfq_service_tree *st,

5587

++			       struct bfq_entity *entity)

5588

++{

5589

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5590

++	struct rb_node *node;

5591

++#ifdef CONFIG_CGROUP_BFQIO

5592

++	struct bfq_sched_data *sd = NULL;

5593

++	struct bfq_group *bfqg = NULL;

5594

++	struct bfq_data *bfqd = NULL;

5595

++#endif

5596

++

5597

++	node = bfq_find_deepest(&entity->rb_node);

5598

++	bfq_extract(&st->active, entity);

5599

++

5600

++	if (node != NULL)

5601

++		bfq_update_active_tree(node);

5602

++

5603

++#ifdef CONFIG_CGROUP_BFQIO

5604

++	sd = entity->sched_data;

5605

++	bfqg = container_of(sd, struct bfq_group, sched_data);

5606

++	BUG_ON(!bfqg);

5607

++	bfqd = (struct bfq_data *)bfqg->bfqd;

5608

++#endif

5609

++	if (bfqq != NULL)

5610

++		list_del(&bfqq->bfqq_list);

5611

++#ifdef CONFIG_CGROUP_BFQIO

5612

++	else { /* bfq_group */

5613

++		BUG_ON(!bfqd);

5614

++		bfq_weights_tree_remove(bfqd, entity,

5615

++					&bfqd->group_weights_tree);

5616

++	}

5617

++	if (bfqg != bfqd->root_group) {

5618

++		BUG_ON(!bfqg);

5619

++		BUG_ON(!bfqd);

5620

++		BUG_ON(!bfqg->active_entities);

5621

++		bfqg->active_entities--;

5622

++		if (bfqg->active_entities == 1) {

5623

++			BUG_ON(!bfqd->active_numerous_groups);

5624

++			bfqd->active_numerous_groups--;

5625

++		}

5626

++	}

5627

++#endif

5628

++}

5629

++

5630

++/**

5631

++ * bfq_idle_insert - insert an entity into the idle tree.

5632

++ * @st: the service tree containing the tree.

5633

++ * @entity: the entity to insert.

5634

++ */

5635

++static void bfq_idle_insert(struct bfq_service_tree *st,

5636

++			    struct bfq_entity *entity)

5637

++{

5638

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5639

++	struct bfq_entity *first_idle = st->first_idle;

5640

++	struct bfq_entity *last_idle = st->last_idle;

5641

++

5642

++	if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))

5643

++		st->first_idle = entity;

5644

++	if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))

5645

++		st->last_idle = entity;

5646

++

5647

++	bfq_insert(&st->idle, entity);

5648

++

5649

++	if (bfqq != NULL)

5650

++		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);

5651

++}

5652

++

5653

++/**

5654

++ * bfq_forget_entity - remove an entity from the wfq trees.

5655

++ * @st: the service tree.

5656

++ * @entity: the entity being removed.

5657

++ *

5658

++ * Update the device status and forget everything about @entity, putting

5659

++ * the device reference to it, if it is a queue.  Entities belonging to

5660

++ * groups are not refcounted.

5661

++ */

5662

++static void bfq_forget_entity(struct bfq_service_tree *st,

5663

++			      struct bfq_entity *entity)

5664

++{

5665

++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5666

++	struct bfq_sched_data *sd;

5667

++

5668

++	BUG_ON(!entity->on_st);

5669

++

5670

++	entity->on_st = 0;

5671

++	st->wsum -= entity->weight;

5672

++	if (bfqq != NULL) {

5673

++		sd = entity->sched_data;

5674

++		bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",

5675

++			     bfqq, atomic_read(&bfqq->ref));

5676

++		bfq_put_queue(bfqq);

5677

++	}

5678

++}

5679

++

5680

++/**

5681

++ * bfq_put_idle_entity - release the idle tree ref of an entity.

5682

++ * @st: service tree for the entity.

5683

++ * @entity: the entity being released.

5684

++ */

5685

++static void bfq_put_idle_entity(struct bfq_service_tree *st,

5686

++				struct bfq_entity *entity)

5687

++{

5688

++	bfq_idle_extract(st, entity);

5689

++	bfq_forget_entity(st, entity);

5690

++}

5691

++

5692

++/**

5693

++ * bfq_forget_idle - update the idle tree if necessary.

5694

++ * @st: the service tree to act upon.

5695

++ *

5696

++ * To preserve the global O(log N) complexity we only remove one entry here;

5697

++ * as the idle tree will not grow indefinitely this can be done safely.

5698

++ */

5699

++static void bfq_forget_idle(struct bfq_service_tree *st)

5700

++{

5701

++	struct bfq_entity *first_idle = st->first_idle;

5702

++	struct bfq_entity *last_idle = st->last_idle;

5703

++

5704

++	if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&

5705

++	    !bfq_gt(last_idle->finish, st->vtime)) {

5706

++		/*

5707

++		 * Forget the whole idle tree, increasing the vtime past

5708

++		 * the last finish time of idle entities.

5709

++		 */

5710

++		st->vtime = last_idle->finish;

5711

++	}

5712

++

5713

++	if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))

5714

++		bfq_put_idle_entity(st, first_idle);

5715

++}

5716

++

5717

++static struct bfq_service_tree *

5718

++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,

5719

++			 struct bfq_entity *entity)

5720

++{

5721

++	struct bfq_service_tree *new_st = old_st;

5722

++

5723

++	if (entity->ioprio_changed) {

5724

++		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

5725

++		unsigned short prev_weight, new_weight;

5726

++		struct bfq_data *bfqd = NULL;

5727

++		struct rb_root *root;

5728

++#ifdef CONFIG_CGROUP_BFQIO

5729

++		struct bfq_sched_data *sd;

5730

++		struct bfq_group *bfqg;

5731

++#endif

5732

++

5733

++		if (bfqq != NULL)

5734

++			bfqd = bfqq->bfqd;

5735

++#ifdef CONFIG_CGROUP_BFQIO

5736

++		else {

5737

++			sd = entity->my_sched_data;

5738

++			bfqg = container_of(sd, struct bfq_group, sched_data);

5739

++			BUG_ON(!bfqg);

5740

++			bfqd = (struct bfq_data *)bfqg->bfqd;

5741

++			BUG_ON(!bfqd);

5742

++		}

5743

++#endif

5744

++

5745

++		BUG_ON(old_st->wsum < entity->weight);

5746

++		old_st->wsum -= entity->weight;

5747

++

5748

++		if (entity->new_weight != entity->orig_weight) {

5749

++			if (entity->new_weight < BFQ_MIN_WEIGHT ||

5750

++			    entity->new_weight > BFQ_MAX_WEIGHT) {

5751

++				printk(KERN_CRIT "update_weight_prio: "

5752

++						 "new_weight %d\n",

5753

++					entity->new_weight);

5754

++				BUG();

5755

++			}

5756

++			entity->orig_weight = entity->new_weight;

5757

++			entity->ioprio =

5758

++				bfq_weight_to_ioprio(entity->orig_weight);

5759

++		}

5760

++

5761

++		entity->ioprio_class = entity->new_ioprio_class;

5762

++		entity->ioprio_changed = 0;

5763

++

5764

++		/*

5765

++		 * NOTE: here we may be changing the weight too early,

5766

++		 * this will cause unfairness.  The correct approach

5767

++		 * would have required additional complexity to defer

5768

++		 * weight changes to the proper time instants (i.e.,

5769

++		 * when entity->finish <= old_st->vtime).

5770

++		 */

5771

++		new_st = bfq_entity_service_tree(entity);

5772

++

5773

++		prev_weight = entity->weight;

5774

++		new_weight = entity->orig_weight *

5775

++			     (bfqq != NULL ? bfqq->wr_coeff : 1);

5776

++		/*

5777

++		 * If the weight of the entity changes, remove the entity

5778

++		 * from its old weight counter (if there is a counter

5779

++		 * associated with the entity), and add it to the counter

5780

++		 * associated with its new weight.

5781

++		 */

5782

++		if (prev_weight != new_weight) {

5783

++			root = bfqq ? &bfqd->queue_weights_tree :

5784

++				      &bfqd->group_weights_tree;

5785

++			bfq_weights_tree_remove(bfqd, entity, root);

5786

++		}

5787

++		entity->weight = new_weight;

5788

++		/*

5789

++		 * Add the entity to its weights tree only if it is

5790

++		 * not associated with a weight-raised queue.

5791

++		 */

5792

++		if (prev_weight != new_weight &&

5793

++		    (bfqq ? bfqq->wr_coeff == 1 : 1))

5794

++			/* If we get here, root has been initialized. */

5795

++			bfq_weights_tree_add(bfqd, entity, root);

5796

++

5797

++		new_st->wsum += entity->weight;

5798

++

5799

++		if (new_st != old_st)

5800

++			entity->start = new_st->vtime;

5801

++	}

5802

++

5803

++	return new_st;

5804

++}

5805

++

5806

++/**

5807

++ * bfq_bfqq_served - update the scheduler status after selection for

5808

++ *                   service.

5809

++ * @bfqq: the queue being served.

5810

++ * @served: bytes to transfer.

5811

++ *

5812

++ * NOTE: this can be optimized, as the timestamps of upper level entities

5813

++ * are synchronized every time a new bfqq is selected for service.  By now,

5814

++ * we keep it to better check consistency.

5815

++ */

5816

++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)

5817

++{

5818

++	struct bfq_entity *entity = &bfqq->entity;

5819

++	struct bfq_service_tree *st;

5820

++

5821

++	for_each_entity(entity) {

5822

++		st = bfq_entity_service_tree(entity);

5823

++

5824

++		entity->service += served;

5825

++		BUG_ON(entity->service > entity->budget);

5826

++		BUG_ON(st->wsum == 0);

5827

++

5828

++		st->vtime += bfq_delta(served, st->wsum);

5829

++		bfq_forget_idle(st);

5830

++	}

5831

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);

5832

++}

5833

++

5834

++/**

5835

++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.

5836

++ * @bfqq: the queue that needs a service update.

5837

++ *

5838

++ * When it's not possible to be fair in the service domain, because

5839

++ * a queue is not consuming its budget fast enough (the meaning of

5840

++ * fast depends on the timeout parameter), we charge it a full

5841

++ * budget.  In this way we should obtain a sort of time-domain

5842

++ * fairness among all the seeky/slow queues.

5843

++ */

5844

++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)

5845

++{

5846

++	struct bfq_entity *entity = &bfqq->entity;

5847

++

5848

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");

5849

++

5850

++	bfq_bfqq_served(bfqq, entity->budget - entity->service);

5851

++}

5852

++

5853

++/**

5854

++ * __bfq_activate_entity - activate an entity.

5855

++ * @entity: the entity being activated.

5856

++ *

5857

++ * Called whenever an entity is activated, i.e., it is not active and one

5858

++ * of its children receives a new request, or has to be reactivated due to

5859

++ * budget exhaustion.  It uses the current budget of the entity (and the

5860

++ * service received if @entity is active) of the queue to calculate its

5861

++ * timestamps.

5862

++ */

5863

++static void __bfq_activate_entity(struct bfq_entity *entity)

5864

++{

5865

++	struct bfq_sched_data *sd = entity->sched_data;

5866

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

5867

++

5868

++	if (entity == sd->in_service_entity) {

5869

++		BUG_ON(entity->tree != NULL);

5870

++		/*

5871

++		 * If we are requeueing the current entity we have

5872

++		 * to take care of not charging to it service it has

5873

++		 * not received.

5874

++		 */

5875

++		bfq_calc_finish(entity, entity->service);

5876

++		entity->start = entity->finish;

5877

++		sd->in_service_entity = NULL;

5878

++	} else if (entity->tree == &st->active) {

5879

++		/*

5880

++		 * Requeueing an entity due to a change of some

5881

++		 * next_in_service entity below it.  We reuse the

5882

++		 * old start time.

5883

++		 */

5884

++		bfq_active_extract(st, entity);

5885

++	} else if (entity->tree == &st->idle) {

5886

++		/*

5887

++		 * Must be on the idle tree, bfq_idle_extract() will

5888

++		 * check for that.

5889

++		 */

5890

++		bfq_idle_extract(st, entity);

5891

++		entity->start = bfq_gt(st->vtime, entity->finish) ?

5892

++				       st->vtime : entity->finish;

5893

++	} else {

5894

++		/*

5895

++		 * The finish time of the entity may be invalid, and

5896

++		 * it is in the past for sure, otherwise the queue

5897

++		 * would have been on the idle tree.

5898

++		 */

5899

++		entity->start = st->vtime;

5900

++		st->wsum += entity->weight;

5901

++		bfq_get_entity(entity);

5902

++

5903

++		BUG_ON(entity->on_st);

5904

++		entity->on_st = 1;

5905

++	}

5906

++

5907

++	st = __bfq_entity_update_weight_prio(st, entity);

5908

++	bfq_calc_finish(entity, entity->budget);

5909

++	bfq_active_insert(st, entity);

5910

++}

5911

++

5912

++/**

5913

++ * bfq_activate_entity - activate an entity and its ancestors if necessary.

5914

++ * @entity: the entity to activate.

5915

++ *

5916

++ * Activate @entity and all the entities on the path from it to the root.

5917

++ */

5918

++static void bfq_activate_entity(struct bfq_entity *entity)

5919

++{

5920

++	struct bfq_sched_data *sd;

5921

++

5922

++	for_each_entity(entity) {

5923

++		__bfq_activate_entity(entity);

5924

++

5925

++		sd = entity->sched_data;

5926

++		if (!bfq_update_next_in_service(sd))

5927

++			/*

5928

++			 * No need to propagate the activation to the

5929

++			 * upper entities, as they will be updated when

5930

++			 * the in-service entity is rescheduled.

5931

++			 */

5932

++			break;

5933

++	}

5934

++}

5935

++

5936

++/**

5937

++ * __bfq_deactivate_entity - deactivate an entity from its service tree.

5938

++ * @entity: the entity to deactivate.

5939

++ * @requeue: if false, the entity will not be put into the idle tree.

5940

++ *

5941

++ * Deactivate an entity, independently from its previous state.  If the

5942

++ * entity was not on a service tree just return, otherwise if it is on

5943

++ * any scheduler tree, extract it from that tree, and if necessary

5944

++ * and if the caller did specify @requeue, put it on the idle tree.

5945

++ *

5946

++ * Return %1 if the caller should update the entity hierarchy, i.e.,

5947

++ * if the entity was in service or if it was the next_in_service for

5948

++ * its sched_data; return %0 otherwise.

5949

++ */

5950

++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)

5951

++{

5952

++	struct bfq_sched_data *sd = entity->sched_data;

5953

++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

5954

++	int was_in_service = entity == sd->in_service_entity;

5955

++	int ret = 0;

5956

++

5957

++	if (!entity->on_st)

5958

++		return 0;

5959

++

5960

++	BUG_ON(was_in_service && entity->tree != NULL);

5961

++

5962

++	if (was_in_service) {

5963

++		bfq_calc_finish(entity, entity->service);

5964

++		sd->in_service_entity = NULL;

5965

++	} else if (entity->tree == &st->active)

5966

++		bfq_active_extract(st, entity);

5967

++	else if (entity->tree == &st->idle)

5968

++		bfq_idle_extract(st, entity);

5969

++	else if (entity->tree != NULL)

5970

++		BUG();

5971

++

5972

++	if (was_in_service || sd->next_in_service == entity)

5973

++		ret = bfq_update_next_in_service(sd);

5974

++

5975

++	if (!requeue || !bfq_gt(entity->finish, st->vtime))

5976

++		bfq_forget_entity(st, entity);

5977

++	else

5978

++		bfq_idle_insert(st, entity);

5979

++

5980

++	BUG_ON(sd->in_service_entity == entity);

5981

++	BUG_ON(sd->next_in_service == entity);

5982

++

5983

++	return ret;

5984

++}

5985

++

5986

++/**

5987

++ * bfq_deactivate_entity - deactivate an entity.

5988

++ * @entity: the entity to deactivate.

5989

++ * @requeue: true if the entity can be put on the idle tree

5990

++ */

5991

++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)

5992

++{

5993

++	struct bfq_sched_data *sd;

5994

++	struct bfq_entity *parent;

5995

++

5996

++	for_each_entity_safe(entity, parent) {

5997

++		sd = entity->sched_data;

5998

++

5999

++		if (!__bfq_deactivate_entity(entity, requeue))

6000

++			/*

6001

++			 * The parent entity is still backlogged, and

6002

++			 * we don't need to update it as it is still

6003

++			 * in service.

6004

++			 */

6005

++			break;

6006

++

6007

++		if (sd->next_in_service != NULL)

6008

++			/*

6009

++			 * The parent entity is still backlogged and

6010

++			 * the budgets on the path towards the root

6011

++			 * need to be updated.

6012

++			 */

6013

++			goto update;

6014

++

6015

++		/*

6016

++		 * If we reach here, the parent is no longer backlogged and

6017

++		 * we want to propagate the dequeue upwards.

6018

++		 */

6019

++		requeue = 1;

6020

++	}

6021

++

6022

++	return;

6023

++

6024

++update:

6025

++	entity = parent;

6026

++	for_each_entity(entity) {

6027

++		__bfq_activate_entity(entity);

6028

++

6029

++		sd = entity->sched_data;

6030

++		if (!bfq_update_next_in_service(sd))

6031

++			break;

6032

++	}

6033

++}

6034

++

6035

++/**

6036

++ * bfq_update_vtime - update vtime if necessary.

6037

++ * @st: the service tree to act upon.

6038

++ *

6039

++ * If necessary update the service tree vtime to have at least one

6040

++ * eligible entity, skipping to its start time.  Assumes that the

6041

++ * active tree of the device is not empty.

6042

++ *

6043

++ * NOTE: this hierarchical implementation updates vtimes quite often,

6044

++ * we may end up with reactivated processes getting timestamps after a

6045

++ * vtime skip done because we needed a ->first_active entity on some

6046

++ * intermediate node.

6047

++ */

6048

++static void bfq_update_vtime(struct bfq_service_tree *st)

6049

++{

6050

++	struct bfq_entity *entry;

6051

++	struct rb_node *node = st->active.rb_node;

6052

++

6053

++	entry = rb_entry(node, struct bfq_entity, rb_node);

6054

++	if (bfq_gt(entry->min_start, st->vtime)) {

6055

++		st->vtime = entry->min_start;

6056

++		bfq_forget_idle(st);

6057

++	}

6058

++}

6059

++

6060

++/**

6061

++ * bfq_first_active_entity - find the eligible entity with

6062

++ *                           the smallest finish time

6063

++ * @st: the service tree to select from.

6064

++ *

6065

++ * This function searches the first schedulable entity, starting from the

6066

++ * root of the tree and going on the left every time on this side there is

6067

++ * a subtree with at least one eligible (start <= vtime) entity. The path on

6068

++ * the right is followed only if a) the left subtree contains no eligible

6069

++ * entities and b) no eligible entity has been found yet.

6070

++ */

6071

++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)

6072

++{

6073

++	struct bfq_entity *entry, *first = NULL;

6074

++	struct rb_node *node = st->active.rb_node;

6075

++

6076

++	while (node != NULL) {

6077

++		entry = rb_entry(node, struct bfq_entity, rb_node);

6078

++left:

6079

++		if (!bfq_gt(entry->start, st->vtime))

6080

++			first = entry;

6081

++

6082

++		BUG_ON(bfq_gt(entry->min_start, st->vtime));

6083

++

6084

++		if (node->rb_left != NULL) {

6085

++			entry = rb_entry(node->rb_left,

6086

++					 struct bfq_entity, rb_node);

6087

++			if (!bfq_gt(entry->min_start, st->vtime)) {

6088

++				node = node->rb_left;

6089

++				goto left;

6090

++			}

6091

++		}

6092

++		if (first != NULL)

6093

++			break;

6094

++		node = node->rb_right;

6095

++	}

6096

++

6097

++	BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));

6098

++	return first;

6099

++}

6100

++

6101

++/**

6102

++ * __bfq_lookup_next_entity - return the first eligible entity in @st.

6103

++ * @st: the service tree.

6104

++ *

6105

++ * Update the virtual time in @st and return the first eligible entity

6106

++ * it contains.

6107

++ */

6108

++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,

6109

++						   bool force)

6110

++{

6111

++	struct bfq_entity *entity, *new_next_in_service = NULL;

6112

++

6113

++	if (RB_EMPTY_ROOT(&st->active))

6114

++		return NULL;

6115

++

6116

++	bfq_update_vtime(st);

6117

++	entity = bfq_first_active_entity(st);

6118

++	BUG_ON(bfq_gt(entity->start, st->vtime));

6119

++

6120

++	/*

6121

++	 * If the chosen entity does not match with the sched_data's

6122

++	 * next_in_service and we are forcedly serving the IDLE priority

6123

++	 * class tree, bubble up budget update.

6124

++	 */

6125

++	if (unlikely(force && entity != entity->sched_data->next_in_service)) {

6126

++		new_next_in_service = entity;

6127

++		for_each_entity(new_next_in_service)

6128

++			bfq_update_budget(new_next_in_service);

6129

++	}

6130

++

6131

++	return entity;

6132

++}

6133

++

6134

++/**

6135

++ * bfq_lookup_next_entity - return the first eligible entity in @sd.

6136

++ * @sd: the sched_data.

6137

++ * @extract: if true the returned entity will be also extracted from @sd.

6138

++ *

6139

++ * NOTE: since we cache the next_in_service entity at each level of the

6140

++ * hierarchy, the complexity of the lookup can be decreased with

6141

++ * absolutely no effort just returning the cached next_in_service value;

6142

++ * we prefer to do full lookups to test the consistency of the data

6143

++ * structures.

6144

++ */

6145

++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,

6146

++						 int extract,

6147

++						 struct bfq_data *bfqd)

6148

++{

6149

++	struct bfq_service_tree *st = sd->service_tree;

6150

++	struct bfq_entity *entity;

6151

++	int i = 0;

6152

++

6153

++	BUG_ON(sd->in_service_entity != NULL);

6154

++

6155

++	if (bfqd != NULL &&

6156

++	    jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {

6157

++		entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,

6158

++						  true);

6159

++		if (entity != NULL) {

6160

++			i = BFQ_IOPRIO_CLASSES - 1;

6161

++			bfqd->bfq_class_idle_last_service = jiffies;

6162

++			sd->next_in_service = entity;

6163

++		}

6164

++	}

6165

++	for (; i < BFQ_IOPRIO_CLASSES; i++) {

6166

++		entity = __bfq_lookup_next_entity(st + i, false);

6167

++		if (entity != NULL) {

6168

++			if (extract) {

6169

++				bfq_check_next_in_service(sd, entity);

6170

++				bfq_active_extract(st + i, entity);

6171

++				sd->in_service_entity = entity;

6172

++				sd->next_in_service = NULL;

6173

++			}

6174

++			break;

6175

++		}

6176

++	}

6177

++

6178

++	return entity;

6179

++}

6180

++

6181

++/*

6182

++ * Get next queue for service.

6183

++ */

6184

++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)

6185

++{

6186

++	struct bfq_entity *entity = NULL;

6187

++	struct bfq_sched_data *sd;

6188

++	struct bfq_queue *bfqq;

6189

++

6190

++	BUG_ON(bfqd->in_service_queue != NULL);

6191

++

6192

++	if (bfqd->busy_queues == 0)

6193

++		return NULL;

6194

++

6195

++	sd = &bfqd->root_group->sched_data;

6196

++	for (; sd != NULL; sd = entity->my_sched_data) {

6197

++		entity = bfq_lookup_next_entity(sd, 1, bfqd);

6198

++		BUG_ON(entity == NULL);

6199

++		entity->service = 0;

6200

++	}

6201

++

6202

++	bfqq = bfq_entity_to_bfqq(entity);

6203

++	BUG_ON(bfqq == NULL);

6204

++

6205

++	return bfqq;

6206

++}

6207

++

6208

++/*

6209

++ * Forced extraction of the given queue.

6210

++ */

6211

++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,

6212

++				      struct bfq_queue *bfqq)

6213

++{

6214

++	struct bfq_entity *entity;

6215

++	struct bfq_sched_data *sd;

6216

++

6217

++	BUG_ON(bfqd->in_service_queue != NULL);

6218

++

6219

++	entity = &bfqq->entity;

6220

++	/*

6221

++	 * Bubble up extraction/update from the leaf to the root.

6222

++	 */

6223

++	for_each_entity(entity) {

6224

++		sd = entity->sched_data;

6225

++		bfq_update_budget(entity);

6226

++		bfq_update_vtime(bfq_entity_service_tree(entity));

6227

++		bfq_active_extract(bfq_entity_service_tree(entity), entity);

6228

++		sd->in_service_entity = entity;

6229

++		sd->next_in_service = NULL;

6230

++		entity->service = 0;

6231

++	}

6232

++

6233

++	return;

6234

++}

6235

++

6236

++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)

6237

++{

6238

++	if (bfqd->in_service_bic != NULL) {

6239

++		put_io_context(bfqd->in_service_bic->icq.ioc);

6240

++		bfqd->in_service_bic = NULL;

6241

++	}

6242

++

6243

++	bfqd->in_service_queue = NULL;

6244

++	del_timer(&bfqd->idle_slice_timer);

6245

++}

6246

++

6247

++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

6248

++				int requeue)

6249

++{

6250

++	struct bfq_entity *entity = &bfqq->entity;

6251

++

6252

++	if (bfqq == bfqd->in_service_queue)

6253

++		__bfq_bfqd_reset_in_service(bfqd);

6254

++

6255

++	bfq_deactivate_entity(entity, requeue);

6256

++}

6257

++

6258

++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)

6259

++{

6260

++	struct bfq_entity *entity = &bfqq->entity;

6261

++

6262

++	bfq_activate_entity(entity);

6263

++}

6264

++

6265

++/*

6266

++ * Called when the bfqq no longer has requests pending, remove it from

6267

++ * the service tree.

6268

++ */

6269

++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,

6270

++			      int requeue)

6271

++{

6272

++	BUG_ON(!bfq_bfqq_busy(bfqq));

6273

++	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

6274

++

6275

++	bfq_log_bfqq(bfqd, bfqq, "del from busy");

6276

++

6277

++	bfq_clear_bfqq_busy(bfqq);

6278

++

6279

++	BUG_ON(bfqd->busy_queues == 0);

6280

++	bfqd->busy_queues--;

6281

++

6282

++	if (!bfqq->dispatched) {

6283

++		bfq_weights_tree_remove(bfqd, &bfqq->entity,

6284

++					&bfqd->queue_weights_tree);

6285

++		if (!blk_queue_nonrot(bfqd->queue)) {

6286

++			BUG_ON(!bfqd->busy_in_flight_queues);

6287

++			bfqd->busy_in_flight_queues--;

6288

++			if (bfq_bfqq_constantly_seeky(bfqq)) {

6289

++				BUG_ON(!bfqd->

6290

++					const_seeky_busy_in_flight_queues);

6291

++				bfqd->const_seeky_busy_in_flight_queues--;

6292

++			}

6293

++		}

6294

++	}

6295

++	if (bfqq->wr_coeff > 1)

6296

++		bfqd->wr_busy_queues--;

6297

++

6298

++	bfq_deactivate_bfqq(bfqd, bfqq, requeue);

6299

++}

6300

++

6301

++/*

6302

++ * Called when an inactive queue receives a new request.

6303

++ */

6304

++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)

6305

++{

6306

++	BUG_ON(bfq_bfqq_busy(bfqq));

6307

++	BUG_ON(bfqq == bfqd->in_service_queue);

6308

++

6309

++	bfq_log_bfqq(bfqd, bfqq, "add to busy");

6310

++

6311

++	bfq_activate_bfqq(bfqd, bfqq);

6312

++

6313

++	bfq_mark_bfqq_busy(bfqq);

6314

++	bfqd->busy_queues++;

6315

++

6316

++	if (!bfqq->dispatched) {

6317

++		if (bfqq->wr_coeff == 1)

6318

++			bfq_weights_tree_add(bfqd, &bfqq->entity,

6319

++					     &bfqd->queue_weights_tree);

6320

++		if (!blk_queue_nonrot(bfqd->queue)) {

6321

++			bfqd->busy_in_flight_queues++;

6322

++			if (bfq_bfqq_constantly_seeky(bfqq))

6323

++				bfqd->const_seeky_busy_in_flight_queues++;

6324

++		}

6325

++	}

6326

++	if (bfqq->wr_coeff > 1)

6327

++		bfqd->wr_busy_queues++;

6328

++}

6329

+diff --git a/block/bfq.h b/block/bfq.h

6330

+new file mode 100644

6331

+index 0000000..e350b5f

6332

+--- /dev/null

6333

++++ b/block/bfq.h

6334

+@@ -0,0 +1,771 @@

6335

++/*

6336

++ * BFQ-v7r8 for 4.2.0: data structures and common functions prototypes.

6337

++ *

6338

++ * Based on ideas and code from CFQ:

6339

++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>

6340

++ *

6341

++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>

6342

++ *		      Paolo Valente <paolo.valente@×××××××.it>

6343

++ *

6344

++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>

6345

++ */

6346

++

6347

++#ifndef _BFQ_H

6348

++#define _BFQ_H

6349

++

6350

++#include <linux/blktrace_api.h>

6351

++#include <linux/hrtimer.h>

6352

++#include <linux/ioprio.h>

6353

++#include <linux/rbtree.h>

6354

++

6355

++#define BFQ_IOPRIO_CLASSES	3

6356

++#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

6357

++

6358

++#define BFQ_MIN_WEIGHT	1

6359

++#define BFQ_MAX_WEIGHT	1000

6360

++

6361

++#define BFQ_DEFAULT_QUEUE_IOPRIO	4

6362

++

6363

++#define BFQ_DEFAULT_GRP_WEIGHT	10

6364

++#define BFQ_DEFAULT_GRP_IOPRIO	0

6365

++#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

6366

++

6367

++struct bfq_entity;

6368

++

6369

++/**

6370

++ * struct bfq_service_tree - per ioprio_class service tree.

6371

++ * @active: tree for active entities (i.e., those backlogged).

6372

++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).

6373

++ * @first_idle: idle entity with minimum F_i.

6374

++ * @last_idle: idle entity with maximum F_i.

6375

++ * @vtime: scheduler virtual time.

6376

++ * @wsum: scheduler weight sum; active and idle entities contribute to it.

6377

++ *

6378

++ * Each service tree represents a B-WF2Q+ scheduler on its own.  Each

6379

++ * ioprio_class has its own independent scheduler, and so its own

6380

++ * bfq_service_tree.  All the fields are protected by the queue lock

6381

++ * of the containing bfqd.

6382

++ */

6383

++struct bfq_service_tree {

6384

++	struct rb_root active;

6385

++	struct rb_root idle;

6386

++

6387

++	struct bfq_entity *first_idle;

6388

++	struct bfq_entity *last_idle;

6389

++

6390

++	u64 vtime;

6391

++	unsigned long wsum;

6392

++};

6393

++

6394

++/**

6395

++ * struct bfq_sched_data - multi-class scheduler.

6396

++ * @in_service_entity: entity in service.

6397

++ * @next_in_service: head-of-the-line entity in the scheduler.

6398

++ * @service_tree: array of service trees, one per ioprio_class.

6399

++ *

6400

++ * bfq_sched_data is the basic scheduler queue.  It supports three

6401

++ * ioprio_classes, and can be used either as a toplevel queue or as

6402

++ * an intermediate queue on a hierarchical setup.

6403

++ * @next_in_service points to the active entity of the sched_data

6404

++ * service trees that will be scheduled next.

6405

++ *

6406

++ * The supported ioprio_classes are the same as in CFQ, in descending

6407

++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.

6408

++ * Requests from higher priority queues are served before all the

6409

++ * requests from lower priority queues; among requests of the same

6410

++ * queue requests are served according to B-WF2Q+.

6411

++ * All the fields are protected by the queue lock of the containing bfqd.

6412

++ */

6413

++struct bfq_sched_data {

6414

++	struct bfq_entity *in_service_entity;

6415

++	struct bfq_entity *next_in_service;

6416

++	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];

6417

++};

6418

++

6419

++/**

6420

++ * struct bfq_weight_counter - counter of the number of all active entities

6421

++ *                             with a given weight.

6422

++ * @weight: weight of the entities that this counter refers to.

6423

++ * @num_active: number of active entities with this weight.

6424

++ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree

6425

++ *                and @group_weights_tree).

6426

++ */

6427

++struct bfq_weight_counter {

6428

++	short int weight;

6429

++	unsigned int num_active;

6430

++	struct rb_node weights_node;

6431

++};

6432

++

6433

++/**

6434

++ * struct bfq_entity - schedulable entity.

6435

++ * @rb_node: service_tree member.

6436

++ * @weight_counter: pointer to the weight counter associated with this entity.

6437

++ * @on_st: flag, true if the entity is on a tree (either the active or

6438

++ *         the idle one of its service_tree).

6439

++ * @finish: B-WF2Q+ finish timestamp (aka F_i).

6440

++ * @start: B-WF2Q+ start timestamp (aka S_i).

6441

++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.

6442

++ * @min_start: minimum start time of the (active) subtree rooted at

6443

++ *             this entity; used for O(log N) lookups into active trees.

6444

++ * @service: service received during the last round of service.

6445

++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.

6446

++ * @weight: weight of the queue

6447

++ * @parent: parent entity, for hierarchical scheduling.

6448

++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the

6449

++ *                 associated scheduler queue, %NULL on leaf nodes.

6450

++ * @sched_data: the scheduler queue this entity belongs to.

6451

++ * @ioprio: the ioprio in use.

6452

++ * @new_weight: when a weight change is requested, the new weight value.

6453

++ * @orig_weight: original weight, used to implement weight boosting

6454

++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.

6455

++ * @ioprio_class: the ioprio_class in use.

6456

++ * @new_ioprio_class: when an ioprio_class change is requested, the new

6457

++ *                    ioprio_class value.

6458

++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or

6459

++ *                  ioprio_class change.

6460

++ *

6461

++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the

6462

++ * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each

6463

++ * entity belongs to the sched_data of the parent group in the cgroup

6464

++ * hierarchy.  Non-leaf entities have also their own sched_data, stored

6465

++ * in @my_sched_data.

6466

++ *

6467

++ * Each entity stores independently its priority values; this would

6468

++ * allow different weights on different devices, but this

6469

++ * functionality is not exported to userspace by now.  Priorities and

6470

++ * weights are updated lazily, first storing the new values into the

6471

++ * new_* fields, then setting the @ioprio_changed flag.  As soon as

6472

++ * there is a transition in the entity state that allows the priority

6473

++ * update to take place the effective and the requested priority

6474

++ * values are synchronized.

6475

++ *

6476

++ * Unless cgroups are used, the weight value is calculated from the

6477

++ * ioprio to export the same interface as CFQ.  When dealing with

6478

++ * ``well-behaved'' queues (i.e., queues that do not spend too much

6479

++ * time to consume their budget and have true sequential behavior, and

6480

++ * when there are no external factors breaking anticipation) the

6481

++ * relative weights at each level of the cgroups hierarchy should be

6482

++ * guaranteed.  All the fields are protected by the queue lock of the

6483

++ * containing bfqd.

6484

++ */

6485

++struct bfq_entity {

6486

++	struct rb_node rb_node;

6487

++	struct bfq_weight_counter *weight_counter;

6488

++

6489

++	int on_st;

6490

++

6491

++	u64 finish;

6492

++	u64 start;

6493

++

6494

++	struct rb_root *tree;

6495

++

6496

++	u64 min_start;

6497

++

6498

++	unsigned long service, budget;

6499

++	unsigned short weight, new_weight;

6500

++	unsigned short orig_weight;

6501

++

6502

++	struct bfq_entity *parent;

6503

++

6504

++	struct bfq_sched_data *my_sched_data;

6505

++	struct bfq_sched_data *sched_data;

6506

++

6507

++	unsigned short ioprio, new_ioprio;

6508

++	unsigned short ioprio_class, new_ioprio_class;

6509

++

6510

++	int ioprio_changed;

6511

++};

6512

++

6513

++struct bfq_group;

6514

++

6515

++/**

6516

++ * struct bfq_queue - leaf schedulable entity.

6517

++ * @ref: reference counter.

6518

++ * @bfqd: parent bfq_data.

6519

++ * @new_bfqq: shared bfq_queue if queue is cooperating with

6520

++ *           one or more other queues.

6521

++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).

6522

++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).

6523

++ * @sort_list: sorted list of pending requests.

6524

++ * @next_rq: if fifo isn't expired, next request to serve.

6525

++ * @queued: nr of requests queued in @sort_list.

6526

++ * @allocated: currently allocated requests.

6527

++ * @meta_pending: pending metadata requests.

6528

++ * @fifo: fifo list of requests in sort_list.

6529

++ * @entity: entity representing this queue in the scheduler.

6530

++ * @max_budget: maximum budget allowed from the feedback mechanism.

6531

++ * @budget_timeout: budget expiration (in jiffies).

6532

++ * @dispatched: number of requests on the dispatch list or inside driver.

6533

++ * @flags: status flags.

6534

++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.

6535

++ * @burst_list_node: node for the device's burst list.

6536

++ * @seek_samples: number of seeks sampled

6537

++ * @seek_total: sum of the distances of the seeks sampled

6538

++ * @seek_mean: mean seek distance

6539

++ * @last_request_pos: position of the last request enqueued

6540

++ * @requests_within_timer: number of consecutive pairs of request completion

6541

++ *                         and arrival, such that the queue becomes idle

6542

++ *                         after the completion, but the next request arrives

6543

++ *                         within an idle time slice; used only if the queue's

6544

++ *                         IO_bound has been cleared.

6545

++ * @pid: pid of the process owning the queue, used for logging purposes.

6546

++ * @last_wr_start_finish: start time of the current weight-raising period if

6547

++ *                        the @bfq_queue is being weight-raised, otherwise

6548

++ *                        finish time of the last weight-raising period

6549

++ * @wr_cur_max_time: current max raising time for this queue

6550

++ * @soft_rt_next_start: minimum time instant such that, only if a new

6551

++ *                      request is enqueued after this time instant in an

6552

++ *                      idle @bfq_queue with no outstanding requests, then

6553

++ *                      the task associated with the queue is deemed as

6554

++ *                      soft real-time (see the comments to the function

6555

++ *                      bfq_bfqq_softrt_next_start()).

6556

++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from

6557

++ *                      idle to backlogged

6558

++ * @service_from_backlogged: cumulative service received from the @bfq_queue

6559

++ *                           since the last transition from idle to

6560

++ *                           backlogged

6561

++ *

6562

++ * A bfq_queue is a leaf request queue; it can be associated with an io_context

6563

++ * or more, if it is async or shared between cooperating processes. @cgroup

6564

++ * holds a reference to the cgroup, to be sure that it does not disappear while

6565

++ * a bfqq still references it (mostly to avoid races between request issuing and

6566

++ * task migration followed by cgroup destruction).

6567

++ * All the fields are protected by the queue lock of the containing bfqd.

6568

++ */

6569

++struct bfq_queue {

6570

++	atomic_t ref;

6571

++	struct bfq_data *bfqd;

6572

++

6573

++	/* fields for cooperating queues handling */

6574

++	struct bfq_queue *new_bfqq;

6575

++	struct rb_node pos_node;

6576

++	struct rb_root *pos_root;

6577

++

6578

++	struct rb_root sort_list;

6579

++	struct request *next_rq;

6580

++	int queued[2];

6581

++	int allocated[2];

6582

++	int meta_pending;

6583

++	struct list_head fifo;

6584

++

6585

++	struct bfq_entity entity;

6586

++

6587

++	unsigned long max_budget;

6588

++	unsigned long budget_timeout;

6589

++

6590

++	int dispatched;

6591

++

6592

++	unsigned int flags;

6593

++

6594

++	struct list_head bfqq_list;

6595

++

6596

++	struct hlist_node burst_list_node;

6597

++

6598

++	unsigned int seek_samples;

6599

++	u64 seek_total;

6600

++	sector_t seek_mean;

6601

++	sector_t last_request_pos;

6602

++

6603

++	unsigned int requests_within_timer;

6604

++

6605

++	pid_t pid;

6606

++

6607

++	/* weight-raising fields */

6608

++	unsigned long wr_cur_max_time;

6609

++	unsigned long soft_rt_next_start;

6610

++	unsigned long last_wr_start_finish;

6611

++	unsigned int wr_coeff;

6612

++	unsigned long last_idle_bklogged;

6613

++	unsigned long service_from_backlogged;

6614

++};

6615

++

6616

++/**

6617

++ * struct bfq_ttime - per process thinktime stats.

6618

++ * @ttime_total: total process thinktime

6619

++ * @ttime_samples: number of thinktime samples

6620

++ * @ttime_mean: average process thinktime

6621

++ */

6622

++struct bfq_ttime {

6623

++	unsigned long last_end_request;

6624

++

6625

++	unsigned long ttime_total;

6626

++	unsigned long ttime_samples;

6627

++	unsigned long ttime_mean;

6628

++};

6629

++

6630

++/**

6631

++ * struct bfq_io_cq - per (request_queue, io_context) structure.

6632

++ * @icq: associated io_cq structure

6633

++ * @bfqq: array of two process queues, the sync and the async

6634

++ * @ttime: associated @bfq_ttime struct

6635

++ */

6636

++struct bfq_io_cq {

6637

++	struct io_cq icq; /* must be the first member */

6638

++	struct bfq_queue *bfqq[2];

6639

++	struct bfq_ttime ttime;

6640

++	int ioprio;

6641

++};

6642

++

6643

++enum bfq_device_speed {

6644

++	BFQ_BFQD_FAST,

6645

++	BFQ_BFQD_SLOW,

6646

++};

6647

++

6648

++/**

6649

++ * struct bfq_data - per device data structure.

6650

++ * @queue: request queue for the managed device.

6651

++ * @root_group: root bfq_group for the device.

6652

++ * @rq_pos_tree: rbtree sorted by next_request position, used when

6653

++ *               determining if two or more queues have interleaving

6654

++ *               requests (see bfq_close_cooperator()).

6655

++ * @active_numerous_groups: number of bfq_groups containing more than one

6656

++ *                          active @bfq_entity.

6657

++ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by

6658

++ *                      weight. Used to keep track of whether all @bfq_queues

6659

++ *                     have the same weight. The tree contains one counter

6660

++ *                     for each distinct weight associated to some active

6661

++ *                     and not weight-raised @bfq_queue (see the comments to

6662

++ *                      the functions bfq_weights_tree_[add|remove] for

6663

++ *                     further details).

6664

++ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted

6665

++ *                      by weight. Used to keep track of whether all

6666

++ *                     @bfq_groups have the same weight. The tree contains

6667

++ *                     one counter for each distinct weight associated to

6668

++ *                     some active @bfq_group (see the comments to the

6669

++ *                     functions bfq_weights_tree_[add|remove] for further

6670

++ *                     details).

6671

++ * @busy_queues: number of bfq_queues containing requests (including the

6672

++ *		 queue in service, even if it is idling).

6673

++ * @busy_in_flight_queues: number of @bfq_queues containing pending or

6674

++ *                         in-flight requests, plus the @bfq_queue in

6675

++ *                         service, even if idle but waiting for the

6676

++ *                         possible arrival of its next sync request. This

6677

++ *                         field is updated only if the device is rotational,

6678

++ *                         but used only if the device is also NCQ-capable.

6679

++ *                         The reason why the field is updated also for non-

6680

++ *                         NCQ-capable rotational devices is related to the

6681

++ *                         fact that the value of @hw_tag may be set also

6682

++ *                         later than when busy_in_flight_queues may need to

6683

++ *                         be incremented for the first time(s). Taking also

6684

++ *                         this possibility into account, to avoid unbalanced

6685

++ *                         increments/decrements, would imply more overhead

6686

++ *                         than just updating busy_in_flight_queues

6687

++ *                         regardless of the value of @hw_tag.

6688

++ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues

6689

++ *                                     (that is, seeky queues that expired

6690

++ *                                     for budget timeout at least once)

6691

++ *                                     containing pending or in-flight

6692

++ *                                     requests, including the in-service

6693

++ *                                     @bfq_queue if constantly seeky. This

6694

++ *                                     field is updated only if the device

6695

++ *                                     is rotational, but used only if the

6696

++ *                                     device is also NCQ-capable (see the

6697

++ *                                     comments to @busy_in_flight_queues).

6698

++ * @wr_busy_queues: number of weight-raised busy @bfq_queues.

6699

++ * @queued: number of queued requests.

6700

++ * @rq_in_driver: number of requests dispatched and waiting for completion.

6701

++ * @sync_flight: number of sync requests in the driver.

6702

++ * @max_rq_in_driver: max number of reqs in driver in the last

6703

++ *                    @hw_tag_samples completed requests.

6704

++ * @hw_tag_samples: nr of samples used to calculate hw_tag.

6705

++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.

6706

++ * @budgets_assigned: number of budgets assigned.

6707

++ * @idle_slice_timer: timer set when idling for the next sequential request

6708

++ *                    from the queue in service.

6709

++ * @unplug_work: delayed work to restart dispatching on the request queue.

6710

++ * @in_service_queue: bfq_queue in service.

6711

++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.

6712

++ * @last_position: on-disk position of the last served request.

6713

++ * @last_budget_start: beginning of the last budget.

6714

++ * @last_idling_start: beginning of the last idle slice.

6715

++ * @peak_rate: peak transfer rate observed for a budget.

6716

++ * @peak_rate_samples: number of samples used to calculate @peak_rate.

6717

++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before

6718

++ *                  rescheduling.

6719

++ * @group_list: list of all the bfq_groups active on the device.

6720

++ * @active_list: list of all the bfq_queues active on the device.

6721

++ * @idle_list: list of all the bfq_queues idle on the device.

6722

++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires

6723

++ *                   requests are served in fifo order.

6724

++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.

6725

++ * @bfq_back_max: maximum allowed backward seek.

6726

++ * @bfq_slice_idle: maximum idling time.

6727

++ * @bfq_user_max_budget: user-configured max budget value

6728

++ *                       (0 for auto-tuning).

6729

++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to

6730

++ *                           async queues.

6731

++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to

6732

++ *               prevent seeky queues from imposing long latencies on well

6733

++ *               behaved ones (this also implies that seeky queues cannot

6734

++ *               receive guarantees in the service domain; after a timeout

6735

++ *               they are charged for the whole allocated budget, to try

6736

++ *               to preserve a behavior reasonably fair among them, but

6737

++ *               without service-domain guarantees).

6738

++ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is

6739

++ *                   no more granted any weight-raising.

6740

++ * @bfq_failed_cooperations: number of consecutive failed cooperation

6741

++ *                           chances after which weight-raising is restored

6742

++ *                           to a queue subject to more than bfq_coop_thresh

6743

++ *                           queue merges.

6744

++ * @bfq_requests_within_timer: number of consecutive requests that must be

6745

++ *                             issued within the idle time slice to set

6746

++ *                             again idling to a queue which was marked as

6747

++ *                             non-I/O-bound (see the definition of the

6748

++ *                             IO_bound flag for further details).

6749

++ * @last_ins_in_burst: last time at which a queue entered the current

6750

++ *                     burst of queues being activated shortly after

6751

++ *                     each other; for more details about this and the

6752

++ *                     following parameters related to a burst of

6753

++ *                     activations, see the comments to the function

6754

++ *                     @bfq_handle_burst.

6755

++ * @bfq_burst_interval: reference time interval used to decide whether a

6756

++ *                      queue has been activated shortly after

6757

++ *                      @last_ins_in_burst.

6758

++ * @burst_size: number of queues in the current burst of queue activations.

6759

++ * @bfq_large_burst_thresh: maximum burst size above which the current

6760

++ * 			    queue-activation burst is deemed as 'large'.

6761

++ * @large_burst: true if a large queue-activation burst is in progress.

6762

++ * @burst_list: head of the burst list (as for the above fields, more details

6763

++ * 		in the comments to the function bfq_handle_burst).

6764

++ * @low_latency: if set to true, low-latency heuristics are enabled.

6765

++ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised

6766

++ *                queue is multiplied.

6767

++ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).

6768

++ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.

6769

++ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising

6770

++ *			  may be reactivated for a queue (in jiffies).

6771

++ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals

6772

++ *				after which weight-raising may be

6773

++ *				reactivated for an already busy queue

6774

++ *				(in jiffies).

6775

++ * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue,

6776

++ *			    in sectors per second.

6777

++ * @RT_prod: cached value of the product R*T used for computing the maximum

6778

++ *	     duration of the weight raising automatically.

6779

++ * @device_speed: device-speed class for the low-latency heuristic.

6780

++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.

6781

++ *

6782

++ * All the fields are protected by the @queue lock.

6783

++ */

6784

++struct bfq_data {

6785

++	struct request_queue *queue;

6786

++

6787

++	struct bfq_group *root_group;

6788

++	struct rb_root rq_pos_tree;

6789

++

6790

++#ifdef CONFIG_CGROUP_BFQIO

6791

++	int active_numerous_groups;

6792

++#endif

6793

++

6794

++	struct rb_root queue_weights_tree;

6795

++	struct rb_root group_weights_tree;

6796

++

6797

++	int busy_queues;

6798

++	int busy_in_flight_queues;

6799

++	int const_seeky_busy_in_flight_queues;

6800

++	int wr_busy_queues;

6801

++	int queued;

6802

++	int rq_in_driver;

6803

++	int sync_flight;

6804

++

6805

++	int max_rq_in_driver;

6806

++	int hw_tag_samples;

6807

++	int hw_tag;

6808

++

6809

++	int budgets_assigned;

6810

++

6811

++	struct timer_list idle_slice_timer;

6812

++	struct work_struct unplug_work;

6813

++

6814

++	struct bfq_queue *in_service_queue;

6815

++	struct bfq_io_cq *in_service_bic;

6816

++

6817

++	sector_t last_position;

6818

++

6819

++	ktime_t last_budget_start;

6820

++	ktime_t last_idling_start;

6821

++	int peak_rate_samples;

6822

++	u64 peak_rate;

6823

++	unsigned long bfq_max_budget;

6824

++

6825

++	struct hlist_head group_list;

6826

++	struct list_head active_list;

6827

++	struct list_head idle_list;

6828

++

6829

++	unsigned int bfq_fifo_expire[2];

6830

++	unsigned int bfq_back_penalty;

6831

++	unsigned int bfq_back_max;

6832

++	unsigned int bfq_slice_idle;

6833

++	u64 bfq_class_idle_last_service;

6834

++

6835

++	unsigned int bfq_user_max_budget;

6836

++	unsigned int bfq_max_budget_async_rq;

6837

++	unsigned int bfq_timeout[2];

6838

++

6839

++	unsigned int bfq_coop_thresh;

6840

++	unsigned int bfq_failed_cooperations;

6841

++	unsigned int bfq_requests_within_timer;

6842

++

6843

++	unsigned long last_ins_in_burst;

6844

++	unsigned long bfq_burst_interval;

6845

++	int burst_size;

6846

++	unsigned long bfq_large_burst_thresh;

6847

++	bool large_burst;

6848

++	struct hlist_head burst_list;

6849

++

6850

++	bool low_latency;

6851

++

6852

++	/* parameters of the low_latency heuristics */

6853

++	unsigned int bfq_wr_coeff;

6854

++	unsigned int bfq_wr_max_time;

6855

++	unsigned int bfq_wr_rt_max_time;

6856

++	unsigned int bfq_wr_min_idle_time;

6857

++	unsigned long bfq_wr_min_inter_arr_async;

6858

++	unsigned int bfq_wr_max_softrt_rate;

6859

++	u64 RT_prod;

6860

++	enum bfq_device_speed device_speed;

6861

++

6862

++	struct bfq_queue oom_bfqq;

6863

++};

6864

++

6865

++enum bfqq_state_flags {

6866

++	BFQ_BFQQ_FLAG_busy = 0,		/* has requests or is in service */

6867

++	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */

6868

++	BFQ_BFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */

6869

++	BFQ_BFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */

6870

++	BFQ_BFQQ_FLAG_idle_window,	/* slice idling enabled */

6871

++	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */

6872

++	BFQ_BFQQ_FLAG_budget_new,	/* no completion with this budget */

6873

++	BFQ_BFQQ_FLAG_IO_bound,         /*

6874

++					 * bfqq has timed-out at least once

6875

++					 * having consumed at most 2/10 of

6876

++					 * its budget

6877

++					 */

6878

++	BFQ_BFQQ_FLAG_in_large_burst,	/*

6879

++					 * bfqq activated in a large burst,

6880

++					 * see comments to bfq_handle_burst.

6881

++					 */

6882

++	BFQ_BFQQ_FLAG_constantly_seeky,	/*

6883

++					 * bfqq has proved to be slow and

6884

++					 * seeky until budget timeout

6885

++					 */

6886

++	BFQ_BFQQ_FLAG_softrt_update,    /*

6887

++					 * may need softrt-next-start

6888

++					 * update

6889

++					 */

6890

++	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */

6891

++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */

6892

++};

6893

++

6894

++#define BFQ_BFQQ_FNS(name)						\

6895

++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\

6896

++{									\

6897

++	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name);			\

6898

++}									\

6899

++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)	\

6900

++{									\

6901

++	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name);			\

6902

++}									\

6903

++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\

6904

++{									\

6905

++	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\

6906

++}

6907

++

6908

++BFQ_BFQQ_FNS(busy);

6909

++BFQ_BFQQ_FNS(wait_request);

6910

++BFQ_BFQQ_FNS(must_alloc);

6911

++BFQ_BFQQ_FNS(fifo_expire);

6912

++BFQ_BFQQ_FNS(idle_window);

6913

++BFQ_BFQQ_FNS(sync);

6914

++BFQ_BFQQ_FNS(budget_new);

6915

++BFQ_BFQQ_FNS(IO_bound);

6916

++BFQ_BFQQ_FNS(in_large_burst);

6917

++BFQ_BFQQ_FNS(constantly_seeky);

6918

++BFQ_BFQQ_FNS(coop);

6919

++BFQ_BFQQ_FNS(split_coop);

6920

++BFQ_BFQQ_FNS(softrt_update);

6921

++#undef BFQ_BFQQ_FNS

6922

++

6923

++/* Logging facilities. */

6924

++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \

6925

++	blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)

6926

++

6927

++#define bfq_log(bfqd, fmt, args...) \

6928

++	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)

6929

++

6930

++/* Expiration reasons. */

6931

++enum bfqq_expiration {

6932

++	BFQ_BFQQ_TOO_IDLE = 0,		/*

6933

++					 * queue has been idling for

6934

++					 * too long

6935

++					 */

6936

++	BFQ_BFQQ_BUDGET_TIMEOUT,	/* budget took too long to be used */

6937

++	BFQ_BFQQ_BUDGET_EXHAUSTED,	/* budget consumed */

6938

++	BFQ_BFQQ_NO_MORE_REQUESTS,	/* the queue has no more requests */

6939

++};

6940

++

6941

++#ifdef CONFIG_CGROUP_BFQIO

6942

++/**

6943

++ * struct bfq_group - per (device, cgroup) data structure.

6944

++ * @entity: schedulable entity to insert into the parent group sched_data.

6945

++ * @sched_data: own sched_data, to contain child entities (they may be

6946

++ *              both bfq_queues and bfq_groups).

6947

++ * @group_node: node to be inserted into the bfqio_cgroup->group_data

6948

++ *              list of the containing cgroup's bfqio_cgroup.

6949

++ * @bfqd_node: node to be inserted into the @bfqd->group_list list

6950

++ *             of the groups active on the same device; used for cleanup.

6951

++ * @bfqd: the bfq_data for the device this group acts upon.

6952

++ * @async_bfqq: array of async queues for all the tasks belonging to

6953

++ *              the group, one queue per ioprio value per ioprio_class,

6954

++ *              except for the idle class that has only one queue.

6955

++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).

6956

++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used

6957

++ *             to avoid too many special cases during group creation/

6958

++ *             migration.

6959

++ * @active_entities: number of active entities belonging to the group;

6960

++ *                   unused for the root group. Used to know whether there

6961

++ *                   are groups with more than one active @bfq_entity

6962

++ *                   (see the comments to the function

6963

++ *                   bfq_bfqq_must_not_expire()).

6964

++ *

6965

++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup

6966

++ * there is a set of bfq_groups, each one collecting the lower-level

6967

++ * entities belonging to the group that are acting on the same device.

6968

++ *

6969

++ * Locking works as follows:

6970

++ *    o @group_node is protected by the bfqio_cgroup lock, and is accessed

6971

++ *      via RCU from its readers.

6972

++ *    o @bfqd is protected by the queue lock, RCU is used to access it

6973

++ *      from the readers.

6974

++ *    o All the other fields are protected by the @bfqd queue lock.

6975

++ */

6976

++struct bfq_group {

6977

++	struct bfq_entity entity;

6978

++	struct bfq_sched_data sched_data;

6979

++

6980

++	struct hlist_node group_node;

6981

++	struct hlist_node bfqd_node;

6982

++

6983

++	void *bfqd;

6984

++

6985

++	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];

6986

++	struct bfq_queue *async_idle_bfqq;

6987

++

6988

++	struct bfq_entity *my_entity;

6989

++

6990

++	int active_entities;

6991

++};

6992

++

6993

++/**

6994

++ * struct bfqio_cgroup - bfq cgroup data structure.

6995

++ * @css: subsystem state for bfq in the containing cgroup.

6996

++ * @online: flag marked when the subsystem is inserted.

6997

++ * @weight: cgroup weight.

6998

++ * @ioprio: cgroup ioprio.

6999

++ * @ioprio_class: cgroup ioprio_class.

7000

++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.

7001

++ * @group_data: list containing the bfq_group belonging to this cgroup.

7002

++ *

7003

++ * @group_data is accessed using RCU, with @lock protecting the updates,

7004

++ * @ioprio and @ioprio_class are protected by @lock.

7005

++ */

7006

++struct bfqio_cgroup {

7007

++	struct cgroup_subsys_state css;

7008

++	bool online;

7009

++

7010

++	unsigned short weight, ioprio, ioprio_class;

7011

++

7012

++	spinlock_t lock;

7013

++	struct hlist_head group_data;

7014

++};

7015

++#else

7016

++struct bfq_group {

7017

++	struct bfq_sched_data sched_data;

7018

++

7019

++	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];

7020

++	struct bfq_queue *async_idle_bfqq;

7021

++};

7022

++#endif

7023

++

7024

++static inline struct bfq_service_tree *

7025

++bfq_entity_service_tree(struct bfq_entity *entity)

7026

++{

7027

++	struct bfq_sched_data *sched_data = entity->sched_data;

7028

++	unsigned int idx = entity->ioprio_class - 1;

7029

++

7030

++	BUG_ON(idx >= BFQ_IOPRIO_CLASSES);

7031

++	BUG_ON(sched_data == NULL);

7032

++

7033

++	return sched_data->service_tree + idx;

7034

++}

7035

++

7036

++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,

7037

++					    bool is_sync)

7038

++{

7039

++	return bic->bfqq[is_sync];

7040

++}

7041

++

7042

++static inline void bic_set_bfqq(struct bfq_io_cq *bic,

7043

++				struct bfq_queue *bfqq, bool is_sync)

7044

++{

7045

++	bic->bfqq[is_sync] = bfqq;

7046

++}

7047

++

7048

++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)

7049

++{

7050

++	return bic->icq.q->elevator->elevator_data;

7051

++}

7052

++

7053

++/**

7054

++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.

7055

++ * @ptr: a pointer to a bfqd.

7056

++ * @flags: storage for the flags to be saved.

7057

++ *

7058

++ * This function allows bfqg->bfqd to be protected by the

7059

++ * queue lock of the bfqd they reference; the pointer is dereferenced

7060

++ * under RCU, so the storage for bfqd is assured to be safe as long

7061

++ * as the RCU read side critical section does not end.  After the

7062

++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be

7063

++ * sure that no other writer accessed it.  If we raced with a writer,

7064

++ * the function returns NULL, with the queue unlocked, otherwise it

7065

++ * returns the dereferenced pointer, with the queue locked.

7066

++ */

7067

++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,

7068

++						   unsigned long *flags)

7069

++{

7070

++	struct bfq_data *bfqd;

7071

++

7072

++	rcu_read_lock();

7073

++	bfqd = rcu_dereference(*(struct bfq_data **)ptr);

7074

++

7075

++	if (bfqd != NULL) {

7076

++		spin_lock_irqsave(bfqd->queue->queue_lock, *flags);

7077

++		if (*ptr == bfqd)

7078

++			goto out;

7079

++		spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

7080

++	}

7081

++

7082

++	bfqd = NULL;

7083

++out:

7084

++	rcu_read_unlock();

7085

++	return bfqd;

7086

++}

7087

++

7088

++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,

7089

++				       unsigned long *flags)

7090

++{

7091

++	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);

7092

++}

7093

++

7094

++static void bfq_check_ioprio_change(struct bfq_io_cq *bic);

7095

++static void bfq_put_queue(struct bfq_queue *bfqq);

7096

++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);

7097

++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,

7098

++				       struct bfq_group *bfqg, int is_sync,

7099

++				       struct bfq_io_cq *bic, gfp_t gfp_mask);

7100

++static void bfq_end_wr_async_queues(struct bfq_data *bfqd,

7101

++				    struct bfq_group *bfqg);

7102

++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);

7103

++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);

7104

++

7105

++#endif /* _BFQ_H */

7106

+--

7107

+1.9.1

7108

+

7109

7110

diff --git a/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.2.patch b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.2.patch

7111

new file mode 100644

7112

index 0000000..547a098

7113

--- /dev/null

7114

+++ b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r8-for-4.2.patch

7115

@@ -0,0 +1,1220 @@

7116

+From e7a71ea27442adefc78628dedca1477a1ac6994e Mon Sep 17 00:00:00 2001

7117

+From: Mauro Andreolini <mauro.andreolini@×××××××.it>

7118

+Date: Fri, 5 Jun 2015 17:45:40 +0200

7119

+Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r8 for

7120

+ 4.2.0

7121

+

7122

+A set of processes may happen  to  perform interleaved reads, i.e., requests

7123

+whose union would give rise to a  sequential read  pattern.  There are two

7124

+typical  cases: in the first  case,   processes  read  fixed-size chunks of

7125

+data at a fixed distance from each other, while in the second case processes

7126

+may read variable-size chunks at  variable distances. The latter case occurs

7127

+for  example with  QEMU, which  splits the  I/O generated  by the  guest into

7128

+multiple chunks,  and lets these chunks  be served by a  pool of cooperating

7129

+processes,  iteratively  assigning  the  next  chunk of  I/O  to  the first

7130

+available  process. CFQ  uses actual  queue merging  for the  first type of

7131

+processes, whereas it  uses preemption to get a sequential  read pattern out

7132

+of the read requests  performed by the second type of  processes. In the end

7133

+it uses  two different  mechanisms to  achieve the  same goal: boosting the

7134

+throughput with interleaved I/O.

7135

+

7136

+This patch introduces  Early Queue Merge (EQM), a unified mechanism to get a

7137

+sequential  read pattern  with both  types of  processes. The  main idea is

7138

+checking newly arrived requests against the next request of the active queue

7139

+both in case of actual request insert and in case of request merge. By doing

7140

+so, both the types of processes can be handled by just merging their queues.

7141

+EQM is  then simpler and  more compact than the  pair of mechanisms used in

7142

+CFQ.

7143

+

7144

+Finally, EQM  also preserves the  typical low-latency properties of BFQ, by

7145

+properly restoring the weight-raising state of a queue when it gets back to

7146

+a non-merged state.

7147

+

7148

+Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>

7149

+Signed-off-by: Arianna Avanzini <avanzini@××××××.com>

7150

+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>

7151

+---

7152

+ block/bfq-iosched.c | 750 +++++++++++++++++++++++++++++++++++++---------------

7153

+ block/bfq-sched.c   |  28 --

7154

+ block/bfq.h         |  54 +++-

7155

+ 3 files changed, 580 insertions(+), 252 deletions(-)

7156

+

7157

+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

7158

+index 773b2ee..71b51c1 100644

7159

+--- a/block/bfq-iosched.c

7160

++++ b/block/bfq-iosched.c

7161

+@@ -573,6 +573,57 @@ static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)

7162

+ 	return dur;

7163

+ }

7164

+

7165

++static inline unsigned

7166

++bfq_bfqq_cooperations(struct bfq_queue *bfqq)

7167

++{

7168

++	return bfqq->bic ? bfqq->bic->cooperations : 0;

7169

++}

7170

++

7171

++static inline void

7172

++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)

7173

++{

7174

++	if (bic->saved_idle_window)

7175

++		bfq_mark_bfqq_idle_window(bfqq);

7176

++	else

7177

++		bfq_clear_bfqq_idle_window(bfqq);

7178

++	if (bic->saved_IO_bound)

7179

++		bfq_mark_bfqq_IO_bound(bfqq);

7180

++	else

7181

++		bfq_clear_bfqq_IO_bound(bfqq);

7182

++	/* Assuming that the flag in_large_burst is already correctly set */

7183

++	if (bic->wr_time_left && bfqq->bfqd->low_latency &&

7184

++	    !bfq_bfqq_in_large_burst(bfqq) &&

7185

++	    bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {

7186

++		/*

7187

++		 * Start a weight raising period with the duration given by

7188

++		 * the wr_time_left snapshot.

7189

++		 */

7190

++		if (bfq_bfqq_busy(bfqq))

7191

++			bfqq->bfqd->wr_busy_queues++;

7192

++		bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;

7193

++		bfqq->wr_cur_max_time = bic->wr_time_left;

7194

++		bfqq->last_wr_start_finish = jiffies;

7195

++		bfqq->entity.ioprio_changed = 1;

7196

++	}

7197

++	/*

7198

++	 * Clear wr_time_left to prevent bfq_bfqq_save_state() from

7199

++	 * getting confused about the queue's need of a weight-raising

7200

++	 * period.

7201

++	 */

7202

++	bic->wr_time_left = 0;

7203

++}

7204

++

7205

++/* Must be called with the queue_lock held. */

7206

++static int bfqq_process_refs(struct bfq_queue *bfqq)

7207

++{

7208

++	int process_refs, io_refs;

7209

++

7210

++	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];

7211

++	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;

7212

++	BUG_ON(process_refs < 0);

7213

++	return process_refs;

7214

++}

7215

++

7216

+ /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */

7217

+ static inline void bfq_reset_burst_list(struct bfq_data *bfqd,

7218

+ 					struct bfq_queue *bfqq)

7219

+@@ -817,7 +868,7 @@ static void bfq_add_request(struct request *rq)

7220

+ 		bfq_rq_pos_tree_add(bfqd, bfqq);

7221

+

7222

+ 	if (!bfq_bfqq_busy(bfqq)) {

7223

+-		bool soft_rt,

7224

++		bool soft_rt, coop_or_in_burst,

7225

+ 		     idle_for_long_time = time_is_before_jiffies(

7226

+ 						bfqq->budget_timeout +

7227

+ 						bfqd->bfq_wr_min_idle_time);

7228

+@@ -841,11 +892,12 @@ static void bfq_add_request(struct request *rq)

7229

+ 				bfqd->last_ins_in_burst = jiffies;

7230

+ 		}

7231

+

7232

++		coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||

7233

++			bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;

7234

+ 		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&

7235

+-			!bfq_bfqq_in_large_burst(bfqq) &&

7236

++			!coop_or_in_burst &&

7237

+ 			time_is_before_jiffies(bfqq->soft_rt_next_start);

7238

+-		interactive = !bfq_bfqq_in_large_burst(bfqq) &&

7239

+-			      idle_for_long_time;

7240

++		interactive = !coop_or_in_burst && idle_for_long_time;

7241

+ 		entity->budget = max_t(unsigned long, bfqq->max_budget,

7242

+ 				       bfq_serv_to_charge(next_rq, bfqq));

7243

+

7244

+@@ -864,11 +916,20 @@ static void bfq_add_request(struct request *rq)

7245

+ 		if (!bfqd->low_latency)

7246

+ 			goto add_bfqq_busy;

7247

+

7248

++		if (bfq_bfqq_just_split(bfqq))

7249

++			goto set_ioprio_changed;

7250

++

7251

+ 		/*

7252

+-		 * If the queue is not being boosted and has been idle

7253

+-		 * for enough time, start a weight-raising period

7254

++		 * If the queue:

7255

++		 * - is not being boosted,

7256

++		 * - has been idle for enough time,

7257

++		 * - is not a sync queue or is linked to a bfq_io_cq (it is

7258

++		 *   shared "for its nature" or it is not shared and its

7259

++		 *   requests have not been redirected to a shared queue)

7260

++		 * start a weight-raising period.

7261

+ 		 */

7262

+-		if (old_wr_coeff == 1 && (interactive || soft_rt)) {

7263

++		if (old_wr_coeff == 1 && (interactive || soft_rt) &&

7264

++		    (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {

7265

+ 			bfqq->wr_coeff = bfqd->bfq_wr_coeff;

7266

+ 			if (interactive)

7267

+ 				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

7268

+@@ -882,7 +943,7 @@ static void bfq_add_request(struct request *rq)

7269

+ 		} else if (old_wr_coeff > 1) {

7270

+ 			if (interactive)

7271

+ 				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

7272

+-			else if (bfq_bfqq_in_large_burst(bfqq) ||

7273

++			else if (coop_or_in_burst ||

7274

+ 				 (bfqq->wr_cur_max_time ==

7275

+ 				  bfqd->bfq_wr_rt_max_time &&

7276

+ 				  !soft_rt)) {

7277

+@@ -901,18 +962,18 @@ static void bfq_add_request(struct request *rq)

7278

+ 				/*

7279

+ 				 *

7280

+ 				 * The remaining weight-raising time is lower

7281

+-				 * than bfqd->bfq_wr_rt_max_time, which

7282

+-				 * means that the application is enjoying

7283

+-				 * weight raising either because deemed soft-

7284

+-				 * rt in the near past, or because deemed

7285

+-				 * interactive a long ago. In both cases,

7286

+-				 * resetting now the current remaining weight-

7287

+-				 * raising time for the application to the

7288

+-				 * weight-raising duration for soft rt

7289

+-				 * applications would not cause any latency

7290

+-				 * increase for the application (as the new

7291

+-				 * duration would be higher than the remaining

7292

+-				 * time).

7293

++				 * than bfqd->bfq_wr_rt_max_time, which means

7294

++				 * that the application is enjoying weight

7295

++				 * raising either because deemed soft-rt in

7296

++				 * the near past, or because deemed interactive

7297

++				 * a long ago.

7298

++				 * In both cases, resetting now the current

7299

++				 * remaining weight-raising time for the

7300

++				 * application to the weight-raising duration

7301

++				 * for soft rt applications would not cause any

7302

++				 * latency increase for the application (as the

7303

++				 * new duration would be higher than the

7304

++				 * remaining time).

7305

+ 				 *

7306

+ 				 * In addition, the application is now meeting

7307

+ 				 * the requirements for being deemed soft rt.

7308

+@@ -947,6 +1008,7 @@ static void bfq_add_request(struct request *rq)

7309

+ 					bfqd->bfq_wr_rt_max_time;

7310

+ 			}

7311

+ 		}

7312

++set_ioprio_changed:

7313

+ 		if (old_wr_coeff != bfqq->wr_coeff)

7314

+ 			entity->ioprio_changed = 1;

7315

+ add_bfqq_busy:

7316

+@@ -1167,90 +1229,35 @@ static void bfq_end_wr(struct bfq_data *bfqd)

7317

+ 	spin_unlock_irq(bfqd->queue->queue_lock);

7318

+ }

7319

+

7320

+-static int bfq_allow_merge(struct request_queue *q, struct request *rq,

7321

+-			   struct bio *bio)

7322

++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)

7323

+ {

7324

+-	struct bfq_data *bfqd = q->elevator->elevator_data;

7325

+-	struct bfq_io_cq *bic;

7326

+-	struct bfq_queue *bfqq;

7327

+-

7328

+-	/*

7329

+-	 * Disallow merge of a sync bio into an async request.

7330

+-	 */

7331

+-	if (bfq_bio_sync(bio) && !rq_is_sync(rq))

7332

+-		return 0;

7333

+-

7334

+-	/*

7335

+-	 * Lookup the bfqq that this bio will be queued with. Allow

7336

+-	 * merge only if rq is queued there.

7337

+-	 * Queue lock is held here.

7338

+-	 */

7339

+-	bic = bfq_bic_lookup(bfqd, current->io_context);

7340

+-	if (bic == NULL)

7341

+-		return 0;

7342

+-

7343

+-	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

7344

+-	return bfqq == RQ_BFQQ(rq);

7345

+-}

7346

+-

7347

+-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

7348

+-				       struct bfq_queue *bfqq)

7349

+-{

7350

+-	if (bfqq != NULL) {

7351

+-		bfq_mark_bfqq_must_alloc(bfqq);

7352

+-		bfq_mark_bfqq_budget_new(bfqq);

7353

+-		bfq_clear_bfqq_fifo_expire(bfqq);

7354

+-

7355

+-		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

7356

+-

7357

+-		bfq_log_bfqq(bfqd, bfqq,

7358

+-			     "set_in_service_queue, cur-budget = %lu",

7359

+-			     bfqq->entity.budget);

7360

+-	}

7361

+-

7362

+-	bfqd->in_service_queue = bfqq;

7363

+-}

7364

+-

7365

+-/*

7366

+- * Get and set a new queue for service.

7367

+- */

7368

+-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,

7369

+-						  struct bfq_queue *bfqq)

7370

+-{

7371

+-	if (!bfqq)

7372

+-		bfqq = bfq_get_next_queue(bfqd);

7373

++	if (request)

7374

++		return blk_rq_pos(io_struct);

7375

+ 	else

7376

+-		bfq_get_next_queue_forced(bfqd, bfqq);

7377

+-

7378

+-	__bfq_set_in_service_queue(bfqd, bfqq);

7379

+-	return bfqq;

7380

++		return ((struct bio *)io_struct)->bi_iter.bi_sector;

7381

+ }

7382

+

7383

+-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,

7384

+-					  struct request *rq)

7385

++static inline sector_t bfq_dist_from(sector_t pos1,

7386

++				     sector_t pos2)

7387

+ {

7388

+-	if (blk_rq_pos(rq) >= bfqd->last_position)

7389

+-		return blk_rq_pos(rq) - bfqd->last_position;

7390

++	if (pos1 >= pos2)

7391

++		return pos1 - pos2;

7392

+ 	else

7393

+-		return bfqd->last_position - blk_rq_pos(rq);

7394

++		return pos2 - pos1;

7395

+ }

7396

+

7397

+-/*

7398

+- * Return true if bfqq has no request pending and rq is close enough to

7399

+- * bfqd->last_position, or if rq is closer to bfqd->last_position than

7400

+- * bfqq->next_rq

7401

+- */

7402

+-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)

7403

++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,

7404

++					 sector_t sector)

7405

+ {

7406

+-	return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;

7407

++	return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=

7408

++	       BFQQ_SEEK_THR;

7409

+ }

7410

+

7411

+-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

7412

++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)

7413

+ {

7414

+ 	struct rb_root *root = &bfqd->rq_pos_tree;

7415

+ 	struct rb_node *parent, *node;

7416

+ 	struct bfq_queue *__bfqq;

7417

+-	sector_t sector = bfqd->last_position;

7418

+

7419

+ 	if (RB_EMPTY_ROOT(root))

7420

+ 		return NULL;

7421

+@@ -1269,7 +1276,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

7422

+ 	 * next_request position).

7423

+ 	 */

7424

+ 	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);

7425

+-	if (bfq_rq_close(bfqd, __bfqq->next_rq))

7426

++	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))

7427

+ 		return __bfqq;

7428

+

7429

+ 	if (blk_rq_pos(__bfqq->next_rq) < sector)

7430

+@@ -1280,7 +1287,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

7431

+ 		return NULL;

7432

+

7433

+ 	__bfqq = rb_entry(node, struct bfq_queue, pos_node);

7434

+-	if (bfq_rq_close(bfqd, __bfqq->next_rq))

7435

++	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))

7436

+ 		return __bfqq;

7437

+

7438

+ 	return NULL;

7439

+@@ -1289,14 +1296,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)

7440

+ /*

7441

+  * bfqd - obvious

7442

+  * cur_bfqq - passed in so that we don't decide that the current queue

7443

+- *            is closely cooperating with itself.

7444

+- *

7445

+- * We are assuming that cur_bfqq has dispatched at least one request,

7446

+- * and that bfqd->last_position reflects a position on the disk associated

7447

+- * with the I/O issued by cur_bfqq.

7448

++ *            is closely cooperating with itself

7449

++ * sector - used as a reference point to search for a close queue

7450

+  */

7451

+ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,

7452

+-					      struct bfq_queue *cur_bfqq)

7453

++					      struct bfq_queue *cur_bfqq,

7454

++					      sector_t sector)

7455

+ {

7456

+ 	struct bfq_queue *bfqq;

7457

+

7458

+@@ -1316,7 +1321,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,

7459

+ 	 * working closely on the same area of the disk. In that case,

7460

+ 	 * we can group them together and don't waste time idling.

7461

+ 	 */

7462

+-	bfqq = bfqq_close(bfqd);

7463

++	bfqq = bfqq_close(bfqd, sector);

7464

+ 	if (bfqq == NULL || bfqq == cur_bfqq)

7465

+ 		return NULL;

7466

+

7467

+@@ -1343,6 +1348,315 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,

7468

+ 	return bfqq;

7469

+ }

7470

+

7471

++static struct bfq_queue *

7472

++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

7473

++{

7474

++	int process_refs, new_process_refs;

7475

++	struct bfq_queue *__bfqq;

7476

++

7477

++	/*

7478

++	 * If there are no process references on the new_bfqq, then it is

7479

++	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain

7480

++	 * may have dropped their last reference (not just their last process

7481

++	 * reference).

7482

++	 */

7483

++	if (!bfqq_process_refs(new_bfqq))

7484

++		return NULL;

7485

++

7486

++	/* Avoid a circular list and skip interim queue merges. */

7487

++	while ((__bfqq = new_bfqq->new_bfqq)) {

7488

++		if (__bfqq == bfqq)

7489

++			return NULL;

7490

++		new_bfqq = __bfqq;

7491

++	}

7492

++

7493

++	process_refs = bfqq_process_refs(bfqq);

7494

++	new_process_refs = bfqq_process_refs(new_bfqq);

7495

++	/*

7496

++	 * If the process for the bfqq has gone away, there is no

7497

++	 * sense in merging the queues.

7498

++	 */

7499

++	if (process_refs == 0 || new_process_refs == 0)

7500

++		return NULL;

7501

++

7502

++	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",

7503

++		new_bfqq->pid);

7504

++

7505

++	/*

7506

++	 * Merging is just a redirection: the requests of the process

7507

++	 * owning one of the two queues are redirected to the other queue.

7508

++	 * The latter queue, in its turn, is set as shared if this is the

7509

++	 * first time that the requests of some process are redirected to

7510

++	 * it.

7511

++	 *

7512

++	 * We redirect bfqq to new_bfqq and not the opposite, because we

7513

++	 * are in the context of the process owning bfqq, hence we have

7514

++	 * the io_cq of this process. So we can immediately configure this

7515

++	 * io_cq to redirect the requests of the process to new_bfqq.

7516

++	 *

7517

++	 * NOTE, even if new_bfqq coincides with the in-service queue, the

7518

++	 * io_cq of new_bfqq is not available, because, if the in-service

7519

++	 * queue is shared, bfqd->in_service_bic may not point to the

7520

++	 * io_cq of the in-service queue.

7521

++	 * Redirecting the requests of the process owning bfqq to the

7522

++	 * currently in-service queue is in any case the best option, as

7523

++	 * we feed the in-service queue with new requests close to the

7524

++	 * last request served and, by doing so, hopefully increase the

7525

++	 * throughput.

7526

++	 */

7527

++	bfqq->new_bfqq = new_bfqq;

7528

++	atomic_add(process_refs, &new_bfqq->ref);

7529

++	return new_bfqq;

7530

++}

7531

++

7532

++/*

7533

++ * Attempt to schedule a merge of bfqq with the currently in-service queue

7534

++ * or with a close queue among the scheduled queues.

7535

++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue

7536

++ * structure otherwise.

7537

++ *

7538

++ * The OOM queue is not allowed to participate to cooperation: in fact, since

7539

++ * the requests temporarily redirected to the OOM queue could be redirected

7540

++ * again to dedicated queues at any time, the state needed to correctly

7541

++ * handle merging with the OOM queue would be quite complex and expensive

7542

++ * to maintain. Besides, in such a critical condition as an out of memory,

7543

++ * the benefits of queue merging may be little relevant, or even negligible.

7544

++ */

7545

++static struct bfq_queue *

7546

++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,

7547

++		     void *io_struct, bool request)

7548

++{

7549

++	struct bfq_queue *in_service_bfqq, *new_bfqq;

7550

++

7551

++	if (bfqq->new_bfqq)

7552

++		return bfqq->new_bfqq;

7553

++

7554

++	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))

7555

++		return NULL;

7556

++

7557

++	in_service_bfqq = bfqd->in_service_queue;

7558

++

7559

++	if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||

7560

++	    !bfqd->in_service_bic ||

7561

++	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))

7562

++		goto check_scheduled;

7563

++

7564

++	if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))

7565

++		goto check_scheduled;

7566

++

7567

++	if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))

7568

++		goto check_scheduled;

7569

++

7570

++	if (in_service_bfqq->entity.parent != bfqq->entity.parent)

7571

++		goto check_scheduled;

7572

++

7573

++	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&

7574

++	    bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {

7575

++		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);

7576

++		if (new_bfqq != NULL)

7577

++			return new_bfqq; /* Merge with in-service queue */

7578

++	}

7579

++

7580

++	/*

7581

++	 * Check whether there is a cooperator among currently scheduled

7582

++	 * queues. The only thing we need is that the bio/request is not

7583

++	 * NULL, as we need it to establish whether a cooperator exists.

7584

++	 */

7585

++check_scheduled:

7586

++	new_bfqq = bfq_close_cooperator(bfqd, bfqq,

7587

++					bfq_io_struct_pos(io_struct, request));

7588

++	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq))

7589

++		return bfq_setup_merge(bfqq, new_bfqq);

7590

++

7591

++	return NULL;

7592

++}

7593

++

7594

++static inline void

7595

++bfq_bfqq_save_state(struct bfq_queue *bfqq)

7596

++{

7597

++	/*

7598

++	 * If bfqq->bic == NULL, the queue is already shared or its requests

7599

++	 * have already been redirected to a shared queue; both idle window

7600

++	 * and weight raising state have already been saved. Do nothing.

7601

++	 */

7602

++	if (bfqq->bic == NULL)

7603

++		return;

7604

++	if (bfqq->bic->wr_time_left)

7605

++		/*

7606

++		 * This is the queue of a just-started process, and would

7607

++		 * deserve weight raising: we set wr_time_left to the full

7608

++		 * weight-raising duration to trigger weight-raising when

7609

++		 * and if the queue is split and the first request of the

7610

++		 * queue is enqueued.

7611

++		 */

7612

++		bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd);

7613

++	else if (bfqq->wr_coeff > 1) {

7614

++		unsigned long wr_duration =

7615

++			jiffies - bfqq->last_wr_start_finish;

7616

++		/*

7617

++		 * It may happen that a queue's weight raising period lasts

7618

++		 * longer than its wr_cur_max_time, as weight raising is

7619

++		 * handled only when a request is enqueued or dispatched (it

7620

++		 * does not use any timer). If the weight raising period is

7621

++		 * about to end, don't save it.

7622

++		 */

7623

++		if (bfqq->wr_cur_max_time <= wr_duration)

7624

++			bfqq->bic->wr_time_left = 0;

7625

++		else

7626

++			bfqq->bic->wr_time_left =

7627

++				bfqq->wr_cur_max_time - wr_duration;

7628

++		/*

7629

++		 * The bfq_queue is becoming shared or the requests of the

7630

++		 * process owning the queue are being redirected to a shared

7631

++		 * queue. Stop the weight raising period of the queue, as in

7632

++		 * both cases it should not be owned by an interactive or

7633

++		 * soft real-time application.

7634

++		 */

7635

++		bfq_bfqq_end_wr(bfqq);

7636

++	} else

7637

++		bfqq->bic->wr_time_left = 0;

7638

++	bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);

7639

++	bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);

7640

++	bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);

7641

++	bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);

7642

++	bfqq->bic->cooperations++;

7643

++	bfqq->bic->failed_cooperations = 0;

7644

++}

7645

++

7646

++static inline void

7647

++bfq_get_bic_reference(struct bfq_queue *bfqq)

7648

++{

7649

++	/*

7650

++	 * If bfqq->bic has a non-NULL value, the bic to which it belongs

7651

++	 * is about to begin using a shared bfq_queue.

7652

++	 */

7653

++	if (bfqq->bic)

7654

++		atomic_long_inc(&bfqq->bic->icq.ioc->refcount);

7655

++}

7656

++

7657

++static void

7658

++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

7659

++		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

7660

++{

7661

++	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",

7662

++		(long unsigned)new_bfqq->pid);

7663

++	/* Save weight raising and idle window of the merged queues */

7664

++	bfq_bfqq_save_state(bfqq);

7665

++	bfq_bfqq_save_state(new_bfqq);

7666

++	if (bfq_bfqq_IO_bound(bfqq))

7667

++		bfq_mark_bfqq_IO_bound(new_bfqq);

7668

++	bfq_clear_bfqq_IO_bound(bfqq);

7669

++	/*

7670

++	 * Grab a reference to the bic, to prevent it from being destroyed

7671

++	 * before being possibly touched by a bfq_split_bfqq().

7672

++	 */

7673

++	bfq_get_bic_reference(bfqq);

7674

++	bfq_get_bic_reference(new_bfqq);

7675

++	/*

7676

++	 * Merge queues (that is, let bic redirect its requests to new_bfqq)

7677

++	 */

7678

++	bic_set_bfqq(bic, new_bfqq, 1);

7679

++	bfq_mark_bfqq_coop(new_bfqq);

7680

++	/*

7681

++	 * new_bfqq now belongs to at least two bics (it is a shared queue):

7682

++	 * set new_bfqq->bic to NULL. bfqq either:

7683

++	 * - does not belong to any bic any more, and hence bfqq->bic must

7684

++	 *   be set to NULL, or

7685

++	 * - is a queue whose owning bics have already been redirected to a

7686

++	 *   different queue, hence the queue is destined to not belong to

7687

++	 *   any bic soon and bfqq->bic is already NULL (therefore the next

7688

++	 *   assignment causes no harm).

7689

++	 */

7690

++	new_bfqq->bic = NULL;

7691

++	bfqq->bic = NULL;

7692

++	bfq_put_queue(bfqq);

7693

++}

7694

++

7695

++static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)

7696

++{

7697

++	struct bfq_io_cq *bic = bfqq->bic;

7698

++	struct bfq_data *bfqd = bfqq->bfqd;

7699

++

7700

++	if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) {

7701

++		bic->failed_cooperations++;

7702

++		if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations)

7703

++			bic->cooperations = 0;

7704

++	}

7705

++}

7706

++

7707

++static int bfq_allow_merge(struct request_queue *q, struct request *rq,

7708

++			   struct bio *bio)

7709

++{

7710

++	struct bfq_data *bfqd = q->elevator->elevator_data;

7711

++	struct bfq_io_cq *bic;

7712

++	struct bfq_queue *bfqq, *new_bfqq;

7713

++

7714

++	/*

7715

++	 * Disallow merge of a sync bio into an async request.

7716

++	 */

7717

++	if (bfq_bio_sync(bio) && !rq_is_sync(rq))

7718

++		return 0;

7719

++

7720

++	/*

7721

++	 * Lookup the bfqq that this bio will be queued with. Allow

7722

++	 * merge only if rq is queued there.

7723

++	 * Queue lock is held here.

7724

++	 */

7725

++	bic = bfq_bic_lookup(bfqd, current->io_context);

7726

++	if (bic == NULL)

7727

++		return 0;

7728

++

7729

++	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));

7730

++	/*

7731

++	 * We take advantage of this function to perform an early merge

7732

++	 * of the queues of possible cooperating processes.

7733

++	 */

7734

++	if (bfqq != NULL) {

7735

++		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);

7736

++		if (new_bfqq != NULL) {

7737

++			bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);

7738

++			/*

7739

++			 * If we get here, the bio will be queued in the

7740

++			 * shared queue, i.e., new_bfqq, so use new_bfqq

7741

++			 * to decide whether bio and rq can be merged.

7742

++			 */

7743

++			bfqq = new_bfqq;

7744

++		} else

7745

++			bfq_bfqq_increase_failed_cooperations(bfqq);

7746

++	}

7747

++

7748

++	return bfqq == RQ_BFQQ(rq);

7749

++}

7750

++

7751

++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,

7752

++				       struct bfq_queue *bfqq)

7753

++{

7754

++	if (bfqq != NULL) {

7755

++		bfq_mark_bfqq_must_alloc(bfqq);

7756

++		bfq_mark_bfqq_budget_new(bfqq);

7757

++		bfq_clear_bfqq_fifo_expire(bfqq);

7758

++

7759

++		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

7760

++

7761

++		bfq_log_bfqq(bfqd, bfqq,

7762

++			     "set_in_service_queue, cur-budget = %lu",

7763

++			     bfqq->entity.budget);

7764

++	}

7765

++

7766

++	bfqd->in_service_queue = bfqq;

7767

++}

7768

++

7769

++/*

7770

++ * Get and set a new queue for service.

7771

++ */

7772

++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)

7773

++{

7774

++	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);

7775

++

7776

++	__bfq_set_in_service_queue(bfqd, bfqq);

7777

++	return bfqq;

7778

++}

7779

++

7780

+ /*

7781

+  * If enough samples have been computed, return the current max budget

7782

+  * stored in bfqd, which is dynamically updated according to the

7783

+@@ -1488,61 +1802,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)

7784

+ 	return rq;

7785

+ }

7786

+

7787

+-/* Must be called with the queue_lock held. */

7788

+-static int bfqq_process_refs(struct bfq_queue *bfqq)

7789

+-{

7790

+-	int process_refs, io_refs;

7791

+-

7792

+-	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];

7793

+-	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;

7794

+-	BUG_ON(process_refs < 0);

7795

+-	return process_refs;

7796

+-}

7797

+-

7798

+-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)

7799

+-{

7800

+-	int process_refs, new_process_refs;

7801

+-	struct bfq_queue *__bfqq;

7802

+-

7803

+-	/*

7804

+-	 * If there are no process references on the new_bfqq, then it is

7805

+-	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain

7806

+-	 * may have dropped their last reference (not just their last process

7807

+-	 * reference).

7808

+-	 */

7809

+-	if (!bfqq_process_refs(new_bfqq))

7810

+-		return;

7811

+-

7812

+-	/* Avoid a circular list and skip interim queue merges. */

7813

+-	while ((__bfqq = new_bfqq->new_bfqq)) {

7814

+-		if (__bfqq == bfqq)

7815

+-			return;

7816

+-		new_bfqq = __bfqq;

7817

+-	}

7818

+-

7819

+-	process_refs = bfqq_process_refs(bfqq);

7820

+-	new_process_refs = bfqq_process_refs(new_bfqq);

7821

+-	/*

7822

+-	 * If the process for the bfqq has gone away, there is no

7823

+-	 * sense in merging the queues.

7824

+-	 */

7825

+-	if (process_refs == 0 || new_process_refs == 0)

7826

+-		return;

7827

+-

7828

+-	/*

7829

+-	 * Merge in the direction of the lesser amount of work.

7830

+-	 */

7831

+-	if (new_process_refs >= process_refs) {

7832

+-		bfqq->new_bfqq = new_bfqq;

7833

+-		atomic_add(process_refs, &new_bfqq->ref);

7834

+-	} else {

7835

+-		new_bfqq->new_bfqq = bfqq;

7836

+-		atomic_add(new_process_refs, &bfqq->ref);

7837

+-	}

7838

+-	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",

7839

+-		new_bfqq->pid);

7840

+-}

7841

+-

7842

+ static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)

7843

+ {

7844

+ 	struct bfq_entity *entity = &bfqq->entity;

7845

+@@ -2269,7 +2528,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)

7846

+  */

7847

+ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

7848

+ {

7849

+-	struct bfq_queue *bfqq, *new_bfqq = NULL;

7850

++	struct bfq_queue *bfqq;

7851

+ 	struct request *next_rq;

7852

+ 	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;

7853

+

7854

+@@ -2279,17 +2538,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

7855

+

7856

+ 	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

7857

+

7858

+-	/*

7859

+-         * If another queue has a request waiting within our mean seek

7860

+-         * distance, let it run. The expire code will check for close

7861

+-         * cooperators and put the close queue at the front of the

7862

+-         * service tree. If possible, merge the expiring queue with the

7863

+-         * new bfqq.

7864

+-         */

7865

+-        new_bfqq = bfq_close_cooperator(bfqd, bfqq);

7866

+-        if (new_bfqq != NULL && bfqq->new_bfqq == NULL)

7867

+-                bfq_setup_merge(bfqq, new_bfqq);

7868

+-

7869

+ 	if (bfq_may_expire_for_budg_timeout(bfqq) &&

7870

+ 	    !timer_pending(&bfqd->idle_slice_timer) &&

7871

+ 	    !bfq_bfqq_must_idle(bfqq))

7872

+@@ -2328,10 +2576,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

7873

+ 				bfq_clear_bfqq_wait_request(bfqq);

7874

+ 				del_timer(&bfqd->idle_slice_timer);

7875

+ 			}

7876

+-			if (new_bfqq == NULL)

7877

+-				goto keep_queue;

7878

+-			else

7879

+-				goto expire;

7880

++			goto keep_queue;

7881

+ 		}

7882

+ 	}

7883

+

7884

+@@ -2340,40 +2585,30 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)

7885

+ 	 * for a new request, or has requests waiting for a completion and

7886

+ 	 * may idle after their completion, then keep it anyway.

7887

+ 	 */

7888

+-	if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||

7889

+-	    (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {

7890

++	if (timer_pending(&bfqd->idle_slice_timer) ||

7891

++	    (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {

7892

+ 		bfqq = NULL;

7893

+ 		goto keep_queue;

7894

+-	} else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {

7895

+-		/*

7896

+-		 * Expiring the queue because there is a close cooperator,

7897

+-		 * cancel timer.

7898

+-		 */

7899

+-		bfq_clear_bfqq_wait_request(bfqq);

7900

+-		del_timer(&bfqd->idle_slice_timer);

7901

+ 	}

7902

+

7903

+ 	reason = BFQ_BFQQ_NO_MORE_REQUESTS;

7904

+ expire:

7905

+ 	bfq_bfqq_expire(bfqd, bfqq, 0, reason);

7906

+ new_queue:

7907

+-	bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);

7908

++	bfqq = bfq_set_in_service_queue(bfqd);

7909

+ 	bfq_log(bfqd, "select_queue: new queue %d returned",

7910

+ 		bfqq != NULL ? bfqq->pid : 0);

7911

+ keep_queue:

7912

+ 	return bfqq;

7913

+ }

7914

+

7915

+-static void bfq_update_wr_data(struct bfq_data *bfqd,

7916

+-			       struct bfq_queue *bfqq)

7917

++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)

7918

+ {

7919

+-	if (bfqq->wr_coeff > 1) { /* queue is being boosted */

7920

+-		struct bfq_entity *entity = &bfqq->entity;

7921

+-

7922

++	struct bfq_entity *entity = &bfqq->entity;

7923

++	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */

7924

+ 		bfq_log_bfqq(bfqd, bfqq,

7925

+ 			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",

7926

+-			jiffies_to_msecs(jiffies -

7927

+-				bfqq->last_wr_start_finish),

7928

++			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),

7929

+ 			jiffies_to_msecs(bfqq->wr_cur_max_time),

7930

+ 			bfqq->wr_coeff,

7931

+ 			bfqq->entity.weight, bfqq->entity.orig_weight);

7932

+@@ -2382,12 +2617,16 @@ static void bfq_update_wr_data(struct bfq_data *bfqd,

7933

+ 		       entity->orig_weight * bfqq->wr_coeff);

7934

+ 		if (entity->ioprio_changed)

7935

+ 			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

7936

++

7937

+ 		/*

7938

+ 		 * If the queue was activated in a burst, or

7939

+ 		 * too much time has elapsed from the beginning

7940

+-		 * of this weight-raising, then end weight raising.

7941

++		 * of this weight-raising period, or the queue has

7942

++		 * exceeded the acceptable number of cooperations,

7943

++		 * then end weight raising.

7944

+ 		 */

7945

+ 		if (bfq_bfqq_in_large_burst(bfqq) ||

7946

++		    bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||

7947

+ 		    time_is_before_jiffies(bfqq->last_wr_start_finish +

7948

+ 					   bfqq->wr_cur_max_time)) {

7949

+ 			bfqq->last_wr_start_finish = jiffies;

7950

+@@ -2396,11 +2635,13 @@ static void bfq_update_wr_data(struct bfq_data *bfqd,

7951

+ 				     bfqq->last_wr_start_finish,

7952

+ 				     jiffies_to_msecs(bfqq->wr_cur_max_time));

7953

+ 			bfq_bfqq_end_wr(bfqq);

7954

+-			__bfq_entity_update_weight_prio(

7955

+-				bfq_entity_service_tree(entity),

7956

+-				entity);

7957

+ 		}

7958

+ 	}

7959

++	/* Update weight both if it must be raised and if it must be lowered */

7960

++	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))

7961

++		__bfq_entity_update_weight_prio(

7962

++			bfq_entity_service_tree(entity),

7963

++			entity);

7964

+ }

7965

+

7966

+ /*

7967

+@@ -2647,6 +2888,25 @@ static inline void bfq_init_icq(struct io_cq *icq)

7968

+ 	struct bfq_io_cq *bic = icq_to_bic(icq);

7969

+

7970

+ 	bic->ttime.last_end_request = jiffies;

7971

++	/*

7972

++	 * A newly created bic indicates that the process has just

7973

++	 * started doing I/O, and is probably mapping into memory its

7974

++	 * executable and libraries: it definitely needs weight raising.

7975

++	 * There is however the possibility that the process performs,

7976

++	 * for a while, I/O close to some other process. EQM intercepts

7977

++	 * this behavior and may merge the queue corresponding to the

7978

++	 * process with some other queue, BEFORE the weight of the queue

7979

++	 * is raised. Merged queues are not weight-raised (they are assumed

7980

++	 * to belong to processes that benefit only from high throughput).

7981

++	 * If the merge is basically the consequence of an accident, then

7982

++	 * the queue will be split soon and will get back its old weight.

7983

++	 * It is then important to write down somewhere that this queue

7984

++	 * does need weight raising, even if it did not make it to get its

7985

++	 * weight raised before being merged. To this purpose, we overload

7986

++	 * the field wr_time_left and assign 1 to it, to mark the queue

7987

++	 * as needing weight raising.

7988

++	 */

7989

++	bic->wr_time_left = 1;

7990

+ }

7991

+

7992

+ static void bfq_exit_icq(struct io_cq *icq)

7993

+@@ -2660,6 +2920,13 @@ static void bfq_exit_icq(struct io_cq *icq)

7994

+ 	}

7995

+

7996

+ 	if (bic->bfqq[BLK_RW_SYNC]) {

7997

++		/*

7998

++		 * If the bic is using a shared queue, put the reference

7999

++		 * taken on the io_context when the bic started using a

8000

++		 * shared bfq_queue.

8001

++		 */

8002

++		if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))

8003

++			put_io_context(icq->ioc);

8004

+ 		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);

8005

+ 		bic->bfqq[BLK_RW_SYNC] = NULL;

8006

+ 	}

8007

+@@ -2952,6 +3219,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,

8008

+ 	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))

8009

+ 		return;

8010

+

8011

++	/* Idle window just restored, statistics are meaningless. */

8012

++	if (bfq_bfqq_just_split(bfqq))

8013

++		return;

8014

++

8015

+ 	enable_idle = bfq_bfqq_idle_window(bfqq);

8016

+

8017

+ 	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||

8018

+@@ -2999,6 +3270,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

8019

+ 	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||

8020

+ 	    !BFQQ_SEEKY(bfqq))

8021

+ 		bfq_update_idle_window(bfqd, bfqq, bic);

8022

++	bfq_clear_bfqq_just_split(bfqq);

8023

+

8024

+ 	bfq_log_bfqq(bfqd, bfqq,

8025

+ 		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",

8026

+@@ -3059,12 +3331,47 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,

8027

+ static void bfq_insert_request(struct request_queue *q, struct request *rq)

8028

+ {

8029

+ 	struct bfq_data *bfqd = q->elevator->elevator_data;

8030

+-	struct bfq_queue *bfqq = RQ_BFQQ(rq);

8031

++	struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;

8032

+

8033

+ 	assert_spin_locked(bfqd->queue->queue_lock);

8034

+

8035

++	/*

8036

++	 * An unplug may trigger a requeue of a request from the device

8037

++	 * driver: make sure we are in process context while trying to

8038

++	 * merge two bfq_queues.

8039

++	 */

8040

++	if (!in_interrupt()) {

8041

++		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);

8042

++		if (new_bfqq != NULL) {

8043

++			if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)

8044

++				new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);

8045

++			/*

8046

++			 * Release the request's reference to the old bfqq

8047

++			 * and make sure one is taken to the shared queue.

8048

++			 */

8049

++			new_bfqq->allocated[rq_data_dir(rq)]++;

8050

++			bfqq->allocated[rq_data_dir(rq)]--;

8051

++			atomic_inc(&new_bfqq->ref);

8052

++			bfq_put_queue(bfqq);

8053

++			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)

8054

++				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),

8055

++						bfqq, new_bfqq);

8056

++			rq->elv.priv[1] = new_bfqq;

8057

++			bfqq = new_bfqq;

8058

++		} else

8059

++			bfq_bfqq_increase_failed_cooperations(bfqq);

8060

++	}

8061

++

8062

+ 	bfq_add_request(rq);

8063

+

8064

++	/*

8065

++	 * Here a newly-created bfq_queue has already started a weight-raising

8066

++	 * period: clear wr_time_left to prevent bfq_bfqq_save_state()

8067

++	 * from assigning it a full weight-raising period. See the detailed

8068

++	 * comments about this field in bfq_init_icq().

8069

++	 */

8070

++	if (bfqq->bic != NULL)

8071

++		bfqq->bic->wr_time_left = 0;

8072

+ 	rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];

8073

+ 	list_add_tail(&rq->queuelist, &bfqq->fifo);

8074

+

8075

+@@ -3226,18 +3533,6 @@ static void bfq_put_request(struct request *rq)

8076

+ 	}

8077

+ }

8078

+

8079

+-static struct bfq_queue *

8080

+-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,

8081

+-		struct bfq_queue *bfqq)

8082

+-{

8083

+-	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",

8084

+-		(long unsigned)bfqq->new_bfqq->pid);

8085

+-	bic_set_bfqq(bic, bfqq->new_bfqq, 1);

8086

+-	bfq_mark_bfqq_coop(bfqq->new_bfqq);

8087

+-	bfq_put_queue(bfqq);

8088

+-	return bic_to_bfqq(bic, 1);

8089

+-}

8090

+-

8091

+ /*

8092

+  * Returns NULL if a new bfqq should be allocated, or the old bfqq if this

8093

+  * was the last process referring to said bfqq.

8094

+@@ -3246,6 +3541,9 @@ static struct bfq_queue *

8095

+ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)

8096

+ {

8097

+ 	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

8098

++

8099

++	put_io_context(bic->icq.ioc);

8100

++

8101

+ 	if (bfqq_process_refs(bfqq) == 1) {

8102

+ 		bfqq->pid = current->pid;

8103

+ 		bfq_clear_bfqq_coop(bfqq);

8104

+@@ -3274,6 +3572,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,

8105

+ 	struct bfq_queue *bfqq;

8106

+ 	struct bfq_group *bfqg;

8107

+ 	unsigned long flags;

8108

++	bool split = false;

8109

+

8110

+ 	might_sleep_if(gfp_mask & __GFP_WAIT);

8111

+

8112

+@@ -3291,25 +3590,26 @@ new_queue:

8113

+ 	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {

8114

+ 		bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);

8115

+ 		bic_set_bfqq(bic, bfqq, is_sync);

8116

++		if (split && is_sync) {

8117

++			if ((bic->was_in_burst_list && bfqd->large_burst) ||

8118

++			    bic->saved_in_large_burst)

8119

++				bfq_mark_bfqq_in_large_burst(bfqq);

8120

++			else {

8121

++			    bfq_clear_bfqq_in_large_burst(bfqq);

8122

++			    if (bic->was_in_burst_list)

8123

++			       hlist_add_head(&bfqq->burst_list_node,

8124

++				              &bfqd->burst_list);

8125

++			}

8126

++		}

8127

+ 	} else {

8128

+-		/*

8129

+-		 * If the queue was seeky for too long, break it apart.

8130

+-		 */

8131

++		/* If the queue was seeky for too long, break it apart. */

8132

+ 		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {

8133

+ 			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");

8134

+ 			bfqq = bfq_split_bfqq(bic, bfqq);

8135

++			split = true;

8136

+ 			if (!bfqq)

8137

+ 				goto new_queue;

8138

+ 		}

8139

+-

8140

+-		/*

8141

+-		 * Check to see if this queue is scheduled to merge with

8142

+-		 * another closely cooperating queue. The merging of queues

8143

+-		 * happens here as it must be done in process context.

8144

+-		 * The reference on new_bfqq was taken in merge_bfqqs.

8145

+-		 */

8146

+-		if (bfqq->new_bfqq != NULL)

8147

+-			bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);

8148

+ 	}

8149

+

8150

+ 	bfqq->allocated[rw]++;

8151

+@@ -3320,6 +3620,26 @@ new_queue:

8152

+ 	rq->elv.priv[0] = bic;

8153

+ 	rq->elv.priv[1] = bfqq;

8154

+

8155

++	/*

8156

++	 * If a bfq_queue has only one process reference, it is owned

8157

++	 * by only one bfq_io_cq: we can set the bic field of the

8158

++	 * bfq_queue to the address of that structure. Also, if the

8159

++	 * queue has just been split, mark a flag so that the

8160

++	 * information is available to the other scheduler hooks.

8161

++	 */

8162

++	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {

8163

++		bfqq->bic = bic;

8164

++		if (split) {

8165

++			bfq_mark_bfqq_just_split(bfqq);

8166

++			/*

8167

++			 * If the queue has just been split from a shared

8168

++			 * queue, restore the idle window and the possible

8169

++			 * weight raising period.

8170

++			 */

8171

++			bfq_bfqq_resume_state(bfqq, bic);

8172

++		}

8173

++	}

8174

++

8175

+ 	spin_unlock_irqrestore(q->queue_lock, flags);

8176

+

8177

+ 	return 0;

8178

+diff --git a/block/bfq-sched.c b/block/bfq-sched.c

8179

+index c343099..d0890c6 100644

8180

+--- a/block/bfq-sched.c

8181

++++ b/block/bfq-sched.c

8182

+@@ -1085,34 +1085,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)

8183

+ 	return bfqq;

8184

+ }

8185

+

8186

+-/*

8187

+- * Forced extraction of the given queue.

8188

+- */

8189

+-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,

8190

+-				      struct bfq_queue *bfqq)

8191

+-{

8192

+-	struct bfq_entity *entity;

8193

+-	struct bfq_sched_data *sd;

8194

+-

8195

+-	BUG_ON(bfqd->in_service_queue != NULL);

8196

+-

8197

+-	entity = &bfqq->entity;

8198

+-	/*

8199

+-	 * Bubble up extraction/update from the leaf to the root.

8200

+-	*/

8201

+-	for_each_entity(entity) {

8202

+-		sd = entity->sched_data;

8203

+-		bfq_update_budget(entity);

8204

+-		bfq_update_vtime(bfq_entity_service_tree(entity));

8205

+-		bfq_active_extract(bfq_entity_service_tree(entity), entity);

8206

+-		sd->in_service_entity = entity;

8207

+-		sd->next_in_service = NULL;

8208

+-		entity->service = 0;

8209

+-	}

8210

+-

8211

+-	return;

8212

+-}

8213

+-

8214

+ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)

8215

+ {

8216

+ 	if (bfqd->in_service_bic != NULL) {

8217

+diff --git a/block/bfq.h b/block/bfq.h

8218

+index e350b5f..93d3f6e 100644

8219

+--- a/block/bfq.h

8220

++++ b/block/bfq.h

8221

+@@ -218,18 +218,21 @@ struct bfq_group;

8222

+  *                      idle @bfq_queue with no outstanding requests, then

8223

+  *                      the task associated with the queue it is deemed as

8224

+  *                      soft real-time (see the comments to the function

8225

+- *                      bfq_bfqq_softrt_next_start()).

8226

++ *                      bfq_bfqq_softrt_next_start())

8227

+  * @last_idle_bklogged: time of the last transition of the @bfq_queue from

8228

+  *                      idle to backlogged

8229

+  * @service_from_backlogged: cumulative service received from the @bfq_queue

8230

+  *                           since the last transition from idle to

8231

+  *                           backlogged

8232

++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the

8233

++ *	 queue is shared

8234

+  *

8235

+- * A bfq_queue is a leaf request queue; it can be associated with an io_context

8236

+- * or more, if it is async or shared between cooperating processes. @cgroup

8237

+- * holds a reference to the cgroup, to be sure that it does not disappear while

8238

+- * a bfqq still references it (mostly to avoid races between request issuing and

8239

+- * task migration followed by cgroup destruction).

8240

++ * A bfq_queue is a leaf request queue; it can be associated with an

8241

++ * io_context or more, if it is async or shared between cooperating

8242

++ * processes. @cgroup holds a reference to the cgroup, to be sure that it

8243

++ * does not disappear while a bfqq still references it (mostly to avoid

8244

++ * races between request issuing and task migration followed by cgroup

8245

++ * destruction).

8246

+  * All the fields are protected by the queue lock of the containing bfqd.

8247

+  */

8248

+ struct bfq_queue {

8249

+@@ -269,6 +272,7 @@ struct bfq_queue {

8250

+ 	unsigned int requests_within_timer;

8251

+

8252

+ 	pid_t pid;

8253

++	struct bfq_io_cq *bic;

8254

+

8255

+ 	/* weight-raising fields */

8256

+ 	unsigned long wr_cur_max_time;

8257

+@@ -298,12 +302,42 @@ struct bfq_ttime {

8258

+  * @icq: associated io_cq structure

8259

+  * @bfqq: array of two process queues, the sync and the async

8260

+  * @ttime: associated @bfq_ttime struct

8261

++ * @wr_time_left: snapshot of the time left before weight raising ends

8262

++ *                for the sync queue associated to this process; this

8263

++ *		  snapshot is taken to remember this value while the weight

8264

++ *		  raising is suspended because the queue is merged with a

8265

++ *		  shared queue, and is used to set @wr_cur_max_time

8266

++ *		  when the queue is split from the shared queue and its

8267

++ *		  weight is raised again

8268

++ * @saved_idle_window: same purpose as the previous field for the idle

8269

++ *                     window

8270

++ * @saved_IO_bound: same purpose as the previous two fields for the I/O

8271

++ *                  bound classification of a queue

8272

++ * @saved_in_large_burst: same purpose as the previous fields for the

8273

++ *                        value of the field keeping the queue's belonging

8274

++ *                        to a large burst

8275

++ * @was_in_burst_list: true if the queue belonged to a burst list

8276

++ *                     before its merge with another cooperating queue

8277

++ * @cooperations: counter of consecutive successful queue merges undergone

8278

++ *                by any of the process' @bfq_queues

8279

++ * @failed_cooperations: counter of consecutive failed queue merges of any

8280

++ *                       of the process' @bfq_queues

8281

+  */

8282

+ struct bfq_io_cq {

8283

+ 	struct io_cq icq; /* must be the first member */

8284

+ 	struct bfq_queue *bfqq[2];

8285

+ 	struct bfq_ttime ttime;

8286

+ 	int ioprio;

8287

++

8288

++	unsigned int wr_time_left;

8289

++	bool saved_idle_window;

8290

++	bool saved_IO_bound;

8291

++

8292

++	bool saved_in_large_burst;

8293

++	bool was_in_burst_list;

8294

++

8295

++	unsigned int cooperations;

8296

++	unsigned int failed_cooperations;

8297

+ };

8298

+

8299

+ enum bfq_device_speed {

8300

+@@ -536,7 +570,7 @@ enum bfqq_state_flags {

8301

+ 	BFQ_BFQQ_FLAG_idle_window,	/* slice idling enabled */

8302

+ 	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */

8303

+ 	BFQ_BFQQ_FLAG_budget_new,	/* no completion with this budget */

8304

+-	BFQ_BFQQ_FLAG_IO_bound,         /*

8305

++	BFQ_BFQQ_FLAG_IO_bound,		/*

8306

+ 					 * bfqq has timed-out at least once

8307

+ 					 * having consumed at most 2/10 of

8308

+ 					 * its budget

8309

+@@ -549,12 +583,13 @@ enum bfqq_state_flags {

8310

+ 					 * bfqq has proved to be slow and

8311

+ 					 * seeky until budget timeout

8312

+ 					 */

8313

+-	BFQ_BFQQ_FLAG_softrt_update,    /*

8314

++	BFQ_BFQQ_FLAG_softrt_update,	/*

8315

+ 					 * may need softrt-next-start

8316

+ 					 * update

8317

+ 					 */

8318

+ 	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */

8319

+-	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be splitted */

8320

++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */

8321

++	BFQ_BFQQ_FLAG_just_split,	/* queue has just been split */

8322

+ };

8323

+

8324

+ #define BFQ_BFQQ_FNS(name)						\

8325

+@@ -583,6 +618,7 @@ BFQ_BFQQ_FNS(in_large_burst);

8326

+ BFQ_BFQQ_FNS(constantly_seeky);

8327

+ BFQ_BFQQ_FNS(coop);

8328

+ BFQ_BFQQ_FNS(split_coop);

8329

++BFQ_BFQQ_FNS(just_split);

8330

+ BFQ_BFQQ_FNS(softrt_update);

8331

+ #undef BFQ_BFQQ_FNS

8332

+

8333

+--

8334

+1.9.1

8335

+

Gentoo Archives: gentoo-commits